From a06779bc2d091398ee4439f17411bd69276d85f5 Mon Sep 17 00:00:00 2001 From: Chris Duncan Date: Fri, 28 Feb 2025 22:19:48 -0800 Subject: [PATCH] Overhaul both WebGPU and WebGL to use vec4 for parallel operation hinting on supported hardware. Refactor WebGL BLAKE2b to simplify pixel-coordinate-based nonce variation, to unroll main G mix function loop for performance, and to better differentiate between search and validate processes. Simplify vertex shader now that it is only required for drawing the fullscreen quad and not for pixel coordinates. Create new downsampling fragment shader which enables larger canvases and more nonces per frame without introducing lag due to synchronous readback. Maintain canvas between draw calls unless effort has changed. Attempt to handle WebGL context loss, with improved reset function, by reinitializing class. Reduce promise stack increases when waiting for query result. Fix color buffer clearing by using correct API function. Improve nonce seed generation in both WebGL and WebGPU by switching from crypto random to insecure random which is OK in the context of PoW. Reduce garbage collection by reusing static variables. Add debugging throughout that obeys user-provided debug flag which is now stored as a static variable as well. Add Typescript typings for new WebGL types. Fix minor issues with test page. Add benchmark results table. 
--- benchmarks.md | 89 ++ main.min.js | 657 ++++++---- src/classes/gl.ts | 386 ++++-- src/classes/gpu.ts | 47 +- src/classes/index.ts | 6 +- src/shaders/compute.wgsl | 2333 ++++++++++++++++++---------------- src/shaders/gl-downsample.ts | 36 + src/shaders/gl-draw.ts | 210 +++ src/shaders/gl-fragment.ts | 132 -- src/shaders/gl-vertex.ts | 9 +- src/shaders/index.ts | 6 +- test.html | 33 +- types.d.ts | 19 +- 13 files changed, 2350 insertions(+), 1613 deletions(-) create mode 100644 benchmarks.md create mode 100644 src/shaders/gl-downsample.ts create mode 100644 src/shaders/gl-draw.ts delete mode 100644 src/shaders/gl-fragment.ts diff --git a/benchmarks.md b/benchmarks.md new file mode 100644 index 0000000..660fb73 --- /dev/null +++ b/benchmarks.md @@ -0,0 +1,89 @@ +# Benchmarks to compute 16,777,216 nonces +_Each test is 128 samples of one pass (dispatch or frame) at zero threshold with no early exit_ + +## Summary +- Chromium WebGPU and Firefox WebGL are the clear winners +- Chromium WebGL seems to suffer from an Nvidia driver issue +- Firefox WebGPU exhibits a strange implementation bottleneck + - It seems to restrict each pass to a minimum of 200ms + - Not shown here: This minimum is higher or lower depending on dispatch size +- Safari WebGPU performance was maintained between versions +- WebGL downsampling shader minimized readback lag and thus enabled much larger frames +- WebGPU improved dramatically on non-mobile platforms, almost halving frame times + + +## All Results +| Version | System | Browser | API | Total | Rate | Median | Mean | +|-----------|--------------|-------------|-----------|-----------|----------|----------|----------| +| RC | RTX 3070 | Chromium N | WebGPU | 747 | 218.23 | 4.5 | 4.58 | +| RC | RTX 3070 | Firefox N | WebGL | 1096 | 117.06 | 8.55 | 8.54 | +| RC | RTX 3070 | Firefox | WebGL | 1174 | 116.57 | 9 | 8.58 | +| 2.0.0 | RTX 3070 | Chromium N | WebGPU | 1339 | 112.02 | ? 
| 10.47 | +| RC | Intel Xe | Chromium N | WebGPU | 5107 | 25.20 | 39.5 | 39.69 | +| RC | Intel Xe | Firefox | WebGL | 7166 | 18.84 | 52 | 53.07 | +| RC | Intel Xe | Firefox N | WebGL | 8060 | 16.71 | 63 | 59.83 | +| RC | iPhone 12 | Safari | WebGPU | 8586 | 14.97 | 67 | 66.79 | +| 2.0.0 | iPhone 12 | Safari | WebGPU | 8765 | 14.69 | 68 | 68.48 | +| RC | Intel Xe | Chromium N | WebGL | 9764 | 15.42 | 62.8 | 64.85 | +| 2.0.0 | Intel Xe | Chromium N | WebGPU | 10103 | 12.63 | ? | 78.93 | +| RC | RTX 3070 | Chromium N | WebGL | 10681 | 19.81 | 50.60 | 50.47 | +| RC | iPhone 12 | Safari | WebGL | 12631 | 10.54 | 95 | 94.86 | +| RC | iPad Mini 5 | Safari | WebGPU | 14232 | 8.78 | 114 | 113.85 | +| 2.0.0 | iPad Mini 5 | Safari | WebGPU | 14903 | 8.59 | 118 | 116.41 | +| RC | iPad Mini 5 | Safari | WebGL | 18928 | 6.97 | 145 | 143.40 | +| RC | Intel Xe | Firefox N | WebGPU | 25679 | 4.99 | 200 | 200.47 | +| 2.0.0 | Intel Xe | Firefox N | WebGPU | 25805 | 4.94 | ? | 201.60 | +| 2.0.0 | RTX 3070 | Firefox N | WebGPU | 25629 | 4.97 | ? | 200.23 | +| RC | RTX 3070 | Firefox N | WebGPU | 25633 | 5.00 | 200 | 200.15 | +| 2.0.0 | RTX 3070 | Firefox | WebGL | 35224 | 3.72 | ? | 275.19 | +| 2.0.0 | RTX 3070 | Chromium N | WebGL | 47603 | 3.06 | ? | 371.90 | +| 2.0.0 | RTX 3070 | Firefox N | WebGL | Unusable | N/A | ? | N/A | +| 2.0.0 | Intel Xe | Firefox | WebGL | Unusable | N/A | ? | N/A | +| 2.0.0 | Intel Xe | Firefox N | WebGL | Unusable | N/A | ? | N/A | +| 2.0.0 | Intel Xe | Chromium N | WebGL | Unusable | N/A | ? | N/A | +| 2.0.0 | iPhone 12 | Safari | WebGL | Unusable | N/A | ? | N/A | +| 2.0.0 | iPad Mini 5 | Safari | WebGL | Unusable | N/A | ? 
| N/A | + +## RTX 3070 +| Version | Browser | API | Total | Rate | Median | Mean | +|-----------|-------------|-----------|-----------|----------|----------|----------| +| RC | Chromium N | WebGPU | 747 | 218.23 | 4.5 | 4.58 | +| RC | Firefox N | WebGL | 1096 | 117.06 | 8.55 | 8.54 | +| RC | Firefox | WebGL | 1174 | 116.57 | 9 | 8.58 | +| 2.0.0 | Chromium N | WebGPU | 1339 | 112.02 | ? | 10.47 | +| RC | Chromium N | WebGL | 10681 | 19.81 | 50.60 | 50.47 | +| 2.0.0 | Firefox N | WebGPU | 25629 | 4.97 | ? | 200.23 | +| RC | Firefox N | WebGPU | 25633 | 5.00 | 200 | 200.15 | +| 2.0.0 | Firefox | WebGL | 35224 | 3.72 | ? | 275.19 | +| 2.0.0 | Chromium N | WebGL | 47603 | 3.06 | ? | 371.90 | +| 2.0.0 | Firefox N | WebGL | Unusable | N/A | ? | N/A | + +## Intel Xe integrated graphics +| Version | Browser | API | Total | Rate | Median | Mean | +|-----------|-------------|-----------|-----------|----------|----------|----------| +| RC | Chromium N | WebGPU | 5107 | 25.20 | 39.5 | 39.69 | +| RC | Firefox | WebGL | 7166 | 18.84 | 52 | 53.07 | +| RC | Firefox N | WebGL | 8060 | 16.71 | 63 | 59.83 | +| RC | Chromium N | WebGL | 9764 | 15.42 | 62.8 | 64.85 | +| 2.0.0 | Chromium N | WebGPU | 10103 | 12.63 | ? | 78.93 | +| RC | Firefox N | WebGPU | 25679 | 4.99 | 200 | 200.47 | +| 2.0.0 | Firefox N | WebGPU | 25805 | 4.94 | ? | 201.60 | +| 2.0.0 | Firefox | WebGL | Unusable | N/A | ? | N/A | +| 2.0.0 | Firefox N | WebGL | Unusable | N/A | ? | N/A | +| 2.0.0 | Chromium N | WebGL | Unusable | N/A | ? | N/A | + +## iPhone 12 (A14 Bionic, ??? 4-core GPU) +| Version | Browser | API | Total | Rate | Median | Mean | +|-----------|-------------|-----------|-----------|----------|----------|----------| +| RC | Safari | WebGPU | 8586 | 14.97 | 67 | 66.79 | +| 2.0.0 | Safari | WebGPU | 8765 | 14.69 | 68 | 68.48 | +| RC | Safari | WebGL | 12631 | 10.54 | 95 | 94.86 | +| 2.0.0 | Safari | WebGL | Unusable | N/A | ? 
| N/A | + +## iPad Mini 5 (A12 Bionic, G11P 4-core GPU) +| Version | Browser | API | Total | Rate | Median | Mean | +|-----------|-------------|-----------|-----------|----------|----------|----------| +| RC | Safari | WebGPU | 14232 | 8.78 | 114 | 113.85 | +| 2.0.0 | Safari | WebGPU | 14903 | 8.59 | 118 | 116.41 | +| RC | Safari | WebGL | 18928 | 6.97 | 145 | 143.40 | +| 2.0.0 | Safari | WebGL | Unusable | N/A | ? | N/A | diff --git a/main.min.js b/main.min.js index 754f2f6..7bc0f63 100644 --- a/main.min.js +++ b/main.min.js @@ -1,38 +1,76 @@ // src/shaders/compute.wgsl -var compute_default = `struct UBO{blockhash:array,2>,seed:vec2,threshold:u32};@group(0)@binding(0)var ubo:UBO;struct WORK{nonce:vec2,found:atomic};@group(0)@binding(1)varwork:WORK;const BLAKE2B_IV_0=vec2(0xF2BDC900u,0x6A09E667u);const ROTATE_1=vec2(1u);const ROTATE_8=vec2(8u);const ROTATE_16=vec2(16u);const ROTATE_24=vec2(24u);const ROTATE_31=vec2(31u);var found:bool;@compute @workgroup_size(32)fn search(@builtin(global_invocation_id)global_id:vec3,@builtin(local_invocation_id)local_id:vec3){main(global_id);}@compute @workgroup_size(1)fn validate(@builtin(global_invocation_id)global_id:vec3){main(global_id);}fn main(id:vec3){let m0:vec2=ubo.seed ^ id.xy;let m1:vec2=ubo.blockhash[0u].xy;let m2:vec2=ubo.blockhash[0u].zw;let m3:vec2=ubo.blockhash[1u].xy;let m4:vec2=ubo.blockhash[1u].zw;var v0:vec2=BLAKE2B_IV_0;var v1:vec2=vec2(0x84CAA73Bu,0xBB67AE85u);var v2:vec2=vec2(0xFE94F82Bu,0x3C6EF372u);var v3:vec2=vec2(0x5F1D36F1u,0xA54FF53Au);var v4:vec2=vec2(0xADE682D1u,0x510E527Fu);var v5:vec2=vec2(0x2B3E6C1Fu,0x9B05688Cu);var v6:vec2=vec2(0xFB41BD6Bu,0x1F83D9ABu);var v7:vec2=vec2(0x137E2179u,0x5BE0CD19u);var v8:vec2=vec2(0xF3BCC908u,0x6A09E667u);var v9:vec2=vec2(0x84CAA73Bu,0xBB67AE85u);var vA:vec2=vec2(0xFE94F82Bu,0x3C6EF372u);var vB:vec2=vec2(0x5F1D36F1u,0xA54FF53Au);var vC:vec2=vec2(0xADE682F9u,0x510E527Fu);var vD:vec2=vec2(0x2B3E6C1Fu,0x9B05688Cu);var vE:vec2=vec2(0x04BE4294u,0xE07C2654u);var 
vF:vec2=vec2(0x137E2179u,0x5BE0CD19u);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((vC ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((vD ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ vA).yx<>ROTATE_16)|((vE ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ vB).yx<>ROTATE_16)|((vF ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ vA).yx<>ROTATE_16)|((vF ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ vB).yx<>ROTATE_16)|((vC ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((vD ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((vE ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((vC ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((vD ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ vA).yx<>ROTATE_16)|((vE ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ vB).yx<>ROTATE_16)|((vF ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ vA).yx<>ROTATE_16)|((vF ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ vB).yx<>ROTATE_16)|((vC ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((vD ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((vE ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((vC ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((vD ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ vA).yx<>ROTATE_16)|((vE ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ vB).yx<>ROTATE_16)|((vF ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ vA).yx<>ROTATE_16)|((vF ^ 
v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ vB).yx<>ROTATE_16)|((vC ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((vD ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((vE ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((vC ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((vD ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ vA).yx<>ROTATE_16)|((vE ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ vB).yx<>ROTATE_16)|((vF ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ vA).yx<>ROTATE_16)|((vF ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ vB).yx<>ROTATE_16)|((vC ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((vD ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((vE ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((vC ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((vD ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ vA).yx<>ROTATE_16)|((vE ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ vB).yx<>ROTATE_16)|((vF ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ vA).yx<>ROTATE_16)|((vF ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ vB).yx<>ROTATE_16)|((vC ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((vD ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((vE ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((vC ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((vD ^ 
v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ vA).yx<>ROTATE_16)|((vE ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ vB).yx<>ROTATE_16)|((vF ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ vA).yx<>ROTATE_16)|((vF ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ vB).yx<>ROTATE_16)|((vC ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((vD ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((vE ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((vC ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((vD ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ vA).yx<>ROTATE_16)|((vE ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ vB).yx<>ROTATE_16)|((vF ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ vA).yx<>ROTATE_16)|((vF ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ vB).yx<>ROTATE_16)|((vC ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((vD ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((vE ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((vC ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((vD ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ vA).yx<>ROTATE_16)|((vE ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ vB).yx<>ROTATE_16)|((vF ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ vA).yx<>ROTATE_16)|((vF ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ vB).yx<>ROTATE_16)|((vC ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((vD ^ 
v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((vE ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((vC ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((vD ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ vA).yx<>ROTATE_16)|((vE ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ vB).yx<>ROTATE_16)|((vF ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ vA).yx<>ROTATE_16)|((vF ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ vB).yx<>ROTATE_16)|((vC ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((vD ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((vE ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((vC ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((vD ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ vA).yx<>ROTATE_16)|((vE ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ vB).yx<>ROTATE_16)|((vF ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ vA).yx<>ROTATE_16)|((vF ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ vB).yx<>ROTATE_16)|((vC ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((vD ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((vE ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((vC ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((vD ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ vA).yx<>ROTATE_16)|((vE ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ vB).yx<>ROTATE_16)|((vF ^ 
v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ vA).yx<>ROTATE_16)|((vF ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ vB).yx<>ROTATE_16)|((vC ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((vD ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((vE ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((vC ^ v0).yx<>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((vD ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ vA).yx<>ROTATE_16)|((vE ^ v2).yx<>ROTATE_24)|((v7 ^ vB).yx<>ROTATE_16)|((vF ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ vA).yx<>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((vD ^ v2).yx<ubo.threshold&&atomicLoad(&work.found)==0u){atomicStore(&work.found,1u);work.nonce=m0;}return;}`; +var compute_default = "struct UBO{blockhash:array,2>,seed:vec2,threshold:u32};@group(0)@binding(0)var ubo:UBO;struct WORK{nonce:vec2,found:atomic};@group(0)@binding(1)varwork:WORK;const BLAKE2B_IV_0=vec2(0xF2BDC900u,0x6A09E667u);const BLAKE2B_IV_0=vec2(0xF2BDC900u,0x6A09E667u);const Z=vec2(0u);const CARRY=vec4(1u,0u,1u,0u);const ROTATE_1=vec4(1u);const ROTATE_8=vec4(8u);const ROTATE_16=vec4(16u);const ROTATE_24=vec4(24u);const ROTATE_31=vec4(31u);var found:bool;@compute @workgroup_size(32)fn search(@builtin(global_invocation_id)global_id:vec3,@builtin(local_invocation_id)local_id:vec3){found=(local_id.x==0u&&atomicLoad(&work.found)!=0u);workgroupBarrier();if(found){return;}main(global_id);}@compute @workgroup_size(1)fn validate(@builtin(global_invocation_id)global_id:vec3){main(global_id);}fn main(id:vec3){let m0:vec2=ubo.seed ^ id.xy;let m1:vec2=ubo.blockhash[0u].xy;let m2:vec2=ubo.blockhash[0u].zw;let m3:vec2=ubo.blockhash[1u].xy;let m4:vec2=ubo.blockhash[1u].zw;var v01:vec4=vec4(BLAKE2B_IV_0,0x84CAA73Bu,0xBB67AE85u);var v23:vec4=vec4(0xFE94F82Bu,0x3C6EF372u,0x5F1D36F1u,0xA54FF53Au);var 
v45:vec4=vec4(0xADE682D1u,0x510E527Fu,0x2B3E6C1Fu,0x9B05688Cu);var v67:vec4=vec4(0xFB41BD6Bu,0x1F83D9ABu,0x137E2179u,0x5BE0CD19u);var v89:vec4=vec4(0xF3BCC908u,0x6A09E667u,0x84CAA73Bu,0xBB67AE85u);var vAB:vec4=vec4(0xFE94F82Bu,0x3C6EF372u,0x5F1D36F1u,0xA54FF53Au);var vCD:vec4=vec4(0xADE682F9u,0x510E527Fu,0x2B3E6C1Fu,0x9B05688Cu);var vEF:vec4=vec4(0x04BE4294u,0xE07C2654u,0x137E2179u,0x5BE0CD19u);var v56:vec4;var vFC:vec4;var v74:vec4;var vDE:vec4;var s0:vec4;var s1:vec4;s0=v01+v45;v01=s0+(vec4(s0(s1(s0(s0(s1>ROTATE_24)|(v45<>ROTATE_24)|(v67<(s0(s1(s0>ROTATE_16)|(vCD<>ROTATE_16)|(vEF<(s0(s1>ROTATE_31).yxwz|(v45<>ROTATE_31).yxwz|(v67<(s0(s1(s0(s1>ROTATE_24)|(v56<>ROTATE_24)|(v74<(s0(s1>ROTATE_16)|(vFC<>ROTATE_16)|(vDE<(s0(s1>ROTATE_31).yxwz|(v56<>ROTATE_31).yxwz|(v74<(s0(s1(s0(s1>ROTATE_24)|(v45<>ROTATE_24)|(v67<(s0(s1>ROTATE_16)|(vCD<>ROTATE_16)|(vEF<(s0(s1>ROTATE_31).yxwz|(v45<>ROTATE_31).yxwz|(v67<(s0(s1(s0(s0(s1>ROTATE_24)|(v56<>ROTATE_24)|(v74<(s0(s1>ROTATE_16)|(vFC<>ROTATE_16)|(vDE<(s0(s1>ROTATE_31).yxwz|(v56<>ROTATE_31).yxwz|(v74<(s0(s1(s0(s1>ROTATE_24)|(v45<>ROTATE_24)|(v67<(s0(s1(s0(s1>ROTATE_16)|(vCD<>ROTATE_16)|(vEF<(s0(s1>ROTATE_31).yxwz|(v45<>ROTATE_31).yxwz|(v67<(s0(s1(s0(s1>ROTATE_24)|(v56<>ROTATE_24)|(v74<(s0(s1>ROTATE_16)|(vFC<>ROTATE_16)|(vDE<(s0(s1>ROTATE_31).yxwz|(v56<>ROTATE_31).yxwz|(v74<(s0(s1(s0(s1>ROTATE_24)|(v45<>ROTATE_24)|(v67<(s0(s1>ROTATE_16)|(vCD<>ROTATE_16)|(vEF<(s0(s1>ROTATE_31).yxwz|(v45<>ROTATE_31).yxwz|(v67<(s0(s1(s0(s1>ROTATE_24)|(v56<>ROTATE_24)|(v74<(s0(s1>ROTATE_16)|(vFC<>ROTATE_16)|(vDE<(s0(s1>ROTATE_31).yxwz|(v56<>ROTATE_31).yxwz|(v74<(s0(s1(s1(s0(s1>ROTATE_24)|(v45<>ROTATE_24)|(v67<(s0(s1>ROTATE_16)|(vCD<>ROTATE_16)|(vEF<(s0(s1>ROTATE_31).yxwz|(v45<>ROTATE_31).yxwz|(v67<(s0(s1(s1(s0(s1>ROTATE_24)|(v56<>ROTATE_24)|(v74<(s0(s1>ROTATE_16)|(vFC<>ROTATE_16)|(vDE<(s0(s1>ROTATE_31).yxwz|(v56<>ROTATE_31).yxwz|(v74<(s0(s1(s0(s1>ROTATE_24)|(v45<>ROTATE_24)|(v67<(s0(s1(s1>ROTATE_16)|(vCD<>ROTATE_16)|(vEF<(s0(s1>ROTATE_31).yxwz|(v45<>ROTA
TE_31).yxwz|(v67<(s0(s1(s0(s1>ROTATE_24)|(v56<>ROTATE_24)|(v74<(s0(s1>ROTATE_16)|(vFC<>ROTATE_16)|(vDE<(s0(s1>ROTATE_31).yxwz|(v56<>ROTATE_31).yxwz|(v74<(s0(s1(s0(s1>ROTATE_24)|(v45<>ROTATE_24)|(v67<(s0(s1>ROTATE_16)|(vCD<>ROTATE_16)|(vEF<(s0(s1>ROTATE_31).yxwz|(v45<>ROTATE_31).yxwz|(v67<(s0(s1(s0(s1>ROTATE_24)|(v56<>ROTATE_24)|(v74<(s0(s1(s1>ROTATE_16)|(vFC<>ROTATE_16)|(vDE<(s0(s1>ROTATE_31).yxwz|(v56<>ROTATE_31).yxwz|(v74<(s0(s1(s1(s0(s1>ROTATE_24)|(v45<>ROTATE_24)|(v67<(s0(s1>ROTATE_16)|(vCD<>ROTATE_16)|(vEF<(s0(s1>ROTATE_31).yxwz|(v45<>ROTATE_31).yxwz|(v67<(s0(s1(s0(s1>ROTATE_24)|(v56<>ROTATE_24)|(v74<(s0(s1>ROTATE_16)|(vFC<>ROTATE_16)|(vDE<(s0(s1>ROTATE_31).yxwz|(v56<>ROTATE_31).yxwz|(v74<(s0(s1(s0(s1>ROTATE_24)|(v45<>ROTATE_24)|(v67<(s0(s1>ROTATE_16)|(vCD<>ROTATE_16)|(vEF<(s0(s1>ROTATE_31).yxwz|(v45<>ROTATE_31).yxwz|(v67<(s0(s1(s0(s1>ROTATE_24)|(v56<>ROTATE_24)|(v74<(s0(s1>ROTATE_16)|(vFC<>ROTATE_16)|(vDE<(s0(s1>ROTATE_31).yxwz|(v56<>ROTATE_31).yxwz|(v74<(s0(s1(s0(s1>ROTATE_24)|(v45<>ROTATE_24)|(v67<(s0(s1(s0>ROTATE_16)|(vCD<>ROTATE_16)|(vEF<(s0(s1>ROTATE_31).yxwz|(v45<>ROTATE_31).yxwz|(v67<(s0(s1(s0(s1>ROTATE_24)|(v56<>ROTATE_24)|(v74<(s0(s1>ROTATE_16)|(vFC<>ROTATE_16)|(vDE<(s0(s1>ROTATE_31).yxwz|(v56<>ROTATE_31).yxwz|(v74<(s0(s1(s0(s0(s1>ROTATE_24)|(v45<>ROTATE_24)|(v67<(s0(s1(s0>ROTATE_16)|(vCD<>ROTATE_16)|(vEF<(s0(s1>ROTATE_31).yxwz|(v45<>ROTATE_31).yxwz|(v67<(s0(s1(s0(s1>ROTATE_24)|(v56<>ROTATE_24)|(v74<(s0(s1>ROTATE_16)|(vFC<>ROTATE_16)|(vDE<(s0(s1>ROTATE_31).yxwz|(v56<>ROTATE_31).yxwz|(v74<(s0(s1(s0(s1>ROTATE_24)|(v45<>ROTATE_24)|(v67<(s0(s1>ROTATE_16)|(vCD<>ROTATE_16)|(vEF<(s0(s1>ROTATE_31).yxwz|(v45<>ROTATE_31).yxwz|(v67<(s0(s1(s0(s0(s1>ROTATE_24)|(v56<>ROTATE_24)|(v74<(s0(s1>ROTATE_16)|(vDE<(s1ubo.threshold&&atomicLoad(&work.found)==0u){atomicStore(&work.found,1u);work.nonce=m0;}return;}"; -// src/shaders/gl-fragment.ts -var NanoPowGlFragmentShader = `#version 300 es +// src/shaders/gl-downsample.ts +var NanoPowGlDownsampleShader = `#version 300 es 
#pragma vscode_glsllint_stage: frag +#ifdef GL_FRAGMENT_PRECISION_HIGH precision highp float; +#else +precision mediump float; +#endif +precision highp int; -in vec2 uv_pos; out uvec4 nonce; -// blockhash - array of precalculated block hash components +// source texture to be downsampled +uniform highp usampler2D src; + +void main() { + nonce = uvec4(0u); + vec2 inputSize = vec2(textureSize(src, 0)); + vec2 texel = vec2(1.0) / inputSize; + vec2 blockCoord = (floor(gl_FragCoord.xy) * 2.0 + vec2(0.5)) / inputSize; + + uvec4 pixel = texture(src, blockCoord); + nonce = pixel.x == 0u ? nonce : pixel; + + pixel = texture(src, blockCoord + vec2(texel.x, 0.0)); + nonce = pixel.x == 0u ? nonce : pixel; + + pixel = texture(src, blockCoord + vec2(0.0, texel.y)); + nonce = pixel.x == 0u ? nonce : pixel; + + pixel = texture(src, blockCoord + vec2(texel.x, texel.y)); + nonce = pixel.x == 0u ? nonce : pixel; +} +`; + +// src/shaders/gl-draw.ts +var NanoPowGlDrawShader = `#version 300 es +#pragma vscode_glsllint_stage: frag +#ifdef GL_FRAGMENT_PRECISION_HIGH +precision highp float; +#else +precision mediump float; +#endif + +out uvec4 nonce; + +// blockhash - Array of precalculated block hash components // threshold - 0xfffffff8 for send/change blocks, 0xfffffe00 for all else -// workload - Defines canvas size +// search - Checks all pixels if true, else only checks 1 pixel to validate layout(std140) uniform UBO { uint blockhash[8]; uint threshold; - float workload; + bool search; }; -// Random work values +// Random work seed values layout(std140) uniform WORK { - uvec2 work; + uvec2 seed; }; -// Defined separately from uint v[32] below as the original value is required +// Defined separately from uint v[0].y below as the original value is required // to calculate the second uint32 of the digest for threshold comparison const uint BLAKE2B_IV32_1 = 0x6A09E667u; // Used during G for vector bit rotations -const uvec2 ROTATE_1 = uvec2(1u, 1u); -const uvec2 ROTATE_8 = uvec2(8u, 8u); 
-const uvec2 ROTATE_16 = uvec2(16u, 16u); -const uvec2 ROTATE_24 = uvec2(24u, 24u); -const uvec2 ROTATE_31 = uvec2(31u, 31u); +const uvec4 ROTATE_1 = uvec4(1u); +const uvec4 ROTATE_8 = uvec4(8u); +const uvec4 ROTATE_16 = uvec4(16u); +const uvec4 ROTATE_24 = uvec4(24u); +const uvec4 ROTATE_31 = uvec4(31u); // Both buffers represent 16 uint64s as 32 uint32s // because that's what GLSL offers, just like Javascript @@ -42,11 +80,10 @@ const uvec2 ROTATE_31 = uvec2(31u, 31u); // OUTLEN is constant 8 bytes // v[0] ^= 0x01010000u ^ uint(OUTLEN); // INLEN is constant 40 bytes: work value (8) + block hash (32) -// v[24] ^= uint(INLEN); +// v[12] ^= uint(INLEN); // It's always the "last" compression at this INLEN -// v[28] = ~v[28]; -// v[29] = ~v[29]; -uvec2 v[16] = uvec2[16]( +// v[14] = ~v[14]; +const uvec2 blake2b_iv[16] = uvec2[16]( uvec2(0xF2BDC900u, 0x6A09E667u), uvec2(0x84CAA73Bu, 0xBB67AE85u), uvec2(0xFE94F82Bu, 0x3C6EF372u), @@ -65,43 +102,51 @@ uvec2 v[16] = uvec2[16]( uvec2(0x137E2179u, 0x5BE0CD19u) ); +// Iterated initialization vector +uvec2 v[16]; + // Input data buffer uvec2 m[16]; -// Offsets into the input data buffer for each mixing step -const uint SIGMA[192] = uint[192]( - 0u,1u,2u,3u,4u,5u,6u,7u,8u,9u,10u,11u,12u,13u,14u,15u, - 14u,10u,4u,8u,9u,15u,13u,6u,1u,12u,0u,2u,11u,7u,5u,3u, - 11u,8u,12u,0u,5u,2u,15u,13u,10u,14u,3u,6u,7u,1u,9u,4u, - 7u,9u,3u,1u,13u,12u,11u,14u,2u,6u,5u,10u,4u,0u,15u,8u, - 9u,0u,5u,7u,2u,4u,10u,15u,14u,1u,11u,12u,6u,8u,3u,13u, - 2u,12u,6u,10u,0u,11u,8u,3u,4u,13u,7u,5u,15u,14u,1u,9u, - 12u,5u,1u,15u,14u,13u,4u,10u,0u,7u,6u,3u,9u,2u,8u,11u, - 13u,11u,7u,14u,12u,1u,3u,9u,5u,0u,15u,4u,8u,6u,2u,10u, - 6u,15u,14u,9u,11u,3u,0u,8u,12u,2u,13u,7u,1u,4u,10u,5u, - 10u,2u,8u,4u,7u,6u,1u,5u,15u,11u,9u,14u,3u,12u,13u,0u, - 0u,1u,2u,3u,4u,5u,6u,7u,8u,9u,10u,11u,12u,13u,14u,15u, - 14u,10u,4u,8u,9u,15u,13u,6u,1u,12u,0u,2u,11u,7u,5u,3u -); +// G mixing function, compressing two subprocesses into one +void G ( + uint a0, uint b0, uint c0, uint d0, 
uvec2 x0, uvec2 y0, + uint a1, uint b1, uint c1, uint d1, uvec2 x1, uvec2 y1 +) { + uvec4 a = uvec4(v[a0], v[a1]); + uvec4 b = uvec4(v[b0], v[b1]); + uvec4 c = uvec4(v[c0], v[c1]); + uvec4 d = uvec4(v[d0], v[d1]); + uvec4 mx = uvec4(x0, x1); + uvec4 my = uvec4(y0, y1); -// G mixing function -void G (uint a, uint b, uint c, uint d, uint x, uint y) { - v[a] = v[a] + v[b] + uvec2(0u, uint(v[a].x + v[b].x < v[b].x)); - v[a] = v[a] + m[x] + uvec2(0u, uint(v[a].x + m[x].x < m[x].x)); - v[d] = (v[d] ^ v[a]).yx; - v[c] = v[c] + v[d] + uvec2(0u, uint(v[c].x + v[d].x < v[d].x)); - v[b] = ((v[b] ^ v[c]) >> ROTATE_24) | ((v[b] ^ v[c]).yx << ROTATE_8); - v[a] = v[a] + v[b] + uvec2(0u, uint(v[a].x + v[b].x < v[b].x)); - v[a] = v[a] + m[y] + uvec2(0u, uint(v[a].x + m[y].x < m[y].x)); - v[d] = ((v[d] ^ v[a]) >> ROTATE_16) | ((v[d] ^ v[a]).yx << ROTATE_16); - v[c] = v[c] + v[d] + uvec2(0u, uint(v[c].x + v[d].x < v[d].x)); - v[b] = ((v[b] ^ v[c]).yx >> ROTATE_31) | ((v[b] ^ v[c]) << ROTATE_1); + a = a + b + uvec4(0u, uint(a.x + b.x < a.x), 0u, uint(a.z + b.z < a.z)); + a = a + mx + uvec4(0u, uint(a.x + mx.x < a.x), 0u, uint(a.z + mx.z < a.z)); + d = (d ^ a).yxwz; + c = c + d + uvec4(0u, uint(c.x + d.x < c.x), 0u, uint(c.z + d.z < c.z)); + b = ((b ^ c) >> ROTATE_24) | ((b ^ c) << ROTATE_8).yxwz; + a = a + b + uvec4(0u, uint(a.x + b.x < b.x), 0u, uint(a.z + b.z < b.z)); + a = a + my + uvec4(0u, uint(a.x + my.x < a.x), 0u, uint(a.z + my.z < a.z)); + d = ((d ^ a) >> ROTATE_16) | ((d ^ a) << ROTATE_16).yxwz; + c = c + d + uvec4(0u, uint(c.x + d.x < c.x), 0u, uint(c.z + d.z < c.z)); + b = ((b ^ c) >> ROTATE_31).yxwz | ((b ^ c) << ROTATE_1); + + v[a0] = a.xy; + v[b0] = b.xy; + v[c0] = c.xy; + v[d0] = d.xy; + v[a1] = a.zw; + v[b1] = b.zw; + v[c1] = c.zw; + v[d1] = d.zw; } void main() { + // Initialize fragment output + nonce = uvec4(0u); + // Nonce uniquely differentiated by pixel location - m[0u].x = work.x ^ uint(uv_pos.x * workload); - m[0u].y = work.y ^ uint(uv_pos.y * workload); + 
m[0u] = seed ^ uvec2(gl_FragCoord); // Block hash m[1u] = uvec2(blockhash[0u], blockhash[1u]); @@ -109,23 +154,91 @@ void main() { m[3u] = uvec2(blockhash[4u], blockhash[5u]); m[4u] = uvec2(blockhash[6u], blockhash[7u]); - // twelve rounds of mixing - for(uint i = 0u; i < 12u; i = i + 1u) { - G(0u, 4u, 8u, 12u, SIGMA[i * 16u + 0u], SIGMA[i * 16u + 1u]); - G(1u, 5u, 9u, 13u, SIGMA[i * 16u + 2u], SIGMA[i * 16u + 3u]); - G(2u, 6u, 10u, 14u, SIGMA[i * 16u + 4u], SIGMA[i * 16u + 5u]); - G(3u, 7u, 11u, 15u, SIGMA[i * 16u + 6u], SIGMA[i * 16u + 7u]); - G(0u, 5u, 10u, 15u, SIGMA[i * 16u + 8u], SIGMA[i * 16u + 9u]); - G(1u, 6u, 11u, 12u, SIGMA[i * 16u + 10u], SIGMA[i * 16u + 11u]); - G(2u, 7u, 8u, 13u, SIGMA[i * 16u + 12u], SIGMA[i * 16u + 13u]); - G(3u, 4u, 9u, 14u, SIGMA[i * 16u + 14u], SIGMA[i * 16u + 15u]); - } + // Reset v + v = blake2b_iv; + + // Twelve rounds of G mixing + + // Round 0 + G(0u, 4u, 8u, 12u, m[0u], m[1u], 1u, 5u, 9u, 13u, m[2u], m[3u]); + G(2u, 6u, 10u, 14u, m[4u], m[5u], 3u, 7u, 11u, 15u, m[6u], m[7u]); + G(0u, 5u, 10u, 15u, m[8u], m[9u], 1u, 6u, 11u, 12u, m[10u], m[11u]); + G(2u, 7u, 8u, 13u, m[12u], m[13u], 3u, 4u, 9u, 14u, m[14u], m[15u]); + + // Round 1 + G(0u, 4u, 8u, 12u, m[14u], m[10u], 1u, 5u, 9u, 13u, m[4u], m[8u]); + G(2u, 6u, 10u, 14u, m[9u], m[15u], 3u, 7u, 11u, 15u, m[13u], m[6u]); + G(0u, 5u, 10u, 15u, m[1u], m[12u], 1u, 6u, 11u, 12u, m[0u], m[2u]); + G(2u, 7u, 8u, 13u, m[11u], m[7u], 3u, 4u, 9u, 14u, m[5u], m[3u]); + + // Round 2 + G(0u, 4u, 8u, 12u, m[11u], m[8u], 1u, 5u, 9u, 13u, m[12u], m[0u]); + G(2u, 6u, 10u, 14u, m[5u], m[2u], 3u, 7u, 11u, 15u, m[15u], m[13u]); + G(0u, 5u, 10u, 15u, m[10u], m[14u], 1u, 6u, 11u, 12u, m[3u], m[6u]); + G(2u, 7u, 8u, 13u, m[7u], m[1u], 3u, 4u, 9u, 14u, m[9u], m[4u]); + + // Round 3 + G(0u, 4u, 8u, 12u, m[7u], m[9u], 1u, 5u, 9u, 13u, m[3u], m[1u]); + G(2u, 6u, 10u, 14u, m[13u], m[12u], 3u, 7u, 11u, 15u, m[11u], m[14u]); + G(0u, 5u, 10u, 15u, m[2u], m[6u], 1u, 6u, 11u, 12u, m[5u], m[10u]); + G(2u, 7u, 
8u, 13u, m[4u], m[0u], 3u, 4u, 9u, 14u, m[15u], m[8u]); + + // Round 4 + G(0u, 4u, 8u, 12u, m[9u], m[0u], 1u, 5u, 9u, 13u, m[5u], m[7u]); + G(2u, 6u, 10u, 14u, m[2u], m[4u], 3u, 7u, 11u, 15u, m[10u], m[15u]); + G(0u, 5u, 10u, 15u, m[14u], m[1u], 1u, 6u, 11u, 12u, m[11u], m[12u]); + G(2u, 7u, 8u, 13u, m[6u], m[8u], 3u, 4u, 9u, 14u, m[3u], m[13u]); - // Pixel data set from work values + // Round 5 + G(0u, 4u, 8u, 12u, m[2u], m[12u], 1u, 5u, 9u, 13u, m[6u], m[10u]); + G(2u, 6u, 10u, 14u, m[0u], m[11u], 3u, 7u, 11u, 15u, m[8u], m[3u]); + G(0u, 5u, 10u, 15u, m[4u], m[13u], 1u, 6u, 11u, 12u, m[7u], m[5u]); + G(2u, 7u, 8u, 13u, m[15u], m[14u], 3u, 4u, 9u, 14u, m[1u], m[9u]); + + // Round 6 + G(0u, 4u, 8u, 12u, m[12u], m[5u], 1u, 5u, 9u, 13u, m[1u], m[15u]); + G(2u, 6u, 10u, 14u, m[14u], m[13u], 3u, 7u, 11u, 15u, m[4u], m[10u]); + G(0u, 5u, 10u, 15u, m[0u], m[7u], 1u, 6u, 11u, 12u, m[6u], m[3u]); + G(2u, 7u, 8u, 13u, m[9u], m[2u], 3u, 4u, 9u, 14u, m[8u], m[11u]); + + // Round 7 + G(0u, 4u, 8u, 12u, m[13u], m[11u], 1u, 5u, 9u, 13u, m[7u], m[14u]); + G(2u, 6u, 10u, 14u, m[12u], m[1u], 3u, 7u, 11u, 15u, m[3u], m[9u]); + G(0u, 5u, 10u, 15u, m[5u], m[0u], 1u, 6u, 11u, 12u, m[15u], m[4u]); + G(2u, 7u, 8u, 13u, m[8u], m[6u], 3u, 4u, 9u, 14u, m[2u], m[10u]); + + // Round 8 + G(0u, 4u, 8u, 12u, m[6u], m[15u], 1u, 5u, 9u, 13u, m[14u], m[9u]); + G(2u, 6u, 10u, 14u, m[11u], m[3u], 3u, 7u, 11u, 15u, m[0u], m[8u]); + G(0u, 5u, 10u, 15u, m[12u], m[2u], 1u, 6u, 11u, 12u, m[13u], m[7u]); + G(2u, 7u, 8u, 13u, m[1u], m[4u], 3u, 4u, 9u, 14u, m[10u], m[5u]); + + // Round 9 + G(0u, 4u, 8u, 12u, m[10u], m[2u], 1u, 5u, 9u, 13u, m[8u], m[4u]); + G(2u, 6u, 10u, 14u, m[7u], m[6u], 3u, 7u, 11u, 15u, m[1u], m[5u]); + G(0u, 5u, 10u, 15u, m[15u], m[11u], 1u, 6u, 11u, 12u, m[9u], m[14u]); + G(2u, 7u, 8u, 13u, m[3u], m[12u], 3u, 4u, 9u, 14u, m[13u], m[0u]); + + // Round 10 + G(0u, 4u, 8u, 12u, m[0u], m[1u], 1u, 5u, 9u, 13u, m[2u], m[3u]); + G(2u, 6u, 10u, 14u, m[4u], m[5u], 3u, 7u, 11u, 15u, m[6u], 
m[7u]); + G(0u, 5u, 10u, 15u, m[8u], m[9u], 1u, 6u, 11u, 12u, m[10u], m[11u]); + G(2u, 7u, 8u, 13u, m[12u], m[13u], 3u, 4u, 9u, 14u, m[14u], m[15u]); + + // Round 11 + G(0u, 4u, 8u, 12u, m[14u], m[10u], 1u, 5u, 9u, 13u, m[4u], m[8u]); + G(2u, 6u, 10u, 14u, m[9u], m[15u], 3u, 7u, 11u, 15u, m[13u], m[6u]); + G(0u, 5u, 10u, 15u, m[1u], m[12u], 1u, 6u, 11u, 12u, m[0u], m[2u]); + G(2u, 7u, 8u, 13u, m[11u], m[7u], 3u, 4u, 9u, 14u, m[5u], m[3u]); + + // Pixel data set from work seed values // Finalize digest from high bits, low bits can be safely ignored - if ((BLAKE2B_IV32_1 ^ v[0u].y ^ v[8u].y) > threshold) { - nonce = uvec4(1u, m[0].y, m[0].x, 1u); - } else { + if ((BLAKE2B_IV32_1 ^ v[0u].y ^ v[8u].y) >= threshold && (search || uvec2(gl_FragCoord) == uvec2(0u))) { + nonce = uvec4(1u, m[0u].y, m[0u].x, (uint(gl_FragCoord.x) << 16u) | uint(gl_FragCoord.y)); + } + + // Valid nonce not found + if (nonce.x == 0u) { discard; } } @@ -134,14 +247,15 @@ void main() { // src/shaders/gl-vertex.ts var NanoPowGlVertexShader = `#version 300 es #pragma vscode_glsllint_stage: vert +#ifdef GL_FRAGMENT_PRECISION_HIGH precision highp float; -layout (location=0) in vec4 position; -layout (location=1) in vec2 uv; +#else +precision mediump float; +#endif -out vec2 uv_pos; +layout (location=0) in vec4 position; void main() { - uv_pos = uv; gl_Position = position; } `; @@ -149,54 +263,40 @@ void main() { // src/classes/gl.ts var NanoPowGl = class _NanoPowGl { static #busy = false; - /** Used to set canvas size. Must be a multiple of 256. */ - static #WORKLOAD = 256 * Math.max(1, Math.floor(navigator.hardwareConcurrency)); + static #debug = false; + static #raf = 0; + /** Used to set canvas size. 
*/ + static #cores = Math.max(1, Math.floor(navigator.hardwareConcurrency)); + static #WORKLOAD = 256 * this.#cores; + static #canvas = new OffscreenCanvas(this.#WORKLOAD, this.#WORKLOAD); + static get size() { + return this.#gl?.drawingBufferWidth; + } static #gl; - static #program; + static #drawProgram; + static #downsampleProgram; static #vertexShader; - static #fragmentShader; - static #texture; - static #framebuffer; + static #drawShader; + static #downsampleShader; static #positionBuffer; - static #uvBuffer; + static #drawFbo; + static #downsampleFbos = []; + static #downsampleSrcLocation; static #uboBuffer; - static #workBuffer; + static #uboView = new DataView(new ArrayBuffer(144)); + static #seedBuffer; + static #seed = new BigUint64Array(1); static #query; static #pixels; /**Vertex Positions, 2 triangles */ static #positions = new Float32Array([ -1, -1, - 0, - -1, - 1, - 0, - 1, - 1, - 0, 1, -1, - 0, 1, 1, - 0, -1, - -1, - 0 - ]); - /** Texture Positions */ - static #uvPosArray = new Float32Array([ - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 1, - 0, - 0, - 1, 1 ]); /** Compile */ @@ -204,88 +304,139 @@ var NanoPowGl = class _NanoPowGl { if (this.#busy) return; this.#busy = true; try { - this.#gl = new OffscreenCanvas(this.#WORKLOAD, this.#WORKLOAD).getContext("webgl2"); + this.#canvas.addEventListener("webglcontextlost", (event) => { + event.preventDefault(); + console.warn("WebGL context lost. Waiting for it to be restored..."); + cancelAnimationFrame(this.#raf); + }, false); + this.#canvas.addEventListener("webglcontextrestored", (event) => { + console.warn("WebGL context restored. 
Reinitializing..."); + _NanoPowGl.init(); + }, false); + this.#gl = this.#canvas.getContext("webgl2"); if (this.#gl == null) throw new Error("WebGL 2 is required"); - this.#gl.clearColor(0, 0, 0, 1); - this.#program = this.#gl.createProgram(); - if (this.#program == null) throw new Error("Failed to create shader program"); + this.#drawProgram = this.#gl.createProgram(); + if (this.#drawProgram == null) throw new Error("Failed to create shader program"); this.#vertexShader = this.#gl.createShader(this.#gl.VERTEX_SHADER); if (this.#vertexShader == null) throw new Error("Failed to create vertex shader"); this.#gl.shaderSource(this.#vertexShader, NanoPowGlVertexShader); this.#gl.compileShader(this.#vertexShader); if (!this.#gl.getShaderParameter(this.#vertexShader, this.#gl.COMPILE_STATUS)) throw new Error(this.#gl.getShaderInfoLog(this.#vertexShader) ?? `Failed to compile vertex shader`); - this.#fragmentShader = this.#gl.createShader(this.#gl.FRAGMENT_SHADER); - if (this.#fragmentShader == null) throw new Error("Failed to create fragment shader"); - this.#gl.shaderSource(this.#fragmentShader, NanoPowGlFragmentShader); - this.#gl.compileShader(this.#fragmentShader); - if (!this.#gl.getShaderParameter(this.#fragmentShader, this.#gl.COMPILE_STATUS)) - throw new Error(this.#gl.getShaderInfoLog(this.#fragmentShader) ?? `Failed to compile fragment shader`); - this.#gl.attachShader(this.#program, this.#vertexShader); - this.#gl.attachShader(this.#program, this.#fragmentShader); - this.#gl.linkProgram(this.#program); - if (!this.#gl.getProgramParameter(this.#program, this.#gl.LINK_STATUS)) - throw new Error(this.#gl.getProgramInfoLog(this.#program) ?? 
`Failed to link program`); - this.#gl.useProgram(this.#program); + this.#drawShader = this.#gl.createShader(this.#gl.FRAGMENT_SHADER); + if (this.#drawShader == null) throw new Error("Failed to create fragment shader"); + this.#gl.shaderSource(this.#drawShader, NanoPowGlDrawShader); + this.#gl.compileShader(this.#drawShader); + if (!this.#gl.getShaderParameter(this.#drawShader, this.#gl.COMPILE_STATUS)) + throw new Error(this.#gl.getShaderInfoLog(this.#drawShader) ?? `Failed to compile fragment shader`); + this.#gl.attachShader(this.#drawProgram, this.#vertexShader); + this.#gl.attachShader(this.#drawProgram, this.#drawShader); + this.#gl.linkProgram(this.#drawProgram); + if (!this.#gl.getProgramParameter(this.#drawProgram, this.#gl.LINK_STATUS)) + throw new Error(this.#gl.getProgramInfoLog(this.#drawProgram) ?? `Failed to link program`); + this.#downsampleProgram = this.#gl.createProgram(); + if (this.#downsampleProgram == null) throw new Error("Failed to create downsample program"); + this.#downsampleShader = this.#gl.createShader(this.#gl.FRAGMENT_SHADER); + if (this.#downsampleShader == null) throw new Error("Failed to create downsample shader"); + this.#gl.shaderSource(this.#downsampleShader, NanoPowGlDownsampleShader); + this.#gl.compileShader(this.#downsampleShader); + if (!this.#gl.getShaderParameter(this.#downsampleShader, this.#gl.COMPILE_STATUS)) + throw new Error(this.#gl.getShaderInfoLog(this.#downsampleShader) ?? `Failed to compile downsample shader`); + this.#gl.attachShader(this.#downsampleProgram, this.#vertexShader); + this.#gl.attachShader(this.#downsampleProgram, this.#downsampleShader); + this.#gl.linkProgram(this.#downsampleProgram); + if (!this.#gl.getProgramParameter(this.#downsampleProgram, this.#gl.LINK_STATUS)) + throw new Error(this.#gl.getProgramInfoLog(this.#downsampleProgram) ?? 
`Failed to link program`); + this.#gl.useProgram(this.#drawProgram); const triangleArray = this.#gl.createVertexArray(); this.#gl.bindVertexArray(triangleArray); - this.#texture = this.#gl.createTexture(); - this.#gl.bindTexture(this.#gl.TEXTURE_2D, this.#texture); - this.#gl.texImage2D(this.#gl.TEXTURE_2D, 0, this.#gl.RGBA32UI, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, 0, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, null); - this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MIN_FILTER, this.#gl.NEAREST); - this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MAG_FILTER, this.#gl.NEAREST); - this.#gl.bindTexture(this.#gl.TEXTURE_2D, null); - this.#framebuffer = this.#gl.createFramebuffer(); - this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer); - this.#gl.framebufferTexture2D(this.#gl.FRAMEBUFFER, this.#gl.COLOR_ATTACHMENT0, this.#gl.TEXTURE_2D, this.#texture, 0); - if (this.#gl.checkFramebufferStatus(this.#gl.FRAMEBUFFER) !== this.#gl.FRAMEBUFFER_COMPLETE) - throw new Error(`Failed to create framebuffer`); - this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null); this.#positionBuffer = this.#gl.createBuffer(); this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, this.#positionBuffer); this.#gl.bufferData(this.#gl.ARRAY_BUFFER, this.#positions, this.#gl.STATIC_DRAW); - this.#gl.vertexAttribPointer(0, 3, this.#gl.FLOAT, false, 0, 0); + this.#gl.vertexAttribPointer(0, 2, this.#gl.FLOAT, false, 0, 0); this.#gl.enableVertexAttribArray(0); this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, null); - this.#uvBuffer = this.#gl.createBuffer(); - this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, this.#uvBuffer); - this.#gl.bufferData(this.#gl.ARRAY_BUFFER, this.#uvPosArray, this.#gl.STATIC_DRAW); - this.#gl.vertexAttribPointer(1, 2, this.#gl.FLOAT, false, 0, 0); - this.#gl.enableVertexAttribArray(1); - this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, null); + const texture = this.#gl.createTexture(); + this.#gl.bindTexture(this.#gl.TEXTURE_2D, texture); + 
this.#gl.texImage2D(this.#gl.TEXTURE_2D, 0, this.#gl.RGBA32UI, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, 0, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, null); + this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MIN_FILTER, this.#gl.NEAREST); + this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MAG_FILTER, this.#gl.NEAREST); + const framebuffer = this.#gl.createFramebuffer(); + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, framebuffer); + this.#gl.framebufferTexture2D(this.#gl.FRAMEBUFFER, this.#gl.COLOR_ATTACHMENT0, this.#gl.TEXTURE_2D, texture, 0); + if (this.#gl.checkFramebufferStatus(this.#gl.FRAMEBUFFER) !== this.#gl.FRAMEBUFFER_COMPLETE) + throw new Error(`Failed to create drawing framebuffer`); + this.#drawFbo = { texture, framebuffer, size: { x: this.#gl.drawingBufferWidth, y: this.#gl.drawingBufferHeight } }; + for (let i = 1; i <= 4; i++) { + const width = this.#gl.drawingBufferWidth / 2 ** i; + const height = this.#gl.drawingBufferHeight / 2 ** i; + const texture2 = this.#gl.createTexture(); + this.#gl.bindTexture(this.#gl.TEXTURE_2D, texture2); + this.#gl.texImage2D(this.#gl.TEXTURE_2D, 0, this.#gl.RGBA32UI, width, height, 0, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, null); + this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MIN_FILTER, this.#gl.NEAREST); + this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MAG_FILTER, this.#gl.NEAREST); + const framebuffer2 = this.#gl.createFramebuffer(); + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, framebuffer2); + this.#gl.framebufferTexture2D(this.#gl.FRAMEBUFFER, this.#gl.COLOR_ATTACHMENT0, this.#gl.TEXTURE_2D, texture2, 0); + if (this.#gl.checkFramebufferStatus(this.#gl.FRAMEBUFFER) !== this.#gl.FRAMEBUFFER_COMPLETE) + throw new Error(`Failed to create downsampling framebuffer ${i}`); + this.#downsampleFbos.push({ texture: texture2, framebuffer: framebuffer2, size: { x: width, y: height } }); + } + this.#downsampleSrcLocation = 
this.#gl.getUniformLocation(this.#downsampleProgram, "src"); + this.#gl.bindTexture(this.#gl.TEXTURE_2D, null); + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null); this.#uboBuffer = this.#gl.createBuffer(); this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#uboBuffer); this.#gl.bufferData(this.#gl.UNIFORM_BUFFER, 144, this.#gl.DYNAMIC_DRAW); this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null); this.#gl.bindBufferBase(this.#gl.UNIFORM_BUFFER, 0, this.#uboBuffer); - this.#gl.uniformBlockBinding(this.#program, this.#gl.getUniformBlockIndex(this.#program, "UBO"), 0); + this.#gl.uniformBlockBinding(this.#drawProgram, this.#gl.getUniformBlockIndex(this.#drawProgram, "UBO"), 0); this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null); - this.#workBuffer = this.#gl.createBuffer(); - this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#workBuffer); - this.#gl.bufferData(this.#gl.UNIFORM_BUFFER, 32, this.#gl.STREAM_DRAW); + this.#seedBuffer = this.#gl.createBuffer(); + this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#seedBuffer); + this.#gl.bufferData(this.#gl.UNIFORM_BUFFER, 16, this.#gl.DYNAMIC_DRAW); this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null); - this.#gl.bindBufferBase(this.#gl.UNIFORM_BUFFER, 1, this.#workBuffer); - this.#gl.uniformBlockBinding(this.#program, this.#gl.getUniformBlockIndex(this.#program, "WORK"), 1); + this.#gl.bindBufferBase(this.#gl.UNIFORM_BUFFER, 1, this.#seedBuffer); + this.#gl.uniformBlockBinding(this.#drawProgram, this.#gl.getUniformBlockIndex(this.#drawProgram, "WORK"), 1); this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null); - this.#pixels = new Uint32Array(this.#gl.drawingBufferWidth * this.#gl.drawingBufferHeight * 4); this.#query = this.#gl.createQuery(); + this.#pixels = new Uint32Array(this.#gl.drawingBufferWidth * this.#gl.drawingBufferHeight * 4); + console.log(`NanoPow WebGL initialized at ${this.#gl.drawingBufferWidth}x${this.#gl.drawingBufferHeight}. 
Maximum nonces checked per frame: ${this.#gl.drawingBufferWidth * this.#gl.drawingBufferHeight}`); } catch (err) { - throw new Error(`WebGL initialization failed. ${err}`); + throw new Error("WebGL initialization failed.", { cause: err }); } finally { this.#busy = false; } } static reset() { + cancelAnimationFrame(_NanoPowGl.#raf); + _NanoPowGl.#gl?.deleteQuery(_NanoPowGl.#query); _NanoPowGl.#query = null; - _NanoPowGl.#workBuffer = null; + _NanoPowGl.#gl?.deleteBuffer(_NanoPowGl.#seedBuffer); + _NanoPowGl.#seedBuffer = null; + _NanoPowGl.#gl?.deleteBuffer(_NanoPowGl.#uboBuffer); _NanoPowGl.#uboBuffer = null; - _NanoPowGl.#uvBuffer = null; + for (const fbo of _NanoPowGl.#downsampleFbos) { + _NanoPowGl.#gl?.deleteFramebuffer(fbo.framebuffer); + _NanoPowGl.#gl?.deleteTexture(fbo.texture); + } + _NanoPowGl.#downsampleFbos = []; + _NanoPowGl.#gl?.deleteShader(_NanoPowGl.#downsampleShader); + _NanoPowGl.#downsampleShader = null; + _NanoPowGl.#gl?.deleteProgram(_NanoPowGl.#downsampleProgram); + _NanoPowGl.#downsampleProgram = null; + _NanoPowGl.#gl?.deleteFramebuffer(_NanoPowGl.#drawFbo?.framebuffer ?? 
null); + _NanoPowGl.#drawFbo = null; + _NanoPowGl.#gl?.deleteTexture(_NanoPowGl.#drawFbo); + _NanoPowGl.#drawFbo = null; + _NanoPowGl.#gl?.deleteBuffer(_NanoPowGl.#positionBuffer); _NanoPowGl.#positionBuffer = null; - _NanoPowGl.#framebuffer = null; - _NanoPowGl.#texture = null; - _NanoPowGl.#fragmentShader = null; + _NanoPowGl.#gl?.deleteShader(_NanoPowGl.#drawShader); + _NanoPowGl.#drawShader = null; + _NanoPowGl.#gl?.deleteShader(_NanoPowGl.#vertexShader); _NanoPowGl.#vertexShader = null; - _NanoPowGl.#program = null; + _NanoPowGl.#gl?.deleteProgram(_NanoPowGl.#drawProgram); + _NanoPowGl.#drawProgram = null; _NanoPowGl.#gl = null; _NanoPowGl.#busy = false; _NanoPowGl.init(); @@ -312,36 +463,41 @@ var NanoPowGl = class _NanoPowGl { "Harmonic Mean (ms)": count / reciprocals, "Geometric Mean (ms)": Math.exp(logarithms / count) }; + console.log(`Averages: ${JSON.stringify(averages)}`); console.table(averages); } - static #draw(work) { + static #draw(seed) { if (this.#gl == null || this.#query == null) throw new Error("WebGL 2 is required to draw and query pixels"); - if (this.#workBuffer == null) throw new Error("Work buffer is required to draw"); - this.#gl.clear(this.#gl.COLOR_BUFFER_BIT); - this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#workBuffer); - this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, work); + if (this.#drawFbo == null) throw new Error("FBO is required to draw"); + if (this.#seed[0] == null || this.#seedBuffer == null) throw new Error("Seed is required to draw"); + this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#seedBuffer); + this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, seed); this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null); + this.#gl.useProgram(this.#drawProgram); + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#drawFbo.framebuffer); + this.#gl.activeTexture(this.#gl.TEXTURE0); + this.#gl.bindTexture(this.#gl.TEXTURE_2D, this.#drawFbo.texture); this.#gl.beginQuery(this.#gl.ANY_SAMPLES_PASSED_CONSERVATIVE, this.#query); - 
this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer); - this.#gl.drawArrays(this.#gl.TRIANGLES, 0, 6); + this.#gl.viewport(0, 0, this.#drawFbo.size.x, this.#drawFbo.size.y); + this.#gl.drawArrays(this.#gl.TRIANGLES, 0, 4); this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null); this.#gl.endQuery(this.#gl.ANY_SAMPLES_PASSED_CONSERVATIVE); } static async #checkQueryResult() { return new Promise((resolve, reject) => { - try { - if (this.#gl == null || this.#query == null) throw new Error("WebGL 2 is required to check query results"); - if (this.#gl.getQueryParameter(this.#query, this.#gl.QUERY_RESULT_AVAILABLE)) { - resolve(!!this.#gl.getQueryParameter(this.#query, this.#gl.QUERY_RESULT)); - } else { - requestAnimationFrame(async () => { - const result = await _NanoPowGl.#checkQueryResult(); - resolve(result); - }); + function check() { + try { + if (_NanoPowGl.#gl == null || _NanoPowGl.#query == null) throw new Error("WebGL 2 is required to check query results"); + if (_NanoPowGl.#gl.getQueryParameter(_NanoPowGl.#query, _NanoPowGl.#gl.QUERY_RESULT_AVAILABLE)) { + resolve(!!_NanoPowGl.#gl.getQueryParameter(_NanoPowGl.#query, _NanoPowGl.#gl.QUERY_RESULT)); + } else { + _NanoPowGl.#raf = requestAnimationFrame(check); + } + } catch (err) { + reject(err); } - } catch (err) { - reject(err); } + check(); }); } /** @@ -354,11 +510,34 @@ var NanoPowGl = class _NanoPowGl { */ static #readResult(workHex) { if (this.#gl == null) throw new Error("WebGL 2 is required to read pixels"); - this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer); - this.#gl.readPixels(0, 0, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, this.#pixels); + if (this.#drawFbo == null) throw new Error("Source FBO is required to downsample"); + let source = this.#drawFbo; + let pixelCount; + const start = performance.now(); + if (workHex != null) { + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, source.framebuffer); + 
this.#gl.readPixels(0, 0, 1, 1, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, this.#pixels); + pixelCount = 4; + } else { + this.#gl.useProgram(this.#downsampleProgram); + for (const fbo of this.#downsampleFbos) { + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, fbo.framebuffer); + this.#gl.activeTexture(this.#gl.TEXTURE0); + this.#gl.bindTexture(this.#gl.TEXTURE_2D, source.texture); + this.#gl.uniform1i(this.#downsampleSrcLocation, 0); + this.#gl.viewport(0, 0, fbo.size.x, fbo.size.y); + this.#gl.drawArrays(this.#gl.TRIANGLES, 0, 4); + source = fbo; + } + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, source.framebuffer); + this.#gl.readPixels(0, 0, source.size.x, source.size.y, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, this.#pixels); + pixelCount = source.size.x * source.size.y * 4; + } this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null); - for (let i = 0; i < this.#pixels.length; i += 4) { + for (let i = 0; i < pixelCount; i += 4) { if (this.#pixels[i] !== 0) { + if (this.#debug) console.log(`readResults (${performance.now() - start} ms)`); + if (this.#debug) console.log(`Pixel: rgba(${this.#pixels[i]}, ${this.#pixels[i + 1]}, ${this.#pixels[i + 2]}, ${this.#pixels[i + 3].toString(16).padStart(8, "0")})`); const hex = `${this.#pixels[i + 1].toString(16).padStart(8, "0")}${this.#pixels[i + 2].toString(16).padStart(8, "0")}`; if (workHex == null || workHex == hex) return hex; } @@ -372,10 +551,8 @@ var NanoPowGl = class _NanoPowGl { * @param {number} [threshold=0xfffffff8] - Difficulty of proof-of-work calculation */ static async search(hash, options) { - if (_NanoPowGl.#gl == null) throw new Error("WebGL 2 is required"); - if (this.#gl == null) throw new Error("WebGL 2 is required"); - if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`); if (this.#busy) { + console.log("NanoPowGl is busy. 
Retrying search..."); return new Promise((resolve) => { setTimeout(async () => { const result = this.search(hash, options); @@ -384,39 +561,53 @@ var NanoPowGl = class _NanoPowGl { }); } this.#busy = true; + if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`); const threshold = typeof options?.threshold !== "number" || options.threshold < 0 || options.threshold > 4294967295 ? 4294967288 : options.threshold; - const effort = typeof options?.effort !== "number" || options.effort < 1 || options.effort > 32 ? 8 : options.effort; - const debug = !!options?.debug; + const effort = typeof options?.effort !== "number" || options.effort < 1 || options.effort > 32 ? this.#cores : options.effort; + this.#debug = !!options?.debug; if (this.#WORKLOAD !== 256 * effort) { this.#WORKLOAD = 256 * effort; + this.#canvas.height = this.#WORKLOAD; + this.#canvas.width = this.#WORKLOAD; this.reset(); } - const uboView = new DataView(new ArrayBuffer(144)); + if (_NanoPowGl.#gl == null) throw new Error("WebGL 2 is required"); + if (this.#gl == null) throw new Error("WebGL 2 is required"); + if (this.#drawFbo == null) throw new Error("WebGL framebuffer is required"); + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#drawFbo.framebuffer); + this.#gl.clearBufferuiv(this.#gl.COLOR, 0, [0, 0, 0, 0]); + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null); + for (let i = 0; i < this.#uboView.byteLength; i++) this.#uboView.setUint8(i, 0); for (let i = 0; i < 64; i += 8) { const uint32 = hash.slice(i, i + 8); - uboView.setUint32(i * 2, parseInt(uint32, 16)); + this.#uboView.setUint32(i * 2, parseInt(uint32, 16)); } - uboView.setUint32(128, threshold, true); - uboView.setFloat32(132, 256 * effort, true); - _NanoPowGl.#gl.bindBuffer(_NanoPowGl.#gl.UNIFORM_BUFFER, _NanoPowGl.#uboBuffer); - _NanoPowGl.#gl.bufferSubData(_NanoPowGl.#gl.UNIFORM_BUFFER, 0, uboView); - _NanoPowGl.#gl.bindBuffer(_NanoPowGl.#gl.UNIFORM_BUFFER, null); + this.#uboView.setUint32(128, threshold, 
true); + this.#uboView.setUint32(132, 1, true); + if (this.#debug) console.log("UBO", this.#uboView.buffer.slice(0)); + this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#uboBuffer); + this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, this.#uboView); + this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null); let times = []; let start = performance.now(); let nonce = null; - const seed = new Uint8Array(8); + if (this.#debug) console.groupCollapsed("Seeds (click to view)"); while (nonce == null) { start = performance.now(); - crypto.getRandomValues(seed); - this.#draw(seed); + const random0 = Math.floor(Math.random() * 4294967295); + const random1 = Math.floor(Math.random() * 4294967295); + this.#seed[0] = BigInt(random0) << 32n | BigInt(random1); + if (this.#debug) console.log("Seed", this.#seed); + this.#draw(this.#seed); const found = await this.#checkQueryResult(); times.push(performance.now() - start); if (found) { + if (this.#debug) console.groupEnd(); nonce = this.#readResult(); } } this.#busy = false; - if (debug) this.#logAverages(times); + if (this.#debug) this.#logAverages(times); return nonce; } /** @@ -427,11 +618,8 @@ var NanoPowGl = class _NanoPowGl { * @param {number} [threshold=0xfffffff8] - Difficulty of proof-of-work calculation */ static async validate(work, hash, options) { - if (_NanoPowGl.#gl == null) throw new Error("WebGL 2 is required"); - if (this.#gl == null) throw new Error("WebGL 2 is required"); - if (!/^[A-Fa-f0-9]{16}$/.test(work)) throw new Error(`Invalid work ${work}`); - if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`); if (this.#busy) { + console.log("NanoPowGl is busy. 
Retrying validate..."); return new Promise((resolve) => { setTimeout(async () => { const result = this.validate(work, hash, options); @@ -440,27 +628,31 @@ var NanoPowGl = class _NanoPowGl { }); } this.#busy = true; + if (!/^[A-Fa-f0-9]{16}$/.test(work)) throw new Error(`Invalid work ${work}`); + if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`); const threshold = typeof options?.threshold !== "number" || options.threshold < 0 || options.threshold > 4294967295 ? 4294967288 : options.threshold; - const debug = !!options?.debug; - if (this.#WORKLOAD !== 1) { - this.#WORKLOAD = 1; - this.reset(); - } - const uboView = new DataView(new ArrayBuffer(144)); + this.#debug = !!options?.debug; + if (_NanoPowGl.#gl == null) throw new Error("WebGL 2 is required"); + if (this.#gl == null) throw new Error("WebGL 2 is required"); + if (this.#drawFbo == null) throw new Error("WebGL framebuffer is required"); + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#drawFbo.framebuffer); + this.#gl.clearBufferuiv(this.#gl.COLOR, 0, [0, 0, 0, 0]); + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null); + for (let i = 0; i < this.#uboView.byteLength; i++) this.#uboView.setUint8(i, 0); for (let i = 0; i < 64; i += 8) { const uint32 = hash.slice(i, i + 8); - uboView.setUint32(i * 2, parseInt(uint32, 16)); + this.#uboView.setUint32(i * 2, parseInt(uint32, 16)); } - uboView.setUint32(128, threshold, true); - uboView.setFloat32(132, _NanoPowGl.#WORKLOAD - 1, true); - _NanoPowGl.#gl.bindBuffer(_NanoPowGl.#gl.UNIFORM_BUFFER, _NanoPowGl.#uboBuffer); - _NanoPowGl.#gl.bufferSubData(_NanoPowGl.#gl.UNIFORM_BUFFER, 0, uboView); - _NanoPowGl.#gl.bindBuffer(_NanoPowGl.#gl.UNIFORM_BUFFER, null); + this.#uboView.setUint32(128, threshold, true); + this.#uboView.setUint32(132, 0, true); + if (this.#debug) console.log("UBO", this.#uboView.buffer.slice(0)); + this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#uboBuffer); + this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, 
this.#uboView); + this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null); let nonce = null; - const data = new DataView(new ArrayBuffer(8)); - data.setBigUint64(0, BigInt(`0x${work}`), true); - const seed = new Uint8Array(data.buffer); - this.#draw(seed); + this.#seed[0] = BigInt(`0x${work}`); + if (this.#debug) console.log("Work", this.#seed); + this.#draw(this.#seed); let found = await this.#checkQueryResult(); if (found) { try { @@ -479,10 +671,13 @@ var NanoPowGl = class _NanoPowGl { var NanoPowGpu = class _NanoPowGpu { // Initialize WebGPU static #busy = false; + static #debug = false; static #device = null; - static #uboBuffer; + static #gpuBufferReset = new BigUint64Array([0n, 0n]); static #gpuBuffer; static #cpuBuffer; + static #uboBuffer; + static #uboView; static #bindGroupLayout; static #searchPipeline; static #validatePipeline; @@ -500,17 +695,13 @@ var NanoPowGpu = class _NanoPowGpu { this.#device = device; this.setup(); } catch (err) { - throw new Error(`WebGPU initialization failed. ${err}`); + throw new Error("WebGPU initialization failed.", { cause: err }); } finally { this.#busy = false; } } static setup() { if (this.#device == null) throw new Error(`WebGPU device failed to load.`); - this.#uboBuffer = this.#device.createBuffer({ - size: 48, - usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST - }); this.#gpuBuffer = this.#device.createBuffer({ size: 16, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC @@ -519,6 +710,11 @@ var NanoPowGpu = class _NanoPowGpu { size: 16, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ }); + this.#uboBuffer = this.#device.createBuffer({ + size: 48, + usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST + }); + this.#uboView = new DataView(new ArrayBuffer(48)); this.#bindGroupLayout = this.#device.createBindGroupLayout({ entries: [ { @@ -554,6 +750,7 @@ var NanoPowGpu = class _NanoPowGpu { module: shaderModule } }); + console.log(`NanoPow WebGPU initialized. 
Recommended effort: ${Math.max(1, Math.floor(navigator.hardwareConcurrency / 2))}`); } static reset() { console.warn(`GPU device lost. Reinitializing...`); @@ -596,15 +793,16 @@ var NanoPowGpu = class _NanoPowGpu { } static async #dispatch(pipeline, seed, hash, threshold, passes) { if (this.#device == null) throw new Error(`WebGPU device failed to load.`); - const uboView = new DataView(new ArrayBuffer(48)); + for (let i = 0; i < this.#uboView.byteLength; i++) this.#uboView.setUint8(i, 0); for (let i = 0; i < 64; i += 16) { const u64 = hash.slice(i, i + 16); - uboView.setBigUint64(i / 2, BigInt(`0x${u64}`)); + this.#uboView.setBigUint64(i / 2, BigInt(`0x${u64}`)); } - uboView.setBigUint64(32, seed, true); - uboView.setUint32(40, threshold, true); - this.#device.queue.writeBuffer(this.#uboBuffer, 0, uboView); - this.#device.queue.writeBuffer(this.#gpuBuffer, 0, new Uint32Array([0, 0, 0])); + this.#uboView.setBigUint64(32, seed, true); + this.#uboView.setUint32(40, threshold, true); + if (this.#debug) console.log("UBO", this.#uboView); + this.#device.queue.writeBuffer(this.#uboBuffer, 0, this.#uboView); + this.#device.queue.writeBuffer(this.#gpuBuffer, 0, this.#gpuBufferReset); const bindGroup = this.#device.createBindGroup({ layout: this.#bindGroupLayout, entries: [ @@ -640,6 +838,7 @@ var NanoPowGpu = class _NanoPowGpu { console.warn(`Error getting data from GPU. ${err}`); return this.#dispatch(pipeline, seed, hash, threshold, passes); } + if (this.#debug) console.log("gpuBuffer data", data); if (data == null) throw new Error(`Failed to get data from buffer.`); return data; } @@ -652,6 +851,7 @@ var NanoPowGpu = class _NanoPowGpu { static async search(hash, options) { if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new TypeError(`Invalid hash ${hash}`); if (this.#busy) { + console.log("NanoPowGpu is busy. 
Retrying search..."); return new Promise((resolve) => { setTimeout(async () => { const result = this.search(hash, options); @@ -662,7 +862,7 @@ var NanoPowGpu = class _NanoPowGpu { this.#busy = true; const threshold = typeof options?.threshold !== "number" || options.threshold < 0 || options.threshold > 4294967295 ? 4294967288 : options.threshold; const effort = typeof options?.effort !== "number" || options.effort < 1 || options.effort > 32 ? 2048 : options.effort * 256; - const debug = !!options?.debug; + this.#debug = !!options?.debug; let loads = 0; while (this.#device == null && loads < 20) { await new Promise((resolve) => { @@ -678,14 +878,16 @@ var NanoPowGpu = class _NanoPowGpu { let nonce = 0n; do { start = performance.now(); - const random = Math.floor(Math.random() * 4294967295); - const seed = BigInt(random) << 32n | BigInt(random); + const random0 = Math.floor(Math.random() * 4294967295); + const random1 = Math.floor(Math.random() * 4294967295); + const seed = BigInt(random0) << 32n | BigInt(random1); + if (this.#debug) console.log(`seed: ${seed}`); const data = await this.#dispatch(this.#searchPipeline, seed, hash, threshold, effort); nonce = data.getBigUint64(0, true); this.#busy = !data.getUint32(8); times.push(performance.now() - start); } while (this.#busy); - if (debug) this.#logAverages(times); + if (this.#debug) this.#logAverages(times); return nonce.toString(16).padStart(16, "0"); } /** @@ -699,6 +901,7 @@ var NanoPowGpu = class _NanoPowGpu { if (!/^[A-Fa-f0-9]{16}$/.test(work)) throw new TypeError(`Invalid work ${work}`); if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new TypeError(`Invalid hash ${hash}`); if (this.#busy) { + console.log("NanoPowGpu is busy. 
Retrying validate..."); return new Promise((resolve) => { setTimeout(async () => { const result = this.validate(work, hash, options); @@ -707,7 +910,7 @@ var NanoPowGpu = class _NanoPowGpu { }); } this.#busy = true; - const debug = !!options?.debug; + this.#debug = !!options?.debug; const threshold = typeof options?.threshold !== "number" || options.threshold < 0 || options.threshold > 4294967295 ? 4294967288 : options.threshold; let loads = 0; while (this.#device == null && loads < 20) { @@ -720,8 +923,10 @@ var NanoPowGpu = class _NanoPowGpu { throw new Error(`WebGPU device failed to load.`); } const seed = BigInt(`0x${work}`); + if (this.#debug) console.log(`work: ${work}`); const data = await this.#dispatch(this.#validatePipeline, seed, hash, threshold, 1); const nonce = data.getBigUint64(0, true).toString(16).padStart(16, "0"); + if (this.#debug) console.log(`nonce: ${nonce}`); const found = !!data.getUint32(8); this.#busy = false; if (found && work !== nonce) throw new Error(`Nonce (${nonce}) found but does not match work (${work})`); @@ -736,16 +941,14 @@ try { await NanoPowGpu.init(); isGpuSupported = true; } catch (err) { - console.error(err); - console.warn(`WebGPU is not supported in this environment.`); + console.warn("WebGPU is not supported in this environment.\n", err); isGpuSupported = false; } try { await NanoPowGl.init(); isGlSupported = true; } catch (err) { - console.error(err); - console.warn(`WebGL is not supported in this environment.`); + console.warn("WebGL is not supported in this environment.\n", err); isGlSupported = false; } var NanoPow = isGpuSupported ? NanoPowGpu : isGlSupported ? 
NanoPowGl : null; diff --git a/src/classes/gl.ts b/src/classes/gl.ts index fc67d06..520ab89 100644 --- a/src/classes/gl.ts +++ b/src/classes/gl.ts @@ -2,34 +2,39 @@ // SPDX-FileContributor: Ben Green // SPDX-License-Identifier: GPL-3.0-or-later AND MIT -import { NanoPowGlFragmentShader, NanoPowGlVertexShader } from '../shaders' -import type { NanoPowOptions } from '../../types.d.ts' +import { NanoPowGlDownsampleShader, NanoPowGlDrawShader, NanoPowGlVertexShader } from '../shaders' +import type { FBO, NanoPowOptions } from '../../types.d.ts' export class NanoPowGl { static #busy: boolean = false - /** Used to set canvas size. Must be a multiple of 256. */ - static #WORKLOAD: number = 256 * Math.max(1, Math.floor(navigator.hardwareConcurrency)) + static #debug: boolean = false + static #raf: number = 0 + /** Used to set canvas size. */ + static #cores: number = Math.max(1, Math.floor(navigator.hardwareConcurrency)) + static #WORKLOAD: number = 256 * this.#cores + static #canvas: OffscreenCanvas = new OffscreenCanvas(this.#WORKLOAD, this.#WORKLOAD) + static get size () { return this.#gl?.drawingBufferWidth } static #gl: WebGL2RenderingContext | null - static #program: WebGLProgram | null + static #drawProgram: WebGLProgram | null + static #downsampleProgram: WebGLProgram | null static #vertexShader: WebGLShader | null - static #fragmentShader: WebGLShader | null - static #texture: WebGLTexture | null - static #framebuffer: WebGLFramebuffer | null + static #drawShader: WebGLShader | null + static #downsampleShader: WebGLShader | null static #positionBuffer: WebGLBuffer | null - static #uvBuffer: WebGLBuffer | null + static #drawFbo: FBO | null + static #downsampleFbos: FBO[] = [] + static #downsampleSrcLocation: WebGLUniformLocation | null static #uboBuffer: WebGLBuffer | null - static #workBuffer: WebGLBuffer | null + static #uboView: DataView = new DataView(new ArrayBuffer(144)) + static #seedBuffer: WebGLBuffer | null + static #seed: BigUint64Array = new 
BigUint64Array(1) static #query: WebGLQuery | null static #pixels: Uint32Array + /**Vertex Positions, 2 triangles */ - static #positions = new Float32Array([ - -1, -1, 0, -1, 1, 0, 1, 1, 0, - 1, -1, 0, 1, 1, 0, -1, -1, 0 - ]) - /** Texture Positions */ - static #uvPosArray = new Float32Array([ - 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1 + static #positions: Float32Array = new Float32Array([ + -1, -1, 1, -1, 1, 1, -1, 1 ]) /** Compile */ @@ -38,12 +43,21 @@ export class NanoPowGl { this.#busy = true try { - this.#gl = new OffscreenCanvas(this.#WORKLOAD, this.#WORKLOAD).getContext('webgl2') + this.#canvas.addEventListener('webglcontextlost', event => { + event.preventDefault() + console.warn('WebGL context lost. Waiting for it to be restored...') + cancelAnimationFrame(this.#raf) + }, false) + this.#canvas.addEventListener('webglcontextrestored', event => { + console.warn('WebGL context restored. Reinitializing...') + NanoPowGl.init() + }, false) + this.#gl = this.#canvas.getContext('webgl2') if (this.#gl == null) throw new Error('WebGL 2 is required') - this.#gl.clearColor(0, 0, 0, 1) - this.#program = this.#gl.createProgram() - if (this.#program == null) throw new Error('Failed to create shader program') + /** Create drawing program */ + this.#drawProgram = this.#gl.createProgram() + if (this.#drawProgram == null) throw new Error('Failed to create shader program') this.#vertexShader = this.#gl.createShader(this.#gl.VERTEX_SHADER) if (this.#vertexShader == null) throw new Error('Failed to create vertex shader') @@ -52,88 +66,141 @@ export class NanoPowGl { if (!this.#gl.getShaderParameter(this.#vertexShader, this.#gl.COMPILE_STATUS)) throw new Error(this.#gl.getShaderInfoLog(this.#vertexShader) ?? 
`Failed to compile vertex shader`) - this.#fragmentShader = this.#gl.createShader(this.#gl.FRAGMENT_SHADER) - if (this.#fragmentShader == null) throw new Error('Failed to create fragment shader') - this.#gl.shaderSource(this.#fragmentShader, NanoPowGlFragmentShader) - this.#gl.compileShader(this.#fragmentShader) - if (!this.#gl.getShaderParameter(this.#fragmentShader, this.#gl.COMPILE_STATUS)) - throw new Error(this.#gl.getShaderInfoLog(this.#fragmentShader) ?? `Failed to compile fragment shader`) - - this.#gl.attachShader(this.#program, this.#vertexShader) - this.#gl.attachShader(this.#program, this.#fragmentShader) - this.#gl.linkProgram(this.#program) - if (!this.#gl.getProgramParameter(this.#program, this.#gl.LINK_STATUS)) - throw new Error(this.#gl.getProgramInfoLog(this.#program) ?? `Failed to link program`) - - /** Construct simple 2D geometry */ - this.#gl.useProgram(this.#program) + this.#drawShader = this.#gl.createShader(this.#gl.FRAGMENT_SHADER) + if (this.#drawShader == null) throw new Error('Failed to create fragment shader') + this.#gl.shaderSource(this.#drawShader, NanoPowGlDrawShader) + this.#gl.compileShader(this.#drawShader) + if (!this.#gl.getShaderParameter(this.#drawShader, this.#gl.COMPILE_STATUS)) + throw new Error(this.#gl.getShaderInfoLog(this.#drawShader) ?? `Failed to compile fragment shader`) + + this.#gl.attachShader(this.#drawProgram, this.#vertexShader) + this.#gl.attachShader(this.#drawProgram, this.#drawShader) + this.#gl.linkProgram(this.#drawProgram) + if (!this.#gl.getProgramParameter(this.#drawProgram, this.#gl.LINK_STATUS)) + throw new Error(this.#gl.getProgramInfoLog(this.#drawProgram) ?? 
`Failed to link program`) + + /** Create downsampling program */ + this.#downsampleProgram = this.#gl.createProgram() + if (this.#downsampleProgram == null) throw new Error('Failed to create downsample program') + + this.#downsampleShader = this.#gl.createShader(this.#gl.FRAGMENT_SHADER) + if (this.#downsampleShader == null) throw new Error('Failed to create downsample shader') + this.#gl.shaderSource(this.#downsampleShader, NanoPowGlDownsampleShader) + this.#gl.compileShader(this.#downsampleShader) + if (!this.#gl.getShaderParameter(this.#downsampleShader, this.#gl.COMPILE_STATUS)) + throw new Error(this.#gl.getShaderInfoLog(this.#downsampleShader) ?? `Failed to compile downsample shader`) + + this.#gl.attachShader(this.#downsampleProgram, this.#vertexShader) + this.#gl.attachShader(this.#downsampleProgram, this.#downsampleShader) + this.#gl.linkProgram(this.#downsampleProgram) + if (!this.#gl.getProgramParameter(this.#downsampleProgram, this.#gl.LINK_STATUS)) + throw new Error(this.#gl.getProgramInfoLog(this.#downsampleProgram) ?? 
`Failed to link program`) + + /** Construct fullscreen quad for rendering */ + this.#gl.useProgram(this.#drawProgram) const triangleArray = this.#gl.createVertexArray() this.#gl.bindVertexArray(triangleArray) - this.#texture = this.#gl.createTexture() - this.#gl.bindTexture(this.#gl.TEXTURE_2D, this.#texture) - this.#gl.texImage2D(this.#gl.TEXTURE_2D, 0, this.#gl.RGBA32UI, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, 0, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, null) - this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MIN_FILTER, this.#gl.NEAREST) - this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MAG_FILTER, this.#gl.NEAREST) - this.#gl.bindTexture(this.#gl.TEXTURE_2D, null) - - this.#framebuffer = this.#gl.createFramebuffer() - this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer) - this.#gl.framebufferTexture2D(this.#gl.FRAMEBUFFER, this.#gl.COLOR_ATTACHMENT0, this.#gl.TEXTURE_2D, this.#texture, 0) - if (this.#gl.checkFramebufferStatus(this.#gl.FRAMEBUFFER) !== this.#gl.FRAMEBUFFER_COMPLETE) - throw new Error(`Failed to create framebuffer`) - this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null) - this.#positionBuffer = this.#gl.createBuffer() this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, this.#positionBuffer) this.#gl.bufferData(this.#gl.ARRAY_BUFFER, this.#positions, this.#gl.STATIC_DRAW) - this.#gl.vertexAttribPointer(0, 3, this.#gl.FLOAT, false, 0, 0) + this.#gl.vertexAttribPointer(0, 2, this.#gl.FLOAT, false, 0, 0) this.#gl.enableVertexAttribArray(0) this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, null) - this.#uvBuffer = this.#gl.createBuffer() - this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, this.#uvBuffer) - this.#gl.bufferData(this.#gl.ARRAY_BUFFER, this.#uvPosArray, this.#gl.STATIC_DRAW) - this.#gl.vertexAttribPointer(1, 2, this.#gl.FLOAT, false, 0, 0) - this.#gl.enableVertexAttribArray(1) - this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, null) + /** Create texture and framebuffer for drawing */ + const texture = 
this.#gl.createTexture() + this.#gl.bindTexture(this.#gl.TEXTURE_2D, texture) + this.#gl.texImage2D(this.#gl.TEXTURE_2D, 0, this.#gl.RGBA32UI, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, 0, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, null) + this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MIN_FILTER, this.#gl.NEAREST) + this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MAG_FILTER, this.#gl.NEAREST) + const framebuffer = this.#gl.createFramebuffer() + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, framebuffer) + this.#gl.framebufferTexture2D(this.#gl.FRAMEBUFFER, this.#gl.COLOR_ATTACHMENT0, this.#gl.TEXTURE_2D, texture, 0) + if (this.#gl.checkFramebufferStatus(this.#gl.FRAMEBUFFER) !== this.#gl.FRAMEBUFFER_COMPLETE) + throw new Error(`Failed to create drawing framebuffer`) + this.#drawFbo = { texture, framebuffer, size: { x: this.#gl.drawingBufferWidth, y: this.#gl.drawingBufferHeight } } + + /** Create textures, framebuffers, and uniform location for downsampling */ + for (let i = 1; i <= 4; i++) { + const width = this.#gl.drawingBufferWidth / (2 ** i) + const height = this.#gl.drawingBufferHeight / (2 ** i) + + const texture = this.#gl.createTexture() + this.#gl.bindTexture(this.#gl.TEXTURE_2D, texture) + this.#gl.texImage2D(this.#gl.TEXTURE_2D, 0, this.#gl.RGBA32UI, width, height, 0, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, null) + this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MIN_FILTER, this.#gl.NEAREST) + this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MAG_FILTER, this.#gl.NEAREST) + + const framebuffer = this.#gl.createFramebuffer() + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, framebuffer) + this.#gl.framebufferTexture2D(this.#gl.FRAMEBUFFER, this.#gl.COLOR_ATTACHMENT0, this.#gl.TEXTURE_2D, texture, 0) + if (this.#gl.checkFramebufferStatus(this.#gl.FRAMEBUFFER) !== this.#gl.FRAMEBUFFER_COMPLETE) + throw new Error(`Failed to create downsampling framebuffer ${i}`) + 
this.#downsampleFbos.push({ texture, framebuffer, size: { x: width, y: height } }) + } + this.#downsampleSrcLocation = this.#gl.getUniformLocation(this.#downsampleProgram, 'src') + this.#gl.bindTexture(this.#gl.TEXTURE_2D, null) + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null) + + /** Create input buffers */ this.#uboBuffer = this.#gl.createBuffer() this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#uboBuffer) this.#gl.bufferData(this.#gl.UNIFORM_BUFFER, 144, this.#gl.DYNAMIC_DRAW) this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null) this.#gl.bindBufferBase(this.#gl.UNIFORM_BUFFER, 0, this.#uboBuffer) - this.#gl.uniformBlockBinding(this.#program, this.#gl.getUniformBlockIndex(this.#program, 'UBO'), 0) + this.#gl.uniformBlockBinding(this.#drawProgram, this.#gl.getUniformBlockIndex(this.#drawProgram, 'UBO'), 0) this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null) - this.#workBuffer = this.#gl.createBuffer() - this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#workBuffer) - this.#gl.bufferData(this.#gl.UNIFORM_BUFFER, 32, this.#gl.STREAM_DRAW) + this.#seedBuffer = this.#gl.createBuffer() + this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#seedBuffer) + this.#gl.bufferData(this.#gl.UNIFORM_BUFFER, 16, this.#gl.DYNAMIC_DRAW) this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null) - this.#gl.bindBufferBase(this.#gl.UNIFORM_BUFFER, 1, this.#workBuffer) - this.#gl.uniformBlockBinding(this.#program, this.#gl.getUniformBlockIndex(this.#program, 'WORK'), 1) + this.#gl.bindBufferBase(this.#gl.UNIFORM_BUFFER, 1, this.#seedBuffer) + this.#gl.uniformBlockBinding(this.#drawProgram, this.#gl.getUniformBlockIndex(this.#drawProgram, 'WORK'), 1) this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null) - this.#pixels = new Uint32Array(this.#gl.drawingBufferWidth * this.#gl.drawingBufferHeight * 4) + /** Finalize configuration */ this.#query = this.#gl.createQuery() + this.#pixels = new Uint32Array(this.#gl.drawingBufferWidth * this.#gl.drawingBufferHeight * 4) + console.log(`NanoPow WebGL 
initialized at ${this.#gl.drawingBufferWidth}x${this.#gl.drawingBufferHeight}. Maximum nonces checked per frame: ${this.#gl.drawingBufferWidth * this.#gl.drawingBufferHeight}`) } catch (err) { - throw new Error(`WebGL initialization failed. ${err}`) + throw new Error('WebGL initialization failed.', { cause: err }) } finally { this.#busy = false } } static reset (): void { + cancelAnimationFrame(NanoPowGl.#raf) + NanoPowGl.#gl?.deleteQuery(NanoPowGl.#query) NanoPowGl.#query = null - NanoPowGl.#workBuffer = null + NanoPowGl.#gl?.deleteBuffer(NanoPowGl.#seedBuffer) + NanoPowGl.#seedBuffer = null + NanoPowGl.#gl?.deleteBuffer(NanoPowGl.#uboBuffer) NanoPowGl.#uboBuffer = null - NanoPowGl.#uvBuffer = null + for (const fbo of NanoPowGl.#downsampleFbos) { + NanoPowGl.#gl?.deleteFramebuffer(fbo.framebuffer) + NanoPowGl.#gl?.deleteTexture(fbo.texture) + } + NanoPowGl.#downsampleFbos = [] + NanoPowGl.#gl?.deleteShader(NanoPowGl.#downsampleShader) + NanoPowGl.#downsampleShader = null + NanoPowGl.#gl?.deleteProgram(NanoPowGl.#downsampleProgram) + NanoPowGl.#downsampleProgram = null + NanoPowGl.#gl?.deleteFramebuffer(NanoPowGl.#drawFbo?.framebuffer ?? 
null) + NanoPowGl.#drawFbo = null + NanoPowGl.#gl?.deleteTexture(NanoPowGl.#drawFbo) + NanoPowGl.#drawFbo = null + NanoPowGl.#gl?.deleteBuffer(NanoPowGl.#positionBuffer) NanoPowGl.#positionBuffer = null - NanoPowGl.#framebuffer = null - NanoPowGl.#texture = null - NanoPowGl.#fragmentShader = null + NanoPowGl.#gl?.deleteShader(NanoPowGl.#drawShader) + NanoPowGl.#drawShader = null + NanoPowGl.#gl?.deleteShader(NanoPowGl.#vertexShader) NanoPowGl.#vertexShader = null - NanoPowGl.#program = null + NanoPowGl.#gl?.deleteProgram(NanoPowGl.#drawProgram) + NanoPowGl.#drawProgram = null NanoPowGl.#gl = null NanoPowGl.#busy = false NanoPowGl.init() @@ -161,42 +228,49 @@ export class NanoPowGl { "Harmonic Mean (ms)": count / reciprocals, "Geometric Mean (ms)": Math.exp(logarithms / count) } + console.log(`Averages: ${JSON.stringify(averages)}`) console.table(averages) } - static #draw (work: Uint8Array): void { + static #draw (seed: BigUint64Array): void { if (this.#gl == null || this.#query == null) throw new Error('WebGL 2 is required to draw and query pixels') - if (this.#workBuffer == null) throw new Error('Work buffer is required to draw') - this.#gl.clear(this.#gl.COLOR_BUFFER_BIT) + if (this.#drawFbo == null) throw new Error('FBO is required to draw') + if (this.#seed[0] == null || this.#seedBuffer == null) throw new Error('Seed is required to draw') - /** Upload work buffer */ - this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#workBuffer) - this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, work) + /** Upload work seed buffer */ + this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#seedBuffer) + this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, seed) this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null) + /** Draw full canvas */ + this.#gl.useProgram(this.#drawProgram) + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#drawFbo.framebuffer) + this.#gl.activeTexture(this.#gl.TEXTURE0) + this.#gl.bindTexture(this.#gl.TEXTURE_2D, this.#drawFbo.texture) + 
this.#gl.beginQuery(this.#gl.ANY_SAMPLES_PASSED_CONSERVATIVE, this.#query) - this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer) - this.#gl.drawArrays(this.#gl.TRIANGLES, 0, 6) + this.#gl.viewport(0, 0, this.#drawFbo.size.x, this.#drawFbo.size.y) + this.#gl.drawArrays(this.#gl.TRIANGLES, 0, 4) this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null) this.#gl.endQuery(this.#gl.ANY_SAMPLES_PASSED_CONSERVATIVE) } static async #checkQueryResult (): Promise { return new Promise((resolve, reject) => { - try { - if (this.#gl == null || this.#query == null) throw new Error('WebGL 2 is required to check query results') - if (this.#gl.getQueryParameter(this.#query, this.#gl.QUERY_RESULT_AVAILABLE)) { - resolve(!!(this.#gl.getQueryParameter(this.#query, this.#gl.QUERY_RESULT))) - } else { - /** Query result not yet available, check again in the next frame */ - requestAnimationFrame(async (): Promise => { - const result = await NanoPowGl.#checkQueryResult() - resolve(result) - }) + function check () { + try { + if (NanoPowGl.#gl == null || NanoPowGl.#query == null) throw new Error('WebGL 2 is required to check query results') + if (NanoPowGl.#gl.getQueryParameter(NanoPowGl.#query, NanoPowGl.#gl.QUERY_RESULT_AVAILABLE)) { + resolve(!!(NanoPowGl.#gl.getQueryParameter(NanoPowGl.#query, NanoPowGl.#gl.QUERY_RESULT))) + } else { + /** Query result not yet available, check again in the next frame */ + NanoPowGl.#raf = requestAnimationFrame(check) + } + } catch (err) { + reject(err) } - } catch (err) { - reject(err) } + check() }) } @@ -210,13 +284,41 @@ export class NanoPowGl { */ static #readResult (workHex?: string): string { if (this.#gl == null) throw new Error('WebGL 2 is required to read pixels') - this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer) - this.#gl.readPixels(0, 0, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, this.#pixels) + if (this.#drawFbo == null) throw new Error('Source FBO is 
required to downsample') + + let source = this.#drawFbo + let pixelCount + const start = performance.now() + if (workHex != null) { + /** Read validate results immedidately without unnecessary downsampling */ + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, source.framebuffer) + this.#gl.readPixels(0, 0, 1, 1, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, this.#pixels) + pixelCount = 4 + } else { + /** Downsample framebuffer */ + this.#gl.useProgram(this.#downsampleProgram) + for (const fbo of this.#downsampleFbos) { + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, fbo.framebuffer) + this.#gl.activeTexture(this.#gl.TEXTURE0) + this.#gl.bindTexture(this.#gl.TEXTURE_2D, source.texture) + this.#gl.uniform1i(this.#downsampleSrcLocation, 0) + this.#gl.viewport(0, 0, fbo.size.x, fbo.size.y) + this.#gl.drawArrays(this.#gl.TRIANGLES, 0, 4) + source = fbo + } + /** Read downsampled result */ + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, source.framebuffer) + this.#gl.readPixels(0, 0, source.size.x, source.size.y, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, this.#pixels) + pixelCount = source.size.x * source.size.y * 4 + } this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null) - for (let i = 0; i < this.#pixels.length; i += 4) { + + for (let i = 0; i < pixelCount; i += 4) { if (this.#pixels[i] !== 0) { + if (this.#debug) console.log(`readResults (${performance.now() - start} ms)`) + if (this.#debug) console.log(`Pixel: rgba(${this.#pixels[i]}, ${this.#pixels[i + 1]}, ${this.#pixels[i + 2]}, ${this.#pixels[i + 3].toString(16).padStart(8, '0')})`) /** Return the work value with the custom bits */ - const hex = `${this.#pixels[i+1].toString(16).padStart(8, '0')}${this.#pixels[i+2].toString(16).padStart(8, '0')}` + const hex = `${this.#pixels[i + 1].toString(16).padStart(8, '0')}${this.#pixels[i + 2].toString(16).padStart(8, '0')}` if (workHex == null || workHex == hex) return hex } } @@ -230,10 +332,8 @@ export class NanoPowGl { * @param {number} [threshold=0xfffffff8] - 
Difficulty of proof-of-work calculation */ static async search (hash: string, options?: NanoPowOptions): Promise { - if (NanoPowGl.#gl == null) throw new Error('WebGL 2 is required') - if (this.#gl == null) throw new Error('WebGL 2 is required') - if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`) if (this.#busy) { + console.log('NanoPowGl is busy. Retrying search...') return new Promise(resolve => { setTimeout(async (): Promise => { const result = this.search(hash, options) @@ -242,49 +342,67 @@ export class NanoPowGl { }) } this.#busy = true + + /** Process user input */ + if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`) const threshold = (typeof options?.threshold !== 'number' || options.threshold < 0x0 || options.threshold > 0xffffffff) ? 0xfffffff8 : options.threshold const effort = (typeof options?.effort !== 'number' || options.effort < 0x1 || options.effort > 0x20) - ? 0x8 + ? this.#cores : options.effort - const debug = !!(options?.debug) + this.#debug = !!(options?.debug) /** Reset if user specified new level of effort */ if (this.#WORKLOAD !== 256 * effort) { this.#WORKLOAD = 256 * effort + this.#canvas.height = this.#WORKLOAD + this.#canvas.width = this.#WORKLOAD this.reset() } + if (NanoPowGl.#gl == null) throw new Error('WebGL 2 is required') + if (this.#gl == null) throw new Error('WebGL 2 is required') + if (this.#drawFbo == null) throw new Error('WebGL framebuffer is required') + + /** Clear any previous results */ + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#drawFbo.framebuffer) + this.#gl.clearBufferuiv(this.#gl.COLOR, 0, [0, 0, 0, 0]) + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null) /** Set up uniform buffer object */ - const uboView = new DataView(new ArrayBuffer(144)) + for (let i = 0; i < this.#uboView.byteLength; i++) this.#uboView.setUint8(i, 0) for (let i = 0; i < 64; i += 8) { const uint32 = hash.slice(i, i + 8) - uboView.setUint32(i * 2, parseInt(uint32, 16)) + 
this.#uboView.setUint32(i * 2, parseInt(uint32, 16)) } - uboView.setUint32(128, threshold, true) - uboView.setFloat32(132, 256 * effort, true) - NanoPowGl.#gl.bindBuffer(NanoPowGl.#gl.UNIFORM_BUFFER, NanoPowGl.#uboBuffer) - NanoPowGl.#gl.bufferSubData(NanoPowGl.#gl.UNIFORM_BUFFER, 0, uboView) - NanoPowGl.#gl.bindBuffer(NanoPowGl.#gl.UNIFORM_BUFFER, null) + this.#uboView.setUint32(128, threshold, true) + this.#uboView.setUint32(132, 1, true) + if (this.#debug) console.log('UBO', this.#uboView.buffer.slice(0)) + this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#uboBuffer) + this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, this.#uboView) + this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null) /** Start drawing to calculate one nonce per pixel */ let times = [] let start = performance.now() let nonce = null - const seed = new Uint8Array(8) + if (this.#debug) console.groupCollapsed('Seeds (click to view)') while (nonce == null) { start = performance.now() - crypto.getRandomValues(seed) - this.#draw(seed) + const random0 = Math.floor(Math.random() * 0xffffffff) + const random1 = Math.floor(Math.random() * 0xffffffff) + this.#seed[0] = (BigInt(random0) << 32n) | BigInt(random1) + if (this.#debug) console.log('Seed', this.#seed) + this.#draw(this.#seed) const found = await this.#checkQueryResult() times.push(performance.now() - start) if (found) { + if (this.#debug) console.groupEnd() nonce = this.#readResult() } } this.#busy = false - if (debug) this.#logAverages(times) + if (this.#debug) this.#logAverages(times) return nonce } @@ -296,11 +414,8 @@ export class NanoPowGl { * @param {number} [threshold=0xfffffff8] - Difficulty of proof-of-work calculation */ static async validate (work: string, hash: string, options?: NanoPowOptions): Promise { - if (NanoPowGl.#gl == null) throw new Error('WebGL 2 is required') - if (this.#gl == null) throw new Error('WebGL 2 is required') - if (!/^[A-Fa-f0-9]{16}$/.test(work)) throw new Error(`Invalid work ${work}`) - if 
(!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`) if (this.#busy) { + console.log('NanoPowGl is busy. Retrying validate...') return new Promise(resolve => { setTimeout(async (): Promise => { const result = this.validate(work, hash, options) @@ -309,35 +424,42 @@ export class NanoPowGl { }) } this.#busy = true + + /** Process user input */ + if (!/^[A-Fa-f0-9]{16}$/.test(work)) throw new Error(`Invalid work ${work}`) + if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`) const threshold = (typeof options?.threshold !== 'number' || options.threshold < 0x0 || options.threshold > 0xffffffff) ? 0xfffffff8 : options.threshold - const debug = !!(options?.debug) + this.#debug = !!(options?.debug) - /** Reset if user specified new level of effort */ - if (this.#WORKLOAD !== 1) { - this.#WORKLOAD = 1 - this.reset() - } + if (NanoPowGl.#gl == null) throw new Error('WebGL 2 is required') + if (this.#gl == null) throw new Error('WebGL 2 is required') + if (this.#drawFbo == null) throw new Error('WebGL framebuffer is required') + + /** Clear any previous results */ + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#drawFbo.framebuffer) + this.#gl.clearBufferuiv(this.#gl.COLOR, 0, [0, 0, 0, 0]) + this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null) /** Set up uniform buffer object */ - const uboView = new DataView(new ArrayBuffer(144)) + for (let i = 0; i < this.#uboView.byteLength; i++) this.#uboView.setUint8(i, 0) for (let i = 0; i < 64; i += 8) { const uint32 = hash.slice(i, i + 8) - uboView.setUint32(i * 2, parseInt(uint32, 16)) + this.#uboView.setUint32(i * 2, parseInt(uint32, 16)) } - uboView.setUint32(128, threshold, true) - uboView.setFloat32(132, NanoPowGl.#WORKLOAD - 1, true) - NanoPowGl.#gl.bindBuffer(NanoPowGl.#gl.UNIFORM_BUFFER, NanoPowGl.#uboBuffer) - NanoPowGl.#gl.bufferSubData(NanoPowGl.#gl.UNIFORM_BUFFER, 0, uboView) - NanoPowGl.#gl.bindBuffer(NanoPowGl.#gl.UNIFORM_BUFFER, null) + this.#uboView.setUint32(128, 
threshold, true) + this.#uboView.setUint32(132, 0, true) + if (this.#debug) console.log('UBO', this.#uboView.buffer.slice(0)) + this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#uboBuffer) + this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, this.#uboView) + this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null) /** Start drawing to calculate one nonce per pixel */ let nonce = null - const data = new DataView(new ArrayBuffer(8)) - data.setBigUint64(0, BigInt(`0x${work}`), true) - const seed = new Uint8Array(data.buffer) - this.#draw(seed) + this.#seed[0] = BigInt(`0x${work}`) + if (this.#debug) console.log('Work', this.#seed) + this.#draw(this.#seed) let found = await this.#checkQueryResult() if (found) { try { diff --git a/src/classes/gpu.ts b/src/classes/gpu.ts index 04f5733..b00e727 100644 --- a/src/classes/gpu.ts +++ b/src/classes/gpu.ts @@ -11,10 +11,13 @@ export class NanoPowGpu { // Initialize WebGPU static #busy: boolean = false + static #debug: boolean = false static #device: GPUDevice | null = null - static #uboBuffer: GPUBuffer + static #gpuBufferReset: BigUint64Array = new BigUint64Array([0n, 0n]) static #gpuBuffer: GPUBuffer static #cpuBuffer: GPUBuffer + static #uboBuffer: GPUBuffer + static #uboView: DataView static #bindGroupLayout: GPUBindGroupLayout static #searchPipeline: GPUComputePipeline static #validatePipeline: GPUComputePipeline @@ -34,7 +37,7 @@ export class NanoPowGpu { this.#device = device this.setup() } catch (err) { - throw new Error(`WebGPU initialization failed. 
${err}`) + throw new Error('WebGPU initialization failed.', { cause: err }) } finally { this.#busy = false } @@ -43,10 +46,6 @@ export class NanoPowGpu { static setup (): void { if (this.#device == null) throw new Error(`WebGPU device failed to load.`) // Create buffers for writing GPU calculations and reading from Javascript - this.#uboBuffer = this.#device.createBuffer({ - size: 48, - usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST - }) this.#gpuBuffer = this.#device.createBuffer({ size: 16, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC @@ -55,6 +54,11 @@ export class NanoPowGpu { size: 16, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ }) + this.#uboBuffer = this.#device.createBuffer({ + size: 48, + usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST + }) + this.#uboView = new DataView(new ArrayBuffer(48)) // Create binding group data structure and use it later once UBO is known this.#bindGroupLayout = this.#device.createBindGroupLayout({ entries: [ @@ -93,6 +97,7 @@ export class NanoPowGpu { module: shaderModule } }) + console.log(`NanoPow WebGPU initialized. 
Recommended effort: ${Math.max(1, Math.floor(navigator.hardwareConcurrency / 2))}`) } static reset (): void { @@ -140,17 +145,18 @@ export class NanoPowGpu { if (this.#device == null) throw new Error(`WebGPU device failed to load.`) // Set up uniform buffer object // Note: u32 size is 4, but total alignment must be multiple of 16 - const uboView = new DataView(new ArrayBuffer(48)) + for (let i = 0; i < this.#uboView.byteLength; i++) this.#uboView.setUint8(i, 0) for (let i = 0; i < 64; i += 16) { const u64 = hash.slice(i, i + 16) - uboView.setBigUint64(i / 2, BigInt(`0x${u64}`)) + this.#uboView.setBigUint64(i / 2, BigInt(`0x${u64}`)) } - uboView.setBigUint64(32, seed, true) - uboView.setUint32(40, threshold, true) - this.#device.queue.writeBuffer(this.#uboBuffer, 0, uboView) + this.#uboView.setBigUint64(32, seed, true) + this.#uboView.setUint32(40, threshold, true) + if (this.#debug) console.log('UBO', this.#uboView) + this.#device.queue.writeBuffer(this.#uboBuffer, 0, this.#uboView) // Reset `nonce` and `found` to 0u in WORK before each calculation - this.#device.queue.writeBuffer(this.#gpuBuffer, 0, new Uint32Array([0, 0, 0])) + this.#device.queue.writeBuffer(this.#gpuBuffer, 0, this.#gpuBufferReset) // Bind UBO read and GPU write buffers const bindGroup = this.#device.createBindGroup({ @@ -198,6 +204,7 @@ export class NanoPowGpu { console.warn(`Error getting data from GPU. ${err}`) return this.#dispatch(pipeline, seed, hash, threshold, passes) } + if (this.#debug) console.log('gpuBuffer data', data) if (data == null) throw new Error(`Failed to get data from buffer.`) return data } @@ -211,6 +218,7 @@ export class NanoPowGpu { static async search (hash: string, options?: NanoPowOptions): Promise { if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new TypeError(`Invalid hash ${hash}`) if (this.#busy) { + console.log('NanoPowGpu is busy. 
Retrying search...') return new Promise(resolve => { setTimeout(async (): Promise => { const result = this.search(hash, options) @@ -225,7 +233,7 @@ export class NanoPowGpu { const effort = (typeof options?.effort !== 'number' || options.effort < 0x1 || options.effort > 0x20) ? 0x800 : options.effort * 0x100 - const debug = !!(options?.debug) + this.#debug = !!(options?.debug) // Ensure WebGPU is initialized before calculating let loads = 0 @@ -244,14 +252,16 @@ export class NanoPowGpu { let nonce = 0n do { start = performance.now() - const random = Math.floor(Math.random() * 0xffffffff) - const seed = (BigInt(random) << 32n) | BigInt(random) + const random0 = Math.floor(Math.random() * 0xffffffff) + const random1 = Math.floor(Math.random() * 0xffffffff) + const seed = (BigInt(random0) << 32n) | BigInt(random1) + if (this.#debug) console.log(`seed: ${seed}`) const data = await this.#dispatch(this.#searchPipeline, seed, hash, threshold, effort) nonce = data.getBigUint64(0, true) this.#busy = !data.getUint32(8) times.push(performance.now() - start) } while (this.#busy) - if (debug) this.#logAverages(times) + if (this.#debug) this.#logAverages(times) return nonce.toString(16).padStart(16, '0') } @@ -266,6 +276,7 @@ export class NanoPowGpu { if (!/^[A-Fa-f0-9]{16}$/.test(work)) throw new TypeError(`Invalid work ${work}`) if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new TypeError(`Invalid hash ${hash}`) if (this.#busy) { + console.log('NanoPowGpu is busy. Retrying validate...') return new Promise(resolve => { setTimeout(async (): Promise => { const result = this.validate(work, hash, options) @@ -274,7 +285,7 @@ export class NanoPowGpu { }) } this.#busy = true - const debug = !!(options?.debug) + this.#debug = !!(options?.debug) const threshold = (typeof options?.threshold !== 'number' || options.threshold < 0x0 || options.threshold > 0xffffffff) ? 
0xfffffff8 : options.threshold @@ -292,8 +303,10 @@ export class NanoPowGpu { } const seed = BigInt(`0x${work}`) + if (this.#debug) console.log(`work: ${work}`) const data = await this.#dispatch(this.#validatePipeline, seed, hash, threshold, 1) const nonce = data.getBigUint64(0, true).toString(16).padStart(16, '0') + if (this.#debug) console.log(`nonce: ${nonce}`) const found = !!data.getUint32(8) this.#busy = false if (found && work !== nonce) throw new Error(`Nonce (${nonce}) found but does not match work (${work})`) diff --git a/src/classes/index.ts b/src/classes/index.ts index 3e7f7e0..183899e 100644 --- a/src/classes/index.ts +++ b/src/classes/index.ts @@ -9,16 +9,14 @@ try { await NanoPowGpu.init() isGpuSupported = true } catch (err) { - console.error(err) - console.warn(`WebGPU is not supported in this environment.`) + console.warn('WebGPU is not supported in this environment.\n', err) isGpuSupported = false } try { await NanoPowGl.init() isGlSupported = true } catch (err) { - console.error(err) - console.warn(`WebGL is not supported in this environment.`) + console.warn('WebGL is not supported in this environment.\n', err) isGlSupported = false } diff --git a/src/shaders/compute.wgsl b/src/shaders/compute.wgsl index 179a478..18bf76f 100644 --- a/src/shaders/compute.wgsl +++ b/src/shaders/compute.wgsl @@ -27,14 +27,31 @@ struct WORK { */ const BLAKE2B_IV_0 = vec2(0xF2BDC900u, 0x6A09E667u); +/** +* Numeric literal used in the finalization digest is the original value of the +* first element of the initialization vector `blake2b_IV[0]` which in NanoPow +* is initialized at vector component `v01.y`. +*/ +const BLAKE2B_IV_0 = vec2(0xF2BDC900u, 0x6A09E667u); + +/** +* Used to fill partial `m` vec4 constructions. +*/ +const Z = vec2(0u); + +/** +* Used to apply boolean mask to swizzled result of carry bit comparison. +*/ +const CARRY = vec4(1u, 0u, 1u, 0u); + /** * Used to rotate bits by a fixed amount during G mixing. 
*/ -const ROTATE_1 = vec2(1u); -const ROTATE_8 = vec2(8u); -const ROTATE_16 = vec2(16u); -const ROTATE_24 = vec2(24u); -const ROTATE_31 = vec2(31u); +const ROTATE_1 = vec4(1u); +const ROTATE_8 = vec4(8u); +const ROTATE_16 = vec4(16u); +const ROTATE_24 = vec4(24u); +const ROTATE_31 = vec4(31u); /** * Shared flag to prevent execution for all workgroup threads based on the @@ -51,9 +68,9 @@ var found: bool; */ @compute @workgroup_size(32) fn search(@builtin(global_invocation_id) global_id: vec3, @builtin(local_invocation_id) local_id: vec3) { - // found = (local_id.x == 0u && atomicLoad(&work.found) != 0u); - // workgroupBarrier(); - // if (found) { return; } + found = (local_id.x == 0u && atomicLoad(&work.found) != 0u); + workgroupBarrier(); + if (found) { return; } main(global_id); } @@ -105,22 +122,22 @@ fn main(id: vec3) { * It is always the "last" compression at this INLEN * vE = ~vE; */ - var v0: vec2 = BLAKE2B_IV_0; - var v1: vec2 = vec2(0x84CAA73Bu, 0xBB67AE85u); - var v2: vec2 = vec2(0xFE94F82Bu, 0x3C6EF372u); - var v3: vec2 = vec2(0x5F1D36F1u, 0xA54FF53Au); - var v4: vec2 = vec2(0xADE682D1u, 0x510E527Fu); - var v5: vec2 = vec2(0x2B3E6C1Fu, 0x9B05688Cu); - var v6: vec2 = vec2(0xFB41BD6Bu, 0x1F83D9ABu); - var v7: vec2 = vec2(0x137E2179u, 0x5BE0CD19u); - var v8: vec2 = vec2(0xF3BCC908u, 0x6A09E667u); - var v9: vec2 = vec2(0x84CAA73Bu, 0xBB67AE85u); - var vA: vec2 = vec2(0xFE94F82Bu, 0x3C6EF372u); - var vB: vec2 = vec2(0x5F1D36F1u, 0xA54FF53Au); - var vC: vec2 = vec2(0xADE682F9u, 0x510E527Fu); - var vD: vec2 = vec2(0x2B3E6C1Fu, 0x9B05688Cu); - var vE: vec2 = vec2(0x04BE4294u, 0xE07C2654u); - var vF: vec2 = vec2(0x137E2179u, 0x5BE0CD19u); + var v01: vec4 = vec4(BLAKE2B_IV_0, 0x84CAA73Bu, 0xBB67AE85u); + var v23: vec4 = vec4(0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au); + var v45: vec4 = vec4(0xADE682D1u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu); + var v67: vec4 = vec4(0xFB41BD6Bu, 0x1F83D9ABu, 0x137E2179u, 0x5BE0CD19u); + var v89: vec4 = vec4(0xF3BCC908u, 
0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u); + var vAB: vec4 = vec4(0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au); + var vCD: vec4 = vec4(0xADE682F9u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu); + var vEF: vec4 = vec4(0x04BE4294u, 0xE07C2654u, 0x137E2179u, 0x5BE0CD19u); + + /** + * Temporary variables used for subprocesses i=4 through i=7 + */ + var v56: vec4; + var vFC: vec4; + var v74: vec4; + var vDE: vec4; /** * Twelve rounds of G mixing as part of BLAKE2b compression step, each divided @@ -156,112 +173,125 @@ fn main(id: vec3) { * Each sum step has an extra carry addition. Note that the m[sigma] sum is * skipped if m[sigma] is zero since it effectively does nothing. */ + var s0: vec4; + var s1: vec4; /**************************************************************************** * ROUND(0) * ****************************************************************************/ /** - * r=0, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x)); - vC = (vC ^ v0).yx; - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x)); - vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16); - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); + * r=0, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=0, m[sigma+1]=1 + * r=0, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=2, m[sigma+1]=3 + * r=0, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=4, m[sigma+1]=5 + * r=0, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=6, m[sigma+1]=7 + */ + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + s0 = v01 + vec4(m0, m2); + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + v23 += vec4(m4, Z); + v23.y += u32(v23.x < m4.x); 
- /** - * r=0, i=1, a=v[1], b=v[5], c=v[9], d=v[13] - */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m2 + vec2(0u, u32(v1.x + m2.x < v1.x)); - vD = (vD ^ v1).yx; - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x)); - vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16); - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + vCD = (vCD ^ v01).yxwz; + vEF = (vEF ^ v23).yxwz; - /** - * r=0, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x)); - vE = (vE ^ v2).yx; - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16); - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31); + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; - /** - * r=0, i=3, a=v[3], b=v[7], c=v[11], d=v[15] - */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = (vF ^ v3).yx; - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16); - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31); + v45 ^= v89; + v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz; + v67 ^= vAB; + v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz; - /** - * r=0, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - 
vF = (vF ^ v0).yx; - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16); - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=0, i=5, a=v[1], b=v[6], c=v[11], d=v[12] - */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = (vC ^ v1).yx; - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16); - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31); + s0 = v01 + vec4(m1, m3); + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + // NOP - /** - * r=0, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = (vD ^ v2).yx; - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16); - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + vCD ^= v01; + vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz; + vEF ^= v23; + vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz; + + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; + + v45 ^= v89; + v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1); + v67 ^= vAB; + v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1); /** - * r=0, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + * r=0, i=4, a=v[0], b=v[5], c=v[10], 
d=v[15], m[sigma]=8, m[sigma+1]=9 + * r=0, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=10, m[sigma+1]=11 + * r=0, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=12, m[sigma+1]=13 + * r=0, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=14, m[sigma+1]=15 */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - vE = (vE ^ v3).yx; - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16); - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + v56 = vec4(v45.zw, v67.xy); + v74 = vec4(v67.zw, v45.xy); + vFC = vec4(vEF.zw, vCD.xy); + vDE = vec4(vCD.zw, vEF.xy); + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + // NOP + // NOP + + vFC = (vFC ^ v01).yxwz; + vDE = (vDE ^ v23).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz; + v74 ^= v89; + v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz; + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + // NOP + // NOP + + vFC ^= v01; + vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz; + vDE ^= v23; + vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1); + v74 ^= v89; + v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1); + + v45 = vec4(v74.zw, v56.xy); + v67 = vec4(v56.zw, v74.xy); + vCD = vec4(vFC.zw, vDE.xy); + vEF = vec4(vDE.zw, vFC.xy); @@ -271,105 +301,118 @@ fn main(id: vec3) { 
****************************************************************************/ /** - * r=1, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = (vC ^ v0).yx; - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16); - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); + * r=1, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=14, m[sigma+1]=10 + * r=1, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=4, m[sigma+1]=8 + * r=1, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=9, m[sigma+1]=15 + * r=1, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=13, m[sigma+1]=6 + */ + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + v01 += vec4(Z, m4); + v01.w += u32(v01.z < m4.x); + // NOP + + vCD = (vCD ^ v01).yxwz; + vEF = (vEF ^ v23).yxwz; - /** - * r=1, i=1, a=v[1], b=v[5], c=v[9], d=v[13] - */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m4 + vec2(0u, u32(v1.x + m4.x < v1.x)); - vD = (vD ^ v1).yx; - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16); - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; - /** - * r=1, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - vE = (vE ^ v2).yx; - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8); - v2 = v2 + v6 + 
vec2(0u, u32(v2.x + v6.x < v2.x)); - vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16); - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31); + v45 ^= v89; + v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz; + v67 ^= vAB; + v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz; - /** - * r=1, i=3, a=v[3], b=v[7], c=v[11], d=v[15] - */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = (vF ^ v3).yx; - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16); - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=1, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x)); - vF = (vF ^ v0).yx; - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16); - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31); + // NOP + // NOP - /** - * r=1, i=5, a=v[1], b=v[6], c=v[11], d=v[12] - */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v1 = v1 + m0 + vec2(0u, u32(v1.x + m0.x < v1.x)); - vC = (vC ^ v1).yx; - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v1 = v1 + m2 + vec2(0u, u32(v1.x + m2.x < v1.x)); - vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16); - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < 
vB.x)); - v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31); + vCD ^= v01; + vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz; + vEF ^= v23; + vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz; - /** - * r=1, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = (vD ^ v2).yx; - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16); - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; + + v45 ^= v89; + v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1); + v67 ^= vAB; + v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1); /** - * r=1, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + * r=1, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=1, m[sigma+1]=12 + * r=1, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=0, m[sigma+1]=2 + * r=1, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=11, m[sigma+1]=7 + * r=1, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=5, m[sigma+1]=3 */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - vE = (vE ^ v3).yx; - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v3 = v3 + m3 + vec2(0u, u32(v3.x + m3.x < v3.x)); - vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16); - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + v56 = vec4(v45.zw, v67.xy); + v74 = vec4(v67.zw, v45.xy); + vFC = vec4(vEF.zw, vCD.xy); + vDE = vec4(vCD.zw, vEF.xy); + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + 
+ s0 = v01 + vec4(m1, m0); + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + // NOP + + vFC = (vFC ^ v01).yxwz; + vDE = (vDE ^ v23).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz; + v74 ^= v89; + v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz; + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + v01 += vec4(Z, m2); + v01.w += u32(v01.z < m2.x); + v23 += vec4(Z, m3); + v23.w += u32(v23.z < m3.x); + + vFC ^= v01; + vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz; + vDE ^= v23; + vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1); + v74 ^= v89; + v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1); + + v45 = vec4(v74.zw, v56.xy); + v67 = vec4(v56.zw, v74.xy); + vCD = vec4(vFC.zw, vDE.xy); + vEF = vec4(vDE.zw, vFC.xy); @@ -380,105 +423,119 @@ fn main(id: vec3) { ****************************************************************************/ /** - * r=2, i=0, a=v[0], b=v[4], c=v[8], d=v[12] + * r=2, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=11, m[sigma+1]=8 + * r=2, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=12, m[sigma+1]=0 + * r=2, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=5, m[sigma+1]=2 + * r=2, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=15, m[sigma+1]=13 */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = (vC ^ v0).yx; - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16); - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) << 
ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + // NOP + // NOP + + vCD = (vCD ^ v01).yxwz; + vEF = (vEF ^ v23).yxwz; - /** - * r=2, i=1, a=v[1], b=v[5], c=v[9], d=v[13] - */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - vD = (vD ^ v1).yx; - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m0 + vec2(0u, u32(v1.x + m0.x < v1.x)); - vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16); - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; - /** - * r=2, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - vE = (vE ^ v2).yx; - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v2 = v2 + m2 + vec2(0u, u32(v2.x + m2.x < v2.x)); - vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16); - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31); + v45 ^= v89; + v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz; + v67 ^= vAB; + v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz; - /** - * r=2, i=3, a=v[3], b=v[7], c=v[11], d=v[15] - */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = (vF ^ v3).yx; - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16); - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ 
vB).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=2, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - vF = (vF ^ v0).yx; - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16); - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31); + s0 = v01 + vec4(Z, m0); + v01= s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + vec4(m2, Z); + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=2, i=5, a=v[1], b=v[6], c=v[11], d=v[12] - */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x)); - vC = (vC ^ v1).yx; - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16); - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31); - /** - * r=2, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = (vD ^ v2).yx; - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v2 = v2 + m1 + vec2(0u, u32(v2.x + m1.x < v2.x)); - vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16); - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + vCD ^= v01; + vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz; + vEF ^= v23; + vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz; + + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & 
CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; + + v45 ^= v89; + v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1); + v67 ^= vAB; + v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1); /** - * r=2, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + * r=2, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=10, m[sigma+1]=14 + * r=2, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=3, m[sigma+1]=6 + * r=2, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=7, m[sigma+1]=1 + * r=2, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=9, m[sigma+1]=4 */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - vE = (vE ^ v3).yx; - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v3 = v3 + m4 + vec2(0u, u32(v3.x + m4.x < v3.x)); - vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16); - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + v56 = vec4(v45.zw, v67.xy); + v74 = vec4(v67.zw, v45.xy); + vFC = vec4(vEF.zw, vCD.xy); + vDE = vec4(vCD.zw, vEF.xy); + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + // v01 += vec4(Z, m3) + vec4(Z, 0u, u32(v01.z + vec4(Z, m3).z < v01.z)); + v01.z += m3.x; + v01.w += m3.y + u32(v01.z < m3.x); + // NOP + + vFC = (vFC ^ v01).yxwz; + vDE = (vDE ^ v23).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz; + v74 ^= v89; + v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz; + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + // NOP + v23 += vec4(m1, m4) + vec4(0u, u32(v23.x + vec4(m1, m4).x < v23.x), 0u, u32(v23.z + vec4(m1, m4).z < v23.z)); + + vFC ^= v01; + vFC = 
(vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz; + vDE ^= v23; + vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1); + v74 ^= v89; + v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1); + + v45 = vec4(v74.zw, v56.xy); + v67 = vec4(v56.zw, v74.xy); + vCD = vec4(vFC.zw, vDE.xy); + vEF = vec4(vDE.zw, vFC.xy); @@ -489,105 +546,114 @@ fn main(id: vec3) { ****************************************************************************/ /** - * r=3, i=0, a=v[0], b=v[4], c=v[8], d=v[12] + * r=3, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=7, m[sigma+1]=9 + * r=3, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=3, m[sigma+1]=1 + * r=3, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=13, m[sigma+1]=12 + * r=3, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=11, m[sigma+1]=14 */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = (vC ^ v0).yx; - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16); - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=3, i=1, a=v[1], b=v[5], c=v[9], d=v[13] - */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x)); - vD = (vD ^ v1).yx; - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m1 + vec2(0u, u32(v1.x + m1.x < v1.x)); - vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16); - v9 = v9 + vD + vec2(0u, 
u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + v01 += vec4(Z, m3) + vec4(Z, 0u, u32(v01.z + vec4(Z, m3).z < v01.z)); + // NOP - /** - * r=3, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - vE = (vE ^ v2).yx; - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16); - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31); + vCD = (vCD ^ v01).yxwz; + vEF = (vEF ^ v23).yxwz; - /** - * r=3, i=3, a=v[3], b=v[7], c=v[11], d=v[15] - */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = (vF ^ v3).yx; - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16); - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31); + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; - /** - * r=3, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v0 = v0 + m2 + vec2(0u, u32(v0.x + m2.x < v0.x)); - vF = (vF ^ v0).yx; - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16); - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31); + v45 ^= v89; + v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz; + v67 ^= vAB; + v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz; - /** - * r=3, i=5, a=v[1], b=v[6], c=v[11], 
d=v[12] - */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = (vC ^ v1).yx; - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16); - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=3, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x)); - vD = (vD ^ v2).yx; - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v2 = v2 + m0 + vec2(0u, u32(v2.x + m0.x < v2.x)); - vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16); - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + v01 += vec4(Z, m1) + vec4(Z, 0u, u32(v01.z + vec4(Z, m1).z < v01.z)); + // NOP + + vCD ^= v01; + vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz; + vEF ^= v23; + vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz; + + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; + + v45 ^= v89; + v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1); + v67 ^= vAB; + v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1); /** - * r=3, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + * r=3, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=2, m[sigma+1]=6 + * r=3, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=5, m[sigma+1]=10 + * r=3, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=4, m[sigma+1]=0 + * r=3, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=15, m[sigma+1]=8 */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < 
v3.x)); - vE = (vE ^ v3).yx; - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16); - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + v56 = vec4(v45.zw, v67.xy); + v74 = vec4(v67.zw, v45.xy); + vFC = vec4(vEF.zw, vCD.xy); + vDE = vec4(vCD.zw, vEF.xy); + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + v01 += vec4(m2, Z) + vec4(0u, u32(v01.x + vec4(m2, Z).x < v01.x), Z); + v23 += vec4(m4, Z) + vec4(0u, u32(v23.x + vec4(m4, Z).x < v23.x), Z); + + vFC = (vFC ^ v01).yxwz; + vDE = (vDE ^ v23).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz; + v74 ^= v89; + v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz; + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + // NOP + v23 += vec4(m0, Z) + vec4(0u, u32(v23.x + vec4(m0, Z).x < v23.x), Z); + + vFC ^= v01; + vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz; + vDE ^= v23; + vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1); + v74 ^= v89; + v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1); + + v45 = vec4(v74.zw, v56.xy); + v67 = vec4(v56.zw, v74.xy); + vCD = vec4(vFC.zw, vDE.xy); + vEF = vec4(vDE.zw, vFC.xy); @@ -598,105 +664,119 @@ fn main(id: vec3) { ****************************************************************************/ /** - * r=4, i=0, a=v[0], b=v[4], c=v[8], d=v[12] + * r=4, i=0, 
a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=9, m[sigma+1]=0 + * r=4, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=5, m[sigma+1]=7 + * r=4, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=2, m[sigma+1]=4 + * r=4, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=10, m[sigma+1]=15 */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = (vC ^ v0).yx; - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x)); - vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16); - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=4, i=1, a=v[1], b=v[5], c=v[9], d=v[13] - */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - vD = (vD ^ v1).yx; - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16); - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + // NOP + s1 = v23 + vec4(m2, Z); + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=4, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v2 = v2 + m2 + vec2(0u, u32(v2.x + m2.x < v2.x)); - vE = (vE ^ v2).yx; - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x)); - vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16); - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> 
ROTATE_31); + vCD = (vCD ^ v01).yxwz; + vEF = (vEF ^ v23).yxwz; - /** - * r=4, i=3, a=v[3], b=v[7], c=v[11], d=v[15] - */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = (vF ^ v3).yx; - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16); - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31); + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; - /** - * r=4, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - vF = (vF ^ v0).yx; - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x)); - vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16); - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31); + v45 ^= v89; + v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz; + v67 ^= vAB; + v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz; - /** - * r=4, i=5, a=v[1], b=v[6], c=v[11], d=v[12] - */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = (vC ^ v1).yx; - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16); - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + v01 += vec4(m0, Z) + vec4(0u, u32(v01.x + vec4(m0, Z).x < v01.x), Z); + v23 
+= vec4(m4, Z) + vec4(0u, u32(v23.x + vec4(m4, Z).x < v23.x), Z); + + vCD ^= v01; + vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz; + vEF ^= v23; + vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz; + + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; + + v45 ^= v89; + v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1); + v67 ^= vAB; + v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1); /** - * r=4, i=6, a=v[2], b=v[7], c=v[8], d=v[13] */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = (vD ^ v2).yx; - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16); - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); /** - * r=4, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + * r=4, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=14, m[sigma+1]=1 + * r=4, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=11, m[sigma+1]=12 + * r=4, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=6, m[sigma+1]=8 + * r=4, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=3, m[sigma+1]=13 */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v3 = v3 + m3 + vec2(0u, u32(v3.x + m3.x < v3.x)); - vE = (vE ^ v3).yx; - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16); - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + v56 = vec4(v45.zw, v67.xy); + v74 = vec4(v67.zw, v45.xy); + vFC = vec4(vEF.zw, vCD.xy); + vDE = vec4(vCD.zw, vEF.xy); + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + 
// NOP + s1 = v23 + vec4(Z, m3); + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + vFC = (vFC ^ v01).yxwz; + vDE = (vDE ^ v23).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz; + v74 ^= v89; + v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz; + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + v01 += vec4(m1, Z) + vec4(0u, u32(v01.x + vec4(m1, Z).x < v01.x), Z); + // NOP + + vFC ^= v01; + vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz; + vDE ^= v23; + vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1); + v74 ^= v89; + v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1); + + v45 = vec4(v74.zw, v56.xy); + v67 = vec4(v56.zw, v74.xy); + vCD = vec4(vFC.zw, vDE.xy); + vEF = vec4(vDE.zw, vFC.xy); @@ -707,105 +787,115 @@ fn main(id: vec3) { ****************************************************************************/ /** - * r=5, i=0, a=v[0], b=v[4], c=v[8], d=v[12] + * r=5, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=2, m[sigma+1]=12 + * r=5, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=6, m[sigma+1]=10 + * r=5, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=0, m[sigma+1]=11 + * r=5, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=8, m[sigma+1]=3 */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v0 = v0 + m2 + vec2(0u, u32(v0.x + m2.x < v0.x)); - vC = (vC ^ v0).yx; - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16); - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - 
v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + v01 += vec4(m2, Z) + vec4(0u, u32(v01.x + vec4(m2, Z).x < v01.x), Z); + v23 += vec4(m0, Z) + vec4(0u, u32(v23.x + vec4(m0, Z).x < v23.x), Z); - /** - * r=5, i=1, a=v[1], b=v[5], c=v[9], d=v[13] - */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - vD = (vD ^ v1).yx; - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16); - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + vCD = (vCD ^ v01).yxwz; + vEF = (vEF ^ v23).yxwz; - /** - * r=5, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v2 = v2 + m0 + vec2(0u, u32(v2.x + m0.x < v2.x)); - vE = (vE ^ v2).yx; - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16); - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31); + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; - /** - * r=5, i=3, a=v[3], b=v[7], c=v[11], d=v[15] - */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = (vF ^ v3).yx; - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v3 = v3 + m3 + vec2(0u, u32(v3.x + m3.x < v3.x)); - vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16); - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) << ROTATE_1) 
| ((v7 ^ vB).yx >> ROTATE_31); + v45 ^= v89; + v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz; + v67 ^= vAB; + v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz; - /** - * r=5, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v0 = v0 + m4 + vec2(0u, u32(v0.x + m4.x < v0.x)); - vF = (vF ^ v0).yx; - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16); - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=5, i=5, a=v[1], b=v[6], c=v[11], d=v[12] - */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = (vC ^ v1).yx; - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16); - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31); + // NOP + s1 = v23 + vec4(Z, m3); + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=5, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = (vD ^ v2).yx; - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16); - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + vCD ^= v01; + vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz; + vEF ^= v23; + vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz; + + s0 = v89 + vCD; + 
v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; + + v45 ^= v89; + v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1); + v67 ^= vAB; + v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1); /** - * r=5, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + * r=5, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=4, m[sigma+1]=13 + * r=5, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=7, m[sigma+1]=5 + * r=5, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=15, m[sigma+1]=14 + * r=5, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=1, m[sigma+1]=9 */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v3 = v3 + m1 + vec2(0u, u32(v3.x + m1.x < v3.x)); - vE = (vE ^ v3).yx; - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16); - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + v56 = vec4(v45.zw, v67.xy); + v74 = vec4(v67.zw, v45.xy); + vFC = vec4(vEF.zw, vCD.xy); + vDE = vec4(vCD.zw, vEF.xy); + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + v01 += vec4(m4, Z) + vec4(0u, u32(v01.x + vec4(m4, Z).x < v01.x), Z); + v23 += vec4(Z, m1) + vec4(Z, 0u, u32(v23.z + vec4(Z, m1).z < v23.z)); + + vFC = (vFC ^ v01).yxwz; + vDE = (vDE ^ v23).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz; + v74 ^= v89; + v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz; + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + // NOP + // NOP + + vFC ^= v01; + vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz; + vDE ^= v23; + vDE = 
(vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1); + v74 ^= v89; + v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1); + + v45 = vec4(v74.zw, v56.xy); + v67 = vec4(v56.zw, v74.xy); + vCD = vec4(vFC.zw, vDE.xy); + vEF = vec4(vDE.zw, vFC.xy); @@ -816,105 +906,115 @@ fn main(id: vec3) { ****************************************************************************/ /** - * r=6, i=0, a=v[0], b=v[4], c=v[8], d=v[12] + * r=6, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=12, m[sigma+1]=5 + * r=6, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=1, m[sigma+1]=15 + * r=6, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=14, m[sigma+1]=13 + * r=6, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=4, m[sigma+1]=10 */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = (vC ^ v0).yx; - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16); - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + v01 += vec4(Z, m1) + vec4(Z, 0u, u32(v01.z + vec4(Z, m1).z < v01.z)); + v23 += vec4(Z, m4) + vec4(Z, 0u, u32(v23.z + vec4(Z, m4).z < v23.z)); - /** - * r=6, i=1, a=v[1], b=v[5], c=v[9], d=v[13] - */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m1 + vec2(0u, u32(v1.x + m1.x < v1.x)); - vD = (vD ^ v1).yx; - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16); - 
v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + vCD = (vCD ^ v01).yxwz; + vEF = (vEF ^ v23).yxwz; - /** - * r=6, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - vE = (vE ^ v2).yx; - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16); - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31); + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; - /** - * r=6, i=3, a=v[3], b=v[7], c=v[11], d=v[15] - */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v3 = v3 + m4 + vec2(0u, u32(v3.x + m4.x < v3.x)); - vF = (vF ^ v3).yx; - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16); - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31); + v45 ^= v89; + v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz; + v67 ^= vAB; + v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz; - /** - * r=6, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x)); - vF = (vF ^ v0).yx; - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16); - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & 
CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=6, i=5, a=v[1], b=v[6], c=v[11], d=v[12] - */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = (vC ^ v1).yx; - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x)); - vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16); - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31); + // NOP + // NOP - /** - * r=6, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = (vD ^ v2).yx; - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v2 = v2 + m2 + vec2(0u, u32(v2.x + m2.x < v2.x)); - vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16); - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + vCD ^= v01; + vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz; + vEF ^= v23; + vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz; + + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; + + v45 ^= v89; + v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1); + v67 ^= vAB; + v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1); /** - * r=6, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + * r=6, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=0, m[sigma+1]=7 + * r=6, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=6, m[sigma+1]=3 + * r=6, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=9, m[sigma+1]=2 + * r=6, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=8, m[sigma+1]=11 */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - vE = (vE ^ v3).yx; - v9 = v9 + vE + vec2(0u, u32(v9.x + 
vE.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16); - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + v56 = vec4(v45.zw, v67.xy); + v74 = vec4(v67.zw, v45.xy); + vFC = vec4(vEF.zw, vCD.xy); + vDE = vec4(vCD.zw, vEF.xy); + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + v01 += vec4(m0, Z) + vec4(0u, u32(v01.x + vec4(m0, Z).x < v01.x), Z); + // NOP + + vFC = (vFC ^ v01).yxwz; + vDE = (vDE ^ v23).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz; + v74 ^= v89; + v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz; + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + v01 += vec4(Z, m3) + vec4(Z, 0u, u32(v01.z + vec4(Z, m3).z < v01.z)); + s1 = v23 + vec4(m2, Z); + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + vFC ^= v01; + vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz; + vDE ^= v23; + vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1); + v74 ^= v89; + v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1); + + v45 = vec4(v74.zw, v56.xy); + v67 = vec4(v56.zw, v74.xy); + vCD = vec4(vFC.zw, vDE.xy); + vEF = vec4(vDE.zw, vFC.xy); @@ -925,105 +1025,115 @@ fn main(id: vec3) { ****************************************************************************/ /** - * r=7, i=0, a=v[0], b=v[4], c=v[8], d=v[12] + * r=7, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=13, m[sigma+1]=11 + * r=7, 
i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=7, m[sigma+1]=14 + * r=7, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=12, m[sigma+1]=1 + * r=7, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=3, m[sigma+1]=9 */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = (vC ^ v0).yx; - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16); - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + // NOP + s1 = v23 + vec4(Z, m3); + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=7, i=1, a=v[1], b=v[5], c=v[9], d=v[13] - */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - vD = (vD ^ v1).yx; - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16); - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + vCD = (vCD ^ v01).yxwz; + vEF = (vEF ^ v23).yxwz; - /** - * r=7, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - vE = (vE ^ v2).yx; - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v2 = v2 + m1 + vec2(0u, u32(v2.x + m1.x < v2.x)); - vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16); - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31); + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & 
CARRY).yxwz; - /** - * r=7, i=3, a=v[3], b=v[7], c=v[11], d=v[15] - */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v3 = v3 + m3 + vec2(0u, u32(v3.x + m3.x < v3.x)); - vF = (vF ^ v3).yx; - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16); - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31); + v45 ^= v89; + v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz; + v67 ^= vAB; + v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz; - /** - * r=7, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - vF = (vF ^ v0).yx; - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x)); - vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16); - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=7, i=5, a=v[1], b=v[6], c=v[11], d=v[12] - */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = (vC ^ v1).yx; - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v1 = v1 + m4 + vec2(0u, u32(v1.x + m4.x < v1.x)); - vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16); - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31); + // NOP + v23 += vec4(m1, Z) + vec4(0u, u32(v23.x + vec4(m1, Z).x < v23.x), Z); - /** - * r=7, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + 
v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = (vD ^ v2).yx; - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16); - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + vCD ^= v01; + vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz; + vEF ^= v23; + vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz; + + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; + + v45 ^= v89; + v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1); + v67 ^= vAB; + v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1); /** - * r=7, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + * r=7, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=5, m[sigma+1]=0 + * r=7, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=15, m[sigma+1]=4 + * r=7, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=8, m[sigma+1]=6 + * r=7, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=2, m[sigma+1]=10 */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v3 = v3 + m2 + vec2(0u, u32(v3.x + m2.x < v3.x)); - vE = (vE ^ v3).yx; - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16); - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + v56 = vec4(v45.zw, v67.xy); + v74 = vec4(v67.zw, v45.xy); + vFC = vec4(vEF.zw, vCD.xy); + vDE = vec4(vCD.zw, vEF.xy); + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + // NOP + v23 += vec4(Z, m2) + vec4(Z, 0u, u32(v23.z + vec4(Z, m2).z < v23.z)); + + vFC = (vFC ^ v01).yxwz; + vDE = (vDE ^ v23).yxwz; + 
+ s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz; + v74 ^= v89; + v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz; + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + v01 += vec4(m0, m4) + vec4(0u, u32(v01.x + vec4(m0, m4).x < v01.x), 0u, u32(v01.z + vec4(m0, m4).z < v01.z)); + // NOP + + vFC ^= v01; + vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz; + vDE ^= v23; + vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1); + v74 ^= v89; + v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1); + + v45 = vec4(v74.zw, v56.xy); + v67 = vec4(v56.zw, v74.xy); + vCD = vec4(vFC.zw, vDE.xy); + vEF = vec4(vDE.zw, vFC.xy); @@ -1034,105 +1144,115 @@ fn main(id: vec3) { ****************************************************************************/ /** - * r=8, i=0, a=v[0], b=v[4], c=v[8], d=v[12] + * r=8, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=6, m[sigma+1]=15 + * r=8, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=14, m[sigma+1]=9 + * r=8, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=11, m[sigma+1]=3 + * r=8, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=0, m[sigma+1]=8 */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = (vC ^ v0).yx; - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16); - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + 
v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + // NOP + v23 += vec4(Z, m0); + v23.w += u32(v23.z < m0.x); - /** - * r=8, i=1, a=v[1], b=v[5], c=v[9], d=v[13] - */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - vD = (vD ^ v1).yx; - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16); - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + vCD = (vCD ^ v01).yxwz; + vEF = (vEF ^ v23).yxwz; - /** - * r=8, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - vE = (vE ^ v2).yx; - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v2 = v2 + m3 + vec2(0u, u32(v2.x + m3.x < v2.x)); - vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16); - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31); + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; - /** - * r=8, i=3, a=v[3], b=v[7], c=v[11], d=v[15] - */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v3 = v3 + m0 + vec2(0u, u32(v3.x + m0.x < v3.x)); - vF = (vF ^ v3).yx; - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16); - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31); + v45 ^= v89; + v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz; + v67 ^= vAB; + v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz; - /** - * r=8, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - 
v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - vF = (vF ^ v0).yx; - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v0 = v0 + m2 + vec2(0u, u32(v0.x + m2.x < v0.x)); - vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16); - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=8, i=5, a=v[1], b=v[6], c=v[11], d=v[12] - */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = (vC ^ v1).yx; - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16); - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31); + // NOP + v23 += vec4(m3, Z) + vec4(0u, u32(v23.x + vec4(m3, Z).x < v23.x), Z); - /** - * r=8, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v2 = v2 + m1 + vec2(0u, u32(v2.x + m1.x < v2.x)); - vD = (vD ^ v2).yx; - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x)); - vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16); - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + vCD ^= v01; + vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz; + vEF ^= v23; + vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz; + + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; + + v45 ^= v89; 
+ v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1); + v67 ^= vAB; + v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1); /** - * r=8, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + * r=8, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=12, m[sigma+1]=2 + * r=8, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=13, m[sigma+1]=7 + * r=8, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=1, m[sigma+1]=4 + * r=8, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=10, m[sigma+1]=5 */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - vE = (vE ^ v3).yx; - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16); - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + v56 = vec4(v45.zw, v67.xy); + v74 = vec4(v67.zw, v45.xy); + vFC = vec4(vEF.zw, vCD.xy); + vDE = vec4(vCD.zw, vEF.xy); + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + // NOP + v23 += vec4(m1, Z) + vec4(0u, u32(v23.x + vec4(m1, Z).x < v23.x), Z); + + vFC = (vFC ^ v01).yxwz; + vDE = (vDE ^ v23).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz; + v74 ^= v89; + v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz; + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + v01 += vec4(m2, Z) + vec4(0u, u32(v01.x + vec4(m2, Z).x < v01.x), Z); + v23 += vec4(m4, Z) + vec4(0u, u32(v23.x + vec4(m4, Z).x < v23.x), Z); + + vFC ^= v01; + vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz; + vDE ^= v23; + vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & 
CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1); + v74 ^= v89; + v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1); + + v45 = vec4(v74.zw, v56.xy); + v67 = vec4(v56.zw, v74.xy); + vCD = vec4(vFC.zw, vDE.xy); + vEF = vec4(vDE.zw, vFC.xy); @@ -1143,105 +1263,118 @@ fn main(id: vec3) { ****************************************************************************/ /** - * r=9, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = (vC ^ v0).yx; - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v0 = v0 + m2 + vec2(0u, u32(v0.x + m2.x < v0.x)); - vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16); - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); + * r=9, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=10, m[sigma+1]=2 + * r=9, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=8, m[sigma+1]=4 + * r=9, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=7, m[sigma+1]=6 + * r=9, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=1, m[sigma+1]=5 + */ + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + // NOP + v23 += vec4(Z, m1); + v23.w += u32(v23.z < m1.x); + + vCD = (vCD ^ v01).yxwz; + vEF = (vEF ^ v23).yxwz; - /** - * r=9, i=1, a=v[1], b=v[5], c=v[9], d=v[13] - */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - vD = (vD ^ v1).yx; - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m4 + vec2(0u, u32(v1.x + m4.x < v1.x)); - vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16); - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) 
<< ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; - /** - * r=9, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - vE = (vE ^ v2).yx; - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16); - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31); + v45 ^= v89; + v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz; + v67 ^= vAB; + v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz; - /** - * r=9, i=3, a=v[3], b=v[7], c=v[11], d=v[15] - */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v3 = v3 + m1 + vec2(0u, u32(v3.x + m1.x < v3.x)); - vF = (vF ^ v3).yx; - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16); - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=9, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - vF = (vF ^ v0).yx; - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16); - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31); + s0 = v01 + vec4(m2, m4); + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + // NOP - /** - * r=9, i=5, 
a=v[1], b=v[6], c=v[11], d=v[12] - */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = (vC ^ v1).yx; - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16); - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31); + vCD ^= v01; + vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz; + vEF ^= v23; + vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz; - /** - * r=9, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v2 = v2 + m3 + vec2(0u, u32(v2.x + m3.x < v2.x)); - vD = (vD ^ v2).yx; - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16); - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; + + v45 ^= v89; + v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1); + v67 ^= vAB; + v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1); /** - * r=9, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + * r=9, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=15, m[sigma+1]=11 + * r=9, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=9, m[sigma+1]=14 + * r=9, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=3, m[sigma+1]=12 + * r=9, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=13, m[sigma+1]=0 */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - vE = (vE ^ v3).yx; - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v3 = v3 + m0 + vec2(0u, u32(v3.x + 
m0.x < v3.x)); - vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16); - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + v56 = vec4(v45.zw, v67.xy); + v74 = vec4(v67.zw, v45.xy); + vFC = vec4(vEF.zw, vCD.xy); + vDE = vec4(vCD.zw, vEF.xy); + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + // NOP + v23 += vec4(m3, Z); + v23.y += u32(v23.x < m3.x); + + vFC = (vFC ^ v01).yxwz; + vDE = (vDE ^ v23).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz; + v74 ^= v89; + v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz; + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + // NOP + v23 += vec4(Z, m0); + v23.w += u32(v23.z < m0.x); + + vFC ^= v01; + vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz; + vDE ^= v23; + vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1); + v74 ^= v89; + v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1); + + v45 = vec4(v74.zw, v56.xy); + v67 = vec4(v56.zw, v74.xy); + vCD = vec4(vFC.zw, vDE.xy); + vEF = vec4(vDE.zw, vFC.xy); @@ -1252,105 +1385,117 @@ fn main(id: vec3) { ****************************************************************************/ /** - * r=10, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x)); - vC = (vC ^ v0).yx; - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); 
- v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x)); - vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16); - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); + * r=10, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=0, m[sigma+1]=1 + * r=10, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=2, m[sigma+1]=3 + * r=10, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=4, m[sigma+1]=5 + * r=10, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=6, m[sigma+1]=7 + */ + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + s0 = v01 + vec4(m0, m2); + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + v23 += vec4(m4, Z); + v23.y += u32(v23.x < m4.x); - /** - * r=10, i=1, a=v[1], b=v[5], c=v[9], d=v[13] - */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m2 + vec2(0u, u32(v1.x + m2.x < v1.x)); - vD = (vD ^ v1).yx; - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x)); - vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16); - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + vCD = (vCD ^ v01).yxwz; + vEF = (vEF ^ v23).yxwz; - /** - * r=10, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x)); - vE = (vE ^ v2).yx; - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16); - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31); + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + 
s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; - /** - * r=10, i=3, a=v[3], b=v[7], c=v[11], d=v[15] - */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = (vF ^ v3).yx; - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16); - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31); + v45 ^= v89; + v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz; + v67 ^= vAB; + v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz; - /** - * r=10, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - vF = (vF ^ v0).yx; - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16); - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=10, i=5, a=v[1], b=v[6], c=v[11], d=v[12] - */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = (vC ^ v1).yx; - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16); - vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x)); - v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31); + s0 = v01 + vec4(m1, m3); + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + // NOP - /** - * r=10, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = (vD ^ v2).yx; - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < 
v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16); - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + vCD ^= v01; + vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz; + vEF ^= v23; + vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz; + + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; + + v45 ^= v89; + v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1); + v67 ^= vAB; + v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1); /** - * r=10, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + * r=10, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=8, m[sigma+1]=9 + * r=10, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=10, m[sigma+1]=11 + * r=10, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=12, m[sigma+1]=13 + * r=10, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=14, m[sigma+1]=15 */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - vE = (vE ^ v3).yx; - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16); - v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + v56 = vec4(v45.zw, v67.xy); + v74 = vec4(v67.zw, v45.xy); + vFC = vec4(vEF.zw, vCD.xy); + vDE = vec4(vCD.zw, vEF.xy); + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + // NOP + // NOP + + vFC = (vFC ^ v01).yxwz; + vDE = (vDE ^ v23).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz; + v74 ^= v89; + 
v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz; + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + // NOP + // NOP + + vFC ^= v01; + vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz; + vDE ^= v23; + vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1); + v74 ^= v89; + v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1); + + v45 = vec4(v74.zw, v56.xy); + v67 = vec4(v56.zw, v74.xy); + vCD = vec4(vFC.zw, vDE.xy); + vEF = vec4(vDE.zw, vFC.xy); @@ -1361,88 +1506,118 @@ fn main(id: vec3) { ****************************************************************************/ /** - * r=11, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = (vC ^ v0).yx; - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16); - v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x)); - // skip since it does not affect the final values of `v0` and `v8` + * r=11, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=14, m[sigma+1]=10 + * r=11, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=4, m[sigma+1]=8 + * r=11, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=9, m[sigma+1]=15 + * r=11, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=13, m[sigma+1]=6 + */ + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + v01 += vec4(Z, m4); + v01.w += u32(v01.z < m4.x); + // NOP + + vCD = (vCD ^ v01).yxwz; + vEF = (vEF ^ v23).yxwz; - /** - * r=11, i=1, a=v[1], b=v[5], c=v[9], d=v[13] - */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m4 + 
vec2(0u, u32(v1.x + m4.x < v1.x)); - vD = (vD ^ v1).yx; - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16); - v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; - /** - * r=11, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - vE = (vE ^ v2).yx; - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16); - vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x)); - // skip since it does not affect the final values of `v0` and `v8` + v45 ^= v89; + v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz; + v67 ^= vAB; + v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz; - /** - * r=11, i=3, a=v[3], b=v[7], c=v[11], d=v[15] - */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = (vF ^ v3).yx; - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16); - vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x)); - v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31); + s0 = v01 + v45; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v67; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; - /** - * r=11, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x)); - vF = (vF ^ v0).yx; - vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x)); - v5 = ((v5 ^ vA) >> 
ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - // skip since it does not affect the final values of `v0` and `v8` - // skip since it does not affect the final values of `v0` and `v8` - // skip since it does not affect the final values of `v0` and `v8` + // NOP + // NOP - /** - * r=11, i=5, a=v[1], b=v[6], c=v[11], d=v[12] - */ - // skip entire step since it does not affect the final values of `v0` and `v8` + vCD ^= v01; + vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz; + vEF ^= v23; + vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz; - /** - * r=11, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = (vD ^ v2).yx; - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16); - v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x)); - // skip since we already have the final values of `v0` and `v8` + s0 = v89 + vCD; + v89 = s0 + (vec4(s0 < v89) & CARRY).yxwz; + s1 = vAB + vEF; + vAB = s1 + (vec4(s1 < vAB) & CARRY).yxwz; + + v45 ^= v89; + v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1); + v67 ^= vAB; + v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1); /** - * r=11, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + * r=11, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=1, m[sigma+1]=12 + * r=11, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=0, m[sigma+1]=2 + * r=11, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=11, m[sigma+1]=7 + * r=11, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=5, m[sigma+1]=3 */ - // skip entire step since we already have the final values of `v0` and `v8` + v56 = vec4(v45.zw, v67.xy); + v74 = vec4(v67.zw, v45.xy); + vFC = vec4(vEF.zw, vCD.xy); + vDE = vec4(vCD.zw, vEF.xy); + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & 
CARRY).yxwz; + + s0 = v01 + vec4(m1, m0); + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + // NOP + + vFC = (vFC ^ v01).yxwz; + vDE = (vDE ^ v23).yxwz; + + s0 = vAB + vFC; + vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; + + v56 ^= vAB; + v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz; + v74 ^= v89; + v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz; + + s0 = v01 + v56; + v01 = s0 + (vec4(s0 < v01) & CARRY).yxwz; + s1 = v23 + v74; + v23 = s1 + (vec4(s1 < v23) & CARRY).yxwz; + + v01 += vec4(Z, m2); + v01.w += u32(v01.z < m2.x); + v23 += vec4(Z, m3); + v23.w += u32(v23.z < m3.x); + + // vFC ^= v01; + // vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz; + vDE ^= v23; + vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz; + + // s0 = vAB + vFC; + // vAB = s0 + (vec4(s0 < vAB) & CARRY).yxwz; + s1 = v89 + vDE; + v89 = s1 + (vec4(s1 < v89) & CARRY).yxwz; // mask first, then swizzle (as in every other carry): puts the low-word overflow into the high word, and v89.y feeds the threshold check + + // v56 ^= vAB; + // v74 ^= v89; + // v56 = (v56 << ROTATE_1) | (v56 >> ROTATE_31).yxwz; + // v74 = (v74 << ROTATE_1) | (v74 >> ROTATE_31).yxwz; + + // v45 = vec4(v74.zw, v56.xy); + // v67 = vec4(v56.zw, v74.xy); + // vCD = vec4(vFC.zw, vDE.xy); + // vEF = vec4(vDE.zw, vFC.xy); @@ -1456,7 +1631,7 @@ fn main(id: vec3) { * Set nonce if it passes the threshold and no other thread has set it. * Only high bits are needed for comparison since threshold low bits are zero. 
*/ - if ((BLAKE2B_IV_0.y ^ v0.y ^ v8.y) > ubo.threshold && atomicLoad(&work.found) == 0u) { + if ((BLAKE2B_IV_0.y ^ v01.y ^ v89.y) > ubo.threshold && atomicLoad(&work.found) == 0u) { atomicStore(&work.found, 1u); work.nonce = m0; } diff --git a/src/shaders/gl-downsample.ts b/src/shaders/gl-downsample.ts new file mode 100644 index 0000000..edfc8dc --- /dev/null +++ b/src/shaders/gl-downsample.ts @@ -0,0 +1,36 @@ +// SPDX-FileCopyrightText: 2025 Chris Duncan +// SPDX-License-Identifier: GPL-3.0-or-later + +export const NanoPowGlDownsampleShader = `#version 300 es +#pragma vscode_glsllint_stage: frag +#ifdef GL_FRAGMENT_PRECISION_HIGH +precision highp float; +#else +precision mediump float; +#endif +precision highp int; + +out uvec4 nonce; + +// source texture to be downsampled +uniform highp usampler2D src; + +void main() { + nonce = uvec4(0u); + vec2 inputSize = vec2(textureSize(src, 0)); + vec2 texel = vec2(1.0) / inputSize; + vec2 blockCoord = (floor(gl_FragCoord.xy) * 2.0 + vec2(0.5)) / inputSize; + + uvec4 pixel = texture(src, blockCoord); + nonce = pixel.x == 0u ? nonce : pixel; + + pixel = texture(src, blockCoord + vec2(texel.x, 0.0)); + nonce = pixel.x == 0u ? nonce : pixel; + + pixel = texture(src, blockCoord + vec2(0.0, texel.y)); + nonce = pixel.x == 0u ? nonce : pixel; + + pixel = texture(src, blockCoord + vec2(texel.x, texel.y)); + nonce = pixel.x == 0u ? 
nonce : pixel; +} +` diff --git a/src/shaders/gl-draw.ts b/src/shaders/gl-draw.ts new file mode 100644 index 0000000..b4d5d8b --- /dev/null +++ b/src/shaders/gl-draw.ts @@ -0,0 +1,211 @@ +// SPDX-FileCopyrightText: 2025 Chris Duncan +// SPDX-FileContributor: Ben Green +// SPDX-License-Identifier: GPL-3.0-or-later AND MIT + +export const NanoPowGlDrawShader = `#version 300 es +#pragma vscode_glsllint_stage: frag +#ifdef GL_FRAGMENT_PRECISION_HIGH +precision highp float; +#else +precision mediump float; +#endif +precision highp int; // fragment-stage default for int/uint is mediump (16-bit minimum); the uint math below needs 32 bits + +out uvec4 nonce; + +// blockhash - Array of precalculated block hash components +// threshold - 0xfffffff8 for send/change blocks, 0xfffffe00 for all else +// search - Checks all pixels if true, else only checks 1 pixel to validate +layout(std140) uniform UBO { + uint blockhash[8]; + uint threshold; + bool search; +}; + +// Random work seed values +layout(std140) uniform WORK { + uvec2 seed; +}; + +// Defined separately from uint v[0].y below as the original value is required +// to calculate the second uint32 of the digest for threshold comparison +const uint BLAKE2B_IV32_1 = 0x6A09E667u; + +// Used during G for vector bit rotations +const uvec4 ROTATE_1 = uvec4(1u); +const uvec4 ROTATE_8 = uvec4(8u); +const uvec4 ROTATE_16 = uvec4(16u); +const uvec4 ROTATE_24 = uvec4(24u); +const uvec4 ROTATE_31 = uvec4(31u); + +// Both buffers represent 16 uint64s as 32 uint32s +// because that's what GLSL offers, just like Javascript + +// Compression buffer, initialized to 2 instances of the initialization vector +// The following values have been modified from the BLAKE2B_IV: +// OUTLEN is constant 8 bytes +// v[0] ^= 0x01010000u ^ uint(OUTLEN); +// INLEN is constant 40 bytes: work value (8) + block hash (32) +// v[12] ^= uint(INLEN); +// It's always the "last" compression at this INLEN +// v[14] = ~v[14]; +const uvec2 blake2b_iv[16] = uvec2[16]( + uvec2(0xF2BDC900u, 0x6A09E667u), + uvec2(0x84CAA73Bu, 0xBB67AE85u), + uvec2(0xFE94F82Bu, 0x3C6EF372u), + 
uvec2(0x5F1D36F1u, 0xA54FF53Au), + uvec2(0xADE682D1u, 0x510E527Fu), + uvec2(0x2B3E6C1Fu, 0x9B05688Cu), + uvec2(0xFB41BD6Bu, 0x1F83D9ABu), + uvec2(0x137E2179u, 0x5BE0CD19u), + uvec2(0xF3BCC908u, 0x6A09E667u), + uvec2(0x84CAA73Bu, 0xBB67AE85u), + uvec2(0xFE94F82Bu, 0x3C6EF372u), + uvec2(0x5F1D36F1u, 0xA54FF53Au), + uvec2(0xADE682F9u, 0x510E527Fu), + uvec2(0x2B3E6C1Fu, 0x9B05688Cu), + uvec2(0x04BE4294u, 0xE07C2654u), + uvec2(0x137E2179u, 0x5BE0CD19u) +); + +// Iterated initialization vector +uvec2 v[16]; + +// Input data buffer +uvec2 m[16]; + +// G mixing function, compressing two subprocesses into one +void G ( + uint a0, uint b0, uint c0, uint d0, uvec2 x0, uvec2 y0, + uint a1, uint b1, uint c1, uint d1, uvec2 x1, uvec2 y1 +) { + uvec4 a = uvec4(v[a0], v[a1]); + uvec4 b = uvec4(v[b0], v[b1]); + uvec4 c = uvec4(v[c0], v[c1]); + uvec4 d = uvec4(v[d0], v[d1]); + uvec4 mx = uvec4(x0, x1); + uvec4 my = uvec4(y0, y1); + + a = a + b + uvec4(0u, uint(a.x + b.x < a.x), 0u, uint(a.z + b.z < a.z)); + a = a + mx + uvec4(0u, uint(a.x + mx.x < a.x), 0u, uint(a.z + mx.z < a.z)); + d = (d ^ a).yxwz; + c = c + d + uvec4(0u, uint(c.x + d.x < c.x), 0u, uint(c.z + d.z < c.z)); + b = ((b ^ c) >> ROTATE_24) | ((b ^ c) << ROTATE_8).yxwz; + a = a + b + uvec4(0u, uint(a.x + b.x < b.x), 0u, uint(a.z + b.z < b.z)); + a = a + my + uvec4(0u, uint(a.x + my.x < a.x), 0u, uint(a.z + my.z < a.z)); + d = ((d ^ a) >> ROTATE_16) | ((d ^ a) << ROTATE_16).yxwz; + c = c + d + uvec4(0u, uint(c.x + d.x < c.x), 0u, uint(c.z + d.z < c.z)); + b = ((b ^ c) >> ROTATE_31).yxwz | ((b ^ c) << ROTATE_1); + + v[a0] = a.xy; + v[b0] = b.xy; + v[c0] = c.xy; + v[d0] = d.xy; + v[a1] = a.zw; + v[b1] = b.zw; + v[c1] = c.zw; + v[d1] = d.zw; +} + +void main() { + // Initialize fragment output + nonce = uvec4(0u); + + // Nonce uniquely differentiated by pixel location + m[0u] = seed ^ uvec2(gl_FragCoord); + + // Block hash + m[1u] = uvec2(blockhash[0u], blockhash[1u]); + m[2u] = uvec2(blockhash[2u], blockhash[3u]); + m[3u] = 
uvec2(blockhash[4u], blockhash[5u]); + m[4u] = uvec2(blockhash[6u], blockhash[7u]); + + // Reset v + v = blake2b_iv; + + // Twelve rounds of G mixing + + // Round 0 + G(0u, 4u, 8u, 12u, m[0u], m[1u], 1u, 5u, 9u, 13u, m[2u], m[3u]); + G(2u, 6u, 10u, 14u, m[4u], m[5u], 3u, 7u, 11u, 15u, m[6u], m[7u]); + G(0u, 5u, 10u, 15u, m[8u], m[9u], 1u, 6u, 11u, 12u, m[10u], m[11u]); + G(2u, 7u, 8u, 13u, m[12u], m[13u], 3u, 4u, 9u, 14u, m[14u], m[15u]); + + // Round 1 + G(0u, 4u, 8u, 12u, m[14u], m[10u], 1u, 5u, 9u, 13u, m[4u], m[8u]); + G(2u, 6u, 10u, 14u, m[9u], m[15u], 3u, 7u, 11u, 15u, m[13u], m[6u]); + G(0u, 5u, 10u, 15u, m[1u], m[12u], 1u, 6u, 11u, 12u, m[0u], m[2u]); + G(2u, 7u, 8u, 13u, m[11u], m[7u], 3u, 4u, 9u, 14u, m[5u], m[3u]); + + // Round 2 + G(0u, 4u, 8u, 12u, m[11u], m[8u], 1u, 5u, 9u, 13u, m[12u], m[0u]); + G(2u, 6u, 10u, 14u, m[5u], m[2u], 3u, 7u, 11u, 15u, m[15u], m[13u]); + G(0u, 5u, 10u, 15u, m[10u], m[14u], 1u, 6u, 11u, 12u, m[3u], m[6u]); + G(2u, 7u, 8u, 13u, m[7u], m[1u], 3u, 4u, 9u, 14u, m[9u], m[4u]); + + // Round 3 + G(0u, 4u, 8u, 12u, m[7u], m[9u], 1u, 5u, 9u, 13u, m[3u], m[1u]); + G(2u, 6u, 10u, 14u, m[13u], m[12u], 3u, 7u, 11u, 15u, m[11u], m[14u]); + G(0u, 5u, 10u, 15u, m[2u], m[6u], 1u, 6u, 11u, 12u, m[5u], m[10u]); + G(2u, 7u, 8u, 13u, m[4u], m[0u], 3u, 4u, 9u, 14u, m[15u], m[8u]); + + // Round 4 + G(0u, 4u, 8u, 12u, m[9u], m[0u], 1u, 5u, 9u, 13u, m[5u], m[7u]); + G(2u, 6u, 10u, 14u, m[2u], m[4u], 3u, 7u, 11u, 15u, m[10u], m[15u]); + G(0u, 5u, 10u, 15u, m[14u], m[1u], 1u, 6u, 11u, 12u, m[11u], m[12u]); + G(2u, 7u, 8u, 13u, m[6u], m[8u], 3u, 4u, 9u, 14u, m[3u], m[13u]); + + // Round 5 + G(0u, 4u, 8u, 12u, m[2u], m[12u], 1u, 5u, 9u, 13u, m[6u], m[10u]); + G(2u, 6u, 10u, 14u, m[0u], m[11u], 3u, 7u, 11u, 15u, m[8u], m[3u]); + G(0u, 5u, 10u, 15u, m[4u], m[13u], 1u, 6u, 11u, 12u, m[7u], m[5u]); + G(2u, 7u, 8u, 13u, m[15u], m[14u], 3u, 4u, 9u, 14u, m[1u], m[9u]); + + // Round 6 + G(0u, 4u, 8u, 12u, m[12u], m[5u], 1u, 5u, 9u, 13u, m[1u], m[15u]); + G(2u, 
6u, 10u, 14u, m[14u], m[13u], 3u, 7u, 11u, 15u, m[4u], m[10u]); + G(0u, 5u, 10u, 15u, m[0u], m[7u], 1u, 6u, 11u, 12u, m[6u], m[3u]); + G(2u, 7u, 8u, 13u, m[9u], m[2u], 3u, 4u, 9u, 14u, m[8u], m[11u]); + + // Round 7 + G(0u, 4u, 8u, 12u, m[13u], m[11u], 1u, 5u, 9u, 13u, m[7u], m[14u]); + G(2u, 6u, 10u, 14u, m[12u], m[1u], 3u, 7u, 11u, 15u, m[3u], m[9u]); + G(0u, 5u, 10u, 15u, m[5u], m[0u], 1u, 6u, 11u, 12u, m[15u], m[4u]); + G(2u, 7u, 8u, 13u, m[8u], m[6u], 3u, 4u, 9u, 14u, m[2u], m[10u]); + + // Round 8 + G(0u, 4u, 8u, 12u, m[6u], m[15u], 1u, 5u, 9u, 13u, m[14u], m[9u]); + G(2u, 6u, 10u, 14u, m[11u], m[3u], 3u, 7u, 11u, 15u, m[0u], m[8u]); + G(0u, 5u, 10u, 15u, m[12u], m[2u], 1u, 6u, 11u, 12u, m[13u], m[7u]); + G(2u, 7u, 8u, 13u, m[1u], m[4u], 3u, 4u, 9u, 14u, m[10u], m[5u]); + + // Round 9 + G(0u, 4u, 8u, 12u, m[10u], m[2u], 1u, 5u, 9u, 13u, m[8u], m[4u]); + G(2u, 6u, 10u, 14u, m[7u], m[6u], 3u, 7u, 11u, 15u, m[1u], m[5u]); + G(0u, 5u, 10u, 15u, m[15u], m[11u], 1u, 6u, 11u, 12u, m[9u], m[14u]); + G(2u, 7u, 8u, 13u, m[3u], m[12u], 3u, 4u, 9u, 14u, m[13u], m[0u]); + + // Round 10 + G(0u, 4u, 8u, 12u, m[0u], m[1u], 1u, 5u, 9u, 13u, m[2u], m[3u]); + G(2u, 6u, 10u, 14u, m[4u], m[5u], 3u, 7u, 11u, 15u, m[6u], m[7u]); + G(0u, 5u, 10u, 15u, m[8u], m[9u], 1u, 6u, 11u, 12u, m[10u], m[11u]); + G(2u, 7u, 8u, 13u, m[12u], m[13u], 3u, 4u, 9u, 14u, m[14u], m[15u]); + + // Round 11 + G(0u, 4u, 8u, 12u, m[14u], m[10u], 1u, 5u, 9u, 13u, m[4u], m[8u]); + G(2u, 6u, 10u, 14u, m[9u], m[15u], 3u, 7u, 11u, 15u, m[13u], m[6u]); + G(0u, 5u, 10u, 15u, m[1u], m[12u], 1u, 6u, 11u, 12u, m[0u], m[2u]); + G(2u, 7u, 8u, 13u, m[11u], m[7u], 3u, 4u, 9u, 14u, m[5u], m[3u]); + + // Pixel data set from work seed values + // Finalize digest from high bits, low bits can be safely ignored + if ((BLAKE2B_IV32_1 ^ v[0u].y ^ v[8u].y) >= threshold && (search || uvec2(gl_FragCoord) == uvec2(0u))) { + nonce = uvec4(1u, m[0u].y, m[0u].x, (uint(gl_FragCoord.x) << 16u) | uint(gl_FragCoord.y)); + } + + // Valid 
nonce not found + if (nonce.x == 0u) { + discard; + } +} +` diff --git a/src/shaders/gl-fragment.ts b/src/shaders/gl-fragment.ts deleted file mode 100644 index 1bed519..0000000 --- a/src/shaders/gl-fragment.ts +++ /dev/null @@ -1,132 +0,0 @@ -// SPDX-FileCopyrightText: 2025 Chris Duncan -// SPDX-FileContributor: Ben Green -// SPDX-License-Identifier: GPL-3.0-or-later AND MIT - -export const NanoPowGlFragmentShader = `#version 300 es -#pragma vscode_glsllint_stage: frag -precision highp float; - -in vec2 uv_pos; -out uvec4 nonce; - -// blockhash - array of precalculated block hash components -// threshold - 0xfffffff8 for send/change blocks, 0xfffffe00 for all else -// workload - Defines canvas size -layout(std140) uniform UBO { - uint blockhash[8]; - uint threshold; - float workload; -}; - -// Random work values -layout(std140) uniform WORK { - uvec2 work; -}; - -// Defined separately from uint v[32] below as the original value is required -// to calculate the second uint32 of the digest for threshold comparison -const uint BLAKE2B_IV32_1 = 0x6A09E667u; - -// Used during G for vector bit rotations -const uvec2 ROTATE_1 = uvec2(1u, 1u); -const uvec2 ROTATE_8 = uvec2(8u, 8u); -const uvec2 ROTATE_16 = uvec2(16u, 16u); -const uvec2 ROTATE_24 = uvec2(24u, 24u); -const uvec2 ROTATE_31 = uvec2(31u, 31u); - -// Both buffers represent 16 uint64s as 32 uint32s -// because that's what GLSL offers, just like Javascript - -// Compression buffer, intialized to 2 instances of the initialization vector -// The following values have been modified from the BLAKE2B_IV: -// OUTLEN is constant 8 bytes -// v[0] ^= 0x01010000u ^ uint(OUTLEN); -// INLEN is constant 40 bytes: work value (8) + block hash (32) -// v[24] ^= uint(INLEN); -// It's always the "last" compression at this INLEN -// v[28] = ~v[28]; -// v[29] = ~v[29]; -uvec2 v[16] = uvec2[16]( - uvec2(0xF2BDC900u, 0x6A09E667u), - uvec2(0x84CAA73Bu, 0xBB67AE85u), - uvec2(0xFE94F82Bu, 0x3C6EF372u), - uvec2(0x5F1D36F1u, 0xA54FF53Au), - 
uvec2(0xADE682D1u, 0x510E527Fu), - uvec2(0x2B3E6C1Fu, 0x9B05688Cu), - uvec2(0xFB41BD6Bu, 0x1F83D9ABu), - uvec2(0x137E2179u, 0x5BE0CD19u), - uvec2(0xF3BCC908u, 0x6A09E667u), - uvec2(0x84CAA73Bu, 0xBB67AE85u), - uvec2(0xFE94F82Bu, 0x3C6EF372u), - uvec2(0x5F1D36F1u, 0xA54FF53Au), - uvec2(0xADE682F9u, 0x510E527Fu), - uvec2(0x2B3E6C1Fu, 0x9B05688Cu), - uvec2(0x04BE4294u, 0xE07C2654u), - uvec2(0x137E2179u, 0x5BE0CD19u) -); - -// Input data buffer -uvec2 m[16]; - -// Offsets into the input data buffer for each mixing step -const uint SIGMA[192] = uint[192]( - 0u,1u,2u,3u,4u,5u,6u,7u,8u,9u,10u,11u,12u,13u,14u,15u, - 14u,10u,4u,8u,9u,15u,13u,6u,1u,12u,0u,2u,11u,7u,5u,3u, - 11u,8u,12u,0u,5u,2u,15u,13u,10u,14u,3u,6u,7u,1u,9u,4u, - 7u,9u,3u,1u,13u,12u,11u,14u,2u,6u,5u,10u,4u,0u,15u,8u, - 9u,0u,5u,7u,2u,4u,10u,15u,14u,1u,11u,12u,6u,8u,3u,13u, - 2u,12u,6u,10u,0u,11u,8u,3u,4u,13u,7u,5u,15u,14u,1u,9u, - 12u,5u,1u,15u,14u,13u,4u,10u,0u,7u,6u,3u,9u,2u,8u,11u, - 13u,11u,7u,14u,12u,1u,3u,9u,5u,0u,15u,4u,8u,6u,2u,10u, - 6u,15u,14u,9u,11u,3u,0u,8u,12u,2u,13u,7u,1u,4u,10u,5u, - 10u,2u,8u,4u,7u,6u,1u,5u,15u,11u,9u,14u,3u,12u,13u,0u, - 0u,1u,2u,3u,4u,5u,6u,7u,8u,9u,10u,11u,12u,13u,14u,15u, - 14u,10u,4u,8u,9u,15u,13u,6u,1u,12u,0u,2u,11u,7u,5u,3u -); - -// G mixing function -void G (uint a, uint b, uint c, uint d, uint x, uint y) { - v[a] = v[a] + v[b] + uvec2(0u, uint(v[a].x + v[b].x < v[b].x)); - v[a] = v[a] + m[x] + uvec2(0u, uint(v[a].x + m[x].x < m[x].x)); - v[d] = (v[d] ^ v[a]).yx; - v[c] = v[c] + v[d] + uvec2(0u, uint(v[c].x + v[d].x < v[d].x)); - v[b] = ((v[b] ^ v[c]) >> ROTATE_24) | ((v[b] ^ v[c]).yx << ROTATE_8); - v[a] = v[a] + v[b] + uvec2(0u, uint(v[a].x + v[b].x < v[b].x)); - v[a] = v[a] + m[y] + uvec2(0u, uint(v[a].x + m[y].x < m[y].x)); - v[d] = ((v[d] ^ v[a]) >> ROTATE_16) | ((v[d] ^ v[a]).yx << ROTATE_16); - v[c] = v[c] + v[d] + uvec2(0u, uint(v[c].x + v[d].x < v[d].x)); - v[b] = ((v[b] ^ v[c]).yx >> ROTATE_31) | ((v[b] ^ v[c]) << ROTATE_1); -} - -void main() { - // Nonce 
uniquely differentiated by pixel location - m[0u].x = work.x ^ uint(uv_pos.x * workload); - m[0u].y = work.y ^ uint(uv_pos.y * workload); - - // Block hash - m[1u] = uvec2(blockhash[0u], blockhash[1u]); - m[2u] = uvec2(blockhash[2u], blockhash[3u]); - m[3u] = uvec2(blockhash[4u], blockhash[5u]); - m[4u] = uvec2(blockhash[6u], blockhash[7u]); - - // twelve rounds of mixing - for(uint i = 0u; i < 12u; i = i + 1u) { - G(0u, 4u, 8u, 12u, SIGMA[i * 16u + 0u], SIGMA[i * 16u + 1u]); - G(1u, 5u, 9u, 13u, SIGMA[i * 16u + 2u], SIGMA[i * 16u + 3u]); - G(2u, 6u, 10u, 14u, SIGMA[i * 16u + 4u], SIGMA[i * 16u + 5u]); - G(3u, 7u, 11u, 15u, SIGMA[i * 16u + 6u], SIGMA[i * 16u + 7u]); - G(0u, 5u, 10u, 15u, SIGMA[i * 16u + 8u], SIGMA[i * 16u + 9u]); - G(1u, 6u, 11u, 12u, SIGMA[i * 16u + 10u], SIGMA[i * 16u + 11u]); - G(2u, 7u, 8u, 13u, SIGMA[i * 16u + 12u], SIGMA[i * 16u + 13u]); - G(3u, 4u, 9u, 14u, SIGMA[i * 16u + 14u], SIGMA[i * 16u + 15u]); - } - - // Pixel data set from work values - // Finalize digest from high bits, low bits can be safely ignored - if ((BLAKE2B_IV32_1 ^ v[0u].y ^ v[8u].y) > threshold) { - nonce = uvec4(1u, m[0].y, m[0].x, 1u); - } else { - discard; - } -} -` diff --git a/src/shaders/gl-vertex.ts b/src/shaders/gl-vertex.ts index 6227dee..9378c78 100644 --- a/src/shaders/gl-vertex.ts +++ b/src/shaders/gl-vertex.ts @@ -4,14 +4,15 @@ export const NanoPowGlVertexShader = `#version 300 es #pragma vscode_glsllint_stage: vert +#ifdef GL_FRAGMENT_PRECISION_HIGH precision highp float; -layout (location=0) in vec4 position; -layout (location=1) in vec2 uv; +#else +precision mediump float; +#endif -out vec2 uv_pos; +layout (location=0) in vec4 position; void main() { - uv_pos = uv; gl_Position = position; } ` diff --git a/src/shaders/index.ts b/src/shaders/index.ts index 5be438e..81042f5 100644 --- a/src/shaders/index.ts +++ b/src/shaders/index.ts @@ -2,11 +2,13 @@ // SPDX-License-Identifier: GPL-3.0-or-later import { default as NanoPowGpuComputeShader } from 
"./compute.wgsl" -import { NanoPowGlFragmentShader } from "./gl-fragment.js" +import { NanoPowGlDownsampleShader } from "./gl-downsample.js" +import { NanoPowGlDrawShader } from "./gl-draw.js" import { NanoPowGlVertexShader } from "./gl-vertex.js" export { NanoPowGpuComputeShader, - NanoPowGlFragmentShader, + NanoPowGlDownsampleShader, + NanoPowGlDrawShader, NanoPowGlVertexShader } diff --git a/test.html b/test.html index f4158eb..2855a96 100644 --- a/test.html +++ b/test.html @@ -6,7 +6,7 @@ SPDX-License-Identifier: GPL-3.0-or-later - + @@ -191,7 +194,7 @@ SPDX-License-Identifier: GPL-3.0-or-later - + diff --git a/types.d.ts b/types.d.ts index e521600..b105ee8 100644 --- a/types.d.ts +++ b/types.d.ts @@ -3,12 +3,29 @@ import "@webgpu/types" -export declare const NanoPowGlFragmentShader: string +export declare const NanoPowGlDownsampleShader: string +export declare const NanoPowGlDrawShader: string export declare const NanoPowGlVertexShader: string export declare const NanoPowGpuComputeShader: any declare const NanoPow: typeof NanoPowGl | typeof NanoPowGpu | null +/** +* Used to create WebGL framebuffer objects. +* +* @property {WebGLTexture} texture - Defines storage size +* @property {WebGLFramebuffer} framebuffer - Holds texture data +* @property {{x: number, y: number}} size - 2D lengths of texture +*/ +export type FBO = { + texture: WebGLTexture + framebuffer: WebGLFramebuffer + size: { + x: number + y: number + } +} + /** * Used to configure NanoPow. * -- 2.34.1