From 2da857887ea69c63f7b973702bcf3f25d660586d Mon Sep 17 00:00:00 2001
From: Chris Duncan
Date: Tue, 4 Feb 2025 05:57:56 -0800
Subject: [PATCH] Reorder G round computation to try allowing compiler to
 improve instruction scheduling and reduce assignment statements. Compiles
 and validates.

---
 src/shaders/compute.wgsl | 100 +++++++++++++++++++++------------------
 1 file changed, 55 insertions(+), 45 deletions(-)

diff --git a/src/shaders/compute.wgsl b/src/shaders/compute.wgsl
index 1e663af..0f46c74 100644
--- a/src/shaders/compute.wgsl
+++ b/src/shaders/compute.wgsl
@@ -150,6 +150,10 @@ fn main(id: vec3<u32>) {
 	var d: vec4<u32>;
 	var x: vec4<u32>;
 	var y: vec4<u32>;
+	var v56: vec4<u32>;
+	var vFC: vec4<u32>;
+	var v74: vec4<u32>;
+	var vDE: vec4<u32>;
 	/****************************************************************************
 	 * ROUND(0)                                                                  *
 	 ****************************************************************************/
 	/**
 	 * r=0, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=0, m[sigma+1]=1
 	 * r=0, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=2, m[sigma+1]=3
-	 */
-	v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
-	v01 += vec4(m0, m2) + vec4(0u, u32(v01.x + m0.x < v01.x), 0u, u32(v01.z + m2.x < v01.z));
-	vCD = (vCD ^ v01).yxwz;
-	v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
-	v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8);
-	v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
-	v01 += vec4(m1, m3) + vec4(0u, u32(v01.x + m1.x < v01.x), 0u, u32(v01.z + m3.x < v01.z));
-	vCD = ((vCD ^ v01) >> ROTATE_16) | ((vCD ^ v01).yxwz << ROTATE_16);
-	v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
-	v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31);
-
-	/**
 	 * r=0, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=4, m[sigma+1]=5
 	 * r=0, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=6, m[sigma+1]=7
 	 */
+	v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
 	v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z));
+
+	v01 += vec4(m0, m2) + vec4(0u, u32(v01.x + m0.x < v01.x), 0u, u32(v01.z + m2.x < v01.z));
 	v23 += vec4(m4, Z) + vec4(0u, u32(v23.x + m4.x < v23.x), Z);
+
+	vCD = (vCD ^ v01).yxwz;
 	vEF = (vEF ^ v23).yxwz;
+
+	v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
 	vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z));
+
+	v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8);
 	v67 = ((v67 ^ vAB) >> ROTATE_24) | ((v67 ^ vAB).yxwz << ROTATE_8);
+
+	v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
 	v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z));
+
+	v01 += vec4(m1, m3) + vec4(0u, u32(v01.x + m1.x < v01.x), 0u, u32(v01.z + m3.x < v01.z));
 	// NOP
+
+	vCD = ((vCD ^ v01) >> ROTATE_16) | ((vCD ^ v01).yxwz << ROTATE_16);
 	vEF = ((vEF ^ v23) >> ROTATE_16) | ((vEF ^ v23).yxwz << ROTATE_16);
+
+	v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
 	vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z));
+
+	v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31);
 	v67 = ((v67 ^ vAB) << ROTATE_1) | ((v67 ^ vAB).yxwz >> ROTATE_31);

 	/**
 	 * r=0, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=8, m[sigma+1]=9
 	 * r=0, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=10, m[sigma+1]=11
+	 * r=0, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=12, m[sigma+1]=13
+	 * r=0, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=14, m[sigma+1]=15
 	 */
-	b = vec4(v45.zw, v67.xy);
-	d = vec4(vEF.zw, vCD.xy);
+	v56 = vec4(v45.zw, v67.xy);
+	vFC = vec4(vEF.zw, vCD.xy);
+	v74 = vec4(v67.zw, v45.xy);
+	vDE = vec4(vCD.zw, vEF.xy);
+
+	v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z));
+	v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z));

-	v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); // NOP
-	d = (d ^ v01).yxwz;
-	vAB += d + vec4(0u, u32(vAB.x + d.x < vAB.x), 0u, u32(vAB.z + d.z < vAB.z));
-	b = ((b ^ vAB) >> ROTATE_24) | ((b ^ vAB).yxwz << ROTATE_8);
-	v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); // NOP
-	d = ((d ^ v01) >> ROTATE_16) | ((d ^ v01).yxwz << ROTATE_16);
-	vAB += d + vec4(0u, u32(vAB.x + d.x < vAB.x), 0u, u32(vAB.z + d.z < vAB.z));
-	b = ((b ^ vAB) << ROTATE_1) | ((b ^ vAB).yxwz >> ROTATE_31);
-	v45 = vec4(v45.xy, b.xy);
-	v67 = vec4(b.zw, v67.zw);
-	vEF = vec4(vEF.xy, d.xy);
-	vCD = vec4(d.zw, vCD.zw);
+	vFC = (vFC ^ v01).yxwz;
+	vDE = (vDE ^ v23).yxwz;

-	/**
-	 * r=0, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=12, m[sigma+1]=13
-	 * r=0, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=14, m[sigma+1]=15
-	 */
-	b = vec4(v67.zw, v45.xy);
-	d = vec4(vCD.zw, vEF.xy);
+	vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z));
+	v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z));
+
+	v56 = ((v56 ^ vAB) >> ROTATE_24) | ((v56 ^ vAB).yxwz << ROTATE_8);
+	v74 = ((v74 ^ v89) >> ROTATE_24) | ((v74 ^ v89).yxwz << ROTATE_8);
+
+	v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z));
+	v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z));

-	v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); // NOP
-	d = (d ^ v23).yxwz;
-	v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z));
-	b = ((b ^ v89) >> ROTATE_24) | ((b ^ v89).yxwz << ROTATE_8);
-	v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); // NOP
-	d = ((d ^ v23) >> ROTATE_16) | ((d ^ v23).yxwz << ROTATE_16);
-	v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z));
-	b = ((b ^ v89) << ROTATE_1) | ((b ^ v89).yxwz >> ROTATE_31);
-	v67 = vec4(v67.xy, b.xy);
-	v45 = vec4(b.zw, v45.zw);
-	vCD = vec4(vCD.xy, d.xy);
-	vEF = vec4(d.zw, vEF.zw);
+	vFC = ((vFC ^ v01) >> ROTATE_16) | ((vFC ^ v01).yxwz << ROTATE_16);
+	vDE = ((vDE ^ v23) >> ROTATE_16) | ((vDE ^ v23).yxwz << ROTATE_16);
+
+	vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z));
+	v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z));
+
+	v56 = ((v56 ^ vAB) << ROTATE_1) | ((v56 ^ vAB).yxwz >> ROTATE_31);
+	v74 = ((v74 ^ v89) << ROTATE_1) | ((v74 ^ v89).yxwz >> ROTATE_31);
+
+	v45 = vec4(v74.zw, v56.xy);
+	v67 = vec4(v56.zw, v74.xy);
+	vCD = vec4(vFC.zw, vDE.xy);
+	vEF = vec4(vDE.zw, vFC.xy);
--
2.34.1
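A note on the arithmetic being reordered: the shader implements the BLAKE2b G mixing rounds with each vec4<u32> holding two 64-bit words packed as (lo, hi, lo, hi), so x/z carry the low halves and y/w the high halves. The sketch below is illustrative only; the helper names u64x2_add and u64x2_ror24 do not exist in compute.wgsl, and it assumes ROTATE_24 and ROTATE_8 are vec4<u32> splats of 24u and 8u as the surrounding statements imply. It shows the two idioms that every G-round line in this diff inlines:

// Two 64-bit additions at once. If a low-word add wraps around,
// (a.x + b.x) < a.x is true and u32() turns that into a 1 that is
// carried into the matching high word (y or w).
fn u64x2_add(a: vec4<u32>, b: vec4<u32>) -> vec4<u32> {
	return a + b + vec4<u32>(0u, u32(a.x + b.x < a.x), 0u, u32(a.z + b.z < a.z));
}

// Two 64-bit rotate-rights by 24. Each 32-bit half keeps its own bits
// shifted down and takes the spill-over bits from its partner half;
// .yxwz swaps the low and high halves within each packed word.
fn u64x2_ror24(x: vec4<u32>) -> vec4<u32> {
	return (x >> vec4<u32>(24u)) | (x.yxwz << vec4<u32>(8u));
}

Because the reordered statements are these operations inlined on independent column pairs of the BLAKE2b state, interleaving them can only change instruction scheduling, not the computed hash, which is consistent with the "Compiles and validates" note in the subject.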