From cd4cdbcae05e744c8eaabb25a7b02f40c36e0127 Mon Sep 17 00:00:00 2001
From: Chris Duncan
Date: Tue, 14 Jan 2025 11:28:01 -0800
Subject: [PATCH] Replace more 24-bit scalar rotations with vector rotations.

---
 src/shaders/compute.wgsl | 144 ++++++++++++++++++++++++++-------------
 1 file changed, 96 insertions(+), 48 deletions(-)

diff --git a/src/shaders/compute.wgsl b/src/shaders/compute.wgsl
index 193f9ba..2981957 100644
--- a/src/shaders/compute.wgsl
+++ b/src/shaders/compute.wgsl
@@ -649,10 +649,14 @@ fn main(@builtin(global_invocation_id) id: vec3<u32>) {
   v25 = v_2425.y;
 
   // b = rotr64(b ^ c, 24)
-  xor0 = v12 ^ v22;
-  xor1 = v13 ^ v23;
-  v12 = (xor0 >> 24u) ^ (xor1 << 8u);
-  v13 = (xor1 >> 24u) ^ (xor0 << 8u);
+  v_2223.x = v22;
+  v_2223.y = v23;
+  v_1213.x = v12;
+  v_1213.y = v13;
+  xor = v_1213 ^ v_2223;
+  v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u));
+  v12 = v_1213.x;
+  v13 = v_1213.y;
 
   // a = a + b
   v_23.x = v2;
@@ -1423,10 +1427,14 @@ fn main(@builtin(global_invocation_id) id: vec3<u32>) {
   v25 = v_2425.y;
 
   // b = rotr64(b ^ c, 24)
-  xor0 = v12 ^ v22;
-  xor1 = v13 ^ v23;
-  v12 = (xor0 >> 24u) ^ (xor1 << 8u);
-  v13 = (xor1 >> 24u) ^ (xor0 << 8u);
+  v_2223.x = v22;
+  v_2223.y = v23;
+  v_1213.x = v12;
+  v_1213.y = v13;
+  xor = v_1213 ^ v_2223;
+  v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u));
+  v12 = v_1213.x;
+  v13 = v_1213.y;
 
   // a = a + b
   v_23.x = v2;
@@ -2182,10 +2190,14 @@ fn main(@builtin(global_invocation_id) id: vec3<u32>) {
   v25 = v_2425.y;
 
   // b = rotr64(b ^ c, 24)
-  xor0 = v12 ^ v22;
-  xor1 = v13 ^ v23;
-  v12 = (xor0 >> 24u) ^ (xor1 << 8u);
-  v13 = (xor1 >> 24u) ^ (xor0 << 8u);
+  v_2223.x = v22;
+  v_2223.y = v23;
+  v_1213.x = v12;
+  v_1213.y = v13;
+  xor = v_1213 ^ v_2223;
+  v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u));
+  v12 = v_1213.x;
+  v13 = v_1213.y;
 
   // a = a + b
   v_23.x = v2;
@@ -2950,10 +2962,14 @@ fn main(@builtin(global_invocation_id) id: vec3<u32>) {
   v25 = v_2425.y;
 
   // b = rotr64(b ^ c, 24)
-  xor0 = v12 ^ v22;
-  xor1 = v13 ^ v23;
-  v12 = (xor0 >> 24u) ^ (xor1 << 8u);
-  v13 = (xor1 >> 24u) ^ (xor0 << 8u);
+  v_2223.x = v22;
+  v_2223.y = v23;
+  v_1213.x = v12;
+  v_1213.y = v13;
+  xor = v_1213 ^ v_2223;
+  v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u));
+  v12 = v_1213.x;
+  v13 = v_1213.y;
 
   // a = a + b
   v_23.x = v2;
@@ -3724,10 +3740,14 @@ fn main(@builtin(global_invocation_id) id: vec3<u32>) {
   v25 = v_2425.y;
 
   // b = rotr64(b ^ c, 24)
-  xor0 = v12 ^ v22;
-  xor1 = v13 ^ v23;
-  v12 = (xor0 >> 24u) ^ (xor1 << 8u);
-  v13 = (xor1 >> 24u) ^ (xor0 << 8u);
+  v_2223.x = v22;
+  v_2223.y = v23;
+  v_1213.x = v12;
+  v_1213.y = v13;
+  xor = v_1213 ^ v_2223;
+  v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u));
+  v12 = v_1213.x;
+  v13 = v_1213.y;
 
   // a = a + b
   v_23.x = v2;
@@ -4501,10 +4521,14 @@ fn main(@builtin(global_invocation_id) id: vec3<u32>) {
   v25 = v_2425.y;
 
   // b = rotr64(b ^ c, 24)
-  xor0 = v12 ^ v22;
-  xor1 = v13 ^ v23;
-  v12 = (xor0 >> 24u) ^ (xor1 << 8u);
-  v13 = (xor1 >> 24u) ^ (xor0 << 8u);
+  v_2223.x = v22;
+  v_2223.y = v23;
+  v_1213.x = v12;
+  v_1213.y = v13;
+  xor = v_1213 ^ v_2223;
+  v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u));
+  v12 = v_1213.x;
+  v13 = v_1213.y;
 
   // a = a + b
   v_23.x = v2;
@@ -5272,10 +5296,14 @@ fn main(@builtin(global_invocation_id) id: vec3<u32>) {
   v25 = v_2425.y;
 
   // b = rotr64(b ^ c, 24)
-  xor0 = v12 ^ v22;
-  xor1 = v13 ^ v23;
-  v12 = (xor0 >> 24u) ^ (xor1 << 8u);
-  v13 = (xor1 >> 24u) ^ (xor0 << 8u);
+  v_2223.x = v22;
+  v_2223.y = v23;
+  v_1213.x = v12;
+  v_1213.y = v13;
+  xor = v_1213 ^ v_2223;
+  v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u));
+  v12 = v_1213.x;
+  v13 = v_1213.y;
 
   // a = a + b
   v_23.x = v2;
@@ -6040,10 +6068,14 @@ fn main(@builtin(global_invocation_id) id: vec3<u32>) {
   v25 = v_2425.y;
 
   // b = rotr64(b ^ c, 24)
-  xor0 = v12 ^ v22;
-  xor1 = v13 ^ v23;
-  v12 = (xor0 >> 24u) ^ (xor1 << 8u);
-  v13 = (xor1 >> 24u) ^ (xor0 << 8u);
+  v_2223.x = v22;
+  v_2223.y = v23;
+  v_1213.x = v12;
+  v_1213.y = v13;
+  xor = v_1213 ^ v_2223;
+  v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u));
+  v12 = v_1213.x;
+  v13 = v_1213.y;
 
   // a = a + b
   v_23.x = v2;
@@ -6808,10 +6840,14 @@ fn main(@builtin(global_invocation_id) id: vec3<u32>) {
   v25 = v_2425.y;
 
   // b = rotr64(b ^ c, 24)
-  xor0 = v12 ^ v22;
-  xor1 = v13 ^ v23;
-  v12 = (xor0 >> 24u) ^ (xor1 << 8u);
-  v13 = (xor1 >> 24u) ^ (xor0 << 8u);
+  v_2223.x = v22;
+  v_2223.y = v23;
+  v_1213.x = v12;
+  v_1213.y = v13;
+  xor = v_1213 ^ v_2223;
+  v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u));
+  v12 = v_1213.x;
+  v13 = v_1213.y;
 
   // a = a + b
   v_23.x = v2;
@@ -7576,10 +7612,14 @@ fn main(@builtin(global_invocation_id) id: vec3<u32>) {
   v25 = v_2425.y;
 
   // b = rotr64(b ^ c, 24)
-  xor0 = v12 ^ v22;
-  xor1 = v13 ^ v23;
-  v12 = (xor0 >> 24u) ^ (xor1 << 8u);
-  v13 = (xor1 >> 24u) ^ (xor0 << 8u);
+  v_2223.x = v22;
+  v_2223.y = v23;
+  v_1213.x = v12;
+  v_1213.y = v13;
+  xor = v_1213 ^ v_2223;
+  v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u));
+  v12 = v_1213.x;
+  v13 = v_1213.y;
 
   // a = a + b
   v_23.x = v2;
@@ -8347,10 +8387,14 @@ fn main(@builtin(global_invocation_id) id: vec3<u32>) {
   v25 = v_2425.y;
 
   // b = rotr64(b ^ c, 24)
-  xor0 = v12 ^ v22;
-  xor1 = v13 ^ v23;
-  v12 = (xor0 >> 24u) ^ (xor1 << 8u);
-  v13 = (xor1 >> 24u) ^ (xor0 << 8u);
+  v_2223.x = v22;
+  v_2223.y = v23;
+  v_1213.x = v12;
+  v_1213.y = v13;
+  xor = v_1213 ^ v_2223;
+  v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u));
+  v12 = v_1213.x;
+  v13 = v_1213.y;
 
   // a = a + b
   v_23.x = v2;
@@ -9121,10 +9165,14 @@ fn main(@builtin(global_invocation_id) id: vec3<u32>) {
   v25 = v_2425.y;
 
   // b = rotr64(b ^ c, 24)
-  xor0 = v12 ^ v22;
-  xor1 = v13 ^ v23;
-  v12 = (xor0 >> 24u) ^ (xor1 << 8u);
-  v13 = (xor1 >> 24u) ^ (xor0 << 8u);
+  v_2223.x = v22;
+  v_2223.y = v23;
+  v_1213.x = v12;
+  v_1213.y = v13;
+  xor = v_1213 ^ v_2223;
+  v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u));
+  v12 = v_1213.x;
+  v13 = v_1213.y;
 
   // a = a + b
   v_23.x = v2;
-- 
2.34.1
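
For reference, the rotation these hunks vectorize can be written as a small standalone WGSL sketch. Each 64-bit BLAKE2b word is held as two u32 limbs (low limb in .x, high limb in .y), and rotr64(b ^ c, 24) moves the low 24 bits of each limb into the top 8 bits of the other limb. This is a minimal sketch, not part of the patch: the helper name g_rotr24 and the standalone-function framing are illustrative assumptions, since the shader itself inlines this logic as shown in the hunks above.

// Sketch only, not part of the patch. One step of the BLAKE2b G function,
// b = rotr64(b ^ c, 24), with each 64-bit word stored as vec2<u32>
// (x = low limb, y = high limb).
fn g_rotr24(b: vec2<u32>, c: vec2<u32>) -> vec2<u32> {
  let x = b ^ c;  // componentwise XOR of both limbs at once
  // A 24-bit right rotation moves the low 24 bits of each limb into the
  // other limb's top 8 bits; | and ^ are interchangeable here because the
  // shifted bit ranges never overlap.
  return vec2((x.x >> 24u) | (x.y << 8u), (x.y >> 24u) | (x.x << 8u));
}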