From ab5acd060b320467f9c7f50ed311b3325cefc42f Mon Sep 17 00:00:00 2001 From: Chris Duncan Date: Tue, 14 Jan 2025 11:24:20 -0800 Subject: [PATCH] Replace more 24-bit scalar rotations with vector rotations. --- src/shaders/compute.wgsl | 144 ++++++++++++++++++++++++++------------- 1 file changed, 96 insertions(+), 48 deletions(-) diff --git a/src/shaders/compute.wgsl b/src/shaders/compute.wgsl index 59d4396..06e17f7 100644 --- a/src/shaders/compute.wgsl +++ b/src/shaders/compute.wgsl @@ -358,10 +358,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v29 = v_2829.y; // b = rotr64(b ^ c, 24) - xor0 = v12 ^ v20; - xor1 = v13 ^ v21; - v12 = (xor0 >> 24u) ^ (xor1 << 8u); - v13 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1213.x = v12; + v_1213.y = v13; + xor = v_1213 ^ v_2021; + v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v12 = v_1213.x; + v13 = v_1213.y; // a = a + b v_45.x = v4; @@ -1117,10 +1121,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v29 = v_2829.y; // b = rotr64(b ^ c, 24) - xor0 = v12 ^ v20; - xor1 = v13 ^ v21; - v12 = (xor0 >> 24u) ^ (xor1 << 8u); - v13 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1213.x = v12; + v_1213.y = v13; + xor = v_1213 ^ v_2021; + v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v12 = v_1213.x; + v13 = v_1213.y; // a = a + b v_45.x = v4; @@ -1873,10 +1881,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v29 = v_2829.y; // b = rotr64(b ^ c, 24) - xor0 = v12 ^ v20; - xor1 = v13 ^ v21; - v12 = (xor0 >> 24u) ^ (xor1 << 8u); - v13 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1213.x = v12; + v_1213.y = v13; + xor = v_1213 ^ v_2021; + v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v12 = v_1213.x; + v13 = v_1213.y; // a = a + b v_45.x = v4; @@ -2617,10 +2629,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v29 = v_2829.y; // b = rotr64(b ^ c, 24) - xor0 = v12 ^ v20; - xor1 = v13 ^ v21; - v12 = (xor0 >> 24u) ^ (xor1 << 8u); - v13 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1213.x = v12; + v_1213.y = v13; + xor = v_1213 ^ v_2021; + v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v12 = v_1213.x; + v13 = v_1213.y; // a = a + b v_45.x = v4; @@ -3382,10 +3398,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v29 = v_2829.y; // b = rotr64(b ^ c, 24) - xor0 = v12 ^ v20; - xor1 = v13 ^ v21; - v12 = (xor0 >> 24u) ^ (xor1 << 8u); - v13 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1213.x = v12; + v_1213.y = v13; + xor = v_1213 ^ v_2021; + v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v12 = v_1213.x; + v13 = v_1213.y; // a = a + b v_45.x = v4; @@ -4147,10 +4167,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v29 = v_2829.y; // b = rotr64(b ^ c, 24) - xor0 = v12 ^ v20; - xor1 = v13 ^ v21; - v12 = (xor0 >> 24u) ^ (xor1 << 8u); - v13 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1213.x = v12; + v_1213.y = v13; + xor = v_1213 ^ v_2021; + v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v12 = v_1213.x; + v13 = v_1213.y; // a = a + b v_45.x = v4; @@ -4906,10 +4930,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v29 = v_2829.y; // b = rotr64(b ^ c, 24) - xor0 = v12 ^ v20; - xor1 = v13 ^ v21; - v12 = (xor0 >> 24u) ^ (xor1 << 8u); - v13 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1213.x = v12; + v_1213.y = v13; + xor = v_1213 ^ v_2021; + v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v12 = v_1213.x; + v13 = v_1213.y; // a = a + b v_45.x = v4; @@ -5665,10 +5693,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v29 = v_2829.y; // b = rotr64(b ^ c, 24) - xor0 = v12 ^ v20; - xor1 = v13 ^ v21; - v12 = (xor0 >> 24u) ^ (xor1 << 8u); - v13 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1213.x = v12; + v_1213.y = v13; + xor = v_1213 ^ v_2021; + v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v12 = v_1213.x; + v13 = v_1213.y; // a = a + b v_45.x = v4; @@ -6421,10 +6453,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v29 = v_2829.y; // b = rotr64(b ^ c, 24) - xor0 = v12 ^ v20; - xor1 = v13 ^ v21; - v12 = (xor0 >> 24u) ^ (xor1 << 8u); - v13 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1213.x = v12; + v_1213.y = v13; + xor = v_1213 ^ v_2021; + v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v12 = v_1213.x; + v13 = v_1213.y; // a = a + b v_45.x = v4; @@ -7180,10 +7216,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v29 = v_2829.y; // b = rotr64(b ^ c, 24) - xor0 = v12 ^ v20; - xor1 = v13 ^ v21; - v12 = (xor0 >> 24u) ^ (xor1 << 8u); - v13 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1213.x = v12; + v_1213.y = v13; + xor = v_1213 ^ v_2021; + v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v12 = v_1213.x; + v13 = v_1213.y; // a = a + b v_45.x = v4; @@ -7936,10 +7976,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v29 = v_2829.y; // b = rotr64(b ^ c, 24) - xor0 = v12 ^ v20; - xor1 = v13 ^ v21; - v12 = (xor0 >> 24u) ^ (xor1 << 8u); - v13 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1213.x = v12; + v_1213.y = v13; + xor = v_1213 ^ v_2021; + v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v12 = v_1213.x; + v13 = v_1213.y; // a = a + b v_45.x = v4; @@ -8695,10 +8739,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v29 = v_2829.y; // b = rotr64(b ^ c, 24) - xor0 = v12 ^ v20; - xor1 = v13 ^ v21; - v12 = (xor0 >> 24u) ^ (xor1 << 8u); - v13 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1213.x = v12; + v_1213.y = v13; + xor = v_1213 ^ v_2021; + v_1213 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v12 = v_1213.x; + v13 = v_1213.y; // a = a + b v_45.x = v4; -- 2.34.1