From e196935634001501d9c156fda17b19e09aae74c1 Mon Sep 17 00:00:00 2001 From: Chris Duncan Date: Tue, 14 Jan 2025 11:22:50 -0800 Subject: [PATCH] Replace more 24-bit scalar rotations with vector rotations. --- src/shaders/compute.wgsl | 144 ++++++++++++++++++++++++++------------- 1 file changed, 96 insertions(+), 48 deletions(-) diff --git a/src/shaders/compute.wgsl b/src/shaders/compute.wgsl index 5a3d119..59d4396 100644 --- a/src/shaders/compute.wgsl +++ b/src/shaders/compute.wgsl @@ -261,10 +261,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v27 = v_2627.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v18; - xor1 = v11 ^ v19; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_1819.x = v18; + v_1819.y = v19; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_1819; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_23.x = v2; @@ -1010,10 +1014,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v27 = v_2627.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v18; - xor1 = v11 ^ v19; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_1819.x = v18; + v_1819.y = v19; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_1819; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_23.x = v2; @@ -1765,10 +1773,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v27 = v_2627.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v18; - xor1 = v11 ^ v19; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_1819.x = v18; + v_1819.y = v19; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_1819; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_23.x = v2; @@ -2505,10 +2517,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v27 = v_2627.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v18; - xor1 = v11 ^ v19; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_1819.x = v18; + v_1819.y = v19; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_1819; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_23.x = v2; @@ -3266,10 +3282,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v27 = v_2627.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v18; - xor1 = v11 ^ v19; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_1819.x = v18; + v_1819.y = v19; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_1819; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_23.x = v2; @@ -4027,10 +4047,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v27 = v_2627.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v18; - xor1 = v11 ^ v19; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_1819.x = v18; + v_1819.y = v19; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_1819; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_23.x = v2; @@ -4779,10 +4803,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v27 = v_2627.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v18; - xor1 = v11 ^ v19; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_1819.x = v18; + v_1819.y = v19; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_1819; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_23.x = v2; @@ -5534,10 +5562,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v27 = v_2627.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v18; - xor1 = v11 ^ v19; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_1819.x = v18; + v_1819.y = v19; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_1819; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_23.x = v2; @@ -6286,10 +6318,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v27 = v_2627.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v18; - xor1 = v11 ^ v19; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_1819.x = v18; + v_1819.y = v19; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_1819; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_23.x = v2; @@ -7044,10 +7080,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v27 = v_2627.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v18; - xor1 = v11 ^ v19; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_1819.x = v18; + v_1819.y = v19; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_1819; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_23.x = v2; @@ -7799,10 +7839,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v27 = v_2627.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v18; - xor1 = v11 ^ v19; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_1819.x = v18; + v_1819.y = v19; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_1819; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_23.x = v2; @@ -8548,10 +8592,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v27 = v_2627.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v18; - xor1 = v11 ^ v19; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_1819.x = v18; + v_1819.y = v19; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_1819; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_23.x = v2; -- 2.34.1