From 5a5db31f6e77444bc4fb9e0a3230f442a005a6e0 Mon Sep 17 00:00:00 2001 From: Chris Duncan Date: Tue, 14 Jan 2025 11:25:37 -0800 Subject: [PATCH] Replace more 24-bit scalar rotations with vector rotations. --- src/shaders/compute.wgsl | 144 ++++++++++++++++++++++++++------------- 1 file changed, 96 insertions(+), 48 deletions(-) diff --git a/src/shaders/compute.wgsl b/src/shaders/compute.wgsl index 06e17f7..e8750c4 100644 --- a/src/shaders/compute.wgsl +++ b/src/shaders/compute.wgsl @@ -461,10 +461,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v14 ^ v22; - xor1 = v15 ^ v23; - v14 = (xor0 >> 24u) ^ (xor1 << 8u); - v15 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2223.x = v22; + v_2223.y = v23; + v_1415.x = v14; + v_1415.y = v15; + xor = v_1415 ^ v_2223; + v_1415 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v14 = v_1415.x; + v15 = v_1415.y; // a = a + b v_67.x = v6; @@ -1224,10 +1228,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v14 ^ v22; - xor1 = v15 ^ v23; - v14 = (xor0 >> 24u) ^ (xor1 << 8u); - v15 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2223.x = v22; + v_2223.y = v23; + v_1415.x = v14; + v_1415.y = v15; + xor = v_1415 ^ v_2223; + v_1415 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v14 = v_1415.x; + v15 = v_1415.y; // a = a + b v_67.x = v6; @@ -1981,10 +1989,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v14 ^ v22; - xor1 = v15 ^ v23; - v14 = (xor0 >> 24u) ^ (xor1 << 8u); - v15 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2223.x = v22; + v_2223.y = v23; + v_1415.x = v14; + v_1415.y = v15; + xor = v_1415 ^ v_2223; + v_1415 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v14 = v_1415.x; + v15 = v_1415.y; // a = a + b v_67.x = v6; @@ -2732,10 +2744,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v14 ^ v22; - xor1 = v15 ^ v23; - v14 = (xor0 >> 24u) ^ (xor1 << 8u); - v15 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2223.x = v22; + v_2223.y = v23; + v_1415.x = v14; + v_1415.y = v15; + xor = v_1415 ^ v_2223; + v_1415 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v14 = v_1415.x; + v15 = v_1415.y; // a = a + b v_67.x = v6; @@ -3498,10 +3514,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v14 ^ v22; - xor1 = v15 ^ v23; - v14 = (xor0 >> 24u) ^ (xor1 << 8u); - v15 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2223.x = v22; + v_2223.y = v23; + v_1415.x = v14; + v_1415.y = v15; + xor = v_1415 ^ v_2223; + v_1415 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v14 = v_1415.x; + v15 = v_1415.y; // a = a + b v_67.x = v6; @@ -4270,10 +4290,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v14 ^ v22; - xor1 = v15 ^ v23; - v14 = (xor0 >> 24u) ^ (xor1 << 8u); - v15 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2223.x = v22; + v_2223.y = v23; + v_1415.x = v14; + v_1415.y = v15; + xor = v_1415 ^ v_2223; + v_1415 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v14 = v_1415.x; + v15 = v_1415.y; // a = a + b v_67.x = v6; @@ -5030,10 +5054,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v14 ^ v22; - xor1 = v15 ^ v23; - v14 = (xor0 >> 24u) ^ (xor1 << 8u); - v15 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2223.x = v22; + v_2223.y = v23; + v_1415.x = v14; + v_1415.y = v15; + xor = v_1415 ^ v_2223; + v_1415 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v14 = v_1415.x; + v15 = v_1415.y; // a = a + b v_67.x = v6; @@ -5790,10 +5818,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v14 ^ v22; - xor1 = v15 ^ v23; - v14 = (xor0 >> 24u) ^ (xor1 << 8u); - v15 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2223.x = v22; + v_2223.y = v23; + v_1415.x = v14; + v_1415.y = v15; + xor = v_1415 ^ v_2223; + v_1415 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v14 = v_1415.x; + v15 = v_1415.y; // a = a + b v_67.x = v6; @@ -6550,10 +6582,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v14 ^ v22; - xor1 = v15 ^ v23; - v14 = (xor0 >> 24u) ^ (xor1 << 8u); - v15 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2223.x = v22; + v_2223.y = v23; + v_1415.x = v14; + v_1415.y = v15; + xor = v_1415 ^ v_2223; + v_1415 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v14 = v_1415.x; + v15 = v_1415.y; // a = a + b v_67.x = v6; @@ -7316,10 +7352,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v14 ^ v22; - xor1 = v15 ^ v23; - v14 = (xor0 >> 24u) ^ (xor1 << 8u); - v15 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2223.x = v22; + v_2223.y = v23; + v_1415.x = v14; + v_1415.y = v15; + xor = v_1415 ^ v_2223; + v_1415 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v14 = v_1415.x; + v15 = v_1415.y; // a = a + b v_67.x = v6; @@ -8079,10 +8119,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v14 ^ v22; - xor1 = v15 ^ v23; - v14 = (xor0 >> 24u) ^ (xor1 << 8u); - v15 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2223.x = v22; + v_2223.y = v23; + v_1415.x = v14; + v_1415.y = v15; + xor = v_1415 ^ v_2223; + v_1415 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v14 = v_1415.x; + v15 = v_1415.y; // a = a + b v_67.x = v6; @@ -8842,10 +8886,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v14 ^ v22; - xor1 = v15 ^ v23; - v14 = (xor0 >> 24u) ^ (xor1 << 8u); - v15 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2223.x = v22; + v_2223.y = v23; + v_1415.x = v14; + v_1415.y = v15; + xor = v_1415 ^ v_2223; + v_1415 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v14 = v_1415.x; + v15 = v_1415.y; // a = a + b v_67.x = v6; -- 2.34.1