From: Chris Duncan Date: Tue, 14 Jan 2025 19:26:53 +0000 (-0800) Subject: Replace more 24-bit scalar rotations with vector rotations. X-Git-Tag: v2.0.0~112 X-Git-Url: https://zoso.dev/?a=commitdiff_plain;h=025f6683fce24c1a556ecfa8cb51cfa36695b333;p=nano-pow.git Replace more 24-bit scalar rotations with vector rotations. --- diff --git a/src/shaders/compute.wgsl b/src/shaders/compute.wgsl index e8750c4..193f9ba 100644 --- a/src/shaders/compute.wgsl +++ b/src/shaders/compute.wgsl @@ -555,10 +555,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v20; - xor1 = v11 ^ v21; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_2021; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_1011.x = v10; @@ -1328,10 +1332,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v20; - xor1 = v11 ^ v21; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_2021; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_1011.x = v10; @@ -2083,10 +2091,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v20; - xor1 = v11 ^ v21; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_2021; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_1011.x = v10; @@ -2844,10 +2856,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v20; - xor1 = v11 ^ v21; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_2021; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_1011.x = v10; @@ -3608,10 +3624,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v20; - xor1 = v11 ^ v21; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_2021; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_1011.x = v10; @@ -4387,10 +4407,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v20; - xor1 = v11 ^ v21; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_2021; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_1011.x = v10; @@ -5154,10 +5178,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v20; - xor1 = v11 ^ v21; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_2021; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_1011.x = v10; @@ -5912,10 +5940,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v20; - xor1 = v11 ^ v21; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_2021; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_1011.x = v10; @@ -6676,10 +6708,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v20; - xor1 = v11 ^ v21; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_2021; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_1011.x = v10; @@ -7446,10 +7482,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v20; - xor1 = v11 ^ v21; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_2021; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_1011.x = v10; @@ -8213,10 +8253,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v20; - xor1 = v11 ^ v21; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_2021; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_1011.x = v10; @@ -8986,10 +9030,14 @@ fn main(@builtin(global_invocation_id) id: vec3) { v31 = v_3031.y; // b = rotr64(b ^ c, 24) - xor0 = v10 ^ v20; - xor1 = v11 ^ v21; - v10 = (xor0 >> 24u) ^ (xor1 << 8u); - v11 = (xor1 >> 24u) ^ (xor0 << 8u); + v_2021.x = v20; + v_2021.y = v21; + v_1011.x = v10; + v_1011.y = v11; + xor = v_1011 ^ v_2021; + v_1011 = vec2((xor.x >> 24u) | (xor.y << 8u), (xor.y >> 24u) | (xor.x << 8u)); + v10 = v_1011.x; + v11 = v_1011.y; // a = a + b v_1011.x = v10;