zoso.dev Git - nano-pow.git/commitdiff
Log more benchmarks, and note that dispatch 0x800 and workgroup 8,8 was the best...
author Chris Duncan <chris@zoso.dev>
Thu, 16 Jan 2025 18:59:32 +0000 (10:59 -0800)
committer Chris Duncan <chris@zoso.dev>
Thu, 16 Jan 2025 18:59:32 +0000 (10:59 -0800)
benchmarks.md
src/shaders/compute.wgsl

index e9fa62e0adb14b3353aabacbd9148c591f88c066..8dd42d0911fe6a6566a91a3e4bc1b78e5c30bc3b 100644 (file)
@@ -159,6 +159,187 @@ NanoPow (WebGPU) 3070 (dispatch 0x400 workgroup 32)
        "geometric": 178.59991348235997
 }
 
+NanoPow (WebGPU) 3070 (dispatch 0x2000 workgroup 16,16)
+{
+       "count": 512,
+       "total": 171064.90000003576,
+       "rate": 3.6180268186231834,
+       "min": 81.3999999910593,
+       "max": 1453.3999999910593,
+       "arithmetic": 334.11113281256985,
+       "truncated": 276.39375000004657,
+       "harmonic": 217.83822531129073,
+       "geometric": 267.8978578209064
+}
+
+NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 16,16)
+{
+       "count": 512,
+       "total": 167172.10000005364,
+       "rate": 3.8445942901746104,
+       "min": 78.29999999701977,
+       "max": 3207.6000000089407,
+       "arithmetic": 326.5080078126048,
+       "truncated": 260.1054687501455,
+       "harmonic": 210.8258370543174,
+       "geometric": 257.37154756448814
+}
+
+NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 16,16)
+{
+       "count": 512,
+       "total": 161936.60000008345,
+       "rate": 3.740251239688252,
+       "min": 80.6000000089407,
+       "max": 1553.8999999910593,
+       "arithmetic": 316.282421875163,
+       "truncated": 267.3617187500349,
+       "harmonic": 204.59200263160372,
+       "geometric": 250.96434072996382
+}
+
+NanoPow (WebGPU) 3070 (dispatch 0x400 workgroup 16,16)
+{
+       "count": 512,
+       "total": 158915.09999985993,
+       "rate": 3.7711889999175563,
+       "min": 81,
+       "max": 2144.199999988079,
+       "arithmetic": 310.3810546872264,
+       "truncated": 265.1683593746857,
+       "harmonic": 201.944298796899,
+       "geometric": 246.41513206426896
+}
+
+NanoPow (WebGPU) 3070 (dispatch 0x2000 workgroup 8,8)
+{
+       "count": 512,
+       "total": 139888.0000000596,
+       "rate": 4.623788851791469,
+       "min": 24.100000008940697,
+       "max": 1754.5,
+       "arithmetic": 273.2187500001164,
+       "truncated": 216.27285156253492,
+       "harmonic": 124.13430182526332,
+       "geometric": 186.66857347407046
+}
+
+NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 8,8)
+{
+       "count": 512,
+       "total": 141471.0000000447,
+       "rate": 4.8109313380457674,
+       "min": 25.799999997019768,
+       "max": 1512.4000000059605,
+       "arithmetic": 276.3105468750873,
+       "truncated": 207.8599609376688,
+       "harmonic": 127.00893713470423,
+       "geometric": 192.06862657670237
+}
+
+NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 8,8)
+{
+       "count": 512,
+       "total": 133226.09999994934,
+       "rate": 4.905215336077563,
+       "min": 24.599999994039536,
+       "max": 1432.3999999910593,
+       "arithmetic": 260.20722656240105,
+       "truncated": 203.86464843756403,
+       "harmonic": 116.05589569169062,
+       "geometric": 178.05592700404114
+}
+
+NanoPow (WebGPU) 3070 (dispatch 0x400 workgroup 8,8)
+{
+       "count": 512,
+       "total": 146197.10000024736,
+       "rate": 4.400326586729279,
+       "min": 25.399999991059303,
+       "max": 1748.5,
+       "arithmetic": 285.5412109379831,
+       "truncated": 227.25585937549477,
+       "harmonic": 130.29367565620703,
+       "geometric": 196.68629173860154
+}
+
+NanoPow (WebGPU) 3070 (dispatch 0x2000 workgroup 8,4)
+{
+       "count": 512,
+       "total": 146798.40000000596,
+       "rate": 4.501937943601046,
+       "min": 18.099999994039536,
+       "max": 1979.1000000089407,
+       "arithmetic": 286.71562500001164,
+       "truncated": 222.12656249990687,
+       "harmonic": 103.21952194085847,
+       "geometric": 176.78141792063872
+}
+
+NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 8,4)
+{
+       "count": 512,
+       "total": 138210.7000002265,
+       "rate": 4.76931689781462,
+       "min": 16.799999997019768,
+       "max": 1626.0999999940395,
+       "arithmetic": 269.9427734379424,
+       "truncated": 209.67363281274447,
+       "harmonic": 101.34635015953711,
+       "geometric": 172.06200959967907
+}
+
+NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 8,4)
+{
+       "count": 512,
+       "total": 149949.19999992847,
+       "rate": 4.415311818463056,
+       "min": 16.799999997019768,
+       "max": 1790.7999999970198,
+       "arithmetic": 292.8695312498603,
+       "truncated": 226.48457031243015,
+       "harmonic": 112.43872189933657,
+       "geometric": 189.39141120585325
+}
+
+NanoPow (WebGPU) 3070 (dispatch 0x400 workgroup 8,4)
+{
+       "count": 512,
+       "total": 147114.90000009537,
+       "rate": 4.581196336470157,
+       "min": 17.600000008940697,
+       "max": 3584.5999999940395,
+       "arithmetic": 287.33378906268626,
+       "truncated": 218.28359375020955,
+       "harmonic": 118.69066922246898,
+       "geometric": 188.03357141542313
+}
+
+NanoPow (WebGPU) 3070 (dispatch 0x2000 workgroup 4,4)
+{
+       "count": 512,
+       "total": 275257.4999998361,
+       "rate": 2.5033687912081977,
+       "min": 16.700000002980232,
+       "max": 4439.20000000298,
+       "arithmetic": 537.6123046871799,
+       "truncated": 399.461718749546,
+       "harmonic": 179.34949948078622,
+       "geometric": 340.32029726440055
+}
+
+NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 4,4)
+{
+       "count": 512,
+       "total": 265632.5000000298,
+       "rate": 2.4961071453998693,
+       "min": 19,
+       "max": 4817.0999999940395,
+       "arithmetic": 518.8134765625582,
+       "truncated": 400.6238281248952,
+       "harmonic": 152.2455042043822,
+       "geometric": 300.11657404770995
+}
 
 -----
 
index 2895232c95e0bcf0241b7b341dbeff9e6e5baf3d..79486d0be45a005328073201bc46fafb5f64f860 100644 (file)
@@ -33,7 +33,7 @@ const ROTATE_31 = vec2(31u, 31u);
 * Search compute function
 * Calls main with a workgroup size of 8x8 (64 invocations), which has been tested as optimal
 */
-@compute @workgroup_size(64)
+@compute @workgroup_size(8,8)
 fn search(@builtin(global_invocation_id) global_id: vec3<u32>) {
        main(global_id);
 }