From 425380f1a11b3df97f58b717595ebefdd64f9cfe Mon Sep 17 00:00:00 2001
From: Corentin <corentin-pro@mail.com>
Date: Thu, 24 Jun 2021 18:36:10 +0900
Subject: [PATCH] Automated work group parameters and tiling matmul

---
 .../python_naive_matmul/1_naive_matmul.py     |  75 ++++++++---
 .../python_naive_matmul/2_tiled_matmul.py     | 127 ++++++++++++++++++
 examples/python_naive_matmul/README.md        |   2 +-
 3 files changed, 183 insertions(+), 21 deletions(-)
 create mode 100644 examples/python_naive_matmul/2_tiled_matmul.py

diff --git a/examples/python_naive_matmul/1_naive_matmul.py b/examples/python_naive_matmul/1_naive_matmul.py
index e7fe092a8..d5fe70cba 100644
--- a/examples/python_naive_matmul/1_naive_matmul.py
+++ b/examples/python_naive_matmul/1_naive_matmul.py
@@ -5,35 +5,64 @@ import numpy as np
 
 
 class MatMulOp:
-    def __init__(self, manager: kp.Manager, local_size_x: int = 1, local_size_y: int = 1):
+    def __init__(self, manager: kp.Manager, local_size_x: int = -1, local_size_y: int = -1):
         self.mgr = manager
-        assert(local_size_x > 0)
-        assert(local_size_y > 0)
+
+        props = self.mgr.get_device_properties()
+        max_workgroup_invocation = props['max_work_group_invocations']
+        max_workgroup_size = props['max_work_group_size']
+        if local_size_x < 1:
+            if local_size_y > 0:
+                local_size_x = 1
+                while (2 * local_size_x * local_size_y <= max_workgroup_invocation
+                       and 2 * local_size_x <= max_workgroup_size[0]):
+                    local_size_x *= 2
+            else:
+                local_size_x = 1
+                local_size_y = 1
+                while 2 * local_size_x * local_size_y <= max_workgroup_invocation:
+                    if 2 * local_size_x <= max_workgroup_size[0]:
+                        local_size_x *= 2
+                    if 2 * local_size_y <= max_workgroup_size[1]:
+                        local_size_y *= 2
+                    elif 2 * local_size_x > max_workgroup_size[0]:  # stop if neither x nor y can be double
+                        break
+        elif local_size_y < 0:
+            local_size_y = 1
+            while (2 * local_size_x * local_size_y <= max_workgroup_invocation
+                   and 2 * local_size_x <= max_workgroup_size[0]):
+                local_size_y *= 2
+
+        assert local_size_x > 0
+        assert local_size_y > 0
+        assert local_size_x * local_size_y <= max_workgroup_invocation
+        assert local_size_x <= max_workgroup_size[0]
+        assert local_size_y <= max_workgroup_size[1]
         self.local_size_x = local_size_x
         self.local_size_y = local_size_y
 
         self.shader = kp.Shader.compile_source(f'''
-    #version 450
+#version 450
 
-    layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in;
+layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in;
 
-    layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
-    layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
-    layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
+layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
+layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
+layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
 
-    layout (constant_id = 0) const float tensor_size_f = 0;
+layout (constant_id = 0) const float tensor_size_f = 0;
 
 
-    void main()
-    {{
-        uint globalRow = gl_GlobalInvocationID.x;
-        uint globalCol = gl_GlobalInvocationID.y;
-        uint tensor_size = uint(tensor_size_f);
-        float acc = 0.0;
-        for(uint k = 0u; k < tensor_size; k++)
-            acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k];
-        out_tensor[(globalCol * tensor_size) + globalRow] = acc;
-    }}''')
+void main()
+{{
+    uint globalRow = gl_GlobalInvocationID.x;
+    uint globalCol = gl_GlobalInvocationID.y;
+    uint tensor_size = uint(tensor_size_f);
+    float acc = 0.0;
+    for(uint k = 0u; k < tensor_size; k++)
+        acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k];
+    out_tensor[(globalCol * tensor_size) + globalRow] = acc;
+}}''')
         self.tensor_shape: tuple[int, int] = (0, 0)
         self.params: list[kp.Tensor] = []
         self.algo = None
@@ -62,7 +91,7 @@ class MatMulOp:
 def main():
     mgr = kp.Manager()
 
-    matmul_op = MatMulOp(mgr, 64, 64)
+    matmul_op = MatMulOp(mgr)
 
     tensor_size = 512
     tensor_shape = [tensor_size, tensor_size]
@@ -92,5 +121,11 @@ def main():
           f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
 
 
+def test():
+    main()
+
+
 if __name__ == '__main__':
     main()
+else:
+    test()
diff --git a/examples/python_naive_matmul/2_tiled_matmul.py b/examples/python_naive_matmul/2_tiled_matmul.py
new file mode 100644
index 000000000..be403a46a
--- /dev/null
+++ b/examples/python_naive_matmul/2_tiled_matmul.py
@@ -0,0 +1,127 @@
+import time
+
+import kp
+import numpy as np
+
+
+class MatMulOp:
+    def __init__(self, manager: kp.Manager, tile_size: int = -1):
+        self.mgr = manager
+
+        props = self.mgr.get_device_properties()
+        max_workgroup_invocation = props['max_work_group_invocations']
+        max_workgroup_size = props['max_work_group_size']
+        if tile_size < 0:
+            tile_size = 1
+            while (2 * tile_size * tile_size <= max_workgroup_invocation
+                   and 2 * tile_size <= max_workgroup_size[0]
+                   and 2 * tile_size <= max_workgroup_size[1]):
+                tile_size *= 2
+
+        assert tile_size > 0
+        assert tile_size * tile_size <= max_workgroup_invocation
+        assert tile_size <= max_workgroup_size[0]
+        assert tile_size <= max_workgroup_size[1]
+        self.tile_size = tile_size
+
+        self.shader = kp.Shader.compile_source(f'''
+#version 450
+
+layout (local_size_x = {tile_size}, local_size_y = {tile_size}) in;
+
+layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
+layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
+layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
+
+layout (constant_id = 0) const float tensor_size_f = 0;
+
+shared float sub_tensor_1[{tile_size}][{tile_size}];
+shared float sub_tensor_2[{tile_size}][{tile_size}];
+
+void main()
+{{
+    uint row = gl_GlobalInvocationID.x;
+    uint col = gl_GlobalInvocationID.y;
+    uint globalRow = {tile_size} * gl_WorkGroupID.x + row;
+    uint globalCol = {tile_size} * gl_WorkGroupID.y + row;
+
+    uint tensor_size = uint(tensor_size_f);
+    float acc = 0.0;
+    uint numTiles = tensor_size / {tile_size};
+    for(uint t = 0u; t < numTiles; t++)
+    {{
+        uint tiledRow = {tile_size} * t + row;
+        uint tiledCol = {tile_size} * t + col;
+        sub_tensor_1[col][row] = in_tensor_1[tiledCol * tensor_size + globalRow];
+        sub_tensor_2[col][row] = in_tensor_2[globalCol * tensor_size + tiledRow];
+
+        memoryBarrierShared();
+        barrier();
+
+        for(uint k = 0u; k < {tile_size}; k++)
+            acc += sub_tensor_1[k][row] * sub_tensor_2[col][k];
+
+        barrier();
+    }}
+    out_tensor[(globalCol * tensor_size) + globalRow] = acc;
+}}''')
+        self.tensor_shape: tuple[int, int] = (0, 0)
+        self.params: list[kp.Tensor] = []
+        self.algo = None
+
+    def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor,
+                 tensor_out: kp.Tensor):
+        params = [tensor_in_1, tensor_in_2, tensor_out]
+
+        if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
+            self.tensor_shape = tensor_shape
+            self.params = params
+            self.algo = self.mgr.algorithm(
+                params,  # params
+                self.shader,  # spirv
+                (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1),  # workgroup
+                [float(tensor_shape[0])],  # spec_consts
+                [])  # push_consts
+
+        (self.mgr.sequence()
+         .record(kp.OpTensorSyncDevice(self.params))
+         .record(kp.OpAlgoDispatch(self.algo))
+         .record(kp.OpTensorSyncLocal(self.params))
+         .eval())
+
+
+def main():
+    mgr = kp.Manager()
+
+    matmul_op = MatMulOp(mgr)
+
+    tensor_size = 512
+    tensor_shape = [tensor_size, tensor_size]
+    tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+    tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+    tensor_out = mgr.tensor(np.zeros(tensor_shape))
+
+    print(f'{tensor_shape} input tensors:\n'
+          f'{tensor_in_1.data().reshape(tensor_shape)}\n'
+          f'{tensor_in_2.data().reshape(tensor_shape)}\n')
+
+    matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+
+    experiment_count = 1000
+    start_time = time.time()
+    for _ in range(experiment_count):
+        matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+    end_time = time.time()
+    experiment_time = end_time - start_time
+    op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
+
+    print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
+
+    print(f'{experiment_count} matmul time : '
+          f'{experiment_time * 1000:0.2f}ms => '
+          f'{experiment_count / experiment_time:0.2f}op/s or '
+          f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/python_naive_matmul/README.md b/examples/python_naive_matmul/README.md
index 26a89172b..0688bb079 100644
--- a/examples/python_naive_matmul/README.md
+++ b/examples/python_naive_matmul/README.md
@@ -1,6 +1,6 @@
 # Naive Matmul Implementation
 
-This demonstrate a basic matmul implementation using Python and vulkan-kompute
+This demonstrate a basic matmul implementation using Python and vulkan-kompute. Many thanks for the very helpful [SGEMM in WebGL2-compute](https://www.ibiblio.org/e-notes/webgl/gpu/mul/sgemm.htm) article on the public library [ibiblio.org](https://www.ibiblio.org/).
 
 To test the implementation simply run the `matmul.py` script :