From 0b7d0577277963d4d66cf3349afe41f737aaf4f9 Mon Sep 17 00:00:00 2001
From: Corentin <corentin-pro@mail.com>
Date: Tue, 22 Jun 2021 16:13:20 +0900
Subject: [PATCH 1/8] Naive matrice multiplication example

---
 examples/python_naive_matmul/README.md |  9 ++++
 examples/python_naive_matmul/matmul.py | 60 ++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 examples/python_naive_matmul/README.md
 create mode 100644 examples/python_naive_matmul/matmul.py

diff --git a/examples/python_naive_matmul/README.md b/examples/python_naive_matmul/README.md
new file mode 100644
index 000000000..26a89172b
--- /dev/null
+++ b/examples/python_naive_matmul/README.md
@@ -0,0 +1,9 @@
+# Naive Matmul Implementation
+
+This demonstrate a basic matmul implementation using Python and vulkan-kompute
+
+To test the implementation simply run the `matmul.py` script :
+
+```
+python matmul.py
+```
diff --git a/examples/python_naive_matmul/matmul.py b/examples/python_naive_matmul/matmul.py
new file mode 100644
index 000000000..1e7caa871
--- /dev/null
+++ b/examples/python_naive_matmul/matmul.py
@@ -0,0 +1,60 @@
+import kp
+import numpy as np
+
+
+def main():
+    mgr = kp.Manager()
+
+    tensor_size = 4
+    tensor_shape = [tensor_size, tensor_size]
+    tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+    tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+    tensor_out = mgr.tensor(np.zeros(tensor_shape))
+
+    print(f'Input tensors:\n'
+          f'{tensor_in_1.data().reshape(tensor_shape)}\n'
+          f'{tensor_in_2.data().reshape(tensor_shape)}\n')
+
+    params = [tensor_in_1, tensor_in_2, tensor_out]
+
+    matmul_shader = kp.Shader.compile_source('''
+#version 450
+
+layout (local_size_x = 1, local_size_y = 1) in;
+
+layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 { float in_tensor_1[]; };
+layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 { float in_tensor_2[]; };
+layout (set = 0, binding = 2) writeonly buffer buf_out_tensor { float out_tensor[]; };
+
+layout (constant_id = 0) const float tensor_size_f = 0;
+
+
+void main()
+{
+    uint globalRow = gl_GlobalInvocationID.x;
+    uint globalCol = gl_GlobalInvocationID.y;
+    uint tensor_size = uint(tensor_size_f);
+    float acc = 0.0;
+    for(uint k = 0u; k < tensor_size; k++)
+        acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k];
+    out_tensor[(globalCol * tensor_size) + globalRow] = acc;
+}''')
+
+    algo = mgr.algorithm(
+        params,  # params
+        matmul_shader,  # spirv
+        (*tensor_shape, 1),  # workgroup
+        [float(tensor_size)],  # spec_consts
+        [])  # push_consts
+
+    (mgr.sequence()
+     .record(kp.OpTensorSyncDevice(params))
+     .record(kp.OpAlgoDispatch(algo))
+     .record(kp.OpTensorSyncLocal(params))
+     .eval())
+
+    print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
+
+
+if __name__ == '__main__':
+    main()

From 8c9ad5f2a4f324891c805db25572682477cad97b Mon Sep 17 00:00:00 2001
From: Corentin <corentin-pro@mail.com>
Date: Tue, 22 Jun 2021 18:21:32 +0900
Subject: [PATCH 2/8] Naive matmul benchmark

---
 .../python_naive_matmul/1_naive_matmul.py     | 96 +++++++++++++++++++
 .../{matmul.py => first_example.py}           |  0
 2 files changed, 96 insertions(+)
 create mode 100644 examples/python_naive_matmul/1_naive_matmul.py
 rename examples/python_naive_matmul/{matmul.py => first_example.py} (100%)

diff --git a/examples/python_naive_matmul/1_naive_matmul.py b/examples/python_naive_matmul/1_naive_matmul.py
new file mode 100644
index 000000000..e7fe092a8
--- /dev/null
+++ b/examples/python_naive_matmul/1_naive_matmul.py
@@ -0,0 +1,96 @@
+import time
+
+import kp
+import numpy as np
+
+
+class MatMulOp:
+    def __init__(self, manager: kp.Manager, local_size_x: int = 1, local_size_y: int = 1):
+        self.mgr = manager
+        assert(local_size_x > 0)
+        assert(local_size_y > 0)
+        self.local_size_x = local_size_x
+        self.local_size_y = local_size_y
+
+        self.shader = kp.Shader.compile_source(f'''
+    #version 450
+
+    layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in;
+
+    layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
+    layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
+    layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
+
+    layout (constant_id = 0) const float tensor_size_f = 0;
+
+
+    void main()
+    {{
+        uint globalRow = gl_GlobalInvocationID.x;
+        uint globalCol = gl_GlobalInvocationID.y;
+        uint tensor_size = uint(tensor_size_f);
+        float acc = 0.0;
+        for(uint k = 0u; k < tensor_size; k++)
+            acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k];
+        out_tensor[(globalCol * tensor_size) + globalRow] = acc;
+    }}''')
+        self.tensor_shape: tuple[int, int] = (0, 0)
+        self.params: list[kp.Tensor] = []
+        self.algo = None
+
+    def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor,
+                 tensor_out: kp.Tensor):
+        params = [tensor_in_1, tensor_in_2, tensor_out]
+
+        if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
+            self.tensor_shape = tensor_shape
+            self.params = params
+            self.algo = self.mgr.algorithm(
+                params,  # params
+                self.shader,  # spirv
+                (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1),  # workgroup
+                [float(tensor_shape[0])],  # spec_consts
+                [])  # push_consts
+
+        (self.mgr.sequence()
+         .record(kp.OpTensorSyncDevice(self.params))
+         .record(kp.OpAlgoDispatch(self.algo))
+         .record(kp.OpTensorSyncLocal(self.params))
+         .eval())
+
+
+def main():
+    mgr = kp.Manager()
+
+    matmul_op = MatMulOp(mgr, 64, 64)
+
+    tensor_size = 512
+    tensor_shape = [tensor_size, tensor_size]
+    tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+    tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+    tensor_out = mgr.tensor(np.zeros(tensor_shape))
+
+    print(f'{tensor_shape} input tensors:\n'
+          f'{tensor_in_1.data().reshape(tensor_shape)}\n'
+          f'{tensor_in_2.data().reshape(tensor_shape)}\n')
+
+    matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+
+    experiment_count = 1000
+    start_time = time.time()
+    for _ in range(experiment_count):
+        matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+    end_time = time.time()
+    experiment_time = end_time - start_time
+    op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
+
+    print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
+
+    print(f'{experiment_count} matmul time : '
+          f'{experiment_time * 1000:0.2f}ms => '
+          f'{experiment_count / experiment_time:0.2f}op/s or '
+          f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/python_naive_matmul/matmul.py b/examples/python_naive_matmul/first_example.py
similarity index 100%
rename from examples/python_naive_matmul/matmul.py
rename to examples/python_naive_matmul/first_example.py

From 425380f1a11b3df97f58b717595ebefdd64f9cfe Mon Sep 17 00:00:00 2001
From: Corentin <corentin-pro@mail.com>
Date: Thu, 24 Jun 2021 18:36:10 +0900
Subject: [PATCH 3/8] Automated work group parameters and tiling matmul

---
 .../python_naive_matmul/1_naive_matmul.py     |  75 ++++++++---
 .../python_naive_matmul/2_tiled_matmul.py     | 127 ++++++++++++++++++
 examples/python_naive_matmul/README.md        |   2 +-
 3 files changed, 183 insertions(+), 21 deletions(-)
 create mode 100644 examples/python_naive_matmul/2_tiled_matmul.py

diff --git a/examples/python_naive_matmul/1_naive_matmul.py b/examples/python_naive_matmul/1_naive_matmul.py
index e7fe092a8..d5fe70cba 100644
--- a/examples/python_naive_matmul/1_naive_matmul.py
+++ b/examples/python_naive_matmul/1_naive_matmul.py
@@ -5,35 +5,64 @@ import numpy as np
 
 
 class MatMulOp:
-    def __init__(self, manager: kp.Manager, local_size_x: int = 1, local_size_y: int = 1):
+    def __init__(self, manager: kp.Manager, local_size_x: int = -1, local_size_y: int = -1):
         self.mgr = manager
-        assert(local_size_x > 0)
-        assert(local_size_y > 0)
+
+        props = self.mgr.get_device_properties()
+        max_workgroup_invocation = props['max_work_group_invocations']
+        max_workgroup_size = props['max_work_group_size']
+        if local_size_x < 1:
+            if local_size_y > 0:
+                local_size_x = 1
+                while (2 * local_size_x * local_size_y <= max_workgroup_invocation
+                       and 2 * local_size_x <= max_workgroup_size[0]):
+                    local_size_x *= 2
+            else:
+                local_size_x = 1
+                local_size_y = 1
+                while 2 * local_size_x * local_size_y <= max_workgroup_invocation:
+                    if 2 * local_size_x <= max_workgroup_size[0]:
+                        local_size_x *= 2
+                    if 2 * local_size_y <= max_workgroup_size[1]:
+                        local_size_y *= 2
+                    elif 2 * local_size_x > max_workgroup_size[0]:  # stop if neither x nor y can be double
+                        break
+        elif local_size_y < 0:
+            local_size_y = 1
+            while (2 * local_size_x * local_size_y <= max_workgroup_invocation
+                   and 2 * local_size_x <= max_workgroup_size[0]):
+                local_size_y *= 2
+
+        assert local_size_x > 0
+        assert local_size_y > 0
+        assert local_size_x * local_size_y <= max_workgroup_invocation
+        assert local_size_x <= max_workgroup_size[0]
+        assert local_size_y <= max_workgroup_size[1]
         self.local_size_x = local_size_x
         self.local_size_y = local_size_y
 
         self.shader = kp.Shader.compile_source(f'''
-    #version 450
+#version 450
 
-    layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in;
+layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in;
 
-    layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
-    layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
-    layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
+layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
+layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
+layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
 
-    layout (constant_id = 0) const float tensor_size_f = 0;
+layout (constant_id = 0) const float tensor_size_f = 0;
 
 
-    void main()
-    {{
-        uint globalRow = gl_GlobalInvocationID.x;
-        uint globalCol = gl_GlobalInvocationID.y;
-        uint tensor_size = uint(tensor_size_f);
-        float acc = 0.0;
-        for(uint k = 0u; k < tensor_size; k++)
-            acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k];
-        out_tensor[(globalCol * tensor_size) + globalRow] = acc;
-    }}''')
+void main()
+{{
+    uint globalRow = gl_GlobalInvocationID.x;
+    uint globalCol = gl_GlobalInvocationID.y;
+    uint tensor_size = uint(tensor_size_f);
+    float acc = 0.0;
+    for(uint k = 0u; k < tensor_size; k++)
+        acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k];
+    out_tensor[(globalCol * tensor_size) + globalRow] = acc;
+}}''')
         self.tensor_shape: tuple[int, int] = (0, 0)
         self.params: list[kp.Tensor] = []
         self.algo = None
@@ -62,7 +91,7 @@ class MatMulOp:
 def main():
     mgr = kp.Manager()
 
-    matmul_op = MatMulOp(mgr, 64, 64)
+    matmul_op = MatMulOp(mgr)
 
     tensor_size = 512
     tensor_shape = [tensor_size, tensor_size]
@@ -92,5 +121,11 @@ def main():
           f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
 
 
+def test():
+    main()
+
+
 if __name__ == '__main__':
     main()
+else:
+    test()
diff --git a/examples/python_naive_matmul/2_tiled_matmul.py b/examples/python_naive_matmul/2_tiled_matmul.py
new file mode 100644
index 000000000..be403a46a
--- /dev/null
+++ b/examples/python_naive_matmul/2_tiled_matmul.py
@@ -0,0 +1,127 @@
+import time
+
+import kp
+import numpy as np
+
+
+class MatMulOp:
+    def __init__(self, manager: kp.Manager, tile_size: int = -1):
+        self.mgr = manager
+
+        props = self.mgr.get_device_properties()
+        max_workgroup_invocation = props['max_work_group_invocations']
+        max_workgroup_size = props['max_work_group_size']
+        if tile_size < 0:
+            tile_size = 1
+            while (2 * tile_size * tile_size <= max_workgroup_invocation
+                   and 2 * tile_size <= max_workgroup_size[0]
+                   and 2 * tile_size <= max_workgroup_size[1]):
+                tile_size *= 2
+
+        assert tile_size > 0
+        assert tile_size * tile_size <= max_workgroup_invocation
+        assert tile_size <= max_workgroup_size[0]
+        assert tile_size <= max_workgroup_size[1]
+        self.tile_size = tile_size
+
+        self.shader = kp.Shader.compile_source(f'''
+#version 450
+
+layout (local_size_x = {tile_size}, local_size_y = {tile_size}) in;
+
+layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
+layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
+layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
+
+layout (constant_id = 0) const float tensor_size_f = 0;
+
+shared float sub_tensor_1[{tile_size}][{tile_size}];
+shared float sub_tensor_2[{tile_size}][{tile_size}];
+
+void main()
+{{
+    uint row = gl_GlobalInvocationID.x;
+    uint col = gl_GlobalInvocationID.y;
+    uint globalRow = {tile_size} * gl_WorkGroupID.x + row;
+    uint globalCol = {tile_size} * gl_WorkGroupID.y + row;
+
+    uint tensor_size = uint(tensor_size_f);
+    float acc = 0.0;
+    uint numTiles = tensor_size / {tile_size};
+    for(uint t = 0u; t < numTiles; t++)
+    {{
+        uint tiledRow = {tile_size} * t + row;
+        uint tiledCol = {tile_size} * t + col;
+        sub_tensor_1[col][row] = in_tensor_1[tiledCol * tensor_size + globalRow];
+        sub_tensor_2[col][row] = in_tensor_2[globalCol * tensor_size + tiledRow];
+
+        memoryBarrierShared();
+        barrier();
+
+        for(uint k = 0u; k < {tile_size}; k++)
+            acc += sub_tensor_1[k][row] * sub_tensor_2[col][k];
+
+        barrier();
+    }}
+    out_tensor[(globalCol * tensor_size) + globalRow] = acc;
+}}''')
+        self.tensor_shape: tuple[int, int] = (0, 0)
+        self.params: list[kp.Tensor] = []
+        self.algo = None
+
+    def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor,
+                 tensor_out: kp.Tensor):
+        params = [tensor_in_1, tensor_in_2, tensor_out]
+
+        if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
+            self.tensor_shape = tensor_shape
+            self.params = params
+            self.algo = self.mgr.algorithm(
+                params,  # params
+                self.shader,  # spirv
+                (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1),  # workgroup
+                [float(tensor_shape[0])],  # spec_consts
+                [])  # push_consts
+
+        (self.mgr.sequence()
+         .record(kp.OpTensorSyncDevice(self.params))
+         .record(kp.OpAlgoDispatch(self.algo))
+         .record(kp.OpTensorSyncLocal(self.params))
+         .eval())
+
+
+def main():
+    mgr = kp.Manager()
+
+    matmul_op = MatMulOp(mgr)
+
+    tensor_size = 512
+    tensor_shape = [tensor_size, tensor_size]
+    tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+    tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+    tensor_out = mgr.tensor(np.zeros(tensor_shape))
+
+    print(f'{tensor_shape} input tensors:\n'
+          f'{tensor_in_1.data().reshape(tensor_shape)}\n'
+          f'{tensor_in_2.data().reshape(tensor_shape)}\n')
+
+    matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+
+    experiment_count = 1000
+    start_time = time.time()
+    for _ in range(experiment_count):
+        matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+    end_time = time.time()
+    experiment_time = end_time - start_time
+    op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
+
+    print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
+
+    print(f'{experiment_count} matmul time : '
+          f'{experiment_time * 1000:0.2f}ms => '
+          f'{experiment_count / experiment_time:0.2f}op/s or '
+          f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/python_naive_matmul/README.md b/examples/python_naive_matmul/README.md
index 26a89172b..0688bb079 100644
--- a/examples/python_naive_matmul/README.md
+++ b/examples/python_naive_matmul/README.md
@@ -1,6 +1,6 @@
 # Naive Matmul Implementation
 
-This demonstrate a basic matmul implementation using Python and vulkan-kompute
+This demonstrate a basic matmul implementation using Python and vulkan-kompute. Many thanks for the very helpful [SGEMM in WebGL2-compute](https://www.ibiblio.org/e-notes/webgl/gpu/mul/sgemm.htm) article on the public library [ibiblio.org](https://www.ibiblio.org/).
 
 To test the implementation simply run the `matmul.py` script :
 

From 6f04eb9db2a4bfa6e5196bb015d9bacd0245fc10 Mon Sep 17 00:00:00 2001
From: Corentin <corentin-pro@mail.com>
Date: Thu, 24 Jun 2021 19:36:08 +0900
Subject: [PATCH 4/8] Better tiling implementation, fixed tiling asserts

---
 .../python_naive_matmul/2_tiled_matmul.py     |   2 +-
 .../python_naive_matmul/3_better_tiling.py    | 141 ++++++++++++++++++
 2 files changed, 142 insertions(+), 1 deletion(-)
 create mode 100644 examples/python_naive_matmul/3_better_tiling.py

diff --git a/examples/python_naive_matmul/2_tiled_matmul.py b/examples/python_naive_matmul/2_tiled_matmul.py
index be403a46a..839b07989 100644
--- a/examples/python_naive_matmul/2_tiled_matmul.py
+++ b/examples/python_naive_matmul/2_tiled_matmul.py
@@ -13,7 +13,7 @@ class MatMulOp:
         max_workgroup_size = props['max_work_group_size']
         if tile_size < 0:
             tile_size = 1
-            while (2 * tile_size * tile_size <= max_workgroup_invocation
+            while (4 * tile_size * tile_size <= max_workgroup_invocation
                    and 2 * tile_size <= max_workgroup_size[0]
                    and 2 * tile_size <= max_workgroup_size[1]):
                 tile_size *= 2
diff --git a/examples/python_naive_matmul/3_better_tiling.py b/examples/python_naive_matmul/3_better_tiling.py
new file mode 100644
index 000000000..5dcc52349
--- /dev/null
+++ b/examples/python_naive_matmul/3_better_tiling.py
@@ -0,0 +1,141 @@
+import time
+
+import kp
+import numpy as np
+
+
+class MatMulOp:
+    def __init__(self, manager: kp.Manager, tile_size: int = -1, thread_work_ratio: int = 8):
+        self.mgr = manager
+
+        props = self.mgr.get_device_properties()
+        max_workgroup_invocation = props['max_work_group_invocations']
+        max_workgroup_size = props['max_work_group_size']
+        if tile_size < 0:
+            tile_size = 1
+            local_size_y = tile_size // thread_work_ratio
+            while (4 * tile_size * local_size_y <= max_workgroup_invocation
+                   and 2 * tile_size <= max_workgroup_size[0]
+                   and 2 * local_size_y <= max_workgroup_size[1]):
+                tile_size *= 2
+                local_size_y = tile_size // thread_work_ratio
+        else:
+            local_size_y = tile_size // thread_work_ratio
+
+        assert tile_size > 0
+        assert thread_work_ratio > 0
+        assert tile_size * local_size_y <= max_workgroup_invocation
+        assert tile_size <= max_workgroup_size[0]
+        assert local_size_y <= max_workgroup_size[1]
+        self.tile_size = tile_size
+        self.thread_work_ratio = thread_work_ratio
+
+        local_size_y = tile_size // thread_work_ratio
+        self.shader = kp.Shader.compile_source(f'''
+#version 450
+
+layout (local_size_x = {tile_size}, local_size_y = {local_size_y}) in;
+
+layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
+layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
+layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
+
+layout (constant_id = 0) const float tensor_size_f = 0;
+
+shared float sub_tensor_1[{tile_size}][{tile_size}];
+shared float sub_tensor_2[{tile_size}][{tile_size}];
+
+void main()
+{{
+    uint row = gl_GlobalInvocationID.x;
+    uint col = gl_GlobalInvocationID.y;
+    uint globalRow = {tile_size} * gl_WorkGroupID.x + row;
+    uint globalCol = {tile_size} * gl_WorkGroupID.y + row;
+
+    uint tensor_size = uint(tensor_size_f);
+    float acc[{thread_work_ratio}];
+    for (uint l = 0u; l < {thread_work_ratio}; l++)
+        acc[l] = 0.0;
+
+    uint numTiles = tensor_size / {tile_size};
+    for(uint t = 0u; t < numTiles; t++)
+    {{
+        uint tiledRow = {tile_size} * t + row;
+        uint tiledCol = {tile_size} * t + col;
+        sub_tensor_1[col + t * {local_size_y}][row] = in_tensor_1[
+            (tiledCol + t * {local_size_y}) * tensor_size + globalRow];
+        sub_tensor_2[col + t * {local_size_y}][row] = in_tensor_2[
+            (globalCol + t * {local_size_y})* tensor_size + tiledRow];
+
+        memoryBarrierShared();
+        barrier();
+
+        for(uint k = 0u; k < {tile_size}; k++)
+            for(uint l = 0u; l < {thread_work_ratio}; l++)
+                acc[l] += sub_tensor_1[k][row] * sub_tensor_2[col + l * {local_size_y}][k];
+
+        barrier();
+    }}
+    for(uint l = 0u; l < {thread_work_ratio}; l++)
+        out_tensor[(globalCol + l * {local_size_y}) * tensor_size + globalRow] = acc[l];
+}}''')
+        self.tensor_shape: tuple[int, int] = (0, 0)
+        self.params: list[kp.Tensor] = []
+        self.algo = None
+
+    def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor,
+                 tensor_out: kp.Tensor):
+        params = [tensor_in_1, tensor_in_2, tensor_out]
+
+        if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
+            self.tensor_shape = tensor_shape
+            self.params = params
+            self.algo = self.mgr.algorithm(
+                params,  # params
+                self.shader,  # spirv
+                (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1),  # workgroup
+                [float(tensor_shape[0])],  # spec_consts
+                [])  # push_consts
+
+        (self.mgr.sequence()
+         .record(kp.OpTensorSyncDevice(self.params))
+         .record(kp.OpAlgoDispatch(self.algo))
+         .record(kp.OpTensorSyncLocal(self.params))
+         .eval())
+
+
+def main():
+    mgr = kp.Manager()
+
+    matmul_op = MatMulOp(mgr)
+
+    tensor_size = 512
+    tensor_shape = [tensor_size, tensor_size]
+    tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+    tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+    tensor_out = mgr.tensor(np.zeros(tensor_shape))
+
+    print(f'{tensor_shape} input tensors:\n'
+          f'{tensor_in_1.data().reshape(tensor_shape)}\n'
+          f'{tensor_in_2.data().reshape(tensor_shape)}\n')
+
+    matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+
+    experiment_count = 1000
+    start_time = time.time()
+    for _ in range(experiment_count):
+        matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+    end_time = time.time()
+    experiment_time = end_time - start_time
+    op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
+
+    print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
+
+    print(f'{experiment_count} matmul time : '
+          f'{experiment_time * 1000:0.2f}ms => '
+          f'{experiment_count / experiment_time:0.2f}op/s or '
+          f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
+
+
+if __name__ == '__main__':
+    main()

From 7f4ec272353a337e663c5e462401f0421a228614 Mon Sep 17 00:00:00 2001
From: Corentin <corentin-pro@mail.com>
Date: Fri, 25 Jun 2021 02:49:28 +0900
Subject: [PATCH 5/8] Fix second implementation, add benchmark script

* Third implementation is broken (WIP)
---
 examples/python_naive_matmul/benchmark.py     |  49 ++++++
 .../{1_naive_matmul.py => imp1_naive.py}      |  18 +-
 .../{2_tiled_matmul.py => imp2_tiled.py}      |  32 ++--
 .../python_naive_matmul/imp2_tiled_debug.py   | 156 ++++++++++++++++++
 ...better_tiling.py => imp3_better_tiling.py} |  35 ++--
 examples/python_naive_matmul/matmul_plot.py   |  97 +++++++++++
 6 files changed, 347 insertions(+), 40 deletions(-)
 create mode 100644 examples/python_naive_matmul/benchmark.py
 rename examples/python_naive_matmul/{1_naive_matmul.py => imp1_naive.py} (92%)
 rename examples/python_naive_matmul/{2_tiled_matmul.py => imp2_tiled.py} (79%)
 create mode 100644 examples/python_naive_matmul/imp2_tiled_debug.py
 rename examples/python_naive_matmul/{3_better_tiling.py => imp3_better_tiling.py} (78%)
 create mode 100644 examples/python_naive_matmul/matmul_plot.py

diff --git a/examples/python_naive_matmul/benchmark.py b/examples/python_naive_matmul/benchmark.py
new file mode 100644
index 000000000..b10369d7c
--- /dev/null
+++ b/examples/python_naive_matmul/benchmark.py
@@ -0,0 +1,49 @@
+import time
+
+import kp
+import numpy as np
+from imp1_naive import MatMulOp as MatMulOp1
+from imp2_tiled import MatMulOp as MatMulOp2
+from imp3_better_tiling import MatMulOp as MatMulOp3
+
+
+def main():
+    experiment_count = 1000
+    tensor_size = 512
+    tensor_shape = [tensor_size, tensor_size]
+    mat_1 = np.triu(np.ones(tensor_shape))
+    mat_2 = np.triu(np.ones(tensor_shape))
+    mat_result = mat_1 @ mat_2
+    tensor_shape = [tensor_size, tensor_size]
+
+    print(f'{tensor_shape} input tensors:\n'
+          f'{mat_1}\n'
+          f'{mat_2}\n')
+    print(f'Output :\n{mat_result}')
+
+    mgr = kp.Manager()
+    tensor_in_1 = mgr.tensor(mat_1)
+    tensor_in_2 = mgr.tensor(mat_2)
+    tensor_out = mgr.tensor(np.zeros(tensor_shape))
+    for MatMulOp in [MatMulOp1, MatMulOp2, MatMulOp3]:
+        matmul_op = MatMulOp(mgr)
+        matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+
+        start_time = time.time()
+        for _ in range(experiment_count):
+            matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+        end_time = time.time()
+        experiment_time = end_time - start_time
+        op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
+
+        if (tensor_out.data().reshape(tensor_shape) == mat_result).all():
+            print(f'From {MatMulOp.__module__} : {experiment_count} matmul time : '
+                  f'{experiment_time * 1000:0.2f}ms => '
+                  f'{experiment_count / experiment_time:0.2f}op/s or '
+                  f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
+        else:
+            print(f'Test failed => output tensor is wrong :\n{tensor_out.data().reshape(tensor_shape)}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/python_naive_matmul/1_naive_matmul.py b/examples/python_naive_matmul/imp1_naive.py
similarity index 92%
rename from examples/python_naive_matmul/1_naive_matmul.py
rename to examples/python_naive_matmul/imp1_naive.py
index d5fe70cba..faefec563 100644
--- a/examples/python_naive_matmul/1_naive_matmul.py
+++ b/examples/python_naive_matmul/imp1_naive.py
@@ -41,7 +41,7 @@ class MatMulOp:
         self.local_size_x = local_size_x
         self.local_size_y = local_size_y
 
-        self.shader = kp.Shader.compile_source(f'''
+        self.shader = f'''
 #version 450
 
 layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in;
@@ -62,7 +62,8 @@ void main()
     for(uint k = 0u; k < tensor_size; k++)
         acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k];
     out_tensor[(globalCol * tensor_size) + globalRow] = acc;
-}}''')
+}}'''
+        self.compiled_shader = kp.Shader.compile_source(self.shader)
         self.tensor_shape: tuple[int, int] = (0, 0)
         self.params: list[kp.Tensor] = []
         self.algo = None
@@ -74,17 +75,18 @@ void main()
         if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
             self.tensor_shape = tensor_shape
             self.params = params
+            workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1)
             self.algo = self.mgr.algorithm(
                 params,  # params
-                self.shader,  # spirv
-                (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1),  # workgroup
+                self.compiled_shader,  # spirv
+                workgroup,  # workgroup
                 [float(tensor_shape[0])],  # spec_consts
                 [])  # push_consts
 
         (self.mgr.sequence()
          .record(kp.OpTensorSyncDevice(self.params))
          .record(kp.OpAlgoDispatch(self.algo))
-         .record(kp.OpTensorSyncLocal(self.params))
+         .record(kp.OpTensorSyncLocal([tensor_out]))
          .eval())
 
 
@@ -121,11 +123,5 @@ def main():
           f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
 
 
-def test():
-    main()
-
-
 if __name__ == '__main__':
     main()
-else:
-    test()
diff --git a/examples/python_naive_matmul/2_tiled_matmul.py b/examples/python_naive_matmul/imp2_tiled.py
similarity index 79%
rename from examples/python_naive_matmul/2_tiled_matmul.py
rename to examples/python_naive_matmul/imp2_tiled.py
index 839b07989..ed6a1ddf8 100644
--- a/examples/python_naive_matmul/2_tiled_matmul.py
+++ b/examples/python_naive_matmul/imp2_tiled.py
@@ -24,7 +24,7 @@ class MatMulOp:
         assert tile_size <= max_workgroup_size[1]
         self.tile_size = tile_size
 
-        self.shader = kp.Shader.compile_source(f'''
+        self.shader = f'''
 #version 450
 
 layout (local_size_x = {tile_size}, local_size_y = {tile_size}) in;
@@ -40,20 +40,21 @@ shared float sub_tensor_2[{tile_size}][{tile_size}];
 
 void main()
 {{
-    uint row = gl_GlobalInvocationID.x;
-    uint col = gl_GlobalInvocationID.y;
-    uint globalRow = {tile_size} * gl_WorkGroupID.x + row;
-    uint globalCol = {tile_size} * gl_WorkGroupID.y + row;
+    uint row = gl_LocalInvocationID.x; // 0 .. tile_size
+    uint col = gl_LocalInvocationID.y; // 0 .. tile_size
+    // gl_WorkGroupID : 0 .. tensor_size / tile_size
+    uint globalRow = ({tile_size} * gl_WorkGroupID.x) + row;
+    uint globalCol = ({tile_size} * gl_WorkGroupID.y) + col;
 
     uint tensor_size = uint(tensor_size_f);
     float acc = 0.0;
     uint numTiles = tensor_size / {tile_size};
     for(uint t = 0u; t < numTiles; t++)
     {{
-        uint tiledRow = {tile_size} * t + row;
-        uint tiledCol = {tile_size} * t + col;
-        sub_tensor_1[col][row] = in_tensor_1[tiledCol * tensor_size + globalRow];
-        sub_tensor_2[col][row] = in_tensor_2[globalCol * tensor_size + tiledRow];
+        uint tiledRow = ({tile_size} * t) + row;
+        uint tiledCol = ({tile_size} * t) + col;
+        sub_tensor_1[col][row] = in_tensor_1[(tiledCol * tensor_size) + globalRow];
+        sub_tensor_2[col][row] = in_tensor_2[(globalCol * tensor_size) + tiledRow];
 
         memoryBarrierShared();
         barrier();
@@ -63,8 +64,10 @@ void main()
 
         barrier();
     }}
-    out_tensor[(globalCol * tensor_size) + globalRow] = acc;
-}}''')
+    uint globalIndex = (tensor_size * globalCol) + globalRow;
+    out_tensor[globalIndex] = acc;
+}}'''
+        self.compiled_shader = kp.Shader.compile_source(self.shader)
         self.tensor_shape: tuple[int, int] = (0, 0)
         self.params: list[kp.Tensor] = []
         self.algo = None
@@ -76,17 +79,18 @@ void main()
         if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
             self.tensor_shape = tensor_shape
             self.params = params
+            workgroup = (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1)
             self.algo = self.mgr.algorithm(
                 params,  # params
-                self.shader,  # spirv
-                (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1),  # workgroup
+                self.compiled_shader,  # spirv
+                workgroup,  # workgroup
                 [float(tensor_shape[0])],  # spec_consts
                 [])  # push_consts
 
         (self.mgr.sequence()
          .record(kp.OpTensorSyncDevice(self.params))
          .record(kp.OpAlgoDispatch(self.algo))
-         .record(kp.OpTensorSyncLocal(self.params))
+         .record(kp.OpTensorSyncLocal([tensor_out]))
          .eval())
 
 
diff --git a/examples/python_naive_matmul/imp2_tiled_debug.py b/examples/python_naive_matmul/imp2_tiled_debug.py
new file mode 100644
index 000000000..b10cbfd7d
--- /dev/null
+++ b/examples/python_naive_matmul/imp2_tiled_debug.py
@@ -0,0 +1,156 @@
+import time
+
+import kp
+import numpy as np
+
+
+class MatMulOp:
+    def __init__(self, manager: kp.Manager, tile_size: int = -1):
+        self.mgr = manager
+
+        props = self.mgr.get_device_properties()
+        max_workgroup_invocation = props['max_work_group_invocations']
+        max_workgroup_size = props['max_work_group_size']
+        if tile_size < 0:
+            tile_size = 1
+            while (4 * tile_size * tile_size <= max_workgroup_invocation
+                   and 2 * tile_size <= max_workgroup_size[0]
+                   and 2 * tile_size <= max_workgroup_size[1]):
+                tile_size *= 2
+
+        assert tile_size > 0
+        assert tile_size * tile_size <= max_workgroup_invocation
+        assert tile_size <= max_workgroup_size[0]
+        assert tile_size <= max_workgroup_size[1]
+        self.tile_size = tile_size
+
+        print(f'{tile_size=}')
+        self.shader = f'''
+#version 450
+
+layout (local_size_x = {tile_size}, local_size_y = {tile_size}) in;
+
+layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
+layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
+layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
+layout (set = 0, binding = 3) writeonly buffer buf_test1_tensor {{ float test1_tensor[]; }};
+layout (set = 0, binding = 4) writeonly buffer buf_test2_tensor {{ float test2_tensor[]; }};
+layout (set = 0, binding = 5) writeonly buffer buf_test3_tensor {{ float test3_tensor[]; }};
+layout (set = 0, binding = 6) writeonly buffer buf_test4_tensor {{ float test4_tensor[]; }};
+
+layout (constant_id = 0) const float tensor_size_f = 0;
+
+shared float sub_tensor_1[{tile_size}][{tile_size}];
+shared float sub_tensor_2[{tile_size}][{tile_size}];
+
+void main()
+{{
+    uint row = gl_LocalInvocationID.x; // 0 .. tile_size
+    uint col = gl_LocalInvocationID.y; // 0 .. tile_size
+    // gl_WorkGroupID : 0 .. tensor_size / tile_size
+    uint globalRow = ({tile_size} * gl_WorkGroupID.x) + row;
+    uint globalCol = ({tile_size} * gl_WorkGroupID.y) + col;
+
+    uint tensor_size = uint(tensor_size_f);
+    float acc = 0.0;
+    uint numTiles = tensor_size / {tile_size};
+    for(uint t = 0u; t < numTiles; t++)
+    {{
+        uint tiledRow = ({tile_size} * t) + row;
+        uint tiledCol = ({tile_size} * t) + col;
+        sub_tensor_1[col][row] = in_tensor_1[(tiledCol * tensor_size) + globalRow];
+        sub_tensor_2[col][row] = in_tensor_2[(globalCol * tensor_size) + tiledRow];
+
+        memoryBarrierShared();
+        barrier();
+
+        for(uint k = 0u; k < {tile_size}; k++)
+            acc += sub_tensor_1[k][row] * sub_tensor_2[col][k];
+
+        barrier();
+    }}
+    uint globalIndex = (tensor_size * globalCol) + globalRow;
+    out_tensor[globalIndex] = acc;
+    test1_tensor[globalIndex] = row;
+    test2_tensor[globalIndex] = col;
+    test3_tensor[globalIndex] = gl_WorkGroupID.x;
+    test4_tensor[globalIndex] = gl_WorkGroupID.y;
+}}'''
+        print(self.shader)
+        self.compiled_shader = kp.Shader.compile_source(self.shader)
+        self.tensor_shape: tuple[int, int] = (0, 0)
+        self.params: list[kp.Tensor] = []
+        self.algo = None
+
+    def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor,
+                 tensor_out: kp.Tensor, tensor_test_1: kp.Tensor, tensor_test_2: kp.Tensor,
+                 tensor_test_3: kp.Tensor, tensor_test_4: kp.Tensor):
+        # params = [tensor_in_1, tensor_in_2, tensor_out]
+        params = [tensor_in_1, tensor_in_2, tensor_out, tensor_test_1, tensor_test_2, tensor_test_3, tensor_test_4]
+
+        if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
+            self.tensor_shape = tensor_shape
+            self.params = params
+            workgroup = (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1)
+            # workgroup = (2, 2, 1)
+            print(f'{float(tensor_shape[0])=} {self.tile_size=} {workgroup=}')
+            self.algo = self.mgr.algorithm(
+                params,  # params
+                self.compiled_shader,  # spirv
+                workgroup,  # workgroup
+                [float(tensor_shape[0])],  # spec_consts
+                [])  # push_consts
+
+        (self.mgr.sequence()
+         .record(kp.OpTensorSyncDevice(self.params))
+         .record(kp.OpAlgoDispatch(self.algo))
+         .record(kp.OpTensorSyncLocal(self.params[2:]))
+         # .record(kp.OpTensorSyncLocal([tensor_out]))
+         .eval())
+
+
+def main():
+    mgr = kp.Manager()
+
+    matmul_op = MatMulOp(mgr, 4)
+
+    tensor_size = 8
+    tensor_shape = [tensor_size, tensor_size]
+    tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+    tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+    tensor_out = mgr.tensor(np.zeros(tensor_shape))
+    tensor_test_1 = mgr.tensor(np.zeros(tensor_shape))
+    tensor_test_2 = mgr.tensor(np.zeros(tensor_shape))
+    tensor_test_3 = mgr.tensor(np.zeros(tensor_shape))
+    tensor_test_4 = mgr.tensor(np.zeros(tensor_shape))
+
+    print(f'{tensor_shape} input tensors:\n'
+          f'{tensor_in_1.data().reshape(tensor_shape)}\n'
+          f'{tensor_in_2.data().reshape(tensor_shape)}\n')
+
+    # matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+    matmul_op(tensor_shape, tensor_in_1, tensor_in_2,
+              tensor_out, tensor_test_1, tensor_test_2, tensor_test_3, tensor_test_4)
+
+    # experiment_count = 10
+    # start_time = time.time()
+    # for _ in range(experiment_count):
+    #     matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+    # end_time = time.time()
+    # experiment_time = end_time - start_time
+    # op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
+
+    print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
+    print(f'test_1 :\n{tensor_test_1.data().reshape(tensor_shape)}')
+    print(f'test_2 :\n{tensor_test_2.data().reshape(tensor_shape)}')
+    print(f'test_3 :\n{tensor_test_3.data().reshape(tensor_shape)}')
+    print(f'test_4 :\n{tensor_test_4.data().reshape(tensor_shape)}')
+
+    # print(f'{experiment_count} matmul time : '
+    #       f'{experiment_time * 1000:0.2f}ms => '
+    #       f'{experiment_count / experiment_time:0.2f}op/s or '
+    #       f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/python_naive_matmul/3_better_tiling.py b/examples/python_naive_matmul/imp3_better_tiling.py
similarity index 78%
rename from examples/python_naive_matmul/3_better_tiling.py
rename to examples/python_naive_matmul/imp3_better_tiling.py
index 5dcc52349..6b3ada314 100644
--- a/examples/python_naive_matmul/3_better_tiling.py
+++ b/examples/python_naive_matmul/imp3_better_tiling.py
@@ -30,11 +30,12 @@ class MatMulOp:
         self.tile_size = tile_size
         self.thread_work_ratio = thread_work_ratio
 
-        local_size_y = tile_size // thread_work_ratio
-        self.shader = kp.Shader.compile_source(f'''
+        self.local_size_x = tile_size
+        self.local_size_y = tile_size // thread_work_ratio
+        self.shader = f'''
 #version 450
 
-layout (local_size_x = {tile_size}, local_size_y = {local_size_y}) in;
+layout (local_size_x = {tile_size}, local_size_y = {self.local_size_y}) in;
 
 layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
 layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
@@ -47,8 +48,8 @@ shared float sub_tensor_2[{tile_size}][{tile_size}];
 
 void main()
 {{
-    uint row = gl_GlobalInvocationID.x;
-    uint col = gl_GlobalInvocationID.y;
+    uint row = gl_LocalInvocationID.x;
+    uint col = gl_LocalInvocationID.y;
     uint globalRow = {tile_size} * gl_WorkGroupID.x + row;
     uint globalCol = {tile_size} * gl_WorkGroupID.y + row;
 
@@ -62,23 +63,24 @@ void main()
     {{
         uint tiledRow = {tile_size} * t + row;
         uint tiledCol = {tile_size} * t + col;
-        sub_tensor_1[col + t * {local_size_y}][row] = in_tensor_1[
-            (tiledCol + t * {local_size_y}) * tensor_size + globalRow];
-        sub_tensor_2[col + t * {local_size_y}][row] = in_tensor_2[
-            (globalCol + t * {local_size_y})* tensor_size + tiledRow];
+        sub_tensor_1[col + t * {self.local_size_y}][row] = in_tensor_1[
+            (tiledCol + t * {self.local_size_y}) * tensor_size + globalRow];
+        sub_tensor_2[col + t * {self.local_size_y}][row] = in_tensor_2[
+            (globalCol + t * {self.local_size_y})* tensor_size + tiledRow];
 
         memoryBarrierShared();
         barrier();
 
         for(uint k = 0u; k < {tile_size}; k++)
             for(uint l = 0u; l < {thread_work_ratio}; l++)
-                acc[l] += sub_tensor_1[k][row] * sub_tensor_2[col + l * {local_size_y}][k];
+                acc[l] += sub_tensor_1[k][row] * sub_tensor_2[col + l * {self.local_size_y}][k];
 
         barrier();
     }}
     for(uint l = 0u; l < {thread_work_ratio}; l++)
-        out_tensor[(globalCol + l * {local_size_y}) * tensor_size + globalRow] = acc[l];
-}}''')
+        out_tensor[(globalCol + l * {self.local_size_y}) * tensor_size + globalRow] = acc[l];
+}}'''
+        self.compiled_shader = kp.Shader.compile_source(self.shader)
         self.tensor_shape: tuple[int, int] = (0, 0)
         self.params: list[kp.Tensor] = []
         self.algo = None
@@ -90,17 +92,20 @@ void main()
         if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
             self.tensor_shape = tensor_shape
             self.params = params
+            print(
+                tensor_shape, self.local_size_x, self.local_size_y,
+                (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1))
             self.algo = self.mgr.algorithm(
                 params,  # params
-                self.shader,  # spirv
-                (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1),  # workgroup
+                self.compiled_shader,  # spirv
+                (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1),  # workgroup
                 [float(tensor_shape[0])],  # spec_consts
                 [])  # push_consts
 
         (self.mgr.sequence()
          .record(kp.OpTensorSyncDevice(self.params))
          .record(kp.OpAlgoDispatch(self.algo))
-         .record(kp.OpTensorSyncLocal(self.params))
+         .record(kp.OpTensorSyncLocal([tensor_out]))
          .eval())
 
 
diff --git a/examples/python_naive_matmul/matmul_plot.py b/examples/python_naive_matmul/matmul_plot.py
new file mode 100644
index 000000000..81763a439
--- /dev/null
+++ b/examples/python_naive_matmul/matmul_plot.py
@@ -0,0 +1,97 @@
+from argparse import ArgumentParser
+
+import cv2
+import numpy as np
+
+
+def plot_tensor(window_name: str, tensor: np.ndarray, coord_highlight: tuple[int, int] = None):
+    font_size = 48
+    image = np.zeros((tensor.shape[1] * font_size, tensor.shape[0] * font_size, 3), dtype=np.uint8)
+
+    for y in range(tensor.shape[1]):
+        for x in range(tensor.shape[0]):
+            if coord_highlight and x == coord_highlight[1] and y == coord_highlight[0]:
+                cv2.putText(
+                    image, str(int(tensor[y, x])), (x * font_size, int((y + 0.8) * font_size)),
+                    cv2.FONT_HERSHEY_TRIPLEX, 1., (127, 127, 255))
+            else:
+                cv2.putText(
+                    image, str(int(tensor[y, x])), (x * font_size, int((y + 0.8) * font_size)),
+                    cv2.FONT_HERSHEY_TRIPLEX, 1., (255, 255, 255))
+
+    cv2.imshow(window_name, image)
+
+
+def main():
+    parser = ArgumentParser()
+    parser.add_argument('tensor_size', type=int, help='Size of the square tensors')
+    parser.add_argument('tile_size', type=int)
+    parser.add_argument('local_size', type=int, nargs=2)
+    parser.add_argument('workgroup', type=int, nargs=2)
+    arguments = parser.parse_args()
+
+    tensor_size: int = arguments.tensor_size
+    tile_size: int = arguments.tile_size
+    local_size: tuple[int, int, int] = tuple(arguments.local_size)
+    workgroup: tuple[int, int, int] = tuple(arguments.workgroup)
+
+    tensor_shape = (tensor_size, tensor_size)
+    tensor_1 = np.triu(np.ones(tensor_shape))
+    tensor_2 = np.triu(np.ones(tensor_shape))
+    tensor_out = np.zeros(tensor_shape)
+    tensor_test_1 = np.zeros(tensor_shape)
+    tensor_test_2 = np.zeros(tensor_shape)
+    tensor_test_3 = np.zeros(tensor_shape)
+    tensor_test_4 = np.zeros(tensor_shape)
+    tensor_test_5 = np.zeros(tensor_shape)
+
+    plot_tensor('tensor_1', tensor_1)
+    plot_tensor('tensor_2', tensor_2)
+    plot_tensor('tensor_out', tensor_out)
+    plot_tensor('tensor_test_1', tensor_test_1)
+    plot_tensor('tensor_test_2', tensor_test_2)
+    plot_tensor('tensor_test_3', tensor_test_3)
+    plot_tensor('tensor_test_4', tensor_test_4)
+    plot_tensor('tensor_test_5', tensor_test_5)
+    cv2.waitKey(-1)
+
+    print(f'{workgroup=} {local_size=}')
+    for workgroup_x in range(workgroup[0]):
+        for workgroup_y in range(workgroup[1]):
+            for invocation_x in range(workgroup_x * local_size[0], (workgroup_x + 1) * local_size[0]):
+                for invocation_y in range(workgroup_y * local_size[1], (workgroup_y + 1) * local_size[1]):
+                    row = invocation_x
+                    col = invocation_y
+                    globalRow = (tile_size * workgroup_x) + row
+                    globalCol = (tile_size * workgroup_y) + col
+                    try:
+                        tensor_out[row, col] = row
+                        tensor_test_1[row, col] = col
+                        tensor_test_2[row, col] = workgroup_x
+                        tensor_test_3[row, col] = workgroup_y
+                        tensor_test_4[row, col] = globalRow
+                        tensor_test_5[row, col] = globalCol
+                        plot_tensor('tensor_out', tensor_out, (row, col))
+                        plot_tensor('tensor_test_1', tensor_test_1, (row, col))
+                        plot_tensor('tensor_test_2', tensor_test_2, (row, col))
+                        plot_tensor('tensor_test_3', tensor_test_3, (row, col))
+                        plot_tensor('tensor_test_4', tensor_test_4, (row, col))
+                        plot_tensor('tensor_test_5', tensor_test_5, (row, col))
+                        cv2.waitKey(-1)
+                    except IndexError as error:
+                        print(f'{workgroup_x=} {workgroup_y=} {row=} {col=}')
+                        raise error
+
+    plot_tensor('tensor_1', tensor_1)
+    plot_tensor('tensor_2', tensor_2)
+    plot_tensor('tensor_out', tensor_out)
+    plot_tensor('tensor_test_1', tensor_test_1)
+    plot_tensor('tensor_test_2', tensor_test_2)
+    plot_tensor('tensor_test_3', tensor_test_3)
+    plot_tensor('tensor_test_4', tensor_test_4)
+    plot_tensor('tensor_test_5', tensor_test_5)
+    cv2.waitKey(-1)
+
+
+if __name__ == '__main__':
+    main()

From a3f7793c17fb2fd8ba3608e809caf31e29195058 Mon Sep 17 00:00:00 2001
From: Corentin <corentin-pro@mail.com>
Date: Fri, 25 Jun 2021 03:27:52 +0900
Subject: [PATCH 6/8] Fix FLOPS calculation

---
 examples/python_naive_matmul/benchmark.py     |  4 +-
 examples/python_naive_matmul/imp1_naive.py    |  9 ++--
 examples/python_naive_matmul/imp2_tiled.py    | 15 +++---
 .../python_naive_matmul/imp3_better_tiling.py | 47 +++++++++++--------
 4 files changed, 41 insertions(+), 34 deletions(-)

diff --git a/examples/python_naive_matmul/benchmark.py b/examples/python_naive_matmul/benchmark.py
index b10369d7c..8b92dda2f 100644
--- a/examples/python_naive_matmul/benchmark.py
+++ b/examples/python_naive_matmul/benchmark.py
@@ -34,13 +34,13 @@ def main():
             matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
         end_time = time.time()
         experiment_time = end_time - start_time
-        op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
+        op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)
 
         if (tensor_out.data().reshape(tensor_shape) == mat_result).all():
             print(f'From {MatMulOp.__module__} : {experiment_count} matmul time : '
                   f'{experiment_time * 1000:0.2f}ms => '
                   f'{experiment_count / experiment_time:0.2f}op/s or '
-                  f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
+                  f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
         else:
             print(f'Test failed => output tensor is wrong :\n{tensor_out.data().reshape(tensor_shape)}')
 
diff --git a/examples/python_naive_matmul/imp1_naive.py b/examples/python_naive_matmul/imp1_naive.py
index faefec563..420336260 100644
--- a/examples/python_naive_matmul/imp1_naive.py
+++ b/examples/python_naive_matmul/imp1_naive.py
@@ -76,6 +76,7 @@ void main()
             self.tensor_shape = tensor_shape
             self.params = params
             workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1)
+            print(f'{workgroup=} {self.local_size_x=} {self.local_size_y=}')
             self.algo = self.mgr.algorithm(
                 params,  # params
                 self.compiled_shader,  # spirv
@@ -95,7 +96,7 @@ def main():
 
     matmul_op = MatMulOp(mgr)
 
-    tensor_size = 512
+    tensor_size = 4064
     tensor_shape = [tensor_size, tensor_size]
     tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
     tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
@@ -107,20 +108,20 @@ def main():
 
     matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
 
-    experiment_count = 1000
+    experiment_count = 8
     start_time = time.time()
     for _ in range(experiment_count):
         matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
     end_time = time.time()
     experiment_time = end_time - start_time
-    op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
+    op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)
 
     print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
 
     print(f'{experiment_count} matmul time : '
           f'{experiment_time * 1000:0.2f}ms => '
           f'{experiment_count / experiment_time:0.2f}op/s or '
-          f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
+          f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
 
 
 if __name__ == '__main__':
diff --git a/examples/python_naive_matmul/imp2_tiled.py b/examples/python_naive_matmul/imp2_tiled.py
index ed6a1ddf8..8ddf53745 100644
--- a/examples/python_naive_matmul/imp2_tiled.py
+++ b/examples/python_naive_matmul/imp2_tiled.py
@@ -43,8 +43,8 @@ void main()
     uint row = gl_LocalInvocationID.x; // 0 .. tile_size
     uint col = gl_LocalInvocationID.y; // 0 .. tile_size
     // gl_WorkGroupID : 0 .. tensor_size / tile_size
-    uint globalRow = ({tile_size} * gl_WorkGroupID.x) + row;
-    uint globalCol = ({tile_size} * gl_WorkGroupID.y) + col;
+    uint globalRow = {tile_size} * gl_WorkGroupID.x + row;
+    uint globalCol = {tile_size} * gl_WorkGroupID.y + col;
 
     uint tensor_size = uint(tensor_size_f);
     float acc = 0.0;
@@ -64,8 +64,7 @@ void main()
 
         barrier();
     }}
-    uint globalIndex = (tensor_size * globalCol) + globalRow;
-    out_tensor[globalIndex] = acc;
+    out_tensor[tensor_size * globalCol + globalRow] = acc;
 }}'''
         self.compiled_shader = kp.Shader.compile_source(self.shader)
         self.tensor_shape: tuple[int, int] = (0, 0)
@@ -99,7 +98,7 @@ def main():
 
     matmul_op = MatMulOp(mgr)
 
-    tensor_size = 512
+    tensor_size = 4096
     tensor_shape = [tensor_size, tensor_size]
     tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
     tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
@@ -111,20 +110,20 @@ def main():
 
     matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
 
-    experiment_count = 1000
+    experiment_count = 8
     start_time = time.time()
     for _ in range(experiment_count):
         matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
     end_time = time.time()
     experiment_time = end_time - start_time
-    op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
+    op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)
 
     print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
 
     print(f'{experiment_count} matmul time : '
           f'{experiment_time * 1000:0.2f}ms => '
           f'{experiment_count / experiment_time:0.2f}op/s or '
-          f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
+          f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
 
 
 if __name__ == '__main__':
diff --git a/examples/python_naive_matmul/imp3_better_tiling.py b/examples/python_naive_matmul/imp3_better_tiling.py
index 6b3ada314..e97eb88d4 100644
--- a/examples/python_naive_matmul/imp3_better_tiling.py
+++ b/examples/python_naive_matmul/imp3_better_tiling.py
@@ -55,30 +55,37 @@ void main()
 
     uint tensor_size = uint(tensor_size_f);
     float acc[{thread_work_ratio}];
-    for (uint l = 0u; l < {thread_work_ratio}; l++)
-        acc[l] = 0.0;
+    for(uint w = 0u; w < {thread_work_ratio}; w++)
+        acc[w] = 0.0;
 
+    /*
     uint numTiles = tensor_size / {tile_size};
     for(uint t = 0u; t < numTiles; t++)
     {{
-        uint tiledRow = {tile_size} * t + row;
-        uint tiledCol = {tile_size} * t + col;
-        sub_tensor_1[col + t * {self.local_size_y}][row] = in_tensor_1[
-            (tiledCol + t * {self.local_size_y}) * tensor_size + globalRow];
-        sub_tensor_2[col + t * {self.local_size_y}][row] = in_tensor_2[
-            (globalCol + t * {self.local_size_y})* tensor_size + tiledRow];
+        for(uint w = 0u; w < {thread_work_ratio}; w++)
+        {{
+            uint tiledRow = {tile_size} * t + row;
+            uint tiledCol = {tile_size} * t + col;
+            sub_tensor_1[col + t * {self.local_size_y}][row] = in_tensor_1[
+                (tiledCol + w * {self.local_size_y}) * tensor_size + globalRow];
+            sub_tensor_2[col + t * {self.local_size_y}][row] = in_tensor_2[
+                (globalCol + w * {self.local_size_y})* tensor_size + tiledRow];
+        }}
 
         memoryBarrierShared();
         barrier();
 
         for(uint k = 0u; k < {tile_size}; k++)
-            for(uint l = 0u; l < {thread_work_ratio}; l++)
-                acc[l] += sub_tensor_1[k][row] * sub_tensor_2[col + l * {self.local_size_y}][k];
+            for(uint w = 0u; w < {thread_work_ratio}; w++)
+                acc[w] += sub_tensor_1[k][row] * sub_tensor_2[col + w * {self.local_size_y}][k];
 
         barrier();
+    }}*/
+    for(uint w = 0u; w < {thread_work_ratio}; w++)
+    {{
+        //out_tensor[(globalCol + w * {self.local_size_y}) * tensor_size + globalRow] = acc[w];
+        out_tensor[(globalCol + w * {self.local_size_y}) * tensor_size + globalRow] = w;
     }}
-    for(uint l = 0u; l < {thread_work_ratio}; l++)
-        out_tensor[(globalCol + l * {self.local_size_y}) * tensor_size + globalRow] = acc[l];
 }}'''
         self.compiled_shader = kp.Shader.compile_source(self.shader)
         self.tensor_shape: tuple[int, int] = (0, 0)
@@ -92,13 +99,13 @@ void main()
         if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
             self.tensor_shape = tensor_shape
             self.params = params
-            print(
-                tensor_shape, self.local_size_x, self.local_size_y,
-                (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1))
+            # workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1)
+            workgroup = (2, 2, 1)
+            print(tensor_shape, self.local_size_x, self.local_size_y, workgroup)
             self.algo = self.mgr.algorithm(
                 params,  # params
                 self.compiled_shader,  # spirv
-                (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1),  # workgroup
+                workgroup,  # workgroup
                 [float(tensor_shape[0])],  # spec_consts
                 [])  # push_consts
 
@@ -114,7 +121,7 @@ def main():
 
     matmul_op = MatMulOp(mgr)
 
-    tensor_size = 512
+    tensor_size = 4096
     tensor_shape = [tensor_size, tensor_size]
     tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
     tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
@@ -126,20 +133,20 @@ def main():
 
     matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
 
-    experiment_count = 1000
+    experiment_count = 8
     start_time = time.time()
     for _ in range(experiment_count):
         matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
     end_time = time.time()
     experiment_time = end_time - start_time
-    op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
+    op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)
 
     print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
 
     print(f'{experiment_count} matmul time : '
           f'{experiment_time * 1000:0.2f}ms => '
           f'{experiment_count / experiment_time:0.2f}op/s or '
-          f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
+          f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
 
 
 if __name__ == '__main__':

From 3962ee70afc47bed63a90a675bbccd523275c402 Mon Sep 17 00:00:00 2001
From: Corentin <corentin-pro@mail.com>
Date: Mon, 28 Jun 2021 19:05:39 +0900
Subject: [PATCH 7/8] Fix small matrices matmuls, imp3 working but slow

---
 examples/python_naive_matmul/benchmark.py     |  71 ++++----
 examples/python_naive_matmul/imp1_naive.py    |  13 +-
 examples/python_naive_matmul/imp2_tiled.py    |  10 +-
 .../python_naive_matmul/imp2_tiled_debug.py   | 156 ------------------
 .../python_naive_matmul/imp3_better_tiling.py |  46 +++---
 5 files changed, 77 insertions(+), 219 deletions(-)
 delete mode 100644 examples/python_naive_matmul/imp2_tiled_debug.py

diff --git a/examples/python_naive_matmul/benchmark.py b/examples/python_naive_matmul/benchmark.py
index 8b92dda2f..768a854d4 100644
--- a/examples/python_naive_matmul/benchmark.py
+++ b/examples/python_naive_matmul/benchmark.py
@@ -8,41 +8,48 @@ from imp3_better_tiling import MatMulOp as MatMulOp3
 
 
 def main():
-    experiment_count = 1000
-    tensor_size = 512
-    tensor_shape = [tensor_size, tensor_size]
-    mat_1 = np.triu(np.ones(tensor_shape))
-    mat_2 = np.triu(np.ones(tensor_shape))
-    mat_result = mat_1 @ mat_2
-    tensor_shape = [tensor_size, tensor_size]
-
-    print(f'{tensor_shape} input tensors:\n'
-          f'{mat_1}\n'
-          f'{mat_2}\n')
-    print(f'Output :\n{mat_result}')
-
     mgr = kp.Manager()
-    tensor_in_1 = mgr.tensor(mat_1)
-    tensor_in_2 = mgr.tensor(mat_2)
-    tensor_out = mgr.tensor(np.zeros(tensor_shape))
-    for MatMulOp in [MatMulOp1, MatMulOp2, MatMulOp3]:
-        matmul_op = MatMulOp(mgr)
-        matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+    for tensor_size, experiment_count in [(512, 1000), (4096, 5)]:
+        tensor_shape = [tensor_size, tensor_size]
+        tensor_shape = [tensor_size, tensor_size]
+        mat_1 = np.triu(np.ones(tensor_shape))
+        mat_2 = np.triu(np.ones(tensor_shape))
 
-        start_time = time.time()
-        for _ in range(experiment_count):
-            matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
-        end_time = time.time()
-        experiment_time = end_time - start_time
-        op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)
-
-        if (tensor_out.data().reshape(tensor_shape) == mat_result).all():
-            print(f'From {MatMulOp.__module__} : {experiment_count} matmul time : '
-                  f'{experiment_time * 1000:0.2f}ms => '
-                  f'{experiment_count / experiment_time:0.2f}op/s or '
-                  f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
+        tensor_in_1 = mgr.tensor(mat_1)
+        tensor_in_2 = mgr.tensor(mat_2)
+        tensor_out = mgr.tensor(np.zeros(tensor_shape))
+        if tensor_size <= 512:
+            mat_result = mat_1 @ mat_2
         else:
-            print(f'Test failed => output tensor is wrong :\n{tensor_out.data().reshape(tensor_shape)}')
+            MatMulOp1(mgr)(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+            mat_result = tensor_out.data().reshape(tensor_shape)  # CPU is too slow for big sizes
+
+        print(f'{tensor_shape} input tensors:\n'
+              f'{mat_1}\n'
+              f'{mat_2}\n')
+        print(f'Output :\n{mat_result}')
+
+        for MatMulOp in [MatMulOp1, MatMulOp2, MatMulOp3]:
+            tensor_out.data()[:] = 0
+            mgr.sequence().record(kp.OpTensorSyncDevice([tensor_out]))
+            matmul_op = MatMulOp(mgr)
+            matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+
+            start_time = time.time()
+            for _ in range(experiment_count):
+                matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+            end_time = time.time()
+            experiment_time = end_time - start_time
+            op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)
+
+            # print(tensor_out.data().reshape(tensor_shape))
+            if (tensor_out.data().reshape(tensor_shape) == mat_result).all():
+                print(f'From {MatMulOp.__module__} : {experiment_count} matmul time : '
+                      f'{experiment_time * 1000:0.2f}ms => '
+                      f'{experiment_count / experiment_time:0.2f}op/s or '
+                      f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
+            else:
+                print(f'Test failed => output tensor is wrong :\n{tensor_out.data().reshape(tensor_shape)}')
 
 
 if __name__ == '__main__':
diff --git a/examples/python_naive_matmul/imp1_naive.py b/examples/python_naive_matmul/imp1_naive.py
index 420336260..a791662d2 100644
--- a/examples/python_naive_matmul/imp1_naive.py
+++ b/examples/python_naive_matmul/imp1_naive.py
@@ -41,7 +41,7 @@ class MatMulOp:
         self.local_size_x = local_size_x
         self.local_size_y = local_size_y
 
-        self.shader = f'''
+        self.shader = '''
 #version 450
 
 layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in;
@@ -63,7 +63,8 @@ void main()
         acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k];
     out_tensor[(globalCol * tensor_size) + globalRow] = acc;
 }}'''
-        self.compiled_shader = kp.Shader.compile_source(self.shader)
+        self.compiled_shader = kp.Shader.compile_source(self.shader.format(
+            local_size_x=self.local_size_x, local_size_y=self.local_size_y))
         self.tensor_shape: tuple[int, int] = (0, 0)
         self.params: list[kp.Tensor] = []
         self.algo = None
@@ -75,7 +76,11 @@ void main()
         if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
             self.tensor_shape = tensor_shape
             self.params = params
-            workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1)
+            local_size_x = min(self.local_size_x, tensor_shape[0])
+            local_size_y = min(self.local_size_y, tensor_shape[1])
+            self.compiled_shader = kp.Shader.compile_source(self.shader.format(
+                local_size_x=local_size_x, local_size_y=local_size_y))
+            workgroup = (tensor_shape[0] // local_size_x, tensor_shape[1] // local_size_y, 1)
             print(f'{workgroup=} {self.local_size_x=} {self.local_size_y=}')
             self.algo = self.mgr.algorithm(
                 params,  # params
@@ -85,7 +90,7 @@ void main()
                 [])  # push_consts
 
         (self.mgr.sequence()
-         .record(kp.OpTensorSyncDevice(self.params))
+         .record(kp.OpTensorSyncDevice([tensor_in_1, tensor_in_2]))
          .record(kp.OpAlgoDispatch(self.algo))
          .record(kp.OpTensorSyncLocal([tensor_out]))
          .eval())
diff --git a/examples/python_naive_matmul/imp2_tiled.py b/examples/python_naive_matmul/imp2_tiled.py
index 8ddf53745..1ac13e858 100644
--- a/examples/python_naive_matmul/imp2_tiled.py
+++ b/examples/python_naive_matmul/imp2_tiled.py
@@ -24,7 +24,7 @@ class MatMulOp:
         assert tile_size <= max_workgroup_size[1]
         self.tile_size = tile_size
 
-        self.shader = f'''
+        self.shader = '''
 #version 450
 
 layout (local_size_x = {tile_size}, local_size_y = {tile_size}) in;
@@ -66,7 +66,7 @@ void main()
     }}
     out_tensor[tensor_size * globalCol + globalRow] = acc;
 }}'''
-        self.compiled_shader = kp.Shader.compile_source(self.shader)
+        self.compiled_shader = kp.Shader.compile_source(self.shader.format(tile_size=tile_size))
         self.tensor_shape: tuple[int, int] = (0, 0)
         self.params: list[kp.Tensor] = []
         self.algo = None
@@ -78,7 +78,9 @@ void main()
         if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
             self.tensor_shape = tensor_shape
             self.params = params
-            workgroup = (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1)
+            tile_size = min(tensor_shape[0], tensor_shape[1], self.tile_size)
+            self.compiled_shader = kp.Shader.compile_source(self.shader.format(tile_size=tile_size))
+            workgroup = [tensor_shape[0] // tile_size, tensor_shape[1] // tile_size, 1]
             self.algo = self.mgr.algorithm(
                 params,  # params
                 self.compiled_shader,  # spirv
@@ -87,7 +89,7 @@ void main()
                 [])  # push_consts
 
         (self.mgr.sequence()
-         .record(kp.OpTensorSyncDevice(self.params))
+         .record(kp.OpTensorSyncDevice([tensor_in_1, tensor_in_2]))
          .record(kp.OpAlgoDispatch(self.algo))
          .record(kp.OpTensorSyncLocal([tensor_out]))
          .eval())
diff --git a/examples/python_naive_matmul/imp2_tiled_debug.py b/examples/python_naive_matmul/imp2_tiled_debug.py
deleted file mode 100644
index b10cbfd7d..000000000
--- a/examples/python_naive_matmul/imp2_tiled_debug.py
+++ /dev/null
@@ -1,156 +0,0 @@
-import time
-
-import kp
-import numpy as np
-
-
-class MatMulOp:
-    def __init__(self, manager: kp.Manager, tile_size: int = -1):
-        self.mgr = manager
-
-        props = self.mgr.get_device_properties()
-        max_workgroup_invocation = props['max_work_group_invocations']
-        max_workgroup_size = props['max_work_group_size']
-        if tile_size < 0:
-            tile_size = 1
-            while (4 * tile_size * tile_size <= max_workgroup_invocation
-                   and 2 * tile_size <= max_workgroup_size[0]
-                   and 2 * tile_size <= max_workgroup_size[1]):
-                tile_size *= 2
-
-        assert tile_size > 0
-        assert tile_size * tile_size <= max_workgroup_invocation
-        assert tile_size <= max_workgroup_size[0]
-        assert tile_size <= max_workgroup_size[1]
-        self.tile_size = tile_size
-
-        print(f'{tile_size=}')
-        self.shader = f'''
-#version 450
-
-layout (local_size_x = {tile_size}, local_size_y = {tile_size}) in;
-
-layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
-layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
-layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
-layout (set = 0, binding = 3) writeonly buffer buf_test1_tensor {{ float test1_tensor[]; }};
-layout (set = 0, binding = 4) writeonly buffer buf_test2_tensor {{ float test2_tensor[]; }};
-layout (set = 0, binding = 5) writeonly buffer buf_test3_tensor {{ float test3_tensor[]; }};
-layout (set = 0, binding = 6) writeonly buffer buf_test4_tensor {{ float test4_tensor[]; }};
-
-layout (constant_id = 0) const float tensor_size_f = 0;
-
-shared float sub_tensor_1[{tile_size}][{tile_size}];
-shared float sub_tensor_2[{tile_size}][{tile_size}];
-
-void main()
-{{
-    uint row = gl_LocalInvocationID.x; // 0 .. tile_size
-    uint col = gl_LocalInvocationID.y; // 0 .. tile_size
-    // gl_WorkGroupID : 0 .. tensor_size / tile_size
-    uint globalRow = ({tile_size} * gl_WorkGroupID.x) + row;
-    uint globalCol = ({tile_size} * gl_WorkGroupID.y) + col;
-
-    uint tensor_size = uint(tensor_size_f);
-    float acc = 0.0;
-    uint numTiles = tensor_size / {tile_size};
-    for(uint t = 0u; t < numTiles; t++)
-    {{
-        uint tiledRow = ({tile_size} * t) + row;
-        uint tiledCol = ({tile_size} * t) + col;
-        sub_tensor_1[col][row] = in_tensor_1[(tiledCol * tensor_size) + globalRow];
-        sub_tensor_2[col][row] = in_tensor_2[(globalCol * tensor_size) + tiledRow];
-
-        memoryBarrierShared();
-        barrier();
-
-        for(uint k = 0u; k < {tile_size}; k++)
-            acc += sub_tensor_1[k][row] * sub_tensor_2[col][k];
-
-        barrier();
-    }}
-    uint globalIndex = (tensor_size * globalCol) + globalRow;
-    out_tensor[globalIndex] = acc;
-    test1_tensor[globalIndex] = row;
-    test2_tensor[globalIndex] = col;
-    test3_tensor[globalIndex] = gl_WorkGroupID.x;
-    test4_tensor[globalIndex] = gl_WorkGroupID.y;
-}}'''
-        print(self.shader)
-        self.compiled_shader = kp.Shader.compile_source(self.shader)
-        self.tensor_shape: tuple[int, int] = (0, 0)
-        self.params: list[kp.Tensor] = []
-        self.algo = None
-
-    def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor,
-                 tensor_out: kp.Tensor, tensor_test_1: kp.Tensor, tensor_test_2: kp.Tensor,
-                 tensor_test_3: kp.Tensor, tensor_test_4: kp.Tensor):
-        # params = [tensor_in_1, tensor_in_2, tensor_out]
-        params = [tensor_in_1, tensor_in_2, tensor_out, tensor_test_1, tensor_test_2, tensor_test_3, tensor_test_4]
-
-        if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
-            self.tensor_shape = tensor_shape
-            self.params = params
-            workgroup = (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1)
-            # workgroup = (2, 2, 1)
-            print(f'{float(tensor_shape[0])=} {self.tile_size=} {workgroup=}')
-            self.algo = self.mgr.algorithm(
-                params,  # params
-                self.compiled_shader,  # spirv
-                workgroup,  # workgroup
-                [float(tensor_shape[0])],  # spec_consts
-                [])  # push_consts
-
-        (self.mgr.sequence()
-         .record(kp.OpTensorSyncDevice(self.params))
-         .record(kp.OpAlgoDispatch(self.algo))
-         .record(kp.OpTensorSyncLocal(self.params[2:]))
-         # .record(kp.OpTensorSyncLocal([tensor_out]))
-         .eval())
-
-
-def main():
-    mgr = kp.Manager()
-
-    matmul_op = MatMulOp(mgr, 4)
-
-    tensor_size = 8
-    tensor_shape = [tensor_size, tensor_size]
-    tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
-    tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
-    tensor_out = mgr.tensor(np.zeros(tensor_shape))
-    tensor_test_1 = mgr.tensor(np.zeros(tensor_shape))
-    tensor_test_2 = mgr.tensor(np.zeros(tensor_shape))
-    tensor_test_3 = mgr.tensor(np.zeros(tensor_shape))
-    tensor_test_4 = mgr.tensor(np.zeros(tensor_shape))
-
-    print(f'{tensor_shape} input tensors:\n'
-          f'{tensor_in_1.data().reshape(tensor_shape)}\n'
-          f'{tensor_in_2.data().reshape(tensor_shape)}\n')
-
-    # matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
-    matmul_op(tensor_shape, tensor_in_1, tensor_in_2,
-              tensor_out, tensor_test_1, tensor_test_2, tensor_test_3, tensor_test_4)
-
-    # experiment_count = 10
-    # start_time = time.time()
-    # for _ in range(experiment_count):
-    #     matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
-    # end_time = time.time()
-    # experiment_time = end_time - start_time
-    # op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
-
-    print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
-    print(f'test_1 :\n{tensor_test_1.data().reshape(tensor_shape)}')
-    print(f'test_2 :\n{tensor_test_2.data().reshape(tensor_shape)}')
-    print(f'test_3 :\n{tensor_test_3.data().reshape(tensor_shape)}')
-    print(f'test_4 :\n{tensor_test_4.data().reshape(tensor_shape)}')
-
-    # print(f'{experiment_count} matmul time : '
-    #       f'{experiment_time * 1000:0.2f}ms => '
-    #       f'{experiment_count / experiment_time:0.2f}op/s or '
-    #       f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/examples/python_naive_matmul/imp3_better_tiling.py b/examples/python_naive_matmul/imp3_better_tiling.py
index e97eb88d4..8cd44277b 100644
--- a/examples/python_naive_matmul/imp3_better_tiling.py
+++ b/examples/python_naive_matmul/imp3_better_tiling.py
@@ -5,7 +5,7 @@ import numpy as np
 
 
 class MatMulOp:
-    def __init__(self, manager: kp.Manager, tile_size: int = -1, thread_work_ratio: int = 8):
+    def __init__(self, manager: kp.Manager, tile_size: int = -1, thread_work_ratio: int = 16):
         self.mgr = manager
 
         props = self.mgr.get_device_properties()
@@ -14,9 +14,9 @@ class MatMulOp:
         if tile_size < 0:
             tile_size = 1
             local_size_y = tile_size // thread_work_ratio
-            while (4 * tile_size * local_size_y <= max_workgroup_invocation
+            while (4 * tile_size * tile_size <= max_workgroup_invocation
                    and 2 * tile_size <= max_workgroup_size[0]
-                   and 2 * local_size_y <= max_workgroup_size[1]):
+                   and 2 * tile_size <= max_workgroup_size[1]):
                 tile_size *= 2
                 local_size_y = tile_size // thread_work_ratio
         else:
@@ -32,10 +32,10 @@ class MatMulOp:
 
         self.local_size_x = tile_size
         self.local_size_y = tile_size // thread_work_ratio
-        self.shader = f'''
+        self.shader = '''
 #version 450
 
-layout (local_size_x = {tile_size}, local_size_y = {self.local_size_y}) in;
+layout (local_size_x = {tile_size}, local_size_y = {local_size_y}) in;
 
 layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
 layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
@@ -51,14 +51,13 @@ void main()
     uint row = gl_LocalInvocationID.x;
     uint col = gl_LocalInvocationID.y;
     uint globalRow = {tile_size} * gl_WorkGroupID.x + row;
-    uint globalCol = {tile_size} * gl_WorkGroupID.y + row;
+    uint globalCol = {tile_size} * gl_WorkGroupID.y + col;
 
     uint tensor_size = uint(tensor_size_f);
     float acc[{thread_work_ratio}];
     for(uint w = 0u; w < {thread_work_ratio}; w++)
         acc[w] = 0.0;
 
-    /*
     uint numTiles = tensor_size / {tile_size};
     for(uint t = 0u; t < numTiles; t++)
     {{
@@ -66,10 +65,10 @@ void main()
         {{
             uint tiledRow = {tile_size} * t + row;
             uint tiledCol = {tile_size} * t + col;
-            sub_tensor_1[col + t * {self.local_size_y}][row] = in_tensor_1[
-                (tiledCol + w * {self.local_size_y}) * tensor_size + globalRow];
-            sub_tensor_2[col + t * {self.local_size_y}][row] = in_tensor_2[
-                (globalCol + w * {self.local_size_y})* tensor_size + tiledRow];
+            sub_tensor_1[col + w * {local_size_y}][row] = in_tensor_1[
+                (tiledCol + w * {local_size_y}) * tensor_size + globalRow];
+            sub_tensor_2[col + w * {local_size_y}][row] = in_tensor_2[
+                (globalCol + w * {local_size_y})* tensor_size + tiledRow];
         }}
 
         memoryBarrierShared();
@@ -77,17 +76,15 @@ void main()
 
         for(uint k = 0u; k < {tile_size}; k++)
             for(uint w = 0u; w < {thread_work_ratio}; w++)
-                acc[w] += sub_tensor_1[k][row] * sub_tensor_2[col + w * {self.local_size_y}][k];
+                acc[w] += sub_tensor_1[k][row] * sub_tensor_2[col + w * {local_size_y}][k];
 
         barrier();
-    }}*/
-    for(uint w = 0u; w < {thread_work_ratio}; w++)
-    {{
-        //out_tensor[(globalCol + w * {self.local_size_y}) * tensor_size + globalRow] = acc[w];
-        out_tensor[(globalCol + w * {self.local_size_y}) * tensor_size + globalRow] = w;
     }}
+    for(uint w = 0u; w < {thread_work_ratio}; w++)
+        out_tensor[(globalCol + w * {local_size_y}) * tensor_size + globalRow] = acc[w];
 }}'''
-        self.compiled_shader = kp.Shader.compile_source(self.shader)
+        self.compiled_shader = kp.Shader.compile_source(self.shader.format(
+            tile_size=tile_size, thread_work_ratio=thread_work_ratio, local_size_y=local_size_y))
         self.tensor_shape: tuple[int, int] = (0, 0)
         self.params: list[kp.Tensor] = []
         self.algo = None
@@ -99,9 +96,12 @@ void main()
         if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
             self.tensor_shape = tensor_shape
             self.params = params
-            # workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1)
-            workgroup = (2, 2, 1)
-            print(tensor_shape, self.local_size_x, self.local_size_y, workgroup)
+            tile_size = min(self.tensor_shape[0], self.tile_size)
+            thread_work_ratio = min(self.tensor_shape[1] // self.tile_size, self.thread_work_ratio)
+            local_size_y = tile_size // thread_work_ratio
+            self.compiled_shader = kp.Shader.compile_source(self.shader.format(
+                tile_size=tile_size, thread_work_ratio=thread_work_ratio, local_size_y=local_size_y))
+            workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1)
             self.algo = self.mgr.algorithm(
                 params,  # params
                 self.compiled_shader,  # spirv
@@ -110,7 +110,7 @@ void main()
                 [])  # push_consts
 
         (self.mgr.sequence()
-         .record(kp.OpTensorSyncDevice(self.params))
+         .record(kp.OpTensorSyncDevice([tensor_in_1, tensor_in_2]))
          .record(kp.OpAlgoDispatch(self.algo))
          .record(kp.OpTensorSyncLocal([tensor_out]))
          .eval())
@@ -133,7 +133,7 @@ def main():
 
     matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
 
-    experiment_count = 8
+    experiment_count = 2
     start_time = time.time()
     for _ in range(experiment_count):
         matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)

From 137ef147db7c64a11f50687b7e3b7d053063735e Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 25 Jul 2021 18:45:28 +0100
Subject: [PATCH 8/8] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a3a363770..6d195c8f6 100644
--- a/README.md
+++ b/README.md
@@ -235,7 +235,7 @@ You are able to try out the interactive Colab Notebooks which allow you to use a
 <tr>
 
 <td width="50%">
-<h5>Try the interactive <a href="https://colab.research.google.com/drive/1l3hNSq2AcJ5j2E3YIw__jKy5n6M615GP?authuser=1#scrollTo=1BipBsO-fQRD">C++ Colab</a> from <a href="https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a">Blog Post</a></h5>
+<h5>Try the interactive <a href="https://colab.research.google.com/drive/1l3hNSq2AcJ5j2E3YIw__jKy5n6M615GP?usp=sharing">C++ Colab</a> from <a href="https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a">Blog Post</a></h5>
 </td>
 
 <td>