From a3f7793c17fb2fd8ba3608e809caf31e29195058 Mon Sep 17 00:00:00 2001
From: Corentin <corentin-pro@mail.com>
Date: Fri, 25 Jun 2021 03:27:52 +0900
Subject: [PATCH] Fix FLOPS calculation

---
 examples/python_naive_matmul/benchmark.py     |  4 +-
 examples/python_naive_matmul/imp1_naive.py    |  9 ++--
 examples/python_naive_matmul/imp2_tiled.py    | 15 +++---
 .../python_naive_matmul/imp3_better_tiling.py | 47 +++++++++++--------
 4 files changed, 41 insertions(+), 34 deletions(-)

diff --git a/examples/python_naive_matmul/benchmark.py b/examples/python_naive_matmul/benchmark.py
index b10369d7c..8b92dda2f 100644
--- a/examples/python_naive_matmul/benchmark.py
+++ b/examples/python_naive_matmul/benchmark.py
@@ -34,13 +34,13 @@ def main():
             matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
         end_time = time.time()
         experiment_time = end_time - start_time
-        op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
+        op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)
 
         if (tensor_out.data().reshape(tensor_shape) == mat_result).all():
             print(f'From {MatMulOp.__module__} : {experiment_count} matmul time : '
                   f'{experiment_time * 1000:0.2f}ms => '
                   f'{experiment_count / experiment_time:0.2f}op/s or '
-                  f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
+                  f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
         else:
             print(f'Test failed => output tensor is wrong :\n{tensor_out.data().reshape(tensor_shape)}')
 
diff --git a/examples/python_naive_matmul/imp1_naive.py b/examples/python_naive_matmul/imp1_naive.py
index faefec563..420336260 100644
--- a/examples/python_naive_matmul/imp1_naive.py
+++ b/examples/python_naive_matmul/imp1_naive.py
@@ -76,6 +76,7 @@ void main()
             self.tensor_shape = tensor_shape
             self.params = params
             workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1)
+            print(f'{workgroup=} {self.local_size_x=} {self.local_size_y=}')
             self.algo = self.mgr.algorithm(
                 params,  # params
                 self.compiled_shader,  # spirv
@@ -95,7 +96,7 @@ def main():
 
     matmul_op = MatMulOp(mgr)
 
-    tensor_size = 512
+    tensor_size = 4064
     tensor_shape = [tensor_size, tensor_size]
     tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
     tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
@@ -107,20 +108,20 @@ def main():
 
     matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
 
-    experiment_count = 1000
+    experiment_count = 8
     start_time = time.time()
     for _ in range(experiment_count):
         matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
     end_time = time.time()
     experiment_time = end_time - start_time
-    op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
+    op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)
 
     print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
 
     print(f'{experiment_count} matmul time : '
           f'{experiment_time * 1000:0.2f}ms => '
           f'{experiment_count / experiment_time:0.2f}op/s or '
-          f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
+          f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
 
 
 if __name__ == '__main__':
diff --git a/examples/python_naive_matmul/imp2_tiled.py b/examples/python_naive_matmul/imp2_tiled.py
index ed6a1ddf8..8ddf53745 100644
--- a/examples/python_naive_matmul/imp2_tiled.py
+++ b/examples/python_naive_matmul/imp2_tiled.py
@@ -43,8 +43,8 @@ void main()
     uint row = gl_LocalInvocationID.x; // 0 .. tile_size
     uint col = gl_LocalInvocationID.y; // 0 .. tile_size
     // gl_WorkGroupID : 0 .. tensor_size / tile_size
-    uint globalRow = ({tile_size} * gl_WorkGroupID.x) + row;
-    uint globalCol = ({tile_size} * gl_WorkGroupID.y) + col;
+    uint globalRow = {tile_size} * gl_WorkGroupID.x + row;
+    uint globalCol = {tile_size} * gl_WorkGroupID.y + col;
 
     uint tensor_size = uint(tensor_size_f);
     float acc = 0.0;
@@ -64,8 +64,7 @@ void main()
 
         barrier();
     }}
-    uint globalIndex = (tensor_size * globalCol) + globalRow;
-    out_tensor[globalIndex] = acc;
+    out_tensor[tensor_size * globalCol + globalRow] = acc;
 }}'''
         self.compiled_shader = kp.Shader.compile_source(self.shader)
         self.tensor_shape: tuple[int, int] = (0, 0)
@@ -99,7 +98,7 @@ def main():
 
     matmul_op = MatMulOp(mgr)
 
-    tensor_size = 512
+    tensor_size = 4096
     tensor_shape = [tensor_size, tensor_size]
     tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
     tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
@@ -111,20 +110,20 @@ def main():
 
     matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
 
-    experiment_count = 1000
+    experiment_count = 8
     start_time = time.time()
     for _ in range(experiment_count):
         matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
     end_time = time.time()
     experiment_time = end_time - start_time
-    op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
+    op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)
 
     print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
 
     print(f'{experiment_count} matmul time : '
           f'{experiment_time * 1000:0.2f}ms => '
           f'{experiment_count / experiment_time:0.2f}op/s or '
-          f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
+          f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
 
 
 if __name__ == '__main__':
diff --git a/examples/python_naive_matmul/imp3_better_tiling.py b/examples/python_naive_matmul/imp3_better_tiling.py
index 6b3ada314..e97eb88d4 100644
--- a/examples/python_naive_matmul/imp3_better_tiling.py
+++ b/examples/python_naive_matmul/imp3_better_tiling.py
@@ -55,30 +55,37 @@ void main()
 
     uint tensor_size = uint(tensor_size_f);
     float acc[{thread_work_ratio}];
-    for (uint l = 0u; l < {thread_work_ratio}; l++)
-        acc[l] = 0.0;
+    for(uint w = 0u; w < {thread_work_ratio}; w++)
+        acc[w] = 0.0;
 
+    /*
     uint numTiles = tensor_size / {tile_size};
     for(uint t = 0u; t < numTiles; t++)
     {{
-        uint tiledRow = {tile_size} * t + row;
-        uint tiledCol = {tile_size} * t + col;
-        sub_tensor_1[col + t * {self.local_size_y}][row] = in_tensor_1[
-            (tiledCol + t * {self.local_size_y}) * tensor_size + globalRow];
-        sub_tensor_2[col + t * {self.local_size_y}][row] = in_tensor_2[
-            (globalCol + t * {self.local_size_y})* tensor_size + tiledRow];
+        for(uint w = 0u; w < {thread_work_ratio}; w++)
+        {{
+            uint tiledRow = {tile_size} * t + row;
+            uint tiledCol = {tile_size} * t + col;
+            sub_tensor_1[col + t * {self.local_size_y}][row] = in_tensor_1[
+                (tiledCol + w * {self.local_size_y}) * tensor_size + globalRow];
+            sub_tensor_2[col + t * {self.local_size_y}][row] = in_tensor_2[
+                (globalCol + w * {self.local_size_y})* tensor_size + tiledRow];
+        }}
 
         memoryBarrierShared();
         barrier();
 
         for(uint k = 0u; k < {tile_size}; k++)
-            for(uint l = 0u; l < {thread_work_ratio}; l++)
-                acc[l] += sub_tensor_1[k][row] * sub_tensor_2[col + l * {self.local_size_y}][k];
+            for(uint w = 0u; w < {thread_work_ratio}; w++)
+                acc[w] += sub_tensor_1[k][row] * sub_tensor_2[col + w * {self.local_size_y}][k];
 
         barrier();
+    }}*/
+    for(uint w = 0u; w < {thread_work_ratio}; w++)
+    {{
+        //out_tensor[(globalCol + w * {self.local_size_y}) * tensor_size + globalRow] = acc[w];
+        out_tensor[(globalCol + w * {self.local_size_y}) * tensor_size + globalRow] = w;
     }}
-    for(uint l = 0u; l < {thread_work_ratio}; l++)
-        out_tensor[(globalCol + l * {self.local_size_y}) * tensor_size + globalRow] = acc[l];
 }}'''
         self.compiled_shader = kp.Shader.compile_source(self.shader)
         self.tensor_shape: tuple[int, int] = (0, 0)
@@ -92,13 +99,13 @@ void main()
         if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
             self.tensor_shape = tensor_shape
             self.params = params
-            print(
-                tensor_shape, self.local_size_x, self.local_size_y,
-                (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1))
+            # workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1)
+            workgroup = (2, 2, 1)
+            print(tensor_shape, self.local_size_x, self.local_size_y, workgroup)
             self.algo = self.mgr.algorithm(
                 params,  # params
                 self.compiled_shader,  # spirv
-                (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1),  # workgroup
+                workgroup,  # workgroup
                 [float(tensor_shape[0])],  # spec_consts
                 [])  # push_consts
 
@@ -114,7 +121,7 @@ def main():
 
     matmul_op = MatMulOp(mgr)
 
-    tensor_size = 512
+    tensor_size = 4096
     tensor_shape = [tensor_size, tensor_size]
     tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
     tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
@@ -126,20 +133,20 @@ def main():
 
     matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
 
-    experiment_count = 1000
+    experiment_count = 8
     start_time = time.time()
     for _ in range(experiment_count):
         matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
     end_time = time.time()
     experiment_time = end_time - start_time
-    op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
+    op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)
 
     print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
 
     print(f'{experiment_count} matmul time : '
           f'{experiment_time * 1000:0.2f}ms => '
           f'{experiment_count / experiment_time:0.2f}op/s or '
-          f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
+          f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
 
 
 if __name__ == '__main__':