diff --git a/README.md b/README.md
index ad69806f2..2eea6cb10 100644
--- a/README.md
+++ b/README.md
@@ -250,7 +250,7 @@ You are able to try out the interactive Colab Notebooks which allow you to use a
|
-
+
|
diff --git a/examples/python_naive_matmul/README.md b/examples/python_naive_matmul/README.md
new file mode 100644
index 000000000..0688bb079
--- /dev/null
+++ b/examples/python_naive_matmul/README.md
@@ -0,0 +1,9 @@
+# Naive Matmul Implementation
+
+This demonstrate a basic matmul implementation using Python and vulkan-kompute. Many thanks for the very helpful [SGEMM in WebGL2-compute](https://www.ibiblio.org/e-notes/webgl/gpu/mul/sgemm.htm) article on the public library [ibiblio.org](https://www.ibiblio.org/).
+
+To test the implementation simply run the `matmul.py` script :
+
+```
+python matmul.py
+```
diff --git a/examples/python_naive_matmul/benchmark.py b/examples/python_naive_matmul/benchmark.py
new file mode 100644
index 000000000..768a854d4
--- /dev/null
+++ b/examples/python_naive_matmul/benchmark.py
@@ -0,0 +1,56 @@
+import time
+
+import kp
+import numpy as np
+from imp1_naive import MatMulOp as MatMulOp1
+from imp2_tiled import MatMulOp as MatMulOp2
+from imp3_better_tiling import MatMulOp as MatMulOp3
+
+
+def main():
+ mgr = kp.Manager()
+ for tensor_size, experiment_count in [(512, 1000), (4096, 5)]:
+ tensor_shape = [tensor_size, tensor_size]
+ tensor_shape = [tensor_size, tensor_size]
+ mat_1 = np.triu(np.ones(tensor_shape))
+ mat_2 = np.triu(np.ones(tensor_shape))
+
+ tensor_in_1 = mgr.tensor(mat_1)
+ tensor_in_2 = mgr.tensor(mat_2)
+ tensor_out = mgr.tensor(np.zeros(tensor_shape))
+ if tensor_size <= 512:
+ mat_result = mat_1 @ mat_2
+ else:
+ MatMulOp1(mgr)(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+ mat_result = tensor_out.data().reshape(tensor_shape) # CPU is too slow for big sizes
+
+ print(f'{tensor_shape} input tensors:\n'
+ f'{mat_1}\n'
+ f'{mat_2}\n')
+ print(f'Output :\n{mat_result}')
+
+ for MatMulOp in [MatMulOp1, MatMulOp2, MatMulOp3]:
+ tensor_out.data()[:] = 0
+ mgr.sequence().record(kp.OpTensorSyncDevice([tensor_out]))
+ matmul_op = MatMulOp(mgr)
+ matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+
+ start_time = time.time()
+ for _ in range(experiment_count):
+ matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+ end_time = time.time()
+ experiment_time = end_time - start_time
+ op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)
+
+ # print(tensor_out.data().reshape(tensor_shape))
+ if (tensor_out.data().reshape(tensor_shape) == mat_result).all():
+ print(f'From {MatMulOp.__module__} : {experiment_count} matmul time : '
+ f'{experiment_time * 1000:0.2f}ms => '
+ f'{experiment_count / experiment_time:0.2f}op/s or '
+ f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
+ else:
+ print(f'Test failed => output tensor is wrong :\n{tensor_out.data().reshape(tensor_shape)}')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/python_naive_matmul/first_example.py b/examples/python_naive_matmul/first_example.py
new file mode 100644
index 000000000..1e7caa871
--- /dev/null
+++ b/examples/python_naive_matmul/first_example.py
@@ -0,0 +1,60 @@
+import kp
+import numpy as np
+
+
+def main():
+ mgr = kp.Manager()
+
+ tensor_size = 4
+ tensor_shape = [tensor_size, tensor_size]
+ tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+ tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+ tensor_out = mgr.tensor(np.zeros(tensor_shape))
+
+ print(f'Input tensors:\n'
+ f'{tensor_in_1.data().reshape(tensor_shape)}\n'
+ f'{tensor_in_2.data().reshape(tensor_shape)}\n')
+
+ params = [tensor_in_1, tensor_in_2, tensor_out]
+
+ matmul_shader = kp.Shader.compile_source('''
+#version 450
+
+layout (local_size_x = 1, local_size_y = 1) in;
+
+layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 { float in_tensor_1[]; };
+layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 { float in_tensor_2[]; };
+layout (set = 0, binding = 2) writeonly buffer buf_out_tensor { float out_tensor[]; };
+
+layout (constant_id = 0) const float tensor_size_f = 0;
+
+
+void main()
+{
+ uint globalRow = gl_GlobalInvocationID.x;
+ uint globalCol = gl_GlobalInvocationID.y;
+ uint tensor_size = uint(tensor_size_f);
+ float acc = 0.0;
+ for(uint k = 0u; k < tensor_size; k++)
+ acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k];
+ out_tensor[(globalCol * tensor_size) + globalRow] = acc;
+}''')
+
+ algo = mgr.algorithm(
+ params, # params
+ matmul_shader, # spirv
+ (*tensor_shape, 1), # workgroup
+ [float(tensor_size)], # spec_consts
+ []) # push_consts
+
+ (mgr.sequence()
+ .record(kp.OpTensorSyncDevice(params))
+ .record(kp.OpAlgoDispatch(algo))
+ .record(kp.OpTensorSyncLocal(params))
+ .eval())
+
+ print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/python_naive_matmul/imp1_naive.py b/examples/python_naive_matmul/imp1_naive.py
new file mode 100644
index 000000000..a791662d2
--- /dev/null
+++ b/examples/python_naive_matmul/imp1_naive.py
@@ -0,0 +1,133 @@
+import time
+
+import kp
+import numpy as np
+
+
+class MatMulOp:
+ def __init__(self, manager: kp.Manager, local_size_x: int = -1, local_size_y: int = -1):
+ self.mgr = manager
+
+ props = self.mgr.get_device_properties()
+ max_workgroup_invocation = props['max_work_group_invocations']
+ max_workgroup_size = props['max_work_group_size']
+ if local_size_x < 1:
+ if local_size_y > 0:
+ local_size_x = 1
+ while (2 * local_size_x * local_size_y <= max_workgroup_invocation
+ and 2 * local_size_x <= max_workgroup_size[0]):
+ local_size_x *= 2
+ else:
+ local_size_x = 1
+ local_size_y = 1
+ while 2 * local_size_x * local_size_y <= max_workgroup_invocation:
+ if 2 * local_size_x <= max_workgroup_size[0]:
+ local_size_x *= 2
+ if 2 * local_size_y <= max_workgroup_size[1]:
+ local_size_y *= 2
+ elif 2 * local_size_x > max_workgroup_size[0]: # stop if neither x nor y can be double
+ break
+ elif local_size_y < 0:
+ local_size_y = 1
+ while (2 * local_size_x * local_size_y <= max_workgroup_invocation
+ and 2 * local_size_x <= max_workgroup_size[0]):
+ local_size_y *= 2
+
+ assert local_size_x > 0
+ assert local_size_y > 0
+ assert local_size_x * local_size_y <= max_workgroup_invocation
+ assert local_size_x <= max_workgroup_size[0]
+ assert local_size_y <= max_workgroup_size[1]
+ self.local_size_x = local_size_x
+ self.local_size_y = local_size_y
+
+ self.shader = '''
+#version 450
+
+layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in;
+
+layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
+layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
+layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
+
+layout (constant_id = 0) const float tensor_size_f = 0;
+
+
+void main()
+{{
+ uint globalRow = gl_GlobalInvocationID.x;
+ uint globalCol = gl_GlobalInvocationID.y;
+ uint tensor_size = uint(tensor_size_f);
+ float acc = 0.0;
+ for(uint k = 0u; k < tensor_size; k++)
+ acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k];
+ out_tensor[(globalCol * tensor_size) + globalRow] = acc;
+}}'''
+ self.compiled_shader = kp.Shader.compile_source(self.shader.format(
+ local_size_x=self.local_size_x, local_size_y=self.local_size_y))
+ self.tensor_shape: tuple[int, int] = (0, 0)
+ self.params: list[kp.Tensor] = []
+ self.algo = None
+
+ def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor,
+ tensor_out: kp.Tensor):
+ params = [tensor_in_1, tensor_in_2, tensor_out]
+
+ if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
+ self.tensor_shape = tensor_shape
+ self.params = params
+ local_size_x = min(self.local_size_x, tensor_shape[0])
+ local_size_y = min(self.local_size_y, tensor_shape[1])
+ self.compiled_shader = kp.Shader.compile_source(self.shader.format(
+ local_size_x=local_size_x, local_size_y=local_size_y))
+ workgroup = (tensor_shape[0] // local_size_x, tensor_shape[1] // local_size_y, 1)
+ print(f'{workgroup=} {self.local_size_x=} {self.local_size_y=}')
+ self.algo = self.mgr.algorithm(
+ params, # params
+ self.compiled_shader, # spirv
+ workgroup, # workgroup
+ [float(tensor_shape[0])], # spec_consts
+ []) # push_consts
+
+ (self.mgr.sequence()
+ .record(kp.OpTensorSyncDevice([tensor_in_1, tensor_in_2]))
+ .record(kp.OpAlgoDispatch(self.algo))
+ .record(kp.OpTensorSyncLocal([tensor_out]))
+ .eval())
+
+
+def main():
+ mgr = kp.Manager()
+
+ matmul_op = MatMulOp(mgr)
+
+ tensor_size = 4064
+ tensor_shape = [tensor_size, tensor_size]
+ tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+ tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+ tensor_out = mgr.tensor(np.zeros(tensor_shape))
+
+ print(f'{tensor_shape} input tensors:\n'
+ f'{tensor_in_1.data().reshape(tensor_shape)}\n'
+ f'{tensor_in_2.data().reshape(tensor_shape)}\n')
+
+ matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+
+ experiment_count = 8
+ start_time = time.time()
+ for _ in range(experiment_count):
+ matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+ end_time = time.time()
+ experiment_time = end_time - start_time
+ op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)
+
+ print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
+
+ print(f'{experiment_count} matmul time : '
+ f'{experiment_time * 1000:0.2f}ms => '
+ f'{experiment_count / experiment_time:0.2f}op/s or '
+ f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/python_naive_matmul/imp2_tiled.py b/examples/python_naive_matmul/imp2_tiled.py
new file mode 100644
index 000000000..1ac13e858
--- /dev/null
+++ b/examples/python_naive_matmul/imp2_tiled.py
@@ -0,0 +1,132 @@
+import time
+
+import kp
+import numpy as np
+
+
+class MatMulOp:
+ def __init__(self, manager: kp.Manager, tile_size: int = -1):
+ self.mgr = manager
+
+ props = self.mgr.get_device_properties()
+ max_workgroup_invocation = props['max_work_group_invocations']
+ max_workgroup_size = props['max_work_group_size']
+ if tile_size < 0:
+ tile_size = 1
+ while (4 * tile_size * tile_size <= max_workgroup_invocation
+ and 2 * tile_size <= max_workgroup_size[0]
+ and 2 * tile_size <= max_workgroup_size[1]):
+ tile_size *= 2
+
+ assert tile_size > 0
+ assert tile_size * tile_size <= max_workgroup_invocation
+ assert tile_size <= max_workgroup_size[0]
+ assert tile_size <= max_workgroup_size[1]
+ self.tile_size = tile_size
+
+ self.shader = '''
+#version 450
+
+layout (local_size_x = {tile_size}, local_size_y = {tile_size}) in;
+
+layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
+layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
+layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
+
+layout (constant_id = 0) const float tensor_size_f = 0;
+
+shared float sub_tensor_1[{tile_size}][{tile_size}];
+shared float sub_tensor_2[{tile_size}][{tile_size}];
+
+void main()
+{{
+ uint row = gl_LocalInvocationID.x; // 0 .. tile_size
+ uint col = gl_LocalInvocationID.y; // 0 .. tile_size
+ // gl_WorkGroupID : 0 .. tensor_size / tile_size
+ uint globalRow = {tile_size} * gl_WorkGroupID.x + row;
+ uint globalCol = {tile_size} * gl_WorkGroupID.y + col;
+
+ uint tensor_size = uint(tensor_size_f);
+ float acc = 0.0;
+ uint numTiles = tensor_size / {tile_size};
+ for(uint t = 0u; t < numTiles; t++)
+ {{
+ uint tiledRow = ({tile_size} * t) + row;
+ uint tiledCol = ({tile_size} * t) + col;
+ sub_tensor_1[col][row] = in_tensor_1[(tiledCol * tensor_size) + globalRow];
+ sub_tensor_2[col][row] = in_tensor_2[(globalCol * tensor_size) + tiledRow];
+
+ memoryBarrierShared();
+ barrier();
+
+ for(uint k = 0u; k < {tile_size}; k++)
+ acc += sub_tensor_1[k][row] * sub_tensor_2[col][k];
+
+ barrier();
+ }}
+ out_tensor[tensor_size * globalCol + globalRow] = acc;
+}}'''
+ self.compiled_shader = kp.Shader.compile_source(self.shader.format(tile_size=tile_size))
+ self.tensor_shape: tuple[int, int] = (0, 0)
+ self.params: list[kp.Tensor] = []
+ self.algo = None
+
+ def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor,
+ tensor_out: kp.Tensor):
+ params = [tensor_in_1, tensor_in_2, tensor_out]
+
+ if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
+ self.tensor_shape = tensor_shape
+ self.params = params
+ tile_size = min(tensor_shape[0], tensor_shape[1], self.tile_size)
+ self.compiled_shader = kp.Shader.compile_source(self.shader.format(tile_size=tile_size))
+ workgroup = [tensor_shape[0] // tile_size, tensor_shape[1] // tile_size, 1]
+ self.algo = self.mgr.algorithm(
+ params, # params
+ self.compiled_shader, # spirv
+ workgroup, # workgroup
+ [float(tensor_shape[0])], # spec_consts
+ []) # push_consts
+
+ (self.mgr.sequence()
+ .record(kp.OpTensorSyncDevice([tensor_in_1, tensor_in_2]))
+ .record(kp.OpAlgoDispatch(self.algo))
+ .record(kp.OpTensorSyncLocal([tensor_out]))
+ .eval())
+
+
+def main():
+ mgr = kp.Manager()
+
+ matmul_op = MatMulOp(mgr)
+
+ tensor_size = 4096
+ tensor_shape = [tensor_size, tensor_size]
+ tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+ tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+ tensor_out = mgr.tensor(np.zeros(tensor_shape))
+
+ print(f'{tensor_shape} input tensors:\n'
+ f'{tensor_in_1.data().reshape(tensor_shape)}\n'
+ f'{tensor_in_2.data().reshape(tensor_shape)}\n')
+
+ matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+
+ experiment_count = 8
+ start_time = time.time()
+ for _ in range(experiment_count):
+ matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+ end_time = time.time()
+ experiment_time = end_time - start_time
+ op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)
+
+ print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
+
+ print(f'{experiment_count} matmul time : '
+ f'{experiment_time * 1000:0.2f}ms => '
+ f'{experiment_count / experiment_time:0.2f}op/s or '
+ f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/python_naive_matmul/imp3_better_tiling.py b/examples/python_naive_matmul/imp3_better_tiling.py
new file mode 100644
index 000000000..8cd44277b
--- /dev/null
+++ b/examples/python_naive_matmul/imp3_better_tiling.py
@@ -0,0 +1,153 @@
+import time
+
+import kp
+import numpy as np
+
+
+class MatMulOp:
+ def __init__(self, manager: kp.Manager, tile_size: int = -1, thread_work_ratio: int = 16):
+ self.mgr = manager
+
+ props = self.mgr.get_device_properties()
+ max_workgroup_invocation = props['max_work_group_invocations']
+ max_workgroup_size = props['max_work_group_size']
+ if tile_size < 0:
+ tile_size = 1
+ local_size_y = tile_size // thread_work_ratio
+ while (4 * tile_size * tile_size <= max_workgroup_invocation
+ and 2 * tile_size <= max_workgroup_size[0]
+ and 2 * tile_size <= max_workgroup_size[1]):
+ tile_size *= 2
+ local_size_y = tile_size // thread_work_ratio
+ else:
+ local_size_y = tile_size // thread_work_ratio
+
+ assert tile_size > 0
+ assert thread_work_ratio > 0
+ assert tile_size * local_size_y <= max_workgroup_invocation
+ assert tile_size <= max_workgroup_size[0]
+ assert local_size_y <= max_workgroup_size[1]
+ self.tile_size = tile_size
+ self.thread_work_ratio = thread_work_ratio
+
+ self.local_size_x = tile_size
+ self.local_size_y = tile_size // thread_work_ratio
+ self.shader = '''
+#version 450
+
+layout (local_size_x = {tile_size}, local_size_y = {local_size_y}) in;
+
+layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
+layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
+layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
+
+layout (constant_id = 0) const float tensor_size_f = 0;
+
+shared float sub_tensor_1[{tile_size}][{tile_size}];
+shared float sub_tensor_2[{tile_size}][{tile_size}];
+
+void main()
+{{
+ uint row = gl_LocalInvocationID.x;
+ uint col = gl_LocalInvocationID.y;
+ uint globalRow = {tile_size} * gl_WorkGroupID.x + row;
+ uint globalCol = {tile_size} * gl_WorkGroupID.y + col;
+
+ uint tensor_size = uint(tensor_size_f);
+ float acc[{thread_work_ratio}];
+ for(uint w = 0u; w < {thread_work_ratio}; w++)
+ acc[w] = 0.0;
+
+ uint numTiles = tensor_size / {tile_size};
+ for(uint t = 0u; t < numTiles; t++)
+ {{
+ for(uint w = 0u; w < {thread_work_ratio}; w++)
+ {{
+ uint tiledRow = {tile_size} * t + row;
+ uint tiledCol = {tile_size} * t + col;
+ sub_tensor_1[col + w * {local_size_y}][row] = in_tensor_1[
+ (tiledCol + w * {local_size_y}) * tensor_size + globalRow];
+ sub_tensor_2[col + w * {local_size_y}][row] = in_tensor_2[
+ (globalCol + w * {local_size_y})* tensor_size + tiledRow];
+ }}
+
+ memoryBarrierShared();
+ barrier();
+
+ for(uint k = 0u; k < {tile_size}; k++)
+ for(uint w = 0u; w < {thread_work_ratio}; w++)
+ acc[w] += sub_tensor_1[k][row] * sub_tensor_2[col + w * {local_size_y}][k];
+
+ barrier();
+ }}
+ for(uint w = 0u; w < {thread_work_ratio}; w++)
+ out_tensor[(globalCol + w * {local_size_y}) * tensor_size + globalRow] = acc[w];
+}}'''
+ self.compiled_shader = kp.Shader.compile_source(self.shader.format(
+ tile_size=tile_size, thread_work_ratio=thread_work_ratio, local_size_y=local_size_y))
+ self.tensor_shape: tuple[int, int] = (0, 0)
+ self.params: list[kp.Tensor] = []
+ self.algo = None
+
+ def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor,
+ tensor_out: kp.Tensor):
+ params = [tensor_in_1, tensor_in_2, tensor_out]
+
+ if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
+ self.tensor_shape = tensor_shape
+ self.params = params
+ tile_size = min(self.tensor_shape[0], self.tile_size)
+ thread_work_ratio = min(self.tensor_shape[1] // self.tile_size, self.thread_work_ratio)
+ local_size_y = tile_size // thread_work_ratio
+ self.compiled_shader = kp.Shader.compile_source(self.shader.format(
+ tile_size=tile_size, thread_work_ratio=thread_work_ratio, local_size_y=local_size_y))
+ workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1)
+ self.algo = self.mgr.algorithm(
+ params, # params
+ self.compiled_shader, # spirv
+ workgroup, # workgroup
+ [float(tensor_shape[0])], # spec_consts
+ []) # push_consts
+
+ (self.mgr.sequence()
+ .record(kp.OpTensorSyncDevice([tensor_in_1, tensor_in_2]))
+ .record(kp.OpAlgoDispatch(self.algo))
+ .record(kp.OpTensorSyncLocal([tensor_out]))
+ .eval())
+
+
+def main():
+ mgr = kp.Manager()
+
+ matmul_op = MatMulOp(mgr)
+
+ tensor_size = 4096
+ tensor_shape = [tensor_size, tensor_size]
+ tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+ tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
+ tensor_out = mgr.tensor(np.zeros(tensor_shape))
+
+ print(f'{tensor_shape} input tensors:\n'
+ f'{tensor_in_1.data().reshape(tensor_shape)}\n'
+ f'{tensor_in_2.data().reshape(tensor_shape)}\n')
+
+ matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+
+ experiment_count = 2
+ start_time = time.time()
+ for _ in range(experiment_count):
+ matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+ end_time = time.time()
+ experiment_time = end_time - start_time
+ op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)
+
+ print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
+
+ print(f'{experiment_count} matmul time : '
+ f'{experiment_time * 1000:0.2f}ms => '
+ f'{experiment_count / experiment_time:0.2f}op/s or '
+ f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/python_naive_matmul/matmul_plot.py b/examples/python_naive_matmul/matmul_plot.py
new file mode 100644
index 000000000..81763a439
--- /dev/null
+++ b/examples/python_naive_matmul/matmul_plot.py
@@ -0,0 +1,97 @@
+from argparse import ArgumentParser
+
+import cv2
+import numpy as np
+
+
+def plot_tensor(window_name: str, tensor: np.ndarray, coord_highlight: tuple[int, int] = None):
+ font_size = 48
+ image = np.zeros((tensor.shape[1] * font_size, tensor.shape[0] * font_size, 3), dtype=np.uint8)
+
+ for y in range(tensor.shape[1]):
+ for x in range(tensor.shape[0]):
+ if coord_highlight and x == coord_highlight[1] and y == coord_highlight[0]:
+ cv2.putText(
+ image, str(int(tensor[y, x])), (x * font_size, int((y + 0.8) * font_size)),
+ cv2.FONT_HERSHEY_TRIPLEX, 1., (127, 127, 255))
+ else:
+ cv2.putText(
+ image, str(int(tensor[y, x])), (x * font_size, int((y + 0.8) * font_size)),
+ cv2.FONT_HERSHEY_TRIPLEX, 1., (255, 255, 255))
+
+ cv2.imshow(window_name, image)
+
+
+def main():
+ parser = ArgumentParser()
+ parser.add_argument('tensor_size', type=int, help='Size of the square tensors')
+ parser.add_argument('tile_size', type=int)
+ parser.add_argument('local_size', type=int, nargs=2)
+ parser.add_argument('workgroup', type=int, nargs=2)
+ arguments = parser.parse_args()
+
+ tensor_size: int = arguments.tensor_size
+ tile_size: int = arguments.tile_size
+ local_size: tuple[int, int, int] = tuple(arguments.local_size)
+ workgroup: tuple[int, int, int] = tuple(arguments.workgroup)
+
+ tensor_shape = (tensor_size, tensor_size)
+ tensor_1 = np.triu(np.ones(tensor_shape))
+ tensor_2 = np.triu(np.ones(tensor_shape))
+ tensor_out = np.zeros(tensor_shape)
+ tensor_test_1 = np.zeros(tensor_shape)
+ tensor_test_2 = np.zeros(tensor_shape)
+ tensor_test_3 = np.zeros(tensor_shape)
+ tensor_test_4 = np.zeros(tensor_shape)
+ tensor_test_5 = np.zeros(tensor_shape)
+
+ plot_tensor('tensor_1', tensor_1)
+ plot_tensor('tensor_2', tensor_2)
+ plot_tensor('tensor_out', tensor_out)
+ plot_tensor('tensor_test_1', tensor_test_1)
+ plot_tensor('tensor_test_2', tensor_test_2)
+ plot_tensor('tensor_test_3', tensor_test_3)
+ plot_tensor('tensor_test_4', tensor_test_4)
+ plot_tensor('tensor_test_5', tensor_test_5)
+ cv2.waitKey(-1)
+
+ print(f'{workgroup=} {local_size=}')
+ for workgroup_x in range(workgroup[0]):
+ for workgroup_y in range(workgroup[1]):
+ for invocation_x in range(workgroup_x * local_size[0], (workgroup_x + 1) * local_size[0]):
+ for invocation_y in range(workgroup_y * local_size[1], (workgroup_y + 1) * local_size[1]):
+ row = invocation_x
+ col = invocation_y
+ globalRow = (tile_size * workgroup_x) + row
+ globalCol = (tile_size * workgroup_y) + col
+ try:
+ tensor_out[row, col] = row
+ tensor_test_1[row, col] = col
+ tensor_test_2[row, col] = workgroup_x
+ tensor_test_3[row, col] = workgroup_y
+ tensor_test_4[row, col] = globalRow
+ tensor_test_5[row, col] = globalCol
+ plot_tensor('tensor_out', tensor_out, (row, col))
+ plot_tensor('tensor_test_1', tensor_test_1, (row, col))
+ plot_tensor('tensor_test_2', tensor_test_2, (row, col))
+ plot_tensor('tensor_test_3', tensor_test_3, (row, col))
+ plot_tensor('tensor_test_4', tensor_test_4, (row, col))
+ plot_tensor('tensor_test_5', tensor_test_5, (row, col))
+ cv2.waitKey(-1)
+ except IndexError as error:
+ print(f'{workgroup_x=} {workgroup_y=} {row=} {col=}')
+ raise error
+
+ plot_tensor('tensor_1', tensor_1)
+ plot_tensor('tensor_2', tensor_2)
+ plot_tensor('tensor_out', tensor_out)
+ plot_tensor('tensor_test_1', tensor_test_1)
+ plot_tensor('tensor_test_2', tensor_test_2)
+ plot_tensor('tensor_test_3', tensor_test_3)
+ plot_tensor('tensor_test_4', tensor_test_4)
+ plot_tensor('tensor_test_5', tensor_test_5)
+ cv2.waitKey(-1)
+
+
+if __name__ == '__main__':
+ main()
|