From 0b7d0577277963d4d66cf3349afe41f737aaf4f9 Mon Sep 17 00:00:00 2001 From: Corentin Date: Tue, 22 Jun 2021 16:13:20 +0900 Subject: [PATCH 1/8] Naive matrice multiplication example --- examples/python_naive_matmul/README.md | 9 ++++ examples/python_naive_matmul/matmul.py | 60 ++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 examples/python_naive_matmul/README.md create mode 100644 examples/python_naive_matmul/matmul.py diff --git a/examples/python_naive_matmul/README.md b/examples/python_naive_matmul/README.md new file mode 100644 index 000000000..26a89172b --- /dev/null +++ b/examples/python_naive_matmul/README.md @@ -0,0 +1,9 @@ +# Naive Matmul Implementation + +This demonstrate a basic matmul implementation using Python and vulkan-kompute + +To test the implementation simply run the `matmul.py` script : + +``` +python matmul.py +``` diff --git a/examples/python_naive_matmul/matmul.py b/examples/python_naive_matmul/matmul.py new file mode 100644 index 000000000..1e7caa871 --- /dev/null +++ b/examples/python_naive_matmul/matmul.py @@ -0,0 +1,60 @@ +import kp +import numpy as np + + +def main(): + mgr = kp.Manager() + + tensor_size = 4 + tensor_shape = [tensor_size, tensor_size] + tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape))) + tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape))) + tensor_out = mgr.tensor(np.zeros(tensor_shape)) + + print(f'Input tensors:\n' + f'{tensor_in_1.data().reshape(tensor_shape)}\n' + f'{tensor_in_2.data().reshape(tensor_shape)}\n') + + params = [tensor_in_1, tensor_in_2, tensor_out] + + matmul_shader = kp.Shader.compile_source(''' +#version 450 + +layout (local_size_x = 1, local_size_y = 1) in; + +layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 { float in_tensor_1[]; }; +layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 { float in_tensor_2[]; }; +layout (set = 0, binding = 2) writeonly buffer buf_out_tensor { float out_tensor[]; }; + +layout (constant_id = 0) const float tensor_size_f = 0; + + +void main() +{ + uint globalRow = gl_GlobalInvocationID.x; + uint globalCol = gl_GlobalInvocationID.y; + uint tensor_size = uint(tensor_size_f); + float acc = 0.0; + for(uint k = 0u; k < tensor_size; k++) + acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k]; + out_tensor[(globalCol * tensor_size) + globalRow] = acc; +}''') + + algo = mgr.algorithm( + params, # params + matmul_shader, # spirv + (*tensor_shape, 1), # workgroup + [float(tensor_size)], # spec_consts + []) # push_consts + + (mgr.sequence() + .record(kp.OpTensorSyncDevice(params)) + .record(kp.OpAlgoDispatch(algo)) + .record(kp.OpTensorSyncLocal(params)) + .eval()) + + print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}') + + +if __name__ == '__main__': + main() From 8c9ad5f2a4f324891c805db25572682477cad97b Mon Sep 17 00:00:00 2001 From: Corentin Date: Tue, 22 Jun 2021 18:21:32 +0900 Subject: [PATCH 2/8] Naive matmul benchmark --- .../python_naive_matmul/1_naive_matmul.py | 96 +++++++++++++++++++ .../{matmul.py => first_example.py} | 0 2 files changed, 96 insertions(+) create mode 100644 examples/python_naive_matmul/1_naive_matmul.py rename examples/python_naive_matmul/{matmul.py => first_example.py} (100%) diff --git a/examples/python_naive_matmul/1_naive_matmul.py b/examples/python_naive_matmul/1_naive_matmul.py new file mode 100644 index 000000000..e7fe092a8 --- /dev/null +++ b/examples/python_naive_matmul/1_naive_matmul.py @@ -0,0 +1,96 @@ +import time + +import kp +import numpy as np + + +class MatMulOp: + def __init__(self, manager: kp.Manager, local_size_x: int = 1, local_size_y: int = 1): + self.mgr = manager + assert(local_size_x > 0) + assert(local_size_y > 0) + self.local_size_x = local_size_x + self.local_size_y = local_size_y + + self.shader = kp.Shader.compile_source(f''' + #version 450 + + layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in; + + layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }}; + layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }}; + layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }}; + + layout (constant_id = 0) const float tensor_size_f = 0; + + + void main() + {{ + uint globalRow = gl_GlobalInvocationID.x; + uint globalCol = gl_GlobalInvocationID.y; + uint tensor_size = uint(tensor_size_f); + float acc = 0.0; + for(uint k = 0u; k < tensor_size; k++) + acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k]; + out_tensor[(globalCol * tensor_size) + globalRow] = acc; + }}''') + self.tensor_shape: tuple[int, int] = (0, 0) + self.params: list[kp.Tensor] = [] + self.algo = None + + def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor, + tensor_out: kp.Tensor): + params = [tensor_in_1, tensor_in_2, tensor_out] + + if self.algo is None or self.tensor_shape != tensor_shape or self.params != params: + self.tensor_shape = tensor_shape + self.params = params + self.algo = self.mgr.algorithm( + params, # params + self.shader, # spirv + (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1), # workgroup + [float(tensor_shape[0])], # spec_consts + []) # push_consts + + (self.mgr.sequence() + .record(kp.OpTensorSyncDevice(self.params)) + .record(kp.OpAlgoDispatch(self.algo)) + .record(kp.OpTensorSyncLocal(self.params)) + .eval()) + + +def main(): + mgr = kp.Manager() + + matmul_op = MatMulOp(mgr, 64, 64) + + tensor_size = 512 + tensor_shape = [tensor_size, tensor_size] + tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape))) + tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape))) + tensor_out = mgr.tensor(np.zeros(tensor_shape)) + + print(f'{tensor_shape} input tensors:\n' + f'{tensor_in_1.data().reshape(tensor_shape)}\n' + f'{tensor_in_2.data().reshape(tensor_shape)}\n') + + matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) + + experiment_count = 1000 + start_time = time.time() + for _ in range(experiment_count): + matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) + end_time = time.time() + experiment_time = end_time - start_time + op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1) + + print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}') + + print(f'{experiment_count} matmul time : ' + f'{experiment_time * 1000:0.2f}ms => ' + f'{experiment_count / experiment_time:0.2f}op/s or ' + f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') + + +if __name__ == '__main__': + main() diff --git a/examples/python_naive_matmul/matmul.py b/examples/python_naive_matmul/first_example.py similarity index 100% rename from examples/python_naive_matmul/matmul.py rename to examples/python_naive_matmul/first_example.py From 425380f1a11b3df97f58b717595ebefdd64f9cfe Mon Sep 17 00:00:00 2001 From: Corentin Date: Thu, 24 Jun 2021 18:36:10 +0900 Subject: [PATCH 3/8] Automated work group parameters and tiling matmul --- .../python_naive_matmul/1_naive_matmul.py | 75 ++++++++--- .../python_naive_matmul/2_tiled_matmul.py | 127 ++++++++++++++++++ examples/python_naive_matmul/README.md | 2 +- 3 files changed, 183 insertions(+), 21 deletions(-) create mode 100644 examples/python_naive_matmul/2_tiled_matmul.py diff --git a/examples/python_naive_matmul/1_naive_matmul.py b/examples/python_naive_matmul/1_naive_matmul.py index e7fe092a8..d5fe70cba 100644 --- a/examples/python_naive_matmul/1_naive_matmul.py +++ b/examples/python_naive_matmul/1_naive_matmul.py @@ -5,35 +5,64 @@ import numpy as np class MatMulOp: - def __init__(self, manager: kp.Manager, local_size_x: int = 1, local_size_y: int = 1): + def __init__(self, manager: kp.Manager, local_size_x: int = -1, local_size_y: int = -1): self.mgr = manager - assert(local_size_x > 0) - assert(local_size_y > 0) + + props = self.mgr.get_device_properties() + max_workgroup_invocation = props['max_work_group_invocations'] + max_workgroup_size = props['max_work_group_size'] + if local_size_x < 1: + if local_size_y > 0: + local_size_x = 1 + while (2 * local_size_x * local_size_y <= max_workgroup_invocation + and 2 * local_size_x <= max_workgroup_size[0]): + local_size_x *= 2 + else: + local_size_x = 1 + local_size_y = 1 + while 2 * local_size_x * local_size_y <= max_workgroup_invocation: + if 2 * local_size_x <= max_workgroup_size[0]: + local_size_x *= 2 + if 2 * local_size_y <= max_workgroup_size[1]: + local_size_y *= 2 + elif 2 * local_size_x > max_workgroup_size[0]: # stop if neither x nor y can be double + break + elif local_size_y < 0: + local_size_y = 1 + while (2 * local_size_x * local_size_y <= max_workgroup_invocation + and 2 * local_size_x <= max_workgroup_size[0]): + local_size_y *= 2 + + assert local_size_x > 0 + assert local_size_y > 0 + assert local_size_x * local_size_y <= max_workgroup_invocation + assert local_size_x <= max_workgroup_size[0] + assert local_size_y <= max_workgroup_size[1] self.local_size_x = local_size_x self.local_size_y = local_size_y self.shader = kp.Shader.compile_source(f''' - #version 450 +#version 450 - layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in; +layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in; - layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }}; - layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }}; - layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }}; +layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }}; +layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }}; +layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }}; - layout (constant_id = 0) const float tensor_size_f = 0; +layout (constant_id = 0) const float tensor_size_f = 0; - void main() - {{ - uint globalRow = gl_GlobalInvocationID.x; - uint globalCol = gl_GlobalInvocationID.y; - uint tensor_size = uint(tensor_size_f); - float acc = 0.0; - for(uint k = 0u; k < tensor_size; k++) - acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k]; - out_tensor[(globalCol * tensor_size) + globalRow] = acc; - }}''') +void main() +{{ + uint globalRow = gl_GlobalInvocationID.x; + uint globalCol = gl_GlobalInvocationID.y; + uint tensor_size = uint(tensor_size_f); + float acc = 0.0; + for(uint k = 0u; k < tensor_size; k++) + acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k]; + out_tensor[(globalCol * tensor_size) + globalRow] = acc; +}}''') self.tensor_shape: tuple[int, int] = (0, 0) self.params: list[kp.Tensor] = [] self.algo = None @@ -62,7 +91,7 @@ class MatMulOp: def main(): mgr = kp.Manager() - matmul_op = MatMulOp(mgr, 64, 64) + matmul_op = MatMulOp(mgr) tensor_size = 512 tensor_shape = [tensor_size, tensor_size] @@ -92,5 +121,11 @@ def main(): f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') +def test(): + main() + + if __name__ == '__main__': main() +else: + test() diff --git a/examples/python_naive_matmul/2_tiled_matmul.py b/examples/python_naive_matmul/2_tiled_matmul.py new file mode 100644 index 000000000..be403a46a --- /dev/null +++ b/examples/python_naive_matmul/2_tiled_matmul.py @@ -0,0 +1,127 @@ +import time + +import kp +import numpy as np + + +class MatMulOp: + def __init__(self, manager: kp.Manager, tile_size: int = -1): + self.mgr = manager + + props = self.mgr.get_device_properties() + max_workgroup_invocation = props['max_work_group_invocations'] + max_workgroup_size = props['max_work_group_size'] + if tile_size < 0: + tile_size = 1 + while (2 * tile_size * tile_size <= max_workgroup_invocation + and 2 * tile_size <= max_workgroup_size[0] + and 2 * tile_size <= max_workgroup_size[1]): + tile_size *= 2 + + assert tile_size > 0 + assert tile_size * tile_size <= max_workgroup_invocation + assert tile_size <= max_workgroup_size[0] + assert tile_size <= max_workgroup_size[1] + self.tile_size = tile_size + + self.shader = kp.Shader.compile_source(f''' +#version 450 + +layout (local_size_x = {tile_size}, local_size_y = {tile_size}) in; + +layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }}; +layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }}; +layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }}; + +layout (constant_id = 0) const float tensor_size_f = 0; + +shared float sub_tensor_1[{tile_size}][{tile_size}]; +shared float sub_tensor_2[{tile_size}][{tile_size}]; + +void main() +{{ + uint row = gl_GlobalInvocationID.x; + uint col = gl_GlobalInvocationID.y; + uint globalRow = {tile_size} * gl_WorkGroupID.x + row; + uint globalCol = {tile_size} * gl_WorkGroupID.y + row; + + uint tensor_size = uint(tensor_size_f); + float acc = 0.0; + uint numTiles = tensor_size / {tile_size}; + for(uint t = 0u; t < numTiles; t++) + {{ + uint tiledRow = {tile_size} * t + row; + uint tiledCol = {tile_size} * t + col; + sub_tensor_1[col][row] = in_tensor_1[tiledCol * tensor_size + globalRow]; + sub_tensor_2[col][row] = in_tensor_2[globalCol * tensor_size + tiledRow]; + + memoryBarrierShared(); + barrier(); + + for(uint k = 0u; k < {tile_size}; k++) + acc += sub_tensor_1[k][row] * sub_tensor_2[col][k]; + + barrier(); + }} + out_tensor[(globalCol * tensor_size) + globalRow] = acc; +}}''') + self.tensor_shape: tuple[int, int] = (0, 0) + self.params: list[kp.Tensor] = [] + self.algo = None + + def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor, + tensor_out: kp.Tensor): + params = [tensor_in_1, tensor_in_2, tensor_out] + + if self.algo is None or self.tensor_shape != tensor_shape or self.params != params: + self.tensor_shape = tensor_shape + self.params = params + self.algo = self.mgr.algorithm( + params, # params + self.shader, # spirv + (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1), # workgroup + [float(tensor_shape[0])], # spec_consts + []) # push_consts + + (self.mgr.sequence() + .record(kp.OpTensorSyncDevice(self.params)) + .record(kp.OpAlgoDispatch(self.algo)) + .record(kp.OpTensorSyncLocal(self.params)) + .eval()) + + +def main(): + mgr = kp.Manager() + + matmul_op = MatMulOp(mgr) + + tensor_size = 512 + tensor_shape = [tensor_size, tensor_size] + tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape))) + tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape))) + tensor_out = mgr.tensor(np.zeros(tensor_shape)) + + print(f'{tensor_shape} input tensors:\n' + f'{tensor_in_1.data().reshape(tensor_shape)}\n' + f'{tensor_in_2.data().reshape(tensor_shape)}\n') + + matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) + + experiment_count = 1000 + start_time = time.time() + for _ in range(experiment_count): + matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) + end_time = time.time() + experiment_time = end_time - start_time + op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1) + + print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}') + + print(f'{experiment_count} matmul time : ' + f'{experiment_time * 1000:0.2f}ms => ' + f'{experiment_count / experiment_time:0.2f}op/s or ' + f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') + + +if __name__ == '__main__': + main() diff --git a/examples/python_naive_matmul/README.md b/examples/python_naive_matmul/README.md index 26a89172b..0688bb079 100644 --- a/examples/python_naive_matmul/README.md +++ b/examples/python_naive_matmul/README.md @@ -1,6 +1,6 @@ # Naive Matmul Implementation -This demonstrate a basic matmul implementation using Python and vulkan-kompute +This demonstrate a basic matmul implementation using Python and vulkan-kompute. Many thanks for the very helpful [SGEMM in WebGL2-compute](https://www.ibiblio.org/e-notes/webgl/gpu/mul/sgemm.htm) article on the public library [ibiblio.org](https://www.ibiblio.org/). To test the implementation simply run the `matmul.py` script : From 6f04eb9db2a4bfa6e5196bb015d9bacd0245fc10 Mon Sep 17 00:00:00 2001 From: Corentin Date: Thu, 24 Jun 2021 19:36:08 +0900 Subject: [PATCH 4/8] Better tiling implementation, fixed tiling asserts --- .../python_naive_matmul/2_tiled_matmul.py | 2 +- .../python_naive_matmul/3_better_tiling.py | 141 ++++++++++++++++++ 2 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 examples/python_naive_matmul/3_better_tiling.py diff --git a/examples/python_naive_matmul/2_tiled_matmul.py b/examples/python_naive_matmul/2_tiled_matmul.py index be403a46a..839b07989 100644 --- a/examples/python_naive_matmul/2_tiled_matmul.py +++ b/examples/python_naive_matmul/2_tiled_matmul.py @@ -13,7 +13,7 @@ class MatMulOp: max_workgroup_size = props['max_work_group_size'] if tile_size < 0: tile_size = 1 - while (2 * tile_size * tile_size <= max_workgroup_invocation + while (4 * tile_size * tile_size <= max_workgroup_invocation and 2 * tile_size <= max_workgroup_size[0] and 2 * tile_size <= max_workgroup_size[1]): tile_size *= 2 diff --git a/examples/python_naive_matmul/3_better_tiling.py b/examples/python_naive_matmul/3_better_tiling.py new file mode 100644 index 000000000..5dcc52349 --- /dev/null +++ b/examples/python_naive_matmul/3_better_tiling.py @@ -0,0 +1,141 @@ +import time + +import kp +import numpy as np + + +class MatMulOp: + def __init__(self, manager: kp.Manager, tile_size: int = -1, thread_work_ratio: int = 8): + self.mgr = manager + + props = self.mgr.get_device_properties() + max_workgroup_invocation = props['max_work_group_invocations'] + max_workgroup_size = props['max_work_group_size'] + if tile_size < 0: + tile_size = 1 + local_size_y = tile_size // thread_work_ratio + while (4 * tile_size * local_size_y <= max_workgroup_invocation + and 2 * tile_size <= max_workgroup_size[0] + and 2 * local_size_y <= max_workgroup_size[1]): + tile_size *= 2 + local_size_y = tile_size // thread_work_ratio + else: + local_size_y = tile_size // thread_work_ratio + + assert tile_size > 0 + assert thread_work_ratio > 0 + assert tile_size * local_size_y <= max_workgroup_invocation + assert tile_size <= max_workgroup_size[0] + assert local_size_y <= max_workgroup_size[1] + self.tile_size = tile_size + self.thread_work_ratio = thread_work_ratio + + local_size_y = tile_size // thread_work_ratio + self.shader = kp.Shader.compile_source(f''' +#version 450 + +layout (local_size_x = {tile_size}, local_size_y = {local_size_y}) in; + +layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }}; +layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }}; +layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }}; + +layout (constant_id = 0) const float tensor_size_f = 0; + +shared float sub_tensor_1[{tile_size}][{tile_size}]; +shared float sub_tensor_2[{tile_size}][{tile_size}]; + +void main() +{{ + uint row = gl_GlobalInvocationID.x; + uint col = gl_GlobalInvocationID.y; + uint globalRow = {tile_size} * gl_WorkGroupID.x + row; + uint globalCol = {tile_size} * gl_WorkGroupID.y + row; + + uint tensor_size = uint(tensor_size_f); + float acc[{thread_work_ratio}]; + for (uint l = 0u; l < {thread_work_ratio}; l++) + acc[l] = 0.0; + + uint numTiles = tensor_size / {tile_size}; + for(uint t = 0u; t < numTiles; t++) + {{ + uint tiledRow = {tile_size} * t + row; + uint tiledCol = {tile_size} * t + col; + sub_tensor_1[col + t * {local_size_y}][row] = in_tensor_1[ + (tiledCol + t * {local_size_y}) * tensor_size + globalRow]; + sub_tensor_2[col + t * {local_size_y}][row] = in_tensor_2[ + (globalCol + t * {local_size_y})* tensor_size + tiledRow]; + + memoryBarrierShared(); + barrier(); + + for(uint k = 0u; k < {tile_size}; k++) + for(uint l = 0u; l < {thread_work_ratio}; l++) + acc[l] += sub_tensor_1[k][row] * sub_tensor_2[col + l * {local_size_y}][k]; + + barrier(); + }} + for(uint l = 0u; l < {thread_work_ratio}; l++) + out_tensor[(globalCol + l * {local_size_y}) * tensor_size + globalRow] = acc[l]; +}}''') + self.tensor_shape: tuple[int, int] = (0, 0) + self.params: list[kp.Tensor] = [] + self.algo = None + + def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor, + tensor_out: kp.Tensor): + params = [tensor_in_1, tensor_in_2, tensor_out] + + if self.algo is None or self.tensor_shape != tensor_shape or self.params != params: + self.tensor_shape = tensor_shape + self.params = params + self.algo = self.mgr.algorithm( + params, # params + self.shader, # spirv + (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1), # workgroup + [float(tensor_shape[0])], # spec_consts + []) # push_consts + + (self.mgr.sequence() + .record(kp.OpTensorSyncDevice(self.params)) + .record(kp.OpAlgoDispatch(self.algo)) + .record(kp.OpTensorSyncLocal(self.params)) + .eval()) + + +def main(): + mgr = kp.Manager() + + matmul_op = MatMulOp(mgr) + + tensor_size = 512 + tensor_shape = [tensor_size, tensor_size] + tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape))) + tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape))) + tensor_out = mgr.tensor(np.zeros(tensor_shape)) + + print(f'{tensor_shape} input tensors:\n' + f'{tensor_in_1.data().reshape(tensor_shape)}\n' + f'{tensor_in_2.data().reshape(tensor_shape)}\n') + + matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) + + experiment_count = 1000 + start_time = time.time() + for _ in range(experiment_count): + matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) + end_time = time.time() + experiment_time = end_time - start_time + op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1) + + print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}') + + print(f'{experiment_count} matmul time : ' + f'{experiment_time * 1000:0.2f}ms => ' + f'{experiment_count / experiment_time:0.2f}op/s or ' + f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') + + +if __name__ == '__main__': + main() From 7f4ec272353a337e663c5e462401f0421a228614 Mon Sep 17 00:00:00 2001 From: Corentin Date: Fri, 25 Jun 2021 02:49:28 +0900 Subject: [PATCH 5/8] Fix second implementation, add benchmark script * Third implementation is broken (WIP) --- examples/python_naive_matmul/benchmark.py | 49 ++++++ .../{1_naive_matmul.py => imp1_naive.py} | 18 +- .../{2_tiled_matmul.py => imp2_tiled.py} | 32 ++-- .../python_naive_matmul/imp2_tiled_debug.py | 156 ++++++++++++++++++ ...better_tiling.py => imp3_better_tiling.py} | 35 ++-- examples/python_naive_matmul/matmul_plot.py | 97 +++++++++++ 6 files changed, 347 insertions(+), 40 deletions(-) create mode 100644 examples/python_naive_matmul/benchmark.py rename examples/python_naive_matmul/{1_naive_matmul.py => imp1_naive.py} (92%) rename examples/python_naive_matmul/{2_tiled_matmul.py => imp2_tiled.py} (79%) create mode 100644 examples/python_naive_matmul/imp2_tiled_debug.py rename examples/python_naive_matmul/{3_better_tiling.py => imp3_better_tiling.py} (78%) create mode 100644 examples/python_naive_matmul/matmul_plot.py diff --git a/examples/python_naive_matmul/benchmark.py b/examples/python_naive_matmul/benchmark.py new file mode 100644 index 000000000..b10369d7c --- /dev/null +++ b/examples/python_naive_matmul/benchmark.py @@ -0,0 +1,49 @@ +import time + +import kp +import numpy as np +from imp1_naive import MatMulOp as MatMulOp1 +from imp2_tiled import MatMulOp as MatMulOp2 +from imp3_better_tiling import MatMulOp as MatMulOp3 + + +def main(): + experiment_count = 1000 + tensor_size = 512 + tensor_shape = [tensor_size, tensor_size] + mat_1 = np.triu(np.ones(tensor_shape)) + mat_2 = np.triu(np.ones(tensor_shape)) + mat_result = mat_1 @ mat_2 + tensor_shape = [tensor_size, tensor_size] + + print(f'{tensor_shape} input tensors:\n' + f'{mat_1}\n' + f'{mat_2}\n') + print(f'Output :\n{mat_result}') + + mgr = kp.Manager() + tensor_in_1 = mgr.tensor(mat_1) + tensor_in_2 = mgr.tensor(mat_2) + tensor_out = mgr.tensor(np.zeros(tensor_shape)) + for MatMulOp in [MatMulOp1, MatMulOp2, MatMulOp3]: + matmul_op = MatMulOp(mgr) + matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) + + start_time = time.time() + for _ in range(experiment_count): + matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) + end_time = time.time() + experiment_time = end_time - start_time + op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1) + + if (tensor_out.data().reshape(tensor_shape) == mat_result).all(): + print(f'From {MatMulOp.__module__} : {experiment_count} matmul time : ' + f'{experiment_time * 1000:0.2f}ms => ' + f'{experiment_count / experiment_time:0.2f}op/s or ' + f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') + else: + print(f'Test failed => output tensor is wrong :\n{tensor_out.data().reshape(tensor_shape)}') + + +if __name__ == '__main__': + main() diff --git a/examples/python_naive_matmul/1_naive_matmul.py b/examples/python_naive_matmul/imp1_naive.py similarity index 92% rename from examples/python_naive_matmul/1_naive_matmul.py rename to examples/python_naive_matmul/imp1_naive.py index d5fe70cba..faefec563 100644 --- a/examples/python_naive_matmul/1_naive_matmul.py +++ b/examples/python_naive_matmul/imp1_naive.py @@ -41,7 +41,7 @@ class MatMulOp: self.local_size_x = local_size_x self.local_size_y = local_size_y - self.shader = kp.Shader.compile_source(f''' + self.shader = f''' #version 450 layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in; @@ -62,7 +62,8 @@ void main() for(uint k = 0u; k < tensor_size; k++) acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k]; out_tensor[(globalCol * tensor_size) + globalRow] = acc; -}}''') +}}''' + self.compiled_shader = kp.Shader.compile_source(self.shader) self.tensor_shape: tuple[int, int] = (0, 0) self.params: list[kp.Tensor] = [] self.algo = None @@ -74,17 +75,18 @@ void main() if self.algo is None or self.tensor_shape != tensor_shape or self.params != params: self.tensor_shape = tensor_shape self.params = params + workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1) self.algo = self.mgr.algorithm( params, # params - self.shader, # spirv - (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1), # workgroup + self.compiled_shader, # spirv + workgroup, # workgroup [float(tensor_shape[0])], # spec_consts []) # push_consts (self.mgr.sequence() .record(kp.OpTensorSyncDevice(self.params)) .record(kp.OpAlgoDispatch(self.algo)) - .record(kp.OpTensorSyncLocal(self.params)) + .record(kp.OpTensorSyncLocal([tensor_out])) .eval()) @@ -121,11 +123,5 @@ def main(): f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') -def test(): - main() - - if __name__ == '__main__': main() -else: - test() diff --git a/examples/python_naive_matmul/2_tiled_matmul.py b/examples/python_naive_matmul/imp2_tiled.py similarity index 79% rename from examples/python_naive_matmul/2_tiled_matmul.py rename to examples/python_naive_matmul/imp2_tiled.py index 839b07989..ed6a1ddf8 100644 --- a/examples/python_naive_matmul/2_tiled_matmul.py +++ b/examples/python_naive_matmul/imp2_tiled.py @@ -24,7 +24,7 @@ class MatMulOp: assert tile_size <= max_workgroup_size[1] self.tile_size = tile_size - self.shader = kp.Shader.compile_source(f''' + self.shader = f''' #version 450 layout (local_size_x = {tile_size}, local_size_y = {tile_size}) in; @@ -40,20 +40,21 @@ shared float sub_tensor_2[{tile_size}][{tile_size}]; void main() {{ - uint row = gl_GlobalInvocationID.x; - uint col = gl_GlobalInvocationID.y; - uint globalRow = {tile_size} * gl_WorkGroupID.x + row; - uint globalCol = {tile_size} * gl_WorkGroupID.y + row; + uint row = gl_LocalInvocationID.x; // 0 .. tile_size + uint col = gl_LocalInvocationID.y; // 0 .. tile_size + // gl_WorkGroupID : 0 .. tensor_size / tile_size + uint globalRow = ({tile_size} * gl_WorkGroupID.x) + row; + uint globalCol = ({tile_size} * gl_WorkGroupID.y) + col; uint tensor_size = uint(tensor_size_f); float acc = 0.0; uint numTiles = tensor_size / {tile_size}; for(uint t = 0u; t < numTiles; t++) {{ - uint tiledRow = {tile_size} * t + row; - uint tiledCol = {tile_size} * t + col; - sub_tensor_1[col][row] = in_tensor_1[tiledCol * tensor_size + globalRow]; - sub_tensor_2[col][row] = in_tensor_2[globalCol * tensor_size + tiledRow]; + uint tiledRow = ({tile_size} * t) + row; + uint tiledCol = ({tile_size} * t) + col; + sub_tensor_1[col][row] = in_tensor_1[(tiledCol * tensor_size) + globalRow]; + sub_tensor_2[col][row] = in_tensor_2[(globalCol * tensor_size) + tiledRow]; memoryBarrierShared(); barrier(); @@ -63,8 +64,10 @@ void main() barrier(); }} - out_tensor[(globalCol * tensor_size) + globalRow] = acc; -}}''') + uint globalIndex = (tensor_size * globalCol) + globalRow; + out_tensor[globalIndex] = acc; +}}''' + self.compiled_shader = kp.Shader.compile_source(self.shader) self.tensor_shape: tuple[int, int] = (0, 0) self.params: list[kp.Tensor] = [] self.algo = None @@ -76,17 +79,18 @@ void main() if self.algo is None or self.tensor_shape != tensor_shape or self.params != params: self.tensor_shape = tensor_shape self.params = params + workgroup = (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1) self.algo = self.mgr.algorithm( params, # params - self.shader, # spirv - (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1), # workgroup + self.compiled_shader, # spirv + workgroup, # workgroup [float(tensor_shape[0])], # spec_consts []) # push_consts (self.mgr.sequence() .record(kp.OpTensorSyncDevice(self.params)) .record(kp.OpAlgoDispatch(self.algo)) - .record(kp.OpTensorSyncLocal(self.params)) + .record(kp.OpTensorSyncLocal([tensor_out])) .eval()) diff --git a/examples/python_naive_matmul/imp2_tiled_debug.py b/examples/python_naive_matmul/imp2_tiled_debug.py new file mode 100644 index 000000000..b10cbfd7d --- /dev/null +++ b/examples/python_naive_matmul/imp2_tiled_debug.py @@ -0,0 +1,156 @@ +import time + +import kp +import numpy as np + + +class MatMulOp: + def __init__(self, manager: kp.Manager, tile_size: int = -1): + self.mgr = manager + + props = self.mgr.get_device_properties() + max_workgroup_invocation = props['max_work_group_invocations'] + max_workgroup_size = props['max_work_group_size'] + if tile_size < 0: + tile_size = 1 + while (4 * tile_size * tile_size <= max_workgroup_invocation + and 2 * tile_size <= max_workgroup_size[0] + and 2 * tile_size <= max_workgroup_size[1]): + tile_size *= 2 + + assert tile_size > 0 + assert tile_size * tile_size <= max_workgroup_invocation + assert tile_size <= max_workgroup_size[0] + assert tile_size <= max_workgroup_size[1] + self.tile_size = tile_size + + print(f'{tile_size=}') + self.shader = f''' +#version 450 + +layout (local_size_x = {tile_size}, local_size_y = {tile_size}) in; + +layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }}; +layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }}; +layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }}; +layout (set = 0, binding = 3) writeonly buffer buf_test1_tensor {{ float test1_tensor[]; }}; +layout (set = 0, binding = 4) writeonly buffer buf_test2_tensor {{ float test2_tensor[]; }}; +layout (set = 0, binding = 5) writeonly buffer buf_test3_tensor {{ float test3_tensor[]; }}; +layout (set = 0, binding = 6) writeonly buffer buf_test4_tensor {{ float test4_tensor[]; }}; + +layout (constant_id = 0) const float tensor_size_f = 0; + +shared float sub_tensor_1[{tile_size}][{tile_size}]; +shared float sub_tensor_2[{tile_size}][{tile_size}]; + +void main() +{{ + uint row = gl_LocalInvocationID.x; // 0 .. tile_size + uint col = gl_LocalInvocationID.y; // 0 .. tile_size + // gl_WorkGroupID : 0 .. tensor_size / tile_size + uint globalRow = ({tile_size} * gl_WorkGroupID.x) + row; + uint globalCol = ({tile_size} * gl_WorkGroupID.y) + col; + + uint tensor_size = uint(tensor_size_f); + float acc = 0.0; + uint numTiles = tensor_size / {tile_size}; + for(uint t = 0u; t < numTiles; t++) + {{ + uint tiledRow = ({tile_size} * t) + row; + uint tiledCol = ({tile_size} * t) + col; + sub_tensor_1[col][row] = in_tensor_1[(tiledCol * tensor_size) + globalRow]; + sub_tensor_2[col][row] = in_tensor_2[(globalCol * tensor_size) + tiledRow]; + + memoryBarrierShared(); + barrier(); + + for(uint k = 0u; k < {tile_size}; k++) + acc += sub_tensor_1[k][row] * sub_tensor_2[col][k]; + + barrier(); + }} + uint globalIndex = (tensor_size * globalCol) + globalRow; + out_tensor[globalIndex] = acc; + test1_tensor[globalIndex] = row; + test2_tensor[globalIndex] = col; + test3_tensor[globalIndex] = gl_WorkGroupID.x; + test4_tensor[globalIndex] = gl_WorkGroupID.y; +}}''' + print(self.shader) + self.compiled_shader = kp.Shader.compile_source(self.shader) + self.tensor_shape: tuple[int, int] = (0, 0) + self.params: list[kp.Tensor] = [] + self.algo = None + + def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor, + tensor_out: kp.Tensor, tensor_test_1: kp.Tensor, tensor_test_2: kp.Tensor, + tensor_test_3: kp.Tensor, tensor_test_4: kp.Tensor): + # params = [tensor_in_1, tensor_in_2, tensor_out] + params = [tensor_in_1, tensor_in_2, tensor_out, tensor_test_1, tensor_test_2, tensor_test_3, tensor_test_4] + + if self.algo is None or self.tensor_shape != tensor_shape or self.params != params: + self.tensor_shape = tensor_shape + self.params = params + workgroup = (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1) + # workgroup = (2, 2, 1) + print(f'{float(tensor_shape[0])=} {self.tile_size=} {workgroup=}') + self.algo = self.mgr.algorithm( + params, # params + self.compiled_shader, # spirv + workgroup, # workgroup + [float(tensor_shape[0])], # spec_consts + []) # push_consts + + (self.mgr.sequence() + .record(kp.OpTensorSyncDevice(self.params)) + .record(kp.OpAlgoDispatch(self.algo)) + .record(kp.OpTensorSyncLocal(self.params[2:])) + # .record(kp.OpTensorSyncLocal([tensor_out])) + .eval()) + + +def main(): + mgr = kp.Manager() + + matmul_op = MatMulOp(mgr, 4) + + tensor_size = 8 + tensor_shape = [tensor_size, tensor_size] + tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape))) + tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape))) + tensor_out = mgr.tensor(np.zeros(tensor_shape)) + tensor_test_1 = mgr.tensor(np.zeros(tensor_shape)) + tensor_test_2 = mgr.tensor(np.zeros(tensor_shape)) + tensor_test_3 = mgr.tensor(np.zeros(tensor_shape)) + tensor_test_4 = mgr.tensor(np.zeros(tensor_shape)) + + print(f'{tensor_shape} input tensors:\n' + f'{tensor_in_1.data().reshape(tensor_shape)}\n' + f'{tensor_in_2.data().reshape(tensor_shape)}\n') + + # matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) + matmul_op(tensor_shape, tensor_in_1, tensor_in_2, + tensor_out, tensor_test_1, tensor_test_2, tensor_test_3, tensor_test_4) + + # experiment_count = 10 + # start_time = time.time() + # for _ in range(experiment_count): + # matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) + # end_time = time.time() + # experiment_time = end_time - start_time + # op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1) + + print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}') + print(f'test_1 :\n{tensor_test_1.data().reshape(tensor_shape)}') + print(f'test_2 :\n{tensor_test_2.data().reshape(tensor_shape)}') + print(f'test_3 :\n{tensor_test_3.data().reshape(tensor_shape)}') + print(f'test_4 :\n{tensor_test_4.data().reshape(tensor_shape)}') + + # print(f'{experiment_count} matmul time : ' + # f'{experiment_time * 1000:0.2f}ms => ' + # f'{experiment_count / experiment_time:0.2f}op/s or ' + # f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') + + +if __name__ == '__main__': + main() diff --git a/examples/python_naive_matmul/3_better_tiling.py b/examples/python_naive_matmul/imp3_better_tiling.py similarity index 78% rename from examples/python_naive_matmul/3_better_tiling.py rename to examples/python_naive_matmul/imp3_better_tiling.py index 5dcc52349..6b3ada314 100644 --- a/examples/python_naive_matmul/3_better_tiling.py +++ b/examples/python_naive_matmul/imp3_better_tiling.py @@ -30,11 +30,12 @@ class MatMulOp: self.tile_size = tile_size self.thread_work_ratio = thread_work_ratio - local_size_y = tile_size // thread_work_ratio - self.shader = kp.Shader.compile_source(f''' + self.local_size_x = tile_size + self.local_size_y = tile_size // thread_work_ratio + self.shader = f''' #version 450 -layout (local_size_x = {tile_size}, local_size_y = {local_size_y}) in; +layout (local_size_x = {tile_size}, local_size_y = {self.local_size_y}) in; layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }}; layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }}; @@ -47,8 +48,8 @@ shared float sub_tensor_2[{tile_size}][{tile_size}]; void main() {{ - uint row = gl_GlobalInvocationID.x; - uint col = gl_GlobalInvocationID.y; + uint row = gl_LocalInvocationID.x; + uint col = gl_LocalInvocationID.y; uint globalRow = {tile_size} * gl_WorkGroupID.x + row; uint globalCol = {tile_size} * gl_WorkGroupID.y + row; @@ -62,23 +63,24 @@ void main() {{ uint tiledRow = {tile_size} * t + row; uint tiledCol = {tile_size} * t + col; - sub_tensor_1[col + t * {local_size_y}][row] = in_tensor_1[ - (tiledCol + t * {local_size_y}) * tensor_size + globalRow]; - sub_tensor_2[col + t * {local_size_y}][row] = in_tensor_2[ - (globalCol + t * {local_size_y})* tensor_size + tiledRow]; + sub_tensor_1[col + t * {self.local_size_y}][row] = in_tensor_1[ + (tiledCol + t * {self.local_size_y}) * tensor_size + globalRow]; + sub_tensor_2[col + t * {self.local_size_y}][row] = in_tensor_2[ + (globalCol + t * {self.local_size_y})* tensor_size + tiledRow]; memoryBarrierShared(); barrier(); for(uint k = 0u; k < {tile_size}; k++) for(uint l = 0u; l < {thread_work_ratio}; l++) - acc[l] += sub_tensor_1[k][row] * sub_tensor_2[col + l * {local_size_y}][k]; + acc[l] += sub_tensor_1[k][row] * sub_tensor_2[col + l * {self.local_size_y}][k]; barrier(); }} for(uint l = 0u; l < {thread_work_ratio}; l++) - out_tensor[(globalCol + l * {local_size_y}) * tensor_size + globalRow] = acc[l]; -}}''') + out_tensor[(globalCol + l * {self.local_size_y}) * tensor_size + globalRow] = acc[l]; +}}''' + self.compiled_shader = kp.Shader.compile_source(self.shader) self.tensor_shape: tuple[int, int] = (0, 0) self.params: list[kp.Tensor] = [] self.algo = None @@ -90,17 +92,20 @@ void main() if self.algo is None or self.tensor_shape != tensor_shape or self.params != params: self.tensor_shape = tensor_shape self.params = params + print( + tensor_shape, self.local_size_x, self.local_size_y, + (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1)) self.algo = self.mgr.algorithm( params, # params - self.shader, # spirv - (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1), # workgroup + self.compiled_shader, # spirv + (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1), # workgroup [float(tensor_shape[0])], # spec_consts []) # push_consts (self.mgr.sequence() .record(kp.OpTensorSyncDevice(self.params)) .record(kp.OpAlgoDispatch(self.algo)) - .record(kp.OpTensorSyncLocal(self.params)) + .record(kp.OpTensorSyncLocal([tensor_out])) .eval()) diff --git a/examples/python_naive_matmul/matmul_plot.py b/examples/python_naive_matmul/matmul_plot.py new file mode 100644 index 000000000..81763a439 --- /dev/null +++ b/examples/python_naive_matmul/matmul_plot.py @@ -0,0 +1,97 @@ +from argparse import ArgumentParser + +import cv2 +import numpy as np + + +def plot_tensor(window_name: str, tensor: np.ndarray, coord_highlight: tuple[int, int] = None): + font_size = 48 + image = np.zeros((tensor.shape[1] * font_size, tensor.shape[0] * font_size, 3), dtype=np.uint8) + + for y in range(tensor.shape[1]): + for x in range(tensor.shape[0]): + if coord_highlight and x == coord_highlight[1] and y == coord_highlight[0]: + cv2.putText( + image, str(int(tensor[y, x])), (x * font_size, int((y + 0.8) * font_size)), + cv2.FONT_HERSHEY_TRIPLEX, 1., (127, 127, 255)) + else: + cv2.putText( + image, str(int(tensor[y, x])), (x * font_size, int((y + 0.8) * font_size)), + cv2.FONT_HERSHEY_TRIPLEX, 1., (255, 255, 255)) + + cv2.imshow(window_name, image) + + +def main(): + parser = ArgumentParser() + parser.add_argument('tensor_size', type=int, help='Size of the square tensors') + parser.add_argument('tile_size', type=int) + parser.add_argument('local_size', type=int, nargs=2) + parser.add_argument('workgroup', type=int, nargs=2) + arguments = parser.parse_args() + + tensor_size: int = arguments.tensor_size + tile_size: int = arguments.tile_size + local_size: tuple[int, int, int] = tuple(arguments.local_size) + workgroup: tuple[int, int, int] = tuple(arguments.workgroup) + + tensor_shape = (tensor_size, tensor_size) + tensor_1 = np.triu(np.ones(tensor_shape)) + tensor_2 = np.triu(np.ones(tensor_shape)) + tensor_out = np.zeros(tensor_shape) + tensor_test_1 = np.zeros(tensor_shape) + tensor_test_2 = np.zeros(tensor_shape) + tensor_test_3 = np.zeros(tensor_shape) + tensor_test_4 = np.zeros(tensor_shape) + tensor_test_5 = np.zeros(tensor_shape) + + plot_tensor('tensor_1', tensor_1) + plot_tensor('tensor_2', tensor_2) + plot_tensor('tensor_out', tensor_out) + plot_tensor('tensor_test_1', tensor_test_1) + plot_tensor('tensor_test_2', tensor_test_2) + plot_tensor('tensor_test_3', tensor_test_3) + plot_tensor('tensor_test_4', tensor_test_4) + plot_tensor('tensor_test_5', tensor_test_5) + cv2.waitKey(-1) + + print(f'{workgroup=} {local_size=}') + for workgroup_x in range(workgroup[0]): + for workgroup_y in range(workgroup[1]): + for invocation_x in range(workgroup_x * local_size[0], (workgroup_x + 1) * local_size[0]): + for invocation_y in range(workgroup_y * local_size[1], (workgroup_y + 1) * local_size[1]): + row = invocation_x + col = invocation_y + globalRow = (tile_size * workgroup_x) + row + globalCol = (tile_size * workgroup_y) + col + try: + tensor_out[row, col] = row + tensor_test_1[row, col] = col + tensor_test_2[row, col] = workgroup_x + tensor_test_3[row, col] = workgroup_y + tensor_test_4[row, col] = globalRow + tensor_test_5[row, col] = globalCol + plot_tensor('tensor_out', tensor_out, (row, col)) + plot_tensor('tensor_test_1', tensor_test_1, (row, col)) + plot_tensor('tensor_test_2', tensor_test_2, (row, col)) + plot_tensor('tensor_test_3', tensor_test_3, (row, col)) + plot_tensor('tensor_test_4', tensor_test_4, (row, col)) + plot_tensor('tensor_test_5', tensor_test_5, (row, col)) + cv2.waitKey(-1) + except IndexError as error: + print(f'{workgroup_x=} {workgroup_y=} {row=} {col=}') + raise error + + plot_tensor('tensor_1', tensor_1) + plot_tensor('tensor_2', tensor_2) + plot_tensor('tensor_out', tensor_out) + plot_tensor('tensor_test_1', tensor_test_1) + plot_tensor('tensor_test_2', tensor_test_2) + plot_tensor('tensor_test_3', tensor_test_3) + plot_tensor('tensor_test_4', tensor_test_4) + plot_tensor('tensor_test_5', tensor_test_5) + cv2.waitKey(-1) + + +if __name__ == '__main__': + main() From a3f7793c17fb2fd8ba3608e809caf31e29195058 Mon Sep 17 00:00:00 2001 From: Corentin Date: Fri, 25 Jun 2021 03:27:52 +0900 Subject: [PATCH 6/8] Fix FLOPS calculation --- examples/python_naive_matmul/benchmark.py | 4 +- examples/python_naive_matmul/imp1_naive.py | 9 ++-- examples/python_naive_matmul/imp2_tiled.py | 15 +++--- .../python_naive_matmul/imp3_better_tiling.py | 47 +++++++++++-------- 4 files changed, 41 insertions(+), 34 deletions(-) diff --git a/examples/python_naive_matmul/benchmark.py b/examples/python_naive_matmul/benchmark.py index b10369d7c..8b92dda2f 100644 --- a/examples/python_naive_matmul/benchmark.py +++ b/examples/python_naive_matmul/benchmark.py @@ -34,13 +34,13 @@ def main(): matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) end_time = time.time() experiment_time = end_time - start_time - op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1) + op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1) if (tensor_out.data().reshape(tensor_shape) == mat_result).all(): print(f'From {MatMulOp.__module__} : {experiment_count} matmul time : ' f'{experiment_time * 1000:0.2f}ms => ' f'{experiment_count / experiment_time:0.2f}op/s or ' - f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') + f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS') else: print(f'Test failed => output tensor is wrong :\n{tensor_out.data().reshape(tensor_shape)}') diff --git a/examples/python_naive_matmul/imp1_naive.py b/examples/python_naive_matmul/imp1_naive.py index faefec563..420336260 100644 --- a/examples/python_naive_matmul/imp1_naive.py +++ b/examples/python_naive_matmul/imp1_naive.py @@ -76,6 +76,7 @@ void main() self.tensor_shape = tensor_shape self.params = params workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1) + print(f'{workgroup=} {self.local_size_x=} {self.local_size_y=}') self.algo = self.mgr.algorithm( params, # params self.compiled_shader, # spirv @@ -95,7 +96,7 @@ def main(): matmul_op = MatMulOp(mgr) - tensor_size = 512 + tensor_size = 4064 tensor_shape = [tensor_size, tensor_size] tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape))) tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape))) @@ -107,20 +108,20 @@ def main(): matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) - experiment_count = 1000 + experiment_count = 8 start_time = time.time() for _ in range(experiment_count): matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) end_time = time.time() experiment_time = end_time - start_time - op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1) + op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1) print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}') print(f'{experiment_count} matmul time : ' f'{experiment_time * 1000:0.2f}ms => ' f'{experiment_count / experiment_time:0.2f}op/s or ' - f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') + f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS') if __name__ == '__main__': diff --git a/examples/python_naive_matmul/imp2_tiled.py b/examples/python_naive_matmul/imp2_tiled.py index ed6a1ddf8..8ddf53745 100644 --- a/examples/python_naive_matmul/imp2_tiled.py +++ b/examples/python_naive_matmul/imp2_tiled.py @@ -43,8 +43,8 @@ void main() uint row = gl_LocalInvocationID.x; // 0 .. tile_size uint col = gl_LocalInvocationID.y; // 0 .. tile_size // gl_WorkGroupID : 0 .. tensor_size / tile_size - uint globalRow = ({tile_size} * gl_WorkGroupID.x) + row; - uint globalCol = ({tile_size} * gl_WorkGroupID.y) + col; + uint globalRow = {tile_size} * gl_WorkGroupID.x + row; + uint globalCol = {tile_size} * gl_WorkGroupID.y + col; uint tensor_size = uint(tensor_size_f); float acc = 0.0; @@ -64,8 +64,7 @@ void main() barrier(); }} - uint globalIndex = (tensor_size * globalCol) + globalRow; - out_tensor[globalIndex] = acc; + out_tensor[tensor_size * globalCol + globalRow] = acc; }}''' self.compiled_shader = kp.Shader.compile_source(self.shader) self.tensor_shape: tuple[int, int] = (0, 0) @@ -99,7 +98,7 @@ def main(): matmul_op = MatMulOp(mgr) - tensor_size = 512 + tensor_size = 4096 tensor_shape = [tensor_size, tensor_size] tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape))) tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape))) @@ -111,20 +110,20 @@ def main(): matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) - experiment_count = 1000 + experiment_count = 8 start_time = time.time() for _ in range(experiment_count): matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) end_time = time.time() experiment_time = end_time - start_time - op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1) + op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1) print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}') print(f'{experiment_count} matmul time : ' f'{experiment_time * 1000:0.2f}ms => ' f'{experiment_count / experiment_time:0.2f}op/s or ' - f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') + f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS') if __name__ == '__main__': diff --git a/examples/python_naive_matmul/imp3_better_tiling.py b/examples/python_naive_matmul/imp3_better_tiling.py index 6b3ada314..e97eb88d4 100644 --- a/examples/python_naive_matmul/imp3_better_tiling.py +++ b/examples/python_naive_matmul/imp3_better_tiling.py @@ -55,30 +55,37 @@ void main() uint tensor_size = uint(tensor_size_f); float acc[{thread_work_ratio}]; - for (uint l = 0u; l < {thread_work_ratio}; l++) - acc[l] = 0.0; + for(uint w = 0u; w < {thread_work_ratio}; w++) + acc[w] = 0.0; + /* uint numTiles = tensor_size / {tile_size}; for(uint t = 0u; t < numTiles; t++) {{ - uint tiledRow = {tile_size} * t + row; - uint tiledCol = {tile_size} * t + col; - sub_tensor_1[col + t * {self.local_size_y}][row] = in_tensor_1[ - (tiledCol + t * {self.local_size_y}) * tensor_size + globalRow]; - sub_tensor_2[col + t * {self.local_size_y}][row] = in_tensor_2[ - (globalCol + t * {self.local_size_y})* tensor_size + tiledRow]; + for(uint w = 0u; w < {thread_work_ratio}; w++) + {{ + uint tiledRow = {tile_size} * t + row; + uint tiledCol = {tile_size} * t + col; + sub_tensor_1[col + t * {self.local_size_y}][row] = in_tensor_1[ + (tiledCol + w * {self.local_size_y}) * tensor_size + globalRow]; + sub_tensor_2[col + t * {self.local_size_y}][row] = in_tensor_2[ + (globalCol + w * {self.local_size_y})* tensor_size + tiledRow]; + }} memoryBarrierShared(); barrier(); for(uint k = 0u; k < {tile_size}; k++) - for(uint l = 0u; l < {thread_work_ratio}; l++) - acc[l] += sub_tensor_1[k][row] * sub_tensor_2[col + l * {self.local_size_y}][k]; + for(uint w = 0u; w < {thread_work_ratio}; w++) + acc[w] += sub_tensor_1[k][row] * sub_tensor_2[col + w * {self.local_size_y}][k]; barrier(); + }}*/ + for(uint w = 0u; w < {thread_work_ratio}; w++) + {{ + //out_tensor[(globalCol + w * {self.local_size_y}) * tensor_size + globalRow] = acc[w]; + out_tensor[(globalCol + w * {self.local_size_y}) * tensor_size + globalRow] = w; }} - for(uint l = 0u; l < {thread_work_ratio}; l++) - out_tensor[(globalCol + l * {self.local_size_y}) * tensor_size + globalRow] = acc[l]; }}''' self.compiled_shader = kp.Shader.compile_source(self.shader) self.tensor_shape: tuple[int, int] = (0, 0) @@ -92,13 +99,13 @@ void main() if self.algo is None or self.tensor_shape != tensor_shape or self.params != params: self.tensor_shape = tensor_shape self.params = params - print( - tensor_shape, self.local_size_x, self.local_size_y, - (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1)) + # workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1) + workgroup = (2, 2, 1) + print(tensor_shape, self.local_size_x, self.local_size_y, workgroup) self.algo = self.mgr.algorithm( params, # params self.compiled_shader, # spirv - (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1), # workgroup + workgroup, # workgroup [float(tensor_shape[0])], # spec_consts []) # push_consts @@ -114,7 +121,7 @@ def main(): matmul_op = MatMulOp(mgr) - tensor_size = 512 + tensor_size = 4096 tensor_shape = [tensor_size, tensor_size] tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape))) tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape))) @@ -126,20 +133,20 @@ def main(): matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) - experiment_count = 1000 + experiment_count = 8 start_time = time.time() for _ in range(experiment_count): matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) end_time = time.time() experiment_time = end_time - start_time - op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1) + op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1) print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}') print(f'{experiment_count} matmul time : ' f'{experiment_time * 1000:0.2f}ms => ' f'{experiment_count / experiment_time:0.2f}op/s or ' - f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') + f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS') if __name__ == '__main__': From 3962ee70afc47bed63a90a675bbccd523275c402 Mon Sep 17 00:00:00 2001 From: Corentin Date: Mon, 28 Jun 2021 19:05:39 +0900 Subject: [PATCH 7/8] Fix small matrices matmuls, imp3 working but slow --- examples/python_naive_matmul/benchmark.py | 71 ++++---- examples/python_naive_matmul/imp1_naive.py | 13 +- examples/python_naive_matmul/imp2_tiled.py | 10 +- .../python_naive_matmul/imp2_tiled_debug.py | 156 ------------------ .../python_naive_matmul/imp3_better_tiling.py | 46 +++--- 5 files changed, 77 insertions(+), 219 deletions(-) delete mode 100644 examples/python_naive_matmul/imp2_tiled_debug.py diff --git a/examples/python_naive_matmul/benchmark.py b/examples/python_naive_matmul/benchmark.py index 8b92dda2f..768a854d4 100644 --- a/examples/python_naive_matmul/benchmark.py +++ b/examples/python_naive_matmul/benchmark.py @@ -8,41 +8,48 @@ from imp3_better_tiling import MatMulOp as MatMulOp3 def main(): - experiment_count = 1000 - tensor_size = 512 - tensor_shape = [tensor_size, tensor_size] - mat_1 = np.triu(np.ones(tensor_shape)) - mat_2 = np.triu(np.ones(tensor_shape)) - mat_result = mat_1 @ mat_2 - tensor_shape = [tensor_size, tensor_size] - - print(f'{tensor_shape} input tensors:\n' - f'{mat_1}\n' - f'{mat_2}\n') - print(f'Output :\n{mat_result}') - mgr = kp.Manager() - tensor_in_1 = mgr.tensor(mat_1) - tensor_in_2 = mgr.tensor(mat_2) - tensor_out = mgr.tensor(np.zeros(tensor_shape)) - for MatMulOp in [MatMulOp1, MatMulOp2, MatMulOp3]: - matmul_op = MatMulOp(mgr) - matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) + for tensor_size, experiment_count in [(512, 1000), (4096, 5)]: + tensor_shape = [tensor_size, tensor_size] + tensor_shape = [tensor_size, tensor_size] + mat_1 = np.triu(np.ones(tensor_shape)) + mat_2 = np.triu(np.ones(tensor_shape)) - start_time = time.time() - for _ in range(experiment_count): - matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) - end_time = time.time() - experiment_time = end_time - start_time - op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1) - - if (tensor_out.data().reshape(tensor_shape) == mat_result).all(): - print(f'From {MatMulOp.__module__} : {experiment_count} matmul time : ' - f'{experiment_time * 1000:0.2f}ms => ' - f'{experiment_count / experiment_time:0.2f}op/s or ' - f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS') + tensor_in_1 = mgr.tensor(mat_1) + tensor_in_2 = mgr.tensor(mat_2) + tensor_out = mgr.tensor(np.zeros(tensor_shape)) + if tensor_size <= 512: + mat_result = mat_1 @ mat_2 else: - print(f'Test failed => output tensor is wrong :\n{tensor_out.data().reshape(tensor_shape)}') + MatMulOp1(mgr)(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) + mat_result = tensor_out.data().reshape(tensor_shape) # CPU is too slow for big sizes + + print(f'{tensor_shape} input tensors:\n' + f'{mat_1}\n' + f'{mat_2}\n') + print(f'Output :\n{mat_result}') + + for MatMulOp in [MatMulOp1, MatMulOp2, MatMulOp3]: + tensor_out.data()[:] = 0 + mgr.sequence().record(kp.OpTensorSyncDevice([tensor_out])) + matmul_op = MatMulOp(mgr) + matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) + + start_time = time.time() + for _ in range(experiment_count): + matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) + end_time = time.time() + experiment_time = end_time - start_time + op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1) + + # print(tensor_out.data().reshape(tensor_shape)) + if (tensor_out.data().reshape(tensor_shape) == mat_result).all(): + print(f'From {MatMulOp.__module__} : {experiment_count} matmul time : ' + f'{experiment_time * 1000:0.2f}ms => ' + f'{experiment_count / experiment_time:0.2f}op/s or ' + f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS') + else: + print(f'Test failed => output tensor is wrong :\n{tensor_out.data().reshape(tensor_shape)}') if __name__ == '__main__': diff --git a/examples/python_naive_matmul/imp1_naive.py b/examples/python_naive_matmul/imp1_naive.py index 420336260..a791662d2 100644 --- a/examples/python_naive_matmul/imp1_naive.py +++ b/examples/python_naive_matmul/imp1_naive.py @@ -41,7 +41,7 @@ class MatMulOp: self.local_size_x = local_size_x self.local_size_y = local_size_y - self.shader = f''' + self.shader = ''' #version 450 layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in; @@ -63,7 +63,8 @@ void main() acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k]; out_tensor[(globalCol * tensor_size) + globalRow] = acc; }}''' - self.compiled_shader = kp.Shader.compile_source(self.shader) + self.compiled_shader = kp.Shader.compile_source(self.shader.format( + local_size_x=self.local_size_x, local_size_y=self.local_size_y)) self.tensor_shape: tuple[int, int] = (0, 0) self.params: list[kp.Tensor] = [] self.algo = None @@ -75,7 +76,11 @@ void main() if self.algo is None or self.tensor_shape != tensor_shape or self.params != params: self.tensor_shape = tensor_shape self.params = params - workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1) + local_size_x = min(self.local_size_x, tensor_shape[0]) + local_size_y = min(self.local_size_y, tensor_shape[1]) + self.compiled_shader = kp.Shader.compile_source(self.shader.format( + local_size_x=local_size_x, local_size_y=local_size_y)) + workgroup = (tensor_shape[0] // local_size_x, tensor_shape[1] // local_size_y, 1) print(f'{workgroup=} {self.local_size_x=} {self.local_size_y=}') self.algo = self.mgr.algorithm( params, # params @@ -85,7 +90,7 @@ void main() []) # push_consts (self.mgr.sequence() - .record(kp.OpTensorSyncDevice(self.params)) + .record(kp.OpTensorSyncDevice([tensor_in_1, tensor_in_2])) .record(kp.OpAlgoDispatch(self.algo)) .record(kp.OpTensorSyncLocal([tensor_out])) .eval()) diff --git a/examples/python_naive_matmul/imp2_tiled.py b/examples/python_naive_matmul/imp2_tiled.py index 8ddf53745..1ac13e858 100644 --- a/examples/python_naive_matmul/imp2_tiled.py +++ b/examples/python_naive_matmul/imp2_tiled.py @@ -24,7 +24,7 @@ class MatMulOp: assert tile_size <= max_workgroup_size[1] self.tile_size = tile_size - self.shader = f''' + self.shader = ''' #version 450 layout (local_size_x = {tile_size}, local_size_y = {tile_size}) in; @@ -66,7 +66,7 @@ void main() }} out_tensor[tensor_size * globalCol + globalRow] = acc; }}''' - self.compiled_shader = kp.Shader.compile_source(self.shader) + self.compiled_shader = kp.Shader.compile_source(self.shader.format(tile_size=tile_size)) self.tensor_shape: tuple[int, int] = (0, 0) self.params: list[kp.Tensor] = [] self.algo = None @@ -78,7 +78,9 @@ void main() if self.algo is None or self.tensor_shape != tensor_shape or self.params != params: self.tensor_shape = tensor_shape self.params = params - workgroup = (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1) + tile_size = min(tensor_shape[0], tensor_shape[1], self.tile_size) + self.compiled_shader = kp.Shader.compile_source(self.shader.format(tile_size=tile_size)) + workgroup = [tensor_shape[0] // tile_size, tensor_shape[1] // tile_size, 1] self.algo = self.mgr.algorithm( params, # params self.compiled_shader, # spirv @@ -87,7 +89,7 @@ void main() []) # push_consts (self.mgr.sequence() - .record(kp.OpTensorSyncDevice(self.params)) + .record(kp.OpTensorSyncDevice([tensor_in_1, tensor_in_2])) .record(kp.OpAlgoDispatch(self.algo)) .record(kp.OpTensorSyncLocal([tensor_out])) .eval()) diff --git a/examples/python_naive_matmul/imp2_tiled_debug.py b/examples/python_naive_matmul/imp2_tiled_debug.py deleted file mode 100644 index b10cbfd7d..000000000 --- a/examples/python_naive_matmul/imp2_tiled_debug.py +++ /dev/null @@ -1,156 +0,0 @@ -import time - -import kp -import numpy as np - - -class MatMulOp: - def __init__(self, manager: kp.Manager, tile_size: int = -1): - self.mgr = manager - - props = self.mgr.get_device_properties() - max_workgroup_invocation = props['max_work_group_invocations'] - max_workgroup_size = props['max_work_group_size'] - if tile_size < 0: - tile_size = 1 - while (4 * tile_size * tile_size <= max_workgroup_invocation - and 2 * tile_size <= max_workgroup_size[0] - and 2 * tile_size <= max_workgroup_size[1]): - tile_size *= 2 - - assert tile_size > 0 - assert tile_size * tile_size <= max_workgroup_invocation - assert tile_size <= max_workgroup_size[0] - assert tile_size <= max_workgroup_size[1] - self.tile_size = tile_size - - print(f'{tile_size=}') - self.shader = f''' -#version 450 - -layout (local_size_x = {tile_size}, local_size_y = {tile_size}) in; - -layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }}; -layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }}; -layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }}; -layout (set = 0, binding = 3) writeonly buffer buf_test1_tensor {{ float test1_tensor[]; }}; -layout (set = 0, binding = 4) writeonly buffer buf_test2_tensor {{ float test2_tensor[]; }}; -layout (set = 0, binding = 5) writeonly buffer buf_test3_tensor {{ float test3_tensor[]; }}; -layout (set = 0, binding = 6) writeonly buffer buf_test4_tensor {{ float test4_tensor[]; }}; - -layout (constant_id = 0) const float tensor_size_f = 0; - -shared float sub_tensor_1[{tile_size}][{tile_size}]; -shared float sub_tensor_2[{tile_size}][{tile_size}]; - -void main() -{{ - uint row = gl_LocalInvocationID.x; // 0 .. tile_size - uint col = gl_LocalInvocationID.y; // 0 .. tile_size - // gl_WorkGroupID : 0 .. tensor_size / tile_size - uint globalRow = ({tile_size} * gl_WorkGroupID.x) + row; - uint globalCol = ({tile_size} * gl_WorkGroupID.y) + col; - - uint tensor_size = uint(tensor_size_f); - float acc = 0.0; - uint numTiles = tensor_size / {tile_size}; - for(uint t = 0u; t < numTiles; t++) - {{ - uint tiledRow = ({tile_size} * t) + row; - uint tiledCol = ({tile_size} * t) + col; - sub_tensor_1[col][row] = in_tensor_1[(tiledCol * tensor_size) + globalRow]; - sub_tensor_2[col][row] = in_tensor_2[(globalCol * tensor_size) + tiledRow]; - - memoryBarrierShared(); - barrier(); - - for(uint k = 0u; k < {tile_size}; k++) - acc += sub_tensor_1[k][row] * sub_tensor_2[col][k]; - - barrier(); - }} - uint globalIndex = (tensor_size * globalCol) + globalRow; - out_tensor[globalIndex] = acc; - test1_tensor[globalIndex] = row; - test2_tensor[globalIndex] = col; - test3_tensor[globalIndex] = gl_WorkGroupID.x; - test4_tensor[globalIndex] = gl_WorkGroupID.y; -}}''' - print(self.shader) - self.compiled_shader = kp.Shader.compile_source(self.shader) - self.tensor_shape: tuple[int, int] = (0, 0) - self.params: list[kp.Tensor] = [] - self.algo = None - - def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor, - tensor_out: kp.Tensor, tensor_test_1: kp.Tensor, tensor_test_2: kp.Tensor, - tensor_test_3: kp.Tensor, tensor_test_4: kp.Tensor): - # params = [tensor_in_1, tensor_in_2, tensor_out] - params = [tensor_in_1, tensor_in_2, tensor_out, tensor_test_1, tensor_test_2, tensor_test_3, tensor_test_4] - - if self.algo is None or self.tensor_shape != tensor_shape or self.params != params: - self.tensor_shape = tensor_shape - self.params = params - workgroup = (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1) - # workgroup = (2, 2, 1) - print(f'{float(tensor_shape[0])=} {self.tile_size=} {workgroup=}') - self.algo = self.mgr.algorithm( - params, # params - self.compiled_shader, # spirv - workgroup, # workgroup - [float(tensor_shape[0])], # spec_consts - []) # push_consts - - (self.mgr.sequence() - .record(kp.OpTensorSyncDevice(self.params)) - .record(kp.OpAlgoDispatch(self.algo)) - .record(kp.OpTensorSyncLocal(self.params[2:])) - # .record(kp.OpTensorSyncLocal([tensor_out])) - .eval()) - - -def main(): - mgr = kp.Manager() - - matmul_op = MatMulOp(mgr, 4) - - tensor_size = 8 - tensor_shape = [tensor_size, tensor_size] - tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape))) - tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape))) - tensor_out = mgr.tensor(np.zeros(tensor_shape)) - tensor_test_1 = mgr.tensor(np.zeros(tensor_shape)) - tensor_test_2 = mgr.tensor(np.zeros(tensor_shape)) - tensor_test_3 = mgr.tensor(np.zeros(tensor_shape)) - tensor_test_4 = mgr.tensor(np.zeros(tensor_shape)) - - print(f'{tensor_shape} input tensors:\n' - f'{tensor_in_1.data().reshape(tensor_shape)}\n' - f'{tensor_in_2.data().reshape(tensor_shape)}\n') - - # matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) - matmul_op(tensor_shape, tensor_in_1, tensor_in_2, - tensor_out, tensor_test_1, tensor_test_2, tensor_test_3, tensor_test_4) - - # experiment_count = 10 - # start_time = time.time() - # for _ in range(experiment_count): - # matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) - # end_time = time.time() - # experiment_time = end_time - start_time - # op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1) - - print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}') - print(f'test_1 :\n{tensor_test_1.data().reshape(tensor_shape)}') - print(f'test_2 :\n{tensor_test_2.data().reshape(tensor_shape)}') - print(f'test_3 :\n{tensor_test_3.data().reshape(tensor_shape)}') - print(f'test_4 :\n{tensor_test_4.data().reshape(tensor_shape)}') - - # print(f'{experiment_count} matmul time : ' - # f'{experiment_time * 1000:0.2f}ms => ' - # f'{experiment_count / experiment_time:0.2f}op/s or ' - # f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') - - -if __name__ == '__main__': - main() diff --git a/examples/python_naive_matmul/imp3_better_tiling.py b/examples/python_naive_matmul/imp3_better_tiling.py index e97eb88d4..8cd44277b 100644 --- a/examples/python_naive_matmul/imp3_better_tiling.py +++ b/examples/python_naive_matmul/imp3_better_tiling.py @@ -5,7 +5,7 @@ import numpy as np class MatMulOp: - def __init__(self, manager: kp.Manager, tile_size: int = -1, thread_work_ratio: int = 8): + def __init__(self, manager: kp.Manager, tile_size: int = -1, thread_work_ratio: int = 16): self.mgr = manager props = self.mgr.get_device_properties() @@ -14,9 +14,9 @@ class MatMulOp: if tile_size < 0: tile_size = 1 local_size_y = tile_size // thread_work_ratio - while (4 * tile_size * local_size_y <= max_workgroup_invocation + while (4 * tile_size * tile_size <= max_workgroup_invocation and 2 * tile_size <= max_workgroup_size[0] - and 2 * local_size_y <= max_workgroup_size[1]): + and 2 * tile_size <= max_workgroup_size[1]): tile_size *= 2 local_size_y = tile_size // thread_work_ratio else: @@ -32,10 +32,10 @@ class MatMulOp: self.local_size_x = tile_size self.local_size_y = tile_size // thread_work_ratio - self.shader = f''' + self.shader = ''' #version 450 -layout (local_size_x = {tile_size}, local_size_y = {self.local_size_y}) in; +layout (local_size_x = {tile_size}, local_size_y = {local_size_y}) in; layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }}; layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }}; @@ -51,14 +51,13 @@ void main() uint row = gl_LocalInvocationID.x; uint col = gl_LocalInvocationID.y; uint globalRow = {tile_size} * gl_WorkGroupID.x + row; - uint globalCol = {tile_size} * gl_WorkGroupID.y + row; + uint globalCol = {tile_size} * gl_WorkGroupID.y + col; uint tensor_size = uint(tensor_size_f); float acc[{thread_work_ratio}]; for(uint w = 0u; w < {thread_work_ratio}; w++) acc[w] = 0.0; - /* uint numTiles = tensor_size / {tile_size}; for(uint t = 0u; t < numTiles; t++) {{ @@ -66,10 +65,10 @@ void main() {{ uint tiledRow = {tile_size} * t + row; uint tiledCol = {tile_size} * t + col; - sub_tensor_1[col + t * {self.local_size_y}][row] = in_tensor_1[ - (tiledCol + w * {self.local_size_y}) * tensor_size + globalRow]; - sub_tensor_2[col + t * {self.local_size_y}][row] = in_tensor_2[ - (globalCol + w * {self.local_size_y})* tensor_size + tiledRow]; + sub_tensor_1[col + w * {local_size_y}][row] = in_tensor_1[ + (tiledCol + w * {local_size_y}) * tensor_size + globalRow]; + sub_tensor_2[col + w * {local_size_y}][row] = in_tensor_2[ + (globalCol + w * {local_size_y})* tensor_size + tiledRow]; }} memoryBarrierShared(); @@ -77,17 +76,15 @@ void main() for(uint k = 0u; k < {tile_size}; k++) for(uint w = 0u; w < {thread_work_ratio}; w++) - acc[w] += sub_tensor_1[k][row] * sub_tensor_2[col + w * {self.local_size_y}][k]; + acc[w] += sub_tensor_1[k][row] * sub_tensor_2[col + w * {local_size_y}][k]; barrier(); - }}*/ - for(uint w = 0u; w < {thread_work_ratio}; w++) - {{ - //out_tensor[(globalCol + w * {self.local_size_y}) * tensor_size + globalRow] = acc[w]; - out_tensor[(globalCol + w * {self.local_size_y}) * tensor_size + globalRow] = w; }} + for(uint w = 0u; w < {thread_work_ratio}; w++) + out_tensor[(globalCol + w * {local_size_y}) * tensor_size + globalRow] = acc[w]; }}''' - self.compiled_shader = kp.Shader.compile_source(self.shader) + self.compiled_shader = kp.Shader.compile_source(self.shader.format( + tile_size=tile_size, thread_work_ratio=thread_work_ratio, local_size_y=local_size_y)) self.tensor_shape: tuple[int, int] = (0, 0) self.params: list[kp.Tensor] = [] self.algo = None @@ -99,9 +96,12 @@ void main() if self.algo is None or self.tensor_shape != tensor_shape or self.params != params: self.tensor_shape = tensor_shape self.params = params - # workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1) - workgroup = (2, 2, 1) - print(tensor_shape, self.local_size_x, self.local_size_y, workgroup) + tile_size = min(self.tensor_shape[0], self.tile_size) + thread_work_ratio = min(self.tensor_shape[1] // self.tile_size, self.thread_work_ratio) + local_size_y = tile_size // thread_work_ratio + self.compiled_shader = kp.Shader.compile_source(self.shader.format( + tile_size=tile_size, thread_work_ratio=thread_work_ratio, local_size_y=local_size_y)) + workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1) self.algo = self.mgr.algorithm( params, # params self.compiled_shader, # spirv @@ -110,7 +110,7 @@ void main() []) # push_consts (self.mgr.sequence() - .record(kp.OpTensorSyncDevice(self.params)) + .record(kp.OpTensorSyncDevice([tensor_in_1, tensor_in_2])) .record(kp.OpAlgoDispatch(self.algo)) .record(kp.OpTensorSyncLocal([tensor_out])) .eval()) @@ -133,7 +133,7 @@ def main(): matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) - experiment_count = 8 + experiment_count = 2 start_time = time.time() for _ in range(experiment_count): matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) From 137ef147db7c64a11f50687b7e3b7d053063735e Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Sun, 25 Jul 2021 18:45:28 +0100 Subject: [PATCH 8/8] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a3a363770..6d195c8f6 100644 --- a/README.md +++ b/README.md @@ -235,7 +235,7 @@ You are able to try out the interactive Colab Notebooks which allow you to use a -
Try the interactive C++ Colab from Blog Post
+
Try the interactive C++ Colab from Blog Post