From 425380f1a11b3df97f58b717595ebefdd64f9cfe Mon Sep 17 00:00:00 2001 From: Corentin Date: Thu, 24 Jun 2021 18:36:10 +0900 Subject: [PATCH] Automated work group parameters and tiling matmul --- .../python_naive_matmul/1_naive_matmul.py | 75 ++++++++--- .../python_naive_matmul/2_tiled_matmul.py | 127 ++++++++++++++++++ examples/python_naive_matmul/README.md | 2 +- 3 files changed, 183 insertions(+), 21 deletions(-) create mode 100644 examples/python_naive_matmul/2_tiled_matmul.py diff --git a/examples/python_naive_matmul/1_naive_matmul.py b/examples/python_naive_matmul/1_naive_matmul.py index e7fe092a8..d5fe70cba 100644 --- a/examples/python_naive_matmul/1_naive_matmul.py +++ b/examples/python_naive_matmul/1_naive_matmul.py @@ -5,35 +5,64 @@ import numpy as np class MatMulOp: - def __init__(self, manager: kp.Manager, local_size_x: int = 1, local_size_y: int = 1): + def __init__(self, manager: kp.Manager, local_size_x: int = -1, local_size_y: int = -1): self.mgr = manager - assert(local_size_x > 0) - assert(local_size_y > 0) + + props = self.mgr.get_device_properties() + max_workgroup_invocation = props['max_work_group_invocations'] + max_workgroup_size = props['max_work_group_size'] + if local_size_x < 1: + if local_size_y > 0: + local_size_x = 1 + while (2 * local_size_x * local_size_y <= max_workgroup_invocation + and 2 * local_size_x <= max_workgroup_size[0]): + local_size_x *= 2 + else: + local_size_x = 1 + local_size_y = 1 + while 2 * local_size_x * local_size_y <= max_workgroup_invocation: + if 2 * local_size_x <= max_workgroup_size[0]: + local_size_x *= 2 + if 2 * local_size_y <= max_workgroup_size[1]: + local_size_y *= 2 + elif 2 * local_size_x > max_workgroup_size[0]: # stop if neither x nor y can be double + break + elif local_size_y < 0: + local_size_y = 1 + while (2 * local_size_x * local_size_y <= max_workgroup_invocation + and 2 * local_size_x <= max_workgroup_size[0]): + local_size_y *= 2 + + assert local_size_x > 0 + assert local_size_y > 0 + assert local_size_x * local_size_y <= max_workgroup_invocation + assert local_size_x <= max_workgroup_size[0] + assert local_size_y <= max_workgroup_size[1] self.local_size_x = local_size_x self.local_size_y = local_size_y self.shader = kp.Shader.compile_source(f''' - #version 450 +#version 450 - layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in; +layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in; - layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }}; - layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }}; - layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }}; +layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }}; +layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }}; +layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }}; - layout (constant_id = 0) const float tensor_size_f = 0; +layout (constant_id = 0) const float tensor_size_f = 0; - void main() - {{ - uint globalRow = gl_GlobalInvocationID.x; - uint globalCol = gl_GlobalInvocationID.y; - uint tensor_size = uint(tensor_size_f); - float acc = 0.0; - for(uint k = 0u; k < tensor_size; k++) - acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k]; - out_tensor[(globalCol * tensor_size) + globalRow] = acc; - }}''') +void main() +{{ + uint globalRow = gl_GlobalInvocationID.x; + uint globalCol = gl_GlobalInvocationID.y; + uint tensor_size = uint(tensor_size_f); + float acc = 0.0; + for(uint k = 0u; k < tensor_size; k++) + acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k]; + out_tensor[(globalCol * tensor_size) + globalRow] = acc; +}}''') self.tensor_shape: tuple[int, int] = (0, 0) self.params: list[kp.Tensor] = [] self.algo = None @@ -62,7 +91,7 @@ class MatMulOp: def main(): mgr = kp.Manager() - matmul_op = MatMulOp(mgr, 64, 64) + matmul_op = MatMulOp(mgr) tensor_size = 512 tensor_shape = [tensor_size, tensor_size] @@ -92,5 +121,11 @@ def main(): f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') +def test(): + main() + + if __name__ == '__main__': main() +else: + test() diff --git a/examples/python_naive_matmul/2_tiled_matmul.py b/examples/python_naive_matmul/2_tiled_matmul.py new file mode 100644 index 000000000..be403a46a --- /dev/null +++ b/examples/python_naive_matmul/2_tiled_matmul.py @@ -0,0 +1,127 @@ +import time + +import kp +import numpy as np + + +class MatMulOp: + def __init__(self, manager: kp.Manager, tile_size: int = -1): + self.mgr = manager + + props = self.mgr.get_device_properties() + max_workgroup_invocation = props['max_work_group_invocations'] + max_workgroup_size = props['max_work_group_size'] + if tile_size < 0: + tile_size = 1 + while (2 * tile_size * tile_size <= max_workgroup_invocation + and 2 * tile_size <= max_workgroup_size[0] + and 2 * tile_size <= max_workgroup_size[1]): + tile_size *= 2 + + assert tile_size > 0 + assert tile_size * tile_size <= max_workgroup_invocation + assert tile_size <= max_workgroup_size[0] + assert tile_size <= max_workgroup_size[1] + self.tile_size = tile_size + + self.shader = kp.Shader.compile_source(f''' +#version 450 + +layout (local_size_x = {tile_size}, local_size_y = {tile_size}) in; + +layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }}; +layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }}; +layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }}; + +layout (constant_id = 0) const float tensor_size_f = 0; + +shared float sub_tensor_1[{tile_size}][{tile_size}]; +shared float sub_tensor_2[{tile_size}][{tile_size}]; + +void main() +{{ + uint row = gl_GlobalInvocationID.x; + uint col = gl_GlobalInvocationID.y; + uint globalRow = {tile_size} * gl_WorkGroupID.x + row; + uint globalCol = {tile_size} * gl_WorkGroupID.y + row; + + uint tensor_size = uint(tensor_size_f); + float acc = 0.0; + uint numTiles = tensor_size / {tile_size}; + for(uint t = 0u; t < numTiles; t++) + {{ + uint tiledRow = {tile_size} * t + row; + uint tiledCol = {tile_size} * t + col; + sub_tensor_1[col][row] = in_tensor_1[tiledCol * tensor_size + globalRow]; + sub_tensor_2[col][row] = in_tensor_2[globalCol * tensor_size + tiledRow]; + + memoryBarrierShared(); + barrier(); + + for(uint k = 0u; k < {tile_size}; k++) + acc += sub_tensor_1[k][row] * sub_tensor_2[col][k]; + + barrier(); + }} + out_tensor[(globalCol * tensor_size) + globalRow] = acc; +}}''') + self.tensor_shape: tuple[int, int] = (0, 0) + self.params: list[kp.Tensor] = [] + self.algo = None + + def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor, + tensor_out: kp.Tensor): + params = [tensor_in_1, tensor_in_2, tensor_out] + + if self.algo is None or self.tensor_shape != tensor_shape or self.params != params: + self.tensor_shape = tensor_shape + self.params = params + self.algo = self.mgr.algorithm( + params, # params + self.shader, # spirv + (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1), # workgroup + [float(tensor_shape[0])], # spec_consts + []) # push_consts + + (self.mgr.sequence() + .record(kp.OpTensorSyncDevice(self.params)) + .record(kp.OpAlgoDispatch(self.algo)) + .record(kp.OpTensorSyncLocal(self.params)) + .eval()) + + +def main(): + mgr = kp.Manager() + + matmul_op = MatMulOp(mgr) + + tensor_size = 512 + tensor_shape = [tensor_size, tensor_size] + tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape))) + tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape))) + tensor_out = mgr.tensor(np.zeros(tensor_shape)) + + print(f'{tensor_shape} input tensors:\n' + f'{tensor_in_1.data().reshape(tensor_shape)}\n' + f'{tensor_in_2.data().reshape(tensor_shape)}\n') + + matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) + + experiment_count = 1000 + start_time = time.time() + for _ in range(experiment_count): + matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) + end_time = time.time() + experiment_time = end_time - start_time + op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1) + + print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}') + + print(f'{experiment_count} matmul time : ' + f'{experiment_time * 1000:0.2f}ms => ' + f'{experiment_count / experiment_time:0.2f}op/s or ' + f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') + + +if __name__ == '__main__': + main() diff --git a/examples/python_naive_matmul/README.md b/examples/python_naive_matmul/README.md index 26a89172b..0688bb079 100644 --- a/examples/python_naive_matmul/README.md +++ b/examples/python_naive_matmul/README.md @@ -1,6 +1,6 @@ # Naive Matmul Implementation -This demonstrate a basic matmul implementation using Python and vulkan-kompute +This demonstrate a basic matmul implementation using Python and vulkan-kompute. Many thanks for the very helpful [SGEMM in WebGL2-compute](https://www.ibiblio.org/e-notes/webgl/gpu/mul/sgemm.htm) article on the public library [ibiblio.org](https://www.ibiblio.org/). To test the implementation simply run the `matmul.py` script :