Automated work group parameters and tiling matmul

This commit is contained in:
Corentin 2021-06-24 18:36:10 +09:00
parent 8c9ad5f2a4
commit 425380f1a1
3 changed files with 183 additions and 21 deletions

View file

@ -5,35 +5,64 @@ import numpy as np
class MatMulOp:
def __init__(self, manager: kp.Manager, local_size_x: int = 1, local_size_y: int = 1):
def __init__(self, manager: kp.Manager, local_size_x: int = -1, local_size_y: int = -1):
self.mgr = manager
assert(local_size_x > 0)
assert(local_size_y > 0)
props = self.mgr.get_device_properties()
max_workgroup_invocation = props['max_work_group_invocations']
max_workgroup_size = props['max_work_group_size']
if local_size_x < 1:
if local_size_y > 0:
local_size_x = 1
while (2 * local_size_x * local_size_y <= max_workgroup_invocation
and 2 * local_size_x <= max_workgroup_size[0]):
local_size_x *= 2
else:
local_size_x = 1
local_size_y = 1
while 2 * local_size_x * local_size_y <= max_workgroup_invocation:
if 2 * local_size_x <= max_workgroup_size[0]:
local_size_x *= 2
if 2 * local_size_y <= max_workgroup_size[1]:
local_size_y *= 2
elif 2 * local_size_x > max_workgroup_size[0]: # stop if neither x nor y can be double
break
elif local_size_y < 0:
local_size_y = 1
while (2 * local_size_x * local_size_y <= max_workgroup_invocation
and 2 * local_size_x <= max_workgroup_size[0]):
local_size_y *= 2
assert local_size_x > 0
assert local_size_y > 0
assert local_size_x * local_size_y <= max_workgroup_invocation
assert local_size_x <= max_workgroup_size[0]
assert local_size_y <= max_workgroup_size[1]
self.local_size_x = local_size_x
self.local_size_y = local_size_y
self.shader = kp.Shader.compile_source(f'''
#version 450
#version 450
layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in;
layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in;
layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
layout (constant_id = 0) const float tensor_size_f = 0;
layout (constant_id = 0) const float tensor_size_f = 0;
void main()
{{
uint globalRow = gl_GlobalInvocationID.x;
uint globalCol = gl_GlobalInvocationID.y;
uint tensor_size = uint(tensor_size_f);
float acc = 0.0;
for(uint k = 0u; k < tensor_size; k++)
acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k];
out_tensor[(globalCol * tensor_size) + globalRow] = acc;
}}''')
void main()
{{
uint globalRow = gl_GlobalInvocationID.x;
uint globalCol = gl_GlobalInvocationID.y;
uint tensor_size = uint(tensor_size_f);
float acc = 0.0;
for(uint k = 0u; k < tensor_size; k++)
acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k];
out_tensor[(globalCol * tensor_size) + globalRow] = acc;
}}''')
self.tensor_shape: tuple[int, int] = (0, 0)
self.params: list[kp.Tensor] = []
self.algo = None
@ -62,7 +91,7 @@ class MatMulOp:
def main():
mgr = kp.Manager()
matmul_op = MatMulOp(mgr, 64, 64)
matmul_op = MatMulOp(mgr)
tensor_size = 512
tensor_shape = [tensor_size, tensor_size]
@ -92,5 +121,11 @@ def main():
f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
def test():
main()
if __name__ == '__main__':
main()
else:
test()

View file

@ -0,0 +1,127 @@
import time
import kp
import numpy as np
class MatMulOp:
def __init__(self, manager: kp.Manager, tile_size: int = -1):
self.mgr = manager
props = self.mgr.get_device_properties()
max_workgroup_invocation = props['max_work_group_invocations']
max_workgroup_size = props['max_work_group_size']
if tile_size < 0:
tile_size = 1
while (2 * tile_size * tile_size <= max_workgroup_invocation
and 2 * tile_size <= max_workgroup_size[0]
and 2 * tile_size <= max_workgroup_size[1]):
tile_size *= 2
assert tile_size > 0
assert tile_size * tile_size <= max_workgroup_invocation
assert tile_size <= max_workgroup_size[0]
assert tile_size <= max_workgroup_size[1]
self.tile_size = tile_size
self.shader = kp.Shader.compile_source(f'''
#version 450
layout (local_size_x = {tile_size}, local_size_y = {tile_size}) in;
layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
layout (constant_id = 0) const float tensor_size_f = 0;
shared float sub_tensor_1[{tile_size}][{tile_size}];
shared float sub_tensor_2[{tile_size}][{tile_size}];
void main()
{{
uint row = gl_GlobalInvocationID.x;
uint col = gl_GlobalInvocationID.y;
uint globalRow = {tile_size} * gl_WorkGroupID.x + row;
uint globalCol = {tile_size} * gl_WorkGroupID.y + row;
uint tensor_size = uint(tensor_size_f);
float acc = 0.0;
uint numTiles = tensor_size / {tile_size};
for(uint t = 0u; t < numTiles; t++)
{{
uint tiledRow = {tile_size} * t + row;
uint tiledCol = {tile_size} * t + col;
sub_tensor_1[col][row] = in_tensor_1[tiledCol * tensor_size + globalRow];
sub_tensor_2[col][row] = in_tensor_2[globalCol * tensor_size + tiledRow];
memoryBarrierShared();
barrier();
for(uint k = 0u; k < {tile_size}; k++)
acc += sub_tensor_1[k][row] * sub_tensor_2[col][k];
barrier();
}}
out_tensor[(globalCol * tensor_size) + globalRow] = acc;
}}''')
self.tensor_shape: tuple[int, int] = (0, 0)
self.params: list[kp.Tensor] = []
self.algo = None
def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor,
tensor_out: kp.Tensor):
params = [tensor_in_1, tensor_in_2, tensor_out]
if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
self.tensor_shape = tensor_shape
self.params = params
self.algo = self.mgr.algorithm(
params, # params
self.shader, # spirv
(tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1), # workgroup
[float(tensor_shape[0])], # spec_consts
[]) # push_consts
(self.mgr.sequence()
.record(kp.OpTensorSyncDevice(self.params))
.record(kp.OpAlgoDispatch(self.algo))
.record(kp.OpTensorSyncLocal(self.params))
.eval())
def main():
mgr = kp.Manager()
matmul_op = MatMulOp(mgr)
tensor_size = 512
tensor_shape = [tensor_size, tensor_size]
tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
tensor_out = mgr.tensor(np.zeros(tensor_shape))
print(f'{tensor_shape} input tensors:\n'
f'{tensor_in_1.data().reshape(tensor_shape)}\n'
f'{tensor_in_2.data().reshape(tensor_shape)}\n')
matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
experiment_count = 1000
start_time = time.time()
for _ in range(experiment_count):
matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
end_time = time.time()
experiment_time = end_time - start_time
op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
print(f'{experiment_count} matmul time : '
f'{experiment_time * 1000:0.2f}ms => '
f'{experiment_count / experiment_time:0.2f}op/s or '
f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
if __name__ == '__main__':
main()

View file

@ -1,6 +1,6 @@
# Naive Matmul Implementation
This demonstrate a basic matmul implementation using Python and vulkan-kompute
This demonstrate a basic matmul implementation using Python and vulkan-kompute. Many thanks for the very helpful [SGEMM in WebGL2-compute](https://www.ibiblio.org/e-notes/webgl/gpu/mul/sgemm.htm) article on the public library [ibiblio.org](https://www.ibiblio.org/).
To test the implementation simply run the `matmul.py` script :