From a3f7793c17fb2fd8ba3608e809caf31e29195058 Mon Sep 17 00:00:00 2001 From: Corentin Date: Fri, 25 Jun 2021 03:27:52 +0900 Subject: [PATCH] Fix FLOPS calculation --- examples/python_naive_matmul/benchmark.py | 4 +- examples/python_naive_matmul/imp1_naive.py | 9 ++-- examples/python_naive_matmul/imp2_tiled.py | 15 +++--- .../python_naive_matmul/imp3_better_tiling.py | 47 +++++++++++-------- 4 files changed, 41 insertions(+), 34 deletions(-) diff --git a/examples/python_naive_matmul/benchmark.py b/examples/python_naive_matmul/benchmark.py index b10369d7c..8b92dda2f 100644 --- a/examples/python_naive_matmul/benchmark.py +++ b/examples/python_naive_matmul/benchmark.py @@ -34,13 +34,13 @@ def main(): matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) end_time = time.time() experiment_time = end_time - start_time - op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1) + op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1) if (tensor_out.data().reshape(tensor_shape) == mat_result).all(): print(f'From {MatMulOp.__module__} : {experiment_count} matmul time : ' f'{experiment_time * 1000:0.2f}ms => ' f'{experiment_count / experiment_time:0.2f}op/s or ' - f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') + f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS') else: print(f'Test failed => output tensor is wrong :\n{tensor_out.data().reshape(tensor_shape)}') diff --git a/examples/python_naive_matmul/imp1_naive.py b/examples/python_naive_matmul/imp1_naive.py index faefec563..420336260 100644 --- a/examples/python_naive_matmul/imp1_naive.py +++ b/examples/python_naive_matmul/imp1_naive.py @@ -76,6 +76,7 @@ void main() self.tensor_shape = tensor_shape self.params = params workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1) + print(f'{workgroup=} {self.local_size_x=} {self.local_size_y=}') self.algo = self.mgr.algorithm( params, # params self.compiled_shader, # spirv @@ -95,7 +96,7 @@ def main(): matmul_op = MatMulOp(mgr) - tensor_size = 512 + tensor_size = 4064 tensor_shape = [tensor_size, tensor_size] tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape))) tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape))) @@ -107,20 +108,20 @@ def main(): matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) - experiment_count = 1000 + experiment_count = 8 start_time = time.time() for _ in range(experiment_count): matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) end_time = time.time() experiment_time = end_time - start_time - op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1) + op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1) print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}') print(f'{experiment_count} matmul time : ' f'{experiment_time * 1000:0.2f}ms => ' f'{experiment_count / experiment_time:0.2f}op/s or ' - f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') + f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS') if __name__ == '__main__': diff --git a/examples/python_naive_matmul/imp2_tiled.py b/examples/python_naive_matmul/imp2_tiled.py index ed6a1ddf8..8ddf53745 100644 --- a/examples/python_naive_matmul/imp2_tiled.py +++ b/examples/python_naive_matmul/imp2_tiled.py @@ -43,8 +43,8 @@ void main() uint row = gl_LocalInvocationID.x; // 0 .. tile_size uint col = gl_LocalInvocationID.y; // 0 .. tile_size // gl_WorkGroupID : 0 .. tensor_size / tile_size - uint globalRow = ({tile_size} * gl_WorkGroupID.x) + row; - uint globalCol = ({tile_size} * gl_WorkGroupID.y) + col; + uint globalRow = {tile_size} * gl_WorkGroupID.x + row; + uint globalCol = {tile_size} * gl_WorkGroupID.y + col; uint tensor_size = uint(tensor_size_f); float acc = 0.0; @@ -64,8 +64,7 @@ void main() barrier(); }} - uint globalIndex = (tensor_size * globalCol) + globalRow; - out_tensor[globalIndex] = acc; + out_tensor[tensor_size * globalCol + globalRow] = acc; }}''' self.compiled_shader = kp.Shader.compile_source(self.shader) self.tensor_shape: tuple[int, int] = (0, 0) @@ -99,7 +98,7 @@ def main(): matmul_op = MatMulOp(mgr) - tensor_size = 512 + tensor_size = 4096 tensor_shape = [tensor_size, tensor_size] tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape))) tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape))) @@ -111,20 +110,20 @@ def main(): matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) - experiment_count = 1000 + experiment_count = 8 start_time = time.time() for _ in range(experiment_count): matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) end_time = time.time() experiment_time = end_time - start_time - op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1) + op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1) print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}') print(f'{experiment_count} matmul time : ' f'{experiment_time * 1000:0.2f}ms => ' f'{experiment_count / experiment_time:0.2f}op/s or ' - f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') + f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS') if __name__ == '__main__': diff --git a/examples/python_naive_matmul/imp3_better_tiling.py b/examples/python_naive_matmul/imp3_better_tiling.py index 6b3ada314..e97eb88d4 100644 --- a/examples/python_naive_matmul/imp3_better_tiling.py +++ b/examples/python_naive_matmul/imp3_better_tiling.py @@ -55,30 +55,37 @@ void main() uint tensor_size = uint(tensor_size_f); float acc[{thread_work_ratio}]; - for (uint l = 0u; l < {thread_work_ratio}; l++) - acc[l] = 0.0; + for(uint w = 0u; w < {thread_work_ratio}; w++) + acc[w] = 0.0; + /* uint numTiles = tensor_size / {tile_size}; for(uint t = 0u; t < numTiles; t++) {{ - uint tiledRow = {tile_size} * t + row; - uint tiledCol = {tile_size} * t + col; - sub_tensor_1[col + t * {self.local_size_y}][row] = in_tensor_1[ - (tiledCol + t * {self.local_size_y}) * tensor_size + globalRow]; - sub_tensor_2[col + t * {self.local_size_y}][row] = in_tensor_2[ - (globalCol + t * {self.local_size_y})* tensor_size + tiledRow]; + for(uint w = 0u; w < {thread_work_ratio}; w++) + {{ + uint tiledRow = {tile_size} * t + row; + uint tiledCol = {tile_size} * t + col; + sub_tensor_1[col + t * {self.local_size_y}][row] = in_tensor_1[ + (tiledCol + w * {self.local_size_y}) * tensor_size + globalRow]; + sub_tensor_2[col + t * {self.local_size_y}][row] = in_tensor_2[ + (globalCol + w * {self.local_size_y})* tensor_size + tiledRow]; + }} memoryBarrierShared(); barrier(); for(uint k = 0u; k < {tile_size}; k++) - for(uint l = 0u; l < {thread_work_ratio}; l++) - acc[l] += sub_tensor_1[k][row] * sub_tensor_2[col + l * {self.local_size_y}][k]; + for(uint w = 0u; w < {thread_work_ratio}; w++) + acc[w] += sub_tensor_1[k][row] * sub_tensor_2[col + w * {self.local_size_y}][k]; barrier(); + }}*/ + for(uint w = 0u; w < {thread_work_ratio}; w++) + {{ + //out_tensor[(globalCol + w * {self.local_size_y}) * tensor_size + globalRow] = acc[w]; + out_tensor[(globalCol + w * {self.local_size_y}) * tensor_size + globalRow] = w; }} - for(uint l = 0u; l < {thread_work_ratio}; l++) - out_tensor[(globalCol + l * {self.local_size_y}) * tensor_size + globalRow] = acc[l]; }}''' self.compiled_shader = kp.Shader.compile_source(self.shader) self.tensor_shape: tuple[int, int] = (0, 0) @@ -92,13 +99,13 @@ void main() if self.algo is None or self.tensor_shape != tensor_shape or self.params != params: self.tensor_shape = tensor_shape self.params = params - print( - tensor_shape, self.local_size_x, self.local_size_y, - (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1)) + # workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1) + workgroup = (2, 2, 1) + print(tensor_shape, self.local_size_x, self.local_size_y, workgroup) self.algo = self.mgr.algorithm( params, # params self.compiled_shader, # spirv - (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1), # workgroup + workgroup, # workgroup [float(tensor_shape[0])], # spec_consts []) # push_consts @@ -114,7 +121,7 @@ def main(): matmul_op = MatMulOp(mgr) - tensor_size = 512 + tensor_size = 4096 tensor_shape = [tensor_size, tensor_size] tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape))) tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape))) @@ -126,20 +133,20 @@ def main(): matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) - experiment_count = 1000 + experiment_count = 8 start_time = time.time() for _ in range(experiment_count): matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out) end_time = time.time() experiment_time = end_time - start_time - op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1) + op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1) print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}') print(f'{experiment_count} matmul time : ' f'{experiment_time * 1000:0.2f}ms => ' f'{experiment_count / experiment_time:0.2f}op/s or ' - f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS') + f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS') if __name__ == '__main__':