Fix small matrices matmuls, imp3 working but slow

2021-06-28 19:05:39 +09:00 · 2021-06-28 19:05:39 +09:00 · 3962ee70af
commit 3962ee70af
parent a3f7793c17
5 changed files with 77 additions and 219 deletions
--- a/examples/python_naive_matmul/benchmark.py
+++ b/examples/python_naive_matmul/benchmark.py
@ -8,41 +8,48 @@ from imp3_better_tiling import MatMulOp as MatMulOp3


 def main():
-    experiment_count = 1000
-    tensor_size = 512
-    tensor_shape = [tensor_size, tensor_size]
-    mat_1 = np.triu(np.ones(tensor_shape))
-    mat_2 = np.triu(np.ones(tensor_shape))
-    mat_result = mat_1 @ mat_2
-    tensor_shape = [tensor_size, tensor_size]
-
-    print(f'{tensor_shape} input tensors:\n'
-          f'{mat_1}\n'
-          f'{mat_2}\n')
-    print(f'Output :\n{mat_result}')
-
    mgr = kp.Manager()
-    tensor_in_1 = mgr.tensor(mat_1)
-    tensor_in_2 = mgr.tensor(mat_2)
-    tensor_out = mgr.tensor(np.zeros(tensor_shape))
-    for MatMulOp in [MatMulOp1, MatMulOp2, MatMulOp3]:
-        matmul_op = MatMulOp(mgr)
-        matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+    for tensor_size, experiment_count in [(512, 1000), (4096, 5)]:
+        tensor_shape = [tensor_size, tensor_size]
+        tensor_shape = [tensor_size, tensor_size]
+        mat_1 = np.triu(np.ones(tensor_shape))
+        mat_2 = np.triu(np.ones(tensor_shape))

-        start_time = time.time()
-        for _ in range(experiment_count):
-            matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
-        end_time = time.time()
-        experiment_time = end_time - start_time
-        op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)
-
-        if (tensor_out.data().reshape(tensor_shape) == mat_result).all():
-            print(f'From {MatMulOp.__module__} : {experiment_count} matmul time : '
-                  f'{experiment_time * 1000:0.2f}ms => '
-                  f'{experiment_count / experiment_time:0.2f}op/s or '
-                  f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
+        tensor_in_1 = mgr.tensor(mat_1)
+        tensor_in_2 = mgr.tensor(mat_2)
+        tensor_out = mgr.tensor(np.zeros(tensor_shape))
+        if tensor_size <= 512:
+            mat_result = mat_1 @ mat_2
        else:
-            print(f'Test failed => output tensor is wrong :\n{tensor_out.data().reshape(tensor_shape)}')
+            MatMulOp1(mgr)(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+            mat_result = tensor_out.data().reshape(tensor_shape)  # CPU is too slow for big sizes
+
+        print(f'{tensor_shape} input tensors:\n'
+              f'{mat_1}\n'
+              f'{mat_2}\n')
+        print(f'Output :\n{mat_result}')
+
+        for MatMulOp in [MatMulOp1, MatMulOp2, MatMulOp3]:
+            tensor_out.data()[:] = 0
+            mgr.sequence().record(kp.OpTensorSyncDevice([tensor_out]))
+            matmul_op = MatMulOp(mgr)
+            matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+
+            start_time = time.time()
+            for _ in range(experiment_count):
+                matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
+            end_time = time.time()
+            experiment_time = end_time - start_time
+            op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)
+
+            # print(tensor_out.data().reshape(tensor_shape))
+            if (tensor_out.data().reshape(tensor_shape) == mat_result).all():
+                print(f'From {MatMulOp.__module__} : {experiment_count} matmul time : '
+                      f'{experiment_time * 1000:0.2f}ms => '
+                      f'{experiment_count / experiment_time:0.2f}op/s or '
+                      f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
+            else:
+                print(f'Test failed => output tensor is wrong :\n{tensor_out.data().reshape(tensor_shape)}')


 if __name__ == '__main__':
--- a/examples/python_naive_matmul/imp1_naive.py
+++ b/examples/python_naive_matmul/imp1_naive.py
@ -41,7 +41,7 @@ class MatMulOp:
        self.local_size_x = local_size_x
        self.local_size_y = local_size_y

-        self.shader = f'''
+        self.shader = '''
 #version 450

 layout (local_size_x = {local_size_x}, local_size_y = {local_size_y}) in;
@ -63,7 +63,8 @@ void main()
        acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k];
    out_tensor[(globalCol * tensor_size) + globalRow] = acc;
 }}'''
-        self.compiled_shader = kp.Shader.compile_source(self.shader)
+        self.compiled_shader = kp.Shader.compile_source(self.shader.format(
+            local_size_x=self.local_size_x, local_size_y=self.local_size_y))
        self.tensor_shape: tuple[int, int] = (0, 0)
        self.params: list[kp.Tensor] = []
        self.algo = None
@ -75,7 +76,11 @@ void main()
        if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
            self.tensor_shape = tensor_shape
            self.params = params
-            workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1)
+            local_size_x = min(self.local_size_x, tensor_shape[0])
+            local_size_y = min(self.local_size_y, tensor_shape[1])
+            self.compiled_shader = kp.Shader.compile_source(self.shader.format(
+                local_size_x=local_size_x, local_size_y=local_size_y))
+            workgroup = (tensor_shape[0] // local_size_x, tensor_shape[1] // local_size_y, 1)
            print(f'{workgroup=} {self.local_size_x=} {self.local_size_y=}')
            self.algo = self.mgr.algorithm(
                params,  # params
@ -85,7 +90,7 @@ void main()
                [])  # push_consts

        (self.mgr.sequence()
-         .record(kp.OpTensorSyncDevice(self.params))
+         .record(kp.OpTensorSyncDevice([tensor_in_1, tensor_in_2]))
         .record(kp.OpAlgoDispatch(self.algo))
         .record(kp.OpTensorSyncLocal([tensor_out]))
         .eval())
--- a/examples/python_naive_matmul/imp2_tiled.py
+++ b/examples/python_naive_matmul/imp2_tiled.py
@ -24,7 +24,7 @@ class MatMulOp:
        assert tile_size <= max_workgroup_size[1]
        self.tile_size = tile_size

-        self.shader = f'''
+        self.shader = '''
 #version 450

 layout (local_size_x = {tile_size}, local_size_y = {tile_size}) in;
@ -66,7 +66,7 @@ void main()
    }}
    out_tensor[tensor_size * globalCol + globalRow] = acc;
 }}'''
-        self.compiled_shader = kp.Shader.compile_source(self.shader)
+        self.compiled_shader = kp.Shader.compile_source(self.shader.format(tile_size=tile_size))
        self.tensor_shape: tuple[int, int] = (0, 0)
        self.params: list[kp.Tensor] = []
        self.algo = None
@ -78,7 +78,9 @@ void main()
        if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
            self.tensor_shape = tensor_shape
            self.params = params
-            workgroup = (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1)
+            tile_size = min(tensor_shape[0], tensor_shape[1], self.tile_size)
+            self.compiled_shader = kp.Shader.compile_source(self.shader.format(tile_size=tile_size))
+            workgroup = [tensor_shape[0] // tile_size, tensor_shape[1] // tile_size, 1]
            self.algo = self.mgr.algorithm(
                params,  # params
                self.compiled_shader,  # spirv
@ -87,7 +89,7 @@ void main()
                [])  # push_consts

        (self.mgr.sequence()
-         .record(kp.OpTensorSyncDevice(self.params))
+         .record(kp.OpTensorSyncDevice([tensor_in_1, tensor_in_2]))
         .record(kp.OpAlgoDispatch(self.algo))
         .record(kp.OpTensorSyncLocal([tensor_out]))
         .eval())
--- a/examples/python_naive_matmul/imp2_tiled_debug.py
+++ b/examples/python_naive_matmul/imp2_tiled_debug.py
@ -1,156 +0,0 @@
-import time
-
-import kp
-import numpy as np
-
-
-class MatMulOp:
-    def __init__(self, manager: kp.Manager, tile_size: int = -1):
-        self.mgr = manager
-
-        props = self.mgr.get_device_properties()
-        max_workgroup_invocation = props['max_work_group_invocations']
-        max_workgroup_size = props['max_work_group_size']
-        if tile_size < 0:
-            tile_size = 1
-            while (4 * tile_size * tile_size <= max_workgroup_invocation
-                   and 2 * tile_size <= max_workgroup_size[0]
-                   and 2 * tile_size <= max_workgroup_size[1]):
-                tile_size *= 2
-
-        assert tile_size > 0
-        assert tile_size * tile_size <= max_workgroup_invocation
-        assert tile_size <= max_workgroup_size[0]
-        assert tile_size <= max_workgroup_size[1]
-        self.tile_size = tile_size
-
-        print(f'{tile_size=}')
-        self.shader = f'''
-#version 450
-
-layout (local_size_x = {tile_size}, local_size_y = {tile_size}) in;
-
-layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
-layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
-layout (set = 0, binding = 2) writeonly buffer buf_out_tensor {{ float out_tensor[]; }};
-layout (set = 0, binding = 3) writeonly buffer buf_test1_tensor {{ float test1_tensor[]; }};
-layout (set = 0, binding = 4) writeonly buffer buf_test2_tensor {{ float test2_tensor[]; }};
-layout (set = 0, binding = 5) writeonly buffer buf_test3_tensor {{ float test3_tensor[]; }};
-layout (set = 0, binding = 6) writeonly buffer buf_test4_tensor {{ float test4_tensor[]; }};
-
-layout (constant_id = 0) const float tensor_size_f = 0;
-
-shared float sub_tensor_1[{tile_size}][{tile_size}];
-shared float sub_tensor_2[{tile_size}][{tile_size}];
-
-void main()
-{{
-    uint row = gl_LocalInvocationID.x; // 0 .. tile_size
-    uint col = gl_LocalInvocationID.y; // 0 .. tile_size
-    // gl_WorkGroupID : 0 .. tensor_size / tile_size
-    uint globalRow = ({tile_size} * gl_WorkGroupID.x) + row;
-    uint globalCol = ({tile_size} * gl_WorkGroupID.y) + col;
-
-    uint tensor_size = uint(tensor_size_f);
-    float acc = 0.0;
-    uint numTiles = tensor_size / {tile_size};
-    for(uint t = 0u; t < numTiles; t++)
-    {{
-        uint tiledRow = ({tile_size} * t) + row;
-        uint tiledCol = ({tile_size} * t) + col;
-        sub_tensor_1[col][row] = in_tensor_1[(tiledCol * tensor_size) + globalRow];
-        sub_tensor_2[col][row] = in_tensor_2[(globalCol * tensor_size) + tiledRow];
-
-        memoryBarrierShared();
-        barrier();
-
-        for(uint k = 0u; k < {tile_size}; k++)
-            acc += sub_tensor_1[k][row] * sub_tensor_2[col][k];
-
-        barrier();
-    }}
-    uint globalIndex = (tensor_size * globalCol) + globalRow;
-    out_tensor[globalIndex] = acc;
-    test1_tensor[globalIndex] = row;
-    test2_tensor[globalIndex] = col;
-    test3_tensor[globalIndex] = gl_WorkGroupID.x;
-    test4_tensor[globalIndex] = gl_WorkGroupID.y;
-}}'''
-        print(self.shader)
-        self.compiled_shader = kp.Shader.compile_source(self.shader)
-        self.tensor_shape: tuple[int, int] = (0, 0)
-        self.params: list[kp.Tensor] = []
-        self.algo = None
-
-    def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor, tensor_in_2: kp.Tensor,
-                 tensor_out: kp.Tensor, tensor_test_1: kp.Tensor, tensor_test_2: kp.Tensor,
-                 tensor_test_3: kp.Tensor, tensor_test_4: kp.Tensor):
-        # params = [tensor_in_1, tensor_in_2, tensor_out]
-        params = [tensor_in_1, tensor_in_2, tensor_out, tensor_test_1, tensor_test_2, tensor_test_3, tensor_test_4]
-
-        if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
-            self.tensor_shape = tensor_shape
-            self.params = params
-            workgroup = (tensor_shape[0] // self.tile_size, tensor_shape[1] // self.tile_size, 1)
-            # workgroup = (2, 2, 1)
-            print(f'{float(tensor_shape[0])=} {self.tile_size=} {workgroup=}')
-            self.algo = self.mgr.algorithm(
-                params,  # params
-                self.compiled_shader,  # spirv
-                workgroup,  # workgroup
-                [float(tensor_shape[0])],  # spec_consts
-                [])  # push_consts
-
-        (self.mgr.sequence()
-         .record(kp.OpTensorSyncDevice(self.params))
-         .record(kp.OpAlgoDispatch(self.algo))
-         .record(kp.OpTensorSyncLocal(self.params[2:]))
-         # .record(kp.OpTensorSyncLocal([tensor_out]))
-         .eval())
-
-
-def main():
-    mgr = kp.Manager()
-
-    matmul_op = MatMulOp(mgr, 4)
-
-    tensor_size = 8
-    tensor_shape = [tensor_size, tensor_size]
-    tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
-    tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
-    tensor_out = mgr.tensor(np.zeros(tensor_shape))
-    tensor_test_1 = mgr.tensor(np.zeros(tensor_shape))
-    tensor_test_2 = mgr.tensor(np.zeros(tensor_shape))
-    tensor_test_3 = mgr.tensor(np.zeros(tensor_shape))
-    tensor_test_4 = mgr.tensor(np.zeros(tensor_shape))
-
-    print(f'{tensor_shape} input tensors:\n'
-          f'{tensor_in_1.data().reshape(tensor_shape)}\n'
-          f'{tensor_in_2.data().reshape(tensor_shape)}\n')
-
-    # matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
-    matmul_op(tensor_shape, tensor_in_1, tensor_in_2,
-              tensor_out, tensor_test_1, tensor_test_2, tensor_test_3, tensor_test_4)
-
-    # experiment_count = 10
-    # start_time = time.time()
-    # for _ in range(experiment_count):
-    #     matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
-    # end_time = time.time()
-    # experiment_time = end_time - start_time
-    # op_count = tensor_shape[0] * tensor_shape[1] * (tensor_shape[1] - 1)
-
-    print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
-    print(f'test_1 :\n{tensor_test_1.data().reshape(tensor_shape)}')
-    print(f'test_2 :\n{tensor_test_2.data().reshape(tensor_shape)}')
-    print(f'test_3 :\n{tensor_test_3.data().reshape(tensor_shape)}')
-    print(f'test_4 :\n{tensor_test_4.data().reshape(tensor_shape)}')
-
-    # print(f'{experiment_count} matmul time : '
-    #       f'{experiment_time * 1000:0.2f}ms => '
-    #       f'{experiment_count / experiment_time:0.2f}op/s or '
-    #       f'{experiment_count * op_count / (1e9 * experiment_time):0.2f}GFLOPS')
-
-
-if __name__ == '__main__':
-    main()
--- a/examples/python_naive_matmul/imp3_better_tiling.py
+++ b/examples/python_naive_matmul/imp3_better_tiling.py
@ -5,7 +5,7 @@ import numpy as np


 class MatMulOp:
-    def __init__(self, manager: kp.Manager, tile_size: int = -1, thread_work_ratio: int = 8):
+    def __init__(self, manager: kp.Manager, tile_size: int = -1, thread_work_ratio: int = 16):
        self.mgr = manager

        props = self.mgr.get_device_properties()
@ -14,9 +14,9 @@ class MatMulOp:
        if tile_size < 0:
            tile_size = 1
            local_size_y = tile_size // thread_work_ratio
-            while (4 * tile_size * local_size_y <= max_workgroup_invocation
+            while (4 * tile_size * tile_size <= max_workgroup_invocation
                   and 2 * tile_size <= max_workgroup_size[0]
-                   and 2 * local_size_y <= max_workgroup_size[1]):
+                   and 2 * tile_size <= max_workgroup_size[1]):
                tile_size *= 2
                local_size_y = tile_size // thread_work_ratio
        else:
@ -32,10 +32,10 @@ class MatMulOp:

        self.local_size_x = tile_size
        self.local_size_y = tile_size // thread_work_ratio
-        self.shader = f'''
+        self.shader = '''
 #version 450

-layout (local_size_x = {tile_size}, local_size_y = {self.local_size_y}) in;
+layout (local_size_x = {tile_size}, local_size_y = {local_size_y}) in;

 layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 {{ float in_tensor_1[]; }};
 layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 {{ float in_tensor_2[]; }};
@ -51,14 +51,13 @@ void main()
    uint row = gl_LocalInvocationID.x;
    uint col = gl_LocalInvocationID.y;
    uint globalRow = {tile_size} * gl_WorkGroupID.x + row;
-    uint globalCol = {tile_size} * gl_WorkGroupID.y + row;
+    uint globalCol = {tile_size} * gl_WorkGroupID.y + col;

    uint tensor_size = uint(tensor_size_f);
    float acc[{thread_work_ratio}];
    for(uint w = 0u; w < {thread_work_ratio}; w++)
        acc[w] = 0.0;

-    /*
    uint numTiles = tensor_size / {tile_size};
    for(uint t = 0u; t < numTiles; t++)
    {{
@ -66,10 +65,10 @@ void main()
        {{
            uint tiledRow = {tile_size} * t + row;
            uint tiledCol = {tile_size} * t + col;
-            sub_tensor_1[col + t * {self.local_size_y}][row] = in_tensor_1[
-                (tiledCol + w * {self.local_size_y}) * tensor_size + globalRow];
-            sub_tensor_2[col + t * {self.local_size_y}][row] = in_tensor_2[
-                (globalCol + w * {self.local_size_y})* tensor_size + tiledRow];
+            sub_tensor_1[col + w * {local_size_y}][row] = in_tensor_1[
+                (tiledCol + w * {local_size_y}) * tensor_size + globalRow];
+            sub_tensor_2[col + w * {local_size_y}][row] = in_tensor_2[
+                (globalCol + w * {local_size_y})* tensor_size + tiledRow];
        }}

        memoryBarrierShared();
@ -77,17 +76,15 @@ void main()

        for(uint k = 0u; k < {tile_size}; k++)
            for(uint w = 0u; w < {thread_work_ratio}; w++)
-                acc[w] += sub_tensor_1[k][row] * sub_tensor_2[col + w * {self.local_size_y}][k];
+                acc[w] += sub_tensor_1[k][row] * sub_tensor_2[col + w * {local_size_y}][k];

        barrier();
-    }}*/
-    for(uint w = 0u; w < {thread_work_ratio}; w++)
-    {{
-        //out_tensor[(globalCol + w * {self.local_size_y}) * tensor_size + globalRow] = acc[w];
-        out_tensor[(globalCol + w * {self.local_size_y}) * tensor_size + globalRow] = w;
    }}
+    for(uint w = 0u; w < {thread_work_ratio}; w++)
+        out_tensor[(globalCol + w * {local_size_y}) * tensor_size + globalRow] = acc[w];
 }}'''
-        self.compiled_shader = kp.Shader.compile_source(self.shader)
+        self.compiled_shader = kp.Shader.compile_source(self.shader.format(
+            tile_size=tile_size, thread_work_ratio=thread_work_ratio, local_size_y=local_size_y))
        self.tensor_shape: tuple[int, int] = (0, 0)
        self.params: list[kp.Tensor] = []
        self.algo = None
@ -99,9 +96,12 @@ void main()
        if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
            self.tensor_shape = tensor_shape
            self.params = params
-            # workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1)
-            workgroup = (2, 2, 1)
-            print(tensor_shape, self.local_size_x, self.local_size_y, workgroup)
+            tile_size = min(self.tensor_shape[0], self.tile_size)
+            thread_work_ratio = min(self.tensor_shape[1] // self.tile_size, self.thread_work_ratio)
+            local_size_y = tile_size // thread_work_ratio
+            self.compiled_shader = kp.Shader.compile_source(self.shader.format(
+                tile_size=tile_size, thread_work_ratio=thread_work_ratio, local_size_y=local_size_y))
+            workgroup = (tensor_shape[0] // self.local_size_x, tensor_shape[1] // self.local_size_y, 1)
            self.algo = self.mgr.algorithm(
                params,  # params
                self.compiled_shader,  # spirv
@ -110,7 +110,7 @@ void main()
                [])  # push_consts

        (self.mgr.sequence()
-         .record(kp.OpTensorSyncDevice(self.params))
+         .record(kp.OpTensorSyncDevice([tensor_in_1, tensor_in_2]))
         .record(kp.OpAlgoDispatch(self.algo))
         .record(kp.OpTensorSyncLocal([tensor_out]))
         .eval())
@ -133,7 +133,7 @@ def main():

    matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)

-    experiment_count = 8
+    experiment_count = 2
    start_time = time.time()
    for _ in range(experiment_count):
        matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)