diff --git a/README.md b/README.md
index 8d8e13947..03caa4c96 100644
--- a/README.md
+++ b/README.md
@@ -56,35 +56,65 @@ int main() {
     // 2. Create and initialise Kompute Tensors through manager
     auto tensorInA = mgr.tensor({ 2., 2., 2. });
     auto tensorInB = mgr.tensor({ 1., 2., 3. });
-    auto tensorOut = mgr.tensor({ 0., 0., 0. });
+    auto tensorOutA = mgr.tensor({ 0., 0., 0. });
+    auto tensorOutB = mgr.tensor({ 0., 0., 0. });
 
-    // 3. Specify "multiply shader" code (can also be raw string, spir-v bytes or file path)
-    std::string shaderString = (R"(
+    std::vector<std::shared_ptr<kp::Tensor>> params = {tensorInA, tensorInB, tensorOutA, tensorOutB};
+
+    // 3. Create algorithm based on shader (supports buffers & push/spec constants)
+    std::string shader = (R"(
         #version 450
 
         layout (local_size_x = 1) in;
 
         // The input tensors bind index is relative to index in parameter passed
-        layout(set = 0, binding = 0) buffer bina { float tina[]; };
-        layout(set = 0, binding = 1) buffer binb { float tinb[]; };
-        layout(set = 0, binding = 2) buffer bout { float tout[]; };
+        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
+        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
+        layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
+        layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; };
+
+        // Kompute supports push constants updated on dispatch
+        layout(push_constant) uniform PushConstants {
+            float val;
+        } push_const;
+
+        // Kompute also supports spec constants on initialization
+        layout(constant_id = 0) const float const_one = 0;
 
         void main() {
             uint index = gl_GlobalInvocationID.x;
-            tout[index] = tina[index] * tinb[index];
+            out_a[index] += in_a[index] * in_b[index];
+            out_b[index] += const_one * push_const.val;
         }
     )");
 
-    // 3. Run operation with string shader synchronously
-    mgr.evalOpDefault<kp::OpAlgoBase>(
-        { tensorInA, tensorInB, tensorOut },
-        kp::Shader::compile_source(shaderString));
+    kp::Workgroup workgroup({3, 1, 1});
+    kp::Constants specConsts({ 2 });
 
-    // 4. Map results back from GPU memory to print the results
-    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorInA, tensorInB, tensorOut });
+    auto algorithm = mgr.algorithm(params, kp::Shader::compile_source(shader), workgroup, specConsts);
 
-    // Prints the output which is Output: { 2, 4, 6 }
-    for (const float& elem : tensorOut->data()) std::cout << elem << " ";
+    kp::Constants pushConstsA({ 2.0 });
+    kp::Constants pushConstsB({ 3.0 });
+
+    // 4. Run operation synchronously using sequence
+    mgr.sequence()
+        ->record<kp::OpTensorSyncDevice>(params)
+        ->record<kp::OpAlgoDispatch>(algorithm, pushConstsA)
+        ->record<kp::OpAlgoDispatch>(algorithm, pushConstsB)
+        ->eval();
+
+    // 5. Sync results from the GPU asynchronously
+    auto sq = mgr.sequence();
+    sq->evalAsync<kp::OpTensorSyncLocal>(params);
+
+    // ... Do other work asynchronously whilst GPU finishes
+
+    sq->evalAwait();
+
+    // Prints the first output which is: { 4, 8, 12 }
+    for (const float& elem : tensorOutA->data()) std::cout << elem << " ";
+    // Prints the second output which is: { 10, 10, 10 }
+    for (const float& elem : tensorOutB->data()) std::cout << elem << " ";
 }
 ```
 
@@ -94,34 +124,72 @@ int main() {
 The [Python package](https://kompute.cc/overview/python-package.html) provides a [high level interactive interface](https://kompute.cc/overview/python-reference.html) that enables for experimentation whilst ensuring high performance and fast development workflows.
 
 ```python
+
 # 1. Create Kompute Manager with default settings (device 0 and first compute compatible queue)
-mgr = Manager()
+mgr = kp.Manager()
 
-# 2. Create and initialise Kompute Tensors (can be initialized with List[] or np.Array)
-tensor_in_a = Tensor([2, 2, 2])
-tensor_in_b = Tensor([1, 2, 3])
-tensor_out = Tensor([0, 0, 0])
+# 2. Create and initialise Kompute Tensors through manager
+tensor_in_a = mgr.tensor([2, 2, 2])
+tensor_in_b = mgr.tensor([1, 2, 3])
+tensor_out_a = mgr.tensor([0, 0, 0])
+tensor_out_b = mgr.tensor([0, 0, 0])
 
-mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b]
 
-# 3. Specify "multiply shader" code (can also be raw string, spir-v bytes or file path)
-@python2shader
-def compute_shader_multiply(index=("input", "GlobalInvocationId", ivec3),
-                            data1=("buffer", 0, Array(f32)),
-                            data2=("buffer", 1, Array(f32)),
-                            data3=("buffer", 2, Array(f32))):
-    i = index.x
-    data3[i] = data1[i] * data2[i]
+# 3. Create algorithm based on shader (supports buffers & push/spec constants)
+shader = """
+    #version 450
 
-# 4. Run multiplication operation synchronously
-mgr.eval_algo_data_def(
-    [tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv())
+    layout (local_size_x = 1) in;
 
-# 5. Map results back from GPU memory to print the results
-mgr.eval_tensor_sync_local_def([tensor_out])
+    // The input tensors bind index is relative to index in parameter passed
+    layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
+    layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
+    layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
+    layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; };
+
+    // Kompute supports push constants updated on dispatch
+    layout(push_constant) uniform PushConstants {
+        float val;
+    } push_const;
+
+    // Kompute also supports spec constants on initialization
+    layout(constant_id = 0) const float const_one = 0;
+
+    void main() {
+        uint index = gl_GlobalInvocationID.x;
+        out_a[index] += in_a[index] * in_b[index];
+        out_b[index] += const_one * push_const.val;
+    }
+"""
+
+workgroup = (3, 1, 1)
+spec_consts = [2]
+push_consts_a = [2]
+push_consts_b = [3]
+
+algo = mgr.algorithm(params, kp.Shader.compile_source(shader), workgroup, spec_consts)
+
+# 4. Run operation synchronously using sequence
+(mgr.sequence()
+    .record(kp.OpTensorSyncDevice(params))
+    .record(kp.OpAlgoDispatch(algo, push_consts_a))
+    .record(kp.OpAlgoDispatch(algo, push_consts_b))
+    .eval())
+
+# 5. Sync results from the GPU asynchronously
+sq = mgr.sequence()
+sq.eval_async(kp.OpTensorSyncLocal(params))
+
+# ... Do other work asynchronously whilst GPU finishes
+
+sq.eval_await()
+
+# Prints the first output which is: { 4, 8, 12 }
+print(tensor_out_a)
+# Prints the second output which is: { 10, 10, 10 }
+print(tensor_out_b)
 
-# Prints [2.0, 4.0, 6.0]
-print(tensor_out.data())
 ```
 
 ### Interactive Notebooks & Hands on Videos
diff --git a/python/test/test_kompute.py b/python/test/test_kompute.py
index 4514e2dd2..865f72d92 100644
--- a/python/test/test_kompute.py
+++ b/python/test/test_kompute.py
@@ -30,6 +30,63 @@ kp_log = logging.getLogger("kp")
 #
 #     assert tensor_out.data() == [2.0, 4.0, 6.0]
 
+def test_end_to_end():
+
+    mgr = kp.Manager()
+
+    tensor_in_a = mgr.tensor([2, 2, 2])
+    tensor_in_b = mgr.tensor([1, 2, 3])
+    tensor_out_a = mgr.tensor([0, 0, 0])
+    tensor_out_b = mgr.tensor([0, 0, 0])
+
+    params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b]
+
+    shader = """
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        // The input tensors bind index is relative to index in parameter passed
+        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
+        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
+        layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
+        layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; };
+
+        // Kompute supports push constants updated on dispatch
+        layout(push_constant) uniform PushConstants {
+            float val;
+        } push_const;
+
+        // Kompute also supports spec constants on initalization
+        layout(constant_id = 0) const float const_one = 0;
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+            out_a[index] += in_a[index] * in_b[index];
+            out_b[index] += const_one * push_const.val;
+        }
+    """
+
+    workgroup = (3, 1, 1)
+    spec_consts = [2]
+    push_consts_a = [2]
+    push_consts_b = [3]
+
+    algo = mgr.algorithm(params, kp.Shader.compile_source(shader), workgroup, spec_consts)
+
+    (mgr.sequence()
+        .record(kp.OpTensorSyncDevice(params))
+        .record(kp.OpAlgoDispatch(algo, push_consts_a))
+        .record(kp.OpAlgoDispatch(algo, push_consts_b))
+        .eval())
+
+    sq = mgr.sequence()
+    sq.eval_async(kp.OpTensorSyncLocal(params))
+
+    sq.eval_await()
+
+    assert tensor_out_a.data().tolist() == [4, 8, 12]
+    assert tensor_out_b.data().tolist() == [10, 10, 10]
 
 def test_shader_str():
 
diff --git a/test/TestMultipleAlgoExecutions.cpp b/test/TestMultipleAlgoExecutions.cpp
index 0ddbbffc8..8be2e6d82 100644
--- a/test/TestMultipleAlgoExecutions.cpp
+++ b/test/TestMultipleAlgoExecutions.cpp
@@ -3,6 +3,66 @@
 
 #include "kompute/Kompute.hpp"
 
+TEST(TestMultipleAlgoExecutions, TestEndToEndFunctionality) {
+
+    kp::Manager mgr;
+
+    auto tensorInA = mgr.tensor({ 2., 2., 2. });
+    auto tensorInB = mgr.tensor({ 1., 2., 3. });
+    auto tensorOutA = mgr.tensor({ 0., 0., 0. });
+    auto tensorOutB = mgr.tensor({ 0., 0., 0. });
+
+    std::string shader = (R"(
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        // The input tensors bind index is relative to index in parameter passed
+        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
+        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
+        layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
+        layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; };
+
+        // Kompute supports push constants updated on dispatch
+        layout(push_constant) uniform PushConstants {
+            float val;
+        } push_const;
+
+        // Kompute also supports spec constants on initalization
+        layout(constant_id = 0) const float const_one = 0;
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+            out_a[index] += in_a[index] * in_b[index];
+            out_b[index] += const_one * push_const.val;
+        }
+    )");
+
+    std::vector<std::shared_ptr<kp::Tensor>> params = {tensorInA, tensorInB, tensorOutA, tensorOutB};
+
+    kp::Workgroup workgroup({3, 1, 1});
+    kp::Constants specConsts({ 2 });
+    kp::Constants pushConstsA({ 2.0 });
+    kp::Constants pushConstsB({ 3.0 });
+
+    auto algorithm = mgr.algorithm(params, kp::Shader::compile_source(shader), workgroup, specConsts);
+
+    // Sync tensors to the device, then dispatch the algorithm once per push constant set
+    mgr.sequence()
+        ->record<kp::OpTensorSyncDevice>(params)
+        ->record<kp::OpAlgoDispatch>(algorithm, pushConstsA)
+        ->record<kp::OpAlgoDispatch>(algorithm, pushConstsB)
+        ->eval();
+
+    auto sq = mgr.sequence();
+    sq->evalAsync<kp::OpTensorSyncLocal>(params);
+
+    sq->evalAwait();
+
+    EXPECT_EQ(tensorOutA->data(), std::vector<float>({ 4, 8, 12 }));
+    EXPECT_EQ(tensorOutB->data(), std::vector<float>({ 10, 10, 10 }));
+}
+
 TEST(TestMultipleAlgoExecutions, SingleSequenceRecord) {