From f163aaf5e8b0c24b63f25397aa69264e91292b4f Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Mon, 1 Mar 2021 07:58:12 +0000 Subject: [PATCH] Updated advanced example docs --- docs/overview/advanced-examples.rst | 223 +++++++++------------------- 1 file changed, 69 insertions(+), 154 deletions(-) diff --git a/docs/overview/advanced-examples.rst b/docs/overview/advanced-examples.rst index bd9d5506a..f7e5432eb 100644 --- a/docs/overview/advanced-examples.rst +++ b/docs/overview/advanced-examples.rst @@ -28,107 +28,6 @@ End-to-end examples * `Game Development Kompute ML in Godot Engine `_ -Simple Shader Example -~~~~~~~~~~~~~~~~~~~~~ - -Pass compute shader data in glsl/hlsl text or compiled SPIR-V format (or as path to the file). Back to `examples list <#simple-examples>`_. - -.. code-block:: cpp - :linenos: - int main() { - - // You can allow Kompute to create the Vulkan components, or pass your existing ones - kp::Manager mgr; // Selects device 0 unless explicitly requested - - // Creates tensor an initializes GPU memory (below we show more granularity) - auto tensorA = std::make_shared(kp::Tensor({ 3., 4., 5. })); - auto tensorB = std::make_shared(kp::Tensor({ 0., 0., 0. 
})); - - // Create tensors data explicitly in GPU with an operation - mgr.rebuild({ tensorA, tensorB }); - - // Define your shader as a string (using string literals for simplicity) - // (You can also pass the raw compiled bytes, or even path to file) - std::string shader(R"( - #version 450 - - layout (local_size_x = 1) in; - - layout(set = 0, binding = 0) buffer a { float pa[]; }; - layout(set = 0, binding = 1) buffer b { float pb[]; }; - - void main() { - uint index = gl_GlobalInvocationID.x; - pb[index] = pa[index]; - pa[index] = index; - } - )"); - - // Run Kompute operation on the parameters provided with dispatch layout - mgr.evalOpDefault( - { tensorA, tensorB }, - kp::Shader::compile_source(shader)); - - // Sync the GPU memory back to the local tensor - mgr.evalOpDefault({ tensorA, tensorB }); - - // Prints the output which is A: { 0, 1, 2 } B: { 3, 4, 5 } - std::cout << fmt::format("A: {}, B: {}", - tensorA.data(), tensorB.data()) << std::endl; - } - -Record batch commands -~~~~~~~~~~~~~~~~~~~~~ - -Record commands in a single submit by using a Sequence to send in batch to GPU. Back to `examples list <#simple-examples>`_ - -.. code-block:: cpp - :linenos: - - int main() { - - kp::Manager mgr; - - std::shared_ptr tensorLHS{ new kp::Tensor({ 1., 1., 1. }) }; - std::shared_ptr tensorRHS{ new kp::Tensor({ 2., 2., 2. }) }; - std::shared_ptr tensorOutput{ new kp::Tensor({ 0., 0., 0. 
}) }; - - // Create all the tensors in memory - mgr.evalOpDefault({tensorLHS, tensorRHS, tensorOutput}); - - // Create a new sequence - std::weak_ptr sqWeakPtr = mgr.sequence(); - - if (std::shared_ptr sq = sqWeakPtr.lock()) - { - // Begin recording commands - sq->begin(); - - // Record batch commands to send to GPU - sq->record({ tensorLHS, tensorRHS, tensorOutput }); - sq->record({tensorOutput, tensorLHS, tensorRHS}); - - // Stop recording - sq->end(); - - // Submit multiple batch operations to GPU - size_t ITERATIONS = 5; - for (size_t i = 0; i < ITERATIONS; i++) { - sq->eval(); - } - - // Sync GPU memory back to local tensor - sq->begin(); - sq->record({tensorOutput}); - sq->end(); - sq->eval(); - } - - // Print the output which iterates through OpMult 5 times - // in this case the output is {32, 32 , 32} - std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl; - } - Asynchronous Operations ~~~~~~~~~~~~~~~~~~~~~~~ @@ -143,10 +42,7 @@ You can submit operations asynchronously with the async/await commands in the kp kp::Manager mgr; // Selects device 0 unless explicitly requested // Creates tensor an initializes GPU memory (below we show more granularity) - auto tensor = std::make_shared(kp::Tensor(std::vector(10, 0.0))); - - // Create tensors data explicitly in GPU with an operation - mgr.rebuild(tensor) + auto tensor = mgr.tensor(10, 0.0); // Define your shader as a string (using string literals for simplicity) // (You can also pass the raw compiled bytes, or even path to file) @@ -176,25 +72,19 @@ You can submit operations asynchronously with the async/await commands in the kp std::vector spirv = kp::Shader::compile_source(shader); - // We can now await for the previous submitted command - // The first parameter can be the amount of time to wait - // The time provided is in nanoseconds - mgr.evalOpAwaitDefault(10000); + auto sq = mgr.sequence(); - // Run Async Kompute operation on the parameters provided - mgr.evalOpAsyncDefault( - { tensor 
}, - spirv); + sq->eval({tensor}); - // Here we can do other work + sq->evalAsync(mgr.algorithm({tensor}, spirv)); // When we're ready we can wait // The default wait time is UINT64_MAX - mgr.evalOpAwaitDefault() + sq->evalAwait(10000); // Sync the GPU memory back to the local tensor // We can still run synchronous jobs in our created sequence - mgr.evalOpDefault({ tensor }); + sq->eval({ tensor }); // Prints the output: B: { 100000000, ... } std::cout << fmt::format("B: {}", @@ -225,18 +115,12 @@ Back to `examples list <#simple-examples>`_. // We create a manager with device index, and queues by queue family index kp::Manager mgr(deviceIndex, familyIndices); - // We need to create explicit sequences with their respective queues - // The second parameter is the index in the familyIndex array which is relative - // to the vector we created the manager with. - mgr.sequence("queueOne", 0); - mgr.sequence("queueTwo", 1); - // Creates tensor an initializes GPU memory (below we show more granularity) - auto tensorA = std::make_shared(kp::Tensor(std::vector(10, 0.0))); - auto tensorB = std::make_shared(kp::Tensor(std::vector(10, 0.0))); + auto tensorA = mgr.tensor({ 10, 0.0 }); + auto tensorB = mgr.tensor({ 10, 0.0 }); - // We run the first step synchronously on the default sequence - mgr.rebuild({ tensorA, tensorB }); + // Copies the data into GPU memory + mgr.sequence()->eval({tensorA, tensorB}); // Define your shader as a string (using string literals for simplicity) // (You can also pass the raw compiled bytes, or even path to file) @@ -266,26 +150,28 @@ Back to `examples list <#simple-examples>`_. std::vector spirv = kp::Shader::compile_source(shader); + std::shared_ptr algo = mgr.algorithm({tensorA, tensorB}, spirv); + + // We need to create explicit sequences with their respective queues + // The parameter is the index in the familyIndex array which is relative + // to the vector we created the manager with. 
+ auto sqOne = mgr.sequence(0); + auto sqTwo = mgr.sequence(1); + // Run the first parallel operation in the `queueOne` sequence - mgr.evalOpAsync( - { tensorA }, - "queueOne", - spirv); + sqOne->evalAsync(algo); // Run the second parallel operation in the `queueTwo` sequence - mgr.evalOpAsync( - { tensorB }, - "queueTwo", - spirv); + sqTwo->evalAsync(algo); // Here we can do other work // We can now wait for the two parallel tasks to finish - mgr.evalOpAwait("queueOne") - mgr.evalOpAwait("queueTwo") + sqOne->evalAwait(); + sqTwo->evalAwait(); // Sync the GPU memory back to the local tensor - mgr.evalOp({ tensorA, tensorB }); + mgr.sequence()->eval({ tensorA, tensorB }); // Prints the output: A: 100000000 B: 100000000 std::cout << fmt::format("A: {}, B: {}", @@ -302,17 +188,47 @@ We also provide tools that allow you to `convert shaders into C++ headers physicalDevice, - std::shared_ptr device, - std::shared_ptr commandBuffer, - std::vector> tensors) - : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "") + OpMyCustom(std::vector> tensors, + std::shared_ptr algorithm) + : OpAlgoBase(algorithm) { - // Perform your custom steps such as reading from a shader file - this->mShaderFilePath = "shaders/glsl/opmult.comp.spv"; + if (tensors.size() != 3) { + throw std::runtime_error("Kompute OpMult expected 3 tensors but got " + std::to_string(tensors.size())); + } + + std::vector spirv = kp::Shader::compile_source(R"( + #version 450 + + layout(set = 0, binding = 0) buffer tensorLhs { + float valuesLhs[ ]; + }; + + layout(set = 0, binding = 1) buffer tensorRhs { + float valuesRhs[ ]; + }; + + layout(set = 0, binding = 2) buffer tensorOutput { + float valuesOutput[ ]; + }; + + layout (constant_id = 0) const uint LEN_LHS = 0; + layout (constant_id = 1) const uint LEN_RHS = 0; + layout (constant_id = 2) const uint LEN_OUT = 0; + + layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in; + + void main() + { + uint index = gl_GlobalInvocationID.x; + + valuesOutput[index] = 
valuesLhs[index] * valuesRhs[index]; + } + )"); + + algorithm->rebuild(tensors, spirv); } } @@ -322,16 +238,15 @@ We also provide tools that allow you to `convert shaders into C++ headers (kp::Tensor({ 0., 1., 2. })); - auto tensorRhs = std::make_shared(kp::Tensor({ 2., 4., 6. })); - auto tensorOut = std::make_shared(kp::Tensor({ 0., 0., 0. })); + auto tensorLhs = mgr.tensor({ 0., 1., 2. }); + auto tensorRhs = mgr.tensor({ 2., 4., 6. }); + auto tensorOut = mgr.tensor({ 0., 0., 0. }); - // Create tensors data explicitly in GPU with an operation - mgr.rebuild({ tensorLhs, tensorRhs, tensorOut }); - - // Run Kompute operation on the parameters provided with dispatch layout - mgr.evalOpDefault>( - { tensorLhs, tensorRhs, tensorOut }); + mgr.sequence() + ->record({tensorLhs, tensorRhs, tensorOut}) + ->record({tensorLhs, tensorRhs, tensorOut}, mgr.algorithm()) + ->record({tensorLhs, tensorRhs, tensorOut}) + ->eval(); // Prints the output which is { 0, 4, 12 } std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl;