From 7f686b47daa9cba5d8ec37995364b44eaa329f0b Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Mon, 1 Mar 2021 21:13:08 +0000 Subject: [PATCH] Updated examples to new interface --- README.md | 173 ++++++------ docs/overview/advanced-examples.rst | 398 +++++++++++++++++----------- docs/overview/async-parallel.rst | 255 +----------------- docs/overview/custom-operations.rst | 63 +---- docs/overview/memory-management.rst | 18 +- docs/overview/python-examples.rst | 110 ++++---- 6 files changed, 410 insertions(+), 607 deletions(-) diff --git a/README.md b/README.md index 657cfc4a1..eccccf4dc 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,8 @@ Below you can find a GPU multiplication example using the C++ and Python Kompute The C++ interface provides low level access to the native components of Kompute and Vulkan, enabling for [advanced optimizations](https://kompute.cc/overview/async-parallel.html) as well as [extension of components](https://kompute.cc/overview/reference.html). ```c++ -int main() { + +void kompute(const std::string& shader) { // 1. Create Kompute Manager with default settings (device 0 and first compute compatible queue) kp::Manager mgr; @@ -62,6 +63,42 @@ int main() { std::vector> params = {tensorInA, tensorInB, tensorOutA, tensorOutB}; // 3. Create algorithm based on shader (supports buffers & push/spec constants) + kp::Workgroup workgroup({3, 1, 1}); + kp::Constants specConsts({ 2 }); + kp::Constants pushConstsA({ 2.0 }); + kp::Constants pushConstsB({ 3.0 }); + + auto algorithm = mgr.algorithm(params, + kp::Shader::compile_source(shader), + workgroup, + specConsts); + + // 4. Run operation synchronously using sequence + mgr.sequence() + ->record(params) + ->record(algorithm, pushConstsA) + ->record(algorithm, pushConstsB) + ->eval(); + + // 5. Sync results from the GPU asynchronously + sq = mgr.sequence() + sq->evalAsync(params); + + // ... 
Do other work asynchronously whilst GPU finishes + + sq->evalAwait(); + + // Prints the first output which is: { 4, 8, 12 } + for (const float& elem : tensorOutA->data()) std::cout << elem << " "; + // Prints the second output which is: { 10, 10, 10 } + for (const float& elem : tensorOutB->data()) std::cout << elem << " "; + +} // Manages / releases all CPU and GPU memory resources + +int main() { + + // Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header + // files). This shader shows some of the main components including constants, buffers, etc std::string shader = (R"( #version 450 @@ -88,33 +125,8 @@ int main() { } )"); - kp::Workgroup workgroup({3, 1, 1}); - kp::Constants specConsts({ 2 }); - - auto algorithm = mgr.algorithm(params, kp::Shader::compile_source(shader), workgroup, specConsts); - - kp::Constants pushConstsA({ 2.0 }); - kp::Constants pushConstsB({ 3.0 }); - - // 4. Run operation synchronously using sequence - mgr.sequence() - ->record(params) - ->record(algorithm, pushConstsA) - ->record(algorithm, pushConstsB) - ->eval(); - - // 5. Sync results from the GPU asynchronously - sq = mgr.sequence() - sq->evalAsync(params); - - // ... Do other work asynchronously whilst GPU finishes - - sq->evalAwait(); - - // Prints the first output which is: { 4, 8, 12 } - for (const float& elem : tensorOutA->data()) std::cout << elem << " "; - // Prints the second output which is: { 10, 10, 10 } - for (const float& elem : tensorOutB->data()) std::cout << elem << " "; + // Run the function declared above with our raw string shader + kompute(shader); } ``` @@ -125,70 +137,77 @@ The [Python package](https://kompute.cc/overview/python-package.html) provides a ```python -# 1. Create Kompute Manager with default settings (device 0 and first compute compatible queue) -mgr = kp.Manager() +def kompute(shader): + # 1. Create Kompute Manager with default settings (device 0 and first compute compatible queue) + mgr = kp.Manager() -# 2. 
Create and initialise Kompute Tensors through manager -tensor_in_a = mgr.tensor([2, 2, 2]) -tensor_in_b = mgr.tensor([1, 2, 3]) -tensor_out_a = mgr.tensor([0, 0, 0]) -tensor_out_b = mgr.tensor([0, 0, 0]) + # 2. Create and initialise Kompute Tensors through manager + tensor_in_a = mgr.tensor([2, 2, 2]) + tensor_in_b = mgr.tensor([1, 2, 3]) + tensor_out_a = mgr.tensor([0, 0, 0]) + tensor_out_b = mgr.tensor([0, 0, 0]) -params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b] + params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b] -# 3. Create algorithm based on shader (supports buffers & push/spec constants) -shader = """ - #version 450 + # 3. Create algorithm based on shader (supports buffers & push/spec constants) + workgroup = (3, 1, 1) + spec_consts = [2] + push_consts_a = [2] + push_consts_b = [3] - layout (local_size_x = 1) in; + algo = mgr.algorithm(params, kp.Shader.compile_source(shader), workgroup, spec_consts) - // The input tensors bind index is relative to index in parameter passed - layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; }; - layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; }; - layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; }; - layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; }; + # 4. Run operation synchronously using sequence + (mgr.sequence() + .record(kp.OpTensorSyncDevice(params)) + .record(kp.OpAlgoDispatch(algo, push_consts_a)) + .record(kp.OpAlgoDispatch(algo, push_consts_b)) + .eval()) - // Kompute supports push constants updated on dispatch - layout(push_constant) uniform PushConstants { - float val; - } push_const; + # 5. Sync results from the GPU asynchronously + sq = mgr.sequence() + sq.eval_async(kp.OpTensorSyncLocal(params)) - // Kompute also supports spec constants on initalization - layout(constant_id = 0) const float const_one = 0; + # ... 
Do other work asynchronously whilst GPU finishes - void main() { - uint index = gl_GlobalInvocationID.x; - out_a[index] += in_a[index] * in_b[index]; - out_b[index] += const_one * push_const.val; - } -""" + sq.eval_await() -workgroup = (3, 1, 1) -spec_consts = [2] -push_consts_a = [2] -push_consts_b = [3] + # Prints the first output which is: { 4, 8, 12 } + print(tensor_out_a) + # Prints the second output which is: { 10, 10, 10 } + print(tensor_out_b) -algo = mgr.algorithm(params, kp.Shader.compile_source(shader), workgroup, spec_consts) +if __name__ == "__main__": -# 4. Run operation synchronously using sequence -(mgr.sequence() - .record(kp.OpTensorSyncDevice(params)) - .record(kp.OpAlgoDispatch(algo, push_consts_a)) - .record(kp.OpAlgoDispatch(algo, push_consts_b)) - .eval()) + # Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header + # files). This shader shows some of the main components including constants, buffers, etc + shader = """ + #version 450 -# 5. Sync results from the GPU asynchronously -sq = mgr.sequence() -sq.eval_async(kp.OpTensorSyncLocal(params)) + layout (local_size_x = 1) in; -# ...
Do other work asynchronously whilst GPU finishes + // The input tensors bind index is relative to index in parameter passed + layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; }; + layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; }; + layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; }; + layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; }; -sq.eval_await() + // Kompute supports push constants updated on dispatch + layout(push_constant) uniform PushConstants { + float val; + } push_const; -# Prints the first output which is: { 4, 8, 12 } -print(tensor_out_a) -# Prints the first output which is: { 10, 10, 10 } -print(tensor_out_b) + // Kompute also supports spec constants on initalization + layout(constant_id = 0) const float const_one = 0; + + void main() { + uint index = gl_GlobalInvocationID.x; + out_a[index] += in_a[index] * in_b[index]; + out_b[index] += const_one * push_const.val; + } + """ + + kompute(shader) ``` diff --git a/docs/overview/advanced-examples.rst b/docs/overview/advanced-examples.rst index f7e5432eb..80df20e42 100644 --- a/docs/overview/advanced-examples.rst +++ b/docs/overview/advanced-examples.rst @@ -10,13 +10,9 @@ The power of Kompute comes in when the interface is used for complex computation Simple examples ^^^^^^^^^^^^^^^ - -* `Pass shader as raw string <#simple-shader-example>`_ -* `Record batch commands with a Kompute Sequence <#record-batch-commands>`_ +* `Create your custom Kompute Operations <#your-custom-kompute-operation>`_ * `Run Asynchronous Operations <#asynchronous-operations>`_ * `Run Parallel Operations Across Multiple GPU Queues <#parallel-operations>`_ -* `Create your custom Kompute Operations <#your-custom-kompute-operation>`_ -* `Implementing logistic regression from scratch <#logistic-regression-example>`_ End-to-end examples ^^^^^^^^^^^^^^^^^^^ @@ -28,156 +24,6 @@ End-to-end examples * `Game Development Kompute ML in Godot Engine `_ -Asynchronous Operations 
-~~~~~~~~~~~~~~~~~~~~~~~ - -You can submit operations asynchronously with the async/await commands in the kp::Manager and kp::Sequence, which provides granularity on waiting on the vk::Fence. Back to `examples list <#simple-examples>`_ - -.. code-block:: cpp - :linenos: - - int main() { - - // You can allow Kompute to create the Vulkan components, or pass your existing ones - kp::Manager mgr; // Selects device 0 unless explicitly requested - - // Creates tensor an initializes GPU memory (below we show more granularity) - auto tensor = mgr.tensor(10, 0.0); - - // Define your shader as a string (using string literals for simplicity) - // (You can also pass the raw compiled bytes, or even path to file) - std::string shader(R"( - #version 450 - - layout (local_size_x = 1) in; - - layout(set = 0, binding = 0) buffer b { float pb[]; }; - - shared uint sharedTotal[1]; - - void main() { - uint index = gl_GlobalInvocationID.x; - - sharedTotal[0] = 0; - - // Iterating to simulate longer process - for (int i = 0; i < 100000000; i++) - { - atomicAdd(sharedTotal[0], 1); - } - - pb[index] = sharedTotal[0]; - } - )"); - - std::vector spirv = kp::Shader::compile_source(shader); - - auto sq = mgr.sequence(); - - sq.eval({tensor}); - - sq.evalAsync(mgr.algorithm({tensor}, spirv)); - - // When we're ready we can wait - // The default wait time is UINT64_MAX - sq.evalAwait(10000) - - // Sync the GPU memory back to the local tensor - // We can still run synchronous jobs in our created sequence - sq.eval({ tensor }); - - // Prints the output: B: { 100000000, ... } - std::cout << fmt::format("B: {}", - tensor.data()) << std::endl; - } - -Parallel Operations -~~~~~~~~~~~~~~~~~~~ - -Besides being able to submit asynchronous operations, you can also leverage the underlying GPU compute queues to process operations in parallel. 
- -This will depend on your underlying graphics card, but for example in NVIDIA graphics cards the operations submitted across queues in one family are not parallelizable, but operations submitted across queueFamilies can be parallelizable. - -Below we show how you can parallelize operations in an `NVIDIA 1650 `_\ , which has a ``GRAPHICS+COMPUTE`` family on ``index 0``\ , and ``COMPUTE`` family on ``index 2``. - -Back to `examples list <#simple-examples>`_. - -.. code-block:: cpp - :linenos: - - int main() { - - // In this case we select device 0, and for queues, one queue from familyIndex 0 - // and one queue from familyIndex 2 - uint32_t deviceIndex(0); - std::vector familyIndices = {0, 2}; - - // We create a manager with device index, and queues by queue family index - kp::Manager mgr(deviceIndex, familyIndices); - - // Creates tensor an initializes GPU memory (below we show more granularity) - auto tensorA = mgr.tensor({ 10, 0.0 }); - auto tensorB = mgr.tensor({ 10, 0.0 }); - - // Copies the data into GPU memory - mgr.sequence().eval({tensorA tensorB}); - - // Define your shader as a string (using string literals for simplicity) - // (You can also pass the raw compiled bytes, or even path to file) - std::string shader(R"( - #version 450 - - layout (local_size_x = 1) in; - - layout(set = 0, binding = 0) buffer b { float pb[]; }; - - shared uint sharedTotal[1]; - - void main() { - uint index = gl_GlobalInvocationID.x; - - sharedTotal[0] = 0; - - // Iterating to simulate longer process - for (int i = 0; i < 100000000; i++) - { - atomicAdd(sharedTotal[0], 1); - } - - pb[index] = sharedTotal[0]; - } - )"); - - std::vector spirv = kp::Shader::compile_source(shader); - - std::shared_ptr algo = mgr.algorithm({tensorA, tenssorB}, spirv); - - // We need to create explicit sequences with their respective queues - // The second parameter is the index in the familyIndex array which is relative - // to the vector we created the manager with. 
- sqOne = mgr.sequence(0); - sqTwo = mgr.sequence(1); - - // Run the first parallel operation in the `queueOne` sequence - sqOne->evalAsync(algo); - - // Run the second parallel operation in the `queueTwo` sequence - sqTwo->evalAsync(algo); - - // Here we can do other work - - // We can now wait for the two parallel tasks to finish - sqOne.evalOpAwait() - sqTwo.evalOpAwait() - - // Sync the GPU memory back to the local tensor - mgr.sequence()->eval({ tensorA, tensorB }); - - // Prints the output: A: 100000000 B: 100000000 - std::cout << fmt::format("A: {}, B: {}", - tensorA.data()[0], tensorB.data()[0]) << std::endl; - } - Your Custom Kompute Operation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -252,4 +98,246 @@ We also provide tools that allow you to `convert shaders into C++ headers eval({tensor}); + + +While this is running we can actually do other things like in this case create the shader we'll be using. + +In this case we create a shader that should take a couple of milliseconds to run. + +.. code-block:: cpp + :linenos: + + // Define your shader as a string (using string literals for simplicity) + // (You can also pass the raw compiled bytes, or even path to file) + std::string shader(R"( + #version 450 + + layout (local_size_x = 1) in; + + layout(set = 0, binding = 0) buffer b { float pb[]; }; + + shared uint sharedTotal[1]; + + void main() { + uint index = gl_GlobalInvocationID.x; + + sharedTotal[0] = 0; + + // Iterating to simulate longer process + for (int i = 0; i < 100000000; i++) + { + atomicAdd(sharedTotal[0], 1); + } + + pb[index] = sharedTotal[0]; + } + )"); + + auto algo = mgr.algorithm({tensor}, kp::Shader::compile_source(shader)); + +Now we are able to run the await function on the default sequence. + +If we are using the manager, we need to make sure that we are awaiting the same named sequence that was triggered asynchronously. + +If the sequence is not running or has finished running, it would return immediately. 
+ +The parameter provided is the maximum amount of time to wait in nanoseconds. When the timeout expires, the sequence would return (with false value), but it does not stop the processing in the GPU - the processing would continue as normal. + +.. code-block:: cpp + :linenos: + + auto sq = mgr.sequence() + + // Run Async Kompute operation on the parameters provided + sq->evalAsync(algo); + + // Here we can do other work + + // When we're ready we can wait + // The default wait time is UINT64_MAX + sq.evalAwait() + + +Finally, below you can see that we can also run synchronous commands without having to change anything. + +.. code-block:: cpp + :linenos: + + // Sync the GPU memory back to the local tensor + // We can still run synchronous jobs in our created sequence + sq.eval({ tensor }); + + // Prints the output: B: { 100000000, ... } + std::cout << fmt::format("B: {}", + tensor.data()) << std::endl; + + +Parallel Operation Submission +----------------------------- + +In order to work with parallel execution of tasks, it is important that you understand some of the core GPU processing limitations, as these can be quite broad and hardware dependent, which means they will vary across NVIDIA / AMD / ETC video cards. + +Conceptual Overview +^^^^^^^^^^^^^^^^^^^^^ + +If you are familiar with Vulkan, you will know that one of the first things you do is fetch the physical Queues from the device. The queues themselves tend to have three main particular features - they can be GRAPHICS, TRANSFER and COMPUTE (among a few others we'll skip for simplicity). + +Queues can have multiple properties - namely a queue can be of type GRAPHICS+TRANSFER+COMPUTE, etc. Now here comes the key point: the underlying hardware may (or may not) support parallelized processing at multiple levels. + +Let's take a tangible example.
The `NVIDIA 1650 <http://vulkan.gpuinfo.org/displayreport.php?id=9700#queuefamilies>`_ for example has 16 `GRAPHICS+TRANSFER+COMPUTE` queues on `familyIndex 0`, then 2 `TRANSFER` queues in `familyIndex 1` and finally 8 `COMPUTE+TRANSFER` queues in `familyIndex 2`. + +With this in mind, the NVIDIA 1650 as of today does not support intra-family parallelization, which means that if you were to submit commands in multiple queues of the same family, these would still be executed synchronously. + +However the NVIDIA 1650 does support inter-family parallelization, which means that if we were to submit commands across multiple queues from different families, these would execute in parallel. + +This means that we would be able to execute parallel workloads as long as we're running them across multiple queue families. This is one of the reasons why Vulkan Kompute enables users to explicitly select the underlying queues and queue families to run particular workloads on. + +It is important that you understand the capabilities and limitations of your hardware, as parallelization capabilities can vary, so you will want to make sure you account for potential discrepancies in processing structures, mainly to avoid undesired/unexpected race conditions. + +Parallel Execution Example +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In this example we will demonstrate how you can set up parallel processing across two compute families to achieve 2x speedups when running processing workloads. + +To start, you will see that we do have to create the manager with extra parameters. This includes the GPU device index we want to use, together with the array of the queues that we want to enable. + +In this case we are using only two queues, which as per the section above, these would be familyIndex 0 which is of type `GRAPHICS+COMPUTE+TRANSFER` and familyIndex 2 which is of type `COMPUTE+TRANSFER`.
+ +In this case based on the specifications of the NVIDIA 1650 we could define up to 16 graphics queues (familyIndex 0), 2 transfer queues (familyIndex 1), and 8 compute queues (familyIndex 2) in no particular order. This means that we could have something like `{ 0, 1, 1, 2, 2, 2, 0, ... }` as our initialization value. + +You will want to keep track of the indices you initialize your manager, as you will be referring back to this ordering when creating sequences with particular queues. + +.. code-block:: cpp + :linenos: + + // In this case we select device 0, and for queues, one queue from familyIndex 0 + // and one queue from familyIndex 2 + uint32_t deviceIndex(0); + std::vector familyIndices = {0, 2}; + + // We create a manager with device index, and queues by queue family index + kp::Manager mgr(deviceIndex, familyIndices); + + +We are now able to create sequences with a particular queue. + +By default the Kompute Manager is created with device 0, and with a single queue of the first compatible familyIndex. Similarly, by default sequences are created with the first available queue. + +In this case we are able to specify which queue we want to use. Below we initialize "queueOne" named sequence with the graphics family queue, and "queueTwo" with the compute family queue. + +It's worth mentioning you can have multiple sequences referencing the same queue. + +.. code-block:: cpp + :linenos: + + // We need to create explicit sequences with their respective queues + // The second parameter is the index in the familyIndex array which is relative + // to the vector we created the manager with. + sqOne = mgr.sequence(0); + sqTwo = mgr.sequence(1); + +We create the tensors without modifications. + +.. 
code-block:: cpp + :linenos: + + // Creates tensor and initializes GPU memory (below we show more granularity) + auto tensorA = mgr.tensor({ 10, 0.0 }); + auto tensorB = mgr.tensor({ 10, 0.0 }); + + // Copies the data into GPU memory + mgr.sequence().eval({tensorA tensorB}); + +Similar to the asynchronous use case above, we can still run synchronous commands without modifications. + +.. code-block:: cpp + :linenos: + + // Define your shader as a string (using string literals for simplicity) + // (You can also pass the raw compiled bytes, or even path to file) + std::string shader(R"( + #version 450 + + layout (local_size_x = 1) in; + + layout(set = 0, binding = 0) buffer b { float pb[]; }; + + shared uint sharedTotal[1]; + + void main() { + uint index = gl_GlobalInvocationID.x; + + sharedTotal[0] = 0; + + // Iterating to simulate longer process + for (int i = 0; i < 100000000; i++) + { + atomicAdd(sharedTotal[0], 1); + } + + pb[index] = sharedTotal[0]; + } + )"); + + std::vector spirv = kp::Shader::compile_source(shader); + + std::shared_ptr algo = mgr.algorithm({tensorA, tenssorB}, spirv); + +Now we can actually trigger the parallel processing, running two OpAlgoBase Operations - each in a different sequence / queue. + +.. code-block:: cpp + :linenos: + + // Run the first parallel operation in the `queueOne` sequence + sqOne->evalAsync(algo); + + // Run the second parallel operation in the `queueTwo` sequence + sqTwo->evalAsync(algo); + + +Similar to the asynchronous example above, we are able to do other work whilst the tasks are executing. + +We are able to wait for the tasks to complete by triggering the `evalOpAwait` on the respective sequence. + +..
code-block:: cpp + :linenos: + + // Here we can do other work + + // We can now wait for the two parallel tasks to finish + sqOne.evalOpAwait() + sqTwo.evalOpAwait() + + // Sync the GPU memory back to the local tensor + mgr.sequence()->eval({ tensorA, tensorB }); + + // Prints the output: A: 100000000 B: 100000000 + std::cout << fmt::format("A: {}, B: {}", + tensorA.data()[0], tensorB.data()[0]) << std::endl; + diff --git a/docs/overview/async-parallel.rst b/docs/overview/async-parallel.rst index 0a31ef17f..6b1f68b62 100644 --- a/docs/overview/async-parallel.rst +++ b/docs/overview/async-parallel.rst @@ -40,257 +40,8 @@ One important thing to bare in mind when using asynchronous submissions, is that The reason why this is important is that the Await function not only waits for the fence, but also runs the `postEval` functions across all operations, which is required for several operations. -Async/Await Example -^^^^^^^^^^^^^^^^^^^^^ +Async and Parallel Examples +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -A simple example of asynchronous submission can be found below. - -First we are able to create the manager as we normally would. - -.. code-block:: cpp - :linenos: - - // You can allow Kompute to create the Vulkan components, or pass your existing ones - kp::Manager mgr; // Selects device 0 unless explicitly requested - - // Creates tensor an initializes GPU memory (below we show more granularity) - auto tensor = std::make_shared(kp::Tensor(std::vector(10, 0.0))); - -We can now run our first asynchronous command, which in this case we can use the default sequence. - -Sequences can be executed in synchronously or asynchronously without having to change anything. - -.. code-block:: cpp - :linenos: - - // Create tensors data explicitly in GPU with an operation - mgr.rebuild({ tensor }); - - -While this is running we can actually do other things like in this case create the shader we'll be using. - -In this case we create a shader that should take a couple of milliseconds to run. 
- -.. code-block:: cpp - :linenos: - - // Define your shader as a string (using string literals for simplicity) - // (You can also pass the raw compiled bytes, or even path to file) - std::string shader(R"( - #version 450 - - layout (local_size_x = 1) in; - - layout(set = 0, binding = 0) buffer b { float pb[]; }; - - shared uint sharedTotal[1]; - - void main() { - uint index = gl_GlobalInvocationID.x; - - sharedTotal[0] = 0; - - // Iterating to simulate longer process - for (int i = 0; i < 100000000; i++) - { - atomicAdd(sharedTotal[0], 1); - } - - pb[index] = sharedTotal[0]; - } - )"); - -Now we are able to run the await function on the default sequence. - -If we are using the manager, we need to make sure that we are awaiting the same named sequence that was triggered asynchronously. - -If the sequence is not running or has finished running, it would return immediately. - -The parameter provided is the maximum amount of time to wait in nanoseconds. When the timeout expires, the sequence would return (with false value), but it does not stop the processing in the GPU - the processing would continue as normal. - -.. code-block:: cpp - :linenos: - - // We can now await for the previous submitted command - // The first parameter can be the amount of time to wait - // The time provided is in nanoseconds - mgr.evalOpAwaitDefault(10000); - - -Similar to above we can run other commands such as the `OpAlgoBase` asynchronously. - -.. code-block:: cpp - :linenos: - - // Run Async Kompute operation on the parameters provided - mgr.evalOpAsyncDefault>( - { tensor }, - kp::Shader::compile_source(shader)); - - // Here we can do other work - - // When we're ready we can wait - // The default wait time is UINT64_MAX - mgr.evalOpAwaitDefault() - - -Finally, below you can see that we can also run syncrhonous commands without having to change anything. - -.. 
code-block:: cpp - :linenos: - - // Sync the GPU memory back to the local tensor - // We can still run synchronous jobs in our created sequence - mgr.evalOpDefault({ tensor }); - - // Prints the output: B: { 100000000, ... } - std::cout << fmt::format("B: {}", - tensor.data()) << std::endl; - - -Parallel Operation Submission ------------ - -In order to work with parallel execution of tasks, it is important that you understand some of the core GPU processing limitations, as these can be quite broad and hardware dependent, which means they will vary across NVIDIA / AMD / ETC video cards. - -Conceptual Overview -^^^^^^^^^^^^^^^^^^^^^ - -If you are familiar with Vulkan, you will have experience that the first few things you do is fetching the physical Queues from the device. The queues themselves tend to have three main particular features - they can be GRAPHICS, TRANSFER and COMPUTE (between a few others we'll skip for simplicity). - -Queues can have multiple properties - namely a queue can be of type GRAPHICS+TRANSFER+COMPUTE, etc. Now here comes the key point: the underlying hardware may (or may not) support parallelized processing at multiple levels. - -Let's take a tangible example. The [NVIDIA 1650](http://vulkan.gpuinfo.org/displayreport.php?id=9700#queuefamilies) for example has 16 `GRAPHICS+TRANSFER+COMPUTE` queues on `familyIndex 0`, then 2 `TRANSFER` queues in `familyIndex 1` and finally 8 `COMPUTE+TRANSFER` queues in `familyIndex 2`. - -With this in mind, the NVIDIA 1650 as of today does not support intra-family parallelization, which means that if you were to submit commands in multiple queues of the same family, these would still be exectured synchronously. - -However the NVIDIA 1650 does support inter-family parallelization, which means that if we were to submit commands across multiple queues from different families, these would execute in parallel. 
- -This means that we would be able to execute parallel workloads as long as we're running them across multiple queue families. This is one of the reasons why Vulkan Kompute enables users to explicitly select the underlying queues and queue families to run particular workloads on. - -It is important that you understand what are the capabilities and limitations of your hardware, as parallelization capabilities can vary, so you will want to make sure you account for potential discrepancies in processing structures, mainyl to avoid undesired/unexpected race conditions. - -Parallel Execution Example -^^^^^^^^^^^^^^^^^^^^^ - -In this example we will demonstrate how you can set up parallel processing across two compute families to achieve 2x speedups when running processing workloads. - -To start, you will see that we do have to create the manager with extra parameters. This includes the GPU device index we want to use, together with the array of the queues that we want to enable. - -In this case we are using only two queues, which as per the section above, these would be familyIndex 0 which is of type `GRAPHICS+COMPUTE+TRANSFER` and familyIndex 2 which is of type `COMPUTE+TRANSFER`. - -In this case based on the specifications of the NVIDIA 1650 we could define up to 16 graphics queues (familyIndex 0), 2 transfer queues (familyIndex 1), and 8 compute queues (familyIndex 2) in no particular order. This means that we could have something like `{ 0, 1, 1, 2, 2, 2, 0, ... }` as our initialization value. - -You will want to keep track of the indices you initialize your manager, as you will be referring back to this ordering when creating sequences with particular queues. - -.. 
code-block:: cpp - :linenos: - - // In this case we select device 0, and for queues, one queue from familyIndex 0 - // and one queue from familyIndex 2 - uint32_t deviceIndex(0); - std::vector familyIndices = {0, 2}; - - // We create a manager with device index, and queues by queue family index - kp::Manager mgr(deviceIndex, familyIndices); - -We are now able to create sequences with a particular queue. - -By default the Kompute Manager is created with device 0, and with a single queue of the first compatible familyIndex. Similarly, by default sequences are created with the first available queue. - -In this case we are able to specify which queue we want to use. Below we initialize "queueOne" named sequence with the graphics family queue, and "queueTwo" with the compute family queue. - -It's worth mentioning you can have multiple sequences referencing the same queue. - -.. code-block:: cpp - :linenos: - - // We need to create explicit sequences with their respective queues - // The second parameter is the index in the familyIndex array which is relative - // to the vector we created the manager with. - mgr.sequence("queueOne", 0); - mgr.sequence("queueTwo", 1); - -We create the tensors without modifications. - -.. code-block:: cpp - :linenos: - - // Creates tensor an initializes GPU memory (below we show more granularity) - auto tensorA = std::make_shared(kp::Tensor(std::vector(10, 0.0))); - auto tensorB = std::make_shared(kp::Tensor(std::vector(10, 0.0))); - -Similar to the asyncrhonous usecase above, we can still run synchronous commands without modifications. - -.. 
code-block:: cpp - :linenos: - - // We run the first step synchronously on the default sequence - mgr.rebuild({ tensorA, tensorB }); - - // Define your shader as a string (using string literals for simplicity) - // (You can also pass the raw compiled bytes, or even path to file) - std::string shader(R"( - #version 450 - - layout (local_size_x = 1) in; - - layout(set = 0, binding = 0) buffer b { float pb[]; }; - - shared uint sharedTotal[1]; - - void main() { - uint index = gl_GlobalInvocationID.x; - - sharedTotal[0] = 0; - - // Iterating to simulate longer process - for (int i = 0; i < 100000000; i++) - { - atomicAdd(sharedTotal[0], 1); - } - - pb[index] = sharedTotal[0]; - } - )"); - -Now we can actually trigger the parallel processing, running two OpAlgoBase Operations - each in a different sequence / queue. - -.. code-block:: cpp - :linenos: - - std::vector spirv = kp::Shader::compile_source(shader); - - // Run the first parallel operation in the `queueOne` sequence - mgr.evalOpAsync>( - { tensorA }, - "queueOne", - spirv); - - // Run the second parallel operation in the `queueTwo` sequence - mgr.evalOpAsync>( - { tensorB }, - "queueTwo", - spirv); - - -Similar to the asynchronous example above, we are able to do other work whilst the tasks are executing. - -We are able to wait for the tasks to complete by triggering the `evalOpAwait` on the respective sequence. - -.. 
code-block:: cpp - :linenos: - - // Here we can do other work - - // We can now wait for the two parallel tasks to finish - mgr.evalOpAwait("queueOne") - mgr.evalOpAwait("queueTwo") - - // Sync the GPU memory back to the local tensor - mgr.evalOp({ tensorA, tensorB }); - - // Prints the output: A: 100000000 B: 100000000 - std::cout << fmt::format("A: {}, B: {}", - tensorA.data()[0], tensorB.data()[0]) << std::endl; +We have added a set of asynchronous and parallel processing examples in the `Advanced Examples documentation page <advanced-examples.html>`_ diff --git a/docs/overview/custom-operations.rst b/docs/overview/custom-operations.rst index 4947196cd..21f1fb82c 100644 --- a/docs/overview/custom-operations.rst +++ b/docs/overview/custom-operations.rst @@ -39,74 +39,19 @@ Below you Simple Operation Extending OpAlgoBase ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Below we show a very simple example that enables you to create an operation with a pre-specified shader. In this case it is the multiplication shader. +You can find an example in the `Advanced Examples documentation section <advanced-examples.html>`_ that shows how to create your own custom function. -.. code-block:: cpp - :linenos: - - class OpMyCustom : public OpAlgoBase - { - public: - OpMyCustom(std::shared_ptr physicalDevice, - std::shared_ptr device, - std::shared_ptr commandBuffer, - std::vector> tensors) - : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "") - { - // Perform your custom steps such as reading from a shader file - this->mShaderFilePath = "shaders/glsl/opmult.comp"; - } - } +You can also see an implementation in the codebase through the `OpMult` class: - int main() { - - kp::Manager mgr; // Automatically selects Device 0 - - // Create 3 tensors of default type float - auto tensorLhs = std::make_shared(kp::Tensor({ 0., 1., 2. })); - auto tensorRhs = std::make_shared(kp::Tensor({ 2., 4., 6. })); - auto tensorOut = std::make_shared(kp::Tensor({ 0., 0., 0. 
})); - - // Create tensors data explicitly in GPU with an operation - mgr.evalOpDefault({ tensorLhs, tensorRhs, tensorOut }); - - // Run Kompute operation on the parameters provided with dispatch layout - mgr.evalOpDefault( - { tensorLhs, tensorRhs, tensorOut }); - - // Prints the output which is { 0, 4, 12 } - std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl; - } - - -More Complex Operation Extending OpAlgoBase -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Below we show a more complex operation that performs the following: - -* Expects three tensors for an operation, two inputs and one output -* Expects the tensors to be initialised -* Checks that the tensors are of the same size -* Expects output tensor to be of type TensorTypes::eDevice (and creates staging tensor) -* Has functionality to read shader from file or directly from spirv bytes -* Records relevant bufferMemoryBarriers -* Records dispatch command -* Records copy command from device tensor to staging output tensor -* In postEval it maps data from staging tensor to output tensor's data - - -For starters, the header file contains the functions that will be overriden: - - -.. literalinclude:: ../../src/include/kompute/operations/OpAlgoLhsRhsOut.hpp +.. literalinclude:: ../../src/include/kompute/operations/OpMult.hpp :language: cpp Then the implementation outlines all the implementations that perform the actions above: ~~~~~~~~~~~~~~~~~~~ -.. literalinclude:: ../../src/OpAlgoLhsRhsOut.cpp +.. 
literalinclude:: ../../src/OpMult.cpp :language: cpp diff --git a/docs/overview/memory-management.rst b/docs/overview/memory-management.rst index f47b989cd..5ecfd7c0f 100755 --- a/docs/overview/memory-management.rst +++ b/docs/overview/memory-management.rst @@ -4,18 +4,22 @@ Memory Management Principles The principle in Vulkan Kompute on memory management is summarised as follows: -* Explicit is better than implicit for specifying memory management -* Interfaces for memory management are constant until freed -* Memory management responsibilities are acyclic from static object references * Memory management by Kompute is optional and only in place if resource is created by Kompute +* The memory management ownership architecture is acyclic, with a single top-level manager +* Operations do not manage any GPU memory or resources +* The top-level manager is the main owner of GPU resources and removes all resources when destroyed +* The manager holds weak pointers to ensure that if an object created outside is destroyed, it is released +* Once a resource is destroyed it cannot be recreated +* Resources can only be rebuilt if they haven't been destroyed -Vulkan Kompute is responsible for managing both the CPU and GPU memory allocations and resources, and is important that they are able to explicitly define when these objects are released or destroyed. Similarly, it's important that the memory resources created by the application are released safely. +Vulkan Kompute is responsible for managing both the CPU and GPU memory allocations and resources that it creates, and it is important that users are able to explicitly define when these objects are released or destroyed. Similarly, it's important that the memory resources created by the application are released safely. -Vulkan Kompute is built with the BYOV principle in mind (Bring your own Vulkan). 
This means that even though the top level resources are managing the memory to its owned resources, they themselves may not have full ownership of the GPU / Vulkan components themselves. +Vulkan Kompute is built with the BYOV principle in mind (Bring your own Vulkan). This means that even though the top level resources are managing the memory to its owned resources, they themselves may not have full ownership of the GPU / Vulkan components - this is in the case that you may want to use Kompute with an existing Vulkan enabled application, and may want to initialise Kompute components with existing Vulkan resources. -The memory ownership is hierarchically outlined in the component architecture - in this diagram, the arrows provide an intuition on the memory management ownership relationships (in this case you can ignore the arrow from the Algorithm, as this is the only one that as of today doesn't manage the memory of the Tensors). +The memory ownership is hierarchically outlined in the component architecture - in this diagram, the arrows provide an intuition on the memory management ownership relationships. It's worth mentioning that the memory relationship may be different to the way components interact with each other - for this, you can see the high level component overview. More specifically: +* The purple arrows denote GPU memory management -.. image:: ../images/kompute-architecture.jpg +.. image:: ../images/kompute-vulkan-architecture.jpg :width: 100% Optional Memory Management diff --git a/docs/overview/python-examples.rst b/docs/overview/python-examples.rst index 7c160dcfd..ac6417928 100644 --- a/docs/overview/python-examples.rst +++ b/docs/overview/python-examples.rst @@ -14,17 +14,19 @@ Then you can interact with it from your interpreter. Below is the same sample as .. 
code-block:: python :linenos: - from kp import Manager, Tensor + from kp import Manager, Tensor, OpTensorSyncDevice, OpTensorSyncLocal, OpAlgoDispatch from pyshader import python2shader, ivec3, f32, Array mgr = Manager() # Can be initialized with List[] or np.Array - tensor_in_a = Tensor([2, 2, 2]) - tensor_in_b = Tensor([1, 2, 3]) - tensor_out = Tensor([0, 0, 0]) + tensor_in_a = mgr.tensor([2, 2, 2]) + tensor_in_b = mgr.tensor([1, 2, 3]) + tensor_out = mgr.tensor([0, 0, 0]) - mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out]) + sq = mgr.sequence() + + sq.eval(OpTensorSyncDevice([tensor_in_a, tensor_in_b, tensor_out])) # Define the function via PyShader or directly as glsl string or spirv bytes @python2shader @@ -35,15 +37,13 @@ Then you can interact with it from your interpreter. Below is the same sample as i = index.x data3[i] = data1[i] * data2[i] + algo = mgr.algorithm([tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv()) + # Run shader operation synchronously - mgr.eval_algo_data_def( - [tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv()) + sq.eval(OpAlgoDispatch(algo)) + sq.eval(OpTensorSyncLocal([tensor_out])) - mgr.eval_await_def() - - mgr.eval_tensor_sync_local_def([tensor_out]) - - assert tensor_out.data() == [2.0, 4.0, 6.0] + assert tensor_out.data().tolist() == [2.0, 4.0, 6.0] Python Example (Extended) @@ -55,6 +55,7 @@ Similarly you can find the same extended example as above: :linenos: from kp import Manager, Tensor + import kp from pyshader import python2shader, ivec3, f32, Array mgr = Manager(0, [2]) @@ -77,20 +78,19 @@ Similarly you can find the same extended example as above: i = index.x data3[i] = data1[i] * data2[i] - # Run shader operation asynchronously and then await - mgr.eval_async_algo_data_def( - [tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv()) - mgr.eval_await_def() + algo = mgr.algorithm([tensor_in_a, tensor_in_b, tensor_out], 
compute_shader_multiply.to_spirv()) - seq.begin() - seq.record_tensor_sync_local([tensor_in_a]) - seq.record_tensor_sync_local([tensor_in_b]) - seq.record_tensor_sync_local([tensor_out]) - seq.end() + # Run shader operation asynchronously and then await + mgr.eval_async(kp.OpAlgoDispatch(algo)) + mgr.eval_await() + + seq.record(kp.OpTensorSyncLocal([tensor_in_a])) + seq.record(kp.OpTensorSyncLocal([tensor_in_b])) + seq.record(kp.OpTensorSyncLocal([tensor_out])) seq.eval() - assert tensor_out.data() == [2.0, 4.0, 6.0] + assert tensor_out.data().tolist() == [2.0, 4.0, 6.0] Kompute Operation Capabilities ^^^^^ @@ -101,33 +101,29 @@ Handling multiple capabilites of processing can be done by compute shaders being :linenos: from kp import Manager + import kp # We'll assume we have the shader data available from my_spv_shader_data import mult_shader, sum_shader mgr = Manager() - t1 = mgr.build_tensor([2,2,2]) - t2 = mgr.build_tensor([1,2,3]) - t3 = mgr.build_tensor([1,2,3]) + t1 = mgr.tensor([2,2,2]) + t2 = mgr.tensor([1,2,3]) + t3 = mgr.tensor([1,2,3]) + + mgr.sequence().eval(kp.OpTensorSyncLocal([t1, t3])) # Create multiple separate sequences - sq_mult = mgr.create_sequence("SQ_MULT") - sq_sum = mgr.create_sequence("SQ_SUM") - sq_sync = mgr.create_sequence("SQ_SYNC") + sq_mult = mgr.sequence() + sq_sum = mgr.sequence() + sq_sync = mgr.sequence() - # Initialize sq_mult - sq_mult.begin() - sq_mult.record_algo_data([t1, t2, t3], add_shader) - sq_mult.end() + sq_mult.record(kp.OpAlgoDispatch(mgr.algorithm([t1, t2, t3], mult_shader))) - sq_sum.begin() - sq_sum.record_algo_data([t3, t2, t1], sum_shader) - sq_sum.end() + sq_sum.record(kp.OpAlgoDispatch(mgr.algorithm([t3, t2, t1], sum_shader))) - sq_sync.begin() - sq_sync.record_tensor_sync_local([t1, t3]) - sq_sync.end() + sq_sync.record(kp.OpTensorSyncLocal([t1, t3])) # Run multiple iterations for i in range(10): @@ -147,6 +143,7 @@ Similar to the logistic regression implementation in the C++ examples section, b :linenos: from 
kp import Manager, Tensor + import kp from pyshader import python2shader, ivec3, f32, Array @python2shader @@ -189,38 +186,37 @@ Similar to the logistic regression implementation in the C++ examples section, b l_out[i] = loss + mgr = Manager() + # First we create input and ouput tensors for shader - tensor_x_i = Tensor([0.0, 1.0, 1.0, 1.0, 1.0]) - tensor_x_j = Tensor([0.0, 0.0, 0.0, 1.0, 1.0]) + tensor_x_i = mgr.tensor([0.0, 1.0, 1.0, 1.0, 1.0]) + tensor_x_j = mgr.tensor([0.0, 0.0, 0.0, 1.0, 1.0]) - tensor_y = Tensor([0.0, 0.0, 0.0, 1.0, 1.0]) + tensor_y = mgr.tensor([0.0, 0.0, 0.0, 1.0, 1.0]) - tensor_w_in = Tensor([0.001, 0.001]) - tensor_w_out_i = Tensor([0.0, 0.0, 0.0, 0.0, 0.0]) - tensor_w_out_j = Tensor([0.0, 0.0, 0.0, 0.0, 0.0]) + tensor_w_in = mgr.tensor([0.001, 0.001]) + tensor_w_out_i = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0]) + tensor_w_out_j = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0]) - tensor_b_in = Tensor([0.0]) - tensor_b_out = Tensor([0.0, 0.0, 0.0, 0.0, 0.0]) + tensor_b_in = mgr.tensor([0.0]) + tensor_b_out = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0]) - tensor_l_out = Tensor([0.0, 0.0, 0.0, 0.0, 0.0]) + tensor_l_out = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0]) - tensor_m = Tensor([ 5.0 ]) + tensor_m = mgr.tensor([ 5.0 ]) # We store them in an array for easier interaction params = [tensor_x_i, tensor_x_j, tensor_y, tensor_w_in, tensor_w_out_i, tensor_w_out_j, tensor_b_in, tensor_b_out, tensor_l_out, tensor_m] - mgr = Manager() - - mgr.eval_tensor_create_def(params) + mgr.sequence().eval(kp.OpTensorSyncDevice(params)) # Record commands for efficient evaluation - sq = mgr.create_sequence() - sq.begin() - sq.record_tensor_sync_device([tensor_w_in, tensor_b_in]) - sq.record_algo_data(params, compute_shader.to_spirv()) - sq.record_tensor_sync_local([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out]) - sq.end() + sq = mgr.sequence() + + sq.record(kp.OpTensorSyncDevice([tensor_w_in, tensor_b_in])) + sq.record(kp.OpAlgoDispatch(mgr.algorithm(params, 
compute_shader.to_spirv()))) + sq.record(kp.OpTensorSyncLocal([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out])) ITERATIONS = 100 learning_rate = 0.1