Updated examples to new interface

This commit is contained in:
Alejandro Saucedo 2021-03-01 21:13:08 +00:00
parent f163aaf5e8
commit 7f686b47da
6 changed files with 410 additions and 607 deletions

173
README.md
View file

@ -48,7 +48,8 @@ Below you can find a GPU multiplication example using the C++ and Python Kompute
The C++ interface provides low-level access to the native components of Kompute and Vulkan, enabling [advanced optimizations](https://kompute.cc/overview/async-parallel.html) as well as [extension of components](https://kompute.cc/overview/reference.html).
```c++
int main() {
void kompute(const std::string& shader) {
// 1. Create Kompute Manager with default settings (device 0 and first compute compatible queue)
kp::Manager mgr;
@ -62,6 +63,42 @@ int main() {
std::vector<std::shared_ptr<kp::Tensor>> params = {tensorInA, tensorInB, tensorOutA, tensorOutB};
// 3. Create algorithm based on shader (supports buffers & push/spec constants)
kp::Workgroup workgroup({3, 1, 1});
kp::Constants specConsts({ 2 });
kp::Constants pushConstsA({ 2.0 });
kp::Constants pushConstsB({ 3.0 });
auto algorithm = mgr.algorithm(params,
kp::Shader::compile_source(shader),
workgroup,
specConsts);
// 4. Run operation synchronously using sequence
mgr.sequence()
->record<kp::OpTensorSyncDevice>(params)
->record<kp::OpAlgoDispatch>(algorithm, pushConstsA)
->record<kp::OpAlgoDispatch>(algorithm, pushConstsB)
->eval();
// 5. Sync results from the GPU asynchronously
sq = mgr.sequence()
sq->evalAsync<kp::OpTensorSyncLocal>(params);
// ... Do other work asynchronously whilst GPU finishes
sq->evalAwait();
// Prints the first output which is: { 4, 8, 12 }
for (const float& elem : tensorOutA->data()) std::cout << elem << " ";
// Prints the second output which is: { 10, 10, 10 }
for (const float& elem : tensorOutB->data()) std::cout << elem << " ";
} // Manages / releases all CPU and GPU memory resources
int main() {
// Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header
// files). This shader shows some of the main components including constants, buffers, etc
std::string shader = (R"(
#version 450
@ -88,33 +125,8 @@ int main() {
}
)");
kp::Workgroup workgroup({3, 1, 1});
kp::Constants specConsts({ 2 });
auto algorithm = mgr.algorithm(params, kp::Shader::compile_source(shader), workgroup, specConsts);
kp::Constants pushConstsA({ 2.0 });
kp::Constants pushConstsB({ 3.0 });
// 4. Run operation synchronously using sequence
mgr.sequence()
->record<kp::OpTensorSyncDevice>(params)
->record<kp::OpAlgoDispatch>(algorithm, pushConstsA)
->record<kp::OpAlgoDispatch>(algorithm, pushConstsB)
->eval();
// 5. Sync results from the GPU asynchronously
sq = mgr.sequence()
sq->evalAsync<kp::OpTensorSyncLocal>(params);
// ... Do other work asynchronously whilst GPU finishes
sq->evalAwait();
// Prints the first output which is: { 4, 8, 12 }
for (const float& elem : tensorOutA->data()) std::cout << elem << " ";
// Prints the second output which is: { 10, 10, 10 }
for (const float& elem : tensorOutB->data()) std::cout << elem << " ";
// Run the function declared above with our raw string shader
kompute(shader);
}
```
@ -125,70 +137,77 @@ The [Python package](https://kompute.cc/overview/python-package.html) provides a
```python
# 1. Create Kompute Manager with default settings (device 0 and first compute compatible queue)
mgr = kp.Manager()
def kompute(shader):
# 1. Create Kompute Manager with default settings (device 0 and first compute compatible queue)
mgr = kp.Manager()
# 2. Create and initialise Kompute Tensors through manager
tensor_in_a = mgr.tensor([2, 2, 2])
tensor_in_b = mgr.tensor([1, 2, 3])
tensor_out_a = mgr.tensor([0, 0, 0])
tensor_out_b = mgr.tensor([0, 0, 0])
# 2. Create and initialise Kompute Tensors through manager
tensor_in_a = mgr.tensor([2, 2, 2])
tensor_in_b = mgr.tensor([1, 2, 3])
tensor_out_a = mgr.tensor([0, 0, 0])
tensor_out_b = mgr.tensor([0, 0, 0])
params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b]
params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b]
# 3. Create algorithm based on shader (supports buffers & push/spec constants)
shader = """
#version 450
# 3. Create algorithm based on shader (supports buffers & push/spec constants)
workgroup = (3, 1, 1)
spec_consts = [2]
push_consts_a = [2]
push_consts_b = [3]
layout (local_size_x = 1) in;
algo = mgr.algorithm(params, kp.Shader.compile_source(shader), workgroup, spec_consts)
// The input tensors bind index is relative to index in parameter passed
layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; };
# 4. Run operation synchronously using sequence
(mgr.sequence()
.record(kp.OpTensorSyncDevice(params))
.record(kp.OpAlgoDispatch(algo, push_consts_a))
.record(kp.OpAlgoDispatch(algo, push_consts_b))
.eval())
// Kompute supports push constants updated on dispatch
layout(push_constant) uniform PushConstants {
float val;
} push_const;
# 5. Sync results from the GPU asynchronously
sq = mgr.sequence()
sq.eval_async(kp.OpTensorSyncLocal(params))
// Kompute also supports spec constants on initalization
layout(constant_id = 0) const float const_one = 0;
# ... Do other work asynchronously whilst GPU finishes
void main() {
uint index = gl_GlobalInvocationID.x;
out_a[index] += in_a[index] * in_b[index];
out_b[index] += const_one * push_const.val;
}
"""
sq.eval_await()
workgroup = (3, 1, 1)
spec_consts = [2]
push_consts_a = [2]
push_consts_b = [3]
# Prints the first output which is: { 4, 8, 12 }
print(tensor_out_a)
# Prints the second output which is: { 10, 10, 10 }
print(tensor_out_b)
algo = mgr.algorithm(params, kp.Shader.compile_source(shader), workgroup, spec_consts)
if __name__ == "__main__":
# 4. Run operation synchronously using sequence
(mgr.sequence()
.record(kp.OpTensorSyncDevice(params))
.record(kp.OpAlgoDispatch(algo, push_consts_a))
.record(kp.OpAlgoDispatch(algo, push_consts_b))
.eval())
# Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header
# files). This shader shows some of the main components including constants, buffers, etc
shader = """
#version 450
# 5. Sync results from the GPU asynchronously
sq = mgr.sequence()
sq.eval_async(kp.OpTensorSyncLocal(params))
layout (local_size_x = 1) in;
# ... Do other work asynchronously whilst GPU finishes
// The input tensors bind index is relative to index in parameter passed
layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; };
sq.eval_await()
// Kompute supports push constants updated on dispatch
layout(push_constant) uniform PushConstants {
float val;
} push_const;
# Prints the first output which is: { 4, 8, 12 }
print(tensor_out_a)
# Prints the second output which is: { 10, 10, 10 }
print(tensor_out_b)
// Kompute also supports spec constants on initalization
layout(constant_id = 0) const float const_one = 0;
void main() {
uint index = gl_GlobalInvocationID.x;
out_a[index] += in_a[index] * in_b[index];
out_b[index] += const_one * push_const.val;
}
"""
kompute(shader)
```

View file

@ -10,13 +10,9 @@ The power of Kompute comes in when the interface is used for complex computation
Simple examples
^^^^^^^^^^^^^^^
* `Pass shader as raw string <#simple-shader-example>`_
* `Record batch commands with a Kompute Sequence <#record-batch-commands>`_
* `Create your custom Kompute Operations <#your-custom-kompute-operation>`_
* `Run Asynchronous Operations <#asynchronous-operations>`_
* `Run Parallel Operations Across Multiple GPU Queues <#parallel-operations>`_
* `Create your custom Kompute Operations <#your-custom-kompute-operation>`_
* `Implementing logistic regression from scratch <#logistic-regression-example>`_
End-to-end examples
^^^^^^^^^^^^^^^^^^^
@ -28,156 +24,6 @@ End-to-end examples
* `Game Development Kompute ML in Godot Engine <https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0>`_
Asynchronous Operations
~~~~~~~~~~~~~~~~~~~~~~~
You can submit operations asynchronously with the async/await commands in the kp::Manager and kp::Sequence, which provides granularity on waiting on the vk::Fence. Back to `examples list <#simple-examples>`_
.. code-block:: cpp
:linenos:
int main() {
// You can allow Kompute to create the Vulkan components, or pass your existing ones
kp::Manager mgr; // Selects device 0 unless explicitly requested
// Creates tensor and initializes GPU memory (below we show more granularity)
auto tensor = mgr.tensor(10, 0.0);
// Define your shader as a string (using string literals for simplicity)
// (You can also pass the raw compiled bytes, or even path to file)
std::string shader(R"(
#version 450
layout (local_size_x = 1) in;
layout(set = 0, binding = 0) buffer b { float pb[]; };
shared uint sharedTotal[1];
void main() {
uint index = gl_GlobalInvocationID.x;
sharedTotal[0] = 0;
// Iterating to simulate longer process
for (int i = 0; i < 100000000; i++)
{
atomicAdd(sharedTotal[0], 1);
}
pb[index] = sharedTotal[0];
}
)");
std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
auto sq = mgr.sequence();
sq.eval<kp::OpTensorSyncDevice>({tensor});
sq.evalAsync<kp::OpAlgoDispatch>(mgr.algorithm({tensor}, spirv));
// When we're ready we can wait
// The default wait time is UINT64_MAX
sq.evalAwait(10000)
// Sync the GPU memory back to the local tensor
// We can still run synchronous jobs in our created sequence
sq.eval<kp::OpTensorSyncLocal>({ tensor });
// Prints the output: B: { 100000000, ... }
std::cout << fmt::format("B: {}",
tensor.data()) << std::endl;
}
Parallel Operations
~~~~~~~~~~~~~~~~~~~
Besides being able to submit asynchronous operations, you can also leverage the underlying GPU compute queues to process operations in parallel.
This will depend on your underlying graphics card, but for example in NVIDIA graphics cards the operations submitted across queues in one family are not parallelizable, but operations submitted across queueFamilies can be parallelizable.
Below we show how you can parallelize operations in an `NVIDIA 1650 <http://vulkan.gpuinfo.org/displayreport.php?id=9700#queuefamilies>`_\ , which has a ``GRAPHICS+COMPUTE`` family on ``index 0``\ , and ``COMPUTE`` family on ``index 2``.
Back to `examples list <#simple-examples>`_.
.. code-block:: cpp
:linenos:
int main() {
// In this case we select device 0, and for queues, one queue from familyIndex 0
// and one queue from familyIndex 2
uint32_t deviceIndex(0);
std::vector<uint32_t> familyIndices = {0, 2};
// We create a manager with device index, and queues by queue family index
kp::Manager mgr(deviceIndex, familyIndices);
// Creates tensor and initializes GPU memory (below we show more granularity)
auto tensorA = mgr.tensor({ 10, 0.0 });
auto tensorB = mgr.tensor({ 10, 0.0 });
// Copies the data into GPU memory
mgr.sequence().eval<kp::OpTensorSyncDevice>({tensorA tensorB});
// Define your shader as a string (using string literals for simplicity)
// (You can also pass the raw compiled bytes, or even path to file)
std::string shader(R"(
#version 450
layout (local_size_x = 1) in;
layout(set = 0, binding = 0) buffer b { float pb[]; };
shared uint sharedTotal[1];
void main() {
uint index = gl_GlobalInvocationID.x;
sharedTotal[0] = 0;
// Iterating to simulate longer process
for (int i = 0; i < 100000000; i++)
{
atomicAdd(sharedTotal[0], 1);
}
pb[index] = sharedTotal[0];
}
)");
std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
std::shared_ptr<kp::Algorithm> algo = mgr.algorithm({tensorA, tenssorB}, spirv);
// We need to create explicit sequences with their respective queues
// The second parameter is the index in the familyIndex array which is relative
// to the vector we created the manager with.
sqOne = mgr.sequence(0);
sqTwo = mgr.sequence(1);
// Run the first parallel operation in the `queueOne` sequence
sqOne->evalAsync<kp::OpAlgoDispatch>(algo);
// Run the second parallel operation in the `queueTwo` sequence
sqTwo->evalAsync<kp::OpAlgoDispatch>(algo);
// Here we can do other work
// We can now wait for the two parallel tasks to finish
sqOne.evalOpAwait()
sqTwo.evalOpAwait()
// Sync the GPU memory back to the local tensor
mgr.sequence()->eval<kp::OpTensorSyncLocal>({ tensorA, tensorB });
// Prints the output: A: 100000000 B: 100000000
std::cout << fmt::format("A: {}, B: {}",
tensorA.data()[0], tensorB.data()[0]) << std::endl;
}
Your Custom Kompute Operation
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -252,4 +98,246 @@ We also provide tools that allow you to `convert shaders into C++ headers <https
std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl;
}
Async/Await Example
^^^^^^^^^^^^^^^^^^^^^
A simple example of asynchronous submission can be found below.
First we are able to create the manager as we normally would.
.. code-block:: cpp
:linenos:
// You can allow Kompute to create the Vulkan components, or pass your existing ones
kp::Manager mgr; // Selects device 0 unless explicitly requested
// Creates tensor and initializes GPU memory (below we show more granularity)
auto tensor = mgr.tensor(10, 0.0);
We can now run our first asynchronous command, which in this case we can use the default sequence.
Sequences can be executed synchronously or asynchronously without having to change anything.
.. code-block:: cpp
:linenos:
// Create tensors data explicitly in GPU with an operation
mgr.sequence()->eval<kp::OpTensorSyncDevice>({tensor});
While this is running we can actually do other things like in this case create the shader we'll be using.
In this case we create a shader that should take a couple of milliseconds to run.
.. code-block:: cpp
:linenos:
// Define your shader as a string (using string literals for simplicity)
// (You can also pass the raw compiled bytes, or even path to file)
std::string shader(R"(
#version 450
layout (local_size_x = 1) in;
layout(set = 0, binding = 0) buffer b { float pb[]; };
shared uint sharedTotal[1];
void main() {
uint index = gl_GlobalInvocationID.x;
sharedTotal[0] = 0;
// Iterating to simulate longer process
for (int i = 0; i < 100000000; i++)
{
atomicAdd(sharedTotal[0], 1);
}
pb[index] = sharedTotal[0];
}
)");
auto algo = mgr.algorithm({tensor}, kp::Shader::compile_source(shader));
Now we are able to run the await function on the default sequence.
If we are using the manager, we need to make sure that we are awaiting the same named sequence that was triggered asynchronously.
If the sequence is not running or has finished running, it would return immediately.
The parameter provided is the maximum amount of time to wait in nanoseconds. When the timeout expires, the sequence would return (with false value), but it does not stop the processing in the GPU - the processing would continue as normal.
.. code-block:: cpp
:linenos:
auto sq = mgr.sequence()
// Run Async Kompute operation on the parameters provided
sq->evalAsync<kp::OpAlgoDispatch>(algo);
// Here we can do other work
// When we're ready we can wait
// The default wait time is UINT64_MAX
sq.evalAwait()
Finally, below you can see that we can also run synchronous commands without having to change anything.
.. code-block:: cpp
:linenos:
// Sync the GPU memory back to the local tensor
// We can still run synchronous jobs in our created sequence
sq.eval<kp::OpTensorSyncLocal>({ tensor });
// Prints the output: B: { 100000000, ... }
std::cout << fmt::format("B: {}",
tensor.data()) << std::endl;
Parallel Operation Submission
-----------------------------
In order to work with parallel execution of tasks, it is important that you understand some of the core GPU processing limitations, as these can be quite broad and hardware dependent, which means they will vary across NVIDIA / AMD / ETC video cards.
Conceptual Overview
^^^^^^^^^^^^^^^^^^^^^
If you are familiar with Vulkan, you will know that one of the first things you do is fetch the physical queues from the device. The queues themselves tend to have three main capabilities - they can be GRAPHICS, TRANSFER and COMPUTE (among a few others we'll skip for simplicity).
Queues can have multiple properties - namely a queue can be of type GRAPHICS+TRANSFER+COMPUTE, etc. Now here comes the key point: the underlying hardware may (or may not) support parallelized processing at multiple levels.
Let's take a tangible example. The `NVIDIA 1650 <http://vulkan.gpuinfo.org/displayreport.php?id=9700#queuefamilies>`_ for example has 16 ``GRAPHICS+TRANSFER+COMPUTE`` queues on ``familyIndex 0``, then 2 ``TRANSFER`` queues in ``familyIndex 1`` and finally 8 ``COMPUTE+TRANSFER`` queues in ``familyIndex 2``.
With this in mind, the NVIDIA 1650 as of today does not support intra-family parallelization, which means that if you were to submit commands in multiple queues of the same family, these would still be executed synchronously.
However the NVIDIA 1650 does support inter-family parallelization, which means that if we were to submit commands across multiple queues from different families, these would execute in parallel.
This means that we would be able to execute parallel workloads as long as we're running them across multiple queue families. This is one of the reasons why Vulkan Kompute enables users to explicitly select the underlying queues and queue families to run particular workloads on.
It is important that you understand the capabilities and limitations of your hardware, as parallelization capabilities can vary, so you will want to make sure you account for potential discrepancies in processing structures, mainly to avoid undesired/unexpected race conditions.
Parallel Execution Example
^^^^^^^^^^^^^^^^^^^^^^^^^^
In this example we will demonstrate how you can set up parallel processing across two compute families to achieve 2x speedups when running processing workloads.
To start, you will see that we do have to create the manager with extra parameters. This includes the GPU device index we want to use, together with the array of the queues that we want to enable.
In this case we are using only two queues, which as per the section above, these would be familyIndex 0 which is of type `GRAPHICS+COMPUTE+TRANSFER` and familyIndex 2 which is of type `COMPUTE+TRANSFER`.
In this case based on the specifications of the NVIDIA 1650 we could define up to 16 graphics queues (familyIndex 0), 2 transfer queues (familyIndex 1), and 8 compute queues (familyIndex 2) in no particular order. This means that we could have something like `{ 0, 1, 1, 2, 2, 2, 0, ... }` as our initialization value.
You will want to keep track of the indices you initialize your manager with, as you will be referring back to this ordering when creating sequences with particular queues.
.. code-block:: cpp
:linenos:
// In this case we select device 0, and for queues, one queue from familyIndex 0
// and one queue from familyIndex 2
uint32_t deviceIndex(0);
std::vector<uint32_t> familyIndices = {0, 2};
// We create a manager with device index, and queues by queue family index
kp::Manager mgr(deviceIndex, familyIndices);
We are now able to create sequences with a particular queue.
By default the Kompute Manager is created with device 0, and with a single queue of the first compatible familyIndex. Similarly, by default sequences are created with the first available queue.
In this case we are able to specify which queue we want to use. Below we initialize "queueOne" named sequence with the graphics family queue, and "queueTwo" with the compute family queue.
It's worth mentioning you can have multiple sequences referencing the same queue.
.. code-block:: cpp
:linenos:
// We need to create explicit sequences with their respective queues
// The parameter is the index in the familyIndex array which is relative
// to the vector we created the manager with.
sqOne = mgr.sequence(0);
sqTwo = mgr.sequence(1);
We create the tensors without modifications.
.. code-block:: cpp
:linenos:
// Creates tensor and initializes GPU memory (below we show more granularity)
auto tensorA = mgr.tensor({ 10, 0.0 });
auto tensorB = mgr.tensor({ 10, 0.0 });
// Copies the data into GPU memory
mgr.sequence().eval<kp::OpTensorSyncDevice>({tensorA tensorB});
Similar to the asynchronous use case above, we can still run synchronous commands without modifications.
.. code-block:: cpp
:linenos:
// Define your shader as a string (using string literals for simplicity)
// (You can also pass the raw compiled bytes, or even path to file)
std::string shader(R"(
#version 450
layout (local_size_x = 1) in;
layout(set = 0, binding = 0) buffer b { float pb[]; };
shared uint sharedTotal[1];
void main() {
uint index = gl_GlobalInvocationID.x;
sharedTotal[0] = 0;
// Iterating to simulate longer process
for (int i = 0; i < 100000000; i++)
{
atomicAdd(sharedTotal[0], 1);
}
pb[index] = sharedTotal[0];
}
)");
std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
std::shared_ptr<kp::Algorithm> algo = mgr.algorithm({tensorA, tenssorB}, spirv);
Now we can actually trigger the parallel processing, running two OpAlgoDispatch operations - each in a different sequence / queue.
.. code-block:: cpp
:linenos:
// Run the first parallel operation in the `queueOne` sequence
sqOne->evalAsync<kp::OpAlgoDispatch>(algo);
// Run the second parallel operation in the `queueTwo` sequence
sqTwo->evalAsync<kp::OpAlgoDispatch>(algo);
Similar to the asynchronous example above, we are able to do other work whilst the tasks are executing.
We are able to wait for the tasks to complete by triggering the `evalOpAwait` on the respective sequence.
.. code-block:: cpp
:linenos:
// Here we can do other work
// We can now wait for the two parallel tasks to finish
sqOne.evalOpAwait()
sqTwo.evalOpAwait()
// Sync the GPU memory back to the local tensor
mgr.sequence()->eval<kp::OpTensorSyncLocal>({ tensorA, tensorB });
// Prints the output: A: 100000000 B: 100000000
std::cout << fmt::format("A: {}, B: {}",
tensorA.data()[0], tensorB.data()[0]) << std::endl;

View file

@ -40,257 +40,8 @@ One important thing to bare in mind when using asynchronous submissions, is that
The reason why this is important is that the Await function not only waits for the fence, but also runs the `postEval` functions across all operations, which is required for several operations.
Async/Await Example
^^^^^^^^^^^^^^^^^^^^^
Async and Parallel Examples
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
A simple example of asynchronous submission can be found below.
First we are able to create the manager as we normally would.
.. code-block:: cpp
:linenos:
// You can allow Kompute to create the Vulkan components, or pass your existing ones
kp::Manager mgr; // Selects device 0 unless explicitly requested
// Creates tensor and initializes GPU memory (below we show more granularity)
auto tensor = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));
We can now run our first asynchronous command, which in this case we can use the default sequence.
Sequences can be executed synchronously or asynchronously without having to change anything.
.. code-block:: cpp
:linenos:
// Create tensors data explicitly in GPU with an operation
mgr.rebuild({ tensor });
While this is running we can actually do other things like in this case create the shader we'll be using.
In this case we create a shader that should take a couple of milliseconds to run.
.. code-block:: cpp
:linenos:
// Define your shader as a string (using string literals for simplicity)
// (You can also pass the raw compiled bytes, or even path to file)
std::string shader(R"(
#version 450
layout (local_size_x = 1) in;
layout(set = 0, binding = 0) buffer b { float pb[]; };
shared uint sharedTotal[1];
void main() {
uint index = gl_GlobalInvocationID.x;
sharedTotal[0] = 0;
// Iterating to simulate longer process
for (int i = 0; i < 100000000; i++)
{
atomicAdd(sharedTotal[0], 1);
}
pb[index] = sharedTotal[0];
}
)");
Now we are able to run the await function on the default sequence.
If we are using the manager, we need to make sure that we are awaiting the same named sequence that was triggered asynchronously.
If the sequence is not running or has finished running, it would return immediately.
The parameter provided is the maximum amount of time to wait in nanoseconds. When the timeout expires, the sequence would return (with false value), but it does not stop the processing in the GPU - the processing would continue as normal.
.. code-block:: cpp
:linenos:
// We can now await for the previous submitted command
// The first parameter can be the amount of time to wait
// The time provided is in nanoseconds
mgr.evalOpAwaitDefault(10000);
Similar to above we can run other commands such as the `OpAlgoBase` asynchronously.
.. code-block:: cpp
:linenos:
// Run Async Kompute operation on the parameters provided
mgr.evalOpAsyncDefault<kp::OpAlgoBase<>>(
{ tensor },
kp::Shader::compile_source(shader));
// Here we can do other work
// When we're ready we can wait
// The default wait time is UINT64_MAX
mgr.evalOpAwaitDefault()
Finally, below you can see that we can also run synchronous commands without having to change anything.
.. code-block:: cpp
:linenos:
// Sync the GPU memory back to the local tensor
// We can still run synchronous jobs in our created sequence
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensor });
// Prints the output: B: { 100000000, ... }
std::cout << fmt::format("B: {}",
tensor.data()) << std::endl;
Parallel Operation Submission
-----------------------------
In order to work with parallel execution of tasks, it is important that you understand some of the core GPU processing limitations, as these can be quite broad and hardware dependent, which means they will vary across NVIDIA / AMD / ETC video cards.
Conceptual Overview
^^^^^^^^^^^^^^^^^^^^^
If you are familiar with Vulkan, you will know that one of the first things you do is fetch the physical queues from the device. The queues themselves tend to have three main capabilities - they can be GRAPHICS, TRANSFER and COMPUTE (among a few others we'll skip for simplicity).
Queues can have multiple properties - namely a queue can be of type GRAPHICS+TRANSFER+COMPUTE, etc. Now here comes the key point: the underlying hardware may (or may not) support parallelized processing at multiple levels.
Let's take a tangible example. The `NVIDIA 1650 <http://vulkan.gpuinfo.org/displayreport.php?id=9700#queuefamilies>`_ for example has 16 ``GRAPHICS+TRANSFER+COMPUTE`` queues on ``familyIndex 0``, then 2 ``TRANSFER`` queues in ``familyIndex 1`` and finally 8 ``COMPUTE+TRANSFER`` queues in ``familyIndex 2``.
With this in mind, the NVIDIA 1650 as of today does not support intra-family parallelization, which means that if you were to submit commands in multiple queues of the same family, these would still be executed synchronously.
However the NVIDIA 1650 does support inter-family parallelization, which means that if we were to submit commands across multiple queues from different families, these would execute in parallel.
This means that we would be able to execute parallel workloads as long as we're running them across multiple queue families. This is one of the reasons why Vulkan Kompute enables users to explicitly select the underlying queues and queue families to run particular workloads on.
It is important that you understand the capabilities and limitations of your hardware, as parallelization capabilities can vary, so you will want to make sure you account for potential discrepancies in processing structures, mainly to avoid undesired/unexpected race conditions.
Parallel Execution Example
^^^^^^^^^^^^^^^^^^^^^^^^^^
In this example we will demonstrate how you can set up parallel processing across two compute families to achieve 2x speedups when running processing workloads.
To start, you will see that we do have to create the manager with extra parameters. This includes the GPU device index we want to use, together with the array of the queues that we want to enable.
In this case we are using only two queues, which as per the section above, these would be familyIndex 0 which is of type `GRAPHICS+COMPUTE+TRANSFER` and familyIndex 2 which is of type `COMPUTE+TRANSFER`.
In this case based on the specifications of the NVIDIA 1650 we could define up to 16 graphics queues (familyIndex 0), 2 transfer queues (familyIndex 1), and 8 compute queues (familyIndex 2) in no particular order. This means that we could have something like `{ 0, 1, 1, 2, 2, 2, 0, ... }` as our initialization value.
You will want to keep track of the indices you initialize your manager with, as you will be referring back to this ordering when creating sequences with particular queues.
.. code-block:: cpp
:linenos:
// In this case we select device 0, and for queues, one queue from familyIndex 0
// and one queue from familyIndex 2
uint32_t deviceIndex(0);
std::vector<uint32_t> familyIndices = {0, 2};
// We create a manager with device index, and queues by queue family index
kp::Manager mgr(deviceIndex, familyIndices);
We are now able to create sequences with a particular queue.
By default the Kompute Manager is created with device 0, and with a single queue of the first compatible familyIndex. Similarly, by default sequences are created with the first available queue.
In this case we are able to specify which queue we want to use. Below we initialize "queueOne" named sequence with the graphics family queue, and "queueTwo" with the compute family queue.
It's worth mentioning you can have multiple sequences referencing the same queue.
.. code-block:: cpp
:linenos:
// We need to create explicit sequences with their respective queues
// The second parameter is the index in the familyIndex array which is relative
// to the vector we created the manager with.
mgr.sequence("queueOne", 0);
mgr.sequence("queueTwo", 1);
We create the tensors without modifications.
.. code-block:: cpp
:linenos:
// Creates tensor and initializes GPU memory (below we show more granularity)
auto tensorA = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));
auto tensorB = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));
Similar to the asynchronous use case above, we can still run synchronous commands without modifications.
.. code-block:: cpp
:linenos:
// We run the first step synchronously on the default sequence
mgr.rebuild({ tensorA, tensorB });
// Define your shader as a string (using string literals for simplicity)
// (You can also pass the raw compiled bytes, or even path to file)
std::string shader(R"(
#version 450
layout (local_size_x = 1) in;
layout(set = 0, binding = 0) buffer b { float pb[]; };
shared uint sharedTotal[1];
void main() {
uint index = gl_GlobalInvocationID.x;
sharedTotal[0] = 0;
// Iterating to simulate longer process
for (int i = 0; i < 100000000; i++)
{
atomicAdd(sharedTotal[0], 1);
}
pb[index] = sharedTotal[0];
}
)");
Now we can actually trigger the parallel processing, running two OpAlgoBase Operations - each in a different sequence / queue.
.. code-block:: cpp
:linenos:
std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
// Run the first parallel operation in the `queueOne` sequence
mgr.evalOpAsync<kp::OpAlgoBase<>>(
{ tensorA },
"queueOne",
spirv);
// Run the second parallel operation in the `queueTwo` sequence
mgr.evalOpAsync<kp::OpAlgoBase<>>(
{ tensorB },
"queueTwo",
spirv);
Similar to the asynchronous example above, we are able to do other work whilst the tasks are executing.
We are able to wait for the tasks to complete by triggering the `evalOpAwait` on the respective sequence.
.. code-block:: cpp
:linenos:
// Here we can do other work
// We can now wait for the two parallel tasks to finish
mgr.evalOpAwait("queueOne")
mgr.evalOpAwait("queueTwo")
// Sync the GPU memory back to the local tensor
mgr.evalOp<kp::OpTensorSyncLocal>({ tensorA, tensorB });
// Prints the output: A: 100000000 B: 100000000
std::cout << fmt::format("A: {}, B: {}",
tensorA.data()[0], tensorB.data()[0]) << std::endl;
We have added a set of asynchronous and parallel processing examples in the `Advanced Examples documentation page <advanced-examples.rst>`_

View file

@ -39,74 +39,19 @@ Below you
Simple Operation Extending OpAlgoBase
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Below we show a very simple example that enables you to create an operation with a pre-specified shader. In this case it is the multiplication shader.
You can find an example in the `Advanced Examples documentation section <advanced-examples.rst>`_ that shows how to create your own custom function.
.. code-block:: cpp
:linenos:
class OpMyCustom : public OpAlgoBase
{
public:
OpMyCustom(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
std::shared_ptr<vk::Device> device,
std::shared_ptr<vk::CommandBuffer> commandBuffer,
std::vector<std::shared_ptr<Tensor>> tensors)
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "")
{
// Perform your custom steps such as reading from a shader file
this->mShaderFilePath = "shaders/glsl/opmult.comp";
}
}
You can also see an implementation in the codebase through the `OpMult` class:
int main() {
kp::Manager mgr; // Automatically selects Device 0
// Create 3 tensors of default type float
auto tensorLhs = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 1., 2. }));
auto tensorRhs = std::make_shared<kp::Tensor>(kp::Tensor({ 2., 4., 6. }));
auto tensorOut = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 0., 0. }));
// Create tensors data explicitly in GPU with an operation
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorLhs, tensorRhs, tensorOut });
// Run Kompute operation on the parameters provided with dispatch layout
mgr.evalOpDefault<kp::OpMyCustom>(
{ tensorLhs, tensorRhs, tensorOut });
// Prints the output which is { 0, 4, 12 }
std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl;
}
More Complex Operation Extending OpAlgoBase
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Below we show a more complex operation that performs the following:
* Expects three tensors for an operation, two inputs and one output
* Expects the tensors to be initialised
* Checks that the tensors are of the same size
* Expects output tensor to be of type TensorTypes::eDevice (and creates staging tensor)
* Has functionality to read shader from file or directly from spirv bytes
* Records relevant bufferMemoryBarriers
* Records dispatch command
* Records copy command from device tensor to staging output tensor
* In postEval it maps data from staging tensor to output tensor's data
For starters, the header file contains the functions that will be overridden:
.. literalinclude:: ../../src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
.. literalinclude:: ../../src/include/kompute/operations/OpMult.hpp
:language: cpp
Then the implementation file defines all the functions that perform the actions above:
~~~~~~~~~~~~~~~~~~~
.. literalinclude:: ../../src/OpAlgoLhsRhsOut.cpp
.. literalinclude:: ../../src/OpMult.cpp
:language: cpp

View file

@ -4,18 +4,22 @@ Memory Management Principles
The principle in Vulkan Kompute on memory management is summarised as follows:
* Explicit is better than implicit for specifying memory management
* Interfaces for memory management are constant until freed
* Memory management responsibilities are acyclic from static object references
* Memory management by Kompute is optional and only in place if resource is created by Kompute
* The memory management ownership architecture is acyclic, with a single top-level manager
* Operations do not manage any GPU memory or resources
* Top level manager is main owner of GPU resources and removes all resources when destroyed
* Manager holds weak pointers to ensure that if an object created outside is destroyed, it's released
* Once a resource is destroyed it cannot be recreated
* Resources can only be rebuilt if they haven't been destroyed
Vulkan Kompute is responsible for managing both the CPU and GPU memory allocations and resources, and is important that they are able to explicitly define when these objects are released or destroyed. Similarly, it's important that the memory resources created by the application are released safely.
Vulkan Kompute is responsible for managing both the CPU and GPU memory allocations and resources that it creates, and it is important that users are able to explicitly define when these objects are released or destroyed. Similarly, it's important that the memory resources created by the application are released safely.
Vulkan Kompute is built with the BYOV principle in mind (Bring your own Vulkan). This means that even though the top level resources are managing the memory to its owned resources, they themselves may not have full ownership of the GPU / Vulkan components themselves.
Vulkan Kompute is built with the BYOV principle in mind (Bring your own Vulkan). This means that even though the top level resources manage the memory of their owned resources, they themselves may not have full ownership of the GPU / Vulkan components - this covers the case where you want to use Kompute with an existing Vulkan-enabled application, and want to initialise Kompute components with existing Vulkan resources.
The memory ownership is hierarchically outlined in the component architecture - in this diagram, the arrows provide an intuition on the memory management ownership relationships (in this case you can ignore the arrow from the Algorithm, as this is the only one that as of today doesn't manage the memory of the Tensors).
The memory ownership is hierarchically outlined in the component architecture - in this diagram, the arrows provide an intuition on the memory management ownership relationships. It's worth mentioning that the memory relationship may be different to the way components interact with each other - for this, you can see the high level component overview. More specifically:
* The purple arrows denote GPU memory management
.. image:: ../images/kompute-architecture.jpg
.. image:: ../images/kompute-vulkan-architecture.jpg
:width: 100%
Optional Memory Management

View file

@ -14,17 +14,19 @@ Then you can interact with it from your interpreter. Below is the same sample as
.. code-block:: python
:linenos:
from kp import Manager, Tensor
from kp import Manager, Tensor, OpTensorSyncDevice, OpTensorSyncLocal, OpAlgoDispatch
from pyshader import python2shader, ivec3, f32, Array
mgr = Manager()
# Can be initialized with List[] or np.Array
tensor_in_a = Tensor([2, 2, 2])
tensor_in_b = Tensor([1, 2, 3])
tensor_out = Tensor([0, 0, 0])
tensor_in_a = mgr.tensor([2, 2, 2])
tensor_in_b = mgr.tensor([1, 2, 3])
tensor_out = mgr.tensor([0, 0, 0])
mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
sq = mgr.sequence()
sq.eval(OpTensorSyncLocal([tensor_in_a, tensor_in_b, tensor_out]))
# Define the function via PyShader or directly as glsl string or spirv bytes
@python2shader
@ -35,15 +37,13 @@ Then you can interact with it from your interpreter. Below is the same sample as
i = index.x
data3[i] = data1[i] * data2[i]
algo = mgr.algorithm([tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv())
# Run shader operation synchronously
mgr.eval_algo_data_def(
[tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv())
sq.eval(OpAlgoDispatch(algo))
sq.eval(OpAlgoSyncLocal([tensor_out]))
mgr.eval_await_def()
mgr.eval_tensor_sync_local_def([tensor_out])
assert tensor_out.data() == [2.0, 4.0, 6.0]
assert tensor_out.data().tolist() == [2.0, 4.0, 6.0]
Python Example (Extended)
@ -55,6 +55,7 @@ Similarly you can find the same extended example as above:
:linenos:
from kp import Manager, Tensor
import kp
from pyshader import python2shader, ivec3, f32, Array
mgr = Manager(0, [2])
@ -77,20 +78,19 @@ Similarly you can find the same extended example as above:
i = index.x
data3[i] = data1[i] * data2[i]
# Run shader operation asynchronously and then await
mgr.eval_async_algo_data_def(
[tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv())
mgr.eval_await_def()
algo = mgr.algorithm([tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv())
seq.begin()
seq.record_tensor_sync_local([tensor_in_a])
seq.record_tensor_sync_local([tensor_in_b])
seq.record_tensor_sync_local([tensor_out])
seq.end()
# Run shader operation asynchronously and then await
mgr.eval_async(kp.OpAlgoDispatch(algo)))
mgr.eval_await()
seq.record(kp.OpTensorSyncLocal([tensor_in_a]))
seq.record(kp.OpTensorSyncLocal([tensor_in_b]))
seq.record(kp.OpTensorSyncLocal([tensor_out]))
seq.eval()
assert tensor_out.data() == [2.0, 4.0, 6.0]
assert tensor_out.data().tolist() == [2.0, 4.0, 6.0]
Kompute Operation Capabilities
^^^^^
@ -101,33 +101,29 @@ Handling multiple capabilites of processing can be done by compute shaders being
:linenos:
from kp import Manager
import kp
# We'll assume we have the shader data available
from my_spv_shader_data import mult_shader, sum_shader
mgr = Manager()
t1 = mgr.build_tensor([2,2,2])
t2 = mgr.build_tensor([1,2,3])
t3 = mgr.build_tensor([1,2,3])
t1 = mgr.tensor([2,2,2])
t2 = mgr.tensor([1,2,3])
t3 = mgr.tensor([1,2,3])
mgr.sequence().eval(kp.OpTensorSyncLocal([t1, t3]))
# Create multiple separate sequences
sq_mult = mgr.create_sequence("SQ_MULT")
sq_sum = mgr.create_sequence("SQ_SUM")
sq_sync = mgr.create_sequence("SQ_SYNC")
sq_mult = mgr.sequence()
sq_sum = mgr.sequence()
sq_sync = mgr.sequence()
# Initialize sq_mult
sq_mult.begin()
sq_mult.record_algo_data([t1, t2, t3], add_shader)
sq_mult.end()
sq_mult.record(kp.OpAlgoDispatch(mgr.algorithm([t1, t2, t3], add_shader))
sq_sum.begin()
sq_sum.record_algo_data([t3, t2, t1], sum_shader)
sq_sum.end()
sq_sum.record(kp.OpAlgoDispatch(mgr.algorithm([t3, t2, t1], sum_shader))
sq_sync.begin()
sq_sync.record_tensor_sync_local([t1, t3])
sq_sync.end()
sq_sync.record(kp.OpTensorSyncLocal([t1, t3]))
# Run multiple iterations
for i in range(10):
@ -147,6 +143,7 @@ Similar to the logistic regression implementation in the C++ examples section, b
:linenos:
from kp import Manager, Tensor
import kp
from pyshader import python2shader, ivec3, f32, Array
@python2shader
@ -189,38 +186,37 @@ Similar to the logistic regression implementation in the C++ examples section, b
l_out[i] = loss
mgr = Manager()
# First we create input and output tensors for shader
tensor_x_i = Tensor([0.0, 1.0, 1.0, 1.0, 1.0])
tensor_x_j = Tensor([0.0, 0.0, 0.0, 1.0, 1.0])
tensor_x_i = mgr.tensor([0.0, 1.0, 1.0, 1.0, 1.0])
tensor_x_j = mgr.tensor([0.0, 0.0, 0.0, 1.0, 1.0])
tensor_y = Tensor([0.0, 0.0, 0.0, 1.0, 1.0])
tensor_y = mgr.tensor([0.0, 0.0, 0.0, 1.0, 1.0])
tensor_w_in = Tensor([0.001, 0.001])
tensor_w_out_i = Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
tensor_w_out_j = Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
tensor_w_in = mgr.tensor([0.001, 0.001])
tensor_w_out_i = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0])
tensor_w_out_j = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0])
tensor_b_in = Tensor([0.0])
tensor_b_out = Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
tensor_b_in = mgr.tensor([0.0])
tensor_b_out = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0])
tensor_l_out = Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
tensor_l_out = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0])
tensor_m = Tensor([ 5.0 ])
tensor_m = mgr.tensor([ 5.0 ])
# We store them in an array for easier interaction
params = [tensor_x_i, tensor_x_j, tensor_y, tensor_w_in, tensor_w_out_i,
tensor_w_out_j, tensor_b_in, tensor_b_out, tensor_l_out, tensor_m]
mgr = Manager()
mgr.eval_tensor_create_def(params)
sq.sequence().eval(kp.OpTensorSyncDevice(params))
# Record commands for efficient evaluation
sq = mgr.create_sequence()
sq.begin()
sq.record_tensor_sync_device([tensor_w_in, tensor_b_in])
sq.record_algo_data(params, compute_shader.to_spirv())
sq.record_tensor_sync_local([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out])
sq.end()
sq = mgr.sequence()
sq.record(kp.OpTensorSyncDevice([tensor_w_in, tensor_b_in]))
sq.record(kp.OpAlgoDispatch(mgr.algorithm(params, compute_shader.to_spirv())))
sq.record(kp.OpTensorSyncLocal([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out]))
ITERATIONS = 100
learning_rate = 0.1