diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index c59635275..0c6f2c8a0 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -66,10 +66,14 @@ add_custom_target(gendocsall ALL -E copy ${PROJECT_SOURCE_DIR}/CNAME ${SPHINX_BUILD}/CNAME +# Create assets directory + COMMAND ${CMAKE_COMMAND} + -E make_directory + ${SPHINX_BUILD}/_static/assets/ # Copy the custom asset folder COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/docs/assets/ - ${SPHINX_BUILD}/_static/ + ${SPHINX_BUILD}/_static/assets/ DEPENDS gensphinx) diff --git a/docs/conf.py b/docs/conf.py index 2ede3b7ff..4dea568fe 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -102,7 +102,7 @@ html_sidebars = { html_static_path = ['_static'] html_css_files = [ - 'custom.css', + 'assets/custom.css', ] diff --git a/docs/index.rst b/docs/index.rst index 62f25286a..a29db7ccc 100755 --- a/docs/index.rst +++ b/docs/index.rst @@ -20,6 +20,7 @@ Index C++ Memory Management Principles C++ Build System Deep Dive C++ Converting GLSL/HLSL Shaders to Cpp Headers + C++ Extending Kompute with Custom Operations C++ Class Documentation & Reference .. 
toctree:: @@ -34,7 +35,7 @@ Index :titlesonly: :caption: Concepts & Deep Dives: - CI, Docker Images & Tests + CI, Docker Images, Docs & Tests Asynchronous & Parallel Operations Mobile App Integration (Android) Game Engine Integration (Godot Engine) diff --git a/docs/overview/advanced-examples.rst b/docs/overview/advanced-examples.rst index 53dc7d470..5823c6df1 100644 --- a/docs/overview/advanced-examples.rst +++ b/docs/overview/advanced-examples.rst @@ -65,7 +65,7 @@ Pass compute shader data in glsl/hlsl text or compiled SPIR-V format (or as path )"); // Run Kompute operation on the parameters provided with dispatch layout - mgr.evalOpDefault>( + mgr.evalOpDefault( { tensorA, tensorB }, std::vector(shader.begin(), shader.end())); @@ -180,7 +180,7 @@ You can submit operations asynchronously with the async/await commands in the kp mgr.evalOpAwaitDefault(10000); // Run Async Kompute operation on the parameters provided - mgr.evalOpAsyncDefault>( + mgr.evalOpAsyncDefault( { tensor }, std::vector(shader.begin(), shader.end())); @@ -263,13 +263,13 @@ Back to `examples list <#simple-examples>`_. 
)"); // Run the first parallel operation in the `queueOne` sequence - mgr.evalOpAsync>( + mgr.evalOpAsync( { tensorA }, "queueOne", std::vector(shader.begin(), shader.end())); // Run the second parallel operation in the `queueTwo` sequence - mgr.evalOpAsync>( + mgr.evalOpAsync( { tensorB }, "queueTwo", std::vector(shader.begin(), shader.end())); @@ -298,15 +298,14 @@ We also provide tools that allow you to `convert shaders into C++ headers - class OpMyCustom : public OpAlgoBase + class OpMyCustom : public OpAlgoBase { public: OpMyCustom(std::shared_ptr physicalDevice, std::shared_ptr device, std::shared_ptr commandBuffer, std::vector> tensors) - : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "") + : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "") { // Perform your custom steps such as reading from a shader file this->mShaderFilePath = "shaders/glsl/opmult.comp"; @@ -352,76 +351,70 @@ In summary, we have: With this we will: - * Optimize the function simplified as ``Y = WX + b`` * We'll want our program to learn the parameters ``W`` and ``b`` -Converting to Kompute Terminology +We will have to convert this into Kompute terminology. -.. code-block:: +First, specifically for the inputs, we will be using the following: + +* Two vectors for the variable `X`, vector `Xi` and `Xj` +* One vector `Y` for the true predictions +* A vector `W` containing the two input weight values to use for inference +* A vector `B` containing a single input parameter for `b` + +.. 
code-block:: cpp + :linenos: + + std::vector wInVec = { 0.001, 0.001 }; + std::vector bInVec = { 0 }; + + std::shared_ptr xI{ new kp::Tensor({ 0, 1, 1, 1, 1 })}; + std::shared_ptr xJ{ new kp::Tensor({ 0, 0, 0, 1, 1 })}; + + std::shared_ptr y{ new kp::Tensor({ 0, 0, 0, 1, 1 })}; + + std::shared_ptr wIn{ + new kp::Tensor(wInVec, kp::Tensor::TensorTypes::eStaging)}; + + std::shared_ptr bIn{ + new kp::Tensor(bInVec, kp::Tensor::TensorTypes::eStaging)}; - We will have to convert this into Kompute terminology. +We will have the following output vectors: - First specifically around the inputs, we will be using the following: +* Two output vectors `Wi` and `Wj` to store all the deltas to perform gradient descent on W +* One output vector `Bout` to store all the deltas to perform gradient descent on B - * Two vertors for the variable `X`, vector `Xi` and `Xj` - * One vector `Y` for the true predictions - * A vector `W` containing the two input weight values to use for inference - * A vector `B` containing a single input parameter for `b` +.. code-block:: cpp + :linenos: - .. 
code-block:: cpp - :linenos: + std::shared_ptr wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 })}; + std::shared_ptr wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 })}; - std::vector wInVec = { 0.001, 0.001 }; - std::vector bInVec = { 0 }; - - std::shared_ptr xI{ new kp::Tensor({ 0, 1, 1, 1, 1 })}; - std::shared_ptr xJ{ new kp::Tensor({ 0, 0, 0, 1, 1 })}; - - std::shared_ptr y{ new kp::Tensor({ 0, 0, 0, 1, 1 })}; - - std::shared_ptr wIn{ - new kp::Tensor(wInVec, kp::Tensor::TensorTypes::eStaging)}; - - std::shared_ptr bIn{ - new kp::Tensor(bInVec, kp::Tensor::TensorTypes::eStaging)}; + std::shared_ptr bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 })}; - We will have the following output vectors: +For simplicity we will store all the tensors inside a params variable: - * Two output vectors `Wi` and `Wj` to store all the deltas to perform gradient descent on W - * One output vector `Bout` to store all the deltas to perform gradient descent on B +.. code-block:: cpp + :linenos: - .. code-block:: cpp - :linenos: - - std::shared_ptr wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 })}; - std::shared_ptr wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 })}; - - std::shared_ptr bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 })}; + std::vector> params = + {xI, xJ, y, wIn, wOutI, wOutJ, bIn, bOut}; - For simplicity we will store all the tensors inside a params variable: +Now that we have the inputs and outputs we will be able to use them in the processing. The workflow we will be using is the following: - .. code-block:: cpp - :linenos: +1. Create a Sequence to record and submit GPU commands +2. Submit OpCreateTensor to create all the tensors +3. Record the OpAlgo with the Logistic Regression shader +4. Loop across number of iterations: + 4-a. Submit algo operation on LR shader + 4-b. Re-calculate weights from loss +5. Print output weights and bias - std::vector> params = - {xI, xJ, y, wIn, wOutI, wOutJ, bIn, bOut}; - - - Now that we have the inputs and outputs we will be able to use them in the processing. 
The workflow we will be using is the following: - - 1. Create a Sequence to record and submit GPU commands - 2. Submit OpCreateTensor to create all the tensors - 3. Record the OpAlgo with the Logistic Regression shader - 4. Loop across number of iterations: - 4-a. Submit algo operation on LR shader - 4-b. Re-calculate weights from loss - 5. Print output weights and bias - - 1. Create a sequence to record and submit GPU commands +1. Create a sequence to record and submit GPU commands .. code-block:: cpp :linenos: @@ -435,8 +428,7 @@ Converting to Kompute Terminology -#. Submit OpCreateTensor to create all the tensors - :raw-html-m2r:`~`\ :raw-html-m2r:`~`\ :raw-html-m2r:`~`\ :raw-html-m2r:`~`\ ~~ +Submit OpCreateTensor to create all the tensors .. code-block:: cpp :linenos: @@ -452,20 +444,13 @@ Converting to Kompute Terminology sq->eval(); - - -#. Record the OpAlgo with the Logistic Regression shader - :raw-html-m2r:`~`\ :raw-html-m2r:`~`\ :raw-html-m2r:`~`\ :raw-html-m2r:`~`\ ~~ +Record the OpAlgo with the Logistic Regression shader Once we re-record, all the instructions that were recorded previously are cleared. Because of this we can record now the new commands which will consist of the following: -#. Copy the tensor data from local to device -#. Run the logistic regression shader -#. Copy the output data - .. code-block:: cpp :linenos: @@ -476,7 +461,7 @@ Because of this we can record now the new commands which will consist of the fol sq->record({wIn, bIn}); - sq->record>( + sq->record( params, false, // Whether to copy output from device "test/shaders/glsl/test_logistic_regression.comp"); @@ -487,8 +472,7 @@ Because of this we can record now the new commands which will consist of the fol -#. Loop across number of iterations + 4-a. Submit algo operation on LR shader - :raw-html-m2r:`~`\ :raw-html-m2r:`~`\ :raw-html-m2r:`~`\ :raw-html-m2r:`~`\ ~~ +Loop across number of iterations + 4-a. Submit algo operation on LR shader .. 
code-block:: cpp :linenos: @@ -507,37 +491,35 @@ Because of this we can record now the new commands which will consist of the fol 4-b. Re-calculate weights from loss -.. code-block:: +Once the shader code is executed, we are able to use the outputs from the shader calculation. - Once the shader code is executed, we are able to use the outputs from the shader calculation. +In this case we want to basically add all the calculated weights and bias from the back-prop step. - In this case we want to basically add all the calculated weights and bias from the back-prop step. - - .. code-block:: cpp - :linenos: +.. code-block:: cpp + :linenos: + { + // ... + for (size_t i = 0; i < ITERATIONS; i++) { - // ... - for (size_t i = 0; i < ITERATIONS; i++) - { - // ... continuing from codeblock above + // ... continuing from codeblock above - // Run evaluation which passes data through shader once - sq->eval(); + // Run evaluation which passes data through shader once + sq->eval(); - // Subtract the resulting weights and biases - for(size_t j = 0; j < bOut->size(); j++) { - wInVec[0] -= wOutI->data()[j]; - wInVec[1] -= wOutJ->data()[j]; - bInVec[0] -= bOut->data()[j]; - } - // Set the data for the GPU to use in the next iteration - wIn->mapDataIntoHostMemory(); - bIn->mapDataIntoHostMemory(); + // Subtract the resulting weights and biases + for(size_t j = 0; j < bOut->size(); j++) { + wInVec[0] -= wOutI->data()[j]; + wInVec[1] -= wOutJ->data()[j]; + bInVec[0] -= bOut->data()[j]; } + // Set the data for the GPU to use in the next iteration + wIn->mapDataIntoHostMemory(); + bIn->mapDataIntoHostMemory(); + } - 5. Print output weights and bias +5. Print output weights and bias .. 
code-block:: cpp :linenos: diff --git a/docs/overview/ci-tests.rst b/docs/overview/ci-tests.rst index 81443ec86..df29bb7fc 100644 --- a/docs/overview/ci-tests.rst +++ b/docs/overview/ci-tests.rst @@ -1,5 +1,5 @@ -CI, Docker Images & Tests +CI, Docker Images, Docs & Tests ====================== This section contains an overview of the steps run on CI, as well as the tools used to simplify the testing (such as running Vulkan on CPU). @@ -63,4 +63,15 @@ The dockerfiles created provide functionality to simplify the interaction with t - Image contained a linux build of the full Vulkan SDK to reduce time via multi-staged builds +Running / Building Documentation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In order to build the documentation you will need the following dependencies: + +* Install CI dependencies under `scripts/requirements.txt` + +Once this is installed: + +* You can build the documentation using the `gendocsall` cmake target +* You can serve the documentation locally using the `mk_run_docs` command in the Makefile diff --git a/docs/overview/custom-operations.rst b/docs/overview/custom-operations.rst new file mode 100644 index 000000000..2758c8a4b --- /dev/null +++ b/docs/overview/custom-operations.rst @@ -0,0 +1,112 @@ + +Extending Kompute with Custom C++ Operations +============================================ + +Kompute provides an extensible architecture which allows for the core components to be extended by building custom operations. + +Building operations is intuitive; however, it requires knowing some nuances around the order in which the class functions of an operation are called as a sequence is executed. + +These nuances are important for more advanced users of Kompute, as they provide further intuition about the specific functions and components that the native operations (like OpTensorCreate, OpAlgoBase, etc) contain and which define their specific behaviour. 
+ +Flow of Function Calls +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The top level operation which all operations inherit from is the `kp::OpBase` class. Some of the "Core Native Operations" like `kp::OpTensorCopy`, `kp::OpTensorCreate`, etc all inherit from the base operation class. + +The `kp::OpAlgoBase` is another base operation that is specifically built to enable users to create their own operations that contain custom shader logic (i.e. requiring Vulkan Compute Pipelines, DescriptorSets, etc). The next section contains an example which shows how to extend the OpAlgoBase class. + +Below you can find the functions that an operation exposes, together with a description of when each one is called: + +.. list-table:: + :header-rows: 1 + + * - Function + - Description + * - OpBase(..., tensors, freeTensors) + - Constructor for class where you can load/define resources such as shaders, etc. + * - ~OpBase() + - Destructor that frees vulkan resources (if owned) which should be used to manage any memory allocations created through the operation. + * - init() + - Init function gets called in the Sequence / Manager inside the record step. This function allows for relevant objects to be initialised within the operation. + * - record() + - Record function that gets called in the Sequence / Manager inside the record step after init(). In this function you can directly record to the Vulkan command buffer. + * - preEval() + - When the Sequence is Evaluated this preEval is called across all operations before dispatching the batch of recorded commands to the GPU. This is useful for example if you need to copy data from local to host memory. + * - postEval() + - After the sequence is Evaluated this postEval is called across all operations. When running asynchronously the postEval is called when you call `evalAwait()`, which is why it's important to always run evalAwait() to ensure the process doesn't go into an inconsistent state. 
+ + +Simple Operation Extending OpAlgoBase +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Below we show a very simple example that enables you to create an operation with a pre-specified shader. In this case it is the multiplication shader. + +.. code-block:: cpp + :linenos: + + class OpMyCustom : public OpAlgoBase + { + public: + OpMyCustom(std::shared_ptr physicalDevice, + std::shared_ptr device, + std::shared_ptr commandBuffer, + std::vector> tensors) + : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "") + { + // Perform your custom steps such as reading from a shader file + this->mShaderFilePath = "shaders/glsl/opmult.comp"; + } + } + + + int main() { + + kp::Manager mgr; // Automatically selects Device 0 + + // Create 3 tensors of default type float + auto tensorLhs = std::make_shared(kp::Tensor({ 0., 1., 2. })); + auto tensorRhs = std::make_shared(kp::Tensor({ 2., 4., 6. })); + auto tensorOut = std::make_shared(kp::Tensor({ 0., 0., 0. })); + + // Create tensors data explicitly in GPU with an operation + mgr.evalOpDefault({ tensorLhs, tensorRhs, tensorOut }); + + // Run Kompute operation on the parameters provided with dispatch layout + mgr.evalOpDefault( + { tensorLhs, tensorRhs, tensorOut }); + + // Prints the output which is { 0, 4, 12 } + std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl; + } + + +More Complex Operation Extending OpAlgoBase +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Below we show a more complex operation that performs the following: + +* Expects three tensors for an operation, two inputs and one output +* Expects the tensors to be initialised +* Checks that the tensors are of the same size +* Expects output tensor to be of type TensorTypes::eDevice (and creates staging tensor) +* Has functionality to read shader from file or directly from spirv bytes +* Records relevant bufferMemoryBarriers +* Records dispatch command +* Records copy command from device tensor to staging output tensor +* In postEval it maps data from staging 
tensor to output tensor's data + + +For starters, the header file contains the functions that will be overridden: + + +.. literalinclude:: ../../src/include/kompute/operations/OpAlgoLhsRhsOut.hpp + :language: cpp + + +Then the source file provides the implementations that perform the actions above: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. literalinclude:: ../../src/OpAlgoLhsRhsOut.cpp + :language: cpp + +