Updated documentation to reflect updated interface

2020-08-29 18:44:09 +01:00 · 2020-08-29 18:44:09 +01:00 · 6c69d832d3
commit 6c69d832d3
parent 6cbbb48827
4 changed files with 65 additions and 236 deletions
--- a/README.md
+++ b/README.md
@ -52,17 +52,50 @@ int main() {

    kp::Manager mgr; // Automatically selects Device 0

-    std::shared_ptr<kp::Tensor> tensorLHS{ new kp::Tensor({ 0.0, 1.0, 2.0 }) };
-    mgr.evalOp<kp::OpCreateTensor>({ tensorLHS });
+    auto tensorLhs = std::make_shared<kp::Tensor>(kp::Tensor({ 0, 1, 2 }));
+    auto tensorRhs = std::make_shared<kp::Tensor>(kp::Tensor({ 2, 4, 6 }));
+    auto tensorOut = std::make_shared<kp::Tensor>(kp::Tensor({ 0, 0, 0 }));

-    std::shared_ptr<kp::Tensor> tensorRHS{ new kp::Tensor( { 2.0, 4.0, 6.0 }) };
-    mgr.evalOp<kp::OpCreateTensor>({ tensorRHS });
+    auto params = std::vector<std::shared_ptr<kp::Tensor>>({ tensorLhs, tensorRhs, tensorOut });

-    // TODO: Add capabilities for just output tensor types
-    std::shared_ptr<kp::Tensor> tensorOutput{ new kp::Tensor({ 0.0, 0.0, 0.0 }) };
-    mgr.evalOp<kp::OpCreateTensor>({ tensorOutput });
+    // Create tensor data in GPU
+    mgr.evalOp<kp::OpCreateTensor>(params);

-    mgr.evalOp<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });
+    // Run Kompute operation on the parameters provided with dispatch layout
+    mgr.evalOp<kp::OpAlgoShader<10, 1, 1>>(params, "path/to/shader.comp.spv");
+
+    // Print the output
+    std::cout << fmt::format("Output: {}", tensorOut->data()) << std::endl;
+}
+```
+
+Create your own operations with full control on each of the steps.
+
+```c++
+template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
+class OpCustom : public OpAlgoBase<tX, tY, tZ> {
+    // ...
+    OpCustom(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+           std::shared_ptr<vk::Device> device,
+           std::shared_ptr<vk::CommandBuffer> commandBuffer,
+           std::vector<std::shared_ptr<Tensor>>& tensors)
+      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true)
+    {
+        // ... extra steps to perform custom setup
+        this->mOptSpirvBinPath = "shaders/glsl/opmult.comp.spv";
+    }
+}
+
+int main() {
+    kp::Manager mgr; // Automatically selects Device 0
+
+    // Create parameters but don't initialise if customOp performs multiple
+    auto tensorLhs = std::make_shared<kp::Tensor>(kp::Tensor({ 0, 1, 2 }));
+    auto tensorRhs = std::make_shared<kp::Tensor>(kp::Tensor({ 2, 4, 6 }));
+    auto tensorOut = std::make_shared<kp::Tensor>(kp::Tensor({ 0, 0, 0 }));
+
+    // Pass parameters to custom operation which performs relevant steps
+    mgr.evalOp<kp::OpCustom>({ tensorLhs, tensorRhs, tensorOut });

    std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl;
 }
@ -72,6 +105,7 @@ Record commands in a single submit by using a Sequence to send in batch to GPU.

 ```c++
 int main() {
+
    kp::Manager mgr;

    std::shared_ptr<kp::Tensor> tensorLHS{ new kp::Tensor({ 0.0, 1.0, 2.0 }) };
@ -90,8 +124,10 @@ int main() {

        sq.record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
    }
+
    // Stop recording
    sq.end();
+
    // Submit operations to GPU
    sq.eval();

@ -99,29 +135,6 @@ int main() {
 }
 ```

-Create your own custom operations to leverage Vulkan Compute for your specialised use-cases.
-
-```c++
-class OpCustom : kp::OpBase {
-    // ...
-    void init(std::shared_ptr<Tensor> tensors) {
-        // ... extra steps to initialise tensors
-        this->mAlgorithm->init("path/to/your/shader.compute.spv", tensors);
-    }
-}
-
-int main() {
-    kp::Manager mgr; // Automatically selects Device 0
-
-    std::shared_ptr<kp::Tensor> tensor{ new kp::Tensor({ 0.0, 1.0, 2.0 }) };
-    mgr.evalOp<kp::OpCreateTensor>({ tensorLHS });
-
-    mgr.evalOp<kp::OpCustom>({ tensorLHS, tensorRHS, tensorOutput });
-
-    std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl;
-}
-```
-
 ## Motivations

 Vulkan Kompute was created after identifying the challenge most GPU processing projects with Vulkan undergo - namely having to build extensive boilerplate for Vulkan and create abstractions and interfaces that expose the core compute capabilities. It is only after a few thousand lines of code that it's possible to start building the application-specific logic. 
--- a/single_include/AggregateHeaders.cpp
+++ b/single_include/AggregateHeaders.cpp
@ -5,7 +5,6 @@
 #include "kompute/operations/OpBase.hpp"
 #include "kompute/operations/OpAlgoBase.hpp"
 #include "kompute/operations/OpAlgoLhsRhsOut.hpp"
-#include "kompute/operations/OpAlgoAllInOut.hpp"
 #include "kompute/operations/OpMult.hpp"
 #include "kompute/operations/OpCreateTensor.hpp"
 #include "kompute/Algorithm.hpp"
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@ -526,7 +526,8 @@ class Sequence
     * not be able to add the operation.
     *
     * @param tensors Vector of tensors to use for the operation
-     * @param TArgs Template parameters that are used to initialise operation which allows for extensible configurations on initialisation.
+     * @param TArgs Template parameters that are used to initialise operation
+     * which allows for extensible configurations on initialisation.
     */
    template<typename T, typename... TArgs>
    bool record(std::vector<std::shared_ptr<Tensor>> tensors, TArgs&&... params)
@ -655,7 +656,8 @@ class Manager
     *
     * @param tensors The tensors to be used in the operation recorded
     * @param sequenceName The name of the sequence to be retrieved or created
-     * @param TArgs Template parameters that will be used to initialise Operation to allow for extensible configurations on initialisation
+     * @param TArgs Template parameters that will be used to initialise
+     * Operation to allow for extensible configurations on initialisation
     */
    template<typename T, typename... TArgs>
    void evalOp(std::vector<std::shared_ptr<Tensor>> tensors,
@ -801,6 +803,18 @@ namespace kp {
 /**
 * Operation that provides a general abstraction that simplifies the use of 
 * algorithm and parameter components which can be used with shaders.
+ * By default it enables the user to provide a dynamic number of tensors
+ * which are then passed as inputs. 
+ *
+ * All of these tensors are expected to be initialised; this is checked in the init function, which throws a std exception otherwise.
+ *
+ * It is also possible to choose whether all of the tensors should be
+ * copied from device memory to their host data. This can be disabled by
+ * passing the copyOutputData constructor parameter and/or by overriding the
+ * functions to carry out copy commands accordingly.
+ *
+ * See OpAlgoLhsRhsOut for an example implementation with more specific granularity over tensor parameters.
+ * 
 * The template parameters specify the processing GPU layout number of
 * iterations for each x, y, z parameter. More specifically, this will be the
 * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
@ -1267,203 +1281,6 @@ OpAlgoLhsRhsOut<tX, tY, tZ>::postSubmit()

 #include <fstream>

-namespace kp {
-
-/**
- * Operation base class to simplify the creation of operations that require
- * multiple unknown number of tensors, all which will be expected to be
- * Device storage tensors with the data already stored. All the tensors
- * will also be used as outputs so the data will be copied from the device
- * into the respective tensors.
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
- */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpAlgoAllInOut : public OpAlgoBase<tX, tY, tZ>
-{
-  public:
-    /**
-     *  Base constructor, should not be used unless explicitly intended.
-     */
-    OpAlgoAllInOut();
-
-    /**
-     * Default constructor with parameters that provides the bare minimum
-     * requirements for the operations to be able to create and manage their
-     * sub-components.
-     *
-     * @param physicalDevice Vulkan physical device used to find device queues
-     * @param device Vulkan logical device for passing to Algorithm
-     * @param commandBuffer Vulkan Command Buffer to record commands into
-     * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
-     */
-    OpAlgoAllInOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-           std::shared_ptr<vk::Device> device,
-           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors);
-
-    /**
-     * Default destructor, which is in charge of destroying the algorithm
-     * components but does not destroy the underlying tensors
-     */
-    ~OpAlgoAllInOut();
-
-    /**
-     * The init function is responsible for ensuring that all of the tensors
-     * passed into the function have been initialised and are of type Device.
-     * This is required as the parameters provided are expected to be 
-     * used as storage buffers, as well as output buffers, so the data will
-     * be transferred out from the Device into the Tensors replacing existing
-     * data.
-     */
-    void init() override;
-
-    /**
-     * This records the commands that are to be sent to the GPU. This includes
-     * the barriers that ensure the memory has been copied before going in and
-     * out of the shader, as well as the dispatch operation that sends the
-     * shader processing to the gpu. This function also records the GPU memory
-     * copy of the output data for the staging bufffer so it can be read by the
-     * host.
-     */
-    void record() override;
-
-    /**
-     * Executes after the recorded commands are submitted, and performs a copy
-     * of the GPU Device memory into the staging buffer so the output data can
-     * be retrieved.
-     */
-    void postSubmit() override;
-
-  protected:
-    // -------------- ALWAYS OWNED RESOURCES
-    std::vector<std::shared_ptr<Tensor>> mOutputStagingTensors; ///< Array of output staging tensors which will be expected to be the same size as the number of inputs.
-};
-
-} // End namespace kp
-
-// Including implemenation for template class
-#ifndef OPALGOALLINOUT_CPP
-#define OPALGOALLINOUT_CPP
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoAllInOut<tX, tY, tZ>::OpAlgoAllInOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoAllInOut constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoAllInOut<tX, tY, tZ>::OpAlgoAllInOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors)
-  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoAllInOut constructor with params");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoAllInOut<tX, tY, tZ>::~OpAlgoAllInOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoAllInOut destructor started");
-
-    SPDLOG_DEBUG("Kompute OpAlgoAllInOut destroying staging tensors");
-    for (std::shared_ptr<Tensor> stagingTensor : this->mOutputStagingTensors) {
-        stagingTensor->freeMemoryDestroyGPUResources();
-    }
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoAllInOut<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoAllInOut init called");
-
-    if (this->mTensors.size() < 1) {
-        throw std::runtime_error(
-          "Kompute OpAlgoAllInOut called with less than 1 tensor");
-    } 
-
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        if(!tensor->isInit()) {
-            throw std::runtime_error("Kompute OpAlgoAllInOut validation failed; all tensor parameters must be initialised.");
-        }
-    }
-
-    SPDLOG_DEBUG("Kompute OpAlgoAllInOut creating staging output tensors");
-
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
-          tensor->data(), Tensor::TensorTypes::eStaging);
-        stagingTensor->init(
-            this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
-        this->mOutputStagingTensors.push_back(stagingTensor);
-    }
-
-    SPDLOG_DEBUG("Kompute OpAlgoAllInOut fetching spirv data");
-
-    std::vector<char>& shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoAllInOut Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoAllInOut<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoAllInOut record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        tensor->recordBufferMemoryBarrier(
-          vk::AccessFlagBits::eHostWrite,
-          vk::AccessFlagBits::eShaderRead,
-          vk::PipelineStageFlagBits::eHost,
-          vk::PipelineStageFlagBits::eComputeShader);
-    }
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-
-    // Barrier to ensure the shader code is executed before buffer read
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        tensor->recordBufferMemoryBarrier(
-          vk::AccessFlagBits::eShaderWrite,
-          vk::AccessFlagBits::eTransferRead,
-          vk::PipelineStageFlagBits::eComputeShader,
-          vk::PipelineStageFlagBits::eTransfer);
-    }
-
-    // Record copy from and create barrier for STAGING tensors
-    for (std::shared_ptr<Tensor> stagingTensor : this->mOutputStagingTensors) {
-        stagingTensor->recordCopyFrom(this->mTensorOutput, true);
-    }
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoAllInOut<tX, tY, tZ>::postSubmit()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoAllInOut postSubmit called");
-
-    for (size_t i = 0; i < this->mTensors.size(); i++) {
-        this->mOutputStagingTensors[i]->mapDataFromHostMemory();
-
-        this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data());
-    }
-}
-
-}
-
-#endif // #ifndef OPALGOALLINOUT_CPP
-
-#include <fstream>
-
 #if RELEASE

 #endif
@ -1477,7 +1294,7 @@ namespace kp {
 * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
 */
 template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpMult : public OpAlgoLhsRhsOut<tX, tY, tZ>
+class OpMult : public OpAlgoBase<tX, tY, tZ>
 {
  public:
    /**
@ -1502,7 +1319,7 @@ class OpMult : public OpAlgoLhsRhsOut<tX, tY, tZ>
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
           std::vector<std::shared_ptr<Tensor>>& tensors)
-      : OpAlgoLhsRhsOut<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
+      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true)
    {
        SPDLOG_DEBUG("Kompute OpMult constructor with params");

--- a/src/include/kompute/operations/OpMult.hpp
+++ b/src/include/kompute/operations/OpMult.hpp
@ -11,7 +11,7 @@
 #include "kompute/Algorithm.hpp"
 #include "kompute/Tensor.hpp"

-#include "kompute/operations/OpAlgoLhsRhsOut.hpp"
+#include "kompute/operations/OpAlgoBase.hpp"

 namespace kp {

@ -22,7 +22,7 @@ namespace kp {
 * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
 */
 template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpMult : public OpAlgoLhsRhsOut<tX, tY, tZ>
+class OpMult : public OpAlgoBase<tX, tY, tZ>
 {
  public:
    /**
@ -47,7 +47,7 @@ class OpMult : public OpAlgoLhsRhsOut<tX, tY, tZ>
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
           std::vector<std::shared_ptr<Tensor>>& tensors)
-      : OpAlgoLhsRhsOut<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
+      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true)
    {
        SPDLOG_DEBUG("Kompute OpMult constructor with params");