Moved logic of opallinout into opalgobase which now optionally outputs all files

2020-08-29 18:12:36 +01:00 · 2020-08-29 18:12:36 +01:00 · 7a6d80c435
commit 7a6d80c435
parent 3f8c4fb9b7
3 changed files with 429 additions and 72 deletions
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@ -827,7 +827,8 @@ class OpAlgoBase : public OpBase
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors);
+           std::vector<std::shared_ptr<Tensor>>& tensors,
+           bool copyOutputData);

    /**
     * Default destructor, which is in charge of destroying the algorithm
@ -868,6 +869,9 @@ class OpAlgoBase : public OpBase
    bool mFreeAlgorithm = false;

    // -------------- ALWAYS OWNED RESOURCES
+    std::vector<std::shared_ptr<Tensor>> mOutputStagingTensors; ///< Array of output staging tensors which will be expected to be the same size as the number of inputs.
+    bool mCopyOutputData = false; ///< Whether output data is copied back into all provided tensors after submission. Defaults to false so the destructor never reads an indeterminate value when the parameterised constructor (the only place that assigns it) was not used; derived classes can also pass false and override init/record/postSubmit to handle the transfer themselves.
+
    uint32_t mX;
    uint32_t mY;
    uint32_t mZ;
@ -895,11 +899,14 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
 OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
                           std::shared_ptr<vk::Device> device,
                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors)
+                           std::vector<std::shared_ptr<Tensor>>& tensors,
+                           bool copyOutputData)
  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
 {
    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params");

+    SPDLOG_DEBUG("Kompute OpAlgoBase configured for copy output data: {}", copyOutputData);
+
    // The dispatch size is set up based on either explicitly provided template
    // parameters or by default it would take the shape and size of the tensors
    if (tX > 0) {
@ -920,6 +927,8 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                 this->mY,
                 this->mZ);

+    this->mCopyOutputData = copyOutputData;
+
    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
 }

@ -927,6 +936,101 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
 OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
 {
    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
+
+    if (this->mCopyOutputData) {
+        SPDLOG_DEBUG("Kompute OpAlgoBase destroying staging tensors");
+        for (std::shared_ptr<Tensor> stagingTensor : this->mOutputStagingTensors) {
+            stagingTensor->freeMemoryDestroyGPUResources();
+        }
+    }
+}
+
+template<uint32_t tX, uint32_t tY, uint32_t tZ>
+void
+OpAlgoBase<tX, tY, tZ>::init()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase init called");
+    // Validates the tensors, optionally creates one staging tensor per tensor, then initialises the algorithm.
+    if (this->mTensors.size() < 1) {
+        throw std::runtime_error(
+          "Kompute OpAlgoBase called with less than 1 tensor");
+    }
+
+    for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
+        if (!tensor->isInit()) {
+            throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised.");
+        }
+    }
+
+    if (this->mCopyOutputData) {
+        SPDLOG_DEBUG("Kompute OpAlgoBase creating staging output tensors");
+
+        for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
+            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
+              tensor->data(), Tensor::TensorTypes::eStaging);
+            stagingTensor->init(
+                this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
+            this->mOutputStagingTensors.push_back(stagingTensor);
+        }
+    }
+
+    SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
+    // Held by value: fetchSpirvBinaryData() returns a temporary, which a non-const lvalue reference cannot bind to.
+    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
+
+    SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
+
+    this->mAlgorithm->init(shaderFileData, this->mTensors);
+}
+
+template<uint32_t tX, uint32_t tY, uint32_t tZ>
+void
+OpAlgoBase<tX, tY, tZ>::record()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase record called");
+
+    // Barrier to ensure the data is finished writing to buffer memory
+    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
+        tensor->recordBufferMemoryBarrier(
+          vk::AccessFlagBits::eHostWrite,
+          vk::AccessFlagBits::eShaderRead,
+          vk::PipelineStageFlagBits::eHost,
+          vk::PipelineStageFlagBits::eComputeShader);
+    }
+
+    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
+
+    if (this->mCopyOutputData) {
+        // Barrier to ensure the shader code is executed before buffer read
+        for (std::shared_ptr<Tensor> tensor : this->mTensors) {
+            tensor->recordBufferMemoryBarrier(
+              vk::AccessFlagBits::eShaderWrite,
+              vk::AccessFlagBits::eTransferRead,
+              vk::PipelineStageFlagBits::eComputeShader,
+              vk::PipelineStageFlagBits::eTransfer);
+        }
+
+        // Record copy from and create barrier for STAGING tensors
+        for (size_t i = 0; i < this->mTensors.size(); i++) {
+            this->mOutputStagingTensors[i]->recordCopyFrom(
+                this->mTensors[i], true);
+        }
+    }
+}
+
+template<uint32_t tX, uint32_t tY, uint32_t tZ>
+void
+OpAlgoBase<tX, tY, tZ>::postSubmit()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
+
+    if (this->mCopyOutputData) {
+        for (size_t i = 0; i < this->mTensors.size(); i++) {
+            this->mOutputStagingTensors[i]->mapDataFromHostMemory();
+
+            this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data());
+        }
+    }
 }

 template<uint32_t tX, uint32_t tY, uint32_t tZ>
@ -948,45 +1052,12 @@ std::vector<char> OpAlgoBase<tX, tY, tZ>::fetchSpirvBinaryData()
                             shaderDataRaw + shaderFileSize);
 }

-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase init called");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase record called");
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::postSubmit()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
-}
-
 }

 #endif // #ifndef OPALGOBASE_IMPL

 #include <fstream>

-#if RELEASE
-
-#endif
-
-#include <fstream>
-
 namespace kp {

 /**
@ -1034,7 +1105,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
     * tensors, and  creates the algorithm component which processes the
     * computation.
     */
-    void init() override;
+    virtual void init() override;

    /**
     * This records the commands that are to be sent to the GPU. This includes
@ -1044,14 +1115,14 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
     * copy of the output data for the staging bufffer so it can be read by the
     * host.
     */
-    void record() override;
+    virtual void record() override;

    /**
     * Executes after the recorded commands are submitted, and performs a copy
     * of the GPU Device memory into the staging buffer so the output data can
     * be retrieved.
     */
-    void postSubmit() override;
+    virtual void postSubmit() override;

  protected:
    // -------------- NEVER OWNED RESOURCES
@ -1082,7 +1153,10 @@ OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice>
                           std::shared_ptr<vk::Device> device,
                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
                           std::vector<std::shared_ptr<Tensor>>& tensors)
-  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
+  // The base class is initialised with copyOutputData set to false given that
+  // this derived class handles the transfer of data via staging buffers in
+  // a granular way.
+  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, false)
 {
    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
 }
@ -1191,6 +1265,209 @@ OpAlgoLhsRhsOut<tX, tY, tZ>::postSubmit()

 #endif // #ifndef OPALGOLHSRHSOUT_CPP

+#include <fstream>
+
+namespace kp {
+
+/**
+ * Operation base class to simplify the creation of operations that require
+ * multiple unknown number of tensors, all which will be expected to be
+ * Device storage tensors with the data already stored. All the tensors
+ * will also be used as outputs so the data will be copied from the device
+ * into the respective tensors.
+ * The template parameters specify the processing GPU layout number of
+ * iterations for each x, y, z parameter. More specifically, this will be the
+ * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ */
+template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
+class OpAlgoAllInOut : public OpAlgoBase<tX, tY, tZ>
+{
+  public:
+    /**
+     *  Base constructor, should not be used unless explicitly intended.
+     */
+    OpAlgoAllInOut();
+
+    /**
+     * Default constructor with parameters that provides the bare minimum
+     * requirements for the operations to be able to create and manage their
+     * sub-components.
+     *
+     * @param physicalDevice Vulkan physical device used to find device queues
+     * @param device Vulkan logical device for passing to Algorithm
+     * @param commandBuffer Vulkan Command Buffer to record commands into
+     * @param tensors Tensors that are to be used in this operation; all of
+     * them are treated as both inputs and outputs
+     */
+    OpAlgoAllInOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+           std::shared_ptr<vk::Device> device,
+           std::shared_ptr<vk::CommandBuffer> commandBuffer,
+           std::vector<std::shared_ptr<Tensor>>& tensors);
+
+    /**
+     * Default destructor, which is in charge of destroying the algorithm
+     * components but does not destroy the underlying tensors
+     */
+    ~OpAlgoAllInOut();
+
+    /**
+     * The init function is responsible for ensuring that all of the tensors
+     * passed into the function have been initialised and are of type Device.
+     * This is required as the parameters provided are expected to be
+     * used as storage buffers, as well as output buffers, so the data will
+     * be transferred out from the Device into the Tensors replacing existing
+     * data.
+     */
+    void init() override;
+
+    /**
+     * This records the commands that are to be sent to the GPU. This includes
+     * the barriers that ensure the memory has been copied before going in and
+     * out of the shader, as well as the dispatch operation that sends the
+     * shader processing to the gpu. This function also records the GPU memory
+     * copy of the output data for the staging buffer so it can be read by the
+     * host.
+     */
+    void record() override;
+
+    /**
+     * Executes after the recorded commands are submitted, and performs a copy
+     * of the GPU Device memory into the staging buffer so the output data can
+     * be retrieved.
+     */
+    void postSubmit() override;
+
+  protected:
+    // -------------- ALWAYS OWNED RESOURCES
+    std::vector<std::shared_ptr<Tensor>> mOutputStagingTensors; ///< Array of output staging tensors, one per tensor passed in (populated by init()). NOTE(review): OpAlgoBase now declares a member with the same name, which this declaration hides - confirm whether this copy is still needed.
+};
+
+} // End namespace kp
+
+// Including implementation for template class
+#ifndef OPALGOALLINOUT_CPP
+#define OPALGOALLINOUT_CPP
+
+namespace kp {
+
+template<uint32_t tX, uint32_t tY, uint32_t tZ>
+OpAlgoAllInOut<tX, tY, tZ>::OpAlgoAllInOut()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoAllInOut constructor base");
+}
+
+template<uint32_t tX, uint32_t tY, uint32_t tZ>
+OpAlgoAllInOut<tX, tY, tZ>::OpAlgoAllInOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                           std::shared_ptr<vk::Device> device,
+                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                           std::vector<std::shared_ptr<Tensor>>& tensors)
+  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, false) // copyOutputData is false: this class creates and copies its own staging tensors in init/record/postSubmit
+{
+    SPDLOG_DEBUG("Kompute OpAlgoAllInOut constructor with params");
+}
+
+template<uint32_t tX, uint32_t tY, uint32_t tZ>
+OpAlgoAllInOut<tX, tY, tZ>::~OpAlgoAllInOut()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoAllInOut destructor started");
+
+    SPDLOG_DEBUG("Kompute OpAlgoAllInOut destroying staging tensors");
+    for (std::shared_ptr<Tensor> stagingTensor : this->mOutputStagingTensors) {
+        stagingTensor->freeMemoryDestroyGPUResources();
+    }
+}
+
+template<uint32_t tX, uint32_t tY, uint32_t tZ>
+void
+OpAlgoAllInOut<tX, tY, tZ>::init()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoAllInOut init called");
+    // Validates the tensors, creates one staging tensor per tensor, then initialises the algorithm.
+    if (this->mTensors.size() < 1) {
+        throw std::runtime_error(
+          "Kompute OpAlgoAllInOut called with less than 1 tensor");
+    }
+
+    for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
+        if (!tensor->isInit()) {
+            throw std::runtime_error("Kompute OpAlgoAllInOut validation failed; all tensor parameters must be initialised.");
+        }
+    }
+
+    SPDLOG_DEBUG("Kompute OpAlgoAllInOut creating staging output tensors");
+
+    for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
+        std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
+          tensor->data(), Tensor::TensorTypes::eStaging);
+        stagingTensor->init(
+            this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
+        this->mOutputStagingTensors.push_back(stagingTensor);
+    }
+
+    SPDLOG_DEBUG("Kompute OpAlgoAllInOut fetching spirv data");
+    // Held by value: fetchSpirvBinaryData() returns a temporary, which a non-const lvalue reference cannot bind to.
+    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
+
+    SPDLOG_DEBUG("Kompute OpAlgoAllInOut Initialising algorithm component");
+
+    this->mAlgorithm->init(shaderFileData, this->mTensors);
+}
+
+template<uint32_t tX, uint32_t tY, uint32_t tZ>
+void
+OpAlgoAllInOut<tX, tY, tZ>::record()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoAllInOut record called");
+    // Records, in order: pre-dispatch barriers, the dispatch, post-dispatch barriers, then the staging copies.
+    // Barrier to ensure the data is finished writing to buffer memory
+    for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
+        tensor->recordBufferMemoryBarrier(
+          vk::AccessFlagBits::eHostWrite,
+          vk::AccessFlagBits::eShaderRead,
+          vk::PipelineStageFlagBits::eHost,
+          vk::PipelineStageFlagBits::eComputeShader);
+    }
+
+    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
+
+    // Barrier to ensure the shader code is executed before buffer read
+    for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
+        tensor->recordBufferMemoryBarrier(
+          vk::AccessFlagBits::eShaderWrite,
+          vk::AccessFlagBits::eTransferRead,
+          vk::PipelineStageFlagBits::eComputeShader,
+          vk::PipelineStageFlagBits::eTransfer);
+    }
+
+    // Record copy from each tensor into the staging tensor at the same index (this class has no mTensorOutput member)
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        this->mOutputStagingTensors[i]->recordCopyFrom(this->mTensors[i], true);
+    }
+}
+
+template<uint32_t tX, uint32_t tY, uint32_t tZ>
+void
+OpAlgoAllInOut<tX, tY, tZ>::postSubmit()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoAllInOut postSubmit called");
+
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        this->mOutputStagingTensors[i]->mapDataFromHostMemory();
+
+        this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data());
+    }
+}
+
+}
+
+#endif // #ifndef OPALGOALLINOUT_CPP
+
+#include <fstream>
+
+#if RELEASE
+
+#endif
+
 namespace kp {

 /**
--- a/src/include/kompute/operations/OpAlgoBase.hpp
+++ b/src/include/kompute/operations/OpAlgoBase.hpp
@ -42,7 +42,8 @@ class OpAlgoBase : public OpBase
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors);
+           std::vector<std::shared_ptr<Tensor>>& tensors,
+           bool copyOutputData);

    /**
     * Default destructor, which is in charge of destroying the algorithm
@ -83,6 +84,9 @@ class OpAlgoBase : public OpBase
    bool mFreeAlgorithm = false;

    // -------------- ALWAYS OWNED RESOURCES
+    std::vector<std::shared_ptr<Tensor>> mOutputStagingTensors; ///< Array of output staging tensors which will be expected to be the same size as the number of inputs.
+    bool mCopyOutputData = false; ///< Whether output data is copied back into all provided tensors after submission. Defaults to false so the destructor never reads an indeterminate value when the parameterised constructor (the only place that assigns it) was not used; derived classes can also pass false and override init/record/postSubmit to handle the transfer themselves.
+
    uint32_t mX;
    uint32_t mY;
    uint32_t mZ;
@ -110,11 +114,14 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
 OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
                           std::shared_ptr<vk::Device> device,
                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors)
+                           std::vector<std::shared_ptr<Tensor>>& tensors,
+                           bool copyOutputData)
  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
 {
    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params");

+    SPDLOG_DEBUG("Kompute OpAlgoBase configured for copy output data: {}", copyOutputData);
+
    // The dispatch size is set up based on either explicitly provided template
    // parameters or by default it would take the shape and size of the tensors
    if (tX > 0) {
@ -135,6 +142,8 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                 this->mY,
                 this->mZ);

+    this->mCopyOutputData = copyOutputData;
+
    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
 }

@ -142,6 +151,101 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
 OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
 {
    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
+
+    if (this->mCopyOutputData) {
+        SPDLOG_DEBUG("Kompute OpAlgoBase destroying staging tensors");
+        for (std::shared_ptr<Tensor> stagingTensor : this->mOutputStagingTensors) {
+            stagingTensor->freeMemoryDestroyGPUResources();
+        }
+    }
+}
+
+template<uint32_t tX, uint32_t tY, uint32_t tZ>
+void
+OpAlgoBase<tX, tY, tZ>::init()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase init called");
+    // Validates the tensors, optionally creates one staging tensor per tensor, then initialises the algorithm.
+    if (this->mTensors.size() < 1) {
+        throw std::runtime_error(
+          "Kompute OpAlgoBase called with less than 1 tensor");
+    }
+
+    for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
+        if (!tensor->isInit()) {
+            throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised.");
+        }
+    }
+
+    if (this->mCopyOutputData) {
+        SPDLOG_DEBUG("Kompute OpAlgoBase creating staging output tensors");
+
+        for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
+            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
+              tensor->data(), Tensor::TensorTypes::eStaging);
+            stagingTensor->init(
+                this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
+            this->mOutputStagingTensors.push_back(stagingTensor);
+        }
+    }
+
+    SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
+    // Held by value: fetchSpirvBinaryData() returns a temporary, which a non-const lvalue reference cannot bind to.
+    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
+
+    SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
+
+    this->mAlgorithm->init(shaderFileData, this->mTensors);
+}
+
+template<uint32_t tX, uint32_t tY, uint32_t tZ>
+void
+OpAlgoBase<tX, tY, tZ>::record()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase record called");
+
+    // Barrier to ensure the data is finished writing to buffer memory
+    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
+        tensor->recordBufferMemoryBarrier(
+          vk::AccessFlagBits::eHostWrite,
+          vk::AccessFlagBits::eShaderRead,
+          vk::PipelineStageFlagBits::eHost,
+          vk::PipelineStageFlagBits::eComputeShader);
+    }
+
+    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
+
+    if (this->mCopyOutputData) {
+        // Barrier to ensure the shader code is executed before buffer read
+        for (std::shared_ptr<Tensor> tensor : this->mTensors) {
+            tensor->recordBufferMemoryBarrier(
+              vk::AccessFlagBits::eShaderWrite,
+              vk::AccessFlagBits::eTransferRead,
+              vk::PipelineStageFlagBits::eComputeShader,
+              vk::PipelineStageFlagBits::eTransfer);
+        }
+
+        // Record copy from and create barrier for STAGING tensors
+        for (size_t i = 0; i < this->mTensors.size(); i++) {
+            this->mOutputStagingTensors[i]->recordCopyFrom(
+                this->mTensors[i], true);
+        }
+    }
+}
+
+template<uint32_t tX, uint32_t tY, uint32_t tZ>
+void
+OpAlgoBase<tX, tY, tZ>::postSubmit()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
+
+    if (this->mCopyOutputData) {
+        for (size_t i = 0; i < this->mTensors.size(); i++) {
+            this->mOutputStagingTensors[i]->mapDataFromHostMemory();
+
+            this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data());
+        }
+    }
 }

 template<uint32_t tX, uint32_t tY, uint32_t tZ>
@ -163,33 +267,6 @@ std::vector<char> OpAlgoBase<tX, tY, tZ>::fetchSpirvBinaryData()
                             shaderDataRaw + shaderFileSize);
 }

-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase init called");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase record called");
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::postSubmit()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
-}
-
 }

 #endif // #ifndef OPALGOBASE_IMPL
--- a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
+++ b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
@ -56,7 +56,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
     * tensors, and  creates the algorithm component which processes the
     * computation.
     */
-    void init() override;
+    virtual void init() override;

    /**
     * This records the commands that are to be sent to the GPU. This includes
@ -66,14 +66,14 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
     * copy of the output data for the staging bufffer so it can be read by the
     * host.
     */
-    void record() override;
+    virtual void record() override;

    /**
     * Executes after the recorded commands are submitted, and performs a copy
     * of the GPU Device memory into the staging buffer so the output data can
     * be retrieved.
     */
-    void postSubmit() override;
+    virtual void postSubmit() override;

  protected:
    // -------------- NEVER OWNED RESOURCES
@ -104,7 +104,10 @@ OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice>
                           std::shared_ptr<vk::Device> device,
                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
                           std::vector<std::shared_ptr<Tensor>>& tensors)
-  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
+  // The base class is initialised with copyOutputData set to false given that
+  // this derived class handles the transfer of data via staging buffers in
+  // a granular way.
+  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, false)
 {
    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
 }