Removed workgroup templates on opalgobase classes

2020-11-01 16:28:48 +00:00 · 2020-11-01 16:28:48 +00:00 · 3ad5e4d3e7
commit 3ad5e4d3e7
parent 6afe6463c2
5 changed files with 322 additions and 344 deletions
--- a/src/OpAlgoBase.cpp
+++ b/src/OpAlgoBase.cpp
@ -0,0 +1,162 @@
+#pragma once
+
+#include "kompute/operations/OpAlgoBase.hpp"
+
+namespace kp {
+
+// Base constructor: only emits a debug log line and assigns no members
+// explicitly.
+OpAlgoBase::OpAlgoBase()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor base");
+}
+
+// Main constructor: forwards the Vulkan handles and tensors to OpBase,
+// resolves the dispatch size and creates the Algorithm component.
+//
+// @param physicalDevice Vulkan physical device forwarded to OpBase
+// @param device Vulkan logical device forwarded to OpBase and Algorithm
+// @param commandBuffer Vulkan command buffer forwarded to OpBase and Algorithm
+// @param tensors Tensors that are to be used in this operation
+// @param komputeWorkgroup Dispatch size; when x is 0 the size of the first
+//        tensor is used for X and both Y and Z are set to 1
+// @throws std::runtime_error if no workgroup is given and tensors is empty
+OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                           std::shared_ptr<vk::Device> device,
+                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                           std::vector<std::shared_ptr<Tensor>>& tensors,
+                           KomputeWorkgroup komputeWorkgroup)
+  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
+
+    // The dispatch size is either the explicitly provided workgroup or, by
+    // default, the size of the first tensor along X.
+    if (komputeWorkgroup.x > 0) {
+        // X is taken exactly as provided. Y and Z fall back to 1 when left at
+        // 0 so that no dimension of the dispatch ends up zero-sized.
+        this->mKomputeWorkgroup = {
+            komputeWorkgroup.x,
+            komputeWorkgroup.y > 0 ? komputeWorkgroup.y : 1,
+            komputeWorkgroup.z > 0 ? komputeWorkgroup.z : 1
+        };
+    } else {
+        // The default needs the first tensor, so reject an empty list here
+        // instead of indexing past the end of the vector.
+        if (tensors.empty()) {
+            throw std::runtime_error(
+              "Kompute OpAlgoBase requires at least 1 tensor when no workgroup is provided");
+        }
+        this->mKomputeWorkgroup = {tensors[0]->size(), 1, 1};
+    }
+    SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
+                 this->mKomputeWorkgroup.x,
+                 this->mKomputeWorkgroup.y,
+                 this->mKomputeWorkgroup.z);
+
+    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
+}
+
+// Constructor for shaders loaded from disk: delegates to the main constructor
+// and then stores the path that fetchSpirvBinaryData() reads from.
+OpAlgoBase::OpAlgoBase(
+  std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+  std::shared_ptr<vk::Device> device,
+  std::shared_ptr<vk::CommandBuffer> commandBuffer,
+  std::vector<std::shared_ptr<Tensor>>& tensors,
+  std::string shaderFilePath,
+  KomputeWorkgroup komputeWorkgroup)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
+{
+    SPDLOG_DEBUG(
+      "Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}",
+      shaderFilePath);
+
+    this->mShaderFilePath = shaderFilePath;
+}
+
+// Constructor for shaders supplied in memory: delegates to the main
+// constructor and then stores a copy of the bytes that
+// fetchSpirvBinaryData() returns when no shader file path is set.
+//
+// @param shaderDataRaw Shader data, copied into mShaderDataRaw
+// @param komputeWorkgroup Dispatch size forwarded to the main constructor
+OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                           std::shared_ptr<vk::Device> device,
+                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                           std::vector<std::shared_ptr<Tensor>>& tensors,
+                           const std::vector<char>& shaderDataRaw,
+                           KomputeWorkgroup komputeWorkgroup)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase shaderDataRaw constructor with shader raw data length: {}", shaderDataRaw.size());
+
+    this->mShaderDataRaw = shaderDataRaw;
+}
+
+// Destructor: only emits a debug log line; the body releases nothing
+// explicitly.
+OpAlgoBase::~OpAlgoBase()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
+}
+
+// Validates the tensors, fetches the shader bytes and initialises the
+// Algorithm component with both.
+//
+// @throws std::runtime_error if there are no tensors or if any tensor has
+//         not been initialised
+void
+OpAlgoBase::init()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase init called");
+
+    if (this->mTensors.empty()) {
+        throw std::runtime_error(
+          "Kompute OpAlgoBase called with less than 1 tensor");
+    }
+
+    // Bind by const reference: copying a shared_ptr on every iteration costs
+    // a reference count increment and decrement for no benefit.
+    for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
+        if(!tensor->isInit()) {
+            throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised.");
+        }
+    }
+
+    SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
+
+    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
+
+    SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
+
+    this->mAlgorithm->init(shaderFileData, this->mTensors);
+}
+
+// Records a host-write to shader-read buffer memory barrier for every tensor
+// and then the dispatch using the stored workgroup size.
+void
+OpAlgoBase::record()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase record called");
+
+    // Barrier to ensure the data is finished writing to buffer memory.
+    // Bind by const reference to avoid copying the shared_ptr per iteration.
+    for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
+        tensor->recordBufferMemoryBarrier(
+          this->mCommandBuffer,
+          vk::AccessFlagBits::eHostWrite,
+          vk::AccessFlagBits::eShaderRead,
+          vk::PipelineStageFlagBits::eHost,
+          vk::PipelineStageFlagBits::eComputeShader);
+    }
+
+    this->mAlgorithm->recordDispatch(
+      this->mKomputeWorkgroup.x,
+      this->mKomputeWorkgroup.y,
+      this->mKomputeWorkgroup.z);
+}
+
+// Pre-evaluation hook: this base implementation only emits a debug log line.
+void
+OpAlgoBase::preEval()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
+}
+
+// Post-evaluation hook: this base implementation only emits a debug log
+// line. The message names postEval so the log matches the function called.
+void
+OpAlgoBase::postEval()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase postEval called");
+}
+
+// Returns the shader bytes for this operation. The shader file path takes
+// precedence; when it is empty the raw shader data member is returned.
+//
+// @return Bytes read from mShaderFilePath, or a copy of mShaderDataRaw
+// @throws std::runtime_error if the file cannot be opened or fully read, or
+//         if neither a file path nor raw data has been provided
+std::vector<char> OpAlgoBase::fetchSpirvBinaryData()
+{
+    SPDLOG_WARN(
+      "Kompute OpAlgoBase Running shaders directly from spirv file");
+
+    if (this->mShaderFilePath.size()) {
+        // Opened positioned at the end so that tellg() reports the file size
+        std::ifstream fileStream(this->mShaderFilePath,
+                                 std::ios::binary | std::ios::in | std::ios::ate);
+
+        if (!fileStream.good()) {
+            throw std::runtime_error("Error reading file: " + this->mShaderFilePath);
+        }
+
+        size_t shaderFileSize = fileStream.tellg();
+        fileStream.seekg(0, std::ios::beg);
+
+        // Read straight into the vector that is returned. The previous
+        // new[] buffer was copied into a vector and never freed.
+        std::vector<char> shaderData(shaderFileSize);
+        fileStream.read(shaderData.data(), shaderFileSize);
+
+        if (!fileStream) {
+            throw std::runtime_error("Error reading file: " + this->mShaderFilePath);
+        }
+        fileStream.close();
+
+        SPDLOG_WARN(
+          "Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
+
+        return shaderData;
+    }
+    else if (this->mShaderDataRaw.size()) {
+        return this->mShaderDataRaw;
+    }
+    else {
+        throw std::runtime_error("Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither filepath nor data provided");
+    }
+}
+
+}
+
--- a/src/OpAlgoLhsRhsOut.cpp
+++ b/src/OpAlgoLhsRhsOut.cpp
@ -0,0 +1,129 @@
+#pragma once
+
+#include "kompute/operations/OpAlgoLhsRhsOut.hpp"
+
+namespace kp {
+
+// Base constructor: only emits a debug log line and assigns no members
+// explicitly.
+OpAlgoLhsRhsOut::OpAlgoLhsRhsOut()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
+}
+
+// Constructor with parameters: every argument is forwarded unchanged to the
+// OpAlgoBase constructor. The output staging tensor used to read results
+// back is created later, in init().
+OpAlgoLhsRhsOut::OpAlgoLhsRhsOut(
+  std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+  std::shared_ptr<vk::Device> device,
+  std::shared_ptr<vk::CommandBuffer> commandBuffer,
+  std::vector<std::shared_ptr<Tensor>> tensors,
+  KomputeWorkgroup komputeWorkgroup)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
+}
+
+// Destructor: only emits a debug log line; the body releases nothing
+// explicitly.
+OpAlgoLhsRhsOut::~OpAlgoLhsRhsOut()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut destructor started");
+}
+
+// Validates the LHS / RHS / output tensors, creates the staging tensor used
+// to read the output back, and initialises the Algorithm component.
+//
+// The first three tensors are taken as LHS, RHS and output in that order;
+// any further tensors only trigger a warning.
+//
+// @throws std::runtime_error if fewer than 3 tensors were provided, if any
+//         of the three is not initialised, or if their sizes differ
+void
+OpAlgoLhsRhsOut::init()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut init called");
+
+    if (this->mTensors.size() < 3) {
+        throw std::runtime_error(
+          "Kompute OpAlgoLhsRhsOut called with less than 3 tensors");
+    } else if (this->mTensors.size() > 3) {
+        SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 tensors");
+    }
+
+    this->mTensorLHS = this->mTensors[0];
+    this->mTensorRHS = this->mTensors[1];
+    this->mTensorOutput = this->mTensors[2];
+
+    if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
+          this->mTensorOutput->isInit())) {
+        throw std::runtime_error(
+          "Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. LHS: " +
+          std::to_string(this->mTensorLHS->isInit()) +
+          " RHS: " + std::to_string(this->mTensorRHS->isInit()) +
+          " Output: " + std::to_string(this->mTensorOutput->isInit()));
+    }
+
+    if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
+          this->mTensorRHS->size() == this->mTensorOutput->size())) {
+        throw std::runtime_error(
+          "Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size LHS: " +
+          std::to_string(this->mTensorLHS->size()) +
+          " RHS: " + std::to_string(this->mTensorRHS->size()) +
+          " Output: " + std::to_string(this->mTensorOutput->size()));
+    }
+
+    // Staging tensor built from the output tensor's data; record() copies
+    // the output into it and postEval() reads the result back from it.
+    this->mTensorOutputStaging = std::make_shared<Tensor>(
+      this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
+
+    this->mTensorOutputStaging->init(
+      this->mPhysicalDevice, this->mDevice);
+
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
+
+    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
+
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut Initialising algorithm component");
+
+    this->mAlgorithm->init(shaderFileData, this->mTensors);
+}
+
+// Records, in order: host-write to shader-read barriers for the LHS and RHS
+// tensors, the dispatch with the stored workgroup size, a shader-write to
+// transfer-read barrier on the output tensor, and the copy of the output
+// tensor into the staging tensor.
+void
+OpAlgoLhsRhsOut::record()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut record called");
+
+    // Barrier to ensure the data is finished writing to buffer memory
+    this->mTensorLHS->recordBufferMemoryBarrier(
+      this->mCommandBuffer,
+      vk::AccessFlagBits::eHostWrite,
+      vk::AccessFlagBits::eShaderRead,
+      vk::PipelineStageFlagBits::eHost,
+      vk::PipelineStageFlagBits::eComputeShader);
+    this->mTensorRHS->recordBufferMemoryBarrier(
+      this->mCommandBuffer,
+      vk::AccessFlagBits::eHostWrite,
+      vk::AccessFlagBits::eShaderRead,
+      vk::PipelineStageFlagBits::eHost,
+      vk::PipelineStageFlagBits::eComputeShader);
+
+    this->mAlgorithm->recordDispatch(
+                this->mKomputeWorkgroup.x,
+                this->mKomputeWorkgroup.y,
+                this->mKomputeWorkgroup.z);
+
+    // Barrier to ensure the shader code is executed before buffer read
+    this->mTensorOutput->recordBufferMemoryBarrier(
+      this->mCommandBuffer,
+      vk::AccessFlagBits::eShaderWrite,
+      vk::AccessFlagBits::eTransferRead,
+      vk::PipelineStageFlagBits::eComputeShader,
+      vk::PipelineStageFlagBits::eTransfer);
+
+    // Copy the output tensor into the staging tensor that postEval() reads.
+    // NOTE(review): the meaning of the trailing `true` flag is defined by
+    // Tensor::recordCopyFrom, which is not visible here - confirm.
+    this->mTensorOutputStaging->recordCopyFrom(
+            this->mCommandBuffer,
+            this->mTensorOutput,
+            true);
+}
+
+// Post-evaluation hook: maps the staging tensor's data from host memory and
+// stores it as the data of the output tensor. The log message names postEval
+// so the log matches the function called.
+void
+OpAlgoLhsRhsOut::postEval()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postEval called");
+
+    this->mTensorOutputStaging->mapDataFromHostMemory();
+
+    this->mTensorOutput->setData(this->mTensorOutputStaging->data());
+}
+
+}
+
--- a/src/include/kompute/operations/OpAlgoBase.hpp
+++ b/src/include/kompute/operations/OpAlgoBase.hpp
@ -17,20 +17,17 @@ namespace kp {
 * Operation that provides a general abstraction that simplifies the use of 
 * algorithm and parameter components which can be used with shaders.
 * By default it enables the user to provide a dynamic number of tensors
- * which are then passed as inputs. 
- *
- * All of these tensors are expected to be initlaised and this is checked with throw std exception in the init function.
- *
- * See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
- * 
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * which are then passed as inputs.
 */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
 class OpAlgoBase : public OpBase
 {
  public:
+    /**
+     * Dispatch size passed to the algorithm's recordDispatch(x, y, z).
+     * The constructors treat x == 0 as "not provided" and then use the size
+     * of the first tensor for x with y and z set to 1; when x is provided,
+     * a y or z left at 0 is replaced by 1.
+     */
+    struct KomputeWorkgroup {
+        uint32_t x; ///< Dispatch size along X; 0 selects the tensor-size default
+        uint32_t y; ///< Dispatch size along Y; 0 is replaced by 1
+        uint32_t z; ///< Dispatch size along Z; 0 is replaced by 1
+    };
+
    /**
     *  Base constructor, should not be used unless explicitly intended.
     */
@ -46,11 +43,13 @@ class OpAlgoBase : public OpBase
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
     * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
     */
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors);
+           std::vector<std::shared_ptr<Tensor>>& tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());

    /**
     * Constructor that enables a file to be passed to the operation with
@ -61,13 +60,15 @@ class OpAlgoBase : public OpBase
     * @param device Vulkan logical device for passing to Algorithm
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
-     * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+     * @param shaderFilePath Parameter to specify the shader to load (either in spirv or raw format)
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
     */
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
           std::vector<std::shared_ptr<Tensor>>& tensors,
-           std::string shaderFilePath);
+           std::string shaderFilePath,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());

    /**
     * Constructor that enables raw shader data to be passed to the main operation
@ -78,12 +79,14 @@ class OpAlgoBase : public OpBase
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
     * @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
     */
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
           std::vector<std::shared_ptr<Tensor>>& tensors,
-           const std::vector<char>& shaderDataRaw);
+           const std::vector<char>& shaderDataRaw,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());

    /**
     * Default destructor, which is in charge of destroying the algorithm
@ -131,9 +134,7 @@ class OpAlgoBase : public OpBase

    // -------------- ALWAYS OWNED RESOURCES

-    uint32_t mX;
-    uint32_t mY;
-    uint32_t mZ;
+    KomputeWorkgroup mKomputeWorkgroup;

    std::string mShaderFilePath; ///< Optional member variable which can be provided for the OpAlgoBase to find the data automatically and load for processing
    std::vector<char> mShaderDataRaw; ///< Optional member variable which can be provided to contain either the raw shader content or the spirv binary content
@ -143,174 +144,3 @@ class OpAlgoBase : public OpBase

 } // End namespace kp

-// Including implementation for template class
-#ifndef OPALGOBASE_IMPL
-#define OPALGOBASE_IMPL
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors)
-  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
-
-    // The dispatch size is set up based on either explicitly provided template
-    // parameters or by default it would take the shape and size of the tensors
-    if (tX > 0) {
-        // If at least the x value is provided we use mainly the parameters
-        // provided
-        this->mX = tX;
-        this->mY = tY > 0 ? tY : 1;
-        this->mZ = tZ > 0 ? tZ : 1;
-    } else {
-        this->mX = tensors[0]->size();
-        this->mY = 1;
-        this->mZ = 1;
-    }
-    SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
-                 this->mX,
-                 this->mY,
-                 this->mZ);
-
-    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           std::string shaderFilePath)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
-
-    this->mShaderFilePath = shaderFilePath;
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           const std::vector<char>& shaderDataRaw)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
-
-    this->mShaderDataRaw = shaderDataRaw;
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase init called");
-
-    if (this->mTensors.size() < 1) {
-        throw std::runtime_error(
-          "Kompute OpAlgoBase called with less than 1 tensor");
-    } 
-
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        if(!tensor->isInit()) {
-            throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised.");
-        }
-    }
-
-    SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        tensor->recordBufferMemoryBarrier(
-          this->mCommandBuffer,
-          vk::AccessFlagBits::eHostWrite,
-          vk::AccessFlagBits::eShaderRead,
-          vk::PipelineStageFlagBits::eHost,
-          vk::PipelineStageFlagBits::eComputeShader);
-    }
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::preEval()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::postEval()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-std::vector<char> OpAlgoBase<tX, tY, tZ>::fetchSpirvBinaryData() 
-{
-    SPDLOG_WARN(
-      "Kompute OpAlgoBase Running shaders directly from spirv file");
-
-    if (this->mShaderFilePath.size()) {
-        std::ifstream fileStream(this->mShaderFilePath,
-                                 std::ios::binary | std::ios::in | std::ios::ate);
-
-        if (!fileStream.good()) {
-            throw std::runtime_error("Error reading file: " + this->mShaderFilePath);
-        }
-
-        size_t shaderFileSize = fileStream.tellg();
-        fileStream.seekg(0, std::ios::beg);
-        char* shaderDataRaw = new char[shaderFileSize];
-        fileStream.read(shaderDataRaw, shaderFileSize);
-        fileStream.close();
-
-        SPDLOG_WARN(
-          "Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
-
-        return std::vector<char>(shaderDataRaw,
-                                 shaderDataRaw + shaderFileSize);
-    }
-    else if (this->mShaderDataRaw.size()) {
-        return this->mShaderDataRaw;
-    }
-    else {
-        throw std::runtime_error("Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither filepath nor data provided");
-    }
-}
-
-}
-
-#endif // #ifndef OPALGOBASE_IMPL
-
--- a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
+++ b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
@ -15,12 +15,8 @@ namespace kp {
 * Operation base class to simplify the creation of operations that require
 * right hand and left hand side datapoints together with a single output.
 * The expected data passed is two input tensors and one output tensor.
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
 */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
+class OpAlgoLhsRhsOut : public OpAlgoBase
 {
  public:
    /**
@ -38,11 +34,13 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
     * @param freeTensors Whether operation manages the memory of the Tensors
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
     */
    OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors);
+           std::vector<std::shared_ptr<Tensor>> tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());

    /**
     * Default destructor, which is in charge of destroying the algorithm
@ -73,7 +71,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
     * of the GPU Device memory into the staging buffer so the output data can
     * be retrieved.
     */
-    virtual void postSubmit() override;
+    virtual void postEval() override;

  protected:
    // -------------- NEVER OWNED RESOURCES
@ -87,136 +85,3 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>

 } // End namespace kp

-// Including implementation for template class
-#ifndef OPALGOLHSRHSOUT_CPP
-#define OPALGOLHSRHSOUT_CPP
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>> tensors)
-  // The inheritance is initialised with the copyOutputData to false given that
-  // this depencendant class handles the transfer of data via staging buffers in 
-  // a granular way.
-  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::~OpAlgoLhsRhsOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut destructor started");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut init called");
-
-    if (this->mTensors.size() < 3) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut called with less than 1 tensor");
-    } else if (this->mTensors.size() > 3) {
-        SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
-    }
-
-    this->mTensorLHS = this->mTensors[0];
-    this->mTensorRHS = this->mTensors[1];
-    this->mTensorOutput = this->mTensors[2];
-
-
-    if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
-          this->mTensorOutput->isInit())) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. LHS: " +
-          std::to_string(this->mTensorLHS->isInit()) +
-          " RHS: " + std::to_string(this->mTensorRHS->isInit()) +
-          " Output: " + std::to_string(this->mTensorOutput->isInit()));
-    }
-
-    if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
-          this->mTensorRHS->size() == this->mTensorOutput->size())) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size LHS: " +
-          std::to_string(this->mTensorLHS->size()) +
-          " RHS: " + std::to_string(this->mTensorRHS->size()) +
-          " Output: " + std::to_string(this->mTensorOutput->size()));
-    }
-
-    this->mTensorOutputStaging = std::make_shared<Tensor>(
-      this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
-
-    this->mTensorOutputStaging->init(
-      this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
-
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    this->mTensorLHS->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eHostWrite,
-      vk::AccessFlagBits::eShaderRead,
-      vk::PipelineStageFlagBits::eHost,
-      vk::PipelineStageFlagBits::eComputeShader);
-    this->mTensorRHS->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eHostWrite,
-      vk::AccessFlagBits::eShaderRead,
-      vk::PipelineStageFlagBits::eHost,
-      vk::PipelineStageFlagBits::eComputeShader);
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-
-    // Barrier to ensure the shader code is executed before buffer read
-    this->mTensorOutput->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eShaderWrite,
-      vk::AccessFlagBits::eTransferRead,
-      vk::PipelineStageFlagBits::eComputeShader,
-      vk::PipelineStageFlagBits::eTransfer);
-
-    this->mTensorOutputStaging->recordCopyFrom(
-            this->mCommandBuffer,
-            this->mTensorOutput,
-            true);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::postSubmit()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");
-
-    this->mTensorOutputStaging->mapDataFromHostMemory();
-
-    this->mTensorOutput->setData(this->mTensorOutputStaging->data());
-}
-
-}
-
-#endif // #ifndef OPALGOLHSRHSOUT_CPP
-
--- a/src/include/kompute/operations/OpMult.hpp
+++ b/src/include/kompute/operations/OpMult.hpp
@ -17,12 +17,9 @@ namespace kp {

 /**
 * Operation that performs multiplication on two tensors and outpus on third
- * tensor. The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * tensor.
 */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpMult : public OpAlgoBase<tX, tY, tZ>
+class OpMult : public OpAlgoBase
 {
  public:
    /**
@ -41,13 +38,14 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
     * @param device Vulkan logical device for passing to Algorithm
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
     */
    OpMult(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors)
-      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
+           std::vector<std::shared_ptr<Tensor>> tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup())
+      : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "", komputeWorkgroup)
    {
        SPDLOG_DEBUG("Kompute OpMult constructor with params");

@ -58,14 +56,8 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>

 #if RELEASE
    /**
-     * If release it will be using the static version of the shader which is 
-     * loaded using this file directly.
-     *
-     * @param physicalDevice Vulkan physical device used to find device queues
-     * @param device Vulkan logical device for passing to Algorithm
-     * @param commandBuffer Vulkan Command Buffer to record commands into
-     * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
+     * If RELEASE=1 it will be using the static version of the shader which is 
+     * loaded using this file directly. Otherwise it should not override the function.
     */
    std::vector<char> fetchSpirvBinaryData() override
    {