From b0d394a50b6f7f633f41073d75d4774b0bb4fe99 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 16:29:27 +0000
Subject: [PATCH] Updated single include with non-templated opalgobase classes

---
 single_include/kompute/Kompute.hpp | 374 +++--------------------------
 1 file changed, 31 insertions(+), 343 deletions(-)
diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index 8def06e4a..382b7131d 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -1620,20 +1620,17 @@ namespace kp {
  * Operation that provides a general abstraction that simplifies the use of 
  * algorithm and parameter components which can be used with shaders.
  * By default it enables the user to provide a dynamic number of tensors
- * which are then passed as inputs. 
- *
- * All of these tensors are expected to be initlaised and this is checked with throw std exception in the init function.
- *
- * See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
- * 
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * which are then passed as inputs.
  */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
 class OpAlgoBase : public OpBase
 {
   public:
+    struct KomputeWorkgroup {
+        uint32_t x;
+        uint32_t y;
+        uint32_t z;
+    };
+
     /**
      *  Base constructor, should not be used unless explicitly intended.
      */
@@ -1649,11 +1646,13 @@ class OpAlgoBase : public OpBase
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
      * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+     * @param komputeWorkgroup Optional KomputeWorkgroup (x, y, z) specifying the layout for processing
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors);
+           std::vector<std::shared_ptr<Tensor>>& tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Constructor that enables a file to be passed to the operation with
@@ -1664,13 +1663,15 @@ class OpAlgoBase : public OpBase
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
-     * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+     * @param shaderFilePath Parameter to specify the shader to load (either in spirv or raw format)
+     * @param komputeWorkgroup Optional KomputeWorkgroup (x, y, z) specifying the layout for processing
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>>& tensors,
-           std::string shaderFilePath);
+           std::string shaderFilePath,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Constructor that enables raw shader data to be passed to the main operation
@@ -1681,12 +1682,14 @@ class OpAlgoBase : public OpBase
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
      * @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
+     * @param komputeWorkgroup Optional KomputeWorkgroup (x, y, z) specifying the layout for processing
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>>& tensors,
-           const std::vector<char>& shaderDataRaw);
+           const std::vector<char>& shaderDataRaw,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Default destructor, which is in charge of destroying the algorithm
@@ -1733,9 +1736,7 @@ class OpAlgoBase : public OpBase
 
     // -------------- ALWAYS OWNED RESOURCES
 
-    uint32_t mX;
-    uint32_t mY;
-    uint32_t mZ;
+    KomputeWorkgroup mKomputeWorkgroup;
 
     std::string mShaderFilePath; ///< Optional member variable which can be provided for the OpAlgoBase to find the data automatically and load for processing
     std::vector<char> mShaderDataRaw; ///< Optional member variable which can be provided to contain either the raw shader content or the spirv binary content
@@ -1745,177 +1746,6 @@ class OpAlgoBase : public OpBase
 
 } // End namespace kp
 
-// Including implementation for template class
-#ifndef OPALGOBASE_IMPL
-#define OPALGOBASE_IMPL
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors)
-  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
-
-    // The dispatch size is set up based on either explicitly provided template
-    // parameters or by default it would take the shape and size of the tensors
-    if (tX > 0) {
-        // If at least the x value is provided we use mainly the parameters
-        // provided
-        this->mX = tX;
-        this->mY = tY > 0 ? tY : 1;
-        this->mZ = tZ > 0 ? tZ : 1;
-    } else {
-        this->mX = tensors[0]->size();
-        this->mY = 1;
-        this->mZ = 1;
-    }
-    SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
-                 this->mX,
-                 this->mY,
-                 this->mZ);
-
-    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           std::string shaderFilePath)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
-
-    this->mShaderFilePath = shaderFilePath;
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           const std::vector<char>& shaderDataRaw)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
-
-    this->mShaderDataRaw = shaderDataRaw;
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase init called");
-
-    if (this->mTensors.size() < 1) {
-        throw std::runtime_error(
-          "Kompute OpAlgoBase called with less than 1 tensor");
-    } 
-
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        if(!tensor->isInit()) {
-            throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised.");
-        }
-    }
-
-    SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        tensor->recordBufferMemoryBarrier(
-          this->mCommandBuffer,
-          vk::AccessFlagBits::eHostWrite,
-          vk::AccessFlagBits::eShaderRead,
-          vk::PipelineStageFlagBits::eHost,
-          vk::PipelineStageFlagBits::eComputeShader);
-    }
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::preEval()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::postEval()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-std::vector<char> OpAlgoBase<tX, tY, tZ>::fetchSpirvBinaryData() 
-{
-    SPDLOG_WARN(
-      "Kompute OpAlgoBase Running shaders directly from spirv file");
-
-    if (this->mShaderFilePath.size()) {
-        std::ifstream fileStream(this->mShaderFilePath,
-                                 std::ios::binary | std::ios::in | std::ios::ate);
-
-        if (!fileStream.good()) {
-            throw std::runtime_error("Error reading file: " + this->mShaderFilePath);
-        }
-
-        size_t shaderFileSize = fileStream.tellg();
-        fileStream.seekg(0, std::ios::beg);
-        char* shaderDataRaw = new char[shaderFileSize];
-        fileStream.read(shaderDataRaw, shaderFileSize);
-        fileStream.close();
-
-        SPDLOG_WARN(
-          "Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
-
-        return std::vector<char>(shaderDataRaw,
-                                 shaderDataRaw + shaderFileSize);
-    }
-    else if (this->mShaderDataRaw.size()) {
-        return this->mShaderDataRaw;
-    }
-    else {
-        throw std::runtime_error("Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither filepath nor data provided");
-    }
-}
-
-}
-
-#endif // #ifndef OPALGOBASE_IMPL
-
 #include <fstream>
 
 namespace kp {
@@ -1924,12 +1754,8 @@ namespace kp {
  * Operation base class to simplify the creation of operations that require
  * right hand and left hand side datapoints together with a single output.
  * The expected data passed is two input tensors and one output tensor.
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
  */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
+class OpAlgoLhsRhsOut : public OpAlgoBase
 {
   public:
     /**
@@ -1947,11 +1773,13 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
      * @param freeTensors Whether operation manages the memory of the Tensors
+     * @param komputeWorkgroup Optional KomputeWorkgroup (x, y, z) specifying the layout for processing
      */
     OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors);
+           std::vector<std::shared_ptr<Tensor>> tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Default destructor, which is in charge of destroying the algorithm
@@ -1982,7 +1810,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
      * of the GPU Device memory into the staging buffer so the output data can
      * be retrieved.
      */
-    virtual void postSubmit() override;
+    virtual void postEval() override;
 
   protected:
     // -------------- NEVER OWNED RESOURCES
@@ -1996,138 +1824,6 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
 
 } // End namespace kp
 
-// Including implementation for template class
-#ifndef OPALGOLHSRHSOUT_CPP
-#define OPALGOLHSRHSOUT_CPP
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>> tensors)
-  // The inheritance is initialised with the copyOutputData to false given that
-  // this depencendant class handles the transfer of data via staging buffers in 
-  // a granular way.
-  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::~OpAlgoLhsRhsOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut destructor started");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut init called");
-
-    if (this->mTensors.size() < 3) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut called with less than 1 tensor");
-    } else if (this->mTensors.size() > 3) {
-        SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
-    }
-
-    this->mTensorLHS = this->mTensors[0];
-    this->mTensorRHS = this->mTensors[1];
-    this->mTensorOutput = this->mTensors[2];
-
-    if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
-          this->mTensorOutput->isInit())) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. LHS: " +
-          std::to_string(this->mTensorLHS->isInit()) +
-          " RHS: " + std::to_string(this->mTensorRHS->isInit()) +
-          " Output: " + std::to_string(this->mTensorOutput->isInit()));
-    }
-
-    if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
-          this->mTensorRHS->size() == this->mTensorOutput->size())) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size LHS: " +
-          std::to_string(this->mTensorLHS->size()) +
-          " RHS: " + std::to_string(this->mTensorRHS->size()) +
-          " Output: " + std::to_string(this->mTensorOutput->size()));
-    }
-
-    this->mTensorOutputStaging = std::make_shared<Tensor>(
-      this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
-
-    this->mTensorOutputStaging->init(
-      this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
-
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    this->mTensorLHS->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eHostWrite,
-      vk::AccessFlagBits::eShaderRead,
-      vk::PipelineStageFlagBits::eHost,
-      vk::PipelineStageFlagBits::eComputeShader);
-    this->mTensorRHS->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eHostWrite,
-      vk::AccessFlagBits::eShaderRead,
-      vk::PipelineStageFlagBits::eHost,
-      vk::PipelineStageFlagBits::eComputeShader);
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-
-    // Barrier to ensure the shader code is executed before buffer read
-    this->mTensorOutput->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eShaderWrite,
-      vk::AccessFlagBits::eTransferRead,
-      vk::PipelineStageFlagBits::eComputeShader,
-      vk::PipelineStageFlagBits::eTransfer);
-
-    this->mTensorOutputStaging->recordCopyFrom(
-            this->mCommandBuffer,
-            this->mTensorOutput,
-            true);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::postSubmit()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");
-
-    this->mTensorOutputStaging->mapDataFromHostMemory();
-
-    this->mTensorOutput->setData(this->mTensorOutputStaging->data());
-}
-
-}
-
-#endif // #ifndef OPALGOLHSRHSOUT_CPP
-
 #include <fstream>
 
 #if RELEASE
@@ -2138,12 +1834,9 @@ namespace kp {
 
 /**
  * Operation that performs multiplication on two tensors and outpus on third
- * tensor. The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * tensor.
  */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpMult : public OpAlgoBase<tX, tY, tZ>
+class OpMult : public OpAlgoBase
 {
   public:
     /**
@@ -2162,13 +1855,14 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
+     * @param komputeWorkgroup Optional KomputeWorkgroup (x, y, z) specifying the layout for processing
      */
     OpMult(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors)
-      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
+           std::vector<std::shared_ptr<Tensor>> tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup())
+      : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "", komputeWorkgroup)
     {
         SPDLOG_DEBUG("Kompute OpMult constructor with params");
 
@@ -2179,14 +1873,8 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
 
 #if RELEASE
     /**
-     * If release it will be using the static version of the shader which is 
-     * loaded using this file directly.
-     *
-     * @param physicalDevice Vulkan physical device used to find device queues
-     * @param device Vulkan logical device for passing to Algorithm
-     * @param commandBuffer Vulkan Command Buffer to record commands into
-     * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
+     * If RELEASE=1 this override is compiled in and uses the static version of
+     * the shader, loaded directly from this file. Otherwise it is not overridden.
      */
     std::vector<char> fetchSpirvBinaryData() override
     {