diff --git a/README.md b/README.md
index e1c19f185..c8650b4e1 100644
--- a/README.md
+++ b/README.md
@@ -15,13 +15,15 @@
 <td>
 
 <h1>Vulkan Kompute</h1>
-<h3>The General Purpose Vulkan Compute Framework. Blazing fast, lightweight, easy to set up and optimized for advanced data processing usecases.</h3>
+<h3>The General Purpose Vulkan Compute Framework. </h3>
 
 </td>
 
 </tr>
 </table>
 
+<h4>Blazing fast, lightweight, easy to set up and optimized for advanced data processing use cases.</h4>
+
 🔋 [Documentation](https://axsaucedo.github.io/vulkan-kompute/) 💻 [Import to your project](https://axsaucedo.github.io/vulkan-kompute/) ⌨ [Tutorials](https://axsaucedo.github.io/vulkan-kompute/) 💾
 
 
@@ -38,7 +40,7 @@
 
 ### Setup
 
-Kompute is provided as a single header file `Kompute.hpp` that can be simply included in your code.
+Kompute is provided as a single header file [`Kompute.hpp`](single_include/kompute/Kompute.hpp) that can be simply included in your code.
 
 You can go to our [release page]() to grab the latest library or you can [build from source]().
 
@@ -62,7 +64,7 @@ int main() {
     mgr.evalOp<kp::OpCreateTensor>(params);
 
     // Run Kompute operation on the parameters provided with dispatch layout
-    mgr.evalOp<kp::OpAlgoShader<10, 1, 1>>(params, "path/to/shader.comp.spv");
+    mgr.evalOp<kp::OpAlgoShader<3, 1, 1>>(params, "path/to/shader.comp.spv");
 
     // Print the output
     std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl;
diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index 204fd3823..e9aa48848 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -214,7 +214,7 @@ class Tensor
     ~Tensor();
 
     /**
-     * Initialiser creates the buffer and GPU memory.
+     * Initialiser which calls the initialisation for all the respective tensors and creates the respective staging tensors. The staging tensors would only be created for the tensors of type TensorTypes::eDevice, as otherwise there is no need to copy from host memory.
      */
     void init(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
               std::shared_ptr<vk::Device> device,
@@ -383,6 +383,7 @@ class OpBase
         this->mDevice = device;
         this->mCommandBuffer = commandBuffer;
         this->mTensors = tensors;
+        this->mFreeTensors = freeTensors;
     }
 
     /**
@@ -1105,7 +1106,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
     OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors);
+           std::vector<std::shared_ptr<Tensor>> tensors);
 
     /**
      * Default destructor, which is in charge of destroying the algorithm
@@ -1166,7 +1167,7 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
 OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
                            std::shared_ptr<vk::Device> device,
                            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors)
+                           std::vector<std::shared_ptr<Tensor>> tensors)
   // The inheritance is initialised with the copyOutputData to false given that
   // this depencendant class handles the transfer of data via staging buffers in 
   // a granular way.
@@ -1318,7 +1319,7 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
     OpMult(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors)
+           std::vector<std::shared_ptr<Tensor>> tensors)
       : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true)
     {
         SPDLOG_DEBUG("Kompute OpMult constructor with params");
@@ -1383,13 +1384,13 @@ class OpCreateTensor : public OpBase
      * @param physicalDevice Vulkan physical device used to find device queues
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
-     * @param tensors Tensors that are to be used in this operation
+     * @param tensors Tensors that will be initialised (created) by this operation.
      * @param freeTensors Whether operation manages the memory of the Tensors
      */
     OpCreateTensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
                    std::shared_ptr<vk::Device> device,
                    std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                   std::vector<std::shared_ptr<Tensor>>& tensors);
+                   std::vector<std::shared_ptr<Tensor>> tensors);
 
     /**
      * Default destructor which in this case expects the parent class to free
@@ -1418,8 +1419,7 @@ class OpCreateTensor : public OpBase
 
   private:
     // Never owned resources
-    std::shared_ptr<Tensor> mPrimaryTensor;
-    std::shared_ptr<Tensor> mStagingTensor;
+    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
 };
 
 } // End namespace kp
diff --git a/src/OpCreateTensor.cpp b/src/OpCreateTensor.cpp
index 266f57479..f99a81ba5 100644
--- a/src/OpCreateTensor.cpp
+++ b/src/OpCreateTensor.cpp
@@ -14,7 +14,7 @@ OpCreateTensor::OpCreateTensor(
   std::shared_ptr<vk::PhysicalDevice> physicalDevice,
   std::shared_ptr<vk::Device> device,
   std::shared_ptr<vk::CommandBuffer> commandBuffer,
-  std::vector<std::shared_ptr<Tensor>>& tensors)
+  std::vector<std::shared_ptr<Tensor>> tensors)
   : OpBase(physicalDevice, device, commandBuffer, tensors, true)
 {
     SPDLOG_DEBUG("Kompute OpCreateTensor constructor with params");
@@ -23,6 +23,13 @@ OpCreateTensor::OpCreateTensor(
 OpCreateTensor::~OpCreateTensor()
 {
     SPDLOG_DEBUG("Kompute OpCreateTensor destructor started");
+
+    SPDLOG_DEBUG("Kompute OpCreateTensor destroying staging tensors");
+    for (size_t i = 0; i < this->mStagingTensors.size(); i++) {
+        if (this->mStagingTensors[i]) {
+            this->mStagingTensors[i]->freeMemoryDestroyGPUResources();
+        }
+    }
 }
 
 void
@@ -33,30 +40,35 @@ OpCreateTensor::init()
     if (this->mTensors.size() < 1) {
         throw std::runtime_error(
           "Kompute OpCreateTensor called with less than 1 tensor");
-    } else if (this->mTensors.size() > 1) {
-        spdlog::warn("Kompute OpCreateTensor called with more than 1 tensor");
     }
 
-    this->mPrimaryTensor = this->mTensors[0];
+    for (std::shared_ptr<Tensor> tensor: this->mTensors) {
+        if (tensor->isInit()) {
+            throw std::runtime_error("Kompute OpCreateTensor: Tensor has already been initialized");
+        }
+        if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
+            tensor->init(
+              this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
 
-    if (this->mPrimaryTensor->tensorType() == Tensor::TensorTypes::eDevice) {
-        this->mPrimaryTensor->init(
-          this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
+            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
+              tensor->data(), Tensor::TensorTypes::eStaging);
 
-        this->mStagingTensor = std::make_shared<Tensor>(
-          this->mPrimaryTensor->data(), Tensor::TensorTypes::eStaging);
+            stagingTensor->init(
+              this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
 
-        this->mStagingTensor->init(
-          this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
+            stagingTensor->mapDataIntoHostMemory();
 
-        this->mStagingTensor->mapDataIntoHostMemory();
+            this->mStagingTensors.push_back(stagingTensor);
 
-        // Adding to the OpBase owned resource so they are freed
-        this->mTensors.push_back(this->mStagingTensor);
+        } else {
 
-    } else {
-        this->mPrimaryTensor->init(
-          this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
+            tensor->init(
+              this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
+
+            // No staging tensor is needed for this tensor, so push a nullptr
+            // placeholder to keep mStagingTensors index-aligned with mTensors
+            this->mStagingTensors.push_back(nullptr);
+        }
     }
 }
 
@@ -65,8 +77,10 @@ OpCreateTensor::record()
 {
     SPDLOG_DEBUG("Kompute OpCreateTensor record called");
 
-    if (this->mPrimaryTensor->tensorType() == Tensor::TensorTypes::eDevice) {
-        this->mPrimaryTensor->recordCopyFrom(this->mStagingTensor, true);
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
+            this->mTensors[i]->recordCopyFrom(this->mStagingTensors[i], false);
+        }
     }
 }
 
@@ -75,9 +89,13 @@ OpCreateTensor::postSubmit()
 {
     SPDLOG_DEBUG("Kompute OpCreateTensor postSubmit called");
 
-    this->mStagingTensor->mapDataFromHostMemory();
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
+            this->mStagingTensors[i]->mapDataFromHostMemory();
 
-    this->mPrimaryTensor->setData(this->mStagingTensor->data());
+            this->mTensors[i]->setData(this->mStagingTensors[i]->data());
+        }
+    }
 }
 
 }
diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index 882f11630..36871ea7a 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -25,7 +25,7 @@ Tensor::Tensor(std::vector<uint32_t> data, TensorTypes tensorType)
 
 Tensor::~Tensor()
 {
-    SPDLOG_DEBUG("Kompute Tensor destructor started");
+    SPDLOG_DEBUG("Kompute Tensor destructor started. Type: {}", this->tensorType());
 
     if (this->isInit()) {
         this->freeMemoryDestroyGPUResources();
diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp
index e5cdd6932..d5649f17b 100644
--- a/src/include/kompute/Tensor.hpp
+++ b/src/include/kompute/Tensor.hpp
@@ -52,7 +52,7 @@ class Tensor
     ~Tensor();
 
     /**
-     * Initialiser creates the buffer and GPU memory.
+     * Initialiser which calls the initialisation for all the respective tensors and creates the respective staging tensors. The staging tensors would only be created for the tensors of type TensorTypes::eDevice, as otherwise there is no need to copy from host memory.
      */
     void init(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
               std::shared_ptr<vk::Device> device,
diff --git a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
index dca11eb0c..2480ea6e4 100644
--- a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
+++ b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
@@ -42,7 +42,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
     OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors);
+           std::vector<std::shared_ptr<Tensor>> tensors);
 
     /**
      * Default destructor, which is in charge of destroying the algorithm
@@ -103,7 +103,7 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
 OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
                            std::shared_ptr<vk::Device> device,
                            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors)
+                           std::vector<std::shared_ptr<Tensor>> tensors)
   // The inheritance is initialised with the copyOutputData to false given that
   // this depencendant class handles the transfer of data via staging buffers in 
   // a granular way.
diff --git a/src/include/kompute/operations/OpBase.hpp b/src/include/kompute/operations/OpBase.hpp
index 41d8f50f3..c8c1af432 100644
--- a/src/include/kompute/operations/OpBase.hpp
+++ b/src/include/kompute/operations/OpBase.hpp
@@ -45,6 +45,7 @@ class OpBase
         this->mDevice = device;
         this->mCommandBuffer = commandBuffer;
         this->mTensors = tensors;
+        this->mFreeTensors = freeTensors;
     }
 
     /**
diff --git a/src/include/kompute/operations/OpCreateTensor.hpp b/src/include/kompute/operations/OpCreateTensor.hpp
index e7f7320af..f08bef14c 100644
--- a/src/include/kompute/operations/OpCreateTensor.hpp
+++ b/src/include/kompute/operations/OpCreateTensor.hpp
@@ -25,13 +25,13 @@ class OpCreateTensor : public OpBase
      * @param physicalDevice Vulkan physical device used to find device queues
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
-     * @param tensors Tensors that are to be used in this operation
+     * @param tensors Tensors that will be initialised (created) by this operation.
      * @param freeTensors Whether operation manages the memory of the Tensors
      */
     OpCreateTensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
                    std::shared_ptr<vk::Device> device,
                    std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                   std::vector<std::shared_ptr<Tensor>>& tensors);
+                   std::vector<std::shared_ptr<Tensor>> tensors);
 
     /**
      * Default destructor which in this case expects the parent class to free
@@ -60,8 +60,7 @@ class OpCreateTensor : public OpBase
 
   private:
     // Never owned resources
-    std::shared_ptr<Tensor> mPrimaryTensor;
-    std::shared_ptr<Tensor> mStagingTensor;
+    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
 };
 
 } // End namespace kp
diff --git a/src/include/kompute/operations/OpMult.hpp b/src/include/kompute/operations/OpMult.hpp
index f2b62da91..45a63f54c 100644
--- a/src/include/kompute/operations/OpMult.hpp
+++ b/src/include/kompute/operations/OpMult.hpp
@@ -46,7 +46,7 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
     OpMult(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors)
+           std::vector<std::shared_ptr<Tensor>> tensors)
       : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true)
     {
         SPDLOG_DEBUG("Kompute OpMult constructor with params");