From 1b4e4b6b18c9a78995f74ad09b9b180018df28fd Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 6 Sep 2020 12:52:45 +0100
Subject: [PATCH] Added optensorSyncDevice and optensorcopy tests

---
 single_include/AggregateHeaders.cpp           |   2 +
 single_include/kompute/Kompute.hpp            | 116 +++++++++++++-
 src/OpTensorCopy.cpp                          |   2 +-
 src/OpTensorCreate.cpp                        |  25 +---
 src/OpTensorSyncDevice.cpp                    |  50 +++++--
 src/OpTensorSyncLocal.cpp                     |  97 ++++++++++++
 src/include/kompute/operations/OpAlgoBase.hpp |   1 +
 .../kompute/operations/OpTensorCreate.hpp     |   8 +-
 .../kompute/operations/OpTensorSyncDevice.hpp |   8 +-
 .../kompute/operations/OpTensorSyncLocal.hpp  |  60 ++++++++
 test/TestOpTensorCopy.cpp                     | 141 +++++++++---------
 test/TestTensor.cpp                           |   4 +-
 12 files changed, 398 insertions(+), 116 deletions(-)
 create mode 100644 src/OpTensorSyncLocal.cpp
 create mode 100644 src/include/kompute/operations/OpTensorSyncLocal.hpp
diff --git a/single_include/AggregateHeaders.cpp b/single_include/AggregateHeaders.cpp
index 57cd16e5e..dd47e9ee3 100644
--- a/single_include/AggregateHeaders.cpp
+++ b/single_include/AggregateHeaders.cpp
@@ -8,5 +8,7 @@
 #include "kompute/operations/OpMult.hpp"
 #include "kompute/operations/OpTensorCreate.hpp"
 #include "kompute/operations/OpTensorCopy.hpp"
+#include "kompute/operations/OpTensorSyncDevice.hpp"
+#include "kompute/operations/OpTensorSyncLocal.hpp"
 #include "kompute/Algorithm.hpp"
 #include "kompute/Tensor.hpp"
diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index 2d718b9e6..83e97fdd8 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -1150,6 +1150,7 @@ OpAlgoBase<tX, tY, tZ>::record()
         }
 
         // Record copy from and create barrier for STAGING tensors
+        // TODO: This only accounts for device tensors; staging and storage tensors still need to be handled
         for (size_t i = 0; i < this->mTensors.size(); i++) {
             this->mOutputStagingTensors[i]->recordCopyFrom(
                 this->mCommandBuffer,
@@ -1539,15 +1540,17 @@ class OpTensorCreate : public OpBase
     /**
      * In charge of initialising the primary Tensor as well as the staging
      * tensor as required. It will only initialise a staging tensor if the
-     * Primary tensor is of type Device.
+     * Primary tensor is of type Device. For staging tensors it performs a 
+     * mapDataIntoHostMemory which would perform immediately as opposed to 
+     * on sequence eval/submission.
      */
     void init() override;
 
     /**
      * Record runs the core actions to create the tensors. For device tensors
      * it records a copyCommand to move the data from the staging tensor to the 
-     * device tensor. For staging tensors it performs a mapDataIntoHostMemory
-     * which would perform immediately as opposed to on sequence eval/submission.
+     * device tensor. The mapping for staging tensors happens in the init function
+     * not in the record function.
      */
     void record() override;
 
@@ -1567,7 +1570,7 @@ class OpTensorCreate : public OpBase
 namespace kp {
 
 /**
-    Operation that copies the data from the first tensor to the rest of the tensors provided, using a record command for all the vectors. This operation does not own/manage the memory of the tensors passed to it.
+    Operation that copies the data from the first tensor to the rest of the tensors provided, using a record command for all the vectors. This operation does not own/manage the memory of the tensors passed to it. The operation must not receive tensors of type TensorTypes::eStorage.
 */
 class OpTensorCopy : public OpBase
 {
@@ -1588,13 +1591,12 @@ class OpTensorCopy : public OpBase
                    std::vector<std::shared_ptr<Tensor>> tensors);
 
     /**
-     * Default destructor which in this case expects the parent class to free
-     * the tensors
+     * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release.
      */
     ~OpTensorCopy() override;
 
     /**
-     * TODO
+     * Performs basic checks such as ensuring there are at least two tensors provided, that they are initialised and that they are not of type TensorTypes::eStorage.
      */
     void init() override;
 
@@ -1612,3 +1614,103 @@ class OpTensorCopy : public OpBase
 };
 
 } // End namespace kp
+
+namespace kp {
+
+/**
+    Operation that syncs the tensor's device memory by mapping its local data into it. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only perform a map, which is executed immediately in the init function (as opposed to during the sequence eval/submit). This operation cannot be carried out for TensorTypes::eStorage.
+*/
+class OpTensorSyncDevice : public OpBase
+{
+  public:
+    OpTensorSyncDevice();
+
+    /**
+     * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensors provided cannot be of type TensorTypes::eStorage.
+     *
+     * @param physicalDevice Vulkan physical device used to find device queues
+     * @param device Vulkan logical device for passing to Algorithm
+     * @param commandBuffer Vulkan Command Buffer to record commands into
+     * @param tensors Tensors that will be used to create in operation.
+     */
+    OpTensorSyncDevice(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                   std::shared_ptr<vk::Device> device,
+                   std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                   std::vector<std::shared_ptr<Tensor>> tensors);
+
+    /**
+     * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release.
+     */
+    ~OpTensorSyncDevice() override;
+
+    /**
+     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTypes::eStorage. For staging tensors in host memory, the map is performed during the init function.
+     */
+    void init() override;
+
+    /**
+     * For device tensors, it records the copy command to the device tensor from the temporary staging tensor.
+     */
+    void record() override;
+
+    /**
+     * Does not perform any further sync functions. Frees the staging tensors together with their respective memory.
+     */
+    void postSubmit() override;
+
+  private:
+    // Temporary staging tensors created by this operation in init and released in postSubmit
+    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
+};
+
+} // End namespace kp
+
+namespace kp {
+
+/**
+    Operation that syncs the tensor's local data by mapping the data from device memory into the local vector. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only perform a map, which will be executed during the postSubmit (there will be no copy during the sequence eval/submit). This operation cannot be carried out for TensorTypes::eStorage.
+*/
+class OpTensorSyncLocal : public OpBase
+{
+  public:
+    OpTensorSyncLocal();
+
+    /**
+     * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensors provided cannot be of type TensorTypes::eStorage.
+     *
+     * @param physicalDevice Vulkan physical device used to find device queues
+     * @param device Vulkan logical device for passing to Algorithm
+     * @param commandBuffer Vulkan Command Buffer to record commands into
+     * @param tensors Tensors that will be used to create in operation.
+     */
+    OpTensorSyncLocal(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                   std::shared_ptr<vk::Device> device,
+                   std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                   std::vector<std::shared_ptr<Tensor>> tensors);
+
+    /**
+     * Default destructor. This class manages the memory of the staging tensors it owns but these are released in the postSubmit, before it arrives to the destructor.
+     */
+    ~OpTensorSyncLocal() override;
+
+    /**
+     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTypes::eStorage.
+     */
+    void init() override;
+
+    /**
+     * For device tensors, it records the copy command into the staging tensor from the device tensor.
+     */
+    void record() override;
+
+    /**
+     * For device tensors it maps the staging tensor's host memory and sets the result as the tensor's local data; for all other tensors it maps directly from their host memory. Frees the staging tensors together with their respective memory.
+     */
+    void postSubmit() override;
+
+  private:
+    // Temporary staging tensors created by this operation in init and released in postSubmit
+    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
+};
+
+} // End namespace kp
diff --git a/src/OpTensorCopy.cpp b/src/OpTensorCopy.cpp
index c0e1f5046..50eb9c4c1 100644
--- a/src/OpTensorCopy.cpp
+++ b/src/OpTensorCopy.cpp
@@ -40,7 +40,7 @@ OpTensorCopy::init()
             throw std::runtime_error("Kompute OpTensorCopy tensor parameter has not been initialized");
         }
         if (tensor->tensorType() == Tensor::TensorTypes::eStorage) {
-            throw std::runtime_error("Kompute OpTensorCopy tensor parameter is of type storage and hence cannot be used to receive or pass data.");
+            throw std::runtime_error("Kompute OpTensorCopy tensor parameter is of TensorTypes::eStorage and hence cannot be used to receive or pass data.");
         }
     }
 }
diff --git a/src/OpTensorCreate.cpp b/src/OpTensorCreate.cpp
index 32688da1f..5bd7317d1 100644
--- a/src/OpTensorCreate.cpp
+++ b/src/OpTensorCreate.cpp
@@ -23,13 +23,6 @@ OpTensorCreate::OpTensorCreate(
 OpTensorCreate::~OpTensorCreate()
 {
     SPDLOG_DEBUG("Kompute OpTensorCreate destructor started");
-
-    SPDLOG_DEBUG("Kompute OpTensorCreate destroying staging tensors");
-    for (size_t i = 0; i < this->mStagingTensors.size(); i++) {
-        if (this->mStagingTensors[i]) {
-            this->mStagingTensors[i]->freeMemoryDestroyGPUResources();
-        }
-    }
 }
 
 void
@@ -65,6 +58,8 @@ OpTensorCreate::init()
             tensor->init(
               this->mPhysicalDevice, this->mDevice);
 
+            tensor->mapDataIntoHostMemory();
+
             // We push a nullptr when no staging tensor is needed to match 
             // index number in array to have one to one mapping with tensors
             this->mStagingTensors.push_back(nullptr);
@@ -80,9 +75,7 @@ OpTensorCreate::record()
     for (size_t i = 0; i < this->mTensors.size(); i++) {
         if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
             this->mTensors[i]->recordCopyFrom(this->mCommandBuffer, this->mStagingTensors[i], false);
-        } else if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eStaging) {
-            this->mTensors[i]->mapDataIntoHostMemory();
-        }
+        } 
     }
 }
 
@@ -91,13 +84,11 @@ OpTensorCreate::postSubmit()
 {
     SPDLOG_DEBUG("Kompute OpTensorCreate postSubmit called");
 
-    for (size_t i = 0; i < this->mTensors.size(); i++) {
-        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mStagingTensors[i]->mapDataFromHostMemory();
-
-            this->mTensors[i]->setData(this->mStagingTensors[i]->data());
-        }
-    }
+    // TODO: Remove and add a test that checks that the memory in 
+    // the staging tensor is actually storing the data
+    SPDLOG_DEBUG("Kompute OpTensorCreate destroying staging tensors");
+    // TODO: This would cause issues if there is no CPU barrier
+    this->mStagingTensors.clear();
 }
 
 }
diff --git a/src/OpTensorSyncDevice.cpp b/src/OpTensorSyncDevice.cpp
index 1652b88ba..7c87245cd 100644
--- a/src/OpTensorSyncDevice.cpp
+++ b/src/OpTensorSyncDevice.cpp
@@ -1,4 +1,6 @@
 
+#include "kompute/Tensor.hpp"
+
 #include "kompute/operations/OpTensorSyncDevice.hpp"
 
 namespace kp {
@@ -21,8 +23,6 @@ OpTensorSyncDevice::OpTensorSyncDevice(
 OpTensorSyncDevice::~OpTensorSyncDevice()
 {
     SPDLOG_DEBUG("Kompute OpTensorSyncDevice destructor started");
-
-    SPDLOG_DEBUG("Kompute OpTensorSyncDevice destroying staging tensors");
 }
 
 void
@@ -30,17 +30,37 @@ OpTensorSyncDevice::init()
 {
     SPDLOG_DEBUG("Kompute OpTensorSyncDevice init called");
 
-    if (this->mTensors.size() < 2) {
+    if (this->mTensors.size() < 1) {
         throw std::runtime_error(
-          "Kompute OpTensorSyncDevice called with less than 2 tensor");
+          "Kompute OpTensorSyncDevice called with less than 1 tensor");
     }
 
     for (std::shared_ptr<Tensor> tensor: this->mTensors) {
-        if (!tensor->isInit()) {
-            throw std::runtime_error("Kompute OpTensorSyncDevice tensor parameter has not been initialized");
+        if (!tensor->isInit()) {
+            throw std::runtime_error("Kompute OpTensorSyncDevice: Tensor has not been initialized");
         }
         if (tensor->tensorType() == Tensor::TensorTypes::eStorage) {
-            throw std::runtime_error("Kompute OpTensorSyncDevice tensor parameter is of type storage and hence cannot be used to receive or pass data.");
+            throw std::runtime_error("Kompute OpTensorSyncDevice tensor parameter is of type TensorTypes::eStorage and hence cannot be used to receive or pass data.");
+        }
+        if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
+            // Device tensors are synced through a temporary staging tensor holding a copy of the local data
+            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
+              tensor->data(), Tensor::TensorTypes::eStaging);
+
+            stagingTensor->init(
+              this->mPhysicalDevice, this->mDevice);
+
+            stagingTensor->mapDataIntoHostMemory();
+
+            this->mStagingTensors.push_back(stagingTensor);
+
+        } else {
+            // Remaining (staging) tensors are mapped directly, so this op requires them to be initialised already
+            tensor->mapDataIntoHostMemory();
+
+            // We push a nullptr when no staging tensor is needed to match 
+            // index number in array to have one to one mapping with tensors
+            this->mStagingTensors.push_back(nullptr);
         }
     }
 }
@@ -50,9 +70,10 @@ OpTensorSyncDevice::record()
 {
     SPDLOG_DEBUG("Kompute OpTensorSyncDevice record called");
 
-    // We iterate from the second tensor onwards and record a copy to all
-    for (size_t i = 1; i < this->mTensors.size(); i++) {
-        this->mTensors[i]->recordCopyFrom(this->mCommandBuffer, this->mTensors[0], false);
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
+            this->mTensors[i]->recordCopyFrom(this->mCommandBuffer, this->mStagingTensors[i], false);
+        }
     }
 }
 
@@ -61,11 +82,10 @@ OpTensorSyncDevice::postSubmit()
 {
     SPDLOG_DEBUG("Kompute OpTensorSyncDevice postSubmit called");
 
-    // Copy the data from the first tensor into all the tensors
-    for (size_t i = 1; i < this->mTensors.size(); i++) {
-        this->mTensors[i]->setData(this->mTensors[0]->data());
-    }
+    // Remove all staging tensors as they are not required after operation
+    SPDLOG_DEBUG("Kompute OpTensorSyncDevice destroying staging tensors");
+    // TODO: This would cause issues if there is no CPU barrier
+    this->mStagingTensors.clear();
 }
 
 }
-
diff --git a/src/OpTensorSyncLocal.cpp b/src/OpTensorSyncLocal.cpp
new file mode 100644
index 000000000..37037ff99
--- /dev/null
+++ b/src/OpTensorSyncLocal.cpp
@@ -0,0 +1,97 @@
+
+#include "kompute/Tensor.hpp"
+
+#include "kompute/operations/OpTensorSyncLocal.hpp"
+
+namespace kp {
+
+OpTensorSyncLocal::OpTensorSyncLocal()
+{
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal constructor base");
+}
+
+OpTensorSyncLocal::OpTensorSyncLocal(
+  std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+  std::shared_ptr<vk::Device> device,
+  std::shared_ptr<vk::CommandBuffer> commandBuffer,
+  std::vector<std::shared_ptr<Tensor>> tensors)
+  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
+{
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal constructor with params");
+}
+
+OpTensorSyncLocal::~OpTensorSyncLocal()
+{
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal destructor started");
+}
+
+void
+OpTensorSyncLocal::init()
+{
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal init called");
+
+    if (this->mTensors.size() < 1) {
+        throw std::runtime_error(
+          "Kompute OpTensorSyncLocal called with less than 1 tensor");
+    }
+
+    for (std::shared_ptr<Tensor> tensor: this->mTensors) {
+        if (!tensor->isInit()) {
+            throw std::runtime_error("Kompute OpTensorSyncLocal: Tensor has not been initialized");
+        }
+        if (tensor->tensorType() == Tensor::TensorTypes::eStorage) {
+            throw std::runtime_error("Kompute OpTensorSyncLocal tensor parameter is of type TensorTypes::eStorage and hence cannot be used to receive or pass data.");
+        }
+        if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
+
+            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
+              tensor->data(), Tensor::TensorTypes::eStaging);
+
+            stagingTensor->init(
+              this->mPhysicalDevice, this->mDevice);
+
+            this->mStagingTensors.push_back(stagingTensor);
+
+        } else {
+
+            // We push a nullptr when no staging tensor is needed to match 
+            // index number in array to have one to one mapping with tensors
+            this->mStagingTensors.push_back(nullptr);
+        }
+    }
+}
+
+void
+OpTensorSyncLocal::record()
+{
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal record called");
+
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
+            this->mStagingTensors[i]->recordCopyFrom(this->mCommandBuffer, this->mTensors[i], true);
+        }
+    }
+}
+
+void
+OpTensorSyncLocal::postSubmit()
+{
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal postSubmit called");
+
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
+            this->mStagingTensors[i]->mapDataFromHostMemory();
+            this->mTensors[i]->setData(this->mStagingTensors[i]->data());
+        } else {
+            this->mTensors[i]->mapDataFromHostMemory();
+        }
+    }
+
+    // Remove all staging tensors as they are not required after operation
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal destroying staging tensors");
+    // TODO: This would cause issues if there is no CPU barrier
+    this->mStagingTensors.clear();
+}
+
+}
+
diff --git a/src/include/kompute/operations/OpAlgoBase.hpp b/src/include/kompute/operations/OpAlgoBase.hpp
index 86e03c442..92c7e607b 100644
--- a/src/include/kompute/operations/OpAlgoBase.hpp
+++ b/src/include/kompute/operations/OpAlgoBase.hpp
@@ -304,6 +304,7 @@ OpAlgoBase<tX, tY, tZ>::record()
         }
 
         // Record copy from and create barrier for STAGING tensors
+        // TODO: This only accounts for device tensors; staging and storage tensors still need to be handled
         for (size_t i = 0; i < this->mTensors.size(); i++) {
             this->mOutputStagingTensors[i]->recordCopyFrom(
                 this->mCommandBuffer,
diff --git a/src/include/kompute/operations/OpTensorCreate.hpp b/src/include/kompute/operations/OpTensorCreate.hpp
index 7360e5bdc..1702237eb 100644
--- a/src/include/kompute/operations/OpTensorCreate.hpp
+++ b/src/include/kompute/operations/OpTensorCreate.hpp
@@ -42,15 +42,17 @@ class OpTensorCreate : public OpBase
     /**
      * In charge of initialising the primary Tensor as well as the staging
      * tensor as required. It will only initialise a staging tensor if the
-     * Primary tensor is of type Device.
+     * Primary tensor is of type Device. For staging tensors it performs a 
+     * mapDataIntoHostMemory which would perform immediately as opposed to 
+     * on sequence eval/submission.
      */
     void init() override;
 
     /**
      * Record runs the core actions to create the tensors. For device tensors
      * it records a copyCommand to move the data from the staging tensor to the 
-     * device tensor. For staging tensors it performs a mapDataIntoHostMemory
-     * which would perform immediately as opposed to on sequence eval/submission.
+     * device tensor. The mapping for staging tensors happens in the init function
+     * not in the record function.
      */
     void record() override;
 
diff --git a/src/include/kompute/operations/OpTensorSyncDevice.hpp b/src/include/kompute/operations/OpTensorSyncDevice.hpp
index 14f95a7be..de57e0683 100644
--- a/src/include/kompute/operations/OpTensorSyncDevice.hpp
+++ b/src/include/kompute/operations/OpTensorSyncDevice.hpp
@@ -35,21 +35,23 @@ class OpTensorSyncDevice : public OpBase
     ~OpTensorSyncDevice() override;
 
     /**
-     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging.
+     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTypes::eStorage. For staging tensors in host memory, the map is performed during the init function.
      */
     void init() override;
 
     /**
-     * Records the copy commands from teh first tensor into all the other tensors provided. Also optionally records a barrier.
+     * For device tensors, it records the copy command to the device tensor from the temporary staging tensor.
      */
     void record() override;
 
     /**
-     * Copies the local vectors for all the tensors to sync the data with the gpu.
+     * Does not perform any further sync functions. Frees the staging tensors together with their respective memory.
      */
     void postSubmit() override;
 
   private:
+    // Temporary staging tensors created by this operation in init and released in postSubmit
+    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
 };
 
 } // End namespace kp
diff --git a/src/include/kompute/operations/OpTensorSyncLocal.hpp b/src/include/kompute/operations/OpTensorSyncLocal.hpp
new file mode 100644
index 000000000..d06629c29
--- /dev/null
+++ b/src/include/kompute/operations/OpTensorSyncLocal.hpp
@@ -0,0 +1,60 @@
+#pragma once
+
+#include "kompute/Core.hpp"
+
+#include "kompute/Tensor.hpp"
+
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+    Operation that syncs the tensor's local data by mapping the data from device memory into the local vector. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only perform a map, which will be executed during the postSubmit (there will be no copy during the sequence eval/submit). This operation cannot be carried out for TensorTypes::eStorage.
+*/
+class OpTensorSyncLocal : public OpBase
+{
+  public:
+    OpTensorSyncLocal();
+
+    /**
+     * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensors provided cannot be of type TensorTypes::eStorage.
+     *
+     * @param physicalDevice Vulkan physical device used to find device queues
+     * @param device Vulkan logical device for passing to Algorithm
+     * @param commandBuffer Vulkan Command Buffer to record commands into
+     * @param tensors Tensors that will be used to create in operation.
+     */
+    OpTensorSyncLocal(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                   std::shared_ptr<vk::Device> device,
+                   std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                   std::vector<std::shared_ptr<Tensor>> tensors);
+
+    /**
+     * Default destructor. This class manages the memory of the staging tensors it owns but these are released in the postSubmit, before it arrives to the destructor.
+     */
+    ~OpTensorSyncLocal() override;
+
+    /**
+     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTypes::eStorage.
+     */
+    void init() override;
+
+    /**
+     * For device tensors, it records the copy command into the staging tensor from the device tensor.
+     */
+    void record() override;
+
+    /**
+     * For device tensors it maps the staging tensor's host memory and sets the result as the tensor's local data; for all other tensors it maps directly from their host memory. Frees the staging tensors together with their respective memory.
+     */
+    void postSubmit() override;
+
+  private:
+    // Temporary staging tensors created by this operation in init and released in postSubmit
+    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
+};
+
+} // End namespace kp
+
+
+
diff --git a/test/TestOpTensorCopy.cpp b/test/TestOpTensorCopy.cpp
index 6b36e078a..9325cacf5 100644
--- a/test/TestOpTensorCopy.cpp
+++ b/test/TestOpTensorCopy.cpp
@@ -8,25 +8,7 @@ TEST(TestOpTensorCopy, CopyDeviceToDeviceTensor) {
     kp::Manager mgr;
 
     std::vector<float> testVecA{ 9, 8, 7 };
-
-    std::shared_ptr<kp::Tensor> tensorA{new kp::Tensor(testVecA)};
-
-    mgr.evalOpDefault<kp::OpTensorCreate>({tensorA});
-
-    EXPECT_TRUE(tensorA->isInit());
-
-    EXPECT_EQ(tensorA->data(), testVecA);
-
-    tensorA->freeMemoryDestroyGPUResources();
-    EXPECT_FALSE(tensorA->isInit());
-}
-
-TEST(TestOpTensorCopy, CreateMultipleTensorSingleOp) {
-
-    kp::Manager mgr;
-
-    std::vector<float> testVecA{ 9, 8, 7 };
-    std::vector<float> testVecB{ 6, 5, 4 };
+    std::vector<float> testVecB{ 0, 0, 0 };
 
     std::shared_ptr<kp::Tensor> tensorA{new kp::Tensor(testVecA)};
     std::shared_ptr<kp::Tensor> tensorB{new kp::Tensor(testVecB)};
@@ -36,76 +18,101 @@ TEST(TestOpTensorCopy, CreateMultipleTensorSingleOp) {
     EXPECT_TRUE(tensorA->isInit());
     EXPECT_TRUE(tensorB->isInit());
 
-    EXPECT_EQ(tensorA->data(), testVecA);
-    EXPECT_EQ(tensorB->data(), testVecB);
+    mgr.evalOpDefault<kp::OpTensorCopy>({tensorA, tensorB});
+
+    EXPECT_EQ(tensorA->data(), tensorB->data());
+
+    // Making sure the GPU holds the same data
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorB});
+    EXPECT_EQ(tensorA->data(), tensorB->data());
 }
 
-TEST(TestOpTensorCopy, CreateMultipleTensorMultipleOp) {
+TEST(TestOpTensorCopy, CopyDeviceToStagingTensor) {
 
     kp::Manager mgr;
 
     std::vector<float> testVecA{ 9, 8, 7 };
-    std::vector<float> testVecB{ 6, 5, 4 };
+    std::vector<float> testVecB{ 0, 0, 0 };
 
     std::shared_ptr<kp::Tensor> tensorA{new kp::Tensor(testVecA)};
-    std::shared_ptr<kp::Tensor> tensorB{new kp::Tensor(testVecB)};
+    std::shared_ptr<kp::Tensor> tensorB{new kp::Tensor(testVecB, kp::Tensor::TensorTypes::eStaging)};
 
-    mgr.evalOpDefault<kp::OpTensorCreate>({tensorA});
-    mgr.evalOpDefault<kp::OpTensorCreate>({tensorB});
+    mgr.evalOpDefault<kp::OpTensorCreate>({tensorA, tensorB});
 
     EXPECT_TRUE(tensorA->isInit());
     EXPECT_TRUE(tensorB->isInit());
 
-    EXPECT_EQ(tensorA->data(), testVecA);
-    EXPECT_EQ(tensorB->data(), testVecB);
+    mgr.evalOpDefault<kp::OpTensorCopy>({tensorA, tensorB});
+
+    EXPECT_EQ(tensorA->data(), tensorB->data());
+
+    // Making sure the GPU holds the same data
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorB});
+    EXPECT_EQ(tensorA->data(), tensorB->data());
 }
 
-TEST(TestOpTensorCopy, ManageTensorMemoryWhenOpTensorCreateDestroyed) {
-
-    std::vector<float> testVecA{ 9, 8, 7 };
-    std::vector<float> testVecB{ 6, 5, 4 };
-
-    std::shared_ptr<kp::Tensor> tensorA{new kp::Tensor(testVecA)};
-    std::shared_ptr<kp::Tensor> tensorB{new kp::Tensor(testVecB)};
-
-    {
-        kp::Manager mgr;
-        mgr.evalOpDefault<kp::OpTensorCreate>({tensorA});
-        mgr.evalOpDefault<kp::OpTensorCreate>({tensorB});
-
-        EXPECT_TRUE(tensorA->isInit());
-        EXPECT_TRUE(tensorB->isInit());
-
-        EXPECT_EQ(tensorA->data(), testVecA);
-        EXPECT_EQ(tensorB->data(), testVecB);
-    }
-
-    EXPECT_FALSE(tensorA->isInit());
-    EXPECT_FALSE(tensorB->isInit());
-}
-
-TEST(TestOpTensorCopy, NoErrorIfTensorFreedBefore) {
-
-    std::vector<float> testVecA{ 9, 8, 7 };
-    std::vector<float> testVecB{ 6, 5, 4 };
-
-    std::shared_ptr<kp::Tensor> tensorA{new kp::Tensor(testVecA)};
-    std::shared_ptr<kp::Tensor> tensorB{new kp::Tensor(testVecB)};
+TEST(TestOpTensorCopy, CopyStagingToDeviceTensor) {
 
     kp::Manager mgr;
 
-    mgr.evalOpDefault<kp::OpTensorCreate>({tensorA});
-    mgr.evalOpDefault<kp::OpTensorCreate>({tensorB});
+    std::vector<float> testVecA{ 9, 8, 7 };
+    std::vector<float> testVecB{ 0, 0, 0 };
+
+    std::shared_ptr<kp::Tensor> tensorA{new kp::Tensor(testVecA, kp::Tensor::TensorTypes::eStaging)};
+    std::shared_ptr<kp::Tensor> tensorB{new kp::Tensor(testVecB)};
+
+    mgr.evalOpDefault<kp::OpTensorCreate>({tensorA, tensorB});
 
     EXPECT_TRUE(tensorA->isInit());
     EXPECT_TRUE(tensorB->isInit());
 
-    EXPECT_EQ(tensorA->data(), testVecA);
-    EXPECT_EQ(tensorB->data(), testVecB);
+    mgr.evalOpDefault<kp::OpTensorCopy>({tensorA, tensorB});
 
-    tensorA->freeMemoryDestroyGPUResources();
-    tensorB->freeMemoryDestroyGPUResources();
-    EXPECT_FALSE(tensorA->isInit());
-    EXPECT_FALSE(tensorB->isInit());
+    EXPECT_EQ(tensorA->data(), tensorB->data());
+
+    // Making sure the GPU holds the same data
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorB});
+    EXPECT_EQ(tensorA->data(), tensorB->data());
+}
+
+TEST(TestOpTensorCopy, CopyStagingToStagingTensor) {
+
+    kp::Manager mgr;
+
+    std::vector<float> testVecA{ 9, 8, 7 };
+    std::vector<float> testVecB{ 0, 0, 0 };
+
+    std::shared_ptr<kp::Tensor> tensorA{new kp::Tensor(testVecA, kp::Tensor::TensorTypes::eStaging)};
+    std::shared_ptr<kp::Tensor> tensorB{new kp::Tensor(testVecB, kp::Tensor::TensorTypes::eStaging)};
+
+    mgr.evalOpDefault<kp::OpTensorCreate>({tensorA, tensorB});
+
+    EXPECT_TRUE(tensorA->isInit());
+    EXPECT_TRUE(tensorB->isInit());
+
+    mgr.evalOpDefault<kp::OpTensorCopy>({tensorA, tensorB});
+
+    EXPECT_EQ(tensorA->data(), tensorB->data());
+
+    // Making sure the GPU holds the same data
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorB});
+    EXPECT_EQ(tensorA->data(), tensorB->data());
+}
+
+TEST(TestOpTensorCopy, SingleTensorShouldFail) {
+
+    kp::Manager mgr;
+
+    std::vector<float> testVecA{ 9, 8, 7 };
+
+    std::shared_ptr<kp::Tensor> tensorA{new kp::Tensor(testVecA, kp::Tensor::TensorTypes::eStaging)};
+
+    mgr.evalOpDefault<kp::OpTensorCreate>({tensorA});
+
+    EXPECT_TRUE(tensorA->isInit());
+
+    EXPECT_THROW(
+        mgr.evalOpDefault<kp::OpTensorCopy>({tensorA}),
+        std::runtime_error);
 }
 
diff --git a/test/TestTensor.cpp b/test/TestTensor.cpp
index 5f90dc7bf..58a677820 100644
--- a/test/TestTensor.cpp
+++ b/test/TestTensor.cpp
@@ -18,7 +18,7 @@ TEST(TestTensor, CopyFromHostData) {
             vecA,
             kp::Tensor::TensorTypes::eStaging);
     std::shared_ptr<kp::Tensor> tensorB = std::make_shared<kp::Tensor>(
-            vecA,
+            vecB,
             kp::Tensor::TensorTypes::eStaging);
 
     kp::Manager mgr;
@@ -35,8 +35,6 @@ TEST(TestTensor, CopyFromHostData) {
         sq->end();
 
         sq->eval();
-
-        tensorB->mapDataFromHostMemory();
     }
 
     EXPECT_EQ(tensorA->data(), tensorB->data());