Added optensorSyncDevice and optensorcopy tests

2020-09-06 12:52:45 +01:00 · 2020-09-06 12:52:45 +01:00 · 1b4e4b6b18
commit 1b4e4b6b18
parent ec89fc6d56
12 changed files with 398 additions and 116 deletions
--- a/single_include/AggregateHeaders.cpp
+++ b/single_include/AggregateHeaders.cpp
@ -8,5 +8,7 @@
 #include "kompute/operations/OpMult.hpp"
 #include "kompute/operations/OpTensorCreate.hpp"
 #include "kompute/operations/OpTensorCopy.hpp"
+#include "kompute/operations/OpTensorSyncDevice.hpp"
+#include "kompute/operations/OpTensorSyncLocal.hpp"
 #include "kompute/Algorithm.hpp"
 #include "kompute/Tensor.hpp"
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@ -1150,6 +1150,7 @@ OpAlgoBase<tX, tY, tZ>::record()
        }

        // Record copy from and create barrier for STAGING tensors
+        // TODO: This only accounts for device tensors; staging and storage tensors also need to be handled
        for (size_t i = 0; i < this->mTensors.size(); i++) {
            this->mOutputStagingTensors[i]->recordCopyFrom(
                this->mCommandBuffer,
@ -1539,15 +1540,17 @@ class OpTensorCreate : public OpBase
    /**
     * In charge of initialising the primary Tensor as well as the staging
     * tensor as required. It will only initialise a staging tensor if the
-     * Primary tensor is of type Device.
+     * Primary tensor is of type Device. For staging tensors it performs a 
+     * mapDataIntoHostMemory which would perform immediately as opposed to 
+     * on sequence eval/submission.
     */
    void init() override;

    /**
     * Record runs the core actions to create the tensors. For device tensors
     * it records a copyCommand to move the data from the staging tensor to the 
-     * device tensor. For staging tensors it performs a mapDataIntoHostMemory
-     * which would perform immediately as opposed to on sequence eval/submission.
+     * device tensor. The mapping for staging tensors happens in the init function
+     * not in the record function.
     */
    void record() override;

@ -1567,7 +1570,7 @@ class OpTensorCreate : public OpBase
 namespace kp {

 /**
-    Operation that copies the data from the first tensor to the rest of the tensors provided, using a record command for all the vectors. This operation does not own/manage the memory of the tensors passed to it.
+    Operation that copies the data from the first tensor to the rest of the tensors provided, using a record command for all the vectors. This operation does not own/manage the memory of the tensors passed to it. The operation must only receive tensors of type TensorTypes::eDevice or TensorTypes::eStaging (tensors of type TensorTypes::eStorage are rejected).
 */
 class OpTensorCopy : public OpBase
 {
@ -1588,13 +1591,12 @@ class OpTensorCopy : public OpBase
                   std::vector<std::shared_ptr<Tensor>> tensors);

    /**
-     * Default destructor which in this case expects the parent class to free
-     * the tensors
+     * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release.
     */
    ~OpTensorCopy() override;

    /**
-     * TODO
+     * Performs basic checks such as ensuring there are at least two tensors provided, that they are initialised and that they are not of type TensorTypes::eStorage.
     */
    void init() override;

@ -1612,3 +1614,103 @@ class OpTensorCopy : public OpBase
 };

 } // End namespace kp
+
+namespace kp {
+
+/**
+    Operation that syncs a tensor's device memory by mapping its local data into device memory. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only perform a map of the data into host memory, which is executed during init (as opposed to during the sequence eval/submit). This operation cannot be carried out for TensorTypes::eStorage.
+*/
+class OpTensorSyncDevice : public OpBase
+{
+  public:
+    OpTensorSyncDevice();
+
+    /**
+     * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensors provided cannot be of type TensorTypes::eStorage.
+     *
+     * @param physicalDevice Vulkan physical device used to find device queues
+     * @param device Vulkan logical device for passing to Algorithm
+     * @param commandBuffer Vulkan Command Buffer to record commands into
+     * @param tensors Tensors that will be used to create in operation.
+     */
+    OpTensorSyncDevice(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                   std::shared_ptr<vk::Device> device,
+                   std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                   std::vector<std::shared_ptr<Tensor>> tensors);
+
+    /**
+     * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release.
+     */
+    ~OpTensorSyncDevice() override;
+
+    /**
+     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTypes::eStorage. For staging tensors in host memory, the map is performed during the init function.
+     */
+    void init() override;
+
+    /**
+     * For device tensors, it records the copy command to the device tensor from the temporary staging tensor.
+     */
+    void record() override;
+
+    /**
+     * Does not perform any further sync functions. Frees the staging tensors together with their respective memory.
+     */
+    void postSubmit() override;
+
+  private:
+    // Temporary staging tensors created in init() and cleared in postSubmit()
+    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
+};
+
+} // End namespace kp
+
+namespace kp {
+
+/**
+    Operation that syncs a tensor's local data by mapping the data from device memory into the local vector. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only perform a map of the data from host memory, which is executed during postSubmit (there will be no copy during the sequence eval/submit). This operation cannot be carried out for TensorTypes::eStorage.
+*/
+class OpTensorSyncLocal : public OpBase
+{
+  public:
+    OpTensorSyncLocal();
+
+    /**
+     * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensors provided cannot be of type TensorTypes::eStorage.
+     *
+     * @param physicalDevice Vulkan physical device used to find device queues
+     * @param device Vulkan logical device for passing to Algorithm
+     * @param commandBuffer Vulkan Command Buffer to record commands into
+     * @param tensors Tensors that will be used to create in operation.
+     */
+    OpTensorSyncLocal(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                   std::shared_ptr<vk::Device> device,
+                   std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                   std::vector<std::shared_ptr<Tensor>> tensors);
+
+    /**
+     * Default destructor. This class manages the memory of the staging tensors it owns but these are released in the postSubmit, before it arrives to the destructor.
+     */
+    ~OpTensorSyncLocal() override;
+
+    /**
+     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTypes::eStorage.
+     */
+    void init() override;
+
+    /**
+     * For device tensors, it records the copy command into the staging tensor from the device tensor.
+     */
+    void record() override;
+
+    /**
+     * For host tensors it performs the map command from the host memory into local memory. Frees the staging tensors together with their respective memory.
+     */
+    void postSubmit() override;
+
+  private:
+    // Temporary staging tensors created in init() and cleared in postSubmit()
+    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
+};
+
+} // End namespace kp
--- a/src/OpTensorCopy.cpp
+++ b/src/OpTensorCopy.cpp
@ -40,7 +40,7 @@ OpTensorCopy::init()
            throw std::runtime_error("Kompute OpTensorCopy tensor parameter has not been initialized");
        }
        if (tensor->tensorType() == Tensor::TensorTypes::eStorage) {
-            throw std::runtime_error("Kompute OpTensorCopy tensor parameter is of type storage and hence cannot be used to receive or pass data.");
+            throw std::runtime_error("Kompute OpTensorCopy tensor parameter is of TensorTypes::eStorage and hence cannot be used to receive or pass data.");
        }
    }
 }
--- a/src/OpTensorCreate.cpp
+++ b/src/OpTensorCreate.cpp
@ -23,13 +23,6 @@ OpTensorCreate::OpTensorCreate(
 OpTensorCreate::~OpTensorCreate()
 {
    SPDLOG_DEBUG("Kompute OpTensorCreate destructor started");
-
-    SPDLOG_DEBUG("Kompute OpTensorCreate destroying staging tensors");
-    for (size_t i = 0; i < this->mStagingTensors.size(); i++) {
-        if (this->mStagingTensors[i]) {
-            this->mStagingTensors[i]->freeMemoryDestroyGPUResources();
-        }
-    }
 }

 void
@ -65,6 +58,8 @@ OpTensorCreate::init()
            tensor->init(
              this->mPhysicalDevice, this->mDevice);

+            tensor->mapDataIntoHostMemory();
+
            // We push a nullptr when no staging tensor is needed to match 
            // index number in array to have one to one mapping with tensors
            this->mStagingTensors.push_back(nullptr);
@ -80,9 +75,7 @@ OpTensorCreate::record()
    for (size_t i = 0; i < this->mTensors.size(); i++) {
        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
            this->mTensors[i]->recordCopyFrom(this->mCommandBuffer, this->mStagingTensors[i], false);
-        } else if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eStaging) {
-            this->mTensors[i]->mapDataIntoHostMemory();
-        }
+        } 
    }
 }

@ -91,13 +84,11 @@ OpTensorCreate::postSubmit()
 {
    SPDLOG_DEBUG("Kompute OpTensorCreate postSubmit called");

-    for (size_t i = 0; i < this->mTensors.size(); i++) {
-        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mStagingTensors[i]->mapDataFromHostMemory();
-
-            this->mTensors[i]->setData(this->mStagingTensors[i]->data());
-        }
-    }
+    // TODO: Remove and add a test that checks that the memory in 
+    // the staging tensor is actually storing the data
+    SPDLOG_DEBUG("Kompute OpTensorCreate destroying staging tensors");
+    // TODO: This would cause issues if there is no CPU barrier
+    this->mStagingTensors.clear();
 }

 }
--- a/src/OpTensorSyncDevice.cpp
+++ b/src/OpTensorSyncDevice.cpp
@ -1,4 +1,6 @@

+#include "kompute/Tensor.hpp"
+
 #include "kompute/operations/OpTensorSyncDevice.hpp"

 namespace kp {
@ -21,8 +23,6 @@ OpTensorSyncDevice::OpTensorSyncDevice(
 OpTensorSyncDevice::~OpTensorSyncDevice()
 {
    SPDLOG_DEBUG("Kompute OpTensorSyncDevice destructor started");
-
-    SPDLOG_DEBUG("Kompute OpTensorSyncDevice destroying staging tensors");
 }

 void
@ -30,17 +30,37 @@ OpTensorSyncDevice::init()
 {
    SPDLOG_DEBUG("Kompute OpTensorSyncDevice init called");

-    if (this->mTensors.size() < 2) {
+    if (this->mTensors.size() < 1) {
        throw std::runtime_error(
-          "Kompute OpTensorSyncDevice called with less than 2 tensor");
+          "Kompute OpTensorSyncDevice called with less than 1 tensor");
    }

    for (std::shared_ptr<Tensor> tensor: this->mTensors) {
-        if (!tensor->isInit()) {
-            throw std::runtime_error("Kompute OpTensorSyncDevice tensor parameter has not been initialized");
+        if (!tensor->isInit()) {
+            throw std::runtime_error("Kompute OpTensorSyncDevice: Tensor has not been initialized");
        }
        if (tensor->tensorType() == Tensor::TensorTypes::eStorage) {
-            throw std::runtime_error("Kompute OpTensorSyncDevice tensor parameter is of type storage and hence cannot be used to receive or pass data.");
+            throw std::runtime_error("Kompute OpTensorSyncDevice tensor parameter is of type TensorTypes::eStorage and hence cannot be used to receive or pass data.");
+        }
+        if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
+
+            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
+              tensor->data(), Tensor::TensorTypes::eStaging);
+
+            stagingTensor->init(
+              this->mPhysicalDevice, this->mDevice);
+
+            stagingTensor->mapDataIntoHostMemory();
+
+            this->mStagingTensors.push_back(stagingTensor);
+
+        } else {
+
+            tensor->mapDataIntoHostMemory();
+
+            // We push a nullptr when no staging tensor is needed to match 
+            // index number in array to have one to one mapping with tensors
+            this->mStagingTensors.push_back(nullptr);
        }
    }
 }
@ -50,9 +70,10 @@ OpTensorSyncDevice::record()
 {
    SPDLOG_DEBUG("Kompute OpTensorSyncDevice record called");

-    // We iterate from the second tensor onwards and record a copy to all
-    for (size_t i = 1; i < this->mTensors.size(); i++) {
-        this->mTensors[i]->recordCopyFrom(this->mCommandBuffer, this->mTensors[0], false);
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
+            this->mTensors[i]->recordCopyFrom(this->mCommandBuffer, this->mStagingTensors[i], false);
+        }
    }
 }

@ -61,11 +82,10 @@ OpTensorSyncDevice::postSubmit()
 {
    SPDLOG_DEBUG("Kompute OpTensorSyncDevice postSubmit called");

-    // Copy the data from the first tensor into all the tensors
-    for (size_t i = 1; i < this->mTensors.size(); i++) {
-        this->mTensors[i]->setData(this->mTensors[0]->data());
-    }
+    // Remove all staging tensors as they are not required after operation
+    SPDLOG_DEBUG("Kompute OpTensorSyncDevice destroying staging tensors");
+    // TODO: This would cause issues if there is no CPU barrier
+    this->mStagingTensors.clear();
 }

 }
-
--- a/src/OpTensorSyncLocal.cpp
+++ b/src/OpTensorSyncLocal.cpp
@ -0,0 +1,97 @@
+
+#include "kompute/Tensor.hpp"
+
+#include "kompute/operations/OpTensorSyncLocal.hpp"
+
+namespace kp {
+
+OpTensorSyncLocal::OpTensorSyncLocal()
+{
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal constructor base");
+}
+
+OpTensorSyncLocal::OpTensorSyncLocal(
+  std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+  std::shared_ptr<vk::Device> device,
+  std::shared_ptr<vk::CommandBuffer> commandBuffer,
+  std::vector<std::shared_ptr<Tensor>> tensors)
+  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
+{
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal constructor with params");
+}
+
+OpTensorSyncLocal::~OpTensorSyncLocal()
+{
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal destructor started");
+}
+
+void
+OpTensorSyncLocal::init()
+{
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal init called");
+
+    if (this->mTensors.size() < 1) {
+        throw std::runtime_error(
+          "Kompute OpTensorSyncLocal called with less than 1 tensor");
+    }
+
+    for (std::shared_ptr<Tensor> tensor: this->mTensors) {
+        if (!tensor->isInit()) {
+            throw std::runtime_error("Kompute OpTensorSyncLocal: Tensor has not been initialized");
+        }
+        if (tensor->tensorType() == Tensor::TensorTypes::eStorage) {
+            throw std::runtime_error("Kompute OpTensorSyncLocal tensor parameter is of type TensorTypes::eStorage and hence cannot be used to receive or pass data.");
+        }
+        if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
+
+            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
+              tensor->data(), Tensor::TensorTypes::eStaging);
+
+            stagingTensor->init(
+              this->mPhysicalDevice, this->mDevice);
+
+            this->mStagingTensors.push_back(stagingTensor);
+
+        } else {
+
+            // We push a nullptr when no staging tensor is needed to match 
+            // index number in array to have one to one mapping with tensors
+            this->mStagingTensors.push_back(nullptr);
+        }
+    }
+}
+
+void
+OpTensorSyncLocal::record()
+{
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal record called");
+
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
+            this->mStagingTensors[i]->recordCopyFrom(this->mCommandBuffer, this->mTensors[i], true);
+        }
+    }
+}
+
+void
+OpTensorSyncLocal::postSubmit()
+{
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal postSubmit called");
+
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
+            this->mStagingTensors[i]->mapDataFromHostMemory();
+            this->mTensors[i]->setData(this->mStagingTensors[i]->data());
+        } else {
+            this->mTensors[i]->mapDataFromHostMemory();
+        }
+    }
+
+    // Remove all staging tensors as they are not required after operation
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal destroying staging tensors");
+    // TODO: This would cause issues if there is no CPU barrier
+    this->mStagingTensors.clear();
+}
+
+}
+
--- a/src/include/kompute/operations/OpAlgoBase.hpp
+++ b/src/include/kompute/operations/OpAlgoBase.hpp
@ -304,6 +304,7 @@ OpAlgoBase<tX, tY, tZ>::record()
        }

        // Record copy from and create barrier for STAGING tensors
+        // TODO: This only accounts for device tensors; staging and storage tensors also need to be handled
        for (size_t i = 0; i < this->mTensors.size(); i++) {
            this->mOutputStagingTensors[i]->recordCopyFrom(
                this->mCommandBuffer,
--- a/src/include/kompute/operations/OpTensorCreate.hpp
+++ b/src/include/kompute/operations/OpTensorCreate.hpp
@ -42,15 +42,17 @@ class OpTensorCreate : public OpBase
    /**
     * In charge of initialising the primary Tensor as well as the staging
     * tensor as required. It will only initialise a staging tensor if the
-     * Primary tensor is of type Device.
+     * Primary tensor is of type Device. For staging tensors it performs a 
+     * mapDataIntoHostMemory which would perform immediately as opposed to 
+     * on sequence eval/submission.
     */
    void init() override;

    /**
     * Record runs the core actions to create the tensors. For device tensors
     * it records a copyCommand to move the data from the staging tensor to the 
-     * device tensor. For staging tensors it performs a mapDataIntoHostMemory
-     * which would perform immediately as opposed to on sequence eval/submission.
+     * device tensor. The mapping for staging tensors happens in the init function
+     * not in the record function.
     */
    void record() override;

--- a/src/include/kompute/operations/OpTensorSyncDevice.hpp
+++ b/src/include/kompute/operations/OpTensorSyncDevice.hpp
@ -35,21 +35,23 @@ class OpTensorSyncDevice : public OpBase
    ~OpTensorSyncDevice() override;

    /**
-     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging.
+     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTypes::eStorage. For staging tensors in host memory, the map is performed during the init function.
     */
    void init() override;

    /**
-     * Records the copy commands from teh first tensor into all the other tensors provided. Also optionally records a barrier.
+     * For device tensors, it records the copy command to the device tensor from the temporary staging tensor.
     */
    void record() override;

    /**
-     * Copies the local vectors for all the tensors to sync the data with the gpu.
+     * Does not perform any further sync functions. Frees the staging tensors together with their respective memory.
     */
    void postSubmit() override;

  private:
+    // Temporary staging tensors created in init() and cleared in postSubmit()
+    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
 };

 } // End namespace kp
--- a/src/include/kompute/operations/OpTensorSyncLocal.hpp
+++ b/src/include/kompute/operations/OpTensorSyncLocal.hpp
@ -0,0 +1,60 @@
+#pragma once
+
+#include "kompute/Core.hpp"
+
+#include "kompute/Tensor.hpp"
+
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+    Operation that syncs a tensor's local data by mapping the data from device memory into the local vector. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only perform a map of the data from host memory, which is executed during postSubmit (there will be no copy during the sequence eval/submit). This operation cannot be carried out for TensorTypes::eStorage.
+*/
+class OpTensorSyncLocal : public OpBase
+{
+  public:
+    OpTensorSyncLocal();
+
+    /**
+     * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensors provided cannot be of type TensorTypes::eStorage.
+     *
+     * @param physicalDevice Vulkan physical device used to find device queues
+     * @param device Vulkan logical device for passing to Algorithm
+     * @param commandBuffer Vulkan Command Buffer to record commands into
+     * @param tensors Tensors that will be used to create in operation.
+     */
+    OpTensorSyncLocal(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                   std::shared_ptr<vk::Device> device,
+                   std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                   std::vector<std::shared_ptr<Tensor>> tensors);
+
+    /**
+     * Default destructor. This class manages the memory of the staging tensors it owns but these are released in the postSubmit, before it arrives to the destructor.
+     */
+    ~OpTensorSyncLocal() override;
+
+    /**
+     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTypes::eStorage.
+     */
+    void init() override;
+
+    /**
+     * For device tensors, it records the copy command into the staging tensor from the device tensor.
+     */
+    void record() override;
+
+    /**
+     * For host tensors it performs the map command from the host memory into local memory. Frees the staging tensors together with their respective memory.
+     */
+    void postSubmit() override;
+
+  private:
+    // Temporary staging tensors created in init() and cleared in postSubmit()
+    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
+};
+
+} // End namespace kp
+
+
+
--- a/test/TestOpTensorCopy.cpp
+++ b/test/TestOpTensorCopy.cpp
@ -8,25 +8,7 @@ TEST(TestOpTensorCopy, CopyDeviceToDeviceTensor) {
    kp::Manager mgr;

    std::vector<float> testVecA{ 9, 8, 7 };
-
-    std::shared_ptr<kp::Tensor> tensorA{new kp::Tensor(testVecA)};
-
-    mgr.evalOpDefault<kp::OpTensorCreate>({tensorA});
-
-    EXPECT_TRUE(tensorA->isInit());
-
-    EXPECT_EQ(tensorA->data(), testVecA);
-
-    tensorA->freeMemoryDestroyGPUResources();
-    EXPECT_FALSE(tensorA->isInit());
-}
-
-TEST(TestOpTensorCopy, CreateMultipleTensorSingleOp) {
-
-    kp::Manager mgr;
-
-    std::vector<float> testVecA{ 9, 8, 7 };
-    std::vector<float> testVecB{ 6, 5, 4 };
+    std::vector<float> testVecB{ 0, 0, 0 };

    std::shared_ptr<kp::Tensor> tensorA{new kp::Tensor(testVecA)};
    std::shared_ptr<kp::Tensor> tensorB{new kp::Tensor(testVecB)};
@ -36,76 +18,101 @@ TEST(TestOpTensorCopy, CreateMultipleTensorSingleOp) {
    EXPECT_TRUE(tensorA->isInit());
    EXPECT_TRUE(tensorB->isInit());

-    EXPECT_EQ(tensorA->data(), testVecA);
-    EXPECT_EQ(tensorB->data(), testVecB);
+    mgr.evalOpDefault<kp::OpTensorCopy>({tensorA, tensorB});
+
+    EXPECT_EQ(tensorA->data(), tensorB->data());
+
+    // Making sure the GPU holds the same data
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorB});
+    EXPECT_EQ(tensorA->data(), tensorB->data());
 }

-TEST(TestOpTensorCopy, CreateMultipleTensorMultipleOp) {
+TEST(TestOpTensorCopy, CopyDeviceToStagingTensor) {

    kp::Manager mgr;

    std::vector<float> testVecA{ 9, 8, 7 };
-    std::vector<float> testVecB{ 6, 5, 4 };
+    std::vector<float> testVecB{ 0, 0, 0 };

    std::shared_ptr<kp::Tensor> tensorA{new kp::Tensor(testVecA)};
-    std::shared_ptr<kp::Tensor> tensorB{new kp::Tensor(testVecB)};
+    std::shared_ptr<kp::Tensor> tensorB{new kp::Tensor(testVecB, kp::Tensor::TensorTypes::eStaging)};

-    mgr.evalOpDefault<kp::OpTensorCreate>({tensorA});
-    mgr.evalOpDefault<kp::OpTensorCreate>({tensorB});
+    mgr.evalOpDefault<kp::OpTensorCreate>({tensorA, tensorB});

    EXPECT_TRUE(tensorA->isInit());
    EXPECT_TRUE(tensorB->isInit());

-    EXPECT_EQ(tensorA->data(), testVecA);
-    EXPECT_EQ(tensorB->data(), testVecB);
+    mgr.evalOpDefault<kp::OpTensorCopy>({tensorA, tensorB});
+
+    EXPECT_EQ(tensorA->data(), tensorB->data());
+
+    // Making sure the GPU holds the same data
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorB});
+    EXPECT_EQ(tensorA->data(), tensorB->data());
 }

-TEST(TestOpTensorCopy, ManageTensorMemoryWhenOpTensorCreateDestroyed) {
-
-    std::vector<float> testVecA{ 9, 8, 7 };
-    std::vector<float> testVecB{ 6, 5, 4 };
-
-    std::shared_ptr<kp::Tensor> tensorA{new kp::Tensor(testVecA)};
-    std::shared_ptr<kp::Tensor> tensorB{new kp::Tensor(testVecB)};
-
-    {
-        kp::Manager mgr;
-        mgr.evalOpDefault<kp::OpTensorCreate>({tensorA});
-        mgr.evalOpDefault<kp::OpTensorCreate>({tensorB});
-
-        EXPECT_TRUE(tensorA->isInit());
-        EXPECT_TRUE(tensorB->isInit());
-
-        EXPECT_EQ(tensorA->data(), testVecA);
-        EXPECT_EQ(tensorB->data(), testVecB);
-    }
-
-    EXPECT_FALSE(tensorA->isInit());
-    EXPECT_FALSE(tensorB->isInit());
-}
-
-TEST(TestOpTensorCopy, NoErrorIfTensorFreedBefore) {
-
-    std::vector<float> testVecA{ 9, 8, 7 };
-    std::vector<float> testVecB{ 6, 5, 4 };
-
-    std::shared_ptr<kp::Tensor> tensorA{new kp::Tensor(testVecA)};
-    std::shared_ptr<kp::Tensor> tensorB{new kp::Tensor(testVecB)};
+TEST(TestOpTensorCopy, CopyStagingToDeviceTensor) {

    kp::Manager mgr;

-    mgr.evalOpDefault<kp::OpTensorCreate>({tensorA});
-    mgr.evalOpDefault<kp::OpTensorCreate>({tensorB});
+    std::vector<float> testVecA{ 9, 8, 7 };
+    std::vector<float> testVecB{ 0, 0, 0 };
+
+    std::shared_ptr<kp::Tensor> tensorA{new kp::Tensor(testVecA, kp::Tensor::TensorTypes::eStaging)};
+    std::shared_ptr<kp::Tensor> tensorB{new kp::Tensor(testVecB)};
+
+    mgr.evalOpDefault<kp::OpTensorCreate>({tensorA, tensorB});

    EXPECT_TRUE(tensorA->isInit());
    EXPECT_TRUE(tensorB->isInit());

-    EXPECT_EQ(tensorA->data(), testVecA);
-    EXPECT_EQ(tensorB->data(), testVecB);
+    mgr.evalOpDefault<kp::OpTensorCopy>({tensorA, tensorB});

-    tensorA->freeMemoryDestroyGPUResources();
-    tensorB->freeMemoryDestroyGPUResources();
-    EXPECT_FALSE(tensorA->isInit());
-    EXPECT_FALSE(tensorB->isInit());
+    EXPECT_EQ(tensorA->data(), tensorB->data());
+
+    // Making sure the GPU holds the same data
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorB});
+    EXPECT_EQ(tensorA->data(), tensorB->data());
+}
+
+TEST(TestOpTensorCopy, CopyStagingToStagingTensor) {
+
+    kp::Manager mgr;
+
+    std::vector<float> testVecA{ 9, 8, 7 };
+    std::vector<float> testVecB{ 0, 0, 0 };
+
+    std::shared_ptr<kp::Tensor> tensorA{new kp::Tensor(testVecA, kp::Tensor::TensorTypes::eStaging)};
+    std::shared_ptr<kp::Tensor> tensorB{new kp::Tensor(testVecB, kp::Tensor::TensorTypes::eStaging)};
+
+    mgr.evalOpDefault<kp::OpTensorCreate>({tensorA, tensorB});
+
+    EXPECT_TRUE(tensorA->isInit());
+    EXPECT_TRUE(tensorB->isInit());
+
+    mgr.evalOpDefault<kp::OpTensorCopy>({tensorA, tensorB});
+
+    EXPECT_EQ(tensorA->data(), tensorB->data());
+
+    // Making sure the GPU holds the same data
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorB});
+    EXPECT_EQ(tensorA->data(), tensorB->data());
+}
+
+TEST(TestOpTensorCopy, SingleTensorShouldFail) {
+
+    kp::Manager mgr;
+
+    std::vector<float> testVecA{ 9, 8, 7 };
+
+    std::shared_ptr<kp::Tensor> tensorA{new kp::Tensor(testVecA, kp::Tensor::TensorTypes::eStaging)};
+
+    mgr.evalOpDefault<kp::OpTensorCreate>({tensorA});
+
+    EXPECT_TRUE(tensorA->isInit());
+
+    EXPECT_THROW(
+        mgr.evalOpDefault<kp::OpTensorCopy>({tensorA}),
+        std::runtime_error);
 }

--- a/test/TestTensor.cpp
+++ b/test/TestTensor.cpp
@ -18,7 +18,7 @@ TEST(TestTensor, CopyFromHostData) {
            vecA,
            kp::Tensor::TensorTypes::eStaging);
    std::shared_ptr<kp::Tensor> tensorB = std::make_shared<kp::Tensor>(
-            vecA,
+            vecB,
            kp::Tensor::TensorTypes::eStaging);

    kp::Manager mgr;
@ -35,8 +35,6 @@ TEST(TestTensor, CopyFromHostData) {
        sq->end();

        sq->eval();
-
-        tensorB->mapDataFromHostMemory();
    }

    EXPECT_EQ(tensorA->data(), tensorB->data());