Merge pull request #137 from EthicalML/revamp_memory_tensor_mgmt

Removed staging tensors in favour of each Tensor holding two buffers and two memories (primary and staging) to minimise data transfer
2021-02-09 07:55:10 +00:00 · 2021-02-09 07:55:10 +00:00 · 698883992f
commit 698883992f
parent 373ecd0185 bf401019c9
19 changed files with 361 additions and 238 deletions
--- a/python/src/docstrings.hpp
+++ b/python/src/docstrings.hpp
@ -513,10 +513,10 @@ function not in the record function.)doc";
 static const char *__doc_kp_OpTensorSyncDevice =
 R"doc(Operation that syncs tensor's device by mapping local data into the
 device memory. For TensorTypes::eDevice it will use a staging tensor
-to perform the copy. For TensorTypes::eStaging it will only copy the
+to perform the copy. For TensorTypes::eHost it will only copy the
 data and perform a map, which will be executed during the record (as
 opposed to during the sequence eval/submit). This function cannot be
-carried out for TensorTypes::eStaging.)doc";
+carried out for TensorTypes::eStorage.)doc";

 static const char *__doc_kp_OpTensorSyncDevice_OpTensorSyncDevice = R"doc()doc";

@ -533,7 +533,7 @@ queues @param device Vulkan logical device for passing to Algorithm
 static const char *__doc_kp_OpTensorSyncDevice_init =
 R"doc(Performs basic checks such as ensuring that there is at least one
 tensor provided, that they are initialized and that they are not of
-type TensorTpes::eStaging. For staging tensors in host memory, the map
+type TensorTypes::eHost. For staging tensors in host memory, the map
 is performed during the init function.)doc";

 static const char *__doc_kp_OpTensorSyncDevice_mStagingTensors = R"doc()doc";
@ -549,11 +549,11 @@ from the temporary staging tensor.)doc";
 static const char *__doc_kp_OpTensorSyncLocal =
 R"doc(Operation that syncs tensor's local data by mapping the data from
 device memory into the local vector. For TensorTypes::eDevice it will
-use a staging tensor to perform the copy. For TensorTypes::eStaging it
+use a staging tensor to perform the copy. For TensorTypes::eHost it
 will only copy the data and perform a map, which will be executed
 during the postSubmit (there will be no copy during the sequence
 eval/submit). This function cannot be carried out for
-TensorTypes::eStaging.)doc";
+TensorTypes::eStorage.)doc";

 static const char *__doc_kp_OpTensorSyncLocal_OpTensorSyncLocal = R"doc()doc";

@ -570,7 +570,7 @@ queues @param device Vulkan logical device for passing to Algorithm
 static const char *__doc_kp_OpTensorSyncLocal_init =
 R"doc(Performs basic checks such as ensuring that there is at least one
 tensor provided, that they are initialized and that they are not of
-type TensorTpes::eStaging.)doc";
+type TensorTypes::eHost.)doc";

 static const char *__doc_kp_OpTensorSyncLocal_mStagingTensors = R"doc()doc";

@ -719,7 +719,7 @@ shader storage).)doc";

 static const char *__doc_kp_Tensor_TensorTypes_eDevice = R"doc(< Type is device memory, source and destination)doc";

-static const char *__doc_kp_Tensor_TensorTypes_eStaging = R"doc(< Type is host memory, source and destination)doc";
+static const char *__doc_kp_Tensor_TensorTypes_eHost = R"doc(< Type is host memory, source and destination)doc";

 static const char *__doc_kp_Tensor_TensorTypes_eStorage = R"doc(< Type is Device memory (only))doc";

--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@ -26,7 +26,7 @@ PYBIND11_MODULE(kp, m) {

    py::enum_<kp::Tensor::TensorTypes>(m, "TensorTypes", DOC(kp, Tensor, TensorTypes))
        .value("device", kp::Tensor::TensorTypes::eDevice, "Tensor holding data in GPU memory.")
-        .value("staging", kp::Tensor::TensorTypes::eStaging, "Tensor used for transfer of data to device.")
+        .value("host", kp::Tensor::TensorTypes::eHost, "Tensor used for CPU visible GPU data.")
        .value("storage", kp::Tensor::TensorTypes::eStorage, "Tensor with host visible gpu memory.")
        .export_values();

@ -112,7 +112,7 @@ PYBIND11_MODULE(kp, m) {
        .def("record_tensor_sync_device", &kp::Sequence::record<kp::OpTensorSyncDevice>,
            "Records operation to sync tensor from local memory to GPU memory")
        .def("record_tensor_sync_local", &kp::Sequence::record<kp::OpTensorSyncLocal>,
-            "Records operation to sync tensor(s) from GPU memory to local memory using staging tensors")
+            "Records operation to sync tensor(s) from GPU memory to local memory")
        .def("record_algo_mult", &kp::Sequence::record<kp::OpMult>,
            "Records operation to run multiplication compute shader to two input tensors and an output tensor")
        .def("record_algo_file", [](kp::Sequence &self, 
@ -179,7 +179,7 @@ PYBIND11_MODULE(kp, m) {
        .def("eval_tensor_sync_device_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncDevice>,
            "Evaluates operation to sync tensor from local memory to GPU memory with new anonymous Sequence")
        .def("eval_tensor_sync_local_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncLocal>,
-            "Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with new anonymous Sequence")
+            "Evaluates operation to sync tensor(s) from GPU memory to local memory with new anonymous Sequence")
        .def("eval_algo_mult_def", &kp::Manager::evalOpDefault<kp::OpMult>,
            "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with new anonymous Sequence")
        .def("eval_algo_file_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::string>,
@ -216,7 +216,7 @@ PYBIND11_MODULE(kp, m) {
        .def("eval_tensor_sync_device", &kp::Manager::evalOp<kp::OpTensorSyncDevice>,
            "Evaluates operation to sync tensor from local memory to GPU memory with explicitly named Sequence")
        .def("eval_tensor_sync_local", &kp::Manager::evalOp<kp::OpTensorSyncLocal>,
-            "Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence")
+            "Evaluates operation to sync tensor(s) from GPU memory to local memory with explicitly named Sequence")
        .def("eval_algo_mult", &kp::Manager::evalOp<kp::OpMult>,
            "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
        .def("eval_algo_file", &kp::Manager::evalOp<kp::OpAlgoBase, std::string>,
@ -256,7 +256,7 @@ PYBIND11_MODULE(kp, m) {
        .def("eval_async_tensor_sync_device_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncDevice>,
            "Evaluates asynchronously operation to sync tensor from local memory to GPU memory with anonymous Sequence")
        .def("eval_async_tensor_sync_local_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncLocal>,
-            "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with anonymous Sequence")
+            "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory with anonymous Sequence")
        .def("eval_async_algo_mult_def", &kp::Manager::evalOpAsyncDefault<kp::OpMult>,
            "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with anonymous Sequence")
        .def("eval_async_algo_file_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::string>,
@ -293,7 +293,7 @@ PYBIND11_MODULE(kp, m) {
        .def("eval_async_tensor_sync_device", &kp::Manager::evalOpAsync<kp::OpTensorSyncDevice>,
            "Evaluates asynchronously operation to sync tensor from local memory to GPU memory with explicitly named Sequence")
        .def("eval_async_tensor_sync_local", &kp::Manager::evalOpAsync<kp::OpTensorSyncLocal>,
-            "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence")
+            "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory with explicitly named Sequence")
        .def("eval_async_algo_mult", &kp::Manager::evalOpAsync<kp::OpMult>,
            "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
        .def("eval_async_algo_file", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::string>,
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@ -723,7 +723,7 @@ class Tensor
    enum class TensorTypes
    {
        eDevice = 0,  ///< Type is device memory, source and destination
-        eStaging = 1, ///< Type is host memory, source and destination
+        eHost = 1, ///< Type is host memory, source and destination
        eStorage = 2, ///< Type is Device memory (only)
    };

@ -828,6 +828,26 @@ class Tensor
                        std::shared_ptr<Tensor> copyFromTensor,
                        bool createBarrier);

+    /**
+     * Records a copy from the internal staging memory to the device memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param createBarrier Whether to create a barrier that ensures the data is
+     * copied before further operations.
+     */
+    void recordCopyFromStagingToDevice(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                        bool createBarrier);
+
+    /**
+     * Records a copy from the internal device memory to the staging memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param createBarrier Whether to create a barrier that ensures the data is
+     * copied before further operations.
+     */
+    void recordCopyFromDeviceToStaging(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                        bool createBarrier);
+
    /**
     * Records the buffer memory barrier into the command buffer which
     * ensures that relevant data transfers are carried out correctly.
@ -870,10 +890,14 @@ class Tensor
    std::shared_ptr<vk::Device> mDevice;

    // -------------- OPTIONALLY OWNED RESOURCES
-    std::shared_ptr<vk::Buffer> mBuffer;
-    bool mFreeBuffer;
-    std::shared_ptr<vk::DeviceMemory> mMemory;
-    bool mFreeMemory;
+    std::shared_ptr<vk::Buffer> mPrimaryBuffer;
+    bool mFreePrimaryBuffer = false;
+    std::shared_ptr<vk::Buffer> mStagingBuffer;
+    bool mFreeStagingBuffer = false;
+    std::shared_ptr<vk::DeviceMemory> mPrimaryMemory;
+    bool mFreePrimaryMemory = false;
+    std::shared_ptr<vk::DeviceMemory> mStagingMemory;
+    bool mFreeStagingMemory = false;

    // -------------- ALWAYS OWNED RESOURCES
    std::vector<float> mData;
@ -883,11 +907,16 @@ class Tensor
    std::array<uint32_t, KP_MAX_DIM_SIZE> mShape;
    bool mIsInit = false;

-    void createBuffer(); // Creates the vulkan buffer
+    void allocateMemoryCreateGPUResources(); // Creates the vulkan buffer
+    void createBuffer(std::shared_ptr<vk::Buffer> buffer, vk::BufferUsageFlags bufferUsageFlags);
+    void allocateBindMemory(std::shared_ptr<vk::Buffer> buffer, std::shared_ptr<vk::DeviceMemory> memory, vk::MemoryPropertyFlags memoryPropertyFlags);
+    void copyBuffer(std::shared_ptr<vk::CommandBuffer> commandBuffer, std::shared_ptr<vk::Buffer> bufferFrom, std::shared_ptr<vk::Buffer> bufferTo, vk::DeviceSize bufferSize, vk::BufferCopy copyRegion, bool createBarrier);

    // Private util functions
-    vk::BufferUsageFlags getBufferUsageFlags();
-    vk::MemoryPropertyFlags getMemoryPropertyFlags();
+    vk::BufferUsageFlags getPrimaryBufferUsageFlags();
+    vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags();
+    vk::BufferUsageFlags getStagingBufferUsageFlags();
+    vk::MemoryPropertyFlags getStagingMemoryPropertyFlags();
    uint64_t memorySize();
 };

@ -958,7 +987,7 @@ class OpBase
                if (tensor && tensor->isInit()) {
                    tensor->freeMemoryDestroyGPUResources();
                } else {
-                    SPDLOG_ERROR("Kompute OpBase expected to free "
+                    SPDLOG_WARN("Kompute OpBase expected to free "
                                  "tensor but has already been freed.");
                }
            }
@ -1264,8 +1293,6 @@ class OpTensorCreate : public OpBase
    virtual void postEval() override;

  private:
-    // Never owned resources
-    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
 };

 } // End namespace kp
@ -1836,9 +1863,6 @@ class OpAlgoLhsRhsOut : public OpAlgoBase
    std::shared_ptr<Tensor> mTensorLHS; ///< Reference to the parameter used in the left hand side equation of the shader
    std::shared_ptr<Tensor> mTensorRHS; ///< Reference to the parameter used in the right hand side equation of the shader
    std::shared_ptr<Tensor> mTensorOutput; ///< Reference to the parameter used in the output of the shader and will be copied with a staging vector
-
-    // -------------- ALWAYS OWNED RESOURCES
-    std::shared_ptr<Tensor> mTensorOutputStaging; ///< Staging temporary tensor user do to copy the output of the tensor
 };

 } // End namespace kp
@ -1976,7 +2000,7 @@ class OpTensorCopy : public OpBase
 namespace kp {

 /**
-    Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the record (as opposed to during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging.
+    Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a record operation for the memory to be synced into GPU memory, which means that the operation will be done in sync with GPU commands. For TensorTypes::eHost it will only map the data into host memory, which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStorage.
 */
 class OpTensorSyncDevice : public OpBase
 {
@ -2002,12 +2026,12 @@ class OpTensorSyncDevice : public OpBase
    ~OpTensorSyncDevice() override;

    /**
-     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging. For staging tensors in host memory, the map is performed during the init function.
+     * Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element.
     */
    void init() override;

    /**
-     * For device tensors, it records the copy command to the device tensor from the temporary staging tensor.
+     * For device tensors, it records the copy command for the tensor to copy the data from its staging to device memory.
     */
    void record() override;

@ -2022,8 +2046,6 @@ class OpTensorSyncDevice : public OpBase
    virtual void postEval() override;

  private:
-    // Never owned resources
-    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
 };

 } // End namespace kp
@ -2031,7 +2053,7 @@ class OpTensorSyncDevice : public OpBase
 namespace kp {

 /**
-    Operation that syncs tensor's local data by mapping the data from device memory into the local vector. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the postSubmit (there will be no copy during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging.
+    Operation that syncs tensor's local memory by mapping device data into the local CPU memory. For TensorTypes::eDevice it will use a record operation for the memory to be synced from GPU memory into the tensor's staging memory, which means that the operation will be done in sync with GPU commands. For TensorTypes::eHost it will only map the data from host memory, which will happen during postEval. This operation won't have any effect on TensorTypes::eStorage.
 */
 class OpTensorSyncLocal : public OpBase
 {
@ -2052,17 +2074,17 @@ class OpTensorSyncLocal : public OpBase
                   std::vector<std::shared_ptr<Tensor>> tensors);

    /**
-     * Default destructor. This class manages the memory of the staging tensors it owns but these are released in the postSubmit, before it arrives to the destructor.
+     * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release.
     */
    ~OpTensorSyncLocal() override;

    /**
-     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging.
+     * Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element.
     */
    void init() override;

    /**
-     * For device tensors, it records the copy command into the staging tensor from the device tensor.
+     * For device tensors, it records the copy command for the tensor to copy the data from its device to staging memory.
     */
    void record() override;

@ -2077,8 +2099,6 @@ class OpTensorSyncLocal : public OpBase
    virtual void postEval() override;

  private:
-    // Never owned resources
-    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
 };

 } // End namespace kp
--- a/src/OpAlgoLhsRhsOut.cpp
+++ b/src/OpAlgoLhsRhsOut.cpp
@ -65,11 +65,6 @@ OpAlgoLhsRhsOut::init()
          " Output: " + std::to_string(this->mTensorOutput->size()));
    }

-    this->mTensorOutputStaging = std::make_shared<Tensor>(
-      this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
-
-    this->mTensorOutputStaging->init(this->mPhysicalDevice, this->mDevice);
-
    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");

    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
@ -110,8 +105,10 @@ OpAlgoLhsRhsOut::record()
      vk::PipelineStageFlagBits::eComputeShader,
      vk::PipelineStageFlagBits::eTransfer);

-    this->mTensorOutputStaging->recordCopyFrom(
-      this->mCommandBuffer, this->mTensorOutput, true);
+    if (this->mTensorOutput->tensorType() == Tensor::TensorTypes::eDevice) {
+        this->mTensorOutput->recordCopyFromDeviceToStaging(this->mCommandBuffer,
+                                                           true);
+    }
 }

 void
@ -119,9 +116,7 @@ OpAlgoLhsRhsOut::postEval()
 {
    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");

-    this->mTensorOutputStaging->mapDataFromHostMemory();
-
-    this->mTensorOutput->setData(this->mTensorOutputStaging->data());
+    this->mTensorOutput->mapDataFromHostMemory();
 }

 }
--- a/src/OpTensorCreate.cpp
+++ b/src/OpTensorCreate.cpp
@ -23,16 +23,6 @@ OpTensorCreate::OpTensorCreate(
 OpTensorCreate::~OpTensorCreate()
 {
    SPDLOG_DEBUG("Kompute OpTensorCreate destructor started");
-
-    SPDLOG_DEBUG("Kompute OpTensorCreate freeing staging tensors");
-    for (std::shared_ptr<Tensor> tensor : this->mStagingTensors) {
-        if (tensor && tensor->isInit()) {
-            tensor->freeMemoryDestroyGPUResources();
-        } else {
-            SPDLOG_ERROR("Kompute OpTensorCreate expected to free "
-                          "tensor but has already been freed.");
-        }
-    }
 }

 void
@ -50,27 +40,10 @@ OpTensorCreate::init()
            throw std::runtime_error(
              "Kompute OpTensorCreate: Tensor has already been initialized");
        }
-        if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
-            tensor->init(this->mPhysicalDevice, this->mDevice);
-
-            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
-              tensor->data(), Tensor::TensorTypes::eStaging);
-
-            stagingTensor->init(this->mPhysicalDevice, this->mDevice);
-
-            stagingTensor->mapDataIntoHostMemory();
-
-            this->mStagingTensors.push_back(stagingTensor);
-
-        } else {
-
+        if (tensor->tensorType() != Tensor::TensorTypes::eStorage) {
            tensor->init(this->mPhysicalDevice, this->mDevice);

            tensor->mapDataIntoHostMemory();
-
-            // We push a nullptr when no staging tensor is needed to match
-            // index number in array to have one to one mapping with tensors
-            this->mStagingTensors.push_back(nullptr);
        }
    }
 }
@ -82,8 +55,8 @@ OpTensorCreate::record()

    for (size_t i = 0; i < this->mTensors.size(); i++) {
        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mTensors[i]->recordCopyFrom(
-              this->mCommandBuffer, this->mStagingTensors[i], false);
+            this->mTensors[i]->recordCopyFromStagingToDevice(
+              this->mCommandBuffer, false);
        }
    }
 }
--- a/src/OpTensorSyncDevice.cpp
+++ b/src/OpTensorSyncDevice.cpp
@ -41,25 +41,11 @@ OpTensorSyncDevice::init()
                                     "has not been initialized");
        }
        if (tensor->tensorType() == Tensor::TensorTypes::eStorage) {
-            throw std::runtime_error(
+            SPDLOG_WARN(
              "Kompute OpTensorSyncLocal tensor parameter is of type "
              "TensorTypes::eStorage and hence cannot be used to receive or "
              "pass data.");
        }
-        if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
-
-            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
-              tensor->data(), Tensor::TensorTypes::eStaging);
-
-            stagingTensor->init(this->mPhysicalDevice, this->mDevice);
-
-            this->mStagingTensors.push_back(stagingTensor);
-
-        } else {
-            // We push a nullptr when no staging tensor is needed to match
-            // index number in array to have one to one mapping with tensors
-            this->mStagingTensors.push_back(nullptr);
-        }
    }
 }

@ -70,8 +56,8 @@ OpTensorSyncDevice::record()

    for (size_t i = 0; i < this->mTensors.size(); i++) {
        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mTensors[i]->recordCopyFrom(
-              this->mCommandBuffer, this->mStagingTensors[i], false);
+            this->mTensors[i]->recordCopyFromStagingToDevice(
+              this->mCommandBuffer, false);
        }
    }
 }
@ -83,11 +69,8 @@ OpTensorSyncDevice::preEval()

    // Performing sync of data as eval can be called multiple times with same op
    for (size_t i = 0; i < this->mTensors.size(); i++) {
-        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mStagingTensors[i]->setData(this->mTensors[i]->data());
-            this->mStagingTensors[i]->mapDataIntoHostMemory();
-        } else {
-            this->mTensors[i]->mapDataFromHostMemory();
+        if (this->mTensors[i]->tensorType() != Tensor::TensorTypes::eStorage) {
+            this->mTensors[i]->mapDataIntoHostMemory();
        }
    }
 }
--- a/src/OpTensorSyncLocal.cpp
+++ b/src/OpTensorSyncLocal.cpp
@ -41,26 +41,11 @@ OpTensorSyncLocal::init()
              "Kompute OpTensorSyncLocal: Tensor has not been initialized");
        }
        if (tensor->tensorType() == Tensor::TensorTypes::eStorage) {
-            throw std::runtime_error(
+            SPDLOG_WARN(
              "Kompute OpTensorSyncLocal tensor parameter is of type "
              "TensorTypes::eStorage and hence cannot be used to receive or "
              "pass data.");
        }
-        if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
-
-            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
-              tensor->data(), Tensor::TensorTypes::eStaging);
-
-            stagingTensor->init(this->mPhysicalDevice, this->mDevice);
-
-            this->mStagingTensors.push_back(stagingTensor);
-
-        } else {
-
-            // We push a nullptr when no staging tensor is needed to match
-            // index number in array to have one to one mapping with tensors
-            this->mStagingTensors.push_back(nullptr);
-        }
    }
 }

@ -71,8 +56,8 @@ OpTensorSyncLocal::record()

    for (size_t i = 0; i < this->mTensors.size(); i++) {
        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mStagingTensors[i]->recordCopyFrom(
-              this->mCommandBuffer, this->mTensors[i], true);
+            this->mTensors[i]->recordCopyFromDeviceToStaging(
+              this->mCommandBuffer, true);
        }
    }
 }
@ -90,10 +75,7 @@ OpTensorSyncLocal::postEval()

    SPDLOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local");
    for (size_t i = 0; i < this->mTensors.size(); i++) {
-        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mStagingTensors[i]->mapDataFromHostMemory();
-            this->mTensors[i]->setData(this->mStagingTensors[i]->data());
-        } else {
+        if (this->mTensors[i]->tensorType() != Tensor::TensorTypes::eStorage) {
            this->mTensors[i]->mapDataFromHostMemory();
        }
    }
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@ -47,7 +47,7 @@ Tensor::init(std::shared_ptr<vk::PhysicalDevice> physicalDevice,

    this->mIsInit = true;

-    this->createBuffer();
+    this->allocateMemoryCreateGPUResources();
 }

 std::vector<float>&
@ -89,7 +89,7 @@ Tensor::tensorType()
 bool
 Tensor::isInit()
 {
-    return this->mIsInit && this->mBuffer && this->mMemory;
+    return this->mIsInit && this->mPrimaryBuffer && this->mPrimaryMemory;
 }

 void
@ -107,20 +107,71 @@ Tensor::recordCopyFrom(std::shared_ptr<vk::CommandBuffer> commandBuffer,
                       std::shared_ptr<Tensor> copyFromTensor,
                       bool createBarrier)
 {
-    SPDLOG_DEBUG("Kompute Tensor recordCopyFrom called");

-    if (!this->mIsInit || !copyFromTensor->mIsInit) {
-        throw std::runtime_error(
-          "Kompute Tensor attempted to run createBuffer without init");
-    }
+    vk::DeviceSize bufferSize(this->memorySize());
+    vk::BufferCopy copyRegion(0, 0, bufferSize);

+    SPDLOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", bufferSize);
+
+    this->copyBuffer(commandBuffer,
+                     copyFromTensor->mPrimaryBuffer,
+                     this->mPrimaryBuffer,
+                     bufferSize,
+                     copyRegion,
+                     createBarrier);
+}
+
+void
+Tensor::recordCopyFromStagingToDevice(
+  std::shared_ptr<vk::CommandBuffer> commandBuffer,
+  bool createBarrier)
+{
    vk::DeviceSize bufferSize(this->memorySize());
    vk::BufferCopy copyRegion(0, 0, bufferSize);

    SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);

-    commandBuffer->copyBuffer(
-      *copyFromTensor->mBuffer, *this->mBuffer, copyRegion);
+    this->copyBuffer(commandBuffer,
+                     this->mStagingBuffer,
+                     this->mPrimaryBuffer,
+                     bufferSize,
+                     copyRegion,
+                     createBarrier);
+}
+
+void
+Tensor::recordCopyFromDeviceToStaging(
+  std::shared_ptr<vk::CommandBuffer> commandBuffer,
+  bool createBarrier)
+{
+    vk::DeviceSize bufferSize(this->memorySize());
+    vk::BufferCopy copyRegion(0, 0, bufferSize);
+
+    SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);
+
+    this->copyBuffer(commandBuffer,
+                     this->mPrimaryBuffer,
+                     this->mStagingBuffer,
+                     bufferSize,
+                     copyRegion,
+                     createBarrier);
+}
+
+void
+Tensor::copyBuffer(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                   std::shared_ptr<vk::Buffer> bufferFrom,
+                   std::shared_ptr<vk::Buffer> bufferTo,
+                   vk::DeviceSize bufferSize,
+                   vk::BufferCopy copyRegion,
+                   bool createBarrier)
+{
+
+    if (!this->mIsInit) {
+        throw std::runtime_error(
+          "Kompute Tensor attempted to run copyBuffer without init");
+    }
+
+    commandBuffer->copyBuffer(*bufferFrom, *bufferTo, copyRegion);

    if (createBarrier) {
        // Buffer to ensure wait until data is copied to staging buffer
@ -145,7 +196,7 @@ Tensor::recordBufferMemoryBarrier(
    vk::DeviceSize bufferSize = this->memorySize();

    vk::BufferMemoryBarrier bufferMemoryBarrier;
-    bufferMemoryBarrier.buffer = *this->mBuffer;
+    bufferMemoryBarrier.buffer = *this->mPrimaryBuffer;
    bufferMemoryBarrier.size = bufferSize;
    bufferMemoryBarrier.srcAccessMask = srcAccessMask;
    bufferMemoryBarrier.dstAccessMask = dstAccessMask;
@ -164,7 +215,7 @@ vk::DescriptorBufferInfo
 Tensor::constructDescriptorBufferInfo()
 {
    vk::DeviceSize bufferSize = this->memorySize();
-    return vk::DescriptorBufferInfo(*this->mBuffer,
+    return vk::DescriptorBufferInfo(*this->mPrimaryBuffer,
                                    0, // offset
                                    bufferSize);
 }
@ -174,20 +225,21 @@ Tensor::mapDataFromHostMemory()
 {
    SPDLOG_DEBUG("Kompute Tensor mapping data from host buffer");

-    if (this->mTensorType != TensorTypes::eStaging) {
-        SPDLOG_ERROR(
-          "Mapping tensor data manually from DEVICE buffer instead of "
-          "using record GPU command with staging buffer");
-        return;
+    std::shared_ptr<vk::DeviceMemory> hostVisibleMemory = nullptr;
+
+    if (this->mTensorType == TensorTypes::eHost) {
+        hostVisibleMemory = this->mPrimaryMemory;
+    } else {
+        hostVisibleMemory = this->mStagingMemory;
    }

    vk::DeviceSize bufferSize = this->memorySize();
    void* mapped = this->mDevice->mapMemory(
-      *this->mMemory, 0, bufferSize, vk::MemoryMapFlags());
-    vk::MappedMemoryRange mappedMemoryRange(*this->mMemory, 0, bufferSize);
+      *hostVisibleMemory, 0, bufferSize, vk::MemoryMapFlags());
+    vk::MappedMemoryRange mappedMemoryRange(*hostVisibleMemory, 0, bufferSize);
    this->mDevice->invalidateMappedMemoryRanges(mappedMemoryRange);
    memcpy(this->mData.data(), mapped, bufferSize);
-    this->mDevice->unmapMemory(*this->mMemory);
+    this->mDevice->unmapMemory(*hostVisibleMemory);
 }

 void
@ -196,24 +248,26 @@ Tensor::mapDataIntoHostMemory()

    SPDLOG_DEBUG("Kompute Tensor local mapping tensor data to host buffer");

-    if (this->mTensorType != TensorTypes::eStaging) {
-        SPDLOG_ERROR("Mapping tensor data manually to DEVICE memory instead of "
-                     "using record GPU command with staging buffer");
-        return;
+    std::shared_ptr<vk::DeviceMemory> hostVisibleMemory = nullptr;
+
+    if (this->mTensorType == TensorTypes::eHost) {
+        hostVisibleMemory = this->mPrimaryMemory;
+    } else {
+        hostVisibleMemory = this->mStagingMemory;
    }

    vk::DeviceSize bufferSize = this->memorySize();

    void* mapped = this->mDevice->mapMemory(
-      *this->mMemory, 0, bufferSize, vk::MemoryMapFlags());
+      *hostVisibleMemory, 0, bufferSize, vk::MemoryMapFlags());
    memcpy(mapped, this->mData.data(), bufferSize);
-    vk::MappedMemoryRange mappedRange(*this->mMemory, 0, bufferSize);
+    vk::MappedMemoryRange mappedRange(*hostVisibleMemory, 0, bufferSize);
    this->mDevice->flushMappedMemoryRanges(1, &mappedRange);
-    this->mDevice->unmapMemory(*this->mMemory);
+    this->mDevice->unmapMemory(*hostVisibleMemory);
 }

 vk::BufferUsageFlags
-Tensor::getBufferUsageFlags()
+Tensor::getPrimaryBufferUsageFlags()
 {
    switch (this->mTensorType) {
        case TensorTypes::eDevice:
@ -221,8 +275,9 @@ Tensor::getBufferUsageFlags()
                   vk::BufferUsageFlagBits::eTransferSrc |
                   vk::BufferUsageFlagBits::eTransferDst;
            break;
-        case TensorTypes::eStaging:
-            return vk::BufferUsageFlagBits::eTransferSrc |
+        case TensorTypes::eHost:
+            return vk::BufferUsageFlagBits::eStorageBuffer |
+                   vk::BufferUsageFlagBits::eTransferSrc |
                   vk::BufferUsageFlagBits::eTransferDst;
            break;
        case TensorTypes::eStorage:
@ -234,13 +289,13 @@ Tensor::getBufferUsageFlags()
 }

 vk::MemoryPropertyFlags
-Tensor::getMemoryPropertyFlags()
+Tensor::getPrimaryMemoryPropertyFlags()
 {
    switch (this->mTensorType) {
        case TensorTypes::eDevice:
            return vk::MemoryPropertyFlagBits::eDeviceLocal;
            break;
-        case TensorTypes::eStaging:
+        case TensorTypes::eHost:
            return vk::MemoryPropertyFlagBits::eHostVisible;
            break;
        case TensorTypes::eStorage:
@ -251,8 +306,33 @@ Tensor::getMemoryPropertyFlags()
    }
 }

+vk::BufferUsageFlags
+Tensor::getStagingBufferUsageFlags() // Usage flags for this tensor's staging buffer; throws std::runtime_error for non-eDevice types
+{
+    switch (this->mTensorType) {
+        case TensorTypes::eDevice: // the only type for which a staging buffer is created
+            return vk::BufferUsageFlagBits::eTransferSrc |
+                   vk::BufferUsageFlagBits::eTransferDst;
+            break; // not reached: the return above always exits first
+        default: // every other tensor type is rejected
+            throw std::runtime_error("Kompute Tensor invalid tensor type");
+    }
+}
+
+vk::MemoryPropertyFlags
+Tensor::getStagingMemoryPropertyFlags() // Memory properties for this tensor's staging memory; throws std::runtime_error for non-eDevice types
+{
+    switch (this->mTensorType) {
+        case TensorTypes::eDevice: // the only type for which staging memory is allocated
+            return vk::MemoryPropertyFlagBits::eHostVisible; // host-visible so the map functions can mapMemory() it
+            break; // not reached: the return above always exits first
+        default: // every other tensor type is rejected
+            throw std::runtime_error("Kompute Tensor invalid tensor type");
+    }
+}
+
 void
-Tensor::createBuffer()
+Tensor::allocateMemoryCreateGPUResources()
 {
    SPDLOG_DEBUG("Kompute Tensor creating buffer");

@ -268,43 +348,79 @@ Tensor::createBuffer()
        throw std::runtime_error("Kompute Tensor device is null");
    }

+    SPDLOG_DEBUG("Kompute Tensor creating primary buffer and memory");

-    vk::BufferUsageFlags usageFlags = this->getBufferUsageFlags();
-    vk::DeviceSize bufferSize = this->memorySize();
-    if(bufferSize<1){
-        throw std::runtime_error("Kompute Tensor attempted to create a zero-sized buffer");
+    this->mPrimaryBuffer = std::make_shared<vk::Buffer>();
+    this->createBuffer(this->mPrimaryBuffer,
+                       this->getPrimaryBufferUsageFlags());
+    this->mFreePrimaryBuffer = true;
+    this->mPrimaryMemory = std::make_shared<vk::DeviceMemory>();
+    this->allocateBindMemory(this->mPrimaryBuffer,
+                             this->mPrimaryMemory,
+                             this->getPrimaryMemoryPropertyFlags());
+    this->mFreePrimaryMemory = true;
+
+    if (this->mTensorType == TensorTypes::eDevice) {
+        SPDLOG_DEBUG("Kompute Tensor creating staging buffer and memory");
+
+        this->mStagingBuffer = std::make_shared<vk::Buffer>();
+        this->createBuffer(this->mStagingBuffer,
+                           this->getStagingBufferUsageFlags());
+        this->mFreeStagingBuffer = true;
+        this->mStagingMemory = std::make_shared<vk::DeviceMemory>();
+        this->allocateBindMemory(this->mStagingBuffer,
+                                 this->mStagingMemory,
+                                 this->getStagingMemoryPropertyFlags());
+        this->mFreeStagingMemory = true;
+    }
+
+    SPDLOG_DEBUG("Kompute Tensor buffer & memory creation successful");
+}
+
+void
+Tensor::createBuffer(std::shared_ptr<vk::Buffer> buffer,
+                     vk::BufferUsageFlags bufferUsageFlags)
+{
+
+    vk::DeviceSize bufferSize = this->memorySize();
+
+    if (bufferSize < 1) {
+        throw std::runtime_error(
+          "Kompute Tensor attempted to create a zero-sized buffer");
    }
-    
-    this->mFreeBuffer = true;

    SPDLOG_DEBUG("Kompute Tensor creating buffer with memory size: {}, and "
                 "usage flags: {}",
                 bufferSize,
-                 vk::to_string(usageFlags));
+                 vk::to_string(bufferUsageFlags));

+    // TODO: Explore having concurrent sharing mode (with option)
    vk::BufferCreateInfo bufferInfo(vk::BufferCreateFlags(),
                                    bufferSize,
-                                    usageFlags,
+                                    bufferUsageFlags,
                                    vk::SharingMode::eExclusive);

-    this->mBuffer = std::make_shared<vk::Buffer>();
-    this->mDevice->createBuffer(&bufferInfo, nullptr, this->mBuffer.get());
+    this->mDevice->createBuffer(&bufferInfo, nullptr, buffer.get());
+}

-    SPDLOG_DEBUG("Kompute Tensor buffer created now creating memory");
+void
+Tensor::allocateBindMemory(std::shared_ptr<vk::Buffer> buffer,
+                           std::shared_ptr<vk::DeviceMemory> memory,
+                           vk::MemoryPropertyFlags memoryPropertyFlags)
+{
+
+    SPDLOG_DEBUG("Kompute Tensor allocating and binding memory");

    vk::PhysicalDeviceMemoryProperties memoryProperties =
      this->mPhysicalDevice->getMemoryProperties();

    vk::MemoryRequirements memoryRequirements =
-      this->mDevice->getBufferMemoryRequirements(*this->mBuffer);
-
-    vk::MemoryPropertyFlags memoryPropertyFlags =
-      this->getMemoryPropertyFlags();
+      this->mDevice->getBufferMemoryRequirements(*buffer);

    uint32_t memoryTypeIndex = -1;
    for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) {
        if (memoryRequirements.memoryTypeBits & (1 << i)) {
-            if ((memoryProperties.memoryTypes[i].propertyFlags &
+            if (((memoryProperties.memoryTypes[i]).propertyFlags &
                 memoryPropertyFlags) == memoryPropertyFlags) {
                memoryTypeIndex = i;
                break;
@ -316,8 +432,6 @@ Tensor::createBuffer()
          "Memory type index for buffer creation not found");
    }

-    this->mFreeMemory = true;
-
    SPDLOG_DEBUG(
      "Kompute Tensor allocating memory index: {}, size {}, flags: {}",
      memoryTypeIndex,
@ -327,13 +441,9 @@ Tensor::createBuffer()
    vk::MemoryAllocateInfo memoryAllocateInfo(memoryRequirements.size,
                                              memoryTypeIndex);

-    this->mMemory = std::make_shared<vk::DeviceMemory>();
-    this->mDevice->allocateMemory(
-      &memoryAllocateInfo, nullptr, this->mMemory.get());
+    this->mDevice->allocateMemory(&memoryAllocateInfo, nullptr, memory.get());

-    this->mDevice->bindBufferMemory(*this->mBuffer, *this->mMemory, 0);
-
-    SPDLOG_DEBUG("Kompute Tensor buffer & memory creation successful");
+    this->mDevice->bindBufferMemory(*buffer, *memory, 0);
 }

 void
@ -349,29 +459,55 @@ Tensor::freeMemoryDestroyGPUResources()
        return;
    }

-    if (this->mFreeBuffer) {
-        if (!this->mBuffer) {
-            SPDLOG_ERROR(
-              "Kompose Tensor expected to free buffer but got null buffer");
+    if (this->mFreePrimaryBuffer) {
+        if (!this->mPrimaryBuffer) {
+            SPDLOG_ERROR("Kompose Tensor expected to destroy primary buffer "
+                         "but got null buffer");
        } else {
-            SPDLOG_DEBUG("Kompose Tensor destroying buffer");
+            SPDLOG_DEBUG("Kompose Tensor destroying primary buffer");
            this->mDevice->destroy(
-              *this->mBuffer,
+              *this->mPrimaryBuffer,
              (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-            this->mBuffer = nullptr;
+            this->mPrimaryBuffer = nullptr;
        }
    }

-    if (this->mFreeMemory) {
-        if (!this->mMemory) {
-            SPDLOG_ERROR(
-              "Kompose Tensor expected to free buffer but got null memory");
+    if (this->mFreeStagingBuffer) {
+        if (!this->mStagingBuffer) {
+            SPDLOG_ERROR("Kompose Tensor expected to destroy staging buffer "
+                         "but got null buffer");
        } else {
-            SPDLOG_DEBUG("Kompose Tensor freeing memory");
-            this->mDevice->freeMemory(
-              *this->mMemory,
+            SPDLOG_DEBUG("Kompose Tensor destroying staging buffer");
+            this->mDevice->destroy(
+              *this->mStagingBuffer,
              (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-            this->mDevice = nullptr;
+            this->mStagingBuffer = nullptr;
+        }
+    }
+
+    if (this->mFreePrimaryMemory) {
+        if (!this->mPrimaryMemory) {
+            SPDLOG_ERROR("Kompose Tensor expected to free primary memory but "
+                         "got null memory");
+        } else {
+            SPDLOG_DEBUG("Kompose Tensor freeing primary memory");
+            this->mDevice->freeMemory(
+              *this->mPrimaryMemory,
+              (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+            this->mPrimaryMemory = nullptr;
+        }
+    }
+
+    if (this->mFreeStagingMemory) {
+        if (!this->mStagingMemory) {
+            SPDLOG_ERROR("Kompose Tensor expected to free staging memory but "
+                         "got null memory");
+        } else {
+            SPDLOG_DEBUG("Kompose Tensor freeing staging memory");
+            this->mDevice->freeMemory(
+              *this->mStagingMemory,
+              (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+            this->mStagingMemory = nullptr;
        }
    }

--- a/src/include/kompute/Tensor.hpp
+++ b/src/include/kompute/Tensor.hpp
@ -26,7 +26,7 @@ class Tensor
    enum class TensorTypes
    {
        eDevice = 0,  ///< Type is device memory, source and destination
-        eStaging = 1, ///< Type is host memory, source and destination
+        eHost = 1,    ///< Type is host memory, source and destination
        eStorage = 2, ///< Type is Device memory (only)
    };

@ -39,7 +39,8 @@ class Tensor
     *  Default constructor with data provided which would be used to create the
     * respective vulkan buffer and memory.
     *
-     *  @param data Non-zero-sized vector of data that will be used by the tensor
+     *  @param data Non-zero-sized vector of data that will be used by the
+     * tensor
     *  @param tensorType Type for the tensor which is of type TensorTypes
     */
    Tensor(const std::vector<float>& data,
@ -131,6 +132,32 @@ class Tensor
                        std::shared_ptr<Tensor> copyFromTensor,
                        bool createBarrier);

+    /**
+     * Records a copy from the internal staging memory to the device memory
+     * using an optional barrier to wait for the operation. This function would
+     * only be relevant for kp::Tensors of type eDevice.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param createBarrier Whether to create a barrier that ensures the data is
+     * copied before further operations. Must be passed explicitly.
+     */
+    void recordCopyFromStagingToDevice(
+      std::shared_ptr<vk::CommandBuffer> commandBuffer,
+      bool createBarrier);
+
+    /**
+     * Records a copy from the internal device memory to the staging memory
+     * using an optional barrier to wait for the operation. This function would
+     * only be relevant for kp::Tensors of type eDevice.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param createBarrier Whether to create a barrier that ensures the data is
+     * copied before further operations. Must be passed explicitly.
+     */
+    void recordCopyFromDeviceToStaging(
+      std::shared_ptr<vk::CommandBuffer> commandBuffer,
+      bool createBarrier);
+
    /**
     * Records the buffer memory barrier into the command buffer which
     * ensures that relevant data transfers are carried out correctly.
@ -173,10 +200,14 @@ class Tensor
    std::shared_ptr<vk::Device> mDevice;

    // -------------- OPTIONALLY OWNED RESOURCES
-    std::shared_ptr<vk::Buffer> mBuffer;
-    bool mFreeBuffer;
-    std::shared_ptr<vk::DeviceMemory> mMemory;
-    bool mFreeMemory;
+    std::shared_ptr<vk::Buffer> mPrimaryBuffer;
+    bool mFreePrimaryBuffer = false;
+    std::shared_ptr<vk::Buffer> mStagingBuffer;
+    bool mFreeStagingBuffer = false;
+    std::shared_ptr<vk::DeviceMemory> mPrimaryMemory;
+    bool mFreePrimaryMemory = false;
+    std::shared_ptr<vk::DeviceMemory> mStagingMemory;
+    bool mFreeStagingMemory = false;

    // -------------- ALWAYS OWNED RESOURCES
    std::vector<float> mData;
@ -186,11 +217,24 @@ class Tensor
    std::array<uint32_t, KP_MAX_DIM_SIZE> mShape;
    bool mIsInit = false;

-    void createBuffer(); // Creates the vulkan buffer
+    void allocateMemoryCreateGPUResources(); // Creates the vulkan buffer
+    void createBuffer(std::shared_ptr<vk::Buffer> buffer,
+                      vk::BufferUsageFlags bufferUsageFlags);
+    void allocateBindMemory(std::shared_ptr<vk::Buffer> buffer,
+                            std::shared_ptr<vk::DeviceMemory> memory,
+                            vk::MemoryPropertyFlags memoryPropertyFlags);
+    void copyBuffer(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                    std::shared_ptr<vk::Buffer> bufferFrom,
+                    std::shared_ptr<vk::Buffer> bufferTo,
+                    vk::DeviceSize bufferSize,
+                    vk::BufferCopy copyRegion,
+                    bool createBarrier);

    // Private util functions
-    vk::BufferUsageFlags getBufferUsageFlags();
-    vk::MemoryPropertyFlags getMemoryPropertyFlags();
+    vk::BufferUsageFlags getPrimaryBufferUsageFlags();
+    vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags();
+    vk::BufferUsageFlags getStagingBufferUsageFlags();
+    vk::MemoryPropertyFlags getStagingMemoryPropertyFlags();
    uint64_t memorySize();
 };

--- a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
+++ b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
@ -78,9 +78,6 @@ class OpAlgoLhsRhsOut : public OpAlgoBase
    std::shared_ptr<Tensor> mTensorLHS; ///< Reference to the parameter used in the left hand side equation of the shader
    std::shared_ptr<Tensor> mTensorRHS; ///< Reference to the parameter used in the right hand side equation of the shader
    std::shared_ptr<Tensor> mTensorOutput; ///< Reference to the parameter used in the output of the shader and will be copied with a staging vector
-
-    // -------------- ALWAYS OWNED RESOURCES
-    std::shared_ptr<Tensor> mTensorOutputStaging; ///< Staging temporary tensor user do to copy the output of the tensor
 };

 } // End namespace kp
--- a/src/include/kompute/operations/OpBase.hpp
+++ b/src/include/kompute/operations/OpBase.hpp
@ -69,7 +69,7 @@ class OpBase
                if (tensor && tensor->isInit()) {
                    tensor->freeMemoryDestroyGPUResources();
                } else {
-                    SPDLOG_ERROR("Kompute OpBase expected to free "
+                    SPDLOG_WARN("Kompute OpBase expected to free "
                                  "tensor but has already been freed.");
                }
            }
--- a/src/include/kompute/operations/OpTensorCreate.hpp
+++ b/src/include/kompute/operations/OpTensorCreate.hpp
@ -69,8 +69,6 @@ class OpTensorCreate : public OpBase


  private:
-    // Never owned resources
-    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
 };

 } // End namespace kp
--- a/src/include/kompute/operations/OpTensorSyncDevice.hpp
+++ b/src/include/kompute/operations/OpTensorSyncDevice.hpp
@ -9,7 +9,7 @@
 namespace kp {

 /**
-    Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the record (as opposed to during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging.
+    Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a record operation for the memory to be synced into GPU memory, which means that the operation will be done in sync with GPU commands. For TensorTypes::eHost it will only map the data into host memory, which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStorage.
 */
 class OpTensorSyncDevice : public OpBase
 {
@ -35,12 +35,12 @@ class OpTensorSyncDevice : public OpBase
    ~OpTensorSyncDevice() override;

    /**
-     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging. For staging tensors in host memory, the map is performed during the init function.
+     * Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element.
     */
    void init() override;

    /**
-     * For device tensors, it records the copy command to the device tensor from the temporary staging tensor.
+     * For device tensors, it records the copy command for the tensor to copy the data from its staging to device memory.
     */
    void record() override;

@ -55,8 +55,6 @@ class OpTensorSyncDevice : public OpBase
    virtual void postEval() override;

  private:
-    // Never owned resources
-    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
 };

 } // End namespace kp
--- a/src/include/kompute/operations/OpTensorSyncLocal.hpp
+++ b/src/include/kompute/operations/OpTensorSyncLocal.hpp
@ -9,7 +9,7 @@
 namespace kp {

 /**
-    Operation that syncs tensor's local data by mapping the data from device memory into the local vector. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the postSubmit (there will be no copy during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging.
+    Operation that syncs tensor's local memory by mapping device data into the local CPU memory. For TensorTypes::eDevice it will use a record operation to copy the data from device memory into the tensor's staging memory, which means that the operation will be done in sync with GPU commands. For TensorTypes::eHost it will only map the data from host memory into the local vector, which will happen during postEval after the recorded commands have been executed. This operation won't have any effect on TensorTypes::eStorage.
 */
 class OpTensorSyncLocal : public OpBase
 {
@ -30,17 +30,17 @@ class OpTensorSyncLocal : public OpBase
                   std::vector<std::shared_ptr<Tensor>> tensors);

    /**
-     * Default destructor. This class manages the memory of the staging tensors it owns but these are released in the postSubmit, before it arrives to the destructor.
+     * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release.
     */
    ~OpTensorSyncLocal() override;

    /**
-     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging.
+     * Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element.
     */
    void init() override;

    /**
-     * For device tensors, it records the copy command into the staging tensor from the device tensor.
+     * For device tensors, it records the copy command for the tensor to copy the data from its device to staging memory.
     */
    void record() override;

@ -56,8 +56,6 @@ class OpTensorSyncLocal : public OpBase


  private:
-    // Never owned resources
-    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
 };

 } // End namespace kp
--- a/test/TestLogisticRegression.cpp
+++ b/test/TestLogisticRegression.cpp
@ -105,12 +105,12 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy)
    std::shared_ptr<kp::Tensor> y{ new kp::Tensor({ 0, 0, 0, 1, 1 }) };

    std::shared_ptr<kp::Tensor> wIn{ new kp::Tensor(
-      wInVec, kp::Tensor::TensorTypes::eStaging) };
+      wInVec, kp::Tensor::TensorTypes::eHost) };
    std::shared_ptr<kp::Tensor> wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
    std::shared_ptr<kp::Tensor> wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };

    std::shared_ptr<kp::Tensor> bIn{ new kp::Tensor(
-      bInVec, kp::Tensor::TensorTypes::eStaging) };
+      bInVec, kp::Tensor::TensorTypes::eHost) };
    std::shared_ptr<kp::Tensor> bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };

    std::shared_ptr<kp::Tensor> lOut{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
--- a/test/TestManager.cpp
+++ b/test/TestManager.cpp
@ -126,7 +126,7 @@ TEST(TestManager, TestCreateInitTensor)
    EXPECT_EQ(tensorB->data(), std::vector<float>({ 0, 1, 2 }));

    std::shared_ptr<kp::Tensor> tensorC =
-      mgr.buildTensor({ 0, 0, 0 }, kp::Tensor::TensorTypes::eStaging);
+      mgr.buildTensor({ 0, 0, 0 }, kp::Tensor::TensorTypes::eHost);

    mgr.evalOpDefault<kp::OpTensorCopy>({ tensorA, tensorC });

--- a/test/TestOpTensorCopy.cpp
+++ b/test/TestOpTensorCopy.cpp
@ -58,7 +58,7 @@ TEST(TestOpTensorCopy, CopyDeviceToDeviceTensorMulti)
    EXPECT_EQ(tensorA->data(), tensorC->data());
 }

-TEST(TestOpTensorCopy, CopyDeviceToStagingTensor)
+TEST(TestOpTensorCopy, CopyDeviceToHostTensor)
 {

    kp::Manager mgr;
@ -68,7 +68,7 @@ TEST(TestOpTensorCopy, CopyDeviceToStagingTensor)

    std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor(testVecA) };
    std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor(
-      testVecB, kp::Tensor::TensorTypes::eStaging) };
+      testVecB, kp::Tensor::TensorTypes::eHost) };

    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });

@ -84,7 +84,7 @@ TEST(TestOpTensorCopy, CopyDeviceToStagingTensor)
    EXPECT_EQ(tensorA->data(), tensorB->data());
 }

-TEST(TestOpTensorCopy, CopyStagingToDeviceTensor)
+TEST(TestOpTensorCopy, CopyHostToDeviceTensor)
 {

    kp::Manager mgr;
@ -93,7 +93,7 @@ TEST(TestOpTensorCopy, CopyStagingToDeviceTensor)
    std::vector<float> testVecB{ 0, 0, 0 };

    std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor(
-      testVecA, kp::Tensor::TensorTypes::eStaging) };
+      testVecA, kp::Tensor::TensorTypes::eHost) };
    std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor(testVecB) };

    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
@ -110,7 +110,7 @@ TEST(TestOpTensorCopy, CopyStagingToDeviceTensor)
    EXPECT_EQ(tensorA->data(), tensorB->data());
 }

-TEST(TestOpTensorCopy, CopyStagingToStagingTensor)
+TEST(TestOpTensorCopy, CopyHostToHostTensor)
 {

    kp::Manager mgr;
@ -119,9 +119,9 @@ TEST(TestOpTensorCopy, CopyStagingToStagingTensor)
    std::vector<float> testVecB{ 0, 0, 0 };

    std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor(
-      testVecA, kp::Tensor::TensorTypes::eStaging) };
+      testVecA, kp::Tensor::TensorTypes::eHost) };
    std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor(
-      testVecB, kp::Tensor::TensorTypes::eStaging) };
+      testVecB, kp::Tensor::TensorTypes::eHost) };

    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });

@ -145,7 +145,7 @@ TEST(TestOpTensorCopy, SingleTensorShouldFail)
    std::vector<float> testVecA{ 9, 8, 7 };

    std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor(
-      testVecA, kp::Tensor::TensorTypes::eStaging) };
+      testVecA, kp::Tensor::TensorTypes::eHost) };

    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA });

--- a/test/TestOpTensorCreate.cpp
+++ b/test/TestOpTensorCreate.cpp
@ -114,7 +114,6 @@ TEST(TestOpTensorCreate, NoErrorIfTensorFreedBefore)
    EXPECT_FALSE(tensorB->isInit());
 }

-
 TEST(TestOpTensorCreate, ExceptionOnZeroSizeTensor)
 {
    std::vector<float> testVecA;
@ -123,11 +122,11 @@ TEST(TestOpTensorCreate, ExceptionOnZeroSizeTensor)

    kp::Manager mgr;

-    try{
+    try {
        mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA });
-    } catch( const std::runtime_error& err ) {
-         // check exception
-        ASSERT_TRUE( std::string(err.what()).find("zero-sized") != std::string::npos );
+    } catch (const std::runtime_error& err) {
+        // check exception
+        ASSERT_TRUE(std::string(err.what()).find("zero-sized") !=
+                    std::string::npos);
    }
-
 }
--- a/test/TestTensor.cpp
+++ b/test/TestTensor.cpp
@ -17,9 +17,9 @@ TEST(TestTensor, CopyFromHostData)
    std::vector<float> vecB{ 0, 0, 0 };

    std::shared_ptr<kp::Tensor> tensorA =
-      std::make_shared<kp::Tensor>(vecA, kp::Tensor::TensorTypes::eStaging);
+      std::make_shared<kp::Tensor>(vecA, kp::Tensor::TensorTypes::eHost);
    std::shared_ptr<kp::Tensor> tensorB =
-      std::make_shared<kp::Tensor>(vecB, kp::Tensor::TensorTypes::eStaging);
+      std::make_shared<kp::Tensor>(vecB, kp::Tensor::TensorTypes::eHost);

    kp::Manager mgr;