From b61f3f22975621a1f0aca8845a4523ea428ad623 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 7 Feb 2021 22:00:58 +0000
Subject: [PATCH 1/6] Added initial iteration of tensor with two memory
 components

---
 src/Tensor.cpp                 | 184 +++++++++++++++++++++++----------
 src/include/kompute/Tensor.hpp |  24 +++--
 2 files changed, 144 insertions(+), 64 deletions(-)
diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index 63e667053..b26132e9d 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -47,7 +47,7 @@ Tensor::init(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
 
     this->mIsInit = true;
 
-    this->createBuffer();
+    this->allocateMemoryCreateGPUResources();
 }
 
 std::vector<float>&
@@ -89,7 +89,7 @@ Tensor::tensorType()
 bool
 Tensor::isInit()
 {
-    return this->mIsInit && this->mBuffer && this->mMemory;
+    return this->mIsInit && this->mPrimaryBuffer && this->mPrimaryMemory;
 }
 
 void
@@ -120,7 +120,7 @@ Tensor::recordCopyFrom(std::shared_ptr<vk::CommandBuffer> commandBuffer,
     SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);
 
     commandBuffer->copyBuffer(
-      *copyFromTensor->mBuffer, *this->mBuffer, copyRegion);
+      *copyFromTensor->mPrimaryBuffer, *this->mPrimaryBuffer, copyRegion);
 
     if (createBarrier) {
         // Buffer to ensure wait until data is copied to staging buffer
@@ -145,7 +145,7 @@ Tensor::recordBufferMemoryBarrier(
     vk::DeviceSize bufferSize = this->memorySize();
 
     vk::BufferMemoryBarrier bufferMemoryBarrier;
-    bufferMemoryBarrier.buffer = *this->mBuffer;
+    bufferMemoryBarrier.buffer = *this->mPrimaryBuffer;
     bufferMemoryBarrier.size = bufferSize;
     bufferMemoryBarrier.srcAccessMask = srcAccessMask;
     bufferMemoryBarrier.dstAccessMask = dstAccessMask;
@@ -164,7 +164,7 @@ vk::DescriptorBufferInfo
 Tensor::constructDescriptorBufferInfo()
 {
     vk::DeviceSize bufferSize = this->memorySize();
-    return vk::DescriptorBufferInfo(*this->mBuffer,
+    return vk::DescriptorBufferInfo(*this->mPrimaryBuffer,
                                     0, // offset
                                     bufferSize);
 }
@@ -174,20 +174,22 @@ Tensor::mapDataFromHostMemory()
 {
     SPDLOG_DEBUG("Kompute Tensor mapping data from host buffer");
 
-    if (this->mTensorType != TensorTypes::eStaging) {
-        SPDLOG_ERROR(
-          "Mapping tensor data manually from DEVICE buffer instead of "
-          "using record GPU command with staging buffer");
-        return;
+    std::shared_ptr<vk::DeviceMemory> hostVisibleMemory = nullptr;
+
+    if (this->mTensorType == TensorTypes::eHost) {
+        hostVisibleMemory = this->mPrimaryMemory;
+    }
+    else {
+        hostVisibleMemory = this->mStagingMemory;
     }
 
     vk::DeviceSize bufferSize = this->memorySize();
     void* mapped = this->mDevice->mapMemory(
-      *this->mMemory, 0, bufferSize, vk::MemoryMapFlags());
-    vk::MappedMemoryRange mappedMemoryRange(*this->mMemory, 0, bufferSize);
+      *hostVisibleMemory, 0, bufferSize, vk::MemoryMapFlags());
+    vk::MappedMemoryRange mappedMemoryRange(*hostVisibleMemory, 0, bufferSize);
     this->mDevice->invalidateMappedMemoryRanges(mappedMemoryRange);
     memcpy(this->mData.data(), mapped, bufferSize);
-    this->mDevice->unmapMemory(*this->mMemory);
+    this->mDevice->unmapMemory(*hostVisibleMemory);
 }
 
 void
@@ -196,24 +198,27 @@ Tensor::mapDataIntoHostMemory()
 
     SPDLOG_DEBUG("Kompute Tensor local mapping tensor data to host buffer");
 
-    if (this->mTensorType != TensorTypes::eStaging) {
-        SPDLOG_ERROR("Mapping tensor data manually to DEVICE memory instead of "
-                     "using record GPU command with staging buffer");
-        return;
+    std::shared_ptr<vk::DeviceMemory> hostVisibleMemory = nullptr;
+
+    if (this->mTensorType == TensorTypes::eHost) {
+        hostVisibleMemory = this->mPrimaryMemory;
+    }
+    else {
+        hostVisibleMemory = this->mStagingMemory;
     }
 
     vk::DeviceSize bufferSize = this->memorySize();
 
     void* mapped = this->mDevice->mapMemory(
-      *this->mMemory, 0, bufferSize, vk::MemoryMapFlags());
+      *hostVisibleMemory, 0, bufferSize, vk::MemoryMapFlags());
     memcpy(mapped, this->mData.data(), bufferSize);
-    vk::MappedMemoryRange mappedRange(*this->mMemory, 0, bufferSize);
+    vk::MappedMemoryRange mappedRange(*hostVisibleMemory, 0, bufferSize);
     this->mDevice->flushMappedMemoryRanges(1, &mappedRange);
-    this->mDevice->unmapMemory(*this->mMemory);
+    this->mDevice->unmapMemory(*hostVisibleMemory);
 }
 
 vk::BufferUsageFlags
-Tensor::getBufferUsageFlags()
+Tensor::getPrimaryBufferUsageFlags()
 {
     switch (this->mTensorType) {
         case TensorTypes::eDevice:
@@ -221,8 +226,9 @@ Tensor::getBufferUsageFlags()
                    vk::BufferUsageFlagBits::eTransferSrc |
                    vk::BufferUsageFlagBits::eTransferDst;
             break;
-        case TensorTypes::eStaging:
-            return vk::BufferUsageFlagBits::eTransferSrc |
+        case TensorTypes::eHost:
+            return vk::BufferUsageFlagBits::eStorageBuffer |
+                   vk::BufferUsageFlagBits::eTransferSrc |
                    vk::BufferUsageFlagBits::eTransferDst;
             break;
         case TensorTypes::eStorage:
@@ -234,13 +240,13 @@ Tensor::getBufferUsageFlags()
 }
 
 vk::MemoryPropertyFlags
-Tensor::getMemoryPropertyFlags()
+Tensor::getPrimaryMemoryPropertyFlags()
 {
     switch (this->mTensorType) {
         case TensorTypes::eDevice:
             return vk::MemoryPropertyFlagBits::eDeviceLocal;
             break;
-        case TensorTypes::eStaging:
+        case TensorTypes::eHost:
             return vk::MemoryPropertyFlagBits::eHostVisible;
             break;
         case TensorTypes::eStorage:
@@ -251,8 +257,33 @@ Tensor::getMemoryPropertyFlags()
     }
 }
 
+vk::BufferUsageFlags
+Tensor::getStagingBufferUsageFlags()
+{
+    switch (this->mTensorType) {
+        case TensorTypes::eDevice:
+            return vk::BufferUsageFlagBits::eTransferSrc |
+                   vk::BufferUsageFlagBits::eTransferDst;
+            break;
+        default:
+            throw std::runtime_error("Kompute Tensor invalid tensor type");
+    }
+}
+
+vk::MemoryPropertyFlags
+Tensor::getStagingMemoryPropertyFlags()
+{
+    switch (this->mTensorType) {
+        case TensorTypes::eDevice:
+            return vk::MemoryPropertyFlagBits::eDeviceLocal;
+            break;
+        default:
+            throw std::runtime_error("Kompute Tensor invalid tensor type");
+    }
+}
+
 void
-Tensor::createBuffer()
+Tensor::allocateMemoryCreateGPUResources()
 {
     SPDLOG_DEBUG("Kompute Tensor creating buffer");
 
@@ -268,44 +299,64 @@ Tensor::createBuffer()
         throw std::runtime_error("Kompute Tensor device is null");
     }
 
+    this->mPrimaryBuffer = std::make_shared<vk::Buffer>();
+    this->createBuffer(this->mPrimaryBuffer, this->getPrimaryBufferUsageFlags());
+    this->mFreePrimaryBuffer = true;
+    this->allocateBindMemory(this->mPrimaryBuffer, this->mPrimaryMemory, this->getPrimaryMemoryPropertyFlags());
+    this->mFreePrimaryMemory = true;
+
+    if (this->mTensorType == TensorTypes::eDevice) {
+        this->mStagingBuffer = std::make_shared<vk::Buffer>();
+        this->createBuffer(this->mStagingBuffer, this->getStagingBufferUsageFlags());
+        this->mFreeStagingBuffer = true;
+        this->allocateBindMemory(this->mStagingBuffer, this->mStagingMemory, this->getStagingMemoryPropertyFlags());
+        this->mFreeStagingMemory = true;
+    }
+
+    SPDLOG_DEBUG("Kompute Tensor buffer & memory creation successful");
+}
+
+void
+Tensor::createBuffer(std::shared_ptr<vk::Buffer> buffer, vk::BufferUsageFlags bufferUsageFlags) {
+
 
-    vk::BufferUsageFlags usageFlags = this->getBufferUsageFlags();
     vk::DeviceSize bufferSize = this->memorySize();
+
     if(bufferSize<1){
         throw std::runtime_error("Kompute Tensor attempted to create a zero-sized buffer");
     }
     
-    this->mFreeBuffer = true;
 
     SPDLOG_DEBUG("Kompute Tensor creating buffer with memory size: {}, and "
                  "usage flags: {}",
                  bufferSize,
                  vk::to_string(usageFlags));
 
+    // TODO: Explore having concurrent sharing mode (with option)
     vk::BufferCreateInfo bufferInfo(vk::BufferCreateFlags(),
                                     bufferSize,
-                                    usageFlags,
+                                    bufferUsageFlags,
                                     vk::SharingMode::eExclusive);
 
-    this->mBuffer = std::make_shared<vk::Buffer>();
-    this->mDevice->createBuffer(&bufferInfo, nullptr, this->mBuffer.get());
+    this->mDevice->createBuffer(&bufferInfo, nullptr, buffer.get());
 
-    SPDLOG_DEBUG("Kompute Tensor buffer created now creating memory");
+}
+
+void
+Tensor::allocateBindMemory(std::shared_ptr<vk::Buffer> buffer, std::shared_ptr<vk::DeviceMemory> memory, vk::MemoryPropertyFlags memoryPropertyFlags) {
+
+    SPDLOG_DEBUG("Kompute Tensor allocating and binding memory");
 
     vk::PhysicalDeviceMemoryProperties memoryProperties =
       this->mPhysicalDevice->getMemoryProperties();
 
     vk::MemoryRequirements memoryRequirements =
-      this->mDevice->getBufferMemoryRequirements(*this->mBuffer);
-
-    vk::MemoryPropertyFlags memoryPropertyFlags =
-      this->getMemoryPropertyFlags();
+      this->mDevice->getBufferMemoryRequirements(*buffer);
 
     uint32_t memoryTypeIndex = -1;
     for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) {
         if (memoryRequirements.memoryTypeBits & (1 << i)) {
-            if ((memoryProperties.memoryTypes[i].propertyFlags &
-                 memoryPropertyFlags) == memoryPropertyFlags) {
+            if (((memoryProperties.memoryTypes[i]).propertyFlags & memoryPropertyFlags) == memoryPropertyFlags) {
                 memoryTypeIndex = i;
                 break;
             }
@@ -316,8 +367,6 @@ Tensor::createBuffer()
           "Memory type index for buffer creation not found");
     }
 
-    this->mFreeMemory = true;
-
     SPDLOG_DEBUG(
       "Kompute Tensor allocating memory index: {}, size {}, flags: {}",
       memoryTypeIndex,
@@ -327,13 +376,10 @@ Tensor::createBuffer()
     vk::MemoryAllocateInfo memoryAllocateInfo(memoryRequirements.size,
                                               memoryTypeIndex);
 
-    this->mMemory = std::make_shared<vk::DeviceMemory>();
     this->mDevice->allocateMemory(
-      &memoryAllocateInfo, nullptr, this->mMemory.get());
+      &memoryAllocateInfo, nullptr, memory.get());
 
-    this->mDevice->bindBufferMemory(*this->mBuffer, *this->mMemory, 0);
-
-    SPDLOG_DEBUG("Kompute Tensor buffer & memory creation successful");
+    this->mDevice->bindBufferMemory(*buffer, *memory, 0);
 }
 
 void
@@ -349,27 +395,53 @@ Tensor::freeMemoryDestroyGPUResources()
         return;
     }
 
-    if (this->mFreeBuffer) {
-        if (!this->mBuffer) {
+    if (this->mFreePrimaryBuffer) {
+        if (!this->mPrimaryBuffer) {
             SPDLOG_ERROR(
-              "Kompose Tensor expected to free buffer but got null buffer");
+              "Kompose Tensor expected to destroy primary buffer but got null buffer");
         } else {
-            SPDLOG_DEBUG("Kompose Tensor destroying buffer");
+            SPDLOG_DEBUG("Kompose Tensor destroying primary buffer");
             this->mDevice->destroy(
-              *this->mBuffer,
+              *this->mPrimaryBuffer,
               (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-            this->mBuffer = nullptr;
+            this->mPrimaryBuffer = nullptr;
         }
     }
 
-    if (this->mFreeMemory) {
-        if (!this->mMemory) {
+    if (this->mFreeStagingBuffer) {
+        if (!this->mStagingBuffer) {
             SPDLOG_ERROR(
-              "Kompose Tensor expected to free buffer but got null memory");
+              "Kompose Tensor expected to destroy staging buffer but got null buffer");
         } else {
-            SPDLOG_DEBUG("Kompose Tensor freeing memory");
+            SPDLOG_DEBUG("Kompose Tensor destroying staging buffer");
+            this->mDevice->destroy(
+              *this->mStagingBuffer,
+              (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+            this->mStagingBuffer = nullptr;
+        }
+    }
+
+    if (this->mFreePrimaryMemory) {
+        if (!this->mPrimaryMemory) {
+            SPDLOG_ERROR(
+              "Kompose Tensor expected to free primary memory but got null memory");
+        } else {
+            SPDLOG_DEBUG("Kompose Tensor freeing primary memory");
             this->mDevice->freeMemory(
-              *this->mMemory,
+              *this->mPrimaryMemory,
+              (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+            this->mDevice = nullptr;
+        }
+    }
+
+    if (this->mFreeStagingMemory) {
+        if (!this->mStagingMemory) {
+            SPDLOG_ERROR(
+              "Kompose Tensor expected to free staging memory but got null memory");
+        } else {
+            SPDLOG_DEBUG("Kompose Tensor freeing staging memory");
+            this->mDevice->freeMemory(
+              *this->mStagingMemory,
               (vk::Optional<const vk::AllocationCallbacks>)nullptr);
             this->mDevice = nullptr;
         }
diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp
index d7b7e8f8e..7ab6f4e02 100644
--- a/src/include/kompute/Tensor.hpp
+++ b/src/include/kompute/Tensor.hpp
@@ -26,7 +26,7 @@ class Tensor
     enum class TensorTypes
     {
         eDevice = 0,  ///< Type is device memory, source and destination
-        eStaging = 1, ///< Type is host memory, source and destination
+        eHost = 1, ///< Type is host memory, source and destination
         eStorage = 2, ///< Type is Device memory (only)
     };
 
@@ -173,10 +173,14 @@ class Tensor
     std::shared_ptr<vk::Device> mDevice;
 
     // -------------- OPTIONALLY OWNED RESOURCES
-    std::shared_ptr<vk::Buffer> mBuffer;
-    bool mFreeBuffer;
-    std::shared_ptr<vk::DeviceMemory> mMemory;
-    bool mFreeMemory;
+    std::shared_ptr<vk::Buffer> mPrimaryBuffer;
+    bool mFreePrimaryBuffer;
+    std::shared_ptr<vk::Buffer> mStagingBuffer;
+    bool mFreeStagingBuffer;
+    std::shared_ptr<vk::DeviceMemory> mPrimaryMemory;
+    bool mFreePrimaryMemory;
+    std::shared_ptr<vk::DeviceMemory> mStagingMemory;
+    bool mFreeStagingMemory;
 
     // -------------- ALWAYS OWNED RESOURCES
     std::vector<float> mData;
@@ -186,11 +190,15 @@ class Tensor
     std::array<uint32_t, KP_MAX_DIM_SIZE> mShape;
     bool mIsInit = false;
 
-    void createBuffer(); // Creates the vulkan buffer
+    void allocateMemoryCreateGPUResources(); // Creates the vulkan buffer
+    void createBuffer(std::shared_ptr<vk::Buffer> buffer, vk::BufferUsageFlags bufferUsageFlags);
+    void allocateBindMemory(std::shared_ptr<vk::Buffer> buffer, std::shared_ptr<vk::DeviceMemory> memory, vk::MemoryPropertyFlags memoryPropertyFlags);
 
     // Private util functions
-    vk::BufferUsageFlags getBufferUsageFlags();
-    vk::MemoryPropertyFlags getMemoryPropertyFlags();
+    vk::BufferUsageFlags getPrimaryBufferUsageFlags();
+    vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags();
+    vk::BufferUsageFlags getStagingBufferUsageFlags();
+    vk::MemoryPropertyFlags getStagingMemoryPropertyFlags();
     uint64_t memorySize();
 };
 

From 04853df4697710f8a8334a1fc4238d32ca0de94a Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Mon, 8 Feb 2021 07:17:54 +0000
Subject: [PATCH 2/6] Updated Tensor Memory to hold staging within class

---
 src/OpAlgoLhsRhsOut.cpp                       | 15 ++---
 src/OpTensorCreate.cpp                        | 33 +----------
 src/OpTensorSyncDevice.cpp                    | 27 ++-------
 src/OpTensorSyncLocal.cpp                     | 26 ++-------
 src/Tensor.cpp                                | 57 +++++++++++++++----
 src/include/kompute/Tensor.hpp                | 29 ++++++++--
 .../kompute/operations/OpAlgoLhsRhsOut.hpp    |  3 -
 src/include/kompute/operations/OpBase.hpp     |  2 +-
 .../kompute/operations/OpTensorCreate.hpp     |  2 -
 .../kompute/operations/OpTensorSyncDevice.hpp |  8 +--
 .../kompute/operations/OpTensorSyncLocal.hpp  | 10 ++--
 11 files changed, 97 insertions(+), 115 deletions(-)

diff --git a/src/OpAlgoLhsRhsOut.cpp b/src/OpAlgoLhsRhsOut.cpp
index ab759fed8..622a4f431 100644
--- a/src/OpAlgoLhsRhsOut.cpp
+++ b/src/OpAlgoLhsRhsOut.cpp
@@ -65,11 +65,6 @@ OpAlgoLhsRhsOut::init()
           " Output: " + std::to_string(this->mTensorOutput->size()));
     }
 
-    this->mTensorOutputStaging = std::make_shared<Tensor>(
-      this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
-
-    this->mTensorOutputStaging->init(this->mPhysicalDevice, this->mDevice);
-
     SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
 
     std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
@@ -110,8 +105,10 @@ OpAlgoLhsRhsOut::record()
       vk::PipelineStageFlagBits::eComputeShader,
       vk::PipelineStageFlagBits::eTransfer);
 
-    this->mTensorOutputStaging->recordCopyFrom(
-      this->mCommandBuffer, this->mTensorOutput, true);
+    if (this->mTensorOutput->tensorType() == Tensor::TensorTypes::eDevice) {
+        this->mTensorOutput->recordCopyFromDeviceToStaging(
+          this->mCommandBuffer, true);
+    }
 }
 
 void
@@ -119,9 +116,7 @@ OpAlgoLhsRhsOut::postEval()
 {
     SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");
 
-    this->mTensorOutputStaging->mapDataFromHostMemory();
-
-    this->mTensorOutput->setData(this->mTensorOutputStaging->data());
+    this->mTensorOutput->mapDataFromHostMemory();
 }
 
 }
diff --git a/src/OpTensorCreate.cpp b/src/OpTensorCreate.cpp
index ac9485baf..7918415e9 100644
--- a/src/OpTensorCreate.cpp
+++ b/src/OpTensorCreate.cpp
@@ -23,16 +23,6 @@ OpTensorCreate::OpTensorCreate(
 OpTensorCreate::~OpTensorCreate()
 {
     SPDLOG_DEBUG("Kompute OpTensorCreate destructor started");
-
-    SPDLOG_DEBUG("Kompute OpTensorCreate freeing staging tensors");
-    for (std::shared_ptr<Tensor> tensor : this->mStagingTensors) {
-        if (tensor && tensor->isInit()) {
-            tensor->freeMemoryDestroyGPUResources();
-        } else {
-            SPDLOG_ERROR("Kompute OpTensorCreate expected to free "
-                          "tensor but has already been freed.");
-        }
-    }
 }
 
 void
@@ -50,27 +40,10 @@ OpTensorCreate::init()
             throw std::runtime_error(
               "Kompute OpTensorCreate: Tensor has already been initialized");
         }
-        if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
-            tensor->init(this->mPhysicalDevice, this->mDevice);
-
-            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
-              tensor->data(), Tensor::TensorTypes::eStaging);
-
-            stagingTensor->init(this->mPhysicalDevice, this->mDevice);
-
-            stagingTensor->mapDataIntoHostMemory();
-
-            this->mStagingTensors.push_back(stagingTensor);
-
-        } else {
-
+        if (tensor->tensorType() != Tensor::TensorTypes::eStorage) {
             tensor->init(this->mPhysicalDevice, this->mDevice);
 
             tensor->mapDataIntoHostMemory();
-
-            // We push a nullptr when no staging tensor is needed to match
-            // index number in array to have one to one mapping with tensors
-            this->mStagingTensors.push_back(nullptr);
         }
     }
 }
@@ -82,8 +55,8 @@ OpTensorCreate::record()
 
     for (size_t i = 0; i < this->mTensors.size(); i++) {
         if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mTensors[i]->recordCopyFrom(
-              this->mCommandBuffer, this->mStagingTensors[i], false);
+            this->mTensors[i]->recordCopyFromStagingToDevice(
+              this->mCommandBuffer, false);
         }
     }
 }
diff --git a/src/OpTensorSyncDevice.cpp b/src/OpTensorSyncDevice.cpp
index b975d2a9b..340786eb5 100644
--- a/src/OpTensorSyncDevice.cpp
+++ b/src/OpTensorSyncDevice.cpp
@@ -41,25 +41,11 @@ OpTensorSyncDevice::init()
                                      "has not been initialized");
         }
         if (tensor->tensorType() == Tensor::TensorTypes::eStorage) {
-            throw std::runtime_error(
+            SPDLOG_WARN(
               "Kompute OpTensorSyncLocal tensor parameter is of type "
               "TensorTypes::eStorage and hence cannot be used to receive or "
               "pass data.");
         }
-        if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
-
-            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
-              tensor->data(), Tensor::TensorTypes::eStaging);
-
-            stagingTensor->init(this->mPhysicalDevice, this->mDevice);
-
-            this->mStagingTensors.push_back(stagingTensor);
-
-        } else {
-            // We push a nullptr when no staging tensor is needed to match
-            // index number in array to have one to one mapping with tensors
-            this->mStagingTensors.push_back(nullptr);
-        }
     }
 }
 
@@ -70,8 +56,8 @@ OpTensorSyncDevice::record()
 
     for (size_t i = 0; i < this->mTensors.size(); i++) {
         if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mTensors[i]->recordCopyFrom(
-              this->mCommandBuffer, this->mStagingTensors[i], false);
+            this->mTensors[i]->recordCopyFromStagingToDevice(
+              this->mCommandBuffer, false);
         }
     }
 }
@@ -83,11 +69,8 @@ OpTensorSyncDevice::preEval()
 
     // Performing sync of data as eval can be called multiple times with same op
     for (size_t i = 0; i < this->mTensors.size(); i++) {
-        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mStagingTensors[i]->setData(this->mTensors[i]->data());
-            this->mStagingTensors[i]->mapDataIntoHostMemory();
-        } else {
-            this->mTensors[i]->mapDataFromHostMemory();
+        if (this->mTensors[i]->tensorType() != Tensor::TensorTypes::eStorage) {
+            this->mTensors[i]->mapDataIntoHostMemory();
         }
     }
 }
diff --git a/src/OpTensorSyncLocal.cpp b/src/OpTensorSyncLocal.cpp
index 24a737bdd..09d966e12 100644
--- a/src/OpTensorSyncLocal.cpp
+++ b/src/OpTensorSyncLocal.cpp
@@ -41,26 +41,11 @@ OpTensorSyncLocal::init()
               "Kompute OpTensorSyncLocal: Tensor has not been initialized");
         }
         if (tensor->tensorType() == Tensor::TensorTypes::eStorage) {
-            throw std::runtime_error(
+            SPDLOG_WARN(
               "Kompute OpTensorSyncLocal tensor parameter is of type "
               "TensorTypes::eStorage and hence cannot be used to receive or "
               "pass data.");
         }
-        if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
-
-            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
-              tensor->data(), Tensor::TensorTypes::eStaging);
-
-            stagingTensor->init(this->mPhysicalDevice, this->mDevice);
-
-            this->mStagingTensors.push_back(stagingTensor);
-
-        } else {
-
-            // We push a nullptr when no staging tensor is needed to match
-            // index number in array to have one to one mapping with tensors
-            this->mStagingTensors.push_back(nullptr);
-        }
     }
 }
 
@@ -71,8 +56,8 @@ OpTensorSyncLocal::record()
 
     for (size_t i = 0; i < this->mTensors.size(); i++) {
         if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mStagingTensors[i]->recordCopyFrom(
-              this->mCommandBuffer, this->mTensors[i], true);
+            this->mTensors[i]->recordCopyFromDeviceToStaging(
+              this->mCommandBuffer, true);
         }
     }
 }
@@ -90,10 +75,7 @@ OpTensorSyncLocal::postEval()
 
     SPDLOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local");
     for (size_t i = 0; i < this->mTensors.size(); i++) {
-        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mStagingTensors[i]->mapDataFromHostMemory();
-            this->mTensors[i]->setData(this->mStagingTensors[i]->data());
-        } else {
+        if (this->mTensors[i]->tensorType() != Tensor::TensorTypes::eStorage) {
             this->mTensors[i]->mapDataFromHostMemory();
         }
     }
diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index b26132e9d..726723def 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -107,20 +107,51 @@ Tensor::recordCopyFrom(std::shared_ptr<vk::CommandBuffer> commandBuffer,
                        std::shared_ptr<Tensor> copyFromTensor,
                        bool createBarrier)
 {
-    SPDLOG_DEBUG("Kompute Tensor recordCopyFrom called");
 
-    if (!this->mIsInit || !copyFromTensor->mIsInit) {
-        throw std::runtime_error(
-          "Kompute Tensor attempted to run createBuffer without init");
-    }
+    vk::DeviceSize bufferSize(this->memorySize());
+    vk::BufferCopy copyRegion(0, 0, bufferSize);
 
+    SPDLOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", bufferSize);
+
+    this->copyBuffer(commandBuffer, copyFromTensor->mPrimaryBuffer, this->mPrimaryBuffer, bufferSize, copyRegion, createBarrier);
+
+}
+
+void
+Tensor::recordCopyFromStagingToDevice(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                       bool createBarrier)
+{
     vk::DeviceSize bufferSize(this->memorySize());
     vk::BufferCopy copyRegion(0, 0, bufferSize);
 
     SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);
 
+    this->copyBuffer(commandBuffer, this->mStagingBuffer, this->mPrimaryBuffer, bufferSize, copyRegion, createBarrier);
+}
+
+void
+Tensor::recordCopyFromDeviceToStaging(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                       bool createBarrier)
+{
+    vk::DeviceSize bufferSize(this->memorySize());
+    vk::BufferCopy copyRegion(0, 0, bufferSize);
+
+    SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);
+
+    this->copyBuffer(commandBuffer, this->mPrimaryBuffer, this->mStagingBuffer, bufferSize, copyRegion, createBarrier);
+
+}
+
+void
+Tensor::copyBuffer(std::shared_ptr<vk::CommandBuffer> commandBuffer, std::shared_ptr<vk::Buffer> bufferFrom, std::shared_ptr<vk::Buffer> bufferTo, vk::DeviceSize bufferSize, vk::BufferCopy copyRegion, bool createBarrier) {
+
+    if (!this->mIsInit) {
+        throw std::runtime_error(
+          "Kompute Tensor attempted to run copyBuffer without init");
+    }
+
     commandBuffer->copyBuffer(
-      *copyFromTensor->mPrimaryBuffer, *this->mPrimaryBuffer, copyRegion);
+      *bufferFrom, *bufferTo, copyRegion);
 
     if (createBarrier) {
         // Buffer to ensure wait until data is copied to staging buffer
@@ -275,7 +306,7 @@ Tensor::getStagingMemoryPropertyFlags()
 {
     switch (this->mTensorType) {
         case TensorTypes::eDevice:
-            return vk::MemoryPropertyFlagBits::eDeviceLocal;
+            return vk::MemoryPropertyFlagBits::eHostVisible;
             break;
         default:
             throw std::runtime_error("Kompute Tensor invalid tensor type");
@@ -299,16 +330,22 @@ Tensor::allocateMemoryCreateGPUResources()
         throw std::runtime_error("Kompute Tensor device is null");
     }
 
+    SPDLOG_DEBUG("Kompute Tensor creating primary buffer and memory");
+
     this->mPrimaryBuffer = std::make_shared<vk::Buffer>();
     this->createBuffer(this->mPrimaryBuffer, this->getPrimaryBufferUsageFlags());
     this->mFreePrimaryBuffer = true;
+    this->mPrimaryMemory = std::make_shared<vk::DeviceMemory>();
     this->allocateBindMemory(this->mPrimaryBuffer, this->mPrimaryMemory, this->getPrimaryMemoryPropertyFlags());
     this->mFreePrimaryMemory = true;
 
     if (this->mTensorType == TensorTypes::eDevice) {
+        SPDLOG_DEBUG("Kompute Tensor creating staging buffer and memory");
+
         this->mStagingBuffer = std::make_shared<vk::Buffer>();
         this->createBuffer(this->mStagingBuffer, this->getStagingBufferUsageFlags());
         this->mFreeStagingBuffer = true;
+        this->mStagingMemory = std::make_shared<vk::DeviceMemory>();
         this->allocateBindMemory(this->mStagingBuffer, this->mStagingMemory, this->getStagingMemoryPropertyFlags());
         this->mFreeStagingMemory = true;
     }
@@ -330,7 +367,7 @@ Tensor::createBuffer(std::shared_ptr<vk::Buffer> buffer, vk::BufferUsageFlags bu
     SPDLOG_DEBUG("Kompute Tensor creating buffer with memory size: {}, and "
                  "usage flags: {}",
                  bufferSize,
-                 vk::to_string(usageFlags));
+                 vk::to_string(bufferUsageFlags));
 
     // TODO: Explore having concurrent sharing mode (with option)
     vk::BufferCreateInfo bufferInfo(vk::BufferCreateFlags(),
@@ -430,7 +467,7 @@ Tensor::freeMemoryDestroyGPUResources()
             this->mDevice->freeMemory(
               *this->mPrimaryMemory,
               (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-            this->mDevice = nullptr;
+            this->mPrimaryMemory = nullptr;
         }
     }
 
@@ -443,7 +480,7 @@ Tensor::freeMemoryDestroyGPUResources()
             this->mDevice->freeMemory(
               *this->mStagingMemory,
               (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-            this->mDevice = nullptr;
+            this->mStagingMemory = nullptr;
         }
     }
 
diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp
index 7ab6f4e02..09ae89fd3 100644
--- a/src/include/kompute/Tensor.hpp
+++ b/src/include/kompute/Tensor.hpp
@@ -131,6 +131,26 @@ class Tensor
                         std::shared_ptr<Tensor> copyFromTensor,
                         bool createBarrier);
 
+    /**
+     * Records a copy from the internal staging memory to the device memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param createBarrier Whether to create a barrier that ensures the data is
+     * copied before further operations. Default is true.
+     */
+    void recordCopyFromStagingToDevice(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                        bool createBarrier);
+
+    /**
+     * Records a copy from the internal device memory to the staging memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param createBarrier Whether to create a barrier that ensures the data is
+     * copied before further operations. Default is true.
+     */
+    void recordCopyFromDeviceToStaging(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                        bool createBarrier);
+
     /**
      * Records the buffer memory barrier into the command buffer which
      * ensures that relevant data transfers are carried out correctly.
@@ -174,13 +194,13 @@ class Tensor
 
     // -------------- OPTIONALLY OWNED RESOURCES
     std::shared_ptr<vk::Buffer> mPrimaryBuffer;
-    bool mFreePrimaryBuffer;
+    bool mFreePrimaryBuffer = false;
     std::shared_ptr<vk::Buffer> mStagingBuffer;
-    bool mFreeStagingBuffer;
+    bool mFreeStagingBuffer = false;
     std::shared_ptr<vk::DeviceMemory> mPrimaryMemory;
-    bool mFreePrimaryMemory;
+    bool mFreePrimaryMemory = false;
     std::shared_ptr<vk::DeviceMemory> mStagingMemory;
-    bool mFreeStagingMemory;
+    bool mFreeStagingMemory = false;
 
     // -------------- ALWAYS OWNED RESOURCES
     std::vector<float> mData;
@@ -193,6 +213,7 @@ class Tensor
     void allocateMemoryCreateGPUResources(); // Creates the vulkan buffer
     void createBuffer(std::shared_ptr<vk::Buffer> buffer, vk::BufferUsageFlags bufferUsageFlags);
     void allocateBindMemory(std::shared_ptr<vk::Buffer> buffer, std::shared_ptr<vk::DeviceMemory> memory, vk::MemoryPropertyFlags memoryPropertyFlags);
+    void copyBuffer(std::shared_ptr<vk::CommandBuffer> commandBuffer, std::shared_ptr<vk::Buffer> bufferFrom, std::shared_ptr<vk::Buffer> bufferTo, vk::DeviceSize bufferSize, vk::BufferCopy copyRegion, bool createBarrier);
 
     // Private util functions
     vk::BufferUsageFlags getPrimaryBufferUsageFlags();
diff --git a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
index c826bd324..db79fa6eb 100644
--- a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
+++ b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
@@ -78,9 +78,6 @@ class OpAlgoLhsRhsOut : public OpAlgoBase
     std::shared_ptr<Tensor> mTensorLHS; ///< Reference to the parameter used in the left hand side equation of the shader
     std::shared_ptr<Tensor> mTensorRHS; ///< Reference to the parameter used in the right hand side equation of the shader
     std::shared_ptr<Tensor> mTensorOutput; ///< Reference to the parameter used in the output of the shader and will be copied with a staging vector
-
-    // -------------- ALWAYS OWNED RESOURCES
-    std::shared_ptr<Tensor> mTensorOutputStaging; ///< Staging temporary tensor user do to copy the output of the tensor
 };
 
 } // End namespace kp
diff --git a/src/include/kompute/operations/OpBase.hpp b/src/include/kompute/operations/OpBase.hpp
index dc0da487f..6e35df994 100644
--- a/src/include/kompute/operations/OpBase.hpp
+++ b/src/include/kompute/operations/OpBase.hpp
@@ -69,7 +69,7 @@ class OpBase
                 if (tensor && tensor->isInit()) {
                     tensor->freeMemoryDestroyGPUResources();
                 } else {
-                    SPDLOG_ERROR("Kompute OpBase expected to free "
+                    SPDLOG_WARN("Kompute OpBase expected to free "
                                   "tensor but has already been freed.");
                 }
             }
diff --git a/src/include/kompute/operations/OpTensorCreate.hpp b/src/include/kompute/operations/OpTensorCreate.hpp
index ca143b334..4b8c784cc 100644
--- a/src/include/kompute/operations/OpTensorCreate.hpp
+++ b/src/include/kompute/operations/OpTensorCreate.hpp
@@ -69,8 +69,6 @@ class OpTensorCreate : public OpBase
 
 
   private:
-    // Never owned resources
-    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
 };
 
 } // End namespace kp
diff --git a/src/include/kompute/operations/OpTensorSyncDevice.hpp b/src/include/kompute/operations/OpTensorSyncDevice.hpp
index a19e40dca..b80cc1db0 100644
--- a/src/include/kompute/operations/OpTensorSyncDevice.hpp
+++ b/src/include/kompute/operations/OpTensorSyncDevice.hpp
@@ -9,7 +9,7 @@
 namespace kp {
 
 /**
-    Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the record (as opposed to during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging.
+    Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a record operation for the memory to be synced into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eHost it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eHost.
 */
 class OpTensorSyncDevice : public OpBase
 {
@@ -35,12 +35,12 @@ class OpTensorSyncDevice : public OpBase
     ~OpTensorSyncDevice() override;
 
     /**
-     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging. For staging tensors in host memory, the map is performed during the init function.
+     * Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element.
      */
     void init() override;
 
     /**
-     * For device tensors, it records the copy command to the device tensor from the temporary staging tensor.
+     * For device tensors, it records the copy command for the tensor to copy the data from its staging to device memory.
      */
     void record() override;
 
@@ -55,8 +55,6 @@ class OpTensorSyncDevice : public OpBase
     virtual void postEval() override;
 
   private:
-    // Never owned resources
-    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
 };
 
 } // End namespace kp
diff --git a/src/include/kompute/operations/OpTensorSyncLocal.hpp b/src/include/kompute/operations/OpTensorSyncLocal.hpp
index caf0ec9b1..dd4549b00 100644
--- a/src/include/kompute/operations/OpTensorSyncLocal.hpp
+++ b/src/include/kompute/operations/OpTensorSyncLocal.hpp
@@ -9,7 +9,7 @@
 namespace kp {
 
 /**
-    Operation that syncs tensor's local data by mapping the data from device memory into the local vector. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the postSubmit (there will be no copy during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging.
+    Operation that syncs tensor's local memory by mapping device data into the local CPU memory. For TensorTypes::eDevice it will use a record operation for the memory to be synced from GPU memory into the tensor's staging memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eHost it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eHost.
 */
 class OpTensorSyncLocal : public OpBase
 {
@@ -30,17 +30,17 @@ class OpTensorSyncLocal : public OpBase
                    std::vector<std::shared_ptr<Tensor>> tensors);
 
     /**
-     * Default destructor. This class manages the memory of the staging tensors it owns but these are released in the postSubmit, before it arrives to the destructor.
+     * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release.
      */
     ~OpTensorSyncLocal() override;
 
     /**
-     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging.
+     * Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element.
      */
     void init() override;
 
     /**
-     * For device tensors, it records the copy command into the staging tensor from the device tensor.
+     * For device tensors, it records the copy command for the tensor to copy the data from its device to staging memory.
      */
     void record() override;
 
@@ -56,8 +56,6 @@ class OpTensorSyncLocal : public OpBase
 
 
   private:
-    // Never owned resources
-    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
 };
 
 } // End namespace kp

From 8a66c6b1e2a691d9387cadb18391d581c50494ac Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Mon, 8 Feb 2021 07:18:05 +0000
Subject: [PATCH 3/6] Updated tests to reflect staging

---
 test/TestLogisticRegression.cpp |  4 ++--
 test/TestManager.cpp            |  2 +-
 test/TestOpTensorCopy.cpp       | 16 ++++++++--------
 test/TestTensor.cpp             |  4 ++--
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/test/TestLogisticRegression.cpp b/test/TestLogisticRegression.cpp
index 5fa3032f8..7c3f15387 100644
--- a/test/TestLogisticRegression.cpp
+++ b/test/TestLogisticRegression.cpp
@@ -105,12 +105,12 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy)
     std::shared_ptr<kp::Tensor> y{ new kp::Tensor({ 0, 0, 0, 1, 1 }) };
 
     std::shared_ptr<kp::Tensor> wIn{ new kp::Tensor(
-      wInVec, kp::Tensor::TensorTypes::eStaging) };
+      wInVec, kp::Tensor::TensorTypes::eHost) };
     std::shared_ptr<kp::Tensor> wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
     std::shared_ptr<kp::Tensor> wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
 
     std::shared_ptr<kp::Tensor> bIn{ new kp::Tensor(
-      bInVec, kp::Tensor::TensorTypes::eStaging) };
+      bInVec, kp::Tensor::TensorTypes::eHost) };
     std::shared_ptr<kp::Tensor> bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
 
     std::shared_ptr<kp::Tensor> lOut{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
diff --git a/test/TestManager.cpp b/test/TestManager.cpp
index 3076b2a62..198e617af 100644
--- a/test/TestManager.cpp
+++ b/test/TestManager.cpp
@@ -126,7 +126,7 @@ TEST(TestManager, TestCreateInitTensor)
     EXPECT_EQ(tensorB->data(), std::vector<float>({ 0, 1, 2 }));
 
     std::shared_ptr<kp::Tensor> tensorC =
-      mgr.buildTensor({ 0, 0, 0 }, kp::Tensor::TensorTypes::eStaging);
+      mgr.buildTensor({ 0, 0, 0 }, kp::Tensor::TensorTypes::eHost);
 
     mgr.evalOpDefault<kp::OpTensorCopy>({ tensorA, tensorC });
 
diff --git a/test/TestOpTensorCopy.cpp b/test/TestOpTensorCopy.cpp
index ab5b67402..0e840cad6 100644
--- a/test/TestOpTensorCopy.cpp
+++ b/test/TestOpTensorCopy.cpp
@@ -58,7 +58,7 @@ TEST(TestOpTensorCopy, CopyDeviceToDeviceTensorMulti)
     EXPECT_EQ(tensorA->data(), tensorC->data());
 }
 
-TEST(TestOpTensorCopy, CopyDeviceToStagingTensor)
+TEST(TestOpTensorCopy, CopyDeviceToHostTensor)
 {
 
     kp::Manager mgr;
@@ -68,7 +68,7 @@ TEST(TestOpTensorCopy, CopyDeviceToStagingTensor)
 
     std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor(testVecA) };
     std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor(
-      testVecB, kp::Tensor::TensorTypes::eStaging) };
+      testVecB, kp::Tensor::TensorTypes::eHost) };
 
     mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
 
@@ -84,7 +84,7 @@ TEST(TestOpTensorCopy, CopyDeviceToStagingTensor)
     EXPECT_EQ(tensorA->data(), tensorB->data());
 }
 
-TEST(TestOpTensorCopy, CopyStagingToDeviceTensor)
+TEST(TestOpTensorCopy, CopyHostToDeviceTensor)
 {
 
     kp::Manager mgr;
@@ -93,7 +93,7 @@ TEST(TestOpTensorCopy, CopyStagingToDeviceTensor)
     std::vector<float> testVecB{ 0, 0, 0 };
 
     std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor(
-      testVecA, kp::Tensor::TensorTypes::eStaging) };
+      testVecA, kp::Tensor::TensorTypes::eHost) };
     std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor(testVecB) };
 
     mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
@@ -110,7 +110,7 @@ TEST(TestOpTensorCopy, CopyStagingToDeviceTensor)
     EXPECT_EQ(tensorA->data(), tensorB->data());
 }
 
-TEST(TestOpTensorCopy, CopyStagingToStagingTensor)
+TEST(TestOpTensorCopy, CopyHostToHostTensor)
 {
 
     kp::Manager mgr;
@@ -119,9 +119,9 @@ TEST(TestOpTensorCopy, CopyStagingToStagingTensor)
     std::vector<float> testVecB{ 0, 0, 0 };
 
     std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor(
-      testVecA, kp::Tensor::TensorTypes::eStaging) };
+      testVecA, kp::Tensor::TensorTypes::eHost) };
     std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor(
-      testVecB, kp::Tensor::TensorTypes::eStaging) };
+      testVecB, kp::Tensor::TensorTypes::eHost) };
 
     mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
 
@@ -145,7 +145,7 @@ TEST(TestOpTensorCopy, SingleTensorShouldFail)
     std::vector<float> testVecA{ 9, 8, 7 };
 
     std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor(
-      testVecA, kp::Tensor::TensorTypes::eStaging) };
+      testVecA, kp::Tensor::TensorTypes::eHost) };
 
     mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA });
 
diff --git a/test/TestTensor.cpp b/test/TestTensor.cpp
index 42731bcfe..5e54e8585 100644
--- a/test/TestTensor.cpp
+++ b/test/TestTensor.cpp
@@ -17,9 +17,9 @@ TEST(TestTensor, CopyFromHostData)
     std::vector<float> vecB{ 0, 0, 0 };
 
     std::shared_ptr<kp::Tensor> tensorA =
-      std::make_shared<kp::Tensor>(vecA, kp::Tensor::TensorTypes::eStaging);
+      std::make_shared<kp::Tensor>(vecA, kp::Tensor::TensorTypes::eHost);
     std::shared_ptr<kp::Tensor> tensorB =
-      std::make_shared<kp::Tensor>(vecB, kp::Tensor::TensorTypes::eStaging);
+      std::make_shared<kp::Tensor>(vecB, kp::Tensor::TensorTypes::eHost);
 
     kp::Manager mgr;
 

From 815acfa1fe2cbc223162f90ec3a9486ee6331841 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Mon, 8 Feb 2021 07:18:14 +0000
Subject: [PATCH 4/6] Updated single include

---
 single_include/kompute/Kompute.hpp | 70 +++++++++++++++++++-----------
 1 file changed, 45 insertions(+), 25 deletions(-)

diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index 244a742f7..d388fa24b 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -723,7 +723,7 @@ class Tensor
     enum class TensorTypes
     {
         eDevice = 0,  ///< Type is device memory, source and destination
-        eStaging = 1, ///< Type is host memory, source and destination
+        eHost = 1, ///< Type is host memory, source and destination
         eStorage = 2, ///< Type is Device memory (only)
     };
 
@@ -828,6 +828,26 @@ class Tensor
                         std::shared_ptr<Tensor> copyFromTensor,
                         bool createBarrier);
 
+    /**
+     * Records a copy from the internal staging memory to the device memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param createBarrier Whether to create a barrier that ensures the data is
+     * copied before further operations.
+     */
+    void recordCopyFromStagingToDevice(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                        bool createBarrier);
+
+    /**
+     * Records a copy from the internal device memory to the staging memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param createBarrier Whether to create a barrier that ensures the data is
+     * copied before further operations.
+     */
+    void recordCopyFromDeviceToStaging(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                        bool createBarrier);
+
     /**
      * Records the buffer memory barrier into the command buffer which
      * ensures that relevant data transfers are carried out correctly.
@@ -870,10 +890,14 @@ class Tensor
     std::shared_ptr<vk::Device> mDevice;
 
     // -------------- OPTIONALLY OWNED RESOURCES
-    std::shared_ptr<vk::Buffer> mBuffer;
-    bool mFreeBuffer;
-    std::shared_ptr<vk::DeviceMemory> mMemory;
-    bool mFreeMemory;
+    std::shared_ptr<vk::Buffer> mPrimaryBuffer;
+    bool mFreePrimaryBuffer = false;
+    std::shared_ptr<vk::Buffer> mStagingBuffer;
+    bool mFreeStagingBuffer = false;
+    std::shared_ptr<vk::DeviceMemory> mPrimaryMemory;
+    bool mFreePrimaryMemory = false;
+    std::shared_ptr<vk::DeviceMemory> mStagingMemory;
+    bool mFreeStagingMemory = false;
 
     // -------------- ALWAYS OWNED RESOURCES
     std::vector<float> mData;
@@ -883,11 +907,16 @@ class Tensor
     std::array<uint32_t, KP_MAX_DIM_SIZE> mShape;
     bool mIsInit = false;
 
-    void createBuffer(); // Creates the vulkan buffer
+    void allocateMemoryCreateGPUResources(); // Creates the vulkan buffer
+    void createBuffer(std::shared_ptr<vk::Buffer> buffer, vk::BufferUsageFlags bufferUsageFlags);
+    void allocateBindMemory(std::shared_ptr<vk::Buffer> buffer, std::shared_ptr<vk::DeviceMemory> memory, vk::MemoryPropertyFlags memoryPropertyFlags);
+    void copyBuffer(std::shared_ptr<vk::CommandBuffer> commandBuffer, std::shared_ptr<vk::Buffer> bufferFrom, std::shared_ptr<vk::Buffer> bufferTo, vk::DeviceSize bufferSize, vk::BufferCopy copyRegion, bool createBarrier);
 
     // Private util functions
-    vk::BufferUsageFlags getBufferUsageFlags();
-    vk::MemoryPropertyFlags getMemoryPropertyFlags();
+    vk::BufferUsageFlags getPrimaryBufferUsageFlags();
+    vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags();
+    vk::BufferUsageFlags getStagingBufferUsageFlags();
+    vk::MemoryPropertyFlags getStagingMemoryPropertyFlags();
     uint64_t memorySize();
 };
 
@@ -958,7 +987,7 @@ class OpBase
                 if (tensor && tensor->isInit()) {
                     tensor->freeMemoryDestroyGPUResources();
                 } else {
-                    SPDLOG_ERROR("Kompute OpBase expected to free "
+                    SPDLOG_WARN("Kompute OpBase expected to free "
                                   "tensor but has already been freed.");
                 }
             }
@@ -1264,8 +1293,6 @@ class OpTensorCreate : public OpBase
     virtual void postEval() override;
 
   private:
-    // Never owned resources
-    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
 };
 
 } // End namespace kp
@@ -1836,9 +1863,6 @@ class OpAlgoLhsRhsOut : public OpAlgoBase
     std::shared_ptr<Tensor> mTensorLHS; ///< Reference to the parameter used in the left hand side equation of the shader
     std::shared_ptr<Tensor> mTensorRHS; ///< Reference to the parameter used in the right hand side equation of the shader
     std::shared_ptr<Tensor> mTensorOutput; ///< Reference to the parameter used in the output of the shader and will be copied with a staging vector
-
-    // -------------- ALWAYS OWNED RESOURCES
-    std::shared_ptr<Tensor> mTensorOutputStaging; ///< Staging temporary tensor user do to copy the output of the tensor
 };
 
 } // End namespace kp
@@ -1976,7 +2000,7 @@ class OpTensorCopy : public OpBase
 namespace kp {
 
 /**
-    Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the record (as opposed to during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging.
+    Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a record operation for the memory to be synced into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eHost it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eHost.
 */
 class OpTensorSyncDevice : public OpBase
 {
@@ -2002,12 +2026,12 @@ class OpTensorSyncDevice : public OpBase
     ~OpTensorSyncDevice() override;
 
     /**
-     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging. For staging tensors in host memory, the map is performed during the init function.
+     * Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element.
      */
     void init() override;
 
     /**
-     * For device tensors, it records the copy command to the device tensor from the temporary staging tensor.
+     * For device tensors, it records the copy command for the tensor to copy the data from its staging to device memory.
      */
     void record() override;
 
@@ -2022,8 +2046,6 @@ class OpTensorSyncDevice : public OpBase
     virtual void postEval() override;
 
   private:
-    // Never owned resources
-    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
 };
 
 } // End namespace kp
@@ -2031,7 +2053,7 @@ class OpTensorSyncDevice : public OpBase
 namespace kp {
 
 /**
-    Operation that syncs tensor's local data by mapping the data from device memory into the local vector. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the postSubmit (there will be no copy during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging.
+    Operation that syncs tensor's local memory by mapping device data into the local CPU memory. For TensorTypes::eDevice it will use a record operation for the memory to be synced from GPU memory into the tensor's staging memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eHost it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eHost.
 */
 class OpTensorSyncLocal : public OpBase
 {
@@ -2052,17 +2074,17 @@ class OpTensorSyncLocal : public OpBase
                    std::vector<std::shared_ptr<Tensor>> tensors);
 
     /**
-     * Default destructor. This class manages the memory of the staging tensors it owns but these are released in the postSubmit, before it arrives to the destructor.
+     * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release.
      */
     ~OpTensorSyncLocal() override;
 
     /**
-     * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging.
+     * Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element.
      */
     void init() override;
 
     /**
-     * For device tensors, it records the copy command into the staging tensor from the device tensor.
+     * For device tensors, it records the copy command for the tensor to copy the data from its device to staging memory.
      */
     void record() override;
 
@@ -2077,8 +2099,6 @@ class OpTensorSyncLocal : public OpBase
     virtual void postEval() override;
 
   private:
-    // Never owned resources
-    std::vector<std::shared_ptr<Tensor>> mStagingTensors;
 };
 
 } // End namespace kp

From d24dfb759073650bdedce2500bc670bcf559f705 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Mon, 8 Feb 2021 07:18:32 +0000
Subject: [PATCH 5/6] Reformat

---
 src/OpAlgoLhsRhsOut.cpp        |   4 +-
 src/Tensor.cpp                 | 103 +++++++++++++++++++++------------
 src/include/kompute/Tensor.hpp |  37 ++++++++----
 test/TestOpTensorCreate.cpp    |  11 ++--
 4 files changed, 98 insertions(+), 57 deletions(-)

diff --git a/src/OpAlgoLhsRhsOut.cpp b/src/OpAlgoLhsRhsOut.cpp
index 622a4f431..3b78fa7d9 100644
--- a/src/OpAlgoLhsRhsOut.cpp
+++ b/src/OpAlgoLhsRhsOut.cpp
@@ -106,8 +106,8 @@ OpAlgoLhsRhsOut::record()
       vk::PipelineStageFlagBits::eTransfer);
 
     if (this->mTensorOutput->tensorType() == Tensor::TensorTypes::eDevice) {
-        this->mTensorOutput->recordCopyFromDeviceToStaging(
-          this->mCommandBuffer, true);
+        this->mTensorOutput->recordCopyFromDeviceToStaging(this->mCommandBuffer,
+                                                           true);
     }
 }
 
diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index 726723def..f04165cf9 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -113,45 +113,65 @@ Tensor::recordCopyFrom(std::shared_ptr<vk::CommandBuffer> commandBuffer,
 
     SPDLOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", bufferSize);
 
-    this->copyBuffer(commandBuffer, copyFromTensor->mPrimaryBuffer, this->mPrimaryBuffer, bufferSize, copyRegion, createBarrier);
-
+    this->copyBuffer(commandBuffer,
+                     copyFromTensor->mPrimaryBuffer,
+                     this->mPrimaryBuffer,
+                     bufferSize,
+                     copyRegion,
+                     createBarrier);
 }
 
 void
-Tensor::recordCopyFromStagingToDevice(std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                       bool createBarrier)
+Tensor::recordCopyFromStagingToDevice(
+  std::shared_ptr<vk::CommandBuffer> commandBuffer,
+  bool createBarrier)
 {
     vk::DeviceSize bufferSize(this->memorySize());
     vk::BufferCopy copyRegion(0, 0, bufferSize);
 
     SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);
 
-    this->copyBuffer(commandBuffer, this->mStagingBuffer, this->mPrimaryBuffer, bufferSize, copyRegion, createBarrier);
+    this->copyBuffer(commandBuffer,
+                     this->mStagingBuffer,
+                     this->mPrimaryBuffer,
+                     bufferSize,
+                     copyRegion,
+                     createBarrier);
 }
 
 void
-Tensor::recordCopyFromDeviceToStaging(std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                       bool createBarrier)
+Tensor::recordCopyFromDeviceToStaging(
+  std::shared_ptr<vk::CommandBuffer> commandBuffer,
+  bool createBarrier)
 {
     vk::DeviceSize bufferSize(this->memorySize());
     vk::BufferCopy copyRegion(0, 0, bufferSize);
 
     SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);
 
-    this->copyBuffer(commandBuffer, this->mPrimaryBuffer, this->mStagingBuffer, bufferSize, copyRegion, createBarrier);
-
+    this->copyBuffer(commandBuffer,
+                     this->mPrimaryBuffer,
+                     this->mStagingBuffer,
+                     bufferSize,
+                     copyRegion,
+                     createBarrier);
 }
 
 void
-Tensor::copyBuffer(std::shared_ptr<vk::CommandBuffer> commandBuffer, std::shared_ptr<vk::Buffer> bufferFrom, std::shared_ptr<vk::Buffer> bufferTo, vk::DeviceSize bufferSize, vk::BufferCopy copyRegion, bool createBarrier) {
+Tensor::copyBuffer(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                   std::shared_ptr<vk::Buffer> bufferFrom,
+                   std::shared_ptr<vk::Buffer> bufferTo,
+                   vk::DeviceSize bufferSize,
+                   vk::BufferCopy copyRegion,
+                   bool createBarrier)
+{
 
     if (!this->mIsInit) {
         throw std::runtime_error(
           "Kompute Tensor attempted to run copyBuffer without init");
     }
 
-    commandBuffer->copyBuffer(
-      *bufferFrom, *bufferTo, copyRegion);
+    commandBuffer->copyBuffer(*bufferFrom, *bufferTo, copyRegion);
 
     if (createBarrier) {
         // Buffer to ensure wait until data is copied to staging buffer
@@ -209,8 +229,7 @@ Tensor::mapDataFromHostMemory()
 
     if (this->mTensorType == TensorTypes::eHost) {
         hostVisibleMemory = this->mPrimaryMemory;
-    }
-    else {
+    } else {
         hostVisibleMemory = this->mStagingMemory;
     }
 
@@ -233,8 +252,7 @@ Tensor::mapDataIntoHostMemory()
 
     if (this->mTensorType == TensorTypes::eHost) {
         hostVisibleMemory = this->mPrimaryMemory;
-    }
-    else {
+    } else {
         hostVisibleMemory = this->mStagingMemory;
     }
 
@@ -333,20 +351,26 @@ Tensor::allocateMemoryCreateGPUResources()
     SPDLOG_DEBUG("Kompute Tensor creating primary buffer and memory");
 
     this->mPrimaryBuffer = std::make_shared<vk::Buffer>();
-    this->createBuffer(this->mPrimaryBuffer, this->getPrimaryBufferUsageFlags());
+    this->createBuffer(this->mPrimaryBuffer,
+                       this->getPrimaryBufferUsageFlags());
     this->mFreePrimaryBuffer = true;
     this->mPrimaryMemory = std::make_shared<vk::DeviceMemory>();
-    this->allocateBindMemory(this->mPrimaryBuffer, this->mPrimaryMemory, this->getPrimaryMemoryPropertyFlags());
+    this->allocateBindMemory(this->mPrimaryBuffer,
+                             this->mPrimaryMemory,
+                             this->getPrimaryMemoryPropertyFlags());
     this->mFreePrimaryMemory = true;
 
     if (this->mTensorType == TensorTypes::eDevice) {
         SPDLOG_DEBUG("Kompute Tensor creating staging buffer and memory");
 
         this->mStagingBuffer = std::make_shared<vk::Buffer>();
-        this->createBuffer(this->mStagingBuffer, this->getStagingBufferUsageFlags());
+        this->createBuffer(this->mStagingBuffer,
+                           this->getStagingBufferUsageFlags());
         this->mFreeStagingBuffer = true;
         this->mStagingMemory = std::make_shared<vk::DeviceMemory>();
-        this->allocateBindMemory(this->mStagingBuffer, this->mStagingMemory, this->getStagingMemoryPropertyFlags());
+        this->allocateBindMemory(this->mStagingBuffer,
+                                 this->mStagingMemory,
+                                 this->getStagingMemoryPropertyFlags());
         this->mFreeStagingMemory = true;
     }
 
@@ -354,15 +378,16 @@ Tensor::allocateMemoryCreateGPUResources()
 }
 
 void
-Tensor::createBuffer(std::shared_ptr<vk::Buffer> buffer, vk::BufferUsageFlags bufferUsageFlags) {
-
+Tensor::createBuffer(std::shared_ptr<vk::Buffer> buffer,
+                     vk::BufferUsageFlags bufferUsageFlags)
+{
 
     vk::DeviceSize bufferSize = this->memorySize();
 
-    if(bufferSize<1){
-        throw std::runtime_error("Kompute Tensor attempted to create a zero-sized buffer");
+    if (bufferSize < 1) {
+        throw std::runtime_error(
+          "Kompute Tensor attempted to create a zero-sized buffer");
     }
-    
 
     SPDLOG_DEBUG("Kompute Tensor creating buffer with memory size: {}, and "
                  "usage flags: {}",
@@ -376,11 +401,13 @@ Tensor::createBuffer(std::shared_ptr<vk::Buffer> buffer, vk::BufferUsageFlags bu
                                     vk::SharingMode::eExclusive);
 
     this->mDevice->createBuffer(&bufferInfo, nullptr, buffer.get());
-
 }
 
 void
-Tensor::allocateBindMemory(std::shared_ptr<vk::Buffer> buffer, std::shared_ptr<vk::DeviceMemory> memory, vk::MemoryPropertyFlags memoryPropertyFlags) {
+Tensor::allocateBindMemory(std::shared_ptr<vk::Buffer> buffer,
+                           std::shared_ptr<vk::DeviceMemory> memory,
+                           vk::MemoryPropertyFlags memoryPropertyFlags)
+{
 
     SPDLOG_DEBUG("Kompute Tensor allocating and binding memory");
 
@@ -393,7 +420,8 @@ Tensor::allocateBindMemory(std::shared_ptr<vk::Buffer> buffer, std::shared_ptr<v
     uint32_t memoryTypeIndex = -1;
     for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) {
         if (memoryRequirements.memoryTypeBits & (1 << i)) {
-            if (((memoryProperties.memoryTypes[i]).propertyFlags & memoryPropertyFlags) == memoryPropertyFlags) {
+            if (((memoryProperties.memoryTypes[i]).propertyFlags &
+                 memoryPropertyFlags) == memoryPropertyFlags) {
                 memoryTypeIndex = i;
                 break;
             }
@@ -413,8 +441,7 @@ Tensor::allocateBindMemory(std::shared_ptr<vk::Buffer> buffer, std::shared_ptr<v
     vk::MemoryAllocateInfo memoryAllocateInfo(memoryRequirements.size,
                                               memoryTypeIndex);
 
-    this->mDevice->allocateMemory(
-      &memoryAllocateInfo, nullptr, memory.get());
+    this->mDevice->allocateMemory(&memoryAllocateInfo, nullptr, memory.get());
 
     this->mDevice->bindBufferMemory(*buffer, *memory, 0);
 }
@@ -434,8 +461,8 @@ Tensor::freeMemoryDestroyGPUResources()
 
     if (this->mFreePrimaryBuffer) {
         if (!this->mPrimaryBuffer) {
-            SPDLOG_ERROR(
-              "Kompose Tensor expected to destroy primary buffer but got null buffer");
+            SPDLOG_ERROR("Kompose Tensor expected to destroy primary buffer "
+                         "but got null buffer");
         } else {
             SPDLOG_DEBUG("Kompose Tensor destroying primary buffer");
             this->mDevice->destroy(
@@ -447,8 +474,8 @@ Tensor::freeMemoryDestroyGPUResources()
 
     if (this->mFreeStagingBuffer) {
         if (!this->mStagingBuffer) {
-            SPDLOG_ERROR(
-              "Kompose Tensor expected to destroy staging buffer but got null buffer");
+            SPDLOG_ERROR("Kompose Tensor expected to destroy staging buffer "
+                         "but got null buffer");
         } else {
             SPDLOG_DEBUG("Kompose Tensor destroying staging buffer");
             this->mDevice->destroy(
@@ -460,8 +487,8 @@ Tensor::freeMemoryDestroyGPUResources()
 
     if (this->mFreePrimaryMemory) {
         if (!this->mPrimaryMemory) {
-            SPDLOG_ERROR(
-              "Kompose Tensor expected to free primary memory but got null memory");
+            SPDLOG_ERROR("Kompose Tensor expected to free primary memory but "
+                         "got null memory");
         } else {
             SPDLOG_DEBUG("Kompose Tensor freeing primary memory");
             this->mDevice->freeMemory(
@@ -473,8 +500,8 @@ Tensor::freeMemoryDestroyGPUResources()
 
     if (this->mFreeStagingMemory) {
         if (!this->mStagingMemory) {
-            SPDLOG_ERROR(
-              "Kompose Tensor expected to free staging memory but got null memory");
+            SPDLOG_ERROR("Kompose Tensor expected to free staging memory but "
+                         "got null memory");
         } else {
             SPDLOG_DEBUG("Kompose Tensor freeing staging memory");
             this->mDevice->freeMemory(
diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp
index 09ae89fd3..5d9fb07df 100644
--- a/src/include/kompute/Tensor.hpp
+++ b/src/include/kompute/Tensor.hpp
@@ -26,7 +26,7 @@ class Tensor
     enum class TensorTypes
     {
         eDevice = 0,  ///< Type is device memory, source and destination
-        eHost = 1, ///< Type is host memory, source and destination
+        eHost = 1,    ///< Type is host memory, source and destination
         eStorage = 2, ///< Type is Device memory (only)
     };
 
@@ -39,7 +39,8 @@ class Tensor
      *  Default constructor with data provided which would be used to create the
      * respective vulkan buffer and memory.
      *
-     *  @param data Non-zero-sized vector of data that will be used by the tensor
+     *  @param data Non-zero-sized vector of data that will be used by the
+     * tensor
      *  @param tensorType Type for the tensor which is of type TensorTypes
      */
     Tensor(const std::vector<float>& data,
@@ -132,24 +133,30 @@ class Tensor
                         bool createBarrier);
 
     /**
-     * Records a copy from the internal staging memory to the device memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice.
+     * Records a copy from the internal staging memory to the device memory
+     * using an optional barrier to wait for the operation. This function would
+     * only be relevant for kp::Tensors of type eDevice.
      *
      * @param commandBuffer Vulkan Command Buffer to record the commands into
      * @param createBarrier Whether to create a barrier that ensures the data is
      * copied before further operations. Default is true.
      */
-    void recordCopyFromStagingToDevice(std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                        bool createBarrier);
+    void recordCopyFromStagingToDevice(
+      std::shared_ptr<vk::CommandBuffer> commandBuffer,
+      bool createBarrier);
 
     /**
-     * Records a copy from the internal device memory to the staging memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice.
+     * Records a copy from the internal device memory to the staging memory
+     * using an optional barrier to wait for the operation. This function would
+     * only be relevant for kp::Tensors of type eDevice.
      *
      * @param commandBuffer Vulkan Command Buffer to record the commands into
      * @param createBarrier Whether to create a barrier that ensures the data is
      * copied before further operations. Default is true.
      */
-    void recordCopyFromDeviceToStaging(std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                        bool createBarrier);
+    void recordCopyFromDeviceToStaging(
+      std::shared_ptr<vk::CommandBuffer> commandBuffer,
+      bool createBarrier);
 
     /**
      * Records the buffer memory barrier into the command buffer which
@@ -211,9 +218,17 @@ class Tensor
     bool mIsInit = false;
 
     void allocateMemoryCreateGPUResources(); // Creates the vulkan buffer
-    void createBuffer(std::shared_ptr<vk::Buffer> buffer, vk::BufferUsageFlags bufferUsageFlags);
-    void allocateBindMemory(std::shared_ptr<vk::Buffer> buffer, std::shared_ptr<vk::DeviceMemory> memory, vk::MemoryPropertyFlags memoryPropertyFlags);
-    void copyBuffer(std::shared_ptr<vk::CommandBuffer> commandBuffer, std::shared_ptr<vk::Buffer> bufferFrom, std::shared_ptr<vk::Buffer> bufferTo, vk::DeviceSize bufferSize, vk::BufferCopy copyRegion, bool createBarrier);
+    void createBuffer(std::shared_ptr<vk::Buffer> buffer,
+                      vk::BufferUsageFlags bufferUsageFlags);
+    void allocateBindMemory(std::shared_ptr<vk::Buffer> buffer,
+                            std::shared_ptr<vk::DeviceMemory> memory,
+                            vk::MemoryPropertyFlags memoryPropertyFlags);
+    void copyBuffer(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                    std::shared_ptr<vk::Buffer> bufferFrom,
+                    std::shared_ptr<vk::Buffer> bufferTo,
+                    vk::DeviceSize bufferSize,
+                    vk::BufferCopy copyRegion,
+                    bool createBarrier);
 
     // Private util functions
     vk::BufferUsageFlags getPrimaryBufferUsageFlags();
diff --git a/test/TestOpTensorCreate.cpp b/test/TestOpTensorCreate.cpp
index 1281e1be2..f0ba87433 100644
--- a/test/TestOpTensorCreate.cpp
+++ b/test/TestOpTensorCreate.cpp
@@ -114,7 +114,6 @@ TEST(TestOpTensorCreate, NoErrorIfTensorFreedBefore)
     EXPECT_FALSE(tensorB->isInit());
 }
 
-
 TEST(TestOpTensorCreate, ExceptionOnZeroSizeTensor)
 {
     std::vector<float> testVecA;
@@ -123,11 +122,11 @@ TEST(TestOpTensorCreate, ExceptionOnZeroSizeTensor)
 
     kp::Manager mgr;
 
-    try{
+    try {
         mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA });
-    } catch( const std::runtime_error& err ) {
-         // check exception
-        ASSERT_TRUE( std::string(err.what()).find("zero-sized") != std::string::npos );
+    } catch (const std::runtime_error& err) {
+        // check exception
+        ASSERT_TRUE(std::string(err.what()).find("zero-sized") !=
+                    std::string::npos);
     }
-
 }

From bf401019c9564b995ae2738dc3fe676f470bbcba Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Mon, 8 Feb 2021 07:29:34 +0000
Subject: [PATCH 6/6] Updated python

---
 python/src/docstrings.hpp | 14 +++++++-------
 python/src/main.cpp       | 12 ++++++------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/python/src/docstrings.hpp b/python/src/docstrings.hpp
index 37f3ff785..6b3a1dc78 100644
--- a/python/src/docstrings.hpp
+++ b/python/src/docstrings.hpp
@@ -513,10 +513,10 @@ function not in the record function.)doc";
 static const char *__doc_kp_OpTensorSyncDevice =
 R"doc(Operation that syncs tensor's device by mapping local data into the
 device memory. For TensorTypes::eDevice it will use a staging tensor
-to perform the copy. For TensorTypes::eStaging it will only copy the
+to perform the copy. For TensorTypes::eHost it will only copy the
 data and perform a map, which will be executed during the record (as
 opposed to during the sequence eval/submit). This function cannot be
-carried out for TensorTypes::eStaging.)doc";
+carried out for TensorTypes::eHost.)doc";
 
 static const char *__doc_kp_OpTensorSyncDevice_OpTensorSyncDevice = R"doc()doc";
 
@@ -533,7 +533,7 @@ queues @param device Vulkan logical device for passing to Algorithm
 static const char *__doc_kp_OpTensorSyncDevice_init =
 R"doc(Performs basic checks such as ensuring that there is at least one
 tensor provided, that they are initialized and that they are not of
-type TensorTpes::eStaging. For staging tensors in host memory, the map
+type TensorTypes::eHost. For staging tensors in host memory, the map
 is performed during the init function.)doc";
 
 static const char *__doc_kp_OpTensorSyncDevice_mStagingTensors = R"doc()doc";
@@ -549,11 +549,11 @@ from the temporary staging tensor.)doc";
 static const char *__doc_kp_OpTensorSyncLocal =
 R"doc(Operation that syncs tensor's local data by mapping the data from
 device memory into the local vector. For TensorTypes::eDevice it will
-use a staging tensor to perform the copy. For TensorTypes::eStaging it
+use a staging tensor to perform the copy. For TensorTypes::eHost it
 will only copy the data and perform a map, which will be executed
 during the postSubmit (there will be no copy during the sequence
 eval/submit). This function cannot be carried out for
-TensorTypes::eStaging.)doc";
+TensorTypes::eHost.)doc";
 
 static const char *__doc_kp_OpTensorSyncLocal_OpTensorSyncLocal = R"doc()doc";
 
@@ -570,7 +570,7 @@ queues @param device Vulkan logical device for passing to Algorithm
 static const char *__doc_kp_OpTensorSyncLocal_init =
 R"doc(Performs basic checks such as ensuring that there is at least one
 tensor provided, that they are initialized and that they are not of
-type TensorTpes::eStaging.)doc";
+type TensorTypes::eHost.)doc";
 
 static const char *__doc_kp_OpTensorSyncLocal_mStagingTensors = R"doc()doc";
 
@@ -719,7 +719,7 @@ shader storage).)doc";
 
 static const char *__doc_kp_Tensor_TensorTypes_eDevice = R"doc(< Type is device memory, source and destination)doc";
 
-static const char *__doc_kp_Tensor_TensorTypes_eStaging = R"doc(< Type is host memory, source and destination)doc";
+static const char *__doc_kp_Tensor_TensorTypes_eHost = R"doc(< Type is host memory, source and destination)doc";
 
 static const char *__doc_kp_Tensor_TensorTypes_eStorage = R"doc(< Type is Device memory (only))doc";
 
diff --git a/python/src/main.cpp b/python/src/main.cpp
index 09368f2cd..6e795fad5 100644
--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@@ -26,7 +26,7 @@ PYBIND11_MODULE(kp, m) {
 
     py::enum_<kp::Tensor::TensorTypes>(m, "TensorTypes", DOC(kp, Tensor, TensorTypes))
         .value("device", kp::Tensor::TensorTypes::eDevice, "Tensor holding data in GPU memory.")
-        .value("staging", kp::Tensor::TensorTypes::eStaging, "Tensor used for transfer of data to device.")
+        .value("host", kp::Tensor::TensorTypes::eHost, "Tensor used for CPU visible GPU data.")
         .value("storage", kp::Tensor::TensorTypes::eStorage, "Tensor with host visible gpu memory.")
         .export_values();
 
@@ -112,7 +112,7 @@ PYBIND11_MODULE(kp, m) {
         .def("record_tensor_sync_device", &kp::Sequence::record<kp::OpTensorSyncDevice>,
             "Records operation to sync tensor from local memory to GPU memory")
         .def("record_tensor_sync_local", &kp::Sequence::record<kp::OpTensorSyncLocal>,
-            "Records operation to sync tensor(s) from GPU memory to local memory using staging tensors")
+            "Records operation to sync tensor(s) from GPU memory to local memory")
         .def("record_algo_mult", &kp::Sequence::record<kp::OpMult>,
             "Records operation to run multiplication compute shader to two input tensors and an output tensor")
         .def("record_algo_file", [](kp::Sequence &self, 
@@ -179,7 +179,7 @@ PYBIND11_MODULE(kp, m) {
         .def("eval_tensor_sync_device_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncDevice>,
             "Evaluates operation to sync tensor from local memory to GPU memory with new anonymous Sequence")
         .def("eval_tensor_sync_local_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncLocal>,
-            "Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with new anonymous Sequence")
+            "Evaluates operation to sync tensor(s) from GPU memory to local memory with new anonymous Sequence")
         .def("eval_algo_mult_def", &kp::Manager::evalOpDefault<kp::OpMult>,
             "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with new anonymous Sequence")
         .def("eval_algo_file_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::string>,
@@ -216,7 +216,7 @@ PYBIND11_MODULE(kp, m) {
         .def("eval_tensor_sync_device", &kp::Manager::evalOp<kp::OpTensorSyncDevice>,
             "Evaluates operation to sync tensor from local memory to GPU memory with explicitly named Sequence")
         .def("eval_tensor_sync_local", &kp::Manager::evalOp<kp::OpTensorSyncLocal>,
-            "Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence")
+            "Evaluates operation to sync tensor(s) from GPU memory to local memory with explicitly named Sequence")
         .def("eval_algo_mult", &kp::Manager::evalOp<kp::OpMult>,
             "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
         .def("eval_algo_file", &kp::Manager::evalOp<kp::OpAlgoBase, std::string>,
@@ -256,7 +256,7 @@ PYBIND11_MODULE(kp, m) {
         .def("eval_async_tensor_sync_device_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncDevice>,
             "Evaluates asynchronously operation to sync tensor from local memory to GPU memory with anonymous Sequence")
         .def("eval_async_tensor_sync_local_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncLocal>,
-            "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with anonymous Sequence")
+            "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory with anonymous Sequence")
         .def("eval_async_algo_mult_def", &kp::Manager::evalOpAsyncDefault<kp::OpMult>,
             "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with anonymous Sequence")
         .def("eval_async_algo_file_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::string>,
@@ -293,7 +293,7 @@ PYBIND11_MODULE(kp, m) {
         .def("eval_async_tensor_sync_device", &kp::Manager::evalOpAsync<kp::OpTensorSyncDevice>,
             "Evaluates asynchronously operation to sync tensor from local memory to GPU memory with explicitly named Sequence")
         .def("eval_async_tensor_sync_local", &kp::Manager::evalOpAsync<kp::OpTensorSyncLocal>,
-            "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence")
+            "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory with explicitly named Sequence")
         .def("eval_async_algo_mult", &kp::Manager::evalOpAsync<kp::OpMult>,
             "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
         .def("eval_async_algo_file", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::string>,