From 263f392cbb3385421f989b8ff21ec7955a3c6c63 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Tue, 9 Mar 2021 08:06:27 +0000
Subject: [PATCH 1/5] Updated memory barriers to include staging buffers

---
 single_include/kompute/Kompute.hpp | 271 +++++++++++++++--------------
 src/OpAlgoDispatch.cpp             |   2 +-
 src/OpTensorCopy.cpp               |   2 +-
 src/OpTensorSyncDevice.cpp         |   3 +-
 src/OpTensorSyncLocal.cpp          |  10 +-
 src/Tensor.cpp                     |  65 ++++---
 src/include/kompute/Tensor.hpp     |  52 +++---
 7 files changed, 230 insertions(+), 175 deletions(-)
diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index 9b41e1ead..fc7d697fb 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -741,7 +741,6 @@ namespace kp {
 class Shader
 {
   public:
-
     // The default resource limit for the GLSL compiler, can be overwritten
     // Has been adopted by:
     // https://github.com/KhronosGroup/glslang/blob/master/StandAlone/ResourceLimits.cpp
@@ -888,12 +887,9 @@ class Tensor
      *
      * @param commandBuffer Vulkan Command Buffer to record the commands into
      * @param copyFromTensor Tensor to copy the data from
-     * @param createBarrier Whether to create a barrier that ensures the data is
-     * copied before further operations. Default is true.
      */
     void recordCopyFrom(const vk::CommandBuffer& commandBuffer,
-                        std::shared_ptr<Tensor> copyFromTensor,
-                        bool createBarrier);
+                        std::shared_ptr<Tensor> copyFromTensor);
 
     /**
      * Records a copy from the internal staging memory to the device memory
@@ -901,11 +897,8 @@ class Tensor
      * only be relevant for kp::Tensors of type eDevice.
      *
      * @param commandBuffer Vulkan Command Buffer to record the commands into
-     * @param createBarrier Whether to create a barrier that ensures the data is
-     * copied before further operations. Default is true.
      */
-    void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer,
-                                       bool createBarrier);
+    void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer);
 
     /**
      * Records a copy from the internal device memory to the staging memory
@@ -913,14 +906,11 @@ class Tensor
      * only be relevant for kp::Tensors of type eDevice.
      *
      * @param commandBuffer Vulkan Command Buffer to record the commands into
-     * @param createBarrier Whether to create a barrier that ensures the data is
-     * copied before further operations. Default is true.
      */
-    void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer,
-                                       bool createBarrier);
+    void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer);
 
     /**
-     * Records the buffer memory barrier into the command buffer which
+     * Records the buffer memory barrier into the primary buffer and command buffer which
      * ensures that relevant data transfers are carried out correctly.
      *
      * @param commandBuffer Vulkan Command Buffer to record the commands into
@@ -929,11 +919,26 @@ class Tensor
      * @param scrStageMask Pipeline stage flags for source stage mask
      * @param dstStageMask Pipeline stage flags for destination stage mask
      */
-    void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
-                                   vk::AccessFlagBits srcAccessMask,
-                                   vk::AccessFlagBits dstAccessMask,
-                                   vk::PipelineStageFlagBits srcStageMask,
-                                   vk::PipelineStageFlagBits dstStageMask);
+    void recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                          vk::AccessFlagBits srcAccessMask,
+                                          vk::AccessFlagBits dstAccessMask,
+                                          vk::PipelineStageFlagBits srcStageMask,
+                                          vk::PipelineStageFlagBits dstStageMask);
+    /**
+     * Records the buffer memory barrier into the staging buffer and command buffer which
+     * ensures that relevant data transfers are carried out correctly.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param srcAccessMask Access flags for source access mask
+     * @param dstAccessMask Access flags for destination access mask
+     * @param scrStageMask Pipeline stage flags for source stage mask
+     * @param dstStageMask Pipeline stage flags for destination stage mask
+     */
+    void recordStagingBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                          vk::AccessFlagBits srcAccessMask,
+                                          vk::AccessFlagBits dstAccessMask,
+                                          vk::PipelineStageFlagBits srcStageMask,
+                                          vk::PipelineStageFlagBits dstStageMask);
 
     /**
      * Constructs a vulkan descriptor buffer info which can be used to specify
@@ -951,41 +956,33 @@ class Tensor
      * @return Unsigned integer representing the total number of elements
      */
     // TODO: move to cpp
-    uint32_t size() {
-        return this->mSize;
-    }
+    uint32_t size() { return this->mSize; }
 
     // TODO: move to cpp
-    uint32_t dataTypeMemorySize() {
-        return this->mDataTypeMemorySize;
-    }
+    uint32_t dataTypeMemorySize() { return this->mDataTypeMemorySize; }
 
     // TODO: move to cpp
-    uint32_t memorySize() {
-        return this->mSize * this->mDataTypeMemorySize;
-    }
+    uint32_t memorySize() { return this->mSize * this->mDataTypeMemorySize; }
 
     /**
      * Retrieve the underlying data type of the Tensor
      *
      * @return Data type of tensor of type kp::Tensor::TensorDataTypes
      */
-    TensorDataTypes dataType() {
-        return this->mDataType;
-    }
+    TensorDataTypes dataType() { return this->mDataType; }
 
-    void* rawData() {
-        return this->mRawData;
-    }
+    void* rawData() { return this->mRawData; }
 
     // TODO: move to cpp
-    template <typename T>
-    T* data() {
+    template<typename T>
+    T* data()
+    {
         return (T*)this->mRawData;
     }
 
-    template <typename T>
-    std::vector<T> vector() {
+    template<typename T>
+    std::vector<T> vector()
+    {
         return { (T*)this->mRawData, ((T*)this->mRawData) + this->size() };
     }
 
@@ -993,9 +990,9 @@ class Tensor
      * Sets / resets the vector data of the tensor. This function does not
      * perform any copies into GPU memory and is only performed on the host.
      */
-    void setRawData(const void* data) 
+    void setRawData(const void* data)
     {
-        // Copy data 
+        // Copy data
         memcpy(this->mRawData, data, this->memorySize());
     }
 
@@ -1008,7 +1005,8 @@ class Tensor
     void* mRawData;
 
   private:
-    void mapRawData() {
+    void mapRawData()
+    {
 
         KP_LOG_DEBUG("Kompute Tensor mapping data from host buffer");
 
@@ -1026,14 +1024,17 @@ class Tensor
 
         vk::DeviceSize bufferSize = this->memorySize();
 
-        // Given we request coherent host memory we don't need to invalidate / flush
+        // Given we request coherent host memory we don't need to invalidate /
+        // flush
         this->mRawData = this->mDevice->mapMemory(
           *hostVisibleMemory, 0, bufferSize, vk::MemoryMapFlags());
 
-        vk::MappedMemoryRange mappedMemoryRange(*hostVisibleMemory, 0, bufferSize);
+        vk::MappedMemoryRange mappedMemoryRange(
+          *hostVisibleMemory, 0, bufferSize);
     }
 
-    void unmapRawData() {
+    void unmapRawData()
+    {
 
         KP_LOG_DEBUG("Kompute Tensor mapping data from host buffer");
 
@@ -1079,57 +1080,59 @@ class Tensor
                           std::shared_ptr<vk::Buffer> bufferFrom,
                           std::shared_ptr<vk::Buffer> bufferTo,
                           vk::DeviceSize bufferSize,
-                          vk::BufferCopy copyRegion,
-                          bool createBarrier);
+                          vk::BufferCopy copyRegion);
+    void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                   const vk::Buffer& buffer,
+                                   vk::AccessFlagBits srcAccessMask,
+                                   vk::AccessFlagBits dstAccessMask,
+                                   vk::PipelineStageFlagBits srcStageMask,
+                                   vk::PipelineStageFlagBits dstStageMask);
 
     // Private util functions
     vk::BufferUsageFlags getPrimaryBufferUsageFlags();
     vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags();
     vk::BufferUsageFlags getStagingBufferUsageFlags();
     vk::MemoryPropertyFlags getStagingMemoryPropertyFlags();
-
 };
 
 // TODO: Limit T to be only float, bool, double, etc
-template <typename T>
-class TensorT: public Tensor
+template<typename T>
+class TensorT : public Tensor
 {
 
   public:
     TensorT(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-           std::shared_ptr<vk::Device> device,
-           const std::vector<T>& data,
-           const TensorTypes& tensorType = TensorTypes::eDevice)
-        : Tensor(physicalDevice,
-                 device,
-                 (void*)data.data(),
-                 data.size(),
-                 sizeof(T),
-                 this->dataType(),
-                 tensorType)
+            std::shared_ptr<vk::Device> device,
+            const std::vector<T>& data,
+            const TensorTypes& tensorType = TensorTypes::eDevice)
+      : Tensor(physicalDevice,
+               device,
+               (void*)data.data(),
+               data.size(),
+               sizeof(T),
+               this->dataType(),
+               tensorType)
     {
-        KP_LOG_DEBUG("Kompute TensorT constructor with data size {}", data.size());
+        KP_LOG_DEBUG("Kompute TensorT constructor with data size {}",
+                     data.size());
     }
 
-    ~TensorT() {
-        KP_LOG_DEBUG("Kompute TensorT destructor");
-    }
+    ~TensorT() { KP_LOG_DEBUG("Kompute TensorT destructor"); }
 
-    T* data() {
-        return (T*)this->mRawData;
-    }
+    T* data() { return (T*)this->mRawData; }
 
-    std::vector<T> vector() {
+    std::vector<T> vector()
+    {
         return { (T*)this->mRawData, ((T*)this->mRawData) + this->size() };
     }
 
-    T& operator[](int index) {
-        return *(((T*)this->mRawData) + index);
-    }
+    T& operator[](int index) { return *(((T*)this->mRawData) + index); }
 
-    void setData(const std::vector<T>& data) {
+    void setData(const std::vector<T>& data)
+    {
 
-        KP_LOG_DEBUG("Kompute TensorT setting data with data size {}", data.size());
+        KP_LOG_DEBUG("Kompute TensorT setting data with data size {}",
+                     data.size());
 
         if (data.size() != this->mSize) {
             throw std::runtime_error(
@@ -1140,7 +1143,6 @@ class TensorT: public Tensor
     }
 
     TensorDataTypes dataType();
-
 };
 
 } // End namespace kp
@@ -1159,15 +1161,17 @@ class Algorithm
      *  the underlying resources.
      *
      *  @param device The Vulkan device to use for creating resources
-     *  @param tensors (optional) The tensors to use to create the descriptor resources
+     *  @param tensors (optional) The tensors to use to create the descriptor
+     * resources
      *  @param spirv (optional) The spirv code to use to create the algorithm
-     *  @param workgroup (optional) The kp::Workgroup to use for the dispatch which defaults to
-     *  kp::Workgroup(tensor[0].size(), 1, 1) if not set.
-     *  @param specializationConstants (optional) The kp::Constants to use to initialize
-     *  the specialization constants which cannot be changed once set.
-     *  @param pushConstants (optional) The kp::Constants to use when initializing the
-     *  pipeline, which set the size of the push constants - these can be modified but
-     *  all new values must have the same vector size as this initial value.
+     *  @param workgroup (optional) The kp::Workgroup to use for the dispatch
+     * which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set.
+     *  @param specializationConstants (optional) The kp::Constants to use to
+     * initialize the specialization constants which cannot be changed once set.
+     *  @param pushConstants (optional) The kp::Constants to use when
+     * initializing the pipeline, which set the size of the push constants -
+     * these can be modified but all new values must have the same vector size
+     * as this initial value.
      */
     Algorithm(std::shared_ptr<vk::Device> device,
               const std::vector<std::shared_ptr<Tensor>>& tensors = {},
@@ -1177,18 +1181,19 @@ class Algorithm
               const Constants& pushConstants = {});
 
     /**
-     *  Rebuild function to reconstruct algorithm with configuration parameters to create
-     *  the underlying resources.
+     *  Rebuild function to reconstruct algorithm with configuration parameters
+     * to create the underlying resources.
      *
      *  @param tensors The tensors to use to create the descriptor resources
      *  @param spirv The spirv code to use to create the algorithm
-     *  @param workgroup (optional) The kp::Workgroup to use for the dispatch which defaults to
-     *  kp::Workgroup(tensor[0].size(), 1, 1) if not set.
-     *  @param specializationConstants (optional) The kp::Constants to use to initialize
-     *  the specialization constants which cannot be changed once set.
-     *  @param pushConstants (optional) The kp::Constants to use when initializing the
-     *  pipeline, which set the size of the push constants - these can be modified but
-     *  all new values must have the same vector size as this initial value.
+     *  @param workgroup (optional) The kp::Workgroup to use for the dispatch
+     * which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set.
+     *  @param specializationConstants (optional) The kp::Constants to use to
+     * initialize the specialization constants which cannot be changed once set.
+     *  @param pushConstants (optional) The kp::Constants to use when
+     * initializing the pipeline, which set the size of the push constants -
+     * these can be modified but all new values must have the same vector size
+     * as this initial value.
      */
     void rebuild(const std::vector<std::shared_ptr<Tensor>>& tensors,
                  const std::vector<uint32_t>& spirv,
@@ -1211,25 +1216,26 @@ class Algorithm
     void recordDispatch(const vk::CommandBuffer& commandBuffer);
 
     /**
-     * Records command that binds the "core" algorithm components which consist of
-     * binding the pipeline and binding the descriptorsets.
+     * Records command that binds the "core" algorithm components which consist
+     * of binding the pipeline and binding the descriptorsets.
      *
      * @param commandBuffer Command buffer to record the algorithm resources to
      */
     void recordBindCore(const vk::CommandBuffer& commandBuffer);
 
     /**
-     * Records command that binds the push constants to the command buffer provided
-     * - it is required that the pushConstants provided are of the same size as the
-     *   ones provided during initialization.
+     * Records command that binds the push constants to the command buffer
+     * provided
+     * - it is required that the pushConstants provided are of the same size as
+     * the ones provided during initialization.
      *
      * @param commandBuffer Command buffer to record the algorithm resources to
      */
     void recordBindPush(const vk::CommandBuffer& commandBuffer);
 
     /**
-     * function that checks all the gpu resource components to verify if these have
-     * been created and returns true if all are valid.
+     * function that checks all the gpu resource components to verify if these
+     * have been created and returns true if all are valid.
      *
      * @returns returns true if the algorithm is currently initialized.
      */
@@ -1238,26 +1244,28 @@ class Algorithm
     /**
      * Sets the work group to use in the recordDispatch
      *
-     * @param workgroup The kp::Workgroup value to use to update the algorithm. It
-     * must have a value greater than 1 on the x value (index 1) otherwise it will
-     * be initialized on the size of the first tensor (ie. this->mTensor[0]->size())
+     * @param workgroup The kp::Workgroup value to use to update the algorithm.
+     * It must have a value greater than 1 on the x value (index 1) otherwise it
+     * will be initialized on the size of the first tensor (ie.
+     * this->mTensor[0]->size())
      */
     void setWorkgroup(const Workgroup& workgroup, uint32_t minSize = 1);
     /**
-     * Sets the push constants to the new value provided to use in the next bindPush()
+     * Sets the push constants to the new value provided to use in the next
+     * bindPush()
      *
-     * @param The kp::Constant to use to set the push constants to use in the next
-     * bindPush(...) calls. The constants provided must be of the same size as the
-     * ones created during initialization.
+     * @param The kp::Constant to use to set the push constants to use in the
+     * next bindPush(...) calls. The constants provided must be of the same size
+     * as the ones created during initialization.
      */
     void setPush(const Constants& pushConstants);
 
     /**
      * Gets the current workgroup from the algorithm.
      *
-     * @param The kp::Constant to use to set the push constants to use in the next
-     * bindPush(...) calls. The constants provided must be of the same size as the
-     * ones created during initialization.
+     * @param The kp::Constant to use to set the push constants to use in the
+     * next bindPush(...) calls. The constants provided must be of the same size
+     * as the ones created during initialization.
      */
     const Workgroup& getWorkgroup();
     /**
@@ -1690,8 +1698,8 @@ class Sequence : public std::enable_shared_from_this<Sequence>
      * function also requires the Sequence to be recording, otherwise it will
      * not be able to add the operation.
      *
-     * @param op Object derived from kp::BaseOp that will be recoreded by the sequence
-     * which will be used when the operation is evaluated.
+     * @param op Object derived from kp::BaseOp that will be recoreded by the
+     * sequence which will be used when the operation is evaluated.
      * @return shared_ptr<Sequence> of the Sequence class itself
      */
     std::shared_ptr<Sequence> record(std::shared_ptr<OpBase> op);
@@ -1709,7 +1717,8 @@ class Sequence : public std::enable_shared_from_this<Sequence>
      */
     template<typename T, typename... TArgs>
     std::shared_ptr<Sequence> record(
-      std::vector<std::shared_ptr<Tensor>> tensors, TArgs&&... params)
+      std::vector<std::shared_ptr<Tensor>> tensors,
+      TArgs&&... params)
     {
         std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
         return this->record(op);
@@ -1744,8 +1753,9 @@ class Sequence : public std::enable_shared_from_this<Sequence>
     std::shared_ptr<Sequence> eval();
 
     /**
-     * Resets all the recorded and stored operations, records the operation 
-     * provided and submits into the gpu as a submit job synchronously (with a barrier).
+     * Resets all the recorded and stored operations, records the operation
+     * provided and submits into the gpu as a submit job synchronously (with a
+     * barrier).
      *
      * @return shared_ptr<Sequence> of the Sequence class itself
      */
@@ -1788,16 +1798,18 @@ class Sequence : public std::enable_shared_from_this<Sequence>
 
     /**
      * Eval Async sends all the recorded and stored operations in the vector of
-     * operations into the gpu as a submit job without a barrier. EvalAwait() must
-     * ALWAYS be called after to ensure the sequence is terminated correctly.
+     * operations into the gpu as a submit job without a barrier. EvalAwait()
+     * must ALWAYS be called after to ensure the sequence is terminated
+     * correctly.
      *
      * @return Boolean stating whether execution was successful.
      */
     std::shared_ptr<Sequence> evalAsync();
     /**
      * Clears currnet operations to record provided one in the vector of
-     * operations into the gpu as a submit job without a barrier. EvalAwait() must
-     * ALWAYS be called after to ensure the sequence is terminated correctly.
+     * operations into the gpu as a submit job without a barrier. EvalAwait()
+     * must ALWAYS be called after to ensure the sequence is terminated
+     * correctly.
      *
      * @return Boolean stating whether execution was successful.
      */
@@ -1891,9 +1903,9 @@ class Sequence : public std::enable_shared_from_this<Sequence>
     bool isInit();
 
     /**
-     * Clears command buffer and triggers re-record of all the current operations 
-     * saved, which is useful if the underlying kp::Tensors or kp::Algorithms
-     * are modified and need to be re-recorded.
+     * Clears command buffer and triggers re-record of all the current
+     * operations saved, which is useful if the underlying kp::Tensors or
+     * kp::Algorithms are modified and need to be re-recorded.
      */
     void rerecord();
 
@@ -1961,13 +1973,14 @@ class Manager
     Manager();
 
     /**
-     * Similar to base constructor but allows for further configuration to use when
-     * creating the Vulkan resources.
+     * Similar to base constructor but allows for further configuration to use
+     * when creating the Vulkan resources.
      *
      * @param physicalDeviceIndex The index of the physical device to use
      * @param familyQueueIndices (Optional) List of queue indices to add for
      * explicit allocation
-     * @param desiredExtensions The desired extensions to load from physicalDevice
+     * @param desiredExtensions The desired extensions to load from
+     * physicalDevice
      */
     Manager(uint32_t physicalDeviceIndex,
             const std::vector<uint32_t>& familyQueueIndices = {},
@@ -2001,7 +2014,8 @@ class Manager
      * If zero (default), disables latching of timestamps.
      * @returns Shared pointer with initialised sequence
      */
-    std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0, uint32_t totalTimestamps = 0);
+    std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0,
+                                       uint32_t totalTimestamps = 0);
 
     /**
      * Create a managed tensor that will be destroyed by this manager
@@ -2011,7 +2025,7 @@ class Manager
      * @param tensorType The type of tensor to initialize
      * @returns Shared pointer with initialised tensor
      */
-    template <typename T>
+    template<typename T>
     std::shared_ptr<TensorT<T>> tensorT(
       const std::vector<T>& data,
       Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
@@ -2042,8 +2056,13 @@ class Manager
       const Tensor::TensorDataTypes& dataType,
       Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
     {
-        std::shared_ptr<Tensor> tensor{ new kp::Tensor(
-          this->mPhysicalDevice, this->mDevice, data, elementTotalCount, elementMemorySize, dataType, tensorType) };
+        std::shared_ptr<Tensor> tensor{ new kp::Tensor(this->mPhysicalDevice,
+                                                       this->mDevice,
+                                                       data,
+                                                       elementTotalCount,
+                                                       elementMemorySize,
+                                                       dataType,
+                                                       tensorType) };
 
         if (this->mManageResources) {
             this->mManagedTensors.push_back(tensor);
diff --git a/src/OpAlgoDispatch.cpp b/src/OpAlgoDispatch.cpp
index 44908adb3..94502cc76 100644
--- a/src/OpAlgoDispatch.cpp
+++ b/src/OpAlgoDispatch.cpp
@@ -26,7 +26,7 @@ OpAlgoDispatch::record(const vk::CommandBuffer& commandBuffer)
     // Barrier to ensure the data is finished writing to buffer memory
     for (const std::shared_ptr<Tensor>& tensor :
          this->mAlgorithm->getTensors()) {
-        tensor->recordBufferMemoryBarrier(
+        tensor->recordPrimaryBufferMemoryBarrier(
           commandBuffer,
           vk::AccessFlagBits::eHostWrite,
           vk::AccessFlagBits::eShaderRead,
diff --git a/src/OpTensorCopy.cpp b/src/OpTensorCopy.cpp
index 33b9eb838..13e189a58 100644
--- a/src/OpTensorCopy.cpp
+++ b/src/OpTensorCopy.cpp
@@ -45,7 +45,7 @@ OpTensorCopy::record(const vk::CommandBuffer& commandBuffer)
     // We iterate from the second tensor onwards and record a copy to all
     for (size_t i = 1; i < this->mTensors.size(); i++) {
         this->mTensors[i]->recordCopyFrom(
-          commandBuffer, this->mTensors[0], false);
+          commandBuffer, this->mTensors[0]);
     }
 }
 
diff --git a/src/OpTensorSyncDevice.cpp b/src/OpTensorSyncDevice.cpp
index cd887c148..ad071d8d3 100644
--- a/src/OpTensorSyncDevice.cpp
+++ b/src/OpTensorSyncDevice.cpp
@@ -30,8 +30,7 @@ OpTensorSyncDevice::record(const vk::CommandBuffer& commandBuffer)
 
     for (size_t i = 0; i < this->mTensors.size(); i++) {
         if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mTensors[i]->recordCopyFromStagingToDevice(commandBuffer,
-                                                             false);
+            this->mTensors[i]->recordCopyFromStagingToDevice(commandBuffer);
         }
     }
 }
diff --git a/src/OpTensorSyncLocal.cpp b/src/OpTensorSyncLocal.cpp
index f7e15ffd5..70642b9de 100644
--- a/src/OpTensorSyncLocal.cpp
+++ b/src/OpTensorSyncLocal.cpp
@@ -30,8 +30,14 @@ OpTensorSyncLocal::record(const vk::CommandBuffer& commandBuffer)
 
     for (size_t i = 0; i < this->mTensors.size(); i++) {
         if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mTensors[i]->recordCopyFromDeviceToStaging(commandBuffer,
-                                                             true);
+
+            this->mTensors[i]->recordCopyFromDeviceToStaging(commandBuffer);
+
+            this->mTensors[i]->recordStagingBufferMemoryBarrier(commandBuffer,
+                                        vk::AccessFlagBits::eTransferWrite,
+                                        vk::AccessFlagBits::eHostRead,
+                                        vk::PipelineStageFlagBits::eTransfer,
+                                        vk::PipelineStageFlagBits::eHost);
         }
     }
 }
diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index 90c21fc8a..7e7af4ca4 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -70,8 +70,7 @@ Tensor::isInit()
 
 void
 Tensor::recordCopyFrom(const vk::CommandBuffer& commandBuffer,
-                       std::shared_ptr<Tensor> copyFromTensor,
-                       bool createBarrier)
+                       std::shared_ptr<Tensor> copyFromTensor)
 {
 
     vk::DeviceSize bufferSize(this->memorySize());
@@ -83,13 +82,11 @@ Tensor::recordCopyFrom(const vk::CommandBuffer& commandBuffer,
                            copyFromTensor->mPrimaryBuffer,
                            this->mPrimaryBuffer,
                            bufferSize,
-                           copyRegion,
-                           createBarrier);
+                           copyRegion);
 }
 
 void
-Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer,
-                                      bool createBarrier)
+Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer)
 {
     vk::DeviceSize bufferSize(this->memorySize());
     vk::BufferCopy copyRegion(0, 0, bufferSize);
@@ -100,13 +97,11 @@ Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer,
                            this->mStagingBuffer,
                            this->mPrimaryBuffer,
                            bufferSize,
-                           copyRegion,
-                           createBarrier);
+                           copyRegion);
 }
 
 void
-Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer,
-                                      bool createBarrier)
+Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer)
 {
     vk::DeviceSize bufferSize(this->memorySize());
     vk::BufferCopy copyRegion(0, 0, bufferSize);
@@ -117,8 +112,7 @@ Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer,
                            this->mPrimaryBuffer,
                            this->mStagingBuffer,
                            bufferSize,
-                           copyRegion,
-                           createBarrier);
+                           copyRegion);
 }
 
 void
@@ -126,24 +120,49 @@ Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer,
                          std::shared_ptr<vk::Buffer> bufferFrom,
                          std::shared_ptr<vk::Buffer> bufferTo,
                          vk::DeviceSize bufferSize,
-                         vk::BufferCopy copyRegion,
-                         bool createBarrier)
+                         vk::BufferCopy copyRegion)
 {
 
     commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion);
+}
 
-    if (createBarrier) {
-        // Buffer to ensure wait until data is copied to staging buffer
-        this->recordBufferMemoryBarrier(commandBuffer,
-                                        vk::AccessFlagBits::eTransferWrite,
-                                        vk::AccessFlagBits::eHostRead,
-                                        vk::PipelineStageFlagBits::eTransfer,
-                                        vk::PipelineStageFlagBits::eHost);
-    }
+void
+Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                  vk::AccessFlagBits srcAccessMask,
+                                  vk::AccessFlagBits dstAccessMask,
+                                  vk::PipelineStageFlagBits srcStageMask,
+                                  vk::PipelineStageFlagBits dstStageMask)
+{
+    KP_LOG_DEBUG("Kompute Tensor recording PRIMARY buffer memory barrier");
+
+    this->recordBufferMemoryBarrier(commandBuffer,
+            *this->mPrimaryBuffer,
+            srcAccessMask,
+            dstAccessMask,
+            srcStageMask,
+            dstStageMask);
+}
+
+void
+Tensor::recordStagingBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                  vk::AccessFlagBits srcAccessMask,
+                                  vk::AccessFlagBits dstAccessMask,
+                                  vk::PipelineStageFlagBits srcStageMask,
+                                  vk::PipelineStageFlagBits dstStageMask)
+{
+    KP_LOG_DEBUG("Kompute Tensor recording PRIMARY buffer memory barrier");
+
+    this->recordBufferMemoryBarrier(commandBuffer,
+            *this->mStagingBuffer,
+            srcAccessMask,
+            dstAccessMask,
+            srcStageMask,
+            dstStageMask);
 }
 
 void
 Tensor::recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                  const vk::Buffer& buffer,
                                   vk::AccessFlagBits srcAccessMask,
                                   vk::AccessFlagBits dstAccessMask,
                                   vk::PipelineStageFlagBits srcStageMask,
@@ -154,7 +173,7 @@ Tensor::recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
     vk::DeviceSize bufferSize = this->memorySize();
 
     vk::BufferMemoryBarrier bufferMemoryBarrier;
-    bufferMemoryBarrier.buffer = *this->mPrimaryBuffer;
+    bufferMemoryBarrier.buffer = buffer;
     bufferMemoryBarrier.size = bufferSize;
     bufferMemoryBarrier.srcAccessMask = srcAccessMask;
     bufferMemoryBarrier.dstAccessMask = dstAccessMask;
diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp
index dc4a4f51a..6cd75996a 100644
--- a/src/include/kompute/Tensor.hpp
+++ b/src/include/kompute/Tensor.hpp
@@ -97,12 +97,9 @@ class Tensor
      *
      * @param commandBuffer Vulkan Command Buffer to record the commands into
      * @param copyFromTensor Tensor to copy the data from
-     * @param createBarrier Whether to create a barrier that ensures the data is
-     * copied before further operations. Default is true.
      */
     void recordCopyFrom(const vk::CommandBuffer& commandBuffer,
-                        std::shared_ptr<Tensor> copyFromTensor,
-                        bool createBarrier);
+                        std::shared_ptr<Tensor> copyFromTensor);
 
     /**
      * Records a copy from the internal staging memory to the device memory
@@ -110,11 +107,8 @@ class Tensor
      * only be relevant for kp::Tensors of type eDevice.
      *
      * @param commandBuffer Vulkan Command Buffer to record the commands into
-     * @param createBarrier Whether to create a barrier that ensures the data is
-     * copied before further operations. Default is true.
      */
-    void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer,
-                                       bool createBarrier);
+    void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer);
 
     /**
      * Records a copy from the internal device memory to the staging memory
@@ -122,14 +116,11 @@ class Tensor
      * only be relevant for kp::Tensors of type eDevice.
      *
      * @param commandBuffer Vulkan Command Buffer to record the commands into
-     * @param createBarrier Whether to create a barrier that ensures the data is
-     * copied before further operations. Default is true.
      */
-    void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer,
-                                       bool createBarrier);
+    void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer);
 
     /**
-     * Records the buffer memory barrier into the command buffer which
+     * Records the buffer memory barrier into the primary buffer and command buffer which
      * ensures that relevant data transfers are carried out correctly.
      *
      * @param commandBuffer Vulkan Command Buffer to record the commands into
@@ -138,11 +129,27 @@ class Tensor
      * @param scrStageMask Pipeline stage flags for source stage mask
      * @param dstStageMask Pipeline stage flags for destination stage mask
      */
-    void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
-                                   vk::AccessFlagBits srcAccessMask,
-                                   vk::AccessFlagBits dstAccessMask,
-                                   vk::PipelineStageFlagBits srcStageMask,
-                                   vk::PipelineStageFlagBits dstStageMask);
+    void recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                          vk::AccessFlagBits srcAccessMask,
+                                          vk::AccessFlagBits dstAccessMask,
+                                          vk::PipelineStageFlagBits srcStageMask,
+                                          vk::PipelineStageFlagBits dstStageMask);
+    /**
+     * Records the buffer memory barrier into the staging buffer and command buffer which
+     * ensures that relevant data transfers are carried out correctly.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param srcAccessMask Access flags for source access mask
+     * @param dstAccessMask Access flags for destination access mask
+     * @param scrStageMask Pipeline stage flags for source stage mask
+     * @param dstStageMask Pipeline stage flags for destination stage mask
+     */
+    void recordStagingBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                          vk::AccessFlagBits srcAccessMask,
+                                          vk::AccessFlagBits dstAccessMask,
+                                          vk::PipelineStageFlagBits srcStageMask,
+                                          vk::PipelineStageFlagBits dstStageMask);
+
 
     /**
      * Constructs a vulkan descriptor buffer info which can be used to specify
@@ -284,8 +291,13 @@ class Tensor
                           std::shared_ptr<vk::Buffer> bufferFrom,
                           std::shared_ptr<vk::Buffer> bufferTo,
                           vk::DeviceSize bufferSize,
-                          vk::BufferCopy copyRegion,
-                          bool createBarrier);
+                          vk::BufferCopy copyRegion);
+    void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
+                                   const vk::Buffer& buffer,
+                                   vk::AccessFlagBits srcAccessMask,
+                                   vk::AccessFlagBits dstAccessMask,
+                                   vk::PipelineStageFlagBits srcStageMask,
+                                   vk::PipelineStageFlagBits dstStageMask);
 
     // Private util functions
     vk::BufferUsageFlags getPrimaryBufferUsageFlags();

From 0ea3a7912375253eb2e51030c946ba462b4ac48a Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Tue, 9 Mar 2021 08:52:43 +0000
Subject: [PATCH 2/5] Updated access masks for the pipeline barriers

---
 src/OpAlgoDispatch.cpp    |  4 ++--
 src/OpTensorSyncLocal.cpp | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/OpAlgoDispatch.cpp b/src/OpAlgoDispatch.cpp
index 94502cc76..517a70d52 100644
--- a/src/OpAlgoDispatch.cpp
+++ b/src/OpAlgoDispatch.cpp
@@ -28,9 +28,9 @@ OpAlgoDispatch::record(const vk::CommandBuffer& commandBuffer)
          this->mAlgorithm->getTensors()) {
         tensor->recordPrimaryBufferMemoryBarrier(
           commandBuffer,
-          vk::AccessFlagBits::eHostWrite,
+          vk::AccessFlagBits::eTransferWrite,
           vk::AccessFlagBits::eShaderRead,
-          vk::PipelineStageFlagBits::eHost,
+          vk::PipelineStageFlagBits::eTransfer,
           vk::PipelineStageFlagBits::eComputeShader);
     }
 
diff --git a/src/OpTensorSyncLocal.cpp b/src/OpTensorSyncLocal.cpp
index 70642b9de..8a99c780b 100644
--- a/src/OpTensorSyncLocal.cpp
+++ b/src/OpTensorSyncLocal.cpp
@@ -31,13 +31,13 @@ OpTensorSyncLocal::record(const vk::CommandBuffer& commandBuffer)
     for (size_t i = 0; i < this->mTensors.size(); i++) {
         if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
 
-            this->mTensors[i]->recordCopyFromDeviceToStaging(commandBuffer);
+            this->mTensors[i]->recordPrimaryBufferMemoryBarrier(commandBuffer,
+                                        vk::AccessFlagBits::eShaderWrite,
+                                        vk::AccessFlagBits::eTransferRead,
+                                        vk::PipelineStageFlagBits::eComputeShader,
+                                        vk::PipelineStageFlagBits::eTransfer);
 
-            this->mTensors[i]->recordStagingBufferMemoryBarrier(commandBuffer,
-                                        vk::AccessFlagBits::eTransferWrite,
-                                        vk::AccessFlagBits::eHostRead,
-                                        vk::PipelineStageFlagBits::eTransfer,
-                                        vk::PipelineStageFlagBits::eHost);
+            this->mTensors[i]->recordCopyFromDeviceToStaging(commandBuffer);
         }
     }
 }

From c9bd406c8b2fb268459da5e8716bb59c25b0f95b Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Tue, 9 Mar 2021 10:00:33 +0000
Subject: [PATCH 3/5] Added barrier

---
 src/OpTensorSyncLocal.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/OpTensorSyncLocal.cpp b/src/OpTensorSyncLocal.cpp
index 8a99c780b..5e653154a 100644
--- a/src/OpTensorSyncLocal.cpp
+++ b/src/OpTensorSyncLocal.cpp
@@ -38,6 +38,12 @@ OpTensorSyncLocal::record(const vk::CommandBuffer& commandBuffer)
                                         vk::PipelineStageFlagBits::eTransfer);
 
             this->mTensors[i]->recordCopyFromDeviceToStaging(commandBuffer);
+
+            this->mTensors[i]->recordPrimaryBufferMemoryBarrier(commandBuffer,
+                                        vk::AccessFlagBits::eTransferWrite,
+                                        vk::AccessFlagBits::eHostRead,
+                                        vk::PipelineStageFlagBits::eTransfer,
+                                        vk::PipelineStageFlagBits::eHost);
         }
     }
 }

From d7b98f149b41f3877e59f5b8dc4863cfff43d3c9 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Tue, 9 Mar 2021 22:11:54 +0000
Subject: [PATCH 4/5] Added OpMemoryBarrier to the available operations

---
 single_include/AggregateHeaders.cpp           |  1 +
 single_include/kompute/Kompute.hpp            | 71 +++++++++++++++++
 src/OpMemoryBarrier.cpp                       | 69 ++++++++++++++++
 .../kompute/operations/OpMemoryBarrier.hpp    | 78 +++++++++++++++++++
 4 files changed, 219 insertions(+)
 create mode 100644 src/OpMemoryBarrier.cpp
 create mode 100644 src/include/kompute/operations/OpMemoryBarrier.hpp

diff --git a/single_include/AggregateHeaders.cpp b/single_include/AggregateHeaders.cpp
index 25dc04edb..e4d1014e9 100644
--- a/single_include/AggregateHeaders.cpp
+++ b/single_include/AggregateHeaders.cpp
@@ -6,6 +6,7 @@
 #include "kompute/Tensor.hpp"
 #include "kompute/Algorithm.hpp"
 #include "kompute/operations/OpBase.hpp"
+#include "kompute/operations/OpMemoryBarrier.hpp"
 #include "kompute/operations/OpTensorCopy.hpp"
 #include "kompute/operations/OpTensorSyncDevice.hpp"
 #include "kompute/operations/OpTensorSyncLocal.hpp"
diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index fc7d697fb..2353f62b8 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -1388,6 +1388,77 @@ class OpBase
 
 namespace kp {
 
+/**
+ * Operation that provides a general abstraction that simplifies the use of 
+ * algorithm and parameter components which can be used with shaders.
+ * It exposes the pipeline barrier functionality specifically for memory
+ * barriers that can be configured through the respective source and destination
+ * masks
+ */
+class OpMemoryBarrier : public OpBase
+{
+  public:
+
+    /**
+     * Constructor that stores tensors as well as memory barrier parameters to be
+     * used to create a pipeline barrier on the respective primary or staging tensor.
+     *
+     * @param tensors The tensors to apply the memory barriers on
+     * @param srcAccessMask The kp::AccessFlagBits for the source access mask
+     * @param dstAccessMask The kp::AccessFlagBits for the destination access mask
+     * @param srcStageMask The kp::PipelineStageFlagBits for the source stage mask
+     * @param dstStageMask The kp::PipelineStageFlagBits for the destination stage mask
+     * @param barrierOnPrimary Boolean to select primary or secondary buffers on tensors
+     */
+    OpMemoryBarrier(
+            const std::vector<std::shared_ptr<Tensor>>& tensors,
+            const vk::AccessFlagBits& srcAccessMask,
+            const vk::AccessFlagBits& dstAccessMask,
+            const vk::PipelineStageFlagBits& srcStageMask,
+            const vk::PipelineStageFlagBits& dstStageMask,
+            bool barrierOnPrimary = true);
+
+    /**
+     * Default destructor, which is in charge of destroying the reference to the tensors
+     * and all the relevant access / stage masks created
+     */
+    virtual ~OpMemoryBarrier() override;
+
+    /**
+     * This records the memory barrier with the access and stage masks provided
+     * across all relevant tensors.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+private:
+    const vk::AccessFlagBits mSrcAccessMask;
+    const vk::AccessFlagBits mDstAccessMask;
+    const vk::PipelineStageFlagBits mSrcStageMask;
+    const vk::PipelineStageFlagBits mDstStageMask;
+    const bool mBarrierOnPrimary;
+    const std::vector<std::shared_ptr<Tensor>> mTensors;
+};
+
+} // End namespace kp
+
+namespace kp {
+
 /**
  * Operation that copies the data from the first tensor to the rest of the tensors 
  * provided, using a record command for all the vectors. This operation does not 
diff --git a/src/OpMemoryBarrier.cpp b/src/OpMemoryBarrier.cpp
new file mode 100644
index 000000000..9e45d0c4e
--- /dev/null
+++ b/src/OpMemoryBarrier.cpp
@@ -0,0 +1,69 @@
+#pragma once
+
+#include "kompute/operations/OpMemoryBarrier.hpp"
+
+namespace kp {
+
+OpMemoryBarrier::OpMemoryBarrier(
+            const std::vector<std::shared_ptr<Tensor>>& tensors,
+            const vk::AccessFlagBits& srcAccessMask,
+            const vk::AccessFlagBits& dstAccessMask,
+            const vk::PipelineStageFlagBits& srcStageMask,
+            const vk::PipelineStageFlagBits& dstStageMask,
+            bool barrierOnPrimary)
+    : mTensors(tensors),
+      mSrcAccessMask(srcAccessMask),
+      mDstAccessMask(dstAccessMask),
+      mSrcStageMask(srcStageMask),
+      mDstStageMask(dstStageMask),
+      mBarrierOnPrimary(barrierOnPrimary)
+{
+    KP_LOG_DEBUG("Kompute OpMemoryBarrier constructor");
+
+}
+
+OpMemoryBarrier::~OpMemoryBarrier()
+{
+    KP_LOG_DEBUG("Kompute OpMemoryBarrier destructor started");
+}
+
+void
+OpMemoryBarrier::record(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpMemoryBarrier record called");
+
+    // Barrier to ensure the data is finished writing to buffer memory
+    if (this->mBarrierOnPrimary) {
+        for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
+            tensor->recordPrimaryBufferMemoryBarrier(
+              commandBuffer,
+              this->mSrcAccessMask,
+              this->mDstAccessMask,
+              this->mSrcStageMask,
+              this->mDstStageMask);
+        }
+    } else {
+        for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
+            tensor->recordStagingBufferMemoryBarrier(
+              commandBuffer,
+              this->mSrcAccessMask,
+              this->mDstAccessMask,
+              this->mSrcStageMask,
+              this->mDstStageMask);
+        }
+    }
+}
+
+void
+OpMemoryBarrier::preEval(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpMemoryBarrier preEval called");
+}
+
+void
+OpMemoryBarrier::postEval(const vk::CommandBuffer& commandBuffer)
+{
+    KP_LOG_DEBUG("Kompute OpMemoryBarrier postSubmit called");
+}
+
+}
diff --git a/src/include/kompute/operations/OpMemoryBarrier.hpp b/src/include/kompute/operations/OpMemoryBarrier.hpp
new file mode 100644
index 000000000..6ae7100db
--- /dev/null
+++ b/src/include/kompute/operations/OpMemoryBarrier.hpp
@@ -0,0 +1,78 @@
+#pragma once
+
+#include "kompute/Core.hpp"
+#include "kompute/Algorithm.hpp"
+#include "kompute/Tensor.hpp"
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+ * Operation that provides a general abstraction that simplifies the use of 
+ * algorithm and parameter components which can be used with shaders.
+ * It exposes the pipeline barrier functionality specifically for memory
+ * barriers that can be configured through the respective source and destination
+ * masks
+ */
+class OpMemoryBarrier : public OpBase
+{
+  public:
+
+    /**
+     * Constructor that stores tensors as well as memory barrier parameters to be
+     * used to create a pipeline barrier on the respective primary or staging tensor.
+     *
+     * @param tensors The tensors to apply the memory barriers on
+     * @param srcAccessMask The kp::AccessFlagBits for the source access mask
+     * @param dstAccessMask The kp::AccessFlagBits for the destination access mask
+     * @param srcStageMask The kp::PipelineStageFlagBits for the source stage mask
+     * @param dstStageMask The kp::PipelineStageFlagBits for the destination stage mask
+     * @param barrierOnPrimary Boolean to select primary or secondary buffers on tensors
+     */
+    OpMemoryBarrier(
+            const std::vector<std::shared_ptr<Tensor>>& tensors,
+            const vk::AccessFlagBits& srcAccessMask,
+            const vk::AccessFlagBits& dstAccessMask,
+            const vk::PipelineStageFlagBits& srcStageMask,
+            const vk::PipelineStageFlagBits& dstStageMask,
+            bool barrierOnPrimary = true);
+
+    /**
+     * Default destructor, which is in charge of destroying the reference to the tensors
+     * and all the relevant access / stage masks created
+     */
+    virtual ~OpMemoryBarrier() override;
+
+    /**
+     * This records the memory barrier with the access and stage masks provided
+     * across all relevant tensors.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void record(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
+
+    /**
+     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
+     */
+    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
+
+private:
+    const vk::AccessFlagBits mSrcAccessMask;
+    const vk::AccessFlagBits mDstAccessMask;
+    const vk::PipelineStageFlagBits mSrcStageMask;
+    const vk::PipelineStageFlagBits mDstStageMask;
+    const bool mBarrierOnPrimary;
+    const std::vector<std::shared_ptr<Tensor>> mTensors;
+};
+
+} // End namespace kp
+

From 4e86562cff4c3fc3737160c93a9f9aebe41f089f Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 13 Mar 2021 07:40:20 +0000
Subject: [PATCH 5/5] Updated staging memory barrier to ensure it's always
 coherent

---
 src/Tensor.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index 7e7af4ca4..6ea92ccd6 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -258,7 +258,8 @@ Tensor::getStagingMemoryPropertyFlags()
 {
     switch (this->mTensorType) {
         case TensorTypes::eDevice:
-            return vk::MemoryPropertyFlagBits::eHostVisible;
+            return vk::MemoryPropertyFlagBits::eHostVisible |
+                   vk::MemoryPropertyFlagBits::eHostCoherent;
             break;
         default:
             throw std::runtime_error("Kompute Tensor invalid tensor type");