diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index cb13b744f..f85285e2d 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -246,8 +246,12 @@ class Tensor
      * Records a copy from the memory of the tensor provided to the current
      * thensor. This is intended to pass memory into a processing, to perform
      * a staging buffer transfer, or to gather output (between others).
+     *
+     * @param copyFromTensor Tensor to copy the data from
+     * @param createBarrier Whether to record a memory barrier after the copy so that the transfer write completes before subsequent host reads. Default is true.
      */
-    void recordCopyFrom(std::shared_ptr<Tensor> copyFromTensor);
+    void recordCopyFrom(std::shared_ptr<Tensor> copyFromTensor,
+            bool createBarrier = true);
 
     /**
      * Records the buffer memory barrier into the command buffer which
@@ -1077,7 +1081,6 @@ OpMult<tX, tY, tZ>::init()
 #endif
 
     SPDLOG_DEBUG("Kompute OpMult Initialising algorithm component");
-    SPDLOG_DEBUG("Kompute vector size {}", shaderFileData.size());
 
     this->mAlgorithm->init(shaderFileData, this->mTensors);
 }
@@ -1103,11 +1106,6 @@ OpMult<tX, tY, tZ>::record()
     this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
 
     // Barrier to ensure the shader code is executed before buffer read
-    this->mTensorLHS->recordBufferMemoryBarrier(
-      vk::AccessFlagBits::eShaderWrite,
-      vk::AccessFlagBits::eTransferRead,
-      vk::PipelineStageFlagBits::eComputeShader,
-      vk::PipelineStageFlagBits::eTransfer);
     this->mTensorOutput->recordBufferMemoryBarrier(
       vk::AccessFlagBits::eShaderWrite,
       vk::AccessFlagBits::eTransferRead,
@@ -1115,18 +1113,6 @@ OpMult<tX, tY, tZ>::record()
       vk::PipelineStageFlagBits::eTransfer);
 
     this->mTensorOutputStaging->recordCopyFrom(this->mTensorOutput);
-
-    // Buffer to ensure wait until data is copied to staging buffer
-    this->mTensorLHS->recordBufferMemoryBarrier(
-      vk::AccessFlagBits::eTransferWrite,
-      vk::AccessFlagBits::eHostRead,
-      vk::PipelineStageFlagBits::eTransfer,
-      vk::PipelineStageFlagBits::eHost);
-    this->mTensorOutput->recordBufferMemoryBarrier(
-      vk::AccessFlagBits::eTransferWrite,
-      vk::AccessFlagBits::eHostRead,
-      vk::PipelineStageFlagBits::eTransfer,
-      vk::PipelineStageFlagBits::eHost);
 }
 
 template<uint32_t tX, uint32_t tY, uint32_t tZ>
diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index 04e8f740e..e1b85a42e 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -95,7 +95,7 @@ Tensor::setData(const std::vector<uint32_t>& data)
 }
 
 void
-Tensor::recordCopyFrom(std::shared_ptr<Tensor> copyFromTensor)
+Tensor::recordCopyFrom(std::shared_ptr<Tensor> copyFromTensor, bool createBarrier)
 {
     SPDLOG_DEBUG("Kompute Tensor recordCopyFrom called");
 
@@ -114,6 +114,15 @@ Tensor::recordCopyFrom(std::shared_ptr<Tensor> copyFromTensor)
     // TODO: Ensure command buffer is in same device from buffer
     this->mCommandBuffer->copyBuffer(
       *copyFromTensor->mBuffer, *this->mBuffer, copyRegion);
+
+    if (createBarrier) {
+        // Barrier to ensure the transfer write of the copy completes before any host read
+        this->recordBufferMemoryBarrier(
+          vk::AccessFlagBits::eTransferWrite,
+          vk::AccessFlagBits::eHostRead,
+          vk::PipelineStageFlagBits::eTransfer,
+          vk::PipelineStageFlagBits::eHost);
+        }
 }
 
 void
diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp
index 6e16af85e..0ff811300 100644
--- a/src/include/kompute/Tensor.hpp
+++ b/src/include/kompute/Tensor.hpp
@@ -96,8 +96,12 @@ class Tensor
      * Records a copy from the memory of the tensor provided to the current
      * thensor. This is intended to pass memory into a processing, to perform
      * a staging buffer transfer, or to gather output (between others).
+     *
+     * @param copyFromTensor Tensor to copy the data from
+     * @param createBarrier Whether to record a memory barrier after the copy so that the transfer write completes before subsequent host reads. Default is true.
      */
-    void recordCopyFrom(std::shared_ptr<Tensor> copyFromTensor);
+    void recordCopyFrom(std::shared_ptr<Tensor> copyFromTensor,
+            bool createBarrier = true);
 
     /**
      * Records the buffer memory barrier into the command buffer which
diff --git a/src/include/kompute/operations/OpMult.hpp b/src/include/kompute/operations/OpMult.hpp
index fdc7a3282..32128643d 100644
--- a/src/include/kompute/operations/OpMult.hpp
+++ b/src/include/kompute/operations/OpMult.hpp
@@ -198,11 +198,6 @@ OpMult<tX, tY, tZ>::record()
     this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
 
     // Barrier to ensure the shader code is executed before buffer read
-    this->mTensorLHS->recordBufferMemoryBarrier(
-      vk::AccessFlagBits::eShaderWrite,
-      vk::AccessFlagBits::eTransferRead,
-      vk::PipelineStageFlagBits::eComputeShader,
-      vk::PipelineStageFlagBits::eTransfer);
     this->mTensorOutput->recordBufferMemoryBarrier(
       vk::AccessFlagBits::eShaderWrite,
       vk::AccessFlagBits::eTransferRead,
@@ -210,18 +205,6 @@ OpMult<tX, tY, tZ>::record()
       vk::PipelineStageFlagBits::eTransfer);
 
     this->mTensorOutputStaging->recordCopyFrom(this->mTensorOutput);
-
-    // Buffer to ensure wait until data is copied to staging buffer
-    this->mTensorLHS->recordBufferMemoryBarrier(
-      vk::AccessFlagBits::eTransferWrite,
-      vk::AccessFlagBits::eHostRead,
-      vk::PipelineStageFlagBits::eTransfer,
-      vk::PipelineStageFlagBits::eHost);
-    this->mTensorOutput->recordBufferMemoryBarrier(
-      vk::AccessFlagBits::eTransferWrite,
-      vk::AccessFlagBits::eHostRead,
-      vk::PipelineStageFlagBits::eTransfer,
-      vk::PipelineStageFlagBits::eHost);
 }
 
 template<uint32_t tX, uint32_t tY, uint32_t tZ>