diff --git a/README.md b/README.md
index 8adfd6c93..eb438a473 100644
--- a/README.md
+++ b/README.md
@@ -61,8 +61,11 @@ int main() {
     kp::Manager mgr; // Selects device 0 unless explicitly requested
 
     // Creates tensor an initializes GPU memory (below we show more granularity)
-    auto tensorA = mgr.buildTensor({ 3, 4, 5 });
-    auto tensorB = mgr.buildTensor({ 0, 0, 0 });
+    auto tensorA = std::make_shared<kp::Tensor>(kp::Tensor({ 3., 4., 5. }));
+    auto tensorB = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 0., 0. }));
+
+    // Create the tensors' data explicitly on the GPU with an operation
+    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
 
     // Define your shader as a string (using string literals for simplicity)
     // (You can also pass the raw compiled bytes, or even path to file)
@@ -82,11 +85,13 @@ int main() {
     )");
 
     // Run Kompute operation on the parameters provided with dispatch layout
-    mgr.evalOpDefault<kp::OpMult<3, 1, 1>>(
+    mgr.evalOpDefault<kp::OpAlgoBase<3, 1, 1>>(
         { tensorA, tensorB }, 
-        true, // Whether to retrieve the output from GPU memory
         std::vector<char>(shader.begin(), shader.end()));
 
+    // Sync the GPU memory back to the local tensor
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
+
     // Prints the output which is A: { 0, 1, 2 } B: { 3, 4, 5 }
     std::cout << fmt::format("A: {}, B: {}", 
         tensorA.data(), tensorB.data()) << std::endl;
@@ -107,7 +112,7 @@ class OpMyCustom : public OpAlgoBase<tX, tY, tZ>
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>> tensors)
-      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true, "")
+      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
     {
         // Perform your custom steps such as reading from a shader file
         this->mShaderFilePath = "shaders/glsl/opmult.comp";
@@ -144,7 +149,7 @@ int main() {
     kp::Manager mgr;
 
     std::shared_ptr<kp::Tensor> tensorLHS{ new kp::Tensor({ 1., 1., 1. }) };
-    std::shared_ptr<kp::Tensor> tensorRHS{ new kp::Tensor( { 2., 2., 2. }) };
+    std::shared_ptr<kp::Tensor> tensorRHS{ new kp::Tensor({ 2., 2., 2. }) };
     std::shared_ptr<kp::Tensor> tensorOutput{ new kp::Tensor({ 0., 0., 0. }) };
 
     // Create all the tensors in memory
@@ -159,17 +164,23 @@ int main() {
         sq.begin();
 
         // Record batch commands to send to GPU
-        sq.record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
-        sq.record<kp::OpTensorCopy>({tensorOutput, tensorLHS, tensorRHS});
+        sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
+        sq->record<kp::OpTensorCopy>({tensorOutput, tensorLHS, tensorRHS});
 
         // Stop recording
-        sq.end();
+        sq->end();
 
         // Submit multiple batch operations to GPU
         size_t ITERATIONS = 5;
         for (size_t i = 0; i < ITERATIONS; i++) {
-            sq.eval();
+            sq->eval();
         }
+
+        // Sync GPU memory back to local tensor
+        sq->begin();
+        sq->record<kp::OpTensorSyncLocal>({tensorOutput});
+        sq->end();
+        sq->eval();
     }
 
     // Print the output which iterates through OpMult 5 times
diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index afc873810..f4acff5af 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -971,11 +971,6 @@ namespace kp {
  *
  * All of these tensors are expected to be initlaised and this is checked with throw std exception in the init function.
  *
- * It is possible to also choose if the user requires all of the tensors to be
- * copied from device memory to their host data. This can be disabled by either
- * passing the copyOutputData constructor parameter and/or by overriding the 
- * functions to carry out copy commands accordingly. 
- *
  * See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
  * 
  * The template parameters specify the processing GPU layout number of
@@ -1000,14 +995,12 @@ class OpAlgoBase : public OpBase
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
-     * @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
      * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors,
-           bool copyOutputData);
+           std::vector<std::shared_ptr<Tensor>>& tensors);
 
     /**
      * Constructor that enables a file to be passed to the operation with
@@ -1018,14 +1011,12 @@ class OpAlgoBase : public OpBase
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
-     * @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
      * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>>& tensors,
-           bool copyOutputData,
            std::string shaderFilePath);
 
     /**
@@ -1036,14 +1027,12 @@ class OpAlgoBase : public OpBase
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
-     * @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
      * @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>>& tensors,
-           bool copyOutputData,
            const std::vector<char>& shaderDataRaw);
 
     /**
@@ -1090,8 +1079,6 @@ class OpAlgoBase : public OpBase
     bool mFreeAlgorithm = false;
 
     // -------------- ALWAYS OWNED RESOURCES
-    std::vector<std::shared_ptr<Tensor>> mOutputStagingTensors; ///< Array of output staging tensors which will be expected to be the same size as the number of inputs.
-    bool mCopyOutputData; ///< Configuration parameter which states whether data will be copied back to all provided tensors for convenience. This can be deactivated by setting this flag and or overriding the functions provided.
 
     uint32_t mX;
     uint32_t mY;
@@ -1121,11 +1108,10 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
 OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
                            std::shared_ptr<vk::Device> device,
                            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           bool copyOutputData)
+                           std::vector<std::shared_ptr<Tensor>>& tensors)
   : OpBase(physicalDevice, device, commandBuffer, tensors, false)
 {
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {} copyOutputData: {}, shaderFilePath: {}", tensors.size(), copyOutputData);
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
 
     // The dispatch size is set up based on either explicitly provided template
     // parameters or by default it would take the shape and size of the tensors
@@ -1145,8 +1131,6 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                  this->mY,
                  this->mZ);
 
-    this->mCopyOutputData = copyOutputData;
-
     this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
 }
 
@@ -1155,9 +1139,8 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                            std::shared_ptr<vk::Device> device,
                            std::shared_ptr<vk::CommandBuffer> commandBuffer,
                            std::vector<std::shared_ptr<Tensor>>& tensors,
-                           bool copyOutputData,
                            std::string shaderFilePath)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, copyOutputData)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
 {
     SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
 
@@ -1169,9 +1152,8 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                            std::shared_ptr<vk::Device> device,
                            std::shared_ptr<vk::CommandBuffer> commandBuffer,
                            std::vector<std::shared_ptr<Tensor>>& tensors,
-                           bool copyOutputData,
                            const std::vector<char>& shaderDataRaw)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, copyOutputData)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
 {
     SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
 
@@ -1182,13 +1164,6 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
 OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
 {
     SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
-
-    if (this->mCopyOutputData) {
-        SPDLOG_DEBUG("Kompute OpAlgoBase destroying staging tensors");
-        for (std::shared_ptr<Tensor> stagingTensor : this->mOutputStagingTensors) {
-            stagingTensor->freeMemoryDestroyGPUResources();
-        }
-    }
 }
 
 template<uint32_t tX, uint32_t tY, uint32_t tZ>
@@ -1208,18 +1183,6 @@ OpAlgoBase<tX, tY, tZ>::init()
         }
     }
 
-    if (this->mCopyOutputData) {
-        SPDLOG_DEBUG("Kompute OpAlgoBase creating staging output tensors");
-
-        for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
-              tensor->data(), Tensor::TensorTypes::eStaging);
-            stagingTensor->init(
-                this->mPhysicalDevice, this->mDevice);
-            this->mOutputStagingTensors.push_back(stagingTensor);
-        }
-    }
-
     SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
 
     std::vector<char>& shaderFileData = this->fetchSpirvBinaryData();
@@ -1246,27 +1209,6 @@ OpAlgoBase<tX, tY, tZ>::record()
     }
 
     this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-
-    if (this->mCopyOutputData) {
-        // Barrier to ensure the shader code is executed before buffer read
-        for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
-            tensor->recordBufferMemoryBarrier(
-              this->mCommandBuffer,
-              vk::AccessFlagBits::eShaderWrite,
-              vk::AccessFlagBits::eTransferRead,
-              vk::PipelineStageFlagBits::eComputeShader,
-              vk::PipelineStageFlagBits::eTransfer);
-        }
-
-        // Record copy from and create barrier for STAGING tensors
-        // TODO: This only accounts for device tensors need to account for staging and storage
-        for (size_t i = 0; i < this->mTensors.size(); i++) {
-            this->mOutputStagingTensors[i]->recordCopyFrom(
-                this->mCommandBuffer,
-                this->mTensors[i], 
-                true);
-        }
-    }
 }
 
 template<uint32_t tX, uint32_t tY, uint32_t tZ>
@@ -1281,14 +1223,6 @@ void
 OpAlgoBase<tX, tY, tZ>::postEval()
 {
     SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
-
-    if (this->mCopyOutputData) {
-        for (size_t i = 0; i < this->mTensors.size(); i++) {
-            this->mOutputStagingTensors[i]->mapDataFromHostMemory();
-
-            this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data());
-        }
-    }
 }
 
 template<uint32_t tX, uint32_t tY, uint32_t tZ>
@@ -1429,7 +1363,7 @@ OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice>
   // The inheritance is initialised with the copyOutputData to false given that
   // this depencendant class handles the transfer of data via staging buffers in 
   // a granular way.
-  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, false)
+  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
 {
     SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
 }
@@ -1575,7 +1509,7 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>> tensors)
-      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true, "")
+      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
     {
         SPDLOG_DEBUG("Kompute OpMult constructor with params");
 
diff --git a/src/include/kompute/operations/OpAlgoBase.hpp b/src/include/kompute/operations/OpAlgoBase.hpp
index 417a05550..03109ec34 100644
--- a/src/include/kompute/operations/OpAlgoBase.hpp
+++ b/src/include/kompute/operations/OpAlgoBase.hpp
@@ -21,11 +21,6 @@ namespace kp {
  *
  * All of these tensors are expected to be initlaised and this is checked with throw std exception in the init function.
  *
- * It is possible to also choose if the user requires all of the tensors to be
- * copied from device memory to their host data. This can be disabled by either
- * passing the copyOutputData constructor parameter and/or by overriding the 
- * functions to carry out copy commands accordingly. 
- *
  * See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
  * 
  * The template parameters specify the processing GPU layout number of
@@ -50,14 +45,12 @@ class OpAlgoBase : public OpBase
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
-     * @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
      * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors,
-           bool copyOutputData);
+           std::vector<std::shared_ptr<Tensor>>& tensors);
 
     /**
      * Constructor that enables a file to be passed to the operation with
@@ -68,14 +61,12 @@ class OpAlgoBase : public OpBase
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
-     * @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
      * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>>& tensors,
-           bool copyOutputData,
            std::string shaderFilePath);
 
     /**
@@ -86,14 +77,12 @@ class OpAlgoBase : public OpBase
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
-     * @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
      * @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>>& tensors,
-           bool copyOutputData,
            const std::vector<char>& shaderDataRaw);
 
     /**
@@ -141,8 +130,6 @@ class OpAlgoBase : public OpBase
     bool mFreeAlgorithm = false;
 
     // -------------- ALWAYS OWNED RESOURCES
-    std::vector<std::shared_ptr<Tensor>> mOutputStagingTensors; ///< Array of output staging tensors which will be expected to be the same size as the number of inputs.
-    bool mCopyOutputData; ///< Configuration parameter which states whether data will be copied back to all provided tensors for convenience. This can be deactivated by setting this flag and or overriding the functions provided.
 
     uint32_t mX;
     uint32_t mY;
@@ -172,11 +159,10 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
 OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
                            std::shared_ptr<vk::Device> device,
                            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           bool copyOutputData)
+                           std::vector<std::shared_ptr<Tensor>>& tensors)
   : OpBase(physicalDevice, device, commandBuffer, tensors, false)
 {
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {} copyOutputData: {}, shaderFilePath: {}", tensors.size(), copyOutputData);
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
 
     // The dispatch size is set up based on either explicitly provided template
     // parameters or by default it would take the shape and size of the tensors
@@ -196,8 +182,6 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                  this->mY,
                  this->mZ);
 
-    this->mCopyOutputData = copyOutputData;
-
     this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
 }
 
@@ -206,9 +190,8 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                            std::shared_ptr<vk::Device> device,
                            std::shared_ptr<vk::CommandBuffer> commandBuffer,
                            std::vector<std::shared_ptr<Tensor>>& tensors,
-                           bool copyOutputData,
                            std::string shaderFilePath)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, copyOutputData)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
 {
     SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
 
@@ -220,9 +203,8 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                            std::shared_ptr<vk::Device> device,
                            std::shared_ptr<vk::CommandBuffer> commandBuffer,
                            std::vector<std::shared_ptr<Tensor>>& tensors,
-                           bool copyOutputData,
                            const std::vector<char>& shaderDataRaw)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, copyOutputData)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
 {
     SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
 
@@ -233,13 +215,6 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
 OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
 {
     SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
-
-    if (this->mCopyOutputData) {
-        SPDLOG_DEBUG("Kompute OpAlgoBase destroying staging tensors");
-        for (std::shared_ptr<Tensor> stagingTensor : this->mOutputStagingTensors) {
-            stagingTensor->freeMemoryDestroyGPUResources();
-        }
-    }
 }
 
 template<uint32_t tX, uint32_t tY, uint32_t tZ>
@@ -259,18 +234,6 @@ OpAlgoBase<tX, tY, tZ>::init()
         }
     }
 
-    if (this->mCopyOutputData) {
-        SPDLOG_DEBUG("Kompute OpAlgoBase creating staging output tensors");
-
-        for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
-              tensor->data(), Tensor::TensorTypes::eStaging);
-            stagingTensor->init(
-                this->mPhysicalDevice, this->mDevice);
-            this->mOutputStagingTensors.push_back(stagingTensor);
-        }
-    }
-
     SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
 
     std::vector<char>& shaderFileData = this->fetchSpirvBinaryData();
@@ -297,27 +260,6 @@ OpAlgoBase<tX, tY, tZ>::record()
     }
 
     this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-
-    if (this->mCopyOutputData) {
-        // Barrier to ensure the shader code is executed before buffer read
-        for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
-            tensor->recordBufferMemoryBarrier(
-              this->mCommandBuffer,
-              vk::AccessFlagBits::eShaderWrite,
-              vk::AccessFlagBits::eTransferRead,
-              vk::PipelineStageFlagBits::eComputeShader,
-              vk::PipelineStageFlagBits::eTransfer);
-        }
-
-        // Record copy from and create barrier for STAGING tensors
-        // TODO: This only accounts for device tensors need to account for staging and storage
-        for (size_t i = 0; i < this->mTensors.size(); i++) {
-            this->mOutputStagingTensors[i]->recordCopyFrom(
-                this->mCommandBuffer,
-                this->mTensors[i], 
-                true);
-        }
-    }
 }
 
 template<uint32_t tX, uint32_t tY, uint32_t tZ>
@@ -332,14 +274,6 @@ void
 OpAlgoBase<tX, tY, tZ>::postEval()
 {
     SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
-
-    if (this->mCopyOutputData) {
-        for (size_t i = 0; i < this->mTensors.size(); i++) {
-            this->mOutputStagingTensors[i]->mapDataFromHostMemory();
-
-            this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data());
-        }
-    }
 }
 
 template<uint32_t tX, uint32_t tY, uint32_t tZ>
diff --git a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
index 1f9605878..e513bb820 100644
--- a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
+++ b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
@@ -107,7 +107,7 @@ OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice>
   // The inheritance is initialised with the copyOutputData to false given that
   // this depencendant class handles the transfer of data via staging buffers in 
   // a granular way.
-  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, false)
+  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
 {
     SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
 }
diff --git a/src/include/kompute/operations/OpMult.hpp b/src/include/kompute/operations/OpMult.hpp
index 5d51286a7..ba3cb21a0 100644
--- a/src/include/kompute/operations/OpMult.hpp
+++ b/src/include/kompute/operations/OpMult.hpp
@@ -47,7 +47,7 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>> tensors)
-      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true, "")
+      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
     {
         SPDLOG_DEBUG("Kompute OpMult constructor with params");
 
diff --git a/test/TestLogisticRegression.cpp b/test/TestLogisticRegression.cpp
index 603a49c7d..ae746c68f 100644
--- a/test/TestLogisticRegression.cpp
+++ b/test/TestLogisticRegression.cpp
@@ -48,7 +48,6 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression) {
 
             sq->record<kp::OpAlgoBase<>>(
                     params, 
-                    false, // Whether to copy output from device
                     "test/shaders/glsl/test_logistic_regression.comp");
 
             sq->record<kp::OpTensorSyncLocal>({wOutI, wOutJ, bOut});
@@ -125,9 +124,10 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy) {
 
             sq->record<kp::OpAlgoBase<>>(
                     params, 
-                    true, // Whether to copy output from device
                     "test/shaders/glsl/test_logistic_regression.comp");
 
+            sq->record<kp::OpTensorSyncLocal>({wOutI, wOutJ, bOut});
+
             sq->end();
 
             // Iterate across all expected iterations
diff --git a/test/TestManager.cpp b/test/TestManager.cpp
index 666ff3978..6d4690ef7 100755
--- a/test/TestManager.cpp
+++ b/test/TestManager.cpp
@@ -10,16 +10,17 @@ TEST(TestManager, EndToEndOpMultFlow)
     std::shared_ptr<kp::Tensor> tensorLHS{ new kp::Tensor({ 0, 1, 2 }) };
     mgr.evalOp<kp::OpTensorCreate>({ tensorLHS });
 
-    std::shared_ptr<kp::Tensor> tensorRHS{ new kp::Tensor(
-      { 2, 4, 6 }) };
+    std::shared_ptr<kp::Tensor> tensorRHS{ new kp::Tensor( { 2, 4, 6 }) };
     mgr.evalOp<kp::OpTensorCreate>({ tensorRHS });
 
-    std::shared_ptr<kp::Tensor> tensorOutput{ new kp::Tensor(
-      { 0, 0, 0 }) };
+    std::shared_ptr<kp::Tensor> tensorOutput{ new kp::Tensor( { 0, 0, 0 }) };
+
     mgr.evalOp<kp::OpTensorCreate>({ tensorOutput });
 
     mgr.evalOp<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
 
+    mgr.evalOp<kp::OpTensorSyncLocal>({ tensorOutput });
+
     EXPECT_EQ(tensorOutput->data(), std::vector<float>({0, 4, 12}));
 }
 
@@ -46,6 +47,8 @@ TEST(TestManager, OpMultSequenceFlow) {
 
         sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
 
+        sq->record<kp::OpTensorSyncLocal>({ tensorOutput });
+
         sq->end();
         sq->eval();
     }
@@ -100,6 +103,8 @@ TEST(TestManager, TestMultipleTensorsAtOnce) {
 
         sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
 
+        sq->record<kp::OpTensorSyncLocal>({ tensorOutput });
+
         sq->end();
         sq->eval();
     }
diff --git a/test/TestMultipleAlgoExecutions.cpp b/test/TestMultipleAlgoExecutions.cpp
index df381b97a..e6ab0c6fa 100644
--- a/test/TestMultipleAlgoExecutions.cpp
+++ b/test/TestMultipleAlgoExecutions.cpp
@@ -27,17 +27,16 @@ TEST(TestMultipleAlgoExecutions, SingleSequenceRecord) {
 
         sq->record<kp::OpAlgoBase<3, 1, 1>>(
                 { tensorA }, 
-                false, // Whether to copy output from device
                 std::vector<char>(shader.begin(), shader.end()));
         sq->record<kp::OpAlgoBase<3, 1, 1>>(
                 { tensorA }, 
-                false, // Whether to copy output from device
                 std::vector<char>(shader.begin(), shader.end()));
         sq->record<kp::OpAlgoBase<3, 1, 1>>(
                 { tensorA }, 
-                true, // Whether to copy output from device
                 std::vector<char>(shader.begin(), shader.end()));
 
+        sq->record<kp::OpTensorSyncLocal>({ tensorA });
+
         sq->end();
         sq->eval();
     }
@@ -70,7 +69,6 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords) {
 
         sq->record<kp::OpAlgoBase<3, 1, 1>>(
                 { tensorA }, 
-                false, // Whether to copy output from device
                 std::vector<char>(shader.begin(), shader.end()));
 
         sq->end();
@@ -80,7 +78,6 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords) {
 
         sq->record<kp::OpAlgoBase<3, 1, 1>>(
                 { tensorA }, 
-                false, // Whether to copy output from device
                 std::vector<char>(shader.begin(), shader.end()));
 
         sq->end();
@@ -90,11 +87,18 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords) {
 
         sq->record<kp::OpAlgoBase<3, 1, 1>>(
                 { tensorA }, 
-                true, // Whether to copy output from device
                 std::vector<char>(shader.begin(), shader.end()));
 
         sq->end();
         sq->eval();
+
+        sq->begin();
+
+        sq->record<kp::OpTensorSyncLocal>(
+                { tensorA });
+
+        sq->end();
+        sq->eval();
     }
     sqWeakPtr.reset();
 
@@ -126,7 +130,6 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences) {
 
         sq->record<kp::OpAlgoBase<3, 1, 1>>(
                 { tensorA }, 
-                true, // Whether to copy output from device
                 std::vector<char>(shader.begin(), shader.end()));
 
         sq->end();
@@ -134,12 +137,11 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences) {
     }
 
     std::weak_ptr<kp::Sequence> sqWeakPtr2 = mgr.getOrCreateManagedSequence("newSequence2");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
         sq->begin();
 
         sq->record<kp::OpAlgoBase<3, 1, 1>>(
                 { tensorA }, 
-                true, // Whether to copy output from device
                 std::vector<char>(shader.begin(), shader.end()));
 
         sq->end();
@@ -148,18 +150,28 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences) {
 
 
     std::weak_ptr<kp::Sequence> sqWeakPtr3 = mgr.getOrCreateManagedSequence("newSequence3");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr3.lock()) {
         sq->begin();
 
         sq->record<kp::OpAlgoBase<3, 1, 1>>(
                 { tensorA }, 
-                true, // Whether to copy output from device
                 std::vector<char>(shader.begin(), shader.end()));
 
         sq->end();
         sq->eval();
     }
 
+    std::weak_ptr<kp::Sequence> sqWeakPtr4 = mgr.getOrCreateManagedSequence("newSequence5");
+    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr4.lock()) {
+        sq->begin();
+
+        sq->record<kp::OpTensorSyncLocal>(
+                { tensorA });
+
+        sq->end();
+        sq->eval();
+    }
+
     EXPECT_EQ(tensorA->data(), std::vector<float>({3, 3, 3}));
 }
 
@@ -190,12 +202,11 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval) {
     }
 
     std::weak_ptr<kp::Sequence> sqWeakPtr2 = mgr.getOrCreateManagedSequence("newSequence2");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
         sq->begin();
 
         sq->record<kp::OpAlgoBase<3, 1, 1>>(
                 { tensorA }, 
-                true, // Whether to copy output from device
                 std::vector<char>(shader.begin(), shader.end()));
 
         sq->end();
@@ -205,6 +216,20 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval) {
         sq->eval();
     }
 
+    std::weak_ptr<kp::Sequence> sqWeakPtr3 = mgr.getOrCreateManagedSequence("newSequence3");
+    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr3.lock()) { // lock the sequence created on the line above
+        sq->begin();
+
+        sq->record<kp::OpTensorSyncLocal>(
+                { tensorA });
+
+        sq->end();
+
+        sq->eval();
+        sq->eval();
+        sq->eval();
+    }
+
     EXPECT_EQ(tensorA->data(), std::vector<float>({3, 3, 3}));
 }
 
diff --git a/test/TestOpAlgoLoopsPassingData.cpp b/test/TestOpAlgoLoopsPassingData.cpp
index 218eddb9d..9370686f3 100644
--- a/test/TestOpAlgoLoopsPassingData.cpp
+++ b/test/TestOpAlgoLoopsPassingData.cpp
@@ -50,7 +50,6 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies) {
 
         sq->record<kp::OpAlgoBase<>>(
                 { tensorA, tensorB }, 
-                true, // Whether to copy output from device
                 std::vector<char>(shader.begin(), shader.end()));
 
         sq->record<kp::OpTensorCopy>({tensorB, tensorA});
diff --git a/test/TestOpShadersFromStringAndFile.cpp b/test/TestOpShadersFromStringAndFile.cpp
index 92d2c50e0..f8d927b75 100644
--- a/test/TestOpShadersFromStringAndFile.cpp
+++ b/test/TestOpShadersFromStringAndFile.cpp
@@ -29,9 +29,10 @@ TEST(TestOpAlgoBase, ShaderRawDataFromConstructor) {
 
     mgr.evalOpDefault<kp::OpAlgoBase<>>(
             { tensorA, tensorB }, 
-            true, // Whether to copy output from device
             std::vector<char>(shader.begin(), shader.end()));
 
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorA, tensorB});
+
     EXPECT_EQ(tensorA->data(), std::vector<float>({0, 1, 2}));
     EXPECT_EQ(tensorB->data(), std::vector<float>({3, 4, 5}));
 }
@@ -45,12 +46,13 @@ TEST(TestOpAlgoBase, ShaderCompiledDataFromConstructor) {
 
     mgr.evalOpDefault<kp::OpAlgoBase<>>(
             { tensorA, tensorB }, 
-            true, // Whether to copy output from device
             std::vector<char>(
                 kp::shader_data::test_shaders_glsl_test_op_custom_shader_comp_spv,
                 kp::shader_data::test_shaders_glsl_test_op_custom_shader_comp_spv +
                 kp::shader_data::test_shaders_glsl_test_op_custom_shader_comp_spv_len));
 
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorA, tensorB});
+
     EXPECT_EQ(tensorA->data(), std::vector<float>({0, 1, 2}));
     EXPECT_EQ(tensorB->data(), std::vector<float>({3, 4, 5}));
 }
@@ -64,9 +66,10 @@ TEST(TestOpAlgoBase, ShaderRawDataFromFile) {
 
     mgr.evalOpDefault<kp::OpAlgoBase<>>(
             { tensorA, tensorB }, 
-            true, // Whether to copy output from device
             "test/shaders/glsl/test_op_custom_shader.comp");
 
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorA, tensorB});
+
     EXPECT_EQ(tensorA->data(), std::vector<float>({0, 1, 2}));
     EXPECT_EQ(tensorB->data(), std::vector<float>({3, 4, 5}));
 }
@@ -80,9 +83,10 @@ TEST(TestOpAlgoBase, ShaderCompiledDataFromFile) {
 
     mgr.evalOpDefault<kp::OpAlgoBase<>>(
             { tensorA, tensorB }, 
-            true, // Whether to copy output from device
             "test/shaders/glsl/test_op_custom_shader.comp.spv");
 
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorA, tensorB});
+
     EXPECT_EQ(tensorA->data(), std::vector<float>({0, 1, 2}));
     EXPECT_EQ(tensorB->data(), std::vector<float>({3, 4, 5}));
 }