Updated OpAlgoBase to no longer copy output data now that OpTensorSync operations are introduced

2020-09-12 09:14:35 +01:00 · 2020-09-12 09:14:35 +01:00 · 9f8508075a
commit 9f8508075a
parent 4171786b6f
10 changed files with 92 additions and 180 deletions
--- a/README.md
+++ b/README.md
@ -61,8 +61,11 @@ int main() {
    kp::Manager mgr; // Selects device 0 unless explicitly requested

    // Creates tensor an initializes GPU memory (below we show more granularity)
-    auto tensorA = mgr.buildTensor({ 3, 4, 5 });
-    auto tensorB = mgr.buildTensor({ 0, 0, 0 });
+    auto tensorA = std::make_shared<kp::Tensor>(kp::Tensor({ 3., 4., 5. }));
+    auto tensorB = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 0., 0. }));
+
+    // Create the tensors' data explicitly in GPU memory with an operation
+    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });

    // Define your shader as a string (using string literals for simplicity)
    // (You can also pass the raw compiled bytes, or even path to file)
@ -82,11 +85,13 @@ int main() {
    )");

    // Run Kompute operation on the parameters provided with dispatch layout
-    mgr.evalOpDefault<kp::OpMult<3, 1, 1>>(
+    mgr.evalOpDefault<kp::OpAlgoBase<3, 1, 1>>(
        { tensorA, tensorB }, 
-        true, // Whether to retrieve the output from GPU memory
        std::vector<char>(shader.begin(), shader.end()));

+    // Sync the GPU memory back to the local tensor
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
+
    // Prints the output which is A: { 0, 1, 2 } B: { 3, 4, 5 }
    std::cout << fmt::format("A: {}, B: {}", 
        tensorA.data(), tensorB.data()) << std::endl;
@ -107,7 +112,7 @@ class OpMyCustom : public OpAlgoBase<tX, tY, tZ>
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
           std::vector<std::shared_ptr<Tensor>> tensors)
-      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true, "")
+      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
    {
        // Perform your custom steps such as reading from a shader file
        this->mShaderFilePath = "shaders/glsl/opmult.comp";
@ -144,7 +149,7 @@ int main() {
    kp::Manager mgr;

    std::shared_ptr<kp::Tensor> tensorLHS{ new kp::Tensor({ 1., 1., 1. }) };
-    std::shared_ptr<kp::Tensor> tensorRHS{ new kp::Tensor( { 2., 2., 2. }) };
+    std::shared_ptr<kp::Tensor> tensorRHS{ new kp::Tensor({ 2., 2., 2. }) };
    std::shared_ptr<kp::Tensor> tensorOutput{ new kp::Tensor({ 0., 0., 0. }) };

    // Create all the tensors in memory
@ -159,17 +164,23 @@ int main() {
        sq.begin();

        // Record batch commands to send to GPU
-        sq.record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
-        sq.record<kp::OpTensorCopy>({tensorOutput, tensorLHS, tensorRHS});
+        sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
+        sq->record<kp::OpTensorCopy>({tensorOutput, tensorLHS, tensorRHS});

        // Stop recording
-        sq.end();
+        sq->end();

        // Submit multiple batch operations to GPU
        size_t ITERATIONS = 5;
        for (size_t i = 0; i < ITERATIONS; i++) {
-            sq.eval();
+            sq->eval();
        }
+
+        // Sync GPU memory back to local tensor
+        sq->begin();
+        sq->record<kp::OpTensorSyncLocal>({tensorOutput});
+        sq->end();
+        sq->eval();
    }

    // Print the output which iterates through OpMult 5 times
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@ -971,11 +971,6 @@ namespace kp {
 *
 * All of these tensors are expected to be initlaised and this is checked with throw std exception in the init function.
 *
- * It is possible to also choose if the user requires all of the tensors to be
- * copied from device memory to their host data. This can be disabled by either
- * passing the copyOutputData constructor parameter and/or by overriding the 
- * functions to carry out copy commands accordingly. 
- *
 * See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
 * 
 * The template parameters specify the processing GPU layout number of
@ -1000,14 +995,12 @@ class OpAlgoBase : public OpBase
     * @param device Vulkan logical device for passing to Algorithm
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
-     * @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
     * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
     */
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors,
-           bool copyOutputData);
+           std::vector<std::shared_ptr<Tensor>>& tensors);

    /**
     * Constructor that enables a file to be passed to the operation with
@ -1018,14 +1011,12 @@ class OpAlgoBase : public OpBase
     * @param device Vulkan logical device for passing to Algorithm
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
-     * @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
     * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
     */
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
           std::vector<std::shared_ptr<Tensor>>& tensors,
-           bool copyOutputData,
           std::string shaderFilePath);

    /**
@ -1036,14 +1027,12 @@ class OpAlgoBase : public OpBase
     * @param device Vulkan logical device for passing to Algorithm
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
-     * @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
     * @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
     */
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
           std::vector<std::shared_ptr<Tensor>>& tensors,
-           bool copyOutputData,
           const std::vector<char>& shaderDataRaw);

    /**
@ -1090,8 +1079,6 @@ class OpAlgoBase : public OpBase
    bool mFreeAlgorithm = false;

    // -------------- ALWAYS OWNED RESOURCES
-    std::vector<std::shared_ptr<Tensor>> mOutputStagingTensors; ///< Array of output staging tensors which will be expected to be the same size as the number of inputs.
-    bool mCopyOutputData; ///< Configuration parameter which states whether data will be copied back to all provided tensors for convenience. This can be deactivated by setting this flag and or overriding the functions provided.

    uint32_t mX;
    uint32_t mY;
@ -1121,11 +1108,10 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
 OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
                           std::shared_ptr<vk::Device> device,
                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           bool copyOutputData)
+                           std::vector<std::shared_ptr<Tensor>>& tensors)
  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
 {
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {} copyOutputData: {}, shaderFilePath: {}", tensors.size(), copyOutputData);
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {} , shaderFilePath: {}", tensors.size());

    // The dispatch size is set up based on either explicitly provided template
    // parameters or by default it would take the shape and size of the tensors
@ -1145,8 +1131,6 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                 this->mY,
                 this->mZ);

-    this->mCopyOutputData = copyOutputData;
-
    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
 }

@ -1155,9 +1139,8 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                           std::shared_ptr<vk::Device> device,
                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           bool copyOutputData,
                           std::string shaderFilePath)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, copyOutputData)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
 {
    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);

@ -1169,9 +1152,8 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                           std::shared_ptr<vk::Device> device,
                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           bool copyOutputData,
                           const std::vector<char>& shaderDataRaw)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, copyOutputData)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
 {
    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());

@ -1182,13 +1164,6 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
 OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
 {
    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
-
-    if (this->mCopyOutputData) {
-        SPDLOG_DEBUG("Kompute OpAlgoBase destroying staging tensors");
-        for (std::shared_ptr<Tensor> stagingTensor : this->mOutputStagingTensors) {
-            stagingTensor->freeMemoryDestroyGPUResources();
-        }
-    }
 }

 template<uint32_t tX, uint32_t tY, uint32_t tZ>
@ -1208,18 +1183,6 @@ OpAlgoBase<tX, tY, tZ>::init()
        }
    }

-    if (this->mCopyOutputData) {
-        SPDLOG_DEBUG("Kompute OpAlgoBase creating staging output tensors");
-
-        for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
-              tensor->data(), Tensor::TensorTypes::eStaging);
-            stagingTensor->init(
-                this->mPhysicalDevice, this->mDevice);
-            this->mOutputStagingTensors.push_back(stagingTensor);
-        }
-    }
-
    SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");

    std::vector<char>& shaderFileData = this->fetchSpirvBinaryData();
@ -1246,27 +1209,6 @@ OpAlgoBase<tX, tY, tZ>::record()
    }

    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-
-    if (this->mCopyOutputData) {
-        // Barrier to ensure the shader code is executed before buffer read
-        for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
-            tensor->recordBufferMemoryBarrier(
-              this->mCommandBuffer,
-              vk::AccessFlagBits::eShaderWrite,
-              vk::AccessFlagBits::eTransferRead,
-              vk::PipelineStageFlagBits::eComputeShader,
-              vk::PipelineStageFlagBits::eTransfer);
-        }
-
-        // Record copy from and create barrier for STAGING tensors
-        // TODO: This only accounts for device tensors need to account for staging and storage
-        for (size_t i = 0; i < this->mTensors.size(); i++) {
-            this->mOutputStagingTensors[i]->recordCopyFrom(
-                this->mCommandBuffer,
-                this->mTensors[i], 
-                true);
-        }
-    }
 }

 template<uint32_t tX, uint32_t tY, uint32_t tZ>
@ -1281,14 +1223,6 @@ void
 OpAlgoBase<tX, tY, tZ>::postEval()
 {
    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
-
-    if (this->mCopyOutputData) {
-        for (size_t i = 0; i < this->mTensors.size(); i++) {
-            this->mOutputStagingTensors[i]->mapDataFromHostMemory();
-
-            this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data());
-        }
-    }
 }

 template<uint32_t tX, uint32_t tY, uint32_t tZ>
@ -1429,7 +1363,7 @@ OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice>
  // The inheritance is initialised with the copyOutputData to false given that
  // this depencendant class handles the transfer of data via staging buffers in 
  // a granular way.
-  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, false)
+  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
 {
    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
 }
@ -1575,7 +1509,7 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
           std::vector<std::shared_ptr<Tensor>> tensors)
-      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true, "")
+      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
    {
        SPDLOG_DEBUG("Kompute OpMult constructor with params");

--- a/src/include/kompute/operations/OpAlgoBase.hpp
+++ b/src/include/kompute/operations/OpAlgoBase.hpp
@ -21,11 +21,6 @@ namespace kp {
 *
 * All of these tensors are expected to be initlaised and this is checked with throw std exception in the init function.
 *
- * It is possible to also choose if the user requires all of the tensors to be
- * copied from device memory to their host data. This can be disabled by either
- * passing the copyOutputData constructor parameter and/or by overriding the 
- * functions to carry out copy commands accordingly. 
- *
 * See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
 * 
 * The template parameters specify the processing GPU layout number of
@ -50,14 +45,12 @@ class OpAlgoBase : public OpBase
     * @param device Vulkan logical device for passing to Algorithm
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
-     * @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
     * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
     */
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors,
-           bool copyOutputData);
+           std::vector<std::shared_ptr<Tensor>>& tensors);

    /**
     * Constructor that enables a file to be passed to the operation with
@ -68,14 +61,12 @@ class OpAlgoBase : public OpBase
     * @param device Vulkan logical device for passing to Algorithm
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
-     * @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
     * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
     */
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
           std::vector<std::shared_ptr<Tensor>>& tensors,
-           bool copyOutputData,
           std::string shaderFilePath);

    /**
@ -86,14 +77,12 @@ class OpAlgoBase : public OpBase
     * @param device Vulkan logical device for passing to Algorithm
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
-     * @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
     * @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
     */
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
           std::vector<std::shared_ptr<Tensor>>& tensors,
-           bool copyOutputData,
           const std::vector<char>& shaderDataRaw);

    /**
@ -141,8 +130,6 @@ class OpAlgoBase : public OpBase
    bool mFreeAlgorithm = false;

    // -------------- ALWAYS OWNED RESOURCES
-    std::vector<std::shared_ptr<Tensor>> mOutputStagingTensors; ///< Array of output staging tensors which will be expected to be the same size as the number of inputs.
-    bool mCopyOutputData; ///< Configuration parameter which states whether data will be copied back to all provided tensors for convenience. This can be deactivated by setting this flag and or overriding the functions provided.

    uint32_t mX;
    uint32_t mY;
@ -172,11 +159,10 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
 OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
                           std::shared_ptr<vk::Device> device,
                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           bool copyOutputData)
+                           std::vector<std::shared_ptr<Tensor>>& tensors)
  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
 {
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {} copyOutputData: {}, shaderFilePath: {}", tensors.size(), copyOutputData);
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {} , shaderFilePath: {}", tensors.size());

    // The dispatch size is set up based on either explicitly provided template
    // parameters or by default it would take the shape and size of the tensors
@ -196,8 +182,6 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                 this->mY,
                 this->mZ);

-    this->mCopyOutputData = copyOutputData;
-
    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
 }

@ -206,9 +190,8 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                           std::shared_ptr<vk::Device> device,
                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           bool copyOutputData,
                           std::string shaderFilePath)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, copyOutputData)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
 {
    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);

@ -220,9 +203,8 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                           std::shared_ptr<vk::Device> device,
                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           bool copyOutputData,
                           const std::vector<char>& shaderDataRaw)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, copyOutputData)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
 {
    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());

@ -233,13 +215,6 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
 OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
 {
    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
-
-    if (this->mCopyOutputData) {
-        SPDLOG_DEBUG("Kompute OpAlgoBase destroying staging tensors");
-        for (std::shared_ptr<Tensor> stagingTensor : this->mOutputStagingTensors) {
-            stagingTensor->freeMemoryDestroyGPUResources();
-        }
-    }
 }

 template<uint32_t tX, uint32_t tY, uint32_t tZ>
@ -259,18 +234,6 @@ OpAlgoBase<tX, tY, tZ>::init()
        }
    }

-    if (this->mCopyOutputData) {
-        SPDLOG_DEBUG("Kompute OpAlgoBase creating staging output tensors");
-
-        for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-            std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
-              tensor->data(), Tensor::TensorTypes::eStaging);
-            stagingTensor->init(
-                this->mPhysicalDevice, this->mDevice);
-            this->mOutputStagingTensors.push_back(stagingTensor);
-        }
-    }
-
    SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");

    std::vector<char>& shaderFileData = this->fetchSpirvBinaryData();
@ -297,27 +260,6 @@ OpAlgoBase<tX, tY, tZ>::record()
    }

    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-
-    if (this->mCopyOutputData) {
-        // Barrier to ensure the shader code is executed before buffer read
-        for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
-            tensor->recordBufferMemoryBarrier(
-              this->mCommandBuffer,
-              vk::AccessFlagBits::eShaderWrite,
-              vk::AccessFlagBits::eTransferRead,
-              vk::PipelineStageFlagBits::eComputeShader,
-              vk::PipelineStageFlagBits::eTransfer);
-        }
-
-        // Record copy from and create barrier for STAGING tensors
-        // TODO: This only accounts for device tensors need to account for staging and storage
-        for (size_t i = 0; i < this->mTensors.size(); i++) {
-            this->mOutputStagingTensors[i]->recordCopyFrom(
-                this->mCommandBuffer,
-                this->mTensors[i], 
-                true);
-        }
-    }
 }

 template<uint32_t tX, uint32_t tY, uint32_t tZ>
@ -332,14 +274,6 @@ void
 OpAlgoBase<tX, tY, tZ>::postEval()
 {
    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
-
-    if (this->mCopyOutputData) {
-        for (size_t i = 0; i < this->mTensors.size(); i++) {
-            this->mOutputStagingTensors[i]->mapDataFromHostMemory();
-
-            this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data());
-        }
-    }
 }

 template<uint32_t tX, uint32_t tY, uint32_t tZ>
--- a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
+++ b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
@ -107,7 +107,7 @@ OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice>
  // The inheritance is initialised with the copyOutputData to false given that
  // this depencendant class handles the transfer of data via staging buffers in 
  // a granular way.
-  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, false)
+  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
 {
    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
 }
--- a/src/include/kompute/operations/OpMult.hpp
+++ b/src/include/kompute/operations/OpMult.hpp
@ -47,7 +47,7 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
           std::vector<std::shared_ptr<Tensor>> tensors)
-      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true, "")
+      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
    {
        SPDLOG_DEBUG("Kompute OpMult constructor with params");

--- a/test/TestLogisticRegression.cpp
+++ b/test/TestLogisticRegression.cpp
@ -48,7 +48,6 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression) {

            sq->record<kp::OpAlgoBase<>>(
                    params, 
-                    false, // Whether to copy output from device
                    "test/shaders/glsl/test_logistic_regression.comp");

            sq->record<kp::OpTensorSyncLocal>({wOutI, wOutJ, bOut});
@ -125,9 +124,10 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy) {

            sq->record<kp::OpAlgoBase<>>(
                    params, 
-                    true, // Whether to copy output from device
                    "test/shaders/glsl/test_logistic_regression.comp");

+            sq->record<kp::OpTensorSyncLocal>({wOutI, wOutJ, bOut});
+
            sq->end();

            // Iterate across all expected iterations
--- a/test/TestManager.cpp
+++ b/test/TestManager.cpp
@ -10,16 +10,17 @@ TEST(TestManager, EndToEndOpMultFlow)
    std::shared_ptr<kp::Tensor> tensorLHS{ new kp::Tensor({ 0, 1, 2 }) };
    mgr.evalOp<kp::OpTensorCreate>({ tensorLHS });

-    std::shared_ptr<kp::Tensor> tensorRHS{ new kp::Tensor(
-      { 2, 4, 6 }) };
+    std::shared_ptr<kp::Tensor> tensorRHS{ new kp::Tensor( { 2, 4, 6 }) };
    mgr.evalOp<kp::OpTensorCreate>({ tensorRHS });

-    std::shared_ptr<kp::Tensor> tensorOutput{ new kp::Tensor(
-      { 0, 0, 0 }) };
+    std::shared_ptr<kp::Tensor> tensorOutput{ new kp::Tensor( { 0, 0, 0 }) };
+
    mgr.evalOp<kp::OpTensorCreate>({ tensorOutput });

    mgr.evalOp<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });

+    mgr.evalOp<kp::OpTensorSyncLocal>({ tensorOutput });
+
    EXPECT_EQ(tensorOutput->data(), std::vector<float>({0, 4, 12}));
 }

@ -46,6 +47,8 @@ TEST(TestManager, OpMultSequenceFlow) {

        sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });

+        sq->record<kp::OpTensorSyncLocal>({ tensorOutput });
+
        sq->end();
        sq->eval();
    }
@ -100,6 +103,8 @@ TEST(TestManager, TestMultipleTensorsAtOnce) {

        sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });

+        sq->record<kp::OpTensorSyncLocal>({ tensorOutput });
+
        sq->end();
        sq->eval();
    }
--- a/test/TestMultipleAlgoExecutions.cpp
+++ b/test/TestMultipleAlgoExecutions.cpp
@ -27,17 +27,16 @@ TEST(TestMultipleAlgoExecutions, SingleSequenceRecord) {

        sq->record<kp::OpAlgoBase<3, 1, 1>>(
                { tensorA }, 
-                false, // Whether to copy output from device
                std::vector<char>(shader.begin(), shader.end()));
        sq->record<kp::OpAlgoBase<3, 1, 1>>(
                { tensorA }, 
-                false, // Whether to copy output from device
                std::vector<char>(shader.begin(), shader.end()));
        sq->record<kp::OpAlgoBase<3, 1, 1>>(
                { tensorA }, 
-                true, // Whether to copy output from device
                std::vector<char>(shader.begin(), shader.end()));

+        sq->record<kp::OpTensorSyncLocal>({ tensorA });
+
        sq->end();
        sq->eval();
    }
@ -70,7 +69,6 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords) {

        sq->record<kp::OpAlgoBase<3, 1, 1>>(
                { tensorA }, 
-                false, // Whether to copy output from device
                std::vector<char>(shader.begin(), shader.end()));

        sq->end();
@ -80,7 +78,6 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords) {

        sq->record<kp::OpAlgoBase<3, 1, 1>>(
                { tensorA }, 
-                false, // Whether to copy output from device
                std::vector<char>(shader.begin(), shader.end()));

        sq->end();
@ -90,11 +87,18 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords) {

        sq->record<kp::OpAlgoBase<3, 1, 1>>(
                { tensorA }, 
-                true, // Whether to copy output from device
                std::vector<char>(shader.begin(), shader.end()));

        sq->end();
        sq->eval();
+
+        sq->begin();
+
+        sq->record<kp::OpTensorSyncLocal>(
+                { tensorA });
+
+        sq->end();
+        sq->eval();
    }
    sqWeakPtr.reset();

@ -126,7 +130,6 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences) {

        sq->record<kp::OpAlgoBase<3, 1, 1>>(
                { tensorA }, 
-                true, // Whether to copy output from device
                std::vector<char>(shader.begin(), shader.end()));

        sq->end();
@ -134,12 +137,11 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences) {
    }

    std::weak_ptr<kp::Sequence> sqWeakPtr2 = mgr.getOrCreateManagedSequence("newSequence2");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
        sq->begin();

        sq->record<kp::OpAlgoBase<3, 1, 1>>(
                { tensorA }, 
-                true, // Whether to copy output from device
                std::vector<char>(shader.begin(), shader.end()));

        sq->end();
@ -148,18 +150,28 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences) {


    std::weak_ptr<kp::Sequence> sqWeakPtr3 = mgr.getOrCreateManagedSequence("newSequence3");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr3.lock()) {
        sq->begin();

        sq->record<kp::OpAlgoBase<3, 1, 1>>(
                { tensorA }, 
-                true, // Whether to copy output from device
                std::vector<char>(shader.begin(), shader.end()));

        sq->end();
        sq->eval();
    }

+    std::weak_ptr<kp::Sequence> sqWeakPtr4 = mgr.getOrCreateManagedSequence("newSequence5");
+    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr4.lock()) {
+        sq->begin();
+
+        sq->record<kp::OpTensorSyncLocal>(
+                { tensorA });
+
+        sq->end();
+        sq->eval();
+    }
+
    EXPECT_EQ(tensorA->data(), std::vector<float>({3, 3, 3}));
 }

@ -190,12 +202,11 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval) {
    }

    std::weak_ptr<kp::Sequence> sqWeakPtr2 = mgr.getOrCreateManagedSequence("newSequence2");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
        sq->begin();

        sq->record<kp::OpAlgoBase<3, 1, 1>>(
                { tensorA }, 
-                true, // Whether to copy output from device
                std::vector<char>(shader.begin(), shader.end()));

        sq->end();
@ -205,6 +216,20 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval) {
        sq->eval();
    }

+    std::weak_ptr<kp::Sequence> sqWeakPtr3 = mgr.getOrCreateManagedSequence("newSequence3");
+    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
+        sq->begin();
+
+        sq->record<kp::OpTensorSyncLocal>(
+                { tensorA });
+
+        sq->end();
+
+        sq->eval();
+        sq->eval();
+        sq->eval();
+    }
+
    EXPECT_EQ(tensorA->data(), std::vector<float>({3, 3, 3}));
 }

--- a/test/TestOpAlgoLoopsPassingData.cpp
+++ b/test/TestOpAlgoLoopsPassingData.cpp
@ -50,7 +50,6 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies) {

        sq->record<kp::OpAlgoBase<>>(
                { tensorA, tensorB }, 
-                true, // Whether to copy output from device
                std::vector<char>(shader.begin(), shader.end()));

        sq->record<kp::OpTensorCopy>({tensorB, tensorA});
--- a/test/TestOpShadersFromStringAndFile.cpp
+++ b/test/TestOpShadersFromStringAndFile.cpp
@ -29,9 +29,10 @@ TEST(TestOpAlgoBase, ShaderRawDataFromConstructor) {

    mgr.evalOpDefault<kp::OpAlgoBase<>>(
            { tensorA, tensorB }, 
-            true, // Whether to copy output from device
            std::vector<char>(shader.begin(), shader.end()));

+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorA, tensorB});
+
    EXPECT_EQ(tensorA->data(), std::vector<float>({0, 1, 2}));
    EXPECT_EQ(tensorB->data(), std::vector<float>({3, 4, 5}));
 }
@ -45,12 +46,13 @@ TEST(TestOpAlgoBase, ShaderCompiledDataFromConstructor) {

    mgr.evalOpDefault<kp::OpAlgoBase<>>(
            { tensorA, tensorB }, 
-            true, // Whether to copy output from device
            std::vector<char>(
                kp::shader_data::test_shaders_glsl_test_op_custom_shader_comp_spv,
                kp::shader_data::test_shaders_glsl_test_op_custom_shader_comp_spv +
                kp::shader_data::test_shaders_glsl_test_op_custom_shader_comp_spv_len));

+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorA, tensorB});
+
    EXPECT_EQ(tensorA->data(), std::vector<float>({0, 1, 2}));
    EXPECT_EQ(tensorB->data(), std::vector<float>({3, 4, 5}));
 }
@ -64,9 +66,10 @@ TEST(TestOpAlgoBase, ShaderRawDataFromFile) {

    mgr.evalOpDefault<kp::OpAlgoBase<>>(
            { tensorA, tensorB }, 
-            true, // Whether to copy output from device
            "test/shaders/glsl/test_op_custom_shader.comp");

+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorA, tensorB});
+
    EXPECT_EQ(tensorA->data(), std::vector<float>({0, 1, 2}));
    EXPECT_EQ(tensorB->data(), std::vector<float>({3, 4, 5}));
 }
@ -80,9 +83,10 @@ TEST(TestOpAlgoBase, ShaderCompiledDataFromFile) {

    mgr.evalOpDefault<kp::OpAlgoBase<>>(
            { tensorA, tensorB }, 
-            true, // Whether to copy output from device
            "test/shaders/glsl/test_op_custom_shader.comp.spv");

+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorA, tensorB});
+
    EXPECT_EQ(tensorA->data(), std::vector<float>({0, 1, 2}));
    EXPECT_EQ(tensorB->data(), std::vector<float>({3, 4, 5}));
 }