diff --git a/Makefile b/Makefile
index 849ecfd01..63f76c5db 100755
--- a/Makefile
+++ b/Makefile
@@ -73,7 +73,8 @@ mk_run_tests: mk_build_tests
 ####### Visual studio build shortcut commands #######
 
 VS_BUILD_TYPE ?= "Debug"
-VS_CMAKE_EXTRA_FLAGS ?= ""
+# Run with multiprocessing / parallel build by default
+VS_CMAKE_EXTRA_FLAGS ?= "/MP"
 
 vs_cmake:
 	$(CMAKE_BIN) \
diff --git a/README.md b/README.md
index 271a3f6b4..85963bef9 100644
--- a/README.md
+++ b/README.md
@@ -15,14 +15,14 @@
 <td>
 
 <h1>Vulkan Kompute</h1>
-<h3>The General Purpose Vulkan Compute Framework. </h3>
+<h3>The General Purpose Vulkan Compute Framework.</h3>
 
 </td>
 
 </tr>
 </table>
 
-<h4>Blazing fast, lightweight, easy to set up and optimized for advanced GPU processing usecases.</h4>
+<h4>Blazing fast, lightweight, mobile-enabled, and optimized for advanced GPU processing use cases.</h4>
 
 🔋 [Documentation](https://axsaucedo.github.io/vulkan-kompute/) 💻 [Import to your project](https://axsaucedo.github.io/vulkan-kompute/) ⌨ [Tutorials](https://axsaucedo.github.io/vulkan-kompute/) 💾
 
@@ -261,8 +261,6 @@ Simplified Kompute Components
 </tr>
 </table>
 
-
-
 ## Kompute Development
 
 We appreciate PRs and Issues. If you want to contribute try checking the "Good first issue" tag, but even using Vulkan Kompute and reporting issues is a great contribution!
diff --git a/single_include/AggregateHeaders.cpp b/single_include/AggregateHeaders.cpp
index dd756c65b..2f3607e2d 100644
--- a/single_include/AggregateHeaders.cpp
+++ b/single_include/AggregateHeaders.cpp
@@ -7,5 +7,6 @@
 #include "kompute/operations/OpAlgoLhsRhsOut.hpp"
 #include "kompute/operations/OpMult.hpp"
 #include "kompute/operations/OpCreateTensor.hpp"
+#include "kompute/operations/OpTensorCopy.hpp"
 #include "kompute/Algorithm.hpp"
 #include "kompute/Tensor.hpp"
diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index 6a5ee7698..fcaed2666 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -218,9 +218,9 @@ class Tensor
      */
     enum class TensorTypes
     {
-        eDevice = 0,
-        eStaging = 1,
-        eStorage = 2,
+        eDevice = 0, ///< Type is device memory, source and destination
+        eStaging = 1, ///< Type is host memory, source and destination
+        eStorage = 2, ///< Type is device memory (only)
     };
 
     /**
@@ -248,8 +248,7 @@ class Tensor
      * Initialiser which calls the initialisation for all the respective tensors as well as creates the respective staging tensors. The staging tensors woudl only be created for the tensors of type TensorType::eDevice as otherwise there is no need to copy from host memory.
      */
     void init(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-              std::shared_ptr<vk::Device> device,
-              std::shared_ptr<vk::CommandBuffer> commandBuffer);
+              std::shared_ptr<vk::Device> device);
 
     /**
      * Destroys and frees the GPU resources which include the buffer and memory.
@@ -312,23 +311,27 @@ class Tensor
      * thensor. This is intended to pass memory into a processing, to perform
      * a staging buffer transfer, or to gather output (between others).
      *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
      * @param copyFromTensor Tensor to copy the data from
      * @param createBarrier Whether to create a barrier that ensures the data is
      * copied before further operations. Default is true.
      */
-    void recordCopyFrom(std::shared_ptr<Tensor> copyFromTensor,
+    void recordCopyFrom(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                        std::shared_ptr<Tensor> copyFromTensor,
                         bool createBarrier);
 
     /**
      * Records the buffer memory barrier into the command buffer which
      * ensures that relevant data transfers are carried out correctly.
      *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
      * @param srcAccessMask Access flags for source access mask
      * @param dstAccessMask Access flags for destination access mask
      * @param scrStageMask Pipeline stage flags for source stage mask
      * @param dstStageMask Pipeline stage flags for destination stage mask
      */
-    void recordBufferMemoryBarrier(vk::AccessFlagBits srcAccessMask,
+    void recordBufferMemoryBarrier(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                                   vk::AccessFlagBits srcAccessMask,
                                    vk::AccessFlagBits dstAccessMask,
                                    vk::PipelineStageFlagBits srcStageMask,
                                    vk::PipelineStageFlagBits dstStageMask);
@@ -356,7 +359,6 @@ class Tensor
     // -------------- NEVER OWNED RESOURCES
     std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice;
     std::shared_ptr<vk::Device> mDevice;
-    std::shared_ptr<vk::CommandBuffer> mCommandBuffer;
 
     // -------------- OPTIONALLY OWNED RESOURCES
     std::shared_ptr<vk::Buffer> mBuffer;
@@ -1104,7 +1106,7 @@ OpAlgoBase<tX, tY, tZ>::init()
             std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
               tensor->data(), Tensor::TensorTypes::eStaging);
             stagingTensor->init(
-                this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
+                this->mPhysicalDevice, this->mDevice);
             this->mOutputStagingTensors.push_back(stagingTensor);
         }
     }
@@ -1127,6 +1129,7 @@ OpAlgoBase<tX, tY, tZ>::record()
     // Barrier to ensure the data is finished writing to buffer memory
     for (std::shared_ptr<Tensor> tensor : this->mTensors) {
         tensor->recordBufferMemoryBarrier(
+          this->mCommandBuffer,
           vk::AccessFlagBits::eHostWrite,
           vk::AccessFlagBits::eShaderRead,
           vk::PipelineStageFlagBits::eHost,
@@ -1139,6 +1142,7 @@ OpAlgoBase<tX, tY, tZ>::record()
         // Barrier to ensure the shader code is executed before buffer read
         for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
             tensor->recordBufferMemoryBarrier(
+              this->mCommandBuffer,
               vk::AccessFlagBits::eShaderWrite,
               vk::AccessFlagBits::eTransferRead,
               vk::PipelineStageFlagBits::eComputeShader,
@@ -1148,7 +1152,9 @@ OpAlgoBase<tX, tY, tZ>::record()
         // Record copy from and create barrier for STAGING tensors
         for (size_t i = 0; i < this->mTensors.size(); i++) {
             this->mOutputStagingTensors[i]->recordCopyFrom(
-                this->mTensors[i], true);
+                this->mCommandBuffer,
+                this->mTensors[i], 
+                true);
         }
     }
 }
@@ -1327,7 +1333,7 @@ OpAlgoLhsRhsOut<tX, tY, tZ>::init()
         throw std::runtime_error(
           "Kompute OpAlgoLhsRhsOut called with less than 1 tensor");
     } else if (this->mTensors.size() > 3) {
-        spdlog::warn("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
+        SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
     }
 
     this->mTensorLHS = this->mTensors[0];
@@ -1555,3 +1561,52 @@ class OpCreateTensor : public OpBase
 };
 
 } // End namespace kp
+
+namespace kp {
+
+/**
+    Operation that copies the data from the first tensor to the rest of the tensors provided, using a record command for all the vectors. This operation does not own/manage the memory of the tensors passed to it.
+*/
+class OpTensorCopy : public OpBase
+{
+  public:
+    OpTensorCopy();
+
+    /**
+     * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation.
+     *
+     * @param physicalDevice Vulkan physical device used to find device queues
+     * @param device Vulkan logical device for passing to Algorithm
+     * @param commandBuffer Vulkan Command Buffer to record commands into
+     * @param tensors Tensors that will be used in the operation; the first is the copy source and the rest are the destinations.
+     */
+    OpTensorCopy(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                   std::shared_ptr<vk::Device> device,
+                   std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                   std::vector<std::shared_ptr<Tensor>> tensors);
+
+    /**
+     * Default destructor which in this case expects the parent class to free
+     * the tensors
+     */
+    ~OpTensorCopy() override;
+
+    /**
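+     * Validates the tensors: at least two are required, each must already be initialised, and none may be of type TensorTypes::eStorage. Throws std::runtime_error otherwise.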
+     */
+    void init() override;
+
+    /**
+     * Records the copy commands from the first tensor into all the other tensors provided. No barrier is recorded after the copies.
+     */
+    void record() override;
+
+    /**
+     * Copies the local data vector of the first tensor into all the other tensors to keep the host-side data in sync with the GPU copy.
+     */
+    void postSubmit() override;
+
+  private:
+};
+
+} // End namespace kp
diff --git a/src/OpCreateTensor.cpp b/src/OpCreateTensor.cpp
index f99a81ba5..ae551e259 100644
--- a/src/OpCreateTensor.cpp
+++ b/src/OpCreateTensor.cpp
@@ -48,13 +48,13 @@ OpCreateTensor::init()
         }
         if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
             tensor->init(
-              this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
+              this->mPhysicalDevice, this->mDevice);
 
             std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
               tensor->data(), Tensor::TensorTypes::eStaging);
 
             stagingTensor->init(
-              this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
+              this->mPhysicalDevice, this->mDevice);
 
             stagingTensor->mapDataIntoHostMemory();
 
@@ -63,7 +63,7 @@ OpCreateTensor::init()
         } else {
 
             tensor->init(
-              this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
+              this->mPhysicalDevice, this->mDevice);
 
             // We push a nullptr when no staging tensor is needed to match 
             // index number in array to have one to one mapping with tensors
@@ -79,7 +79,9 @@ OpCreateTensor::record()
 
     for (size_t i = 0; i < this->mTensors.size(); i++) {
         if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mTensors[i]->recordCopyFrom(this->mStagingTensors[i], false);
+            this->mTensors[i]->recordCopyFrom(this->mCommandBuffer, this->mStagingTensors[i], false);
+        } else if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eStaging) {
+            this->mTensors[i]->mapDataIntoHostMemory();
         }
     }
 }
diff --git a/src/OpTensorCopy.cpp b/src/OpTensorCopy.cpp
new file mode 100644
index 000000000..c0e1f5046
--- /dev/null
+++ b/src/OpTensorCopy.cpp
@@ -0,0 +1,71 @@
+
+#include "kompute/operations/OpTensorCopy.hpp"
+
+namespace kp {
+
+OpTensorCopy::OpTensorCopy()
+{
+    SPDLOG_DEBUG("Kompute OpTensorCopy constructor base");
+}
+
+OpTensorCopy::OpTensorCopy(
+  std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+  std::shared_ptr<vk::Device> device,
+  std::shared_ptr<vk::CommandBuffer> commandBuffer,
+  std::vector<std::shared_ptr<Tensor>> tensors)
+  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
+{
+    SPDLOG_DEBUG("Kompute OpTensorCopy constructor with params");
+}
+
+OpTensorCopy::~OpTensorCopy()
+{
+    SPDLOG_DEBUG("Kompute OpTensorCopy destructor started");
+
+    SPDLOG_DEBUG("Kompute OpTensorCopy destroying staging tensors");
+}
+
+void
+OpTensorCopy::init()
+{
+    SPDLOG_DEBUG("Kompute OpTensorCopy init called");
+
+    if (this->mTensors.size() < 2) {
+        throw std::runtime_error(
+          "Kompute OpTensorCopy called with less than 2 tensor");
+    }
+
+    for (std::shared_ptr<Tensor> tensor: this->mTensors) {
+        if (!tensor->isInit()) {
+            throw std::runtime_error("Kompute OpTensorCopy tensor parameter has not been initialized");
+        }
+        if (tensor->tensorType() == Tensor::TensorTypes::eStorage) {
+            throw std::runtime_error("Kompute OpTensorCopy tensor parameter is of type storage and hence cannot be used to receive or pass data.");
+        }
+    }
+}
+
+void
+OpTensorCopy::record()
+{
+    SPDLOG_DEBUG("Kompute OpTensorCopy record called");
+
+    // We iterate from the second tensor onwards and record a copy to all
+    for (size_t i = 1; i < this->mTensors.size(); i++) {
+        this->mTensors[i]->recordCopyFrom(this->mCommandBuffer, this->mTensors[0], false);
+    }
+}
+
+void
+OpTensorCopy::postSubmit()
+{
+    SPDLOG_DEBUG("Kompute OpTensorCopy postSubmit called");
+
+    // Copy the data from the first tensor into all the tensors
+    for (size_t i = 1; i < this->mTensors.size(); i++) {
+        this->mTensors[i]->setData(this->mTensors[0]->data());
+    }
+}
+
+}
+
diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index bca6c3500..a229a5eec 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -41,8 +41,7 @@ Tensor::~Tensor()
 
 void
 Tensor::init(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-             std::shared_ptr<vk::Device> device,
-             std::shared_ptr<vk::CommandBuffer> commandBuffer)
+             std::shared_ptr<vk::Device> device)
 {
     SPDLOG_DEBUG("Kompute Tensor running init with Vulkan params and num data "
                  "elementS: {}",
@@ -50,7 +49,6 @@ Tensor::init(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
 
     this->mPhysicalDevice = physicalDevice;
     this->mDevice = device;
-    this->mCommandBuffer = commandBuffer;
 
     this->mIsInit = true;
 
@@ -106,8 +104,10 @@ Tensor::setData(const std::vector<float>& data)
 }
 
 void
-Tensor::recordCopyFrom(std::shared_ptr<Tensor> copyFromTensor,
-                       bool createBarrier)
+Tensor::recordCopyFrom(
+            std::shared_ptr<vk::CommandBuffer> commandBuffer,
+            std::shared_ptr<Tensor> copyFromTensor,
+            bool createBarrier)
 {
     SPDLOG_DEBUG("Kompute Tensor recordCopyFrom called");
 
@@ -121,12 +121,13 @@ Tensor::recordCopyFrom(std::shared_ptr<Tensor> copyFromTensor,
 
     SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);
 
-    this->mCommandBuffer->copyBuffer(
+    commandBuffer->copyBuffer(
       *copyFromTensor->mBuffer, *this->mBuffer, copyRegion);
 
     if (createBarrier) {
         // Buffer to ensure wait until data is copied to staging buffer
-        this->recordBufferMemoryBarrier(vk::AccessFlagBits::eTransferWrite,
+        this->recordBufferMemoryBarrier(commandBuffer,
+                                        vk::AccessFlagBits::eTransferWrite,
                                         vk::AccessFlagBits::eHostRead,
                                         vk::PipelineStageFlagBits::eTransfer,
                                         vk::PipelineStageFlagBits::eHost);
@@ -134,7 +135,8 @@ Tensor::recordCopyFrom(std::shared_ptr<Tensor> copyFromTensor,
 }
 
 void
-Tensor::recordBufferMemoryBarrier(vk::AccessFlagBits srcAccessMask,
+Tensor::recordBufferMemoryBarrier(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                                  vk::AccessFlagBits srcAccessMask,
                                   vk::AccessFlagBits dstAccessMask,
                                   vk::PipelineStageFlagBits srcStageMask,
                                   vk::PipelineStageFlagBits dstStageMask)
@@ -151,7 +153,7 @@ Tensor::recordBufferMemoryBarrier(vk::AccessFlagBits srcAccessMask,
     bufferMemoryBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
     bufferMemoryBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
 
-    this->mCommandBuffer->pipelineBarrier(srcStageMask,
+    commandBuffer->pipelineBarrier(srcStageMask,
                                           dstStageMask,
                                           vk::DependencyFlags(),
                                           nullptr,
diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp
index eea3d6c44..ee34949e0 100644
--- a/src/include/kompute/Tensor.hpp
+++ b/src/include/kompute/Tensor.hpp
@@ -25,9 +25,9 @@ class Tensor
      */
     enum class TensorTypes
     {
-        eDevice = 0,
-        eStaging = 1,
-        eStorage = 2,
+        eDevice = 0, ///< Type is device memory, source and destination
+        eStaging = 1, ///< Type is host memory, source and destination
+        eStorage = 2, ///< Type is device memory (only)
     };
 
     /**
@@ -55,8 +55,7 @@ class Tensor
      * Initialiser which calls the initialisation for all the respective tensors as well as creates the respective staging tensors. The staging tensors woudl only be created for the tensors of type TensorType::eDevice as otherwise there is no need to copy from host memory.
      */
     void init(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-              std::shared_ptr<vk::Device> device,
-              std::shared_ptr<vk::CommandBuffer> commandBuffer);
+              std::shared_ptr<vk::Device> device);
 
     /**
      * Destroys and frees the GPU resources which include the buffer and memory.
@@ -119,23 +118,27 @@ class Tensor
      * thensor. This is intended to pass memory into a processing, to perform
      * a staging buffer transfer, or to gather output (between others).
      *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
      * @param copyFromTensor Tensor to copy the data from
      * @param createBarrier Whether to create a barrier that ensures the data is
      * copied before further operations. Default is true.
      */
-    void recordCopyFrom(std::shared_ptr<Tensor> copyFromTensor,
+    void recordCopyFrom(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                        std::shared_ptr<Tensor> copyFromTensor,
                         bool createBarrier);
 
     /**
      * Records the buffer memory barrier into the command buffer which
      * ensures that relevant data transfers are carried out correctly.
      *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
      * @param srcAccessMask Access flags for source access mask
      * @param dstAccessMask Access flags for destination access mask
      * @param scrStageMask Pipeline stage flags for source stage mask
      * @param dstStageMask Pipeline stage flags for destination stage mask
      */
-    void recordBufferMemoryBarrier(vk::AccessFlagBits srcAccessMask,
+    void recordBufferMemoryBarrier(std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                                   vk::AccessFlagBits srcAccessMask,
                                    vk::AccessFlagBits dstAccessMask,
                                    vk::PipelineStageFlagBits srcStageMask,
                                    vk::PipelineStageFlagBits dstStageMask);
@@ -163,7 +166,6 @@ class Tensor
     // -------------- NEVER OWNED RESOURCES
     std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice;
     std::shared_ptr<vk::Device> mDevice;
-    std::shared_ptr<vk::CommandBuffer> mCommandBuffer;
 
     // -------------- OPTIONALLY OWNED RESOURCES
     std::shared_ptr<vk::Buffer> mBuffer;
diff --git a/src/include/kompute/operations/OpAlgoBase.hpp b/src/include/kompute/operations/OpAlgoBase.hpp
index 9709e146b..86e03c442 100644
--- a/src/include/kompute/operations/OpAlgoBase.hpp
+++ b/src/include/kompute/operations/OpAlgoBase.hpp
@@ -260,7 +260,7 @@ OpAlgoBase<tX, tY, tZ>::init()
             std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
               tensor->data(), Tensor::TensorTypes::eStaging);
             stagingTensor->init(
-                this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
+                this->mPhysicalDevice, this->mDevice);
             this->mOutputStagingTensors.push_back(stagingTensor);
         }
     }
@@ -283,6 +283,7 @@ OpAlgoBase<tX, tY, tZ>::record()
     // Barrier to ensure the data is finished writing to buffer memory
     for (std::shared_ptr<Tensor> tensor : this->mTensors) {
         tensor->recordBufferMemoryBarrier(
+          this->mCommandBuffer,
           vk::AccessFlagBits::eHostWrite,
           vk::AccessFlagBits::eShaderRead,
           vk::PipelineStageFlagBits::eHost,
@@ -295,6 +296,7 @@ OpAlgoBase<tX, tY, tZ>::record()
         // Barrier to ensure the shader code is executed before buffer read
         for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
             tensor->recordBufferMemoryBarrier(
+              this->mCommandBuffer,
               vk::AccessFlagBits::eShaderWrite,
               vk::AccessFlagBits::eTransferRead,
               vk::PipelineStageFlagBits::eComputeShader,
@@ -304,7 +306,9 @@ OpAlgoBase<tX, tY, tZ>::record()
         // Record copy from and create barrier for STAGING tensors
         for (size_t i = 0; i < this->mTensors.size(); i++) {
             this->mOutputStagingTensors[i]->recordCopyFrom(
-                this->mTensors[i], true);
+                this->mCommandBuffer,
+                this->mTensors[i], 
+                true);
         }
     }
 }
diff --git a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
index ecb7e33d2..1f9605878 100644
--- a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
+++ b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
@@ -128,7 +128,7 @@ OpAlgoLhsRhsOut<tX, tY, tZ>::init()
         throw std::runtime_error(
           "Kompute OpAlgoLhsRhsOut called with less than 1 tensor");
     } else if (this->mTensors.size() > 3) {
-        spdlog::warn("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
+        SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
     }
 
     this->mTensorLHS = this->mTensors[0];
diff --git a/src/include/kompute/operations/OpCreateTensor.hpp b/src/include/kompute/operations/OpCreateTensor.hpp
index 025bf2862..e04e65062 100644
--- a/src/include/kompute/operations/OpCreateTensor.hpp
+++ b/src/include/kompute/operations/OpCreateTensor.hpp
@@ -47,8 +47,10 @@ class OpCreateTensor : public OpBase
     void init() override;
 
     /**
-     * Records the copy command into the GPU memory from the staging or host
-     * memory depending on the type of tensor.
+     * Record runs the core actions to create the tensors. For device tensors
+     * it records a copy command to move the data from the staging tensor to the
+     * device tensor. For staging tensors it calls mapDataIntoHostMemory, which
+     * runs immediately rather than on sequence eval/submission.
      */
     void record() override;
 
diff --git a/src/include/kompute/operations/OpTensorCopy.hpp b/src/include/kompute/operations/OpTensorCopy.hpp
new file mode 100644
index 000000000..646d1b158
--- /dev/null
+++ b/src/include/kompute/operations/OpTensorCopy.hpp
@@ -0,0 +1,57 @@
+#pragma once
+
+#include "kompute/Core.hpp"
+
+#include "kompute/Tensor.hpp"
+
+#include "kompute/operations/OpBase.hpp"
+
+namespace kp {
+
+/**
+    Operation that copies the data from the first tensor to the rest of the tensors provided, using a record command for all the vectors. This operation does not own/manage the memory of the tensors passed to it.
+*/
+class OpTensorCopy : public OpBase
+{
+  public:
+    OpTensorCopy();
+
+    /**
+     * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation.
+     *
+     * @param physicalDevice Vulkan physical device used to find device queues
+     * @param device Vulkan logical device for passing to Algorithm
+     * @param commandBuffer Vulkan Command Buffer to record commands into
+     * @param tensors Tensors that will be used in the operation; the first is the copy source and the rest are the destinations.
+     */
+    OpTensorCopy(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                   std::shared_ptr<vk::Device> device,
+                   std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                   std::vector<std::shared_ptr<Tensor>> tensors);
+
+    /**
+     * Default destructor which in this case expects the parent class to free
+     * the tensors
+     */
+    ~OpTensorCopy() override;
+
+    /**
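+     * Validates the tensors: at least two are required, each must already be initialised, and none may be of type TensorTypes::eStorage. Throws std::runtime_error otherwise.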
+     */
+    void init() override;
+
+    /**
+     * Records the copy commands from the first tensor into all the other tensors provided. No barrier is recorded after the copies.
+     */
+    void record() override;
+
+    /**
+     * Copies the local vectors for all the tensors to sync the data with the gpu.
+     */
+    void postSubmit() override;
+
+  private:
+};
+
+} // End namespace kp
+
diff --git a/test/TestTensor.cpp b/test/TestTensor.cpp
index d6195ed25..94f8aa92d 100644
--- a/test/TestTensor.cpp
+++ b/test/TestTensor.cpp
@@ -30,9 +30,7 @@ TEST(TestTensor, CopyFromHostData) {
 
         sq->record<kp::OpCreateTensor>({tensorA, tensorB});
 
-        tensorA->mapDataIntoHostMemory();
-
-        tensorB->recordCopyFrom(tensorA, true);
+        sq->record<kp::OpTensorCopy>({tensorA, tensorB});
 
         sq->end();