From 8ec486b4abb8d02229e5e2469919c8c6c2c419c4 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Wed, 14 Oct 2020 21:50:33 +0100
Subject: [PATCH 01/19] Added initial async capability

---
 src/Sequence.cpp                 | 58 ++++++++++++++++++++++++++------
 src/include/kompute/Manager.hpp  | 57 +++++++++++++++++++++++++++++++
 src/include/kompute/Sequence.hpp | 27 ++++++++++++++-
 3 files changed, 130 insertions(+), 12 deletions(-)
diff --git a/src/Sequence.cpp b/src/Sequence.cpp
index 7f2c63a44..6fd9773e2 100644
--- a/src/Sequence.cpp
+++ b/src/Sequence.cpp
@@ -118,38 +118,74 @@ Sequence::end()
 bool
 Sequence::eval()
 {
-    SPDLOG_DEBUG("Kompute sequence compute recording EVAL");
+    SPDLOG_DEBUG("Kompute sequence EVAL BEGIN");
 
-    if (this->isRecording()) {
-        SPDLOG_WARN("Kompute Sequence eval called when still recording");
+    bool evalResult = this->evalAsync();
+    if (!evalResult) {
+        SPDLOG_DEBUG("Kompute sequence EVAL FAILURE");
         return false;
     }
 
+    evalResult = this->evalAwait();
+
+    SPDLOG_DEBUG("Kompute sequence EVAL SUCCESS");
+
+    return evalResult;
+}
+
+bool
+Sequence::evalAsync()
+{
+    if (this->isRecording()) {
+        SPDLOG_WARN("Kompute Sequence evalAsync called when still recording");
+        return false;
+    }
+    if (this->mEvalBusy) {
+        SPDLOG_WARN("Kompute Sequence evalAsync called when an eval async was called without successful wait");
+        return false;
+    }
+
+    this->mEvalBusy = true;
+
     for (size_t i = 0; i < this->mOperations.size(); i++) {
         this->mOperations[i]->preEval();
     }
 
     const vk::PipelineStageFlags waitStageMask =
-      vk::PipelineStageFlagBits::eTransfer;
+      vk::PipelineStageFlagBits::eTransfer & vk::PipelineStageFlagBits::eComputeShader;
     vk::SubmitInfo submitInfo(
       0, nullptr, &waitStageMask, 1, this->mCommandBuffer.get());
 
-    vk::Fence fence = this->mDevice->createFence(vk::FenceCreateInfo());
+    this->mFence = this->mDevice->createFence(vk::FenceCreateInfo());
 
     SPDLOG_DEBUG(
       "Kompute sequence submitting command buffer into compute queue");
 
-    this->mComputeQueue->submit(1, &submitInfo, fence);
-    this->mDevice->waitForFences(1, &fence, VK_TRUE, UINT64_MAX);
-    this->mDevice->destroy(fence);
+    this->mComputeQueue->submit(1, &submitInfo, this->mFence);
+}
+
+bool
+Sequence::evalAwait(uint64_t waitFor)
+{
+    if (!this->mEvalBusy) {
+        SPDLOG_WARN("Kompute Sequence evalAwait called without existing eval");
+        return false;
+    }
+
+    vk::Result result = this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor);
+    this->mDevice->destroy(this->mFence);
+
+    if (result == vk::Result::eTimeout) {
+        SPDLOG_WARN("Kompute Sequence evalAwait timed out");
+        this->mEvalBusy = false;
+        return false;
+    }
 
     for (size_t i = 0; i < this->mOperations.size(); i++) {
         this->mOperations[i]->postEval();
     }
 
-    SPDLOG_DEBUG("Kompute sequence EVAL success");
-
-    return true;
+    this->mEvalBusy = false;
 }
 
 bool
diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp
index e2c7832fd..70eea2e73 100644
--- a/src/include/kompute/Manager.hpp
+++ b/src/include/kompute/Manager.hpp
@@ -96,6 +96,63 @@ class Manager
         SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS");
     }
 
+    /**
+     * Operation that adds extra operations to existing or new created
+     * sequences.
+     *
+     * @param tensors The tensors to be used in the operation recorded
+     * @param sequenceName The name of the sequence to be retrieved or created
+     * @param TArgs Template parameters that will be used to initialise
+     * Operation to allow for extensible configurations on initialisation
+     */
+    template<typename T, typename... TArgs>
+    void evalOpAsync(std::vector<std::shared_ptr<Tensor>> tensors,
+                std::string sequenceName,
+                TArgs&&... params)
+    {
+        SPDLOG_DEBUG("Kompute Manager evalOp triggered");
+        std::weak_ptr<Sequence> sqWeakPtr =
+          this->getOrCreateManagedSequence(sequenceName);
+
+        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+            SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
+            sq->begin();
+
+            SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
+            sq->record<T>(tensors, std::forward<TArgs>(params)...);
+
+            SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
+            sq->end();
+
+            SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
+            sq->evalAsync();
+        }
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS");
+    }
+
+    /**
+     * Operation that adds extra operations to existing or new created
+     * sequences.
+     *
+     * @param tensors The tensors to be used in the operation recorded
+     * @param sequenceName The name of the sequence to be retrieved or created
+     * @param TArgs Template parameters that will be used to initialise
+     * Operation to allow for extensible configurations on initialisation
+     */
+    template<typename T, typename... TArgs>
+    void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX)
+    {
+        SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered");
+        std::weak_ptr<Sequence> sqWeakPtr =
+          this->getOrCreateManagedSequence(sequenceName);
+
+        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+            SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence EVAL AWAIT");
+            sq->evalAwait(waitFor);
+        }
+        SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS");
+    }
+
     /**
      * Operation that adds extra operations to existing or new created
      * sequences.
diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp
index 6e4a4ab4f..749fdbeeb 100644
--- a/src/include/kompute/Sequence.hpp
+++ b/src/include/kompute/Sequence.hpp
@@ -45,19 +45,42 @@ class Sequence
     /**
      * Begins recording commands for commands to be submitted into the command
      * buffer.
+     *
+     * @return Boolean stating whether execution was successful.
      */
     bool begin();
+
     /**
      * Ends the recording and stops recording commands when the record command
      * is sent.
+     *
+     * @return Boolean stating whether execution was successful.
      */
     bool end();
+
     /**
      * Eval sends all the recorded and stored operations in the vector of
      * operations into the gpu as a submit job with a barrier.
+     *
+     * @return Boolean stating whether execution was successful.
      */
     bool eval();
 
+    /**
+     * Eval Async sends all the recorded and stored operations in the vector of operations into the gpu as a submit job with a barrier. EvalAwait() must be called after to ensure the sequence is terminated correctly.
+     *
+     * @return Boolean stating whether execution was successful.
+     */
+    bool evalAsync();
+
+    /**
+     * Eval Await waits for the fence to finish processing and then once it finishes, it runs the postEval of all operations.
+     *
+     * @param waitFor Number of milliseconds to wait before timing out.
+     * @return Boolean stating whether execution was successful.
+     */
+    bool evalAwait(uint64_t waitFor = UINT64_MAX);
+
     /**
      * Returns true if the sequence is currently in recording activated.
      *
@@ -134,12 +157,14 @@ class Sequence
     std::shared_ptr<vk::CommandBuffer> mCommandBuffer = nullptr;
     bool mFreeCommandBuffer = false;
 
-    // Base op objects
+    // -------------- ALWAYS OWNED RESOURCES
+    vk::Fence mFence;
     std::vector<std::unique_ptr<OpBase>> mOperations;
 
     // State
     bool mIsInit = false;
     bool mRecording = false;
+    bool mEvalBusy = false;
 
     // Create functions
     void createCommandPool();

From 33df1dec4e8e157ce8d4860a78f85230af3f61e3 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Thu, 15 Oct 2020 07:16:01 +0100
Subject: [PATCH 02/19] Added evalOpAwait and evalOpAsync into the manager

---
 src/include/kompute/Manager.hpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp
index 70eea2e73..7f95aa9ab 100644
--- a/src/include/kompute/Manager.hpp
+++ b/src/include/kompute/Manager.hpp
@@ -143,14 +143,16 @@ class Manager
     void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX)
     {
         SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered");
-        std::weak_ptr<Sequence> sqWeakPtr =
-          this->getOrCreateManagedSequence(sequenceName);
+        std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator found =
+              this->mManagedSequences.find(sequenceName);
 
-        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
-            SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence EVAL AWAIT");
-            sq->evalAwait(waitFor);
+        if (found == this->mManagedSequences.end()) {
+            if (std::shared_ptr<kp::Sequence> sq = found->second) {
+                SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence Sequence EVAL AWAIT");
+                sq->evalAwait(waitFor);
+            }
+            SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS");
         }
-        SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS");
     }
 
     /**

From 48805e16399751f3df54ca1dcfd8ad404a82a84f Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Thu, 15 Oct 2020 09:25:16 +0100
Subject: [PATCH 03/19] Updated to function for async

---
 single_include/kompute/Kompute.hpp            | 98 ++++++++++++++++++-
 src/Sequence.cpp                              | 21 ++--
 src/include/kompute/Manager.hpp               | 33 ++++---
 src/include/kompute/Sequence.hpp              |  9 +-
 src/include/kompute/operations/OpAlgoBase.hpp |  2 +-
 5 files changed, 138 insertions(+), 25 deletions(-)

diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index 2718b6599..ee8705df5 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -1033,19 +1033,42 @@ class Sequence
     /**
      * Begins recording commands for commands to be submitted into the command
      * buffer.
+     *
+     * @return Boolean stating whether execution was successful.
      */
     bool begin();
+
     /**
      * Ends the recording and stops recording commands when the record command
      * is sent.
+     *
+     * @return Boolean stating whether execution was successful.
      */
     bool end();
+
     /**
      * Eval sends all the recorded and stored operations in the vector of
      * operations into the gpu as a submit job with a barrier.
+     *
+     * @return Boolean stating whether execution was successful.
      */
     bool eval();
 
+    /**
+     * Eval Async sends all the recorded and stored operations in the vector of operations into the gpu as a submit job with a barrier. EvalAwait() must be called after to ensure the sequence is terminated correctly.
+     *
+     * @return Boolean stating whether execution was successful.
+     */
+    bool evalAsync();
+
+    /**
+     * Eval Await waits for the fence to finish processing and then once it finishes, it runs the postEval of all operations.
+     *
+     * @param waitFor Number of milliseconds to wait before timing out.
+     * @return Boolean stating whether execution was successful.
+     */
+    bool evalAwait(uint64_t waitFor = UINT64_MAX);
+
     /**
      * Returns true if the sequence is currently in recording activated.
      *
@@ -1053,6 +1076,13 @@ class Sequence
      */
     bool isRecording();
 
+    /**
+     * Returns true if the sequence is currently running - mostly used for async workloads.
+     *
+     * @return Boolean stating if currently running.
+     */
+    bool isRunning();
+
     /**
      * Returns true if the sequence has been successfully initialised.
      *
@@ -1122,12 +1152,14 @@ class Sequence
     std::shared_ptr<vk::CommandBuffer> mCommandBuffer = nullptr;
     bool mFreeCommandBuffer = false;
 
-    // Base op objects
+    // -------------- ALWAYS OWNED RESOURCES
+    vk::Fence mFence;
     std::vector<std::unique_ptr<OpBase>> mOperations;
 
     // State
     bool mIsInit = false;
     bool mRecording = false;
+    bool mIsRunning = false;
 
     // Create functions
     void createCommandPool();
@@ -1292,6 +1324,68 @@ class Manager
         SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS");
     }
 
+    /**
+     * Operation that adds extra operations to existing or new created
+     * sequences.
+     *
+     * @param tensors The tensors to be used in the operation recorded
+     * @param sequenceName The name of the sequence to be retrieved or created
+     * @param params Template parameters that will be used to initialise
+     * Operation to allow for extensible configurations on initialisation
+     */
+    template<typename T, typename... TArgs>
+    void evalOpAsync(std::vector<std::shared_ptr<Tensor>> tensors,
+                std::string sequenceName,
+                TArgs&&... params)
+    {
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered");
+        std::weak_ptr<Sequence> sqWeakPtr =
+          this->getOrCreateManagedSequence(sequenceName);
+
+        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
+            sq->begin();
+
+            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
+            sq->record<T>(tensors, std::forward<TArgs>(params)...);
+
+            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
+            sq->end();
+
+            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
+            sq->evalAsync();
+        }
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
+    }
+
+    /**
+     * Operation that adds extra operations to existing or new created
+     * sequences.
+     *
+     * @param sequenceName The name of the sequence to wait for termination
+     * @param waitFor The amount of time to wait before timing out
+     */
+    template<typename... TArgs>
+    void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX)
+    {
+        SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered");
+        std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator found =
+        this->mManagedSequences.find(sequenceName);
+
+        if (found != this->mManagedSequences.end()) {
+            if (std::shared_ptr<kp::Sequence> sq = found->second) {
+                SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence Sequence EVAL AWAIT");
+                if (sq->isRunning()) {
+                    sq->evalAwait(waitFor);
+                }
+            }
+            SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS");
+        }
+        else {
+            SPDLOG_ERROR("Sequence not found");
+ 		}
+    }
+
     /**
      * Operation that adds extra operations to existing or new created
      * sequences.
@@ -1599,7 +1693,7 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                            std::vector<std::shared_ptr<Tensor>>& tensors)
   : OpBase(physicalDevice, device, commandBuffer, tensors, false)
 {
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {} , shaderFilePath: {}", tensors.size());
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
 
     // The dispatch size is set up based on either explicitly provided template
     // parameters or by default it would take the shape and size of the tensors
diff --git a/src/Sequence.cpp b/src/Sequence.cpp
index 6fd9773e2..9038db2a7 100644
--- a/src/Sequence.cpp
+++ b/src/Sequence.cpp
@@ -140,12 +140,12 @@ Sequence::evalAsync()
         SPDLOG_WARN("Kompute Sequence evalAsync called when still recording");
         return false;
     }
-    if (this->mEvalBusy) {
+    if (this->mIsRunning) {
         SPDLOG_WARN("Kompute Sequence evalAsync called when an eval async was called without successful wait");
         return false;
     }
 
-    this->mEvalBusy = true;
+    this->mIsRunning = true;
 
     for (size_t i = 0; i < this->mOperations.size(); i++) {
         this->mOperations[i]->preEval();
@@ -154,7 +154,7 @@ Sequence::evalAsync()
     const vk::PipelineStageFlags waitStageMask =
       vk::PipelineStageFlagBits::eTransfer & vk::PipelineStageFlagBits::eComputeShader;
     vk::SubmitInfo submitInfo(
-      0, nullptr, &waitStageMask, 1, this->mCommandBuffer.get());
+      0, nullptr, nullptr, 1, this->mCommandBuffer.get());
 
     this->mFence = this->mDevice->createFence(vk::FenceCreateInfo());
 
@@ -162,12 +162,14 @@ Sequence::evalAsync()
       "Kompute sequence submitting command buffer into compute queue");
 
     this->mComputeQueue->submit(1, &submitInfo, this->mFence);
+
+    return true;
 }
 
 bool
 Sequence::evalAwait(uint64_t waitFor)
 {
-    if (!this->mEvalBusy) {
+    if (!this->mIsRunning) {
         SPDLOG_WARN("Kompute Sequence evalAwait called without existing eval");
         return false;
     }
@@ -177,7 +179,7 @@ Sequence::evalAwait(uint64_t waitFor)
 
     if (result == vk::Result::eTimeout) {
         SPDLOG_WARN("Kompute Sequence evalAwait timed out");
-        this->mEvalBusy = false;
+        this->mIsRunning = false;
         return false;
     }
 
@@ -185,7 +187,14 @@ Sequence::evalAwait(uint64_t waitFor)
         this->mOperations[i]->postEval();
     }
 
-    this->mEvalBusy = false;
+    this->mIsRunning = false;
+
+    return true;
+}
+
+bool
+Sequence::isRunning() {
+    return this->mIsRunning;
 }
 
 bool
diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp
index 7f95aa9ab..e795bbf2e 100644
--- a/src/include/kompute/Manager.hpp
+++ b/src/include/kompute/Manager.hpp
@@ -102,7 +102,7 @@ class Manager
      *
      * @param tensors The tensors to be used in the operation recorded
      * @param sequenceName The name of the sequence to be retrieved or created
-     * @param TArgs Template parameters that will be used to initialise
+     * @param params Template parameters that will be used to initialise
      * Operation to allow for extensible configurations on initialisation
      */
     template<typename T, typename... TArgs>
@@ -110,49 +110,52 @@ class Manager
                 std::string sequenceName,
                 TArgs&&... params)
     {
-        SPDLOG_DEBUG("Kompute Manager evalOp triggered");
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered");
         std::weak_ptr<Sequence> sqWeakPtr =
           this->getOrCreateManagedSequence(sequenceName);
 
         if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
+            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
             sq->begin();
 
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
+            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
             sq->record<T>(tensors, std::forward<TArgs>(params)...);
 
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
+            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
             sq->end();
 
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
+            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
             sq->evalAsync();
         }
-        SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS");
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
     }
 
     /**
      * Operation that adds extra operations to existing or new created
      * sequences.
      *
-     * @param tensors The tensors to be used in the operation recorded
-     * @param sequenceName The name of the sequence to be retrieved or created
-     * @param TArgs Template parameters that will be used to initialise
-     * Operation to allow for extensible configurations on initialisation
+     * @param sequenceName The name of the sequence to wait for termination
+     * @param waitFor The amount of time to wait before timing out
      */
-    template<typename T, typename... TArgs>
+    template<typename... TArgs>
     void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX)
     {
         SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered");
         std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator found =
-              this->mManagedSequences.find(sequenceName);
+        this->mManagedSequences.find(sequenceName);
 
-        if (found == this->mManagedSequences.end()) {
+        if (found != this->mManagedSequences.end()) {
             if (std::shared_ptr<kp::Sequence> sq = found->second) {
                 SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence Sequence EVAL AWAIT");
-                sq->evalAwait(waitFor);
+                if (sq->isRunning()) {
+                    sq->evalAwait(waitFor);
+                }
             }
             SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS");
         }
+        else {
+            SPDLOG_ERROR("Sequence not found");
+ 		}
     }
 
     /**
diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp
index 749fdbeeb..2d9ef8e51 100644
--- a/src/include/kompute/Sequence.hpp
+++ b/src/include/kompute/Sequence.hpp
@@ -88,6 +88,13 @@ class Sequence
      */
     bool isRecording();
 
+    /**
+     * Returns true if the sequence is currently running - mostly used for async workloads.
+     *
+     * @return Boolean stating if currently running.
+     */
+    bool isRunning();
+
     /**
      * Returns true if the sequence has been successfully initialised.
      *
@@ -164,7 +171,7 @@ class Sequence
     // State
     bool mIsInit = false;
     bool mRecording = false;
-    bool mEvalBusy = false;
+    bool mIsRunning = false;
 
     // Create functions
     void createCommandPool();
diff --git a/src/include/kompute/operations/OpAlgoBase.hpp b/src/include/kompute/operations/OpAlgoBase.hpp
index 3d1de2992..a32e20011 100644
--- a/src/include/kompute/operations/OpAlgoBase.hpp
+++ b/src/include/kompute/operations/OpAlgoBase.hpp
@@ -162,7 +162,7 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                            std::vector<std::shared_ptr<Tensor>>& tensors)
   : OpBase(physicalDevice, device, commandBuffer, tensors, false)
 {
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {} , shaderFilePath: {}", tensors.size());
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
 
     // The dispatch size is set up based on either explicitly provided template
     // parameters or by default it would take the shape and size of the tensors

From 3e5364fc4448e079f753c980f76d74754c18b4e5 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Thu, 15 Oct 2020 09:25:23 +0100
Subject: [PATCH 04/19] Added async test

---
 test/TestAsyncOperations.cpp | 95 ++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 test/TestAsyncOperations.cpp

diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp
new file mode 100644
index 000000000..ba8b203de
--- /dev/null
+++ b/test/TestAsyncOperations.cpp
@@ -0,0 +1,95 @@
+
+#include "gtest/gtest.h"
+
+#include <chrono>
+
+#include "kompute/Kompute.hpp"
+
+TEST(TestAsyncOperations, TestManagerAsync)
+{
+    uint32_t size = 100000;
+
+    std::vector<float> data(size, 0.0);
+    std::vector<float> resultSync(size, 100000);
+    std::vector<float> resultAsync(size, 200000);
+
+    std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor(data) };
+    std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor(data) };
+    std::shared_ptr<kp::Tensor> tensorC{ new kp::Tensor(data) };
+    std::shared_ptr<kp::Tensor> tensorD{ new kp::Tensor(data) };
+    std::shared_ptr<kp::Tensor> tensorE{ new kp::Tensor(data) };
+    std::shared_ptr<kp::Tensor> tensorF{ new kp::Tensor(data) };
+
+    kp::Manager mgr;
+
+    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB, tensorC, tensorD, tensorE, tensorF });
+
+    std::string shader(R"(
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        layout(set = 0, binding = 0) buffer a { float pa[]; };
+        layout(set = 0, binding = 1) buffer b { float pb[]; };
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+
+            for (int i = 0; i < 100000; i++)
+            {
+                pa[index] += 1.0;
+            }
+
+            pb[index] = pa[index];
+        }
+    )");
+
+    auto startSync = std::chrono::high_resolution_clock::now();
+
+    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+      { tensorA, tensorB }, std::vector<char>(shader.begin(), shader.end()));
+
+    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+      { tensorC, tensorD }, std::vector<char>(shader.begin(), shader.end()));
+
+    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+      { tensorE, tensorF }, std::vector<char>(shader.begin(), shader.end()));
+
+    auto endSync = std::chrono::high_resolution_clock::now();
+
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorB, tensorD, tensorF });
+
+    EXPECT_EQ(tensorB->data(), resultSync);
+    EXPECT_EQ(tensorD->data(), resultSync);
+    EXPECT_EQ(tensorF->data(), resultSync);
+
+    auto durationSync = std::chrono::duration_cast<std::chrono::microseconds>(endSync - startSync).count();
+
+    auto startAsync = std::chrono::high_resolution_clock::now();
+
+    mgr.evalOpAsync<kp::OpAlgoBase<>>(
+      { tensorA, tensorB }, "asyncOne", std::vector<char>(shader.begin(), shader.end()));
+
+    mgr.evalOpAsync<kp::OpAlgoBase<>>(
+      { tensorC, tensorD }, "asyncTwo", std::vector<char>(shader.begin(), shader.end()));
+
+    mgr.evalOpAsync<kp::OpAlgoBase<>>(
+      { tensorE, tensorF }, "asyncThree", std::vector<char>(shader.begin(), shader.end()));
+
+    mgr.evalOpAwait("asyncOne");
+    mgr.evalOpAwait("asyncTwo");
+    mgr.evalOpAwait("asyncThree");
+
+    auto endAsync = std::chrono::high_resolution_clock::now();
+
+    auto durationAsync = std::chrono::duration_cast<std::chrono::microseconds>(endAsync - startAsync).count();
+
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorB, tensorD, tensorF });
+
+    EXPECT_EQ(tensorB->data(), resultAsync);
+    EXPECT_EQ(tensorD->data(), resultAsync);
+    EXPECT_EQ(tensorF->data(), resultAsync);
+
+    SPDLOG_DEBUG("Total Sync: {}", durationSync);
+    SPDLOG_DEBUG("Total Async: {}", durationAsync);
+}

From 4e697bb7873c3edd07bd57fdf711688d0194919d Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Thu, 15 Oct 2020 21:40:31 +0100
Subject: [PATCH 05/19] Updated to current multiple queue implementation

---
 single_include/kompute/Kompute.hpp |  41 +++++++++---
 src/Manager.cpp                    |  47 +++++++++-----
 src/include/kompute/Manager.hpp    |  41 +++++++++---
 test/TestAsyncOperations.cpp       | 101 +++++++++++++++++------------
 4 files changed, 151 insertions(+), 79 deletions(-)

diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index ee8705df5..a9860425c 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -1253,19 +1253,22 @@ class Manager
     Manager();
 
     /**
-        Similar to base constructor but allows the user to provide the device
-       they would like to create the resources on.
+      * Similar to base constructor but allows the user to provide the device
+      * they would like to create the resources on.
+      *
+      * @param physicalDeviceIndex The index of the physical device to use
+      * @param totalQueues The total number of compute queues to create.
     */
-    Manager(uint32_t physicalDeviceIndex);
+    Manager(uint32_t physicalDeviceIndex, uint32_t totalComputeQueues = 1);
 
     /**
      * Manager constructor which allows your own vulkan application to integrate
      * with the vulkan kompute use.
      *
      * @param instance Vulkan compute instance to base this application
-     * @physicalDevice Vulkan physical device to use for application
-     * @device Vulkan logical device to use for all base resources
-     * @physicalDeviceIndex Index for vulkan physical device used
+     * @param physicalDevice Vulkan physical device to use for application
+     * @param device Vulkan logical device to use for all base resources
+     * @param physicalDeviceIndex Index for vulkan physical device used
      */
     Manager(std::shared_ptr<vk::Instance> instance,
             std::shared_ptr<vk::PhysicalDevice> physicalDevice,
@@ -1290,6 +1293,16 @@ class Manager
     std::weak_ptr<Sequence> getOrCreateManagedSequence(
       std::string sequenceName);
 
+    /**
+     * Create a new managed Kompute sequence so it's available within the manager.
+     *
+     * @param sequenceName The name for the named sequence to be created
+     * @param queueIndex The queue to use from the available queues
+     * @return Weak pointer to the manager owned sequence resource
+     */
+    std::weak_ptr<Sequence> createManagedSequence(
+      std::string sequenceName, uint32_t queueIndex = 0);
+
     /**
      * Operation that adds extra operations to existing or new created
      * sequences.
@@ -1342,7 +1355,12 @@ class Manager
         std::weak_ptr<Sequence> sqWeakPtr =
           this->getOrCreateManagedSequence(sequenceName);
 
-        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+        std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator found =
+          this->mManagedSequences.find(sequenceName);
+
+        if (found == this->mManagedSequences.end()) {
+            std::shared_ptr<Sequence> sq = found->second;
+
             SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
             sq->begin();
 
@@ -1355,6 +1373,9 @@ class Manager
             SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
             sq->evalAsync();
         }
+        else {
+            SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found", sequenceName);
+        }
         SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
     }
 
@@ -1382,7 +1403,7 @@ class Manager
             SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS");
         }
         else {
-            SPDLOG_ERROR("Sequence not found");
+            SPDLOG_ERROR("Kompute Manager evalOpAwait Sequence not found");
  		}
     }
 
@@ -1437,7 +1458,7 @@ class Manager
     std::shared_ptr<vk::Device> mDevice = nullptr;
     bool mFreeDevice = false;
     uint32_t mComputeQueueFamilyIndex = -1;
-    std::shared_ptr<vk::Queue> mComputeQueue = nullptr;
+    std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;
 
     // -------------- ALWAYS OWNED RESOURCES
     std::unordered_map<std::string, std::shared_ptr<Sequence>>
@@ -1452,7 +1473,7 @@ class Manager
 
     // Create functions
     void createInstance();
-    void createDevice();
+    void createDevice(uint32_t totalComputeQueues);
 };
 
 } // End namespace kp
diff --git a/src/Manager.cpp b/src/Manager.cpp
index 1debb4c9a..58f067843 100644
--- a/src/Manager.cpp
+++ b/src/Manager.cpp
@@ -25,15 +25,15 @@ debugMessageCallback(VkDebugReportFlagsEXT flags,
 #endif
 
 Manager::Manager()
-  : Manager(0)
+  : Manager(0, 1)
 {}
 
-Manager::Manager(uint32_t physicalDeviceIndex)
+Manager::Manager(uint32_t physicalDeviceIndex, uint32_t totalComputeQueues)
 {
     this->mPhysicalDeviceIndex = physicalDeviceIndex;
 
     this->createInstance();
-    this->createDevice();
+    this->createDevice(totalComputeQueues);
 }
 
 Manager::Manager(std::shared_ptr<vk::Instance> instance,
@@ -98,19 +98,27 @@ Manager::getOrCreateManagedSequence(std::string sequenceName)
       this->mManagedSequences.find(sequenceName);
 
     if (found == this->mManagedSequences.end()) {
-        std::shared_ptr<Sequence> sq =
-          std::make_shared<Sequence>(this->mPhysicalDevice,
-                                     this->mDevice,
-                                     this->mComputeQueue,
-                                     this->mComputeQueueFamilyIndex);
-        sq->init();
-        this->mManagedSequences.insert({ sequenceName, sq });
-        return sq;
+        return this->createManagedSequence(sequenceName);
     } else {
         return found->second;
     }
 }
 
+std::weak_ptr<Sequence>
+Manager::createManagedSequence(std::string sequenceName, uint32_t queueIndex) {
+
+    SPDLOG_DEBUG("Kompute Manager createManagedSequence with sequenceName: {} and queueIndex: {}", sequenceName, queueIndex);
+
+    std::shared_ptr<Sequence> sq =
+      std::make_shared<Sequence>(this->mPhysicalDevice,
+                                 this->mDevice,
+                                 this->mComputeQueues[queueIndex],
+                                 this->mComputeQueueFamilyIndex);
+    sq->init();
+    this->mManagedSequences.insert({ sequenceName, sq });
+    return sq;
+}
+
 void
 Manager::createInstance()
 {
@@ -197,7 +205,7 @@ Manager::createInstance()
 }
 
 void
-Manager::createDevice()
+Manager::createDevice(uint32_t totalComputeQueues)
 {
 
     SPDLOG_DEBUG("Kompute Manager creating Device");
@@ -248,7 +256,7 @@ Manager::createDevice()
     }
 
     const float defaultQueuePriority(0.0f);
-    const uint32_t defaultQueueCount(1);
+    const uint32_t defaultQueueCount(totalComputeQueues);
     vk::DeviceQueueCreateInfo deviceQueueCreateInfo(
       vk::DeviceQueueCreateFlags(),
       this->mComputeQueueFamilyIndex,
@@ -264,9 +272,16 @@ Manager::createDevice()
       &deviceCreateInfo, nullptr, this->mDevice.get());
     SPDLOG_DEBUG("Kompute Manager device created");
 
-    this->mComputeQueue = std::make_shared<vk::Queue>();
-    this->mDevice->getQueue(
-      this->mComputeQueueFamilyIndex, 0, this->mComputeQueue.get());
+    for (uint32_t i = 0; i < totalComputeQueues; i++) 
+    {
+        std::shared_ptr<vk::Queue> currQueue = std::make_shared<vk::Queue>();
+
+        this->mDevice->getQueue(
+          this->mComputeQueueFamilyIndex, i, currQueue.get());
+
+        this->mComputeQueues.push_back(currQueue);
+    }
+
     SPDLOG_DEBUG("Kompute Manager compute queue obtained");
 }
 
diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp
index e795bbf2e..e2709836c 100644
--- a/src/include/kompute/Manager.hpp
+++ b/src/include/kompute/Manager.hpp
@@ -25,19 +25,22 @@ class Manager
     Manager();
 
     /**
-        Similar to base constructor but allows the user to provide the device
-       they would like to create the resources on.
+      * Similar to base constructor but allows the user to provide the device
+      * they would like to create the resources on.
+      *
+      * @param physicalDeviceIndex The index of the physical device to use
+      * @param totalQueues The total number of compute queues to create.
     */
-    Manager(uint32_t physicalDeviceIndex);
+    Manager(uint32_t physicalDeviceIndex, uint32_t totalComputeQueues = 1);
 
     /**
      * Manager constructor which allows your own vulkan application to integrate
      * with the vulkan kompute use.
      *
      * @param instance Vulkan compute instance to base this application
-     * @physicalDevice Vulkan physical device to use for application
-     * @device Vulkan logical device to use for all base resources
-     * @physicalDeviceIndex Index for vulkan physical device used
+     * @param physicalDevice Vulkan physical device to use for application
+     * @param device Vulkan logical device to use for all base resources
+     * @param physicalDeviceIndex Index for vulkan physical device used
      */
     Manager(std::shared_ptr<vk::Instance> instance,
             std::shared_ptr<vk::PhysicalDevice> physicalDevice,
@@ -62,6 +65,16 @@ class Manager
     std::weak_ptr<Sequence> getOrCreateManagedSequence(
       std::string sequenceName);
 
+    /**
+     * Create a new managed Kompute sequence so it's available within the manager.
+     *
+     * @param sequenceName The name for the named sequence to be created
+     * @param queueIndex The queue to use from the available queues
+     * @return Weak pointer to the manager owned sequence resource
+     */
+    std::weak_ptr<Sequence> createManagedSequence(
+      std::string sequenceName, uint32_t queueIndex = 0);
+
     /**
      * Operation that adds extra operations to existing or new created
      * sequences.
@@ -114,7 +127,12 @@ class Manager
         std::weak_ptr<Sequence> sqWeakPtr =
           this->getOrCreateManagedSequence(sequenceName);
 
-        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+        std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator found =
+          this->mManagedSequences.find(sequenceName);
+
+        if (found == this->mManagedSequences.end()) {
+            std::shared_ptr<Sequence> sq = found->second;
+
             SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
             sq->begin();
 
@@ -127,6 +145,9 @@ class Manager
             SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
             sq->evalAsync();
         }
+        else {
+            SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found", sequenceName);
+        }
         SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
     }
 
@@ -154,7 +175,7 @@ class Manager
             SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS");
         }
         else {
-            SPDLOG_ERROR("Sequence not found");
+            SPDLOG_ERROR("Kompute Manager evalOpAwait Sequence not found");
  		}
     }
 
@@ -209,7 +230,7 @@ class Manager
     std::shared_ptr<vk::Device> mDevice = nullptr;
     bool mFreeDevice = false;
     uint32_t mComputeQueueFamilyIndex = -1;
-    std::shared_ptr<vk::Queue> mComputeQueue = nullptr;
+    std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;
 
     // -------------- ALWAYS OWNED RESOURCES
     std::unordered_map<std::string, std::shared_ptr<Sequence>>
@@ -224,7 +245,7 @@ class Manager
 
     // Create functions
     void createInstance();
-    void createDevice();
+    void createDevice(uint32_t totalComputeQueues);
 };
 
 } // End namespace kp
diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp
index ba8b203de..15fd9caef 100644
--- a/test/TestAsyncOperations.cpp
+++ b/test/TestAsyncOperations.cpp
@@ -9,21 +9,6 @@ TEST(TestAsyncOperations, TestManagerAsync)
 {
     uint32_t size = 100000;
 
-    std::vector<float> data(size, 0.0);
-    std::vector<float> resultSync(size, 100000);
-    std::vector<float> resultAsync(size, 200000);
-
-    std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor(data) };
-    std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor(data) };
-    std::shared_ptr<kp::Tensor> tensorC{ new kp::Tensor(data) };
-    std::shared_ptr<kp::Tensor> tensorD{ new kp::Tensor(data) };
-    std::shared_ptr<kp::Tensor> tensorE{ new kp::Tensor(data) };
-    std::shared_ptr<kp::Tensor> tensorF{ new kp::Tensor(data) };
-
-    kp::Manager mgr;
-
-    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB, tensorC, tensorD, tensorE, tensorF });
-
     std::string shader(R"(
         #version 450
 
@@ -44,52 +29,82 @@ TEST(TestAsyncOperations, TestManagerAsync)
         }
     )");
 
+    std::vector<float> data(size, 0.0);
+    std::vector<float> resultSync(size, 100000);
+    std::vector<float> resultAsync(size, 100000);
+
+    std::shared_ptr<kp::Tensor> tensorSyncA{ new kp::Tensor(data) };
+    std::shared_ptr<kp::Tensor> tensorSyncB{ new kp::Tensor(data) };
+    std::shared_ptr<kp::Tensor> tensorSyncC{ new kp::Tensor(data) };
+    std::shared_ptr<kp::Tensor> tensorSyncD{ new kp::Tensor(data) };
+    std::shared_ptr<kp::Tensor> tensorSyncE{ new kp::Tensor(data) };
+    std::shared_ptr<kp::Tensor> tensorSyncF{ new kp::Tensor(data) };
+
+    kp::Manager mgr;
+
+    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorSyncA, tensorSyncB, tensorSyncC, tensorSyncD, tensorSyncE, tensorSyncF });
+
     auto startSync = std::chrono::high_resolution_clock::now();
 
     mgr.evalOpDefault<kp::OpAlgoBase<>>(
-      { tensorA, tensorB }, std::vector<char>(shader.begin(), shader.end()));
+      { tensorSyncA, tensorSyncB }, std::vector<char>(shader.begin(), shader.end()));
 
     mgr.evalOpDefault<kp::OpAlgoBase<>>(
-      { tensorC, tensorD }, std::vector<char>(shader.begin(), shader.end()));
+      { tensorSyncC, tensorSyncD }, std::vector<char>(shader.begin(), shader.end()));
 
     mgr.evalOpDefault<kp::OpAlgoBase<>>(
-      { tensorE, tensorF }, std::vector<char>(shader.begin(), shader.end()));
+      { tensorSyncE, tensorSyncF }, std::vector<char>(shader.begin(), shader.end()));
+
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorSyncB, tensorSyncD, tensorSyncF });
 
     auto endSync = std::chrono::high_resolution_clock::now();
-
-    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorB, tensorD, tensorF });
-
-    EXPECT_EQ(tensorB->data(), resultSync);
-    EXPECT_EQ(tensorD->data(), resultSync);
-    EXPECT_EQ(tensorF->data(), resultSync);
-
     auto durationSync = std::chrono::duration_cast<std::chrono::microseconds>(endSync - startSync).count();
 
-    auto startAsync = std::chrono::high_resolution_clock::now();
+    EXPECT_EQ(tensorSyncB->data(), resultSync);
+    EXPECT_EQ(tensorSyncD->data(), resultSync);
+    EXPECT_EQ(tensorSyncF->data(), resultSync);
 
-    mgr.evalOpAsync<kp::OpAlgoBase<>>(
-      { tensorA, tensorB }, "asyncOne", std::vector<char>(shader.begin(), shader.end()));
+    //std::shared_ptr<kp::Tensor> tensorAsyncA{ new kp::Tensor(data) };
+    //std::shared_ptr<kp::Tensor> tensorAsyncB{ new kp::Tensor(data) };
+    //std::shared_ptr<kp::Tensor> tensorAsyncC{ new kp::Tensor(data) };
+    //std::shared_ptr<kp::Tensor> tensorAsyncD{ new kp::Tensor(data) };
+    //std::shared_ptr<kp::Tensor> tensorAsyncE{ new kp::Tensor(data) };
+    //std::shared_ptr<kp::Tensor> tensorAsyncF{ new kp::Tensor(data) };
 
-    mgr.evalOpAsync<kp::OpAlgoBase<>>(
-      { tensorC, tensorD }, "asyncTwo", std::vector<char>(shader.begin(), shader.end()));
+    //kp::Manager mgrAsync(0, 1);
 
-    mgr.evalOpAsync<kp::OpAlgoBase<>>(
-      { tensorE, tensorF }, "asyncThree", std::vector<char>(shader.begin(), shader.end()));
+    //mgrAsync.evalOpDefault<kp::OpTensorCreate>({ tensorAsyncA, tensorAsyncB, tensorAsyncC, tensorAsyncD, tensorAsyncE, tensorAsyncF });
 
-    mgr.evalOpAwait("asyncOne");
-    mgr.evalOpAwait("asyncTwo");
-    mgr.evalOpAwait("asyncThree");
+    //mgrAsync.createManagedSequence("async0", 0);
+    ////mgrAsync.createManagedSequence("async1", 1);
+    ////mgrAsync.createManagedSequence("async2", 2);
 
-    auto endAsync = std::chrono::high_resolution_clock::now();
+    //auto startAsync = std::chrono::high_resolution_clock::now();
 
-    auto durationAsync = std::chrono::duration_cast<std::chrono::microseconds>(endAsync - startAsync).count();
+    //mgrAsync.evalOpAsync<kp::OpAlgoBase<>>(
+    //  { tensorAsyncA, tensorAsyncB }, "async0", std::vector<char>(shader.begin(), shader.end()));
 
-    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorB, tensorD, tensorF });
+    ////mgrAsync.evalOpAsync<kp::OpAlgoBase<>>(
+    ////  { tensorAsyncC, tensorAsyncD }, "async1", std::vector<char>(shader.begin(), shader.end()));
 
-    EXPECT_EQ(tensorB->data(), resultAsync);
-    EXPECT_EQ(tensorD->data(), resultAsync);
-    EXPECT_EQ(tensorF->data(), resultAsync);
+    ////mgrAsync.evalOpAsync<kp::OpAlgoBase<>>(
+    ////  { tensorAsyncE, tensorAsyncF }, "async2", std::vector<char>(shader.begin(), shader.end()));
 
-    SPDLOG_DEBUG("Total Sync: {}", durationSync);
-    SPDLOG_DEBUG("Total Async: {}", durationAsync);
+    //mgrAsync.evalOpAwait("async0");
+    ////mgrAsync.evalOpAwait("async1");
+    ////mgrAsync.evalOpAwait("async2");
+
+    //mgrAsync.evalOpDefault<kp::OpTensorSyncLocal>({ tensorAsyncB });
+    ////mgrAsync.evalOpDefault<kp::OpTensorSyncLocal>({ tensorAsyncD });
+    ////mgrAsync.evalOpDefault<kp::OpTensorSyncLocal>({ tensorAsyncF });
+
+    //auto endAsync = std::chrono::high_resolution_clock::now();
+    //auto durationAsync = std::chrono::duration_cast<std::chrono::microseconds>(endAsync - startAsync).count();
+
+    //EXPECT_EQ(tensorAsyncB->data(), resultAsync);
+    ////EXPECT_EQ(tensorAsyncD->data(), resultAsync);
+    ////EXPECT_EQ(tensorAsyncF->data(), resultAsync);
+
+    ////SPDLOG_DEBUG("Total Sync: {}", durationSync);
+    //SPDLOG_DEBUG("Total Async: {}", durationAsync);
 }

From 9e79b9f352c7f52918fb56e0a432cd1f18d075c0 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Thu, 15 Oct 2020 22:13:12 +0100
Subject: [PATCH 06/19] Updated to implementation of queues but no speedups yet

---
 single_include/kompute/Kompute.hpp |   2 +-
 src/include/kompute/Manager.hpp    |   2 +-
 test/TestAsyncOperations.cpp       | 109 +++++++++++++++--------------
 3 files changed, 57 insertions(+), 56 deletions(-)

diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index a9860425c..9b032bb4f 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -1358,7 +1358,7 @@ class Manager
         std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator found =
           this->mManagedSequences.find(sequenceName);
 
-        if (found == this->mManagedSequences.end()) {
+        if (found != this->mManagedSequences.end()) {
             std::shared_ptr<Sequence> sq = found->second;
 
             SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp
index e2709836c..340fd782d 100644
--- a/src/include/kompute/Manager.hpp
+++ b/src/include/kompute/Manager.hpp
@@ -130,7 +130,7 @@ class Manager
         std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator found =
           this->mManagedSequences.find(sequenceName);
 
-        if (found == this->mManagedSequences.end()) {
+        if (found != this->mManagedSequences.end()) {
             std::shared_ptr<Sequence> sq = found->second;
 
             SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp
index 15fd9caef..1c0f74e59 100644
--- a/test/TestAsyncOperations.cpp
+++ b/test/TestAsyncOperations.cpp
@@ -9,6 +9,8 @@ TEST(TestAsyncOperations, TestManagerAsync)
 {
     uint32_t size = 100000;
 
+    uint32_t numParallel = 6;
+
     std::string shader(R"(
         #version 450
 
@@ -20,91 +22,90 @@ TEST(TestAsyncOperations, TestManagerAsync)
         void main() {
             uint index = gl_GlobalInvocationID.x;
 
-            for (int i = 0; i < 100000; i++)
+            for (int i = 0; i < 10000; i++)
             {
                 pa[index] += 1.0;
             }
 
             pb[index] = pa[index];
+            pa[index] = 0;
         }
     )");
 
     std::vector<float> data(size, 0.0);
-    std::vector<float> resultSync(size, 100000);
-    std::vector<float> resultAsync(size, 100000);
-
-    std::shared_ptr<kp::Tensor> tensorSyncA{ new kp::Tensor(data) };
-    std::shared_ptr<kp::Tensor> tensorSyncB{ new kp::Tensor(data) };
-    std::shared_ptr<kp::Tensor> tensorSyncC{ new kp::Tensor(data) };
-    std::shared_ptr<kp::Tensor> tensorSyncD{ new kp::Tensor(data) };
-    std::shared_ptr<kp::Tensor> tensorSyncE{ new kp::Tensor(data) };
-    std::shared_ptr<kp::Tensor> tensorSyncF{ new kp::Tensor(data) };
+    std::vector<float> resultSync(size, 10000);
+    std::vector<float> resultAsync(size, 10000);
 
     kp::Manager mgr;
 
-    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorSyncA, tensorSyncB, tensorSyncC, tensorSyncD, tensorSyncE, tensorSyncF });
+    std::vector<std::shared_ptr<kp::Tensor>> inputsSyncA;
+    std::vector<std::shared_ptr<kp::Tensor>> inputsSyncB;
+
+    for (uint32_t i = 0; i < numParallel; i++) {
+        inputsSyncA.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
+        inputsSyncB.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
+    }
+
+    mgr.evalOpDefault<kp::OpTensorCreate>(inputsSyncA);
+    mgr.evalOpDefault<kp::OpTensorCreate>(inputsSyncB);
 
     auto startSync = std::chrono::high_resolution_clock::now();
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
-      { tensorSyncA, tensorSyncB }, std::vector<char>(shader.begin(), shader.end()));
+    for (uint32_t i = 0; i < numParallel; i++) {
+        mgr.evalOpDefault<kp::OpAlgoBase<>>(
+          { inputsSyncA[i],  inputsSyncB[i] }, 
+          std::vector<char>(shader.begin(), shader.end()));
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
-      { tensorSyncC, tensorSyncD }, std::vector<char>(shader.begin(), shader.end()));
+    }
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
-      { tensorSyncE, tensorSyncF }, std::vector<char>(shader.begin(), shader.end()));
-
-    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorSyncB, tensorSyncD, tensorSyncF });
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>(inputsSyncB);
 
     auto endSync = std::chrono::high_resolution_clock::now();
     auto durationSync = std::chrono::duration_cast<std::chrono::microseconds>(endSync - startSync).count();
 
-    EXPECT_EQ(tensorSyncB->data(), resultSync);
-    EXPECT_EQ(tensorSyncD->data(), resultSync);
-    EXPECT_EQ(tensorSyncF->data(), resultSync);
+    for (uint32_t i = 0; i < numParallel; i++) {
+        EXPECT_EQ(inputsSyncB[i]->data(), resultSync);
+    }
 
-    //std::shared_ptr<kp::Tensor> tensorAsyncA{ new kp::Tensor(data) };
-    //std::shared_ptr<kp::Tensor> tensorAsyncB{ new kp::Tensor(data) };
-    //std::shared_ptr<kp::Tensor> tensorAsyncC{ new kp::Tensor(data) };
-    //std::shared_ptr<kp::Tensor> tensorAsyncD{ new kp::Tensor(data) };
-    //std::shared_ptr<kp::Tensor> tensorAsyncE{ new kp::Tensor(data) };
-    //std::shared_ptr<kp::Tensor> tensorAsyncF{ new kp::Tensor(data) };
+    kp::Manager mgrAsync(0, numParallel);
 
-    //kp::Manager mgrAsync(0, 1);
+    std::vector<std::shared_ptr<kp::Tensor>> inputsAsyncA;
+    std::vector<std::shared_ptr<kp::Tensor>> inputsAsyncB;
 
-    //mgrAsync.evalOpDefault<kp::OpTensorCreate>({ tensorAsyncA, tensorAsyncB, tensorAsyncC, tensorAsyncD, tensorAsyncE, tensorAsyncF });
+    for (uint32_t i = 0; i < numParallel; i++) {
+        inputsAsyncA.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
+        inputsAsyncB.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
+    }
 
-    //mgrAsync.createManagedSequence("async0", 0);
-    ////mgrAsync.createManagedSequence("async1", 1);
-    ////mgrAsync.createManagedSequence("async2", 2);
+    mgrAsync.evalOpDefault<kp::OpTensorCreate>(inputsAsyncA);
+    mgrAsync.evalOpDefault<kp::OpTensorCreate>(inputsAsyncB);
 
-    //auto startAsync = std::chrono::high_resolution_clock::now();
+    for (uint32_t i = 0; i < numParallel; i++) {
+        mgrAsync.createManagedSequence("async" + std::to_string(i), i);
+    }
 
-    //mgrAsync.evalOpAsync<kp::OpAlgoBase<>>(
-    //  { tensorAsyncA, tensorAsyncB }, "async0", std::vector<char>(shader.begin(), shader.end()));
+    auto startAsync = std::chrono::high_resolution_clock::now();
 
-    ////mgrAsync.evalOpAsync<kp::OpAlgoBase<>>(
-    ////  { tensorAsyncC, tensorAsyncD }, "async1", std::vector<char>(shader.begin(), shader.end()));
+    for (uint32_t i = 0; i < numParallel; i++) {
+        mgrAsync.evalOpAsync<kp::OpAlgoBase<>>(
+          { inputsAsyncA[i], inputsAsyncB[i] }, 
+          "async" + std::to_string(i), 
+          std::vector<char>(shader.begin(), shader.end()));
+    }
 
-    ////mgrAsync.evalOpAsync<kp::OpAlgoBase<>>(
-    ////  { tensorAsyncE, tensorAsyncF }, "async2", std::vector<char>(shader.begin(), shader.end()));
+    for (uint32_t i = 0; i < numParallel; i++) {
+        mgrAsync.evalOpAwait("async" + std::to_string(i));
+    }
 
-    //mgrAsync.evalOpAwait("async0");
-    ////mgrAsync.evalOpAwait("async1");
-    ////mgrAsync.evalOpAwait("async2");
+    mgrAsync.evalOpDefault<kp::OpTensorSyncLocal>({ inputsAsyncB });
 
-    //mgrAsync.evalOpDefault<kp::OpTensorSyncLocal>({ tensorAsyncB });
-    ////mgrAsync.evalOpDefault<kp::OpTensorSyncLocal>({ tensorAsyncD });
-    ////mgrAsync.evalOpDefault<kp::OpTensorSyncLocal>({ tensorAsyncF });
+    auto endAsync = std::chrono::high_resolution_clock::now();
+    auto durationAsync = std::chrono::duration_cast<std::chrono::microseconds>(endAsync - startAsync).count();
 
-    //auto endAsync = std::chrono::high_resolution_clock::now();
-    //auto durationAsync = std::chrono::duration_cast<std::chrono::microseconds>(endAsync - startAsync).count();
+    for (uint32_t i = 0; i < numParallel; i++) {
+        EXPECT_EQ(inputsAsyncB[i]->data(), resultAsync);
+    }
 
-    //EXPECT_EQ(tensorAsyncB->data(), resultAsync);
-    ////EXPECT_EQ(tensorAsyncD->data(), resultAsync);
-    ////EXPECT_EQ(tensorAsyncF->data(), resultAsync);
-
-    ////SPDLOG_DEBUG("Total Sync: {}", durationSync);
-    //SPDLOG_DEBUG("Total Async: {}", durationAsync);
+    SPDLOG_ERROR("Total Sync: {}", durationSync);
+    SPDLOG_ERROR("Total Async: {}", durationAsync);
 }

From c69fcb7e606e4f7e340cd268b71463003cdfc56c Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Fri, 16 Oct 2020 07:39:37 +0100
Subject: [PATCH 07/19] UPdated to have shorter dispatch size and larger loop
 size

---
 test/TestAsyncOperations.cpp | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp
index 1c0f74e59..6da9533a9 100644
--- a/test/TestAsyncOperations.cpp
+++ b/test/TestAsyncOperations.cpp
@@ -7,7 +7,7 @@
 
 TEST(TestAsyncOperations, TestManagerAsync)
 {
-    uint32_t size = 100000;
+    uint32_t size = 10;
 
     uint32_t numParallel = 6;
 
@@ -19,22 +19,38 @@ TEST(TestAsyncOperations, TestManagerAsync)
         layout(set = 0, binding = 0) buffer a { float pa[]; };
         layout(set = 0, binding = 1) buffer b { float pb[]; };
 
+        shared uint sharedTotal[1];
+
         void main() {
             uint index = gl_GlobalInvocationID.x;
 
-            for (int i = 0; i < 10000; i++)
+            sharedTotal[0] = 0;
+
+            barrier();
+            memoryBarrierShared();
+
+            for (int i = 0; i < 100000000; i++)
             {
-                pa[index] += 1.0;
+                atomicAdd(sharedTotal[0], 1);
+                atomicAdd(sharedTotal[0], -1);
+                atomicAdd(sharedTotal[0], 1);
+                atomicAdd(sharedTotal[0], -1);
+                atomicAdd(sharedTotal[0], 1);
+                atomicAdd(sharedTotal[0], -1);
+                atomicAdd(sharedTotal[0], 1);
             }
 
-            pb[index] = pa[index];
+            barrier();
+            memoryBarrierShared();
+
+            pb[index] = sharedTotal[0];
             pa[index] = 0;
         }
     )");
 
     std::vector<float> data(size, 0.0);
-    std::vector<float> resultSync(size, 10000);
-    std::vector<float> resultAsync(size, 10000);
+    std::vector<float> resultSync(size, 100000000);
+    std::vector<float> resultAsync(size, 100000000);
 
     kp::Manager mgr;
 
@@ -58,11 +74,11 @@ TEST(TestAsyncOperations, TestManagerAsync)
 
     }
 
-    mgr.evalOpDefault<kp::OpTensorSyncLocal>(inputsSyncB);
-
     auto endSync = std::chrono::high_resolution_clock::now();
     auto durationSync = std::chrono::duration_cast<std::chrono::microseconds>(endSync - startSync).count();
 
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>(inputsSyncB);
+
     for (uint32_t i = 0; i < numParallel; i++) {
         EXPECT_EQ(inputsSyncB[i]->data(), resultSync);
     }
@@ -97,11 +113,11 @@ TEST(TestAsyncOperations, TestManagerAsync)
         mgrAsync.evalOpAwait("async" + std::to_string(i));
     }
 
-    mgrAsync.evalOpDefault<kp::OpTensorSyncLocal>({ inputsAsyncB });
-
     auto endAsync = std::chrono::high_resolution_clock::now();
     auto durationAsync = std::chrono::duration_cast<std::chrono::microseconds>(endAsync - startAsync).count();
 
+    mgrAsync.evalOpDefault<kp::OpTensorSyncLocal>({ inputsAsyncB });
+
     for (uint32_t i = 0; i < numParallel; i++) {
         EXPECT_EQ(inputsAsyncB[i]->data(), resultAsync);
     }

From 79ca746ebc4b368edba1a3a4c6fa7fe3eaab3fcf Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Fri, 16 Oct 2020 07:54:42 +0100
Subject: [PATCH 08/19] Removed wait stage mask from submit info

---
 src/Sequence.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/Sequence.cpp b/src/Sequence.cpp
index 9038db2a7..1496e67d9 100644
--- a/src/Sequence.cpp
+++ b/src/Sequence.cpp
@@ -151,8 +151,6 @@ Sequence::evalAsync()
         this->mOperations[i]->preEval();
     }
 
-    const vk::PipelineStageFlags waitStageMask =
-      vk::PipelineStageFlagBits::eTransfer & vk::PipelineStageFlagBits::eComputeShader;
     vk::SubmitInfo submitInfo(
       0, nullptr, nullptr, 1, this->mCommandBuffer.get());
 

From 6fc0b0b3e3f42c5d68d4d924cfa9079415266b15 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 17 Oct 2020 10:40:43 +0100
Subject: [PATCH 09/19] Added capabilities for multiple queues

---
 single_include/kompute/Kompute.hpp | 12 +++--
 src/Manager.cpp                    | 79 +++++++++++++++++++-----------
 src/include/kompute/Manager.hpp    | 12 +++--
 3 files changed, 65 insertions(+), 38 deletions(-)
 mode change 100644 => 100755 src/Manager.cpp

diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index 9b032bb4f..15e03ccc4 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -1245,7 +1245,7 @@ namespace kp {
 */
 class Manager
 {
-  public:
+public:
     /**
         Base constructor and default used which creates the base resources
        including choosing the device 0 by default.
@@ -1257,9 +1257,10 @@ class Manager
       * they would like to create the resources on.
       *
       * @param physicalDeviceIndex The index of the physical device to use
+      * @param familyQueueIndeces (Optional) List of queue indeces to add for explicit allocation
       * @param totalQueues The total number of compute queues to create.
     */
-    Manager(uint32_t physicalDeviceIndex, uint32_t totalComputeQueues = 1);
+    Manager(uint32_t physicalDeviceIndex, const std::vector<uint32_t> & familyQueueIndeces = {});
 
     /**
      * Manager constructor which allows your own vulkan application to integrate
@@ -1457,13 +1458,14 @@ class Manager
     uint32_t mPhysicalDeviceIndex = -1;
     std::shared_ptr<vk::Device> mDevice = nullptr;
     bool mFreeDevice = false;
-    uint32_t mComputeQueueFamilyIndex = -1;
-    std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;
 
     // -------------- ALWAYS OWNED RESOURCES
     std::unordered_map<std::string, std::shared_ptr<Sequence>>
       mManagedSequences;
 
+    std::vector<uint32_t> mComputeQueueFamilyIndeces;
+    std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;
+
 #if DEBUG
 #ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
     vk::DebugReportCallbackEXT mDebugReportCallback;
@@ -1473,7 +1475,7 @@ class Manager
 
     // Create functions
     void createInstance();
-    void createDevice(uint32_t totalComputeQueues);
+    void createDevice(const std::vector<uint32_t> & familyQueueIndeces = {});
 };
 
 } // End namespace kp
diff --git a/src/Manager.cpp b/src/Manager.cpp
old mode 100644
new mode 100755
index 58f067843..7e5cc831a
--- a/src/Manager.cpp
+++ b/src/Manager.cpp
@@ -25,15 +25,15 @@ debugMessageCallback(VkDebugReportFlagsEXT flags,
 #endif
 
 Manager::Manager()
-  : Manager(0, 1)
+  : Manager(0)
 {}
 
-Manager::Manager(uint32_t physicalDeviceIndex, uint32_t totalComputeQueues)
+Manager::Manager(uint32_t physicalDeviceIndex, const std::vector<uint32_t> & familyQueueIndeces)
 {
     this->mPhysicalDeviceIndex = physicalDeviceIndex;
 
     this->createInstance();
-    this->createDevice(totalComputeQueues);
+    this->createDevice(familyQueueIndeces);
 }
 
 Manager::Manager(std::shared_ptr<vk::Instance> instance,
@@ -113,7 +113,7 @@ Manager::createManagedSequence(std::string sequenceName, uint32_t queueIndex) {
       std::make_shared<Sequence>(this->mPhysicalDevice,
                                  this->mDevice,
                                  this->mComputeQueues[queueIndex],
-                                 this->mComputeQueueFamilyIndex);
+                                 this->mComputeQueueFamilyIndeces[queueIndex]);
     sq->init();
     this->mManagedSequences.insert({ sequenceName, sq });
     return sq;
@@ -205,7 +205,7 @@ Manager::createInstance()
 }
 
 void
-Manager::createDevice(uint32_t totalComputeQueues)
+Manager::createDevice(const std::vector<uint32_t> & familyQueueIndeces)
 {
 
     SPDLOG_DEBUG("Kompute Manager creating Device");
@@ -236,48 +236,71 @@ Manager::createDevice(uint32_t totalComputeQueues)
                 this->mPhysicalDeviceIndex,
                 physicalDeviceProperties.deviceName);
 
-    // Find compute queue
-    std::vector<vk::QueueFamilyProperties> allQueueFamilyProperties =
-      physicalDevice.getQueueFamilyProperties();
+    if (!familyQueueIndeces.size()) {
+        // Find compute queue
+        std::vector<vk::QueueFamilyProperties> allQueueFamilyProperties =
+          physicalDevice.getQueueFamilyProperties();
 
-    this->mComputeQueueFamilyIndex = -1;
-    for (uint32_t i = 0; i < allQueueFamilyProperties.size(); i++) {
-        vk::QueueFamilyProperties queueFamilyProperties =
-          allQueueFamilyProperties[i];
+        uint32_t computeQueueFamilyIndex = -1;
+        for (uint32_t i = 0; i < allQueueFamilyProperties.size(); i++) {
+            vk::QueueFamilyProperties queueFamilyProperties =
+              allQueueFamilyProperties[i];
 
-        if (queueFamilyProperties.queueFlags & vk::QueueFlagBits::eCompute) {
-            this->mComputeQueueFamilyIndex = i;
-            break;
+            if (queueFamilyProperties.queueFlags & vk::QueueFlagBits::eCompute) {
+                computeQueueFamilyIndex = i;
+                break;
+            }
         }
+
+        if (computeQueueFamilyIndex < 0) {
+            throw std::runtime_error("Compute queue is not supported");
+        }
+
+        this->mComputeQueueFamilyIndeces.push_back(computeQueueFamilyIndex);
+    }
+    else {
+        this->mComputeQueueFamilyIndeces = familyQueueIndeces;
     }
 
-    if (this->mComputeQueueFamilyIndex < 0) {
-        throw std::runtime_error("Compute queue is not supported");
+    std::unordered_map<uint32_t, uint32_t> familyQueueCounts;
+    std::unordered_map<uint32_t, std::vector<float>> familyQueuePriorities;
+    for (const auto& value : this->mComputeQueueFamilyIndeces) {
+        familyQueueCounts[value]++;
+        familyQueuePriorities[value].push_back(1.0f);
     }
 
-    const float defaultQueuePriority(0.0f);
-    const uint32_t defaultQueueCount(totalComputeQueues);
-    vk::DeviceQueueCreateInfo deviceQueueCreateInfo(
-      vk::DeviceQueueCreateFlags(),
-      this->mComputeQueueFamilyIndex,
-      defaultQueueCount,
-      &defaultQueuePriority);
+    std::unordered_map<uint32_t, uint32_t> familyQueueIndexCount;
+    std::vector<vk::DeviceQueueCreateInfo> deviceQueueCreateInfos;
+    for (const auto& familyQueueInfo : familyQueueCounts) {
+        // Setting the device count to 0
+        familyQueueIndexCount[familyQueueInfo.first] = 0;
+
+        // Creating the respective device queue
+        vk::DeviceQueueCreateInfo deviceQueueCreateInfo(
+                vk::DeviceQueueCreateFlags(),
+                familyQueueInfo.first,
+                familyQueueInfo.second,
+                familyQueuePriorities[familyQueueInfo.first].data());
+        deviceQueueCreateInfos.push_back(deviceQueueCreateInfo);
+    }
 
     vk::DeviceCreateInfo deviceCreateInfo(vk::DeviceCreateFlags(),
-                                          1, // Number of deviceQueueCreateInfo
-                                          &deviceQueueCreateInfo);
+        deviceQueueCreateInfos.size(),
+        deviceQueueCreateInfos.data());
 
     this->mDevice = std::make_shared<vk::Device>();
     physicalDevice.createDevice(
       &deviceCreateInfo, nullptr, this->mDevice.get());
     SPDLOG_DEBUG("Kompute Manager device created");
 
-    for (uint32_t i = 0; i < totalComputeQueues; i++) 
+    for (const uint32_t & familyQueueIndex : this->mComputeQueueFamilyIndeces) 
     {
         std::shared_ptr<vk::Queue> currQueue = std::make_shared<vk::Queue>();
 
         this->mDevice->getQueue(
-          this->mComputeQueueFamilyIndex, i, currQueue.get());
+          familyQueueIndex, familyQueueIndexCount[familyQueueIndex], currQueue.get());
+
+        familyQueueIndexCount[familyQueueIndex]++;
 
         this->mComputeQueues.push_back(currQueue);
     }
diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp
index 340fd782d..2581139d9 100644
--- a/src/include/kompute/Manager.hpp
+++ b/src/include/kompute/Manager.hpp
@@ -17,7 +17,7 @@ namespace kp {
 */
 class Manager
 {
-  public:
+public:
     /**
         Base constructor and default used which creates the base resources
        including choosing the device 0 by default.
@@ -29,9 +29,10 @@ class Manager
       * they would like to create the resources on.
       *
       * @param physicalDeviceIndex The index of the physical device to use
+      * @param familyQueueIndeces (Optional) List of queue indeces to add for explicit allocation
       * @param totalQueues The total number of compute queues to create.
     */
-    Manager(uint32_t physicalDeviceIndex, uint32_t totalComputeQueues = 1);
+    Manager(uint32_t physicalDeviceIndex, const std::vector<uint32_t> & familyQueueIndeces = {});
 
     /**
      * Manager constructor which allows your own vulkan application to integrate
@@ -229,13 +230,14 @@ class Manager
     uint32_t mPhysicalDeviceIndex = -1;
     std::shared_ptr<vk::Device> mDevice = nullptr;
     bool mFreeDevice = false;
-    uint32_t mComputeQueueFamilyIndex = -1;
-    std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;
 
     // -------------- ALWAYS OWNED RESOURCES
     std::unordered_map<std::string, std::shared_ptr<Sequence>>
       mManagedSequences;
 
+    std::vector<uint32_t> mComputeQueueFamilyIndeces;
+    std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;
+
 #if DEBUG
 #ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
     vk::DebugReportCallbackEXT mDebugReportCallback;
@@ -245,7 +247,7 @@ class Manager
 
     // Create functions
     void createInstance();
-    void createDevice(uint32_t totalComputeQueues);
+    void createDevice(const std::vector<uint32_t> & familyQueueIndeces = {});
 };
 
 } // End namespace kp

From 356a56347119ee492bc922ac2676ad47a9b82859 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 17 Oct 2020 10:42:07 +0100
Subject: [PATCH 10/19] Added test that ensures multiprocessing is working

---
 test/TestAsyncOperations.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)
 mode change 100644 => 100755 test/TestAsyncOperations.cpp

diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp
old mode 100644
new mode 100755
index 6da9533a9..6c7de15e8
--- a/test/TestAsyncOperations.cpp
+++ b/test/TestAsyncOperations.cpp
@@ -9,7 +9,7 @@ TEST(TestAsyncOperations, TestManagerAsync)
 {
     uint32_t size = 10;
 
-    uint32_t numParallel = 6;
+    uint32_t numParallel = 2;
 
     std::string shader(R"(
         #version 450
@@ -83,7 +83,7 @@ TEST(TestAsyncOperations, TestManagerAsync)
         EXPECT_EQ(inputsSyncB[i]->data(), resultSync);
     }
 
-    kp::Manager mgrAsync(0, numParallel);
+    kp::Manager mgrAsync(0, {0, 2});
 
     std::vector<std::shared_ptr<kp::Tensor>> inputsAsyncA;
     std::vector<std::shared_ptr<kp::Tensor>> inputsAsyncB;
@@ -109,6 +109,8 @@ TEST(TestAsyncOperations, TestManagerAsync)
           std::vector<char>(shader.begin(), shader.end()));
     }
 
+    // TODO: Add function to print device details (or link)
+    // TODO: Seems to fail if await called twice
     for (uint32_t i = 0; i < numParallel; i++) {
         mgrAsync.evalOpAwait("async" + std::to_string(i));
     }
@@ -122,6 +124,6 @@ TEST(TestAsyncOperations, TestManagerAsync)
         EXPECT_EQ(inputsAsyncB[i]->data(), resultAsync);
     }
 
-    SPDLOG_ERROR("Total Sync: {}", durationSync);
-    SPDLOG_ERROR("Total Async: {}", durationAsync);
+    // The speedup should be at least 40% 
+    EXPECT_LT(durationAsync, durationSync * 0.6);
 }

From 40fc293eb1fb3c445dd04c6e134ca3370f4031f2 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 17 Oct 2020 11:17:23 +0100
Subject: [PATCH 11/19] Updated tests to work with async tests

---
 test/TestAsyncOperations.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp
index 6c7de15e8..45027942f 100755
--- a/test/TestAsyncOperations.cpp
+++ b/test/TestAsyncOperations.cpp
@@ -109,8 +109,6 @@ TEST(TestAsyncOperations, TestManagerAsync)
           std::vector<char>(shader.begin(), shader.end()));
     }
 
-    // TODO: Add function to print device details (or link)
-    // TODO: Seems to fail if await called twice
     for (uint32_t i = 0; i < numParallel; i++) {
         mgrAsync.evalOpAwait("async" + std::to_string(i));
     }
@@ -124,6 +122,9 @@ TEST(TestAsyncOperations, TestManagerAsync)
         EXPECT_EQ(inputsAsyncB[i]->data(), resultAsync);
     }
 
+    SPDLOG_ERROR("sync {}", durationSync);
+    SPDLOG_ERROR("async {}", durationAsync);
+
     // The speedup should be at least 40% 
     EXPECT_LT(durationAsync, durationSync * 0.6);
 }

From 872a0bc7170e24206866d8920d406fb7e204ba80 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 17 Oct 2020 11:18:50 +0100
Subject: [PATCH 12/19] Added details on test

---
 test/TestAsyncOperations.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp
index 45027942f..b9e84e1b5 100755
--- a/test/TestAsyncOperations.cpp
+++ b/test/TestAsyncOperations.cpp
@@ -7,6 +7,9 @@
 
 TEST(TestAsyncOperations, TestManagerAsync)
 {
+    // This test is built for NVIDIA 1650. It assumes:
+    // * Queue family 0 and 2 have compute capabilities
+    // * GPU is able to process parallel shader code across different families
     uint32_t size = 10;
 
     uint32_t numParallel = 2;

From 9a64339e953c8eb23c7a2a1d39fb7f0f5a4a4e67 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 17 Oct 2020 11:19:25 +0100
Subject: [PATCH 13/19] Reformatted

---
 src/Algorithm.cpp                   |  3 +-
 src/Manager.cpp                     | 39 ++++++++++---------
 src/Sequence.cpp                    |  9 +++--
 src/include/kompute/Core.hpp        | 17 ++++++---
 src/include/kompute/Manager.hpp     | 58 +++++++++++++++--------------
 src/include/kompute/Sequence.hpp    | 10 +++--
 test/TestAsyncOperations.cpp        | 19 ++++++----
 test/TestMultipleAlgoExecutions.cpp | 12 +++---
 8 files changed, 95 insertions(+), 72 deletions(-)
 mode change 100755 => 100644 src/Manager.cpp
 mode change 100755 => 100644 test/TestAsyncOperations.cpp

diff --git a/src/Algorithm.cpp b/src/Algorithm.cpp
index 0138201c7..845036475 100644
--- a/src/Algorithm.cpp
+++ b/src/Algorithm.cpp
@@ -266,8 +266,7 @@ Algorithm::createPipeline(std::vector<uint32_t> specializationData)
 
 #ifdef KOMPUTE_CREATE_PIPELINE_RESULT_VALUE
     vk::ResultValue<vk::Pipeline> pipelineResult =
-      this->mDevice->createComputePipeline(*this->mPipelineCache,
-pipelineInfo);
+      this->mDevice->createComputePipeline(*this->mPipelineCache, pipelineInfo);
 
     if (pipelineResult.result != vk::Result::eSuccess) {
         throw std::runtime_error("Failed to create pipeline result: " +
diff --git a/src/Manager.cpp b/src/Manager.cpp
old mode 100755
new mode 100644
index 7e5cc831a..b2739d6be
--- a/src/Manager.cpp
+++ b/src/Manager.cpp
@@ -28,7 +28,8 @@ Manager::Manager()
   : Manager(0)
 {}
 
-Manager::Manager(uint32_t physicalDeviceIndex, const std::vector<uint32_t> & familyQueueIndeces)
+Manager::Manager(uint32_t physicalDeviceIndex,
+                 const std::vector<uint32_t>& familyQueueIndeces)
 {
     this->mPhysicalDeviceIndex = physicalDeviceIndex;
 
@@ -105,9 +106,13 @@ Manager::getOrCreateManagedSequence(std::string sequenceName)
 }
 
 std::weak_ptr<Sequence>
-Manager::createManagedSequence(std::string sequenceName, uint32_t queueIndex) {
+Manager::createManagedSequence(std::string sequenceName, uint32_t queueIndex)
+{
 
-    SPDLOG_DEBUG("Kompute Manager createManagedSequence with sequenceName: {} and queueIndex: {}", sequenceName, queueIndex);
+    SPDLOG_DEBUG("Kompute Manager createManagedSequence with sequenceName: {} "
+                 "and queueIndex: {}",
+                 sequenceName,
+                 queueIndex);
 
     std::shared_ptr<Sequence> sq =
       std::make_shared<Sequence>(this->mPhysicalDevice,
@@ -205,7 +210,7 @@ Manager::createInstance()
 }
 
 void
-Manager::createDevice(const std::vector<uint32_t> & familyQueueIndeces)
+Manager::createDevice(const std::vector<uint32_t>& familyQueueIndeces)
 {
 
     SPDLOG_DEBUG("Kompute Manager creating Device");
@@ -246,7 +251,8 @@ Manager::createDevice(const std::vector<uint32_t> & familyQueueIndeces)
             vk::QueueFamilyProperties queueFamilyProperties =
               allQueueFamilyProperties[i];
 
-            if (queueFamilyProperties.queueFlags & vk::QueueFlagBits::eCompute) {
+            if (queueFamilyProperties.queueFlags &
+                vk::QueueFlagBits::eCompute) {
                 computeQueueFamilyIndex = i;
                 break;
             }
@@ -257,8 +263,7 @@ Manager::createDevice(const std::vector<uint32_t> & familyQueueIndeces)
         }
 
         this->mComputeQueueFamilyIndeces.push_back(computeQueueFamilyIndex);
-    }
-    else {
+    } else {
         this->mComputeQueueFamilyIndeces = familyQueueIndeces;
     }
 
@@ -277,28 +282,28 @@ Manager::createDevice(const std::vector<uint32_t> & familyQueueIndeces)
 
         // Creating the respective device queue
         vk::DeviceQueueCreateInfo deviceQueueCreateInfo(
-                vk::DeviceQueueCreateFlags(),
-                familyQueueInfo.first,
-                familyQueueInfo.second,
-                familyQueuePriorities[familyQueueInfo.first].data());
+          vk::DeviceQueueCreateFlags(),
+          familyQueueInfo.first,
+          familyQueueInfo.second,
+          familyQueuePriorities[familyQueueInfo.first].data());
         deviceQueueCreateInfos.push_back(deviceQueueCreateInfo);
     }
 
     vk::DeviceCreateInfo deviceCreateInfo(vk::DeviceCreateFlags(),
-        deviceQueueCreateInfos.size(),
-        deviceQueueCreateInfos.data());
+                                          deviceQueueCreateInfos.size(),
+                                          deviceQueueCreateInfos.data());
 
     this->mDevice = std::make_shared<vk::Device>();
     physicalDevice.createDevice(
       &deviceCreateInfo, nullptr, this->mDevice.get());
     SPDLOG_DEBUG("Kompute Manager device created");
 
-    for (const uint32_t & familyQueueIndex : this->mComputeQueueFamilyIndeces) 
-    {
+    for (const uint32_t& familyQueueIndex : this->mComputeQueueFamilyIndeces) {
         std::shared_ptr<vk::Queue> currQueue = std::make_shared<vk::Queue>();
 
-        this->mDevice->getQueue(
-          familyQueueIndex, familyQueueIndexCount[familyQueueIndex], currQueue.get());
+        this->mDevice->getQueue(familyQueueIndex,
+                                familyQueueIndexCount[familyQueueIndex],
+                                currQueue.get());
 
         familyQueueIndexCount[familyQueueIndex]++;
 
diff --git a/src/Sequence.cpp b/src/Sequence.cpp
index 1496e67d9..a03a34afe 100644
--- a/src/Sequence.cpp
+++ b/src/Sequence.cpp
@@ -141,7 +141,8 @@ Sequence::evalAsync()
         return false;
     }
     if (this->mIsRunning) {
-        SPDLOG_WARN("Kompute Sequence evalAsync called when an eval async was called without successful wait");
+        SPDLOG_WARN("Kompute Sequence evalAsync called when an eval async was "
+                    "called without successful wait");
         return false;
     }
 
@@ -172,7 +173,8 @@ Sequence::evalAwait(uint64_t waitFor)
         return false;
     }
 
-    vk::Result result = this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor);
+    vk::Result result =
+      this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor);
     this->mDevice->destroy(this->mFence);
 
     if (result == vk::Result::eTimeout) {
@@ -191,7 +193,8 @@ Sequence::evalAwait(uint64_t waitFor)
 }
 
 bool
-Sequence::isRunning() {
+Sequence::isRunning()
+{
     return this->mIsRunning;
 }
 
diff --git a/src/include/kompute/Core.hpp b/src/include/kompute/Core.hpp
index dfa83d7a5..72ffa5346 100644
--- a/src/include/kompute/Core.hpp
+++ b/src/include/kompute/Core.hpp
@@ -18,10 +18,11 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #ifndef KOMPUTE_VK_API_MINOR_VERSION
 #define KOMPUTE_VK_API_MINOR_VERSION 1
 #endif // KOMPUTE_VK_API_MINOR_VERSION
-#define KOMPUTE_VK_API_VERSION VK_MAKE_VERSION(KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0)
+#define KOMPUTE_VK_API_VERSION                                                 \
+    VK_MAKE_VERSION(                                                           \
+      KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0)
 #endif // KOMPUTE_VK_API_VERSION
 
-
 // SPDLOG_ACTIVE_LEVEL must be defined before spdlog.h import
 #if !defined(SPDLOG_ACTIVE_LEVEL)
 #if DEBUG
@@ -40,7 +41,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #define SPDLOG_DEBUG(message, ...)
 #else
 #if defined(VK_USE_PLATFORM_ANDROID_KHR)
-#define SPDLOG_DEBUG(message, ...) ((void)__android_log_print(ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, message))
+#define SPDLOG_DEBUG(message, ...)                                             \
+    ((void)__android_log_print(ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, message))
 #else
 #define SPDLOG_DEBUG(message, ...)                                             \
     std::cout << "DEBUG: " << message << std::endl
@@ -50,7 +52,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #define SPDLOG_INFO(message, ...)
 #else
 #if defined(VK_USE_PLATFORM_ANDROID_KHR)
-#define SPDLOG_INFO(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
+#define SPDLOG_INFO(message, ...)                                              \
+    ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
 #else
 #define SPDLOG_INFO(message, ...) std::cout << "INFO: " << message << std::endl
 #endif // VK_USE_PLATFORM_ANDROID_KHR
@@ -59,7 +62,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #define SPDLOG_WARN(message, ...)
 #else
 #if defined(VK_USE_PLATFORM_ANDROID_KHR)
-#define SPDLOG_WARN(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
+#define SPDLOG_WARN(message, ...)                                              \
+    ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
 #else
 #define SPDLOG_WARN(message, ...)                                              \
     std::cout << "WARNING: " << message << std::endl
@@ -69,7 +73,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #define SPDLOG_ERROR(message, ...)
 #else
 #if defined(VK_USE_PLATFORM_ANDROID_KHR)
-#define SPDLOG_ERROR(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
+#define SPDLOG_ERROR(message, ...)                                             \
+    ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
 #else
 #define SPDLOG_ERROR(message, ...)                                             \
     std::cout << "ERROR: " << message << std::endl
diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp
index 2581139d9..650da5ca0 100644
--- a/src/include/kompute/Manager.hpp
+++ b/src/include/kompute/Manager.hpp
@@ -17,7 +17,7 @@ namespace kp {
 */
 class Manager
 {
-public:
+  public:
     /**
         Base constructor and default used which creates the base resources
        including choosing the device 0 by default.
@@ -25,14 +25,16 @@ public:
     Manager();
 
     /**
-      * Similar to base constructor but allows the user to provide the device
-      * they would like to create the resources on.
-      *
-      * @param physicalDeviceIndex The index of the physical device to use
-      * @param familyQueueIndeces (Optional) List of queue indeces to add for explicit allocation
-      * @param totalQueues The total number of compute queues to create.
-    */
-    Manager(uint32_t physicalDeviceIndex, const std::vector<uint32_t> & familyQueueIndeces = {});
+     * Similar to base constructor but allows the user to provide the device
+     * they would like to create the resources on.
+     *
+     * @param physicalDeviceIndex The index of the physical device to use
+     * @param familyQueueIndeces (Optional) List of queue indeces to add for
+     * explicit allocation
+     * @param totalQueues The total number of compute queues to create.
+     */
+    Manager(uint32_t physicalDeviceIndex,
+            const std::vector<uint32_t>& familyQueueIndeces = {});
 
     /**
      * Manager constructor which allows your own vulkan application to integrate
@@ -67,14 +69,15 @@ public:
       std::string sequenceName);
 
     /**
-     * Create a new managed Kompute sequence so it's available within the manager.
+     * Create a new managed Kompute sequence so it's available within the
+     * manager.
      *
      * @param sequenceName The name for the named sequence to be created
      * @param queueIndex The queue to use from the available queues
      * @return Weak pointer to the manager owned sequence resource
      */
-    std::weak_ptr<Sequence> createManagedSequence(
-      std::string sequenceName, uint32_t queueIndex = 0);
+    std::weak_ptr<Sequence> createManagedSequence(std::string sequenceName,
+                                                  uint32_t queueIndex = 0);
 
     /**
      * Operation that adds extra operations to existing or new created
@@ -121,15 +124,15 @@ public:
      */
     template<typename T, typename... TArgs>
     void evalOpAsync(std::vector<std::shared_ptr<Tensor>> tensors,
-                std::string sequenceName,
-                TArgs&&... params)
+                     std::string sequenceName,
+                     TArgs&&... params)
     {
         SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered");
         std::weak_ptr<Sequence> sqWeakPtr =
           this->getOrCreateManagedSequence(sequenceName);
 
-        std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator found =
-          this->mManagedSequences.find(sequenceName);
+        std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator
+          found = this->mManagedSequences.find(sequenceName);
 
         if (found != this->mManagedSequences.end()) {
             std::shared_ptr<Sequence> sq = found->second;
@@ -145,9 +148,9 @@ public:
 
             SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
             sq->evalAsync();
-        }
-        else {
-            SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found", sequenceName);
+        } else {
+            SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found",
+                         sequenceName);
         }
         SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
     }
@@ -163,21 +166,22 @@ public:
     void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX)
     {
         SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered");
-        std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator found =
-        this->mManagedSequences.find(sequenceName);
+        std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator
+          found = this->mManagedSequences.find(sequenceName);
 
         if (found != this->mManagedSequences.end()) {
             if (std::shared_ptr<kp::Sequence> sq = found->second) {
-                SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence Sequence EVAL AWAIT");
+                SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence "
+                             "Sequence EVAL AWAIT");
                 if (sq->isRunning()) {
                     sq->evalAwait(waitFor);
                 }
             }
-            SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS");
-        }
-        else {
+            SPDLOG_DEBUG(
+              "Kompute Manager evalOpAwait running sequence SUCCESS");
+        } else {
             SPDLOG_ERROR("Kompute Manager evalOpAwait Sequence not found");
- 		}
+        }
     }
 
     /**
@@ -247,7 +251,7 @@ public:
 
     // Create functions
     void createInstance();
-    void createDevice(const std::vector<uint32_t> & familyQueueIndeces = {});
+    void createDevice(const std::vector<uint32_t>& familyQueueIndeces = {});
 };
 
 } // End namespace kp
diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp
index 2d9ef8e51..314de6657 100644
--- a/src/include/kompute/Sequence.hpp
+++ b/src/include/kompute/Sequence.hpp
@@ -67,14 +67,17 @@ class Sequence
     bool eval();
 
     /**
-     * Eval Async sends all the recorded and stored operations in the vector of operations into the gpu as a submit job with a barrier. EvalAwait() must be called after to ensure the sequence is terminated correctly.
+     * Eval Async sends all the recorded and stored operations in the vector of
+     * operations into the gpu as a submit job with a barrier. EvalAwait() must
+     * be called after to ensure the sequence is terminated correctly.
      *
      * @return Boolean stating whether execution was successful.
      */
     bool evalAsync();
 
     /**
-     * Eval Await waits for the fence to finish processing and then once it finishes, it runs the postEval of all operations.
+     * Eval Await waits for the fence to finish processing and then once it
+     * finishes, it runs the postEval of all operations.
      *
      * @param waitFor Number of milliseconds to wait before timing out.
      * @return Boolean stating whether execution was successful.
@@ -89,7 +92,8 @@ class Sequence
     bool isRecording();
 
     /**
-     * Returns true if the sequence is currently running - mostly used for async workloads.
+     * Returns true if the sequence is currently running - mostly used for async
+     * workloads.
      *
      * @return Boolean stating if currently running.
      */
diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp
old mode 100755
new mode 100644
index b9e84e1b5..28dd8c144
--- a/test/TestAsyncOperations.cpp
+++ b/test/TestAsyncOperations.cpp
@@ -72,13 +72,14 @@ TEST(TestAsyncOperations, TestManagerAsync)
 
     for (uint32_t i = 0; i < numParallel; i++) {
         mgr.evalOpDefault<kp::OpAlgoBase<>>(
-          { inputsSyncA[i],  inputsSyncB[i] }, 
+          { inputsSyncA[i], inputsSyncB[i] },
           std::vector<char>(shader.begin(), shader.end()));
-
     }
 
     auto endSync = std::chrono::high_resolution_clock::now();
-    auto durationSync = std::chrono::duration_cast<std::chrono::microseconds>(endSync - startSync).count();
+    auto durationSync =
+      std::chrono::duration_cast<std::chrono::microseconds>(endSync - startSync)
+        .count();
 
     mgr.evalOpDefault<kp::OpTensorSyncLocal>(inputsSyncB);
 
@@ -86,7 +87,7 @@ TEST(TestAsyncOperations, TestManagerAsync)
         EXPECT_EQ(inputsSyncB[i]->data(), resultSync);
     }
 
-    kp::Manager mgrAsync(0, {0, 2});
+    kp::Manager mgrAsync(0, { 0, 2 });
 
     std::vector<std::shared_ptr<kp::Tensor>> inputsAsyncA;
     std::vector<std::shared_ptr<kp::Tensor>> inputsAsyncB;
@@ -107,8 +108,8 @@ TEST(TestAsyncOperations, TestManagerAsync)
 
     for (uint32_t i = 0; i < numParallel; i++) {
         mgrAsync.evalOpAsync<kp::OpAlgoBase<>>(
-          { inputsAsyncA[i], inputsAsyncB[i] }, 
-          "async" + std::to_string(i), 
+          { inputsAsyncA[i], inputsAsyncB[i] },
+          "async" + std::to_string(i),
           std::vector<char>(shader.begin(), shader.end()));
     }
 
@@ -117,7 +118,9 @@ TEST(TestAsyncOperations, TestManagerAsync)
     }
 
     auto endAsync = std::chrono::high_resolution_clock::now();
-    auto durationAsync = std::chrono::duration_cast<std::chrono::microseconds>(endAsync - startAsync).count();
+    auto durationAsync = std::chrono::duration_cast<std::chrono::microseconds>(
+                           endAsync - startAsync)
+                           .count();
 
     mgrAsync.evalOpDefault<kp::OpTensorSyncLocal>({ inputsAsyncB });
 
@@ -128,6 +131,6 @@ TEST(TestAsyncOperations, TestManagerAsync)
     SPDLOG_ERROR("sync {}", durationSync);
     SPDLOG_ERROR("async {}", durationAsync);
 
-    // The speedup should be at least 40% 
+    // The speedup should be at least 40%
     EXPECT_LT(durationAsync, durationSync * 0.6);
 }
diff --git a/test/TestMultipleAlgoExecutions.cpp b/test/TestMultipleAlgoExecutions.cpp
index fb0803690..1b59d3516 100644
--- a/test/TestMultipleAlgoExecutions.cpp
+++ b/test/TestMultipleAlgoExecutions.cpp
@@ -258,10 +258,10 @@ TEST(TestMultipleAlgoExecutions, ManagerEvalMultSourceStrOpCreate)
       )");
 
     mgr.evalOpDefault<kp::OpAlgoBase<>>(
-            { tensorInA, tensorInB, tensorOut },
-            std::vector<char>(shader.begin(), shader.end()));
+      { tensorInA, tensorInB, tensorOut },
+      std::vector<char>(shader.begin(), shader.end()));
 
-    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorOut});
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorOut });
 
     EXPECT_EQ(tensorOut->data(), std::vector<float>({ 0.0, 4.0, 12.0 }));
 }
@@ -295,10 +295,10 @@ TEST(TestMultipleAlgoExecutions, ManagerEvalMultSourceStrMgrCreate)
       )");
 
     mgr.evalOpDefault<kp::OpAlgoBase<>>(
-            { tensorInA, tensorInB, tensorOut },
-            std::vector<char>(shader.begin(), shader.end()));
+      { tensorInA, tensorInB, tensorOut },
+      std::vector<char>(shader.begin(), shader.end()));
 
-    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorOut});
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorOut });
 
     EXPECT_EQ(tensorOut->data(), std::vector<float>({ 0.0, 4.0, 12.0 }));
 }

From 5cfc2ab8ce8d1389815c7e5871214053b8151f05 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 17 Oct 2020 11:20:33 +0100
Subject: [PATCH 14/19] Added single include with async

---
 single_include/kompute/Kompute.hpp | 84 +++++++++++++++++-------------
 1 file changed, 49 insertions(+), 35 deletions(-)

diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index 15e03ccc4..382b8332d 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -18,7 +18,9 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #ifndef KOMPUTE_VK_API_MINOR_VERSION
 #define KOMPUTE_VK_API_MINOR_VERSION 1
 #endif // KOMPUTE_VK_API_MINOR_VERSION
-#define KOMPUTE_VK_API_VERSION VK_MAKE_VERSION(KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0)
+#define KOMPUTE_VK_API_VERSION                                                 \
+    VK_MAKE_VERSION(                                                           \
+      KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0)
 #endif // KOMPUTE_VK_API_VERSION
 
 // SPDLOG_ACTIVE_LEVEL must be defined before spdlog.h import
@@ -39,7 +41,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #define SPDLOG_DEBUG(message, ...)
 #else
 #if defined(VK_USE_PLATFORM_ANDROID_KHR)
-#define SPDLOG_DEBUG(message, ...) ((void)__android_log_print(ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, message))
+#define SPDLOG_DEBUG(message, ...)                                             \
+    ((void)__android_log_print(ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, message))
 #else
 #define SPDLOG_DEBUG(message, ...)                                             \
     std::cout << "DEBUG: " << message << std::endl
@@ -49,7 +52,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #define SPDLOG_INFO(message, ...)
 #else
 #if defined(VK_USE_PLATFORM_ANDROID_KHR)
-#define SPDLOG_INFO(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
+#define SPDLOG_INFO(message, ...)                                              \
+    ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
 #else
 #define SPDLOG_INFO(message, ...) std::cout << "INFO: " << message << std::endl
 #endif // VK_USE_PLATFORM_ANDROID_KHR
@@ -58,7 +62,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #define SPDLOG_WARN(message, ...)
 #else
 #if defined(VK_USE_PLATFORM_ANDROID_KHR)
-#define SPDLOG_WARN(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
+#define SPDLOG_WARN(message, ...)                                              \
+    ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
 #else
 #define SPDLOG_WARN(message, ...)                                              \
     std::cout << "WARNING: " << message << std::endl
@@ -68,7 +73,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #define SPDLOG_ERROR(message, ...)
 #else
 #if defined(VK_USE_PLATFORM_ANDROID_KHR)
-#define SPDLOG_ERROR(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
+#define SPDLOG_ERROR(message, ...)                                             \
+    ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
 #else
 #define SPDLOG_ERROR(message, ...)                                             \
     std::cout << "ERROR: " << message << std::endl
@@ -1055,14 +1061,17 @@ class Sequence
     bool eval();
 
     /**
-     * Eval Async sends all the recorded and stored operations in the vector of operations into the gpu as a submit job with a barrier. EvalAwait() must be called after to ensure the sequence is terminated correctly.
+     * Eval Async sends all the recorded and stored operations in the vector of
+     * operations into the gpu as a submit job with a barrier. EvalAwait() must
+     * be called after to ensure the sequence is terminated correctly.
      *
      * @return Boolean stating whether execution was successful.
      */
     bool evalAsync();
 
     /**
-     * Eval Await waits for the fence to finish processing and then once it finishes, it runs the postEval of all operations.
+     * Eval Await waits for the fence to finish processing and then once it
+     * finishes, it runs the postEval of all operations.
      *
      * @param waitFor Number of milliseconds to wait before timing out.
      * @return Boolean stating whether execution was successful.
@@ -1077,7 +1086,8 @@ class Sequence
     bool isRecording();
 
     /**
-     * Returns true if the sequence is currently running - mostly used for async workloads.
+     * Returns true if the sequence is currently running - mostly used for async
+     * workloads.
      *
      * @return Boolean stating if currently running.
      */
@@ -1245,7 +1255,7 @@ namespace kp {
 */
 class Manager
 {
-public:
+  public:
     /**
         Base constructor and default used which creates the base resources
        including choosing the device 0 by default.
@@ -1253,14 +1263,16 @@ public:
     Manager();
 
     /**
-      * Similar to base constructor but allows the user to provide the device
-      * they would like to create the resources on.
-      *
-      * @param physicalDeviceIndex The index of the physical device to use
-      * @param familyQueueIndeces (Optional) List of queue indeces to add for explicit allocation
-      * @param totalQueues The total number of compute queues to create.
-    */
-    Manager(uint32_t physicalDeviceIndex, const std::vector<uint32_t> & familyQueueIndeces = {});
+     * Similar to base constructor but allows the user to provide the device
+     * they would like to create the resources on.
+     *
+     * @param physicalDeviceIndex The index of the physical device to use
+     * @param familyQueueIndeces (Optional) List of queue indeces to add for
+     * explicit allocation
+     * @param totalQueues The total number of compute queues to create.
+     */
+    Manager(uint32_t physicalDeviceIndex,
+            const std::vector<uint32_t>& familyQueueIndeces = {});
 
     /**
      * Manager constructor which allows your own vulkan application to integrate
@@ -1295,14 +1307,15 @@ public:
       std::string sequenceName);
 
     /**
-     * Create a new managed Kompute sequence so it's available within the manager.
+     * Create a new managed Kompute sequence so it's available within the
+     * manager.
      *
      * @param sequenceName The name for the named sequence to be created
      * @param queueIndex The queue to use from the available queues
      * @return Weak pointer to the manager owned sequence resource
      */
-    std::weak_ptr<Sequence> createManagedSequence(
-      std::string sequenceName, uint32_t queueIndex = 0);
+    std::weak_ptr<Sequence> createManagedSequence(std::string sequenceName,
+                                                  uint32_t queueIndex = 0);
 
     /**
      * Operation that adds extra operations to existing or new created
@@ -1349,15 +1362,15 @@ public:
      */
     template<typename T, typename... TArgs>
     void evalOpAsync(std::vector<std::shared_ptr<Tensor>> tensors,
-                std::string sequenceName,
-                TArgs&&... params)
+                     std::string sequenceName,
+                     TArgs&&... params)
     {
         SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered");
         std::weak_ptr<Sequence> sqWeakPtr =
           this->getOrCreateManagedSequence(sequenceName);
 
-        std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator found =
-          this->mManagedSequences.find(sequenceName);
+        std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator
+          found = this->mManagedSequences.find(sequenceName);
 
         if (found != this->mManagedSequences.end()) {
             std::shared_ptr<Sequence> sq = found->second;
@@ -1373,9 +1386,9 @@ public:
 
             SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
             sq->evalAsync();
-        }
-        else {
-            SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found", sequenceName);
+        } else {
+            SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found",
+                         sequenceName);
         }
         SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
     }
@@ -1391,21 +1404,22 @@ public:
     void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX)
     {
         SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered");
-        std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator found =
-        this->mManagedSequences.find(sequenceName);
+        std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator
+          found = this->mManagedSequences.find(sequenceName);
 
         if (found != this->mManagedSequences.end()) {
             if (std::shared_ptr<kp::Sequence> sq = found->second) {
-                SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence Sequence EVAL AWAIT");
+                SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence "
+                             "Sequence EVAL AWAIT");
                 if (sq->isRunning()) {
                     sq->evalAwait(waitFor);
                 }
             }
-            SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS");
-        }
-        else {
+            SPDLOG_DEBUG(
+              "Kompute Manager evalOpAwait running sequence SUCCESS");
+        } else {
             SPDLOG_ERROR("Kompute Manager evalOpAwait Sequence not found");
- 		}
+        }
     }
 
     /**
@@ -1475,7 +1489,7 @@ public:
 
     // Create functions
     void createInstance();
-    void createDevice(const std::vector<uint32_t> & familyQueueIndeces = {});
+    void createDevice(const std::vector<uint32_t>& familyQueueIndeces = {});
 };
 
 } // End namespace kp

From abe2c4104118fd7e6e7be31d89aeae7eaa9c2d7b Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 17 Oct 2020 11:57:20 +0100
Subject: [PATCH 15/19] Simplified test and shortened run time

---
 test/TestAsyncOperations.cpp | 26 +++-----------------------
 1 file changed, 3 insertions(+), 23 deletions(-)

diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp
index 28dd8c144..ac7422785 100644
--- a/test/TestAsyncOperations.cpp
+++ b/test/TestAsyncOperations.cpp
@@ -19,8 +19,7 @@ TEST(TestAsyncOperations, TestManagerAsync)
 
         layout (local_size_x = 1) in;
 
-        layout(set = 0, binding = 0) buffer a { float pa[]; };
-        layout(set = 0, binding = 1) buffer b { float pb[]; };
+        layout(set = 0, binding = 0) buffer b { float pb[]; };
 
         shared uint sharedTotal[1];
 
@@ -29,25 +28,12 @@ TEST(TestAsyncOperations, TestManagerAsync)
 
             sharedTotal[0] = 0;
 
-            barrier();
-            memoryBarrierShared();
-
             for (int i = 0; i < 100000000; i++)
             {
                 atomicAdd(sharedTotal[0], 1);
-                atomicAdd(sharedTotal[0], -1);
-                atomicAdd(sharedTotal[0], 1);
-                atomicAdd(sharedTotal[0], -1);
-                atomicAdd(sharedTotal[0], 1);
-                atomicAdd(sharedTotal[0], -1);
-                atomicAdd(sharedTotal[0], 1);
             }
 
-            barrier();
-            memoryBarrierShared();
-
             pb[index] = sharedTotal[0];
-            pa[index] = 0;
         }
     )");
 
@@ -57,22 +43,19 @@ TEST(TestAsyncOperations, TestManagerAsync)
 
     kp::Manager mgr;
 
-    std::vector<std::shared_ptr<kp::Tensor>> inputsSyncA;
     std::vector<std::shared_ptr<kp::Tensor>> inputsSyncB;
 
     for (uint32_t i = 0; i < numParallel; i++) {
-        inputsSyncA.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
         inputsSyncB.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
     }
 
-    mgr.evalOpDefault<kp::OpTensorCreate>(inputsSyncA);
     mgr.evalOpDefault<kp::OpTensorCreate>(inputsSyncB);
 
     auto startSync = std::chrono::high_resolution_clock::now();
 
     for (uint32_t i = 0; i < numParallel; i++) {
         mgr.evalOpDefault<kp::OpAlgoBase<>>(
-          { inputsSyncA[i], inputsSyncB[i] },
+          { inputsSyncB[i] },
           std::vector<char>(shader.begin(), shader.end()));
     }
 
@@ -89,15 +72,12 @@ TEST(TestAsyncOperations, TestManagerAsync)
 
     kp::Manager mgrAsync(0, { 0, 2 });
 
-    std::vector<std::shared_ptr<kp::Tensor>> inputsAsyncA;
     std::vector<std::shared_ptr<kp::Tensor>> inputsAsyncB;
 
     for (uint32_t i = 0; i < numParallel; i++) {
-        inputsAsyncA.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
         inputsAsyncB.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
     }
 
-    mgrAsync.evalOpDefault<kp::OpTensorCreate>(inputsAsyncA);
     mgrAsync.evalOpDefault<kp::OpTensorCreate>(inputsAsyncB);
 
     for (uint32_t i = 0; i < numParallel; i++) {
@@ -108,7 +88,7 @@ TEST(TestAsyncOperations, TestManagerAsync)
 
     for (uint32_t i = 0; i < numParallel; i++) {
         mgrAsync.evalOpAsync<kp::OpAlgoBase<>>(
-          { inputsAsyncA[i], inputsAsyncB[i] },
+          { inputsAsyncB[i] },
           "async" + std::to_string(i),
           std::vector<char>(shader.begin(), shader.end()));
     }

From 10faaab81e51a755953ce6e02bb7769389601da3 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 17 Oct 2020 11:57:33 +0100
Subject: [PATCH 16/19] Added basic documentation and examples in readme

---
 README.md | 256 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 209 insertions(+), 47 deletions(-)

diff --git a/README.md b/README.md
index ff15caf44..13426bca9 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@
 </tr>
 </table>
 
-<h4>Blazing fast, lightweight, mobile-enabled, and optimized for advanced GPU processing usecases.</h4>
+<h4>Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU processing usecases.</h4>
 
 🔋 [Documentation](https://kompute.cc) 💻 [Blog Post](https://medium.com/@AxSaucedo/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) ⌨ [Examples](#more-examples) 💾
 
@@ -34,6 +34,7 @@
 * [Documentation](https://kompute.cc) leveraging doxygen and sphinx 
 * BYOV: Bring-your-own-Vulkan design to play nice with existing Vulkan applications
 * Non-Vulkan core naming conventions to disambiguate Vulkan vs Kompute components
+* Asynchronous processing capabilities with granular mult-queue workload processing
 * Fast development cycles with shader tooling, but robust static shader binary bundles for prod
 * Explicit relationships for GPU and host memory ownership and memory management
 * End-to-end examples for [machine learning 🤖](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a), [mobile development 📱](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617), [game development 🎮](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0).
@@ -110,8 +111,10 @@ We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface
 ### Simple examples
 
 * [Pass shader as raw string](#your-first-kompute)
-* [Create your custom Kompute Operations](#your-custom-kompute-operation)
 * [Record batch commands with a Kompute Sequence](#record-batch-commands)
+* [Run Asynchronous Operations](#asynchronous-operations)
+* [Run Parallel Operations Across Multiple GPU Queues](#parallel-operations)
+* [Create your custom Kompute Operations](#your-custom-kompute-operation)
 
 ### End-to-end examples
 
@@ -119,51 +122,6 @@ We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface
 * [Android NDK Mobile Kompute ML Application](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617)
 * [Game Development Kompute ML in Godot Engine](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0)
 
-### Your Custom Kompute Operation
-
-Build your own pre-compiled operations for domain specific workflows. Back to [more examples](#simple-examples)
-
-We also provide tools that allow you to [convert shaders into C++ headers](https://github.com/EthicalML/vulkan-kompute/blob/master/scripts/convert_shaders.py#L40).
-
-```c++
-
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpMyCustom : public OpAlgoBase<tX, tY, tZ>
-{
-  public:
-    OpMyCustom(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-           std::shared_ptr<vk::Device> device,
-           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors)
-      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
-    {
-        // Perform your custom steps such as reading from a shader file
-        this->mShaderFilePath = "shaders/glsl/opmult.comp";
-    }
-}
-
-
-int main() {
-
-    kp::Manager mgr; // Automatically selects Device 0
-
-    // Create 3 tensors of default type float
-    auto tensorLhs = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 1., 2. }));
-    auto tensorRhs = std::make_shared<kp::Tensor>(kp::Tensor({ 2., 4., 6. }));
-    auto tensorOut = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 0., 0. }));
-
-    // Create tensors data explicitly in GPU with an operation
-    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorLhs, tensorRhs, tensorOut });
-
-    // Run Kompute operation on the parameters provided with dispatch layout
-    mgr.evalOpDefault<kp::OpMyCustom<3, 1, 1>>(
-        { tensorLhs, tensorRhs, tensorOut });
-
-    // Prints the output which is { 0, 4, 12 }
-    std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl;
-}
-```
-
 #### Record batch commands
 
 Record commands in a single submit by using a Sequence to send in batch to GPU. Back to [more examples](#simple-examples)
@@ -214,6 +172,210 @@ int main() {
 }
 ```
 
+#### Asynchronous Operations
+
+You can submit operations asynchronously with the async/await commands in the kp::Manager and kp::Sequence, which provides granularity on waiting on the vk::Fence. Back to [more examples](#simple-examples)
+
+```c++
+int main() {
+
+    // You can allow Kompute to create the Vulkan components, or pass your existing ones
+    kp::Manager mgr; // Selects device 0 unless explicitly requested
+
+    // For synchronous steps we must already have a sequence created
+    mgr.createManagedSequence("async");
+
+    // Creates tensor an initializes GPU memory (below we show more granularity)
+    auto tensor = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));
+
+    // Create tensors data explicitly in GPU with an operation
+    mgr.evalOpAsync<kp::OpTensorCreate>({ tensor }, "async");
+
+    // Define your shader as a string (using string literals for simplicity)
+    // (You can also pass the raw compiled bytes, or even path to file)
+    std::string shader(R"(
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        layout(set = 0, binding = 0) buffer b { float pb[]; };
+
+        shared uint sharedTotal[1];
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+
+            sharedTotal[0] = 0;
+
+            // Iterating to simulate longer process
+            for (int i = 0; i < 100000000; i++)
+            {
+                atomicAdd(sharedTotal[0], 1);
+            }
+
+            pb[index] = sharedTotal[0];
+        }
+    )");
+
+    // We can now await for the previous submitted command
+    // The second parameter can be the amount of time to wait
+    // The time provided is in nanoseconds
+    mgr.evalOpAwait("async", 10000);
+
+    // Run Async Kompute operation on the parameters provided
+    mgr.evalOpAsync<kp::OpAlgoBase<>>(
+        { tensor }, 
+        "async",
+        std::vector<char>(shader.begin(), shader.end()));
+
+    // Here we can do other work
+
+    // When we're ready we can wait 
+    // The default wait time is UINT64_MAX
+    mgr.evalOpAwait("async")
+
+    // Sync the GPU memory back to the local tensor
+    // We can still run synchronous jobs in our created sequence
+    mgr.evalOp<kp::OpTensorSyncLocal>({ tensor }, "async");
+
+    // Prints the output: B: { 100000000, ... }
+    std::cout << fmt::format("B: {}", 
+        tensor.data()) << std::endl;
+}
+```
+
+#### Parallel Operations
+
+Besides being able to submit asynchronous operations, you can also leverage the underlying GPU compute queues to process operations in parallel.
+
+This will depend on your underlying graphics card, but for example in NVIDIA graphics cards the operations submitted across queues in one family are not parallelizable, but operations submitted across queueFamilies can be parallelizable.
+
+Below we show how you can parallelize operations in an [NVIDIA 1650](http://vulkan.gpuinfo.org/displayreport.php?id=9700#queuefamilies), which has a `GRAPHICS+COMPUTE` family on `index 0`, and `COMPUTE` family on `index 2`.
+
+Back to [more examples](#simple-examples)
+
+```c++
+int main() {
+
+    // In this case we select device 0, and for queues, one queue from familyIndex 0
+    // and one queue from familyIndex 2
+    uint32_t deviceIndex(0);
+    std::vector<uint32_t> familyIndeces = {0, 2};
+
+    // We create a manager with device index, and queues by queue family index
+    kp::Manager mgr(deviceIndex, familyIndeces);
+
+    // We need to create explicit sequences with their respective queues
+    // The second parameter is the index in the familyIndex array which is relative
+    //      to the vector we created the manager with.
+    mgr.createManagedSequence("queueOne", 0);
+    mgr.createManagedSequence("queueTwo", 1);
+
+    // Creates tensor an initializes GPU memory (below we show more granularity)
+    auto tensorA = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));
+    auto tensorB = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));
+
+    // We run the first step synchronously on the default sequence
+    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
+
+    // Define your shader as a string (using string literals for simplicity)
+    // (You can also pass the raw compiled bytes, or even path to file)
+    std::string shader(R"(
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        layout(set = 0, binding = 0) buffer b { float pb[]; };
+
+        shared uint sharedTotal[1];
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+
+            sharedTotal[0] = 0;
+
+            // Iterating to simulate longer process
+            for (int i = 0; i < 100000000; i++)
+            {
+                atomicAdd(sharedTotal[0], 1);
+            }
+
+            pb[index] = sharedTotal[0];
+        }
+    )");
+
+    // Run the first parallel operation in the `queueOne` sequence
+    mgr.evalOpAsync<kp::OpAlgoBase<>>(
+        { tensorA }, 
+        "queueOne",
+        std::vector<char>(shader.begin(), shader.end()));
+
+    // Run the second parallel operation in the `queueTwo` sequence
+    mgr.evalOpAsync<kp::OpAlgoBase<>>(
+        { tensorB }, 
+        "queueTwo",
+        std::vector<char>(shader.begin(), shader.end()));
+
+    // Here we can do other work
+
+    // We can now wait for thw two parallel tasks to finish
+    mgr.evalOpAwait("queueOne")
+    mgr.evalOpAwait("queueTwo")
+
+    // Sync the GPU memory back to the local tensor
+    mgr.evalOp<kp::OpTensorSyncLocal>({ tensorA, tensorB });
+
+    // Prints the output: A: 100000000 B: 100000000
+    std::cout << fmt::format("A: {}, B: {}", 
+        tensorA.data()[0], tensorB.data()[0]) << std::endl;
+}
+```
+
+### Your Custom Kompute Operation
+
+Build your own pre-compiled operations for domain specific workflows. Back to [more examples](#simple-examples)
+
+We also provide tools that allow you to [convert shaders into C++ headers](https://github.com/EthicalML/vulkan-kompute/blob/master/scripts/convert_shaders.py#L40).
+
+```c++
+
+template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
+class OpMyCustom : public OpAlgoBase<tX, tY, tZ>
+{
+  public:
+    OpMyCustom(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+           std::shared_ptr<vk::Device> device,
+           std::shared_ptr<vk::CommandBuffer> commandBuffer,
+           std::vector<std::shared_ptr<Tensor>> tensors)
+      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
+    {
+        // Perform your custom steps such as reading from a shader file
+        this->mShaderFilePath = "shaders/glsl/opmult.comp";
+    }
+}
+
+
+int main() {
+
+    kp::Manager mgr; // Automatically selects Device 0
+
+    // Create 3 tensors of default type float
+    auto tensorLhs = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 1., 2. }));
+    auto tensorRhs = std::make_shared<kp::Tensor>(kp::Tensor({ 2., 4., 6. }));
+    auto tensorOut = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 0., 0. }));
+
+    // Create tensors data explicitly in GPU with an operation
+    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorLhs, tensorRhs, tensorOut });
+
+    // Run Kompute operation on the parameters provided with dispatch layout
+    mgr.evalOpDefault<kp::OpMyCustom<3, 1, 1>>(
+        { tensorLhs, tensorRhs, tensorOut });
+
+    // Prints the output which is { 0, 4, 12 }
+    std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl;
+}
+```
+
 ## Components & Architecture
 
 The core architecture of Kompute include the following:

From 8c66b4b66bf2d5c0a4115f2684003de85d12cedb Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 17 Oct 2020 12:31:20 +0100
Subject: [PATCH 17/19] Added evalOpAwaitDefault and evalOpAsyncDefault

---
 single_include/kompute/Kompute.hpp | 58 +++++++++++++++++++++---------
 src/include/kompute/Manager.hpp    | 58 +++++++++++++++++++++---------
 2 files changed, 82 insertions(+), 34 deletions(-)

diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index 382b8332d..bd7b3a125 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -1318,8 +1318,7 @@ class Manager
                                                   uint32_t queueIndex = 0);
 
     /**
-     * Operation that adds extra operations to existing or new created
-     * sequences.
+     * Function that evaluates operation against named sequence.
      *
      * @param tensors The tensors to be used in the operation recorded
      * @param sequenceName The name of the sequence to be retrieved or created
@@ -1352,8 +1351,23 @@ class Manager
     }
 
     /**
-     * Operation that adds extra operations to existing or new created
-     * sequences.
+     * Function that evaluates operation against default sequence.
+     *
+     * @param tensors The tensors to be used in the operation recorded
+     * @param TArgs Template parameters that will be used to initialise
+     * Operation to allow for extensible configurations on initialisation
+     */
+    template<typename T, typename... TArgs>
+    void evalOpDefault(std::vector<std::shared_ptr<Tensor>> tensors,
+                       TArgs&&... params)
+    {
+        SPDLOG_DEBUG("Kompute Manager evalOp Default triggered");
+        this->evalOp<T>(
+          tensors, KP_DEFAULT_SESSION, std::forward<TArgs>(params)...);
+    }
+
+    /**
+     * Function that evaluates operation against named sequence asynchronously.
      *
      * @param tensors The tensors to be used in the operation recorded
      * @param sequenceName The name of the sequence to be retrieved or created
@@ -1394,16 +1408,30 @@ class Manager
     }
 
     /**
-     * Operation that adds extra operations to existing or new created
-     * sequences.
+     * Operation that evaluates operation against default sequence asynchronously.
+     *
+     * @param tensors The tensors to be used in the operation recorded
+     * @param params Template parameters that will be used to initialise
+     * Operation to allow for extensible configurations on initialisation
+     */
+    template<typename T, typename... TArgs>
+    void evalOpAsyncDefault(std::vector<std::shared_ptr<Tensor>> tensors,
+                     TArgs&&... params)
+    {
+        SPDLOG_DEBUG("Kompute Manager evalOpAsyncDefault triggered");
+        this->evalOpAsync<T>(
+          tensors, KP_DEFAULT_SESSION, std::forward<TArgs>(params)...);
+    }
+
+    /**
+     * Operation that awaits for named sequence to finish.
      *
      * @param sequenceName The name of the sequence to wait for termination
      * @param waitFor The amount of time to wait before timing out
      */
-    template<typename... TArgs>
     void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX)
     {
-        SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered");
+        SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered with sequence {}", sequenceName);
         std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator
           found = this->mManagedSequences.find(sequenceName);
 
@@ -1423,20 +1451,16 @@ class Manager
     }
 
     /**
-     * Operation that adds extra operations to existing or new created
-     * sequences.
+     * Operation that awaits for default sequence to finish.
      *
      * @param tensors The tensors to be used in the operation recorded
-     * @param TArgs Template parameters that will be used to initialise
+     * @param params Template parameters that will be used to initialise
      * Operation to allow for extensible configurations on initialisation
      */
-    template<typename T, typename... TArgs>
-    void evalOpDefault(std::vector<std::shared_ptr<Tensor>> tensors,
-                       TArgs&&... params)
+    void evalOpAwaitDefault(uint64_t waitFor = UINT64_MAX)
     {
-        SPDLOG_DEBUG("Kompute Manager evalOp Default triggered");
-        this->evalOp<T>(
-          tensors, KP_DEFAULT_SESSION, std::forward<TArgs>(params)...);
+        SPDLOG_DEBUG("Kompute Manager evalOpAwaitDefault triggered");
+        this->evalOpAwait(KP_DEFAULT_SESSION, waitFor);
     }
 
     /**
diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp
index 650da5ca0..969af9ac5 100644
--- a/src/include/kompute/Manager.hpp
+++ b/src/include/kompute/Manager.hpp
@@ -80,8 +80,7 @@ class Manager
                                                   uint32_t queueIndex = 0);
 
     /**
-     * Operation that adds extra operations to existing or new created
-     * sequences.
+     * Function that evaluates operation against named sequence.
      *
      * @param tensors The tensors to be used in the operation recorded
      * @param sequenceName The name of the sequence to be retrieved or created
@@ -114,8 +113,23 @@ class Manager
     }
 
     /**
-     * Operation that adds extra operations to existing or new created
-     * sequences.
+     * Function that evaluates operation against default sequence.
+     *
+     * @param tensors The tensors to be used in the operation recorded
+     * @param TArgs Template parameters that will be used to initialise
+     * Operation to allow for extensible configurations on initialisation
+     */
+    template<typename T, typename... TArgs>
+    void evalOpDefault(std::vector<std::shared_ptr<Tensor>> tensors,
+                       TArgs&&... params)
+    {
+        SPDLOG_DEBUG("Kompute Manager evalOp Default triggered");
+        this->evalOp<T>(
+          tensors, KP_DEFAULT_SESSION, std::forward<TArgs>(params)...);
+    }
+
+    /**
+     * Function that evaluates operation against named sequence asynchronously.
      *
      * @param tensors The tensors to be used in the operation recorded
      * @param sequenceName The name of the sequence to be retrieved or created
@@ -156,16 +170,30 @@ class Manager
     }
 
     /**
-     * Operation that adds extra operations to existing or new created
-     * sequences.
+     * Operation that evaluates operation against default sequence asynchronously.
+     *
+     * @param tensors The tensors to be used in the operation recorded
+     * @param params Template parameters that will be used to initialise
+     * Operation to allow for extensible configurations on initialisation
+     */
+    template<typename T, typename... TArgs>
+    void evalOpAsyncDefault(std::vector<std::shared_ptr<Tensor>> tensors,
+                     TArgs&&... params)
+    {
+        SPDLOG_DEBUG("Kompute Manager evalOpAsyncDefault triggered");
+        this->evalOpAsync<T>(
+          tensors, KP_DEFAULT_SESSION, std::forward<TArgs>(params)...);
+    }
+
+    /**
+     * Operation that awaits for named sequence to finish.
      *
      * @param sequenceName The name of the sequence to wait for termination
      * @param waitFor The amount of time to wait before timing out
      */
-    template<typename... TArgs>
     void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX)
     {
-        SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered");
+        SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered with sequence {}", sequenceName);
         std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator
           found = this->mManagedSequences.find(sequenceName);
 
@@ -185,20 +213,16 @@ class Manager
     }
 
     /**
-     * Operation that adds extra operations to existing or new created
-     * sequences.
+     * Operation that awaits for default sequence to finish.
      *
      * @param tensors The tensors to be used in the operation recorded
-     * @param TArgs Template parameters that will be used to initialise
+     * @param params Template parameters that will be used to initialise
      * Operation to allow for extensible configurations on initialisation
      */
-    template<typename T, typename... TArgs>
-    void evalOpDefault(std::vector<std::shared_ptr<Tensor>> tensors,
-                       TArgs&&... params)
+    void evalOpAwaitDefault(uint64_t waitFor = UINT64_MAX)
     {
-        SPDLOG_DEBUG("Kompute Manager evalOp Default triggered");
-        this->evalOp<T>(
-          tensors, KP_DEFAULT_SESSION, std::forward<TArgs>(params)...);
+        SPDLOG_DEBUG("Kompute Manager evalOpAwaitDefault triggered");
+        this->evalOpAwait(KP_DEFAULT_SESSION, waitFor);
     }
 
     /**

From f4275e2c31cf0fd8c368cee708b0de27ff5bd7bb Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 17 Oct 2020 12:31:28 +0100
Subject: [PATCH 18/19] Added test just for the await component

---
 test/TestAsyncOperations.cpp | 56 +++++++++++++++++++++++++++++++++---
 1 file changed, 52 insertions(+), 4 deletions(-)

diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp
index ac7422785..5c8612fc3 100644
--- a/test/TestAsyncOperations.cpp
+++ b/test/TestAsyncOperations.cpp
@@ -5,7 +5,7 @@
 
 #include "kompute/Kompute.hpp"
 
-TEST(TestAsyncOperations, TestManagerAsync)
+TEST(TestAsyncOperations, TestManagerParallelExecution)
 {
     // This test is built for NVIDIA 1650. It assumes:
     // * Queue family 0 and 2 have compute capabilities
@@ -108,9 +108,57 @@ TEST(TestAsyncOperations, TestManagerAsync)
         EXPECT_EQ(inputsAsyncB[i]->data(), resultAsync);
     }
 
-    SPDLOG_ERROR("sync {}", durationSync);
-    SPDLOG_ERROR("async {}", durationAsync);
-
     // The speedup should be at least 40%
     EXPECT_LT(durationAsync, durationSync * 0.6);
 }
+
+TEST(TestAsyncOperations, TestManagerAsyncExecution)
+{
+    uint32_t size = 10;
+
+    std::string shader(R"(
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        layout(set = 0, binding = 0) buffer b { float pb[]; };
+
+        shared uint sharedTotal[1];
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+
+            sharedTotal[0] = 0;
+
+            for (int i = 0; i < 100000000; i++)
+            {
+                atomicAdd(sharedTotal[0], 1);
+            }
+
+            pb[index] = sharedTotal[0];
+        }
+    )");
+
+    std::vector<float> data(size, 0.0);
+    std::vector<float> resultAsync(size, 100000000);
+
+    kp::Manager mgrAsync(0);
+
+    std::vector<std::shared_ptr<kp::Tensor>> inputsAsyncB;
+
+    inputsAsyncB.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
+
+    mgrAsync.evalOpAsyncDefault<kp::OpTensorCreate>(inputsAsyncB);
+    mgrAsync.evalOpAwaitDefault();
+
+    mgrAsync.evalOpAsyncDefault<kp::OpAlgoBase<>>(
+      { inputsAsyncB[0] },
+      std::vector<char>(shader.begin(), shader.end()));
+
+    mgrAsync.evalOpAwaitDefault();
+
+    mgrAsync.evalOpAsyncDefault<kp::OpTensorSyncLocal>({ inputsAsyncB });
+    mgrAsync.evalOpAwaitDefault();
+
+    EXPECT_EQ(inputsAsyncB[0]->data(), resultAsync);
+}

From c6695f130cd18b7fcac3e3d93fde322a9f7652e7 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 17 Oct 2020 12:37:34 +0100
Subject: [PATCH 19/19] Added basic documentation on async / parallel

---
 README.md                        |  16 ++---
 docs/index.rst                   |   5 +-
 docs/overview/async-parallel.rst | 108 +++++++++++++++++++++++++++++++
 docs/overview/reference.rst      |   2 +-
 4 files changed, 118 insertions(+), 13 deletions(-)
 create mode 100644 docs/overview/async-parallel.rst

diff --git a/README.md b/README.md
index 13426bca9..fdd721af8 100644
--- a/README.md
+++ b/README.md
@@ -182,14 +182,11 @@ int main() {
     // You can allow Kompute to create the Vulkan components, or pass your existing ones
     kp::Manager mgr; // Selects device 0 unless explicitly requested
 
-    // For synchronous steps we must already have a sequence created
-    mgr.createManagedSequence("async");
-
     // Creates tensor an initializes GPU memory (below we show more granularity)
     auto tensor = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));
 
     // Create tensors data explicitly in GPU with an operation
-    mgr.evalOpAsync<kp::OpTensorCreate>({ tensor }, "async");
+    mgr.evalOpAsyncDefault<kp::OpTensorCreate>({ tensor });
 
     // Define your shader as a string (using string literals for simplicity)
     // (You can also pass the raw compiled bytes, or even path to file)
@@ -218,25 +215,24 @@ int main() {
     )");
 
     // We can now await for the previous submitted command
-    // The second parameter can be the amount of time to wait
+    // The first parameter can be the amount of time to wait
     // The time provided is in nanoseconds
-    mgr.evalOpAwait("async", 10000);
+    mgr.evalOpAwaitDefault(10000);
 
     // Run Async Kompute operation on the parameters provided
-    mgr.evalOpAsync<kp::OpAlgoBase<>>(
+    mgr.evalOpAsyncDefault<kp::OpAlgoBase<>>(
         { tensor }, 
-        "async",
         std::vector<char>(shader.begin(), shader.end()));
 
     // Here we can do other work
 
     // When we're ready we can wait 
     // The default wait time is UINT64_MAX
-    mgr.evalOpAwait("async")
+    mgr.evalOpAwaitDefault()
 
     // Sync the GPU memory back to the local tensor
     // We can still run synchronous jobs in our created sequence
-    mgr.evalOp<kp::OpTensorSyncLocal>({ tensor }, "async");
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensor });
 
     // Prints the output: B: { 100000000, ... }
     std::cout << fmt::format("B: {}", 
diff --git a/docs/index.rst b/docs/index.rst
index d29f1fbf6..ca8d59398 100755
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -15,7 +15,8 @@ Index
     Mobile App Intergration (Android) <overview/mobile-android>
     Game Engine Integration (Godot Engine) <overview/game-engine-godot>
     Converting GLSL/HLSL Shaders to C++ Headers <overview/shaders-to-headers>
-    Class Reference <overview/reference>
-    Memory management principles <overview/memory-management>
+    Asynchronous & Parallel Operations <overview/async-parallel>
+    Class Documentation and C++ Reference <overview/reference>
+    Memory Management Principles <overview/memory-management>
     Code Index <genindex>
 
diff --git a/docs/overview/async-parallel.rst b/docs/overview/async-parallel.rst
new file mode 100644
index 000000000..0505424d6
--- /dev/null
+++ b/docs/overview/async-parallel.rst
@@ -0,0 +1,108 @@
+
+Asynchronous and Parallel Operations
+=============
+
+In GPU computing it is possible to have multiple levels of asynchronous and parallel processing of GPU tasks.
+
+It is important to understand the conceptual distinctions of the diffent terminology when using each of these components.
+
+In this section we will cover the following points:
+
+* Asynchronous operation submission
+* Parallel processing of operations
+
+Asynchronous operation submission
+---------------------------------
+
+As the name implies, this refers to the asynchronous submission of operations. This means that operations can be submitted to the GPU, and the C++ / host CPU can continue performing tasks, until when the user desires to run `await` to wait until the operation finishes.
+
+This basically provides further granularity on Vulkan Fences, which is its means to enable the CPU host to know when GPU commands have finished executing. 
+
+It is important that submitting tasks asynchronously, does not mean that these will be executed in parallel. Parallel execution of operations will be covered in the following section.
+
+Asynchronous operation submission can be achieved through the kp::Manager, or directly through the kp::Sequence. Below is an example using the Kompute manager.
+
+Async/Await Example
+^^^^^^^^^^^^^^^^^^^^^
+
+Asynchronous job submission is done using `evalOpAsync` and `evalOpAwait` functions.
+
+For simplicity the `evalOpAsyncDefault` and `evalOpAwaitDefault` functions are provided, which can be used similar to the synchronous counterparts (which basically use the default named sequence).
+
+A simple example of asynchronous submission can be found below.
+
+One important thing to bare in mind when using asynchronous submissions, is that you should make sure that any overlapping asynchronous functions are run in separate sequences.
+
+The reason why this is important is that the Await function not only waits for the fence, but also runs the `postEval` functions across all operations, which is required for several operations.
+
+.. code-block:: cpp
+    :linenos:
+
+    // You can allow Kompute to create the Vulkan components, or pass your existing ones
+    kp::Manager mgr; // Selects device 0 unless explicitly requested
+
+    // Creates tensor an initializes GPU memory (below we show more granularity)
+    auto tensor = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));
+
+    // Create tensors data explicitly in GPU with an operation
+    mgr.evalOpAsyncDefault<kp::OpTensorCreate>({ tensor });
+
+    // Define your shader as a string (using string literals for simplicity)
+    // (You can also pass the raw compiled bytes, or even path to file)
+    std::string shader(R"(
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        layout(set = 0, binding = 0) buffer b { float pb[]; };
+
+        shared uint sharedTotal[1];
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+
+            sharedTotal[0] = 0;
+
+            // Iterating to simulate longer process
+            for (int i = 0; i < 100000000; i++)
+            {
+                atomicAdd(sharedTotal[0], 1);
+            }
+
+            pb[index] = sharedTotal[0];
+        }
+    )");
+
+    // We can now await for the previous submitted command
+    // The first parameter can be the amount of time to wait
+    // The time provided is in nanoseconds
+    mgr.evalOpAwaitDefault(10000);
+
+    // Run Async Kompute operation on the parameters provided
+    mgr.evalOpAsyncDefault<kp::OpAlgoBase<>>(
+        { tensor }, 
+        std::vector<char>(shader.begin(), shader.end()));
+
+    // Here we can do other work
+
+    // When we're ready we can wait 
+    // The default wait time is UINT64_MAX
+    mgr.evalOpAwaitDefault()
+
+    // Sync the GPU memory back to the local tensor
+    // We can still run synchronous jobs in our created sequence
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensor });
+
+    // Prints the output: B: { 100000000, ... }
+    std::cout << fmt::format("B: {}", 
+        tensor.data()) << std::endl;
+
+
+Parallel Operation Submission
+-----------
+
+In order to work with parallel execution of tasks, it is important that you understand some of the core GPU processing limitations, as these can be quite broad and hardware dependent, which means they will vary across NVIDIA / AMD / ETC video cards.
+
+GPUs by default will optimize towards GPU 
+
+
diff --git a/docs/overview/reference.rst b/docs/overview/reference.rst
index f2b8eda18..5bffdcddd 100644
--- a/docs/overview/reference.rst
+++ b/docs/overview/reference.rst
@@ -1,5 +1,5 @@
 
-Reference
+Class Documentation and C++ Reference
 ========
 
 This section provides a breakdown of the cpp classes and what each of their functions provide. It is partially generated and augomented from the Doxygen autodoc content. You can also go directly to the `raw doxygen docs <../doxygen/annotated.html>`_.