Merge pull request #73 from EthicalML/51_async

Adding Asynchronous Processing Capabilities with Multiple Queue Support
2020-10-17 13:53:07 +01:00 · 2020-10-17 13:53:07 +01:00 · fba0a244a1
commit fba0a244a1
parent c1a0334e9d c6695f130c
14 changed files with 975 additions and 147 deletions
--- a/README.md
+++ b/README.md
@ -23,7 +23,7 @@
 </tr>
 </table>

-<h4>Blazing fast, lightweight, mobile-enabled, and optimized for advanced GPU processing usecases.</h4>
+<h4>Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU processing usecases.</h4>

 🔋 [Documentation](https://kompute.cc) 💻 [Blog Post](https://medium.com/@AxSaucedo/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) ⌨ [Examples](#more-examples) 💾

@ -34,6 +34,7 @@
 * [Documentation](https://kompute.cc) leveraging doxygen and sphinx 
 * BYOV: Bring-your-own-Vulkan design to play nice with existing Vulkan applications
 * Non-Vulkan core naming conventions to disambiguate Vulkan vs Kompute components
+* Asynchronous processing capabilities with granular mult-queue workload processing
 * Fast development cycles with shader tooling, but robust static shader binary bundles for prod
 * Explicit relationships for GPU and host memory ownership and memory management
 * End-to-end examples for [machine learning 🤖](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a), [mobile development 📱](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617), [game development 🎮](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0).
@ -110,8 +111,10 @@ We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface
 ### Simple examples

 * [Pass shader as raw string](#your-first-kompute)
-* [Create your custom Kompute Operations](#your-custom-kompute-operation)
 * [Record batch commands with a Kompute Sequence](#record-batch-commands)
+* [Run Asynchronous Operations](#asynchronous-operations)
+* [Run Parallel Operations Across Multiple GPU Queues](#parallel-operations)
+* [Create your custom Kompute Operations](#your-custom-kompute-operation)

 ### End-to-end examples

@ -119,51 +122,6 @@ We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface
 * [Android NDK Mobile Kompute ML Application](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617)
 * [Game Development Kompute ML in Godot Engine](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0)

-### Your Custom Kompute Operation
-
-Build your own pre-compiled operations for domain specific workflows. Back to [more examples](#simple-examples)
-
-We also provide tools that allow you to [convert shaders into C++ headers](https://github.com/EthicalML/vulkan-kompute/blob/master/scripts/convert_shaders.py#L40).
-
-```c++
-
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpMyCustom : public OpAlgoBase<tX, tY, tZ>
-{
-  public:
-    OpMyCustom(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-           std::shared_ptr<vk::Device> device,
-           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors)
-      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
-    {
-        // Perform your custom steps such as reading from a shader file
-        this->mShaderFilePath = "shaders/glsl/opmult.comp";
-    }
-}
-
-
-int main() {
-
-    kp::Manager mgr; // Automatically selects Device 0
-
-    // Create 3 tensors of default type float
-    auto tensorLhs = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 1., 2. }));
-    auto tensorRhs = std::make_shared<kp::Tensor>(kp::Tensor({ 2., 4., 6. }));
-    auto tensorOut = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 0., 0. }));
-
-    // Create tensors data explicitly in GPU with an operation
-    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorLhs, tensorRhs, tensorOut });
-
-    // Run Kompute operation on the parameters provided with dispatch layout
-    mgr.evalOpDefault<kp::OpMyCustom<3, 1, 1>>(
-        { tensorLhs, tensorRhs, tensorOut });
-
-    // Prints the output which is { 0, 4, 12 }
-    std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl;
-}
-```
-
 #### Record batch commands

 Record commands in a single submit by using a Sequence to send in batch to GPU. Back to [more examples](#simple-examples)
@ -214,6 +172,206 @@ int main() {
 }
 ```

+#### Asynchronous Operations
+
+You can submit operations asynchronously with the async/await commands in the kp::Manager and kp::Sequence, which provides granularity on waiting on the vk::Fence. Back to [more examples](#simple-examples)
+
+```c++
+int main() {
+
+    // You can allow Kompute to create the Vulkan components, or pass your existing ones
+    kp::Manager mgr; // Selects device 0 unless explicitly requested
+
+    // Creates tensor an initializes GPU memory (below we show more granularity)
+    auto tensor = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));
+
+    // Create tensors data explicitly in GPU with an operation
+    mgr.evalOpAsyncDefault<kp::OpTensorCreate>({ tensor });
+
+    // Define your shader as a string (using string literals for simplicity)
+    // (You can also pass the raw compiled bytes, or even path to file)
+    std::string shader(R"(
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        layout(set = 0, binding = 0) buffer b { float pb[]; };
+
+        shared uint sharedTotal[1];
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+
+            sharedTotal[0] = 0;
+
+            // Iterating to simulate longer process
+            for (int i = 0; i < 100000000; i++)
+            {
+                atomicAdd(sharedTotal[0], 1);
+            }
+
+            pb[index] = sharedTotal[0];
+        }
+    )");
+
+    // We can now await for the previous submitted command
+    // The first parameter can be the amount of time to wait
+    // The time provided is in nanoseconds
+    mgr.evalOpAwaitDefault(10000);
+
+    // Run Async Kompute operation on the parameters provided
+    mgr.evalOpAsyncDefault<kp::OpAlgoBase<>>(
+        { tensor }, 
+        std::vector<char>(shader.begin(), shader.end()));
+
+    // Here we can do other work
+
+    // When we're ready we can wait 
+    // The default wait time is UINT64_MAX
+    mgr.evalOpAwaitDefault()
+
+    // Sync the GPU memory back to the local tensor
+    // We can still run synchronous jobs in our created sequence
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensor });
+
+    // Prints the output: B: { 100000000, ... }
+    std::cout << fmt::format("B: {}", 
+        tensor.data()) << std::endl;
+}
+```
+
+#### Parallel Operations
+
+Besides being able to submit asynchronous operations, you can also leverage the underlying GPU compute queues to process operations in parallel.
+
+This will depend on your underlying graphics card, but for example in NVIDIA graphics cards the operations submitted across queues in one family are not parallelizable, but operations submitted across queueFamilies can be parallelizable.
+
+Below we show how you can parallelize operations in an [NVIDIA 1650](http://vulkan.gpuinfo.org/displayreport.php?id=9700#queuefamilies), which has a `GRAPHICS+COMPUTE` family on `index 0`, and `COMPUTE` family on `index 2`.
+
+Back to [more examples](#simple-examples)
+
+```c++
+int main() {
+
+    // In this case we select device 0, and for queues, one queue from familyIndex 0
+    // and one queue from familyIndex 2
+    uint32_t deviceIndex(0);
+    std::vector<uint32_t> familyIndeces = {0, 2};
+
+    // We create a manager with device index, and queues by queue family index
+    kp::Manager mgr(deviceIndex, familyIndeces);
+
+    // We need to create explicit sequences with their respective queues
+    // The second parameter is the index in the familyIndex array which is relative
+    //      to the vector we created the manager with.
+    mgr.createManagedSequence("queueOne", 0);
+    mgr.createManagedSequence("queueTwo", 1);
+
+    // Creates tensor an initializes GPU memory (below we show more granularity)
+    auto tensorA = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));
+    auto tensorB = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));
+
+    // We run the first step synchronously on the default sequence
+    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
+
+    // Define your shader as a string (using string literals for simplicity)
+    // (You can also pass the raw compiled bytes, or even path to file)
+    std::string shader(R"(
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        layout(set = 0, binding = 0) buffer b { float pb[]; };
+
+        shared uint sharedTotal[1];
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+
+            sharedTotal[0] = 0;
+
+            // Iterating to simulate longer process
+            for (int i = 0; i < 100000000; i++)
+            {
+                atomicAdd(sharedTotal[0], 1);
+            }
+
+            pb[index] = sharedTotal[0];
+        }
+    )");
+
+    // Run the first parallel operation in the `queueOne` sequence
+    mgr.evalOpAsync<kp::OpAlgoBase<>>(
+        { tensorA }, 
+        "queueOne",
+        std::vector<char>(shader.begin(), shader.end()));
+
+    // Run the second parallel operation in the `queueTwo` sequence
+    mgr.evalOpAsync<kp::OpAlgoBase<>>(
+        { tensorB }, 
+        "queueTwo",
+        std::vector<char>(shader.begin(), shader.end()));
+
+    // Here we can do other work
+
+    // We can now wait for thw two parallel tasks to finish
+    mgr.evalOpAwait("queueOne")
+    mgr.evalOpAwait("queueTwo")
+
+    // Sync the GPU memory back to the local tensor
+    mgr.evalOp<kp::OpTensorSyncLocal>({ tensorA, tensorB });
+
+    // Prints the output: A: 100000000 B: 100000000
+    std::cout << fmt::format("A: {}, B: {}", 
+        tensorA.data()[0], tensorB.data()[0]) << std::endl;
+}
+```
+
+### Your Custom Kompute Operation
+
+Build your own pre-compiled operations for domain specific workflows. Back to [more examples](#simple-examples)
+
+We also provide tools that allow you to [convert shaders into C++ headers](https://github.com/EthicalML/vulkan-kompute/blob/master/scripts/convert_shaders.py#L40).
+
+```c++
+
+template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
+class OpMyCustom : public OpAlgoBase<tX, tY, tZ>
+{
+  public:
+    OpMyCustom(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+           std::shared_ptr<vk::Device> device,
+           std::shared_ptr<vk::CommandBuffer> commandBuffer,
+           std::vector<std::shared_ptr<Tensor>> tensors)
+      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
+    {
+        // Perform your custom steps such as reading from a shader file
+        this->mShaderFilePath = "shaders/glsl/opmult.comp";
+    }
+}
+
+
+int main() {
+
+    kp::Manager mgr; // Automatically selects Device 0
+
+    // Create 3 tensors of default type float
+    auto tensorLhs = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 1., 2. }));
+    auto tensorRhs = std::make_shared<kp::Tensor>(kp::Tensor({ 2., 4., 6. }));
+    auto tensorOut = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 0., 0. }));
+
+    // Create tensors data explicitly in GPU with an operation
+    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorLhs, tensorRhs, tensorOut });
+
+    // Run Kompute operation on the parameters provided with dispatch layout
+    mgr.evalOpDefault<kp::OpMyCustom<3, 1, 1>>(
+        { tensorLhs, tensorRhs, tensorOut });
+
+    // Prints the output which is { 0, 4, 12 }
+    std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl;
+}
+```
+
 ## Components & Architecture

 The core architecture of Kompute include the following:
--- a/docs/index.rst
+++ b/docs/index.rst
@ -15,7 +15,8 @@ Index
    Mobile App Intergration (Android) <overview/mobile-android>
    Game Engine Integration (Godot Engine) <overview/game-engine-godot>
    Converting GLSL/HLSL Shaders to C++ Headers <overview/shaders-to-headers>
-    Class Reference <overview/reference>
-    Memory management principles <overview/memory-management>
+    Asynchronous & Parallel Operations <overview/async-parallel>
+    Class Documentation and C++ Reference <overview/reference>
+    Memory Management Principles <overview/memory-management>
    Code Index <genindex>

--- a/docs/overview/async-parallel.rst
+++ b/docs/overview/async-parallel.rst
@ -0,0 +1,108 @@
+
+Asynchronous and Parallel Operations
+=============
+
+In GPU computing it is possible to have multiple levels of asynchronous and parallel processing of GPU tasks.
+
+It is important to understand the conceptual distinctions of the diffent terminology when using each of these components.
+
+In this section we will cover the following points:
+
+* Asynchronous operation submission
+* Parallel processing of operations
+
+Asynchronous operation submission
+---------------------------------
+
+As the name implies, this refers to the asynchronous submission of operations. This means that operations can be submitted to the GPU, and the C++ / host CPU can continue performing tasks, until when the user desires to run `await` to wait until the operation finishes.
+
+This basically provides further granularity on Vulkan Fences, which is its means to enable the CPU host to know when GPU commands have finished executing. 
+
+It is important that submitting tasks asynchronously, does not mean that these will be executed in parallel. Parallel execution of operations will be covered in the following section.
+
+Asynchronous operation submission can be achieved through the kp::Manager, or directly through the kp::Sequence. Below is an example using the Kompute manager.
+
+Async/Await Example
+^^^^^^^^^^^^^^^^^^^^^
+
+Asynchronous job submission is done using `evalOpAsync` and `evalOpAwait` functions.
+
+For simplicity the `evalOpAsyncDefault` and `evalOpAwaitDefault` functions are provided, which can be used similar to the synchronous counterparts (which basically use the default named sequence).
+
+A simple example of asynchronous submission can be found below.
+
+One important thing to bare in mind when using asynchronous submissions, is that you should make sure that any overlapping asynchronous functions are run in separate sequences.
+
+The reason why this is important is that the Await function not only waits for the fence, but also runs the `postEval` functions across all operations, which is required for several operations.
+
+.. code-block:: cpp
+    :linenos:
+
+    // You can allow Kompute to create the Vulkan components, or pass your existing ones
+    kp::Manager mgr; // Selects device 0 unless explicitly requested
+
+    // Creates tensor an initializes GPU memory (below we show more granularity)
+    auto tensor = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));
+
+    // Create tensors data explicitly in GPU with an operation
+    mgr.evalOpAsyncDefault<kp::OpTensorCreate>({ tensor });
+
+    // Define your shader as a string (using string literals for simplicity)
+    // (You can also pass the raw compiled bytes, or even path to file)
+    std::string shader(R"(
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        layout(set = 0, binding = 0) buffer b { float pb[]; };
+
+        shared uint sharedTotal[1];
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+
+            sharedTotal[0] = 0;
+
+            // Iterating to simulate longer process
+            for (int i = 0; i < 100000000; i++)
+            {
+                atomicAdd(sharedTotal[0], 1);
+            }
+
+            pb[index] = sharedTotal[0];
+        }
+    )");
+
+    // We can now await for the previous submitted command
+    // The first parameter can be the amount of time to wait
+    // The time provided is in nanoseconds
+    mgr.evalOpAwaitDefault(10000);
+
+    // Run Async Kompute operation on the parameters provided
+    mgr.evalOpAsyncDefault<kp::OpAlgoBase<>>(
+        { tensor }, 
+        std::vector<char>(shader.begin(), shader.end()));
+
+    // Here we can do other work
+
+    // When we're ready we can wait 
+    // The default wait time is UINT64_MAX
+    mgr.evalOpAwaitDefault()
+
+    // Sync the GPU memory back to the local tensor
+    // We can still run synchronous jobs in our created sequence
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensor });
+
+    // Prints the output: B: { 100000000, ... }
+    std::cout << fmt::format("B: {}", 
+        tensor.data()) << std::endl;
+
+
+Parallel Operation Submission
+-----------
+
+In order to work with parallel execution of tasks, it is important that you understand some of the core GPU processing limitations, as these can be quite broad and hardware dependent, which means they will vary across NVIDIA / AMD / ETC video cards.
+
+GPUs by default will optimize towards GPU 
+
+
--- a/docs/overview/reference.rst
+++ b/docs/overview/reference.rst
@ -1,5 +1,5 @@

-Reference
+Class Documentation and C++ Reference
 ========

 This section provides a breakdown of the cpp classes and what each of their functions provide. It is partially generated and augomented from the Doxygen autodoc content. You can also go directly to the `raw doxygen docs <../doxygen/annotated.html>`_.
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@ -18,7 +18,9 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #ifndef KOMPUTE_VK_API_MINOR_VERSION
 #define KOMPUTE_VK_API_MINOR_VERSION 1
 #endif // KOMPUTE_VK_API_MINOR_VERSION
-#define KOMPUTE_VK_API_VERSION VK_MAKE_VERSION(KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0)
+#define KOMPUTE_VK_API_VERSION                                                 \
+    VK_MAKE_VERSION(                                                           \
+      KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0)
 #endif // KOMPUTE_VK_API_VERSION

 // SPDLOG_ACTIVE_LEVEL must be defined before spdlog.h import
@ -39,7 +41,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #define SPDLOG_DEBUG(message, ...)
 #else
 #if defined(VK_USE_PLATFORM_ANDROID_KHR)
-#define SPDLOG_DEBUG(message, ...) ((void)__android_log_print(ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, message))
+#define SPDLOG_DEBUG(message, ...)                                             \
+    ((void)__android_log_print(ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, message))
 #else
 #define SPDLOG_DEBUG(message, ...)                                             \
    std::cout << "DEBUG: " << message << std::endl
@ -49,7 +52,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #define SPDLOG_INFO(message, ...)
 #else
 #if defined(VK_USE_PLATFORM_ANDROID_KHR)
-#define SPDLOG_INFO(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
+#define SPDLOG_INFO(message, ...)                                              \
+    ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
 #else
 #define SPDLOG_INFO(message, ...) std::cout << "INFO: " << message << std::endl
 #endif // VK_USE_PLATFORM_ANDROID_KHR
@ -58,7 +62,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #define SPDLOG_WARN(message, ...)
 #else
 #if defined(VK_USE_PLATFORM_ANDROID_KHR)
-#define SPDLOG_WARN(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
+#define SPDLOG_WARN(message, ...)                                              \
+    ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
 #else
 #define SPDLOG_WARN(message, ...)                                              \
    std::cout << "WARNING: " << message << std::endl
@ -68,7 +73,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #define SPDLOG_ERROR(message, ...)
 #else
 #if defined(VK_USE_PLATFORM_ANDROID_KHR)
-#define SPDLOG_ERROR(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
+#define SPDLOG_ERROR(message, ...)                                             \
+    ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
 #else
 #define SPDLOG_ERROR(message, ...)                                             \
    std::cout << "ERROR: " << message << std::endl
@ -1033,19 +1039,45 @@ class Sequence
    /**
     * Begins recording commands for commands to be submitted into the command
     * buffer.
+     *
+     * @return Boolean stating whether execution was successful.
     */
    bool begin();
+
    /**
     * Ends the recording and stops recording commands when the record command
     * is sent.
+     *
+     * @return Boolean stating whether execution was successful.
     */
    bool end();
+
    /**
     * Eval sends all the recorded and stored operations in the vector of
     * operations into the gpu as a submit job with a barrier.
+     *
+     * @return Boolean stating whether execution was successful.
     */
    bool eval();

+    /**
+     * Eval Async sends all the recorded and stored operations in the vector of
+     * operations into the gpu as a submit job with a barrier. EvalAwait() must
+     * be called after to ensure the sequence is terminated correctly.
+     *
+     * @return Boolean stating whether execution was successful.
+     */
+    bool evalAsync();
+
+    /**
+     * Eval Await waits for the fence to finish processing and then once it
+     * finishes, it runs the postEval of all operations.
+     *
+     * @param waitFor Number of milliseconds to wait before timing out.
+     * @return Boolean stating whether execution was successful.
+     */
+    bool evalAwait(uint64_t waitFor = UINT64_MAX);
+
    /**
     * Returns true if the sequence is currently in recording activated.
     *
@ -1053,6 +1085,14 @@ class Sequence
     */
    bool isRecording();

+    /**
+     * Returns true if the sequence is currently running - mostly used for async
+     * workloads.
+     *
+     * @return Boolean stating if currently running.
+     */
+    bool isRunning();
+
    /**
     * Returns true if the sequence has been successfully initialised.
     *
@ -1122,12 +1162,14 @@ class Sequence
    std::shared_ptr<vk::CommandBuffer> mCommandBuffer = nullptr;
    bool mFreeCommandBuffer = false;

-    // Base op objects
+    // -------------- ALWAYS OWNED RESOURCES
+    vk::Fence mFence;
    std::vector<std::unique_ptr<OpBase>> mOperations;

    // State
    bool mIsInit = false;
    bool mRecording = false;
+    bool mIsRunning = false;

    // Create functions
    void createCommandPool();
@ -1221,19 +1263,25 @@ class Manager
    Manager();

    /**
-        Similar to base constructor but allows the user to provide the device
-       they would like to create the resources on.
-    */
-    Manager(uint32_t physicalDeviceIndex);
+     * Similar to base constructor but allows the user to provide the device
+     * they would like to create the resources on.
+     *
+     * @param physicalDeviceIndex The index of the physical device to use
+     * @param familyQueueIndeces (Optional) List of queue indeces to add for
+     * explicit allocation
+     * @param totalQueues The total number of compute queues to create.
+     */
+    Manager(uint32_t physicalDeviceIndex,
+            const std::vector<uint32_t>& familyQueueIndeces = {});

    /**
     * Manager constructor which allows your own vulkan application to integrate
     * with the vulkan kompute use.
     *
     * @param instance Vulkan compute instance to base this application
-     * @physicalDevice Vulkan physical device to use for application
-     * @device Vulkan logical device to use for all base resources
-     * @physicalDeviceIndex Index for vulkan physical device used
+     * @param physicalDevice Vulkan physical device to use for application
+     * @param device Vulkan logical device to use for all base resources
+     * @param physicalDeviceIndex Index for vulkan physical device used
     */
    Manager(std::shared_ptr<vk::Instance> instance,
            std::shared_ptr<vk::PhysicalDevice> physicalDevice,
@ -1259,8 +1307,18 @@ class Manager
      std::string sequenceName);

    /**
-     * Operation that adds extra operations to existing or new created
-     * sequences.
+     * Create a new managed Kompute sequence so it's available within the
+     * manager.
+     *
+     * @param sequenceName The name for the named sequence to be created
+     * @param queueIndex The queue to use from the available queues
+     * @return Weak pointer to the manager owned sequence resource
+     */
+    std::weak_ptr<Sequence> createManagedSequence(std::string sequenceName,
+                                                  uint32_t queueIndex = 0);
+
+    /**
+     * Function that evaluates operation against named sequence.
     *
     * @param tensors The tensors to be used in the operation recorded
     * @param sequenceName The name of the sequence to be retrieved or created
@ -1293,8 +1351,7 @@ class Manager
    }

    /**
-     * Operation that adds extra operations to existing or new created
-     * sequences.
+     * Function that evaluates operation against default sequence.
     *
     * @param tensors The tensors to be used in the operation recorded
     * @param TArgs Template parameters that will be used to initialise
@ -1309,6 +1366,103 @@ class Manager
          tensors, KP_DEFAULT_SESSION, std::forward<TArgs>(params)...);
    }

+    /**
+     * Function that evaluates operation against named sequence asynchronously.
+     *
+     * @param tensors The tensors to be used in the operation recorded
+     * @param sequenceName The name of the sequence to be retrieved or created
+     * @param params Template parameters that will be used to initialise
+     * Operation to allow for extensible configurations on initialisation
+     */
+    template<typename T, typename... TArgs>
+    void evalOpAsync(std::vector<std::shared_ptr<Tensor>> tensors,
+                     std::string sequenceName,
+                     TArgs&&... params)
+    {
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered");
+        std::weak_ptr<Sequence> sqWeakPtr =
+          this->getOrCreateManagedSequence(sequenceName);
+
+        std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator
+          found = this->mManagedSequences.find(sequenceName);
+
+        if (found != this->mManagedSequences.end()) {
+            std::shared_ptr<Sequence> sq = found->second;
+
+            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
+            sq->begin();
+
+            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
+            sq->record<T>(tensors, std::forward<TArgs>(params)...);
+
+            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
+            sq->end();
+
+            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
+            sq->evalAsync();
+        } else {
+            SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found",
+                         sequenceName);
+        }
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
+    }
+
+    /**
+     * Operation that evaluates operation against default sequence asynchronously.
+     *
+     * @param tensors The tensors to be used in the operation recorded
+     * @param params Template parameters that will be used to initialise
+     * Operation to allow for extensible configurations on initialisation
+     */
+    template<typename T, typename... TArgs>
+    void evalOpAsyncDefault(std::vector<std::shared_ptr<Tensor>> tensors,
+                     TArgs&&... params)
+    {
+        SPDLOG_DEBUG("Kompute Manager evalOpAsyncDefault triggered");
+        this->evalOpAsync<T>(
+          tensors, KP_DEFAULT_SESSION, std::forward<TArgs>(params)...);
+    }
+
+    /**
+     * Operation that awaits for named sequence to finish.
+     *
+     * @param sequenceName The name of the sequence to wait for termination
+     * @param waitFor The amount of time to wait before timing out
+     */
+    void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX)
+    {
+        SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered with sequence {}", sequenceName);
+        std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator
+          found = this->mManagedSequences.find(sequenceName);
+
+        if (found != this->mManagedSequences.end()) {
+            if (std::shared_ptr<kp::Sequence> sq = found->second) {
+                SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence "
+                             "Sequence EVAL AWAIT");
+                if (sq->isRunning()) {
+                    sq->evalAwait(waitFor);
+                }
+            }
+            SPDLOG_DEBUG(
+              "Kompute Manager evalOpAwait running sequence SUCCESS");
+        } else {
+            SPDLOG_ERROR("Kompute Manager evalOpAwait Sequence not found");
+        }
+    }
+
+    /**
+     * Operation that awaits for default sequence to finish.
+     *
+     * @param tensors The tensors to be used in the operation recorded
+     * @param params Template parameters that will be used to initialise
+     * Operation to allow for extensible configurations on initialisation
+     */
+    void evalOpAwaitDefault(uint64_t waitFor = UINT64_MAX)
+    {
+        SPDLOG_DEBUG("Kompute Manager evalOpAwaitDefault triggered");
+        this->evalOpAwait(KP_DEFAULT_SESSION, waitFor);
+    }
+
    /**
     * Function that simplifies the common workflow of tensor creation and
     * initialization. It will take the constructor parameters for a Tensor
@ -1342,13 +1496,14 @@ class Manager
    uint32_t mPhysicalDeviceIndex = -1;
    std::shared_ptr<vk::Device> mDevice = nullptr;
    bool mFreeDevice = false;
-    uint32_t mComputeQueueFamilyIndex = -1;
-    std::shared_ptr<vk::Queue> mComputeQueue = nullptr;

    // -------------- ALWAYS OWNED RESOURCES
    std::unordered_map<std::string, std::shared_ptr<Sequence>>
      mManagedSequences;

+    std::vector<uint32_t> mComputeQueueFamilyIndeces;
+    std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;
+
 #if DEBUG
 #ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
    vk::DebugReportCallbackEXT mDebugReportCallback;
@ -1358,7 +1513,7 @@ class Manager

    // Create functions
    void createInstance();
-    void createDevice();
+    void createDevice(const std::vector<uint32_t>& familyQueueIndeces = {});
 };

 } // End namespace kp
@ -1599,7 +1754,7 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                           std::vector<std::shared_ptr<Tensor>>& tensors)
  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
 {
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {} , shaderFilePath: {}", tensors.size());
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());

    // The dispatch size is set up based on either explicitly provided template
    // parameters or by default it would take the shape and size of the tensors
--- a/src/Algorithm.cpp
+++ b/src/Algorithm.cpp
@ -266,8 +266,7 @@ Algorithm::createPipeline(std::vector<uint32_t> specializationData)

 #ifdef KOMPUTE_CREATE_PIPELINE_RESULT_VALUE
    vk::ResultValue<vk::Pipeline> pipelineResult =
-      this->mDevice->createComputePipeline(*this->mPipelineCache,
-pipelineInfo);
+      this->mDevice->createComputePipeline(*this->mPipelineCache, pipelineInfo);

    if (pipelineResult.result != vk::Result::eSuccess) {
        throw std::runtime_error("Failed to create pipeline result: " +
--- a/src/Manager.cpp
+++ b/src/Manager.cpp
@ -28,12 +28,13 @@ Manager::Manager()
  : Manager(0)
 {}

-Manager::Manager(uint32_t physicalDeviceIndex)
+Manager::Manager(uint32_t physicalDeviceIndex,
+                 const std::vector<uint32_t>& familyQueueIndeces)
 {
    this->mPhysicalDeviceIndex = physicalDeviceIndex;

    this->createInstance();
-    this->createDevice();
+    this->createDevice(familyQueueIndeces);
 }

 Manager::Manager(std::shared_ptr<vk::Instance> instance,
@ -98,19 +99,31 @@ Manager::getOrCreateManagedSequence(std::string sequenceName)
      this->mManagedSequences.find(sequenceName);

    if (found == this->mManagedSequences.end()) {
-        std::shared_ptr<Sequence> sq =
-          std::make_shared<Sequence>(this->mPhysicalDevice,
-                                     this->mDevice,
-                                     this->mComputeQueue,
-                                     this->mComputeQueueFamilyIndex);
-        sq->init();
-        this->mManagedSequences.insert({ sequenceName, sq });
-        return sq;
+        return this->createManagedSequence(sequenceName);
    } else {
        return found->second;
    }
 }

+std::weak_ptr<Sequence>
+Manager::createManagedSequence(std::string sequenceName, uint32_t queueIndex)
+{
+
+    SPDLOG_DEBUG("Kompute Manager createManagedSequence with sequenceName: {} "
+                 "and queueIndex: {}",
+                 sequenceName,
+                 queueIndex);
+
+    std::shared_ptr<Sequence> sq =
+      std::make_shared<Sequence>(this->mPhysicalDevice,
+                                 this->mDevice,
+                                 this->mComputeQueues[queueIndex],
+                                 this->mComputeQueueFamilyIndeces[queueIndex]);
+    sq->init();
+    this->mManagedSequences.insert({ sequenceName, sq });
+    return sq;
+}
+
 void
 Manager::createInstance()
 {
@ -197,7 +210,7 @@ Manager::createInstance()
 }

 void
-Manager::createDevice()
+Manager::createDevice(const std::vector<uint32_t>& familyQueueIndeces)
 {

    SPDLOG_DEBUG("Kompute Manager creating Device");
@ -228,45 +241,75 @@ Manager::createDevice()
                this->mPhysicalDeviceIndex,
                physicalDeviceProperties.deviceName);

-    // Find compute queue
-    std::vector<vk::QueueFamilyProperties> allQueueFamilyProperties =
-      physicalDevice.getQueueFamilyProperties();
+    if (!familyQueueIndeces.size()) {
+        // Find compute queue
+        std::vector<vk::QueueFamilyProperties> allQueueFamilyProperties =
+          physicalDevice.getQueueFamilyProperties();

-    this->mComputeQueueFamilyIndex = -1;
-    for (uint32_t i = 0; i < allQueueFamilyProperties.size(); i++) {
-        vk::QueueFamilyProperties queueFamilyProperties =
-          allQueueFamilyProperties[i];
+        uint32_t computeQueueFamilyIndex = -1;
+        for (uint32_t i = 0; i < allQueueFamilyProperties.size(); i++) {
+            vk::QueueFamilyProperties queueFamilyProperties =
+              allQueueFamilyProperties[i];

-        if (queueFamilyProperties.queueFlags & vk::QueueFlagBits::eCompute) {
-            this->mComputeQueueFamilyIndex = i;
-            break;
+            if (queueFamilyProperties.queueFlags &
+                vk::QueueFlagBits::eCompute) {
+                computeQueueFamilyIndex = i;
+                break;
+            }
        }
+
+        if (computeQueueFamilyIndex < 0) {
+            throw std::runtime_error("Compute queue is not supported");
+        }
+
+        this->mComputeQueueFamilyIndeces.push_back(computeQueueFamilyIndex);
+    } else {
+        this->mComputeQueueFamilyIndeces = familyQueueIndeces;
    }

-    if (this->mComputeQueueFamilyIndex < 0) {
-        throw std::runtime_error("Compute queue is not supported");
+    std::unordered_map<uint32_t, uint32_t> familyQueueCounts;
+    std::unordered_map<uint32_t, std::vector<float>> familyQueuePriorities;
+    for (const auto& value : this->mComputeQueueFamilyIndeces) {
+        familyQueueCounts[value]++;
+        familyQueuePriorities[value].push_back(1.0f);
    }

-    const float defaultQueuePriority(0.0f);
-    const uint32_t defaultQueueCount(1);
-    vk::DeviceQueueCreateInfo deviceQueueCreateInfo(
-      vk::DeviceQueueCreateFlags(),
-      this->mComputeQueueFamilyIndex,
-      defaultQueueCount,
-      &defaultQueuePriority);
+    std::unordered_map<uint32_t, uint32_t> familyQueueIndexCount;
+    std::vector<vk::DeviceQueueCreateInfo> deviceQueueCreateInfos;
+    for (const auto& familyQueueInfo : familyQueueCounts) {
+        // Setting the device count to 0
+        familyQueueIndexCount[familyQueueInfo.first] = 0;
+
+        // Creating the respective device queue
+        vk::DeviceQueueCreateInfo deviceQueueCreateInfo(
+          vk::DeviceQueueCreateFlags(),
+          familyQueueInfo.first,
+          familyQueueInfo.second,
+          familyQueuePriorities[familyQueueInfo.first].data());
+        deviceQueueCreateInfos.push_back(deviceQueueCreateInfo);
+    }

    vk::DeviceCreateInfo deviceCreateInfo(vk::DeviceCreateFlags(),
-                                          1, // Number of deviceQueueCreateInfo
-                                          &deviceQueueCreateInfo);
+                                          deviceQueueCreateInfos.size(),
+                                          deviceQueueCreateInfos.data());

    this->mDevice = std::make_shared<vk::Device>();
    physicalDevice.createDevice(
      &deviceCreateInfo, nullptr, this->mDevice.get());
    SPDLOG_DEBUG("Kompute Manager device created");

-    this->mComputeQueue = std::make_shared<vk::Queue>();
-    this->mDevice->getQueue(
-      this->mComputeQueueFamilyIndex, 0, this->mComputeQueue.get());
+    for (const uint32_t& familyQueueIndex : this->mComputeQueueFamilyIndeces) {
+        std::shared_ptr<vk::Queue> currQueue = std::make_shared<vk::Queue>();
+
+        this->mDevice->getQueue(familyQueueIndex,
+                                familyQueueIndexCount[familyQueueIndex],
+                                currQueue.get());
+
+        familyQueueIndexCount[familyQueueIndex]++;
+
+        this->mComputeQueues.push_back(currQueue);
+    }
+
    SPDLOG_DEBUG("Kompute Manager compute queue obtained");
 }

--- a/src/Sequence.cpp
+++ b/src/Sequence.cpp
@ -118,40 +118,86 @@ Sequence::end()
 bool
 Sequence::eval()
 {
-    SPDLOG_DEBUG("Kompute sequence compute recording EVAL");
+    SPDLOG_DEBUG("Kompute sequence EVAL BEGIN");

-    if (this->isRecording()) {
-        SPDLOG_WARN("Kompute Sequence eval called when still recording");
+    bool evalResult = this->evalAsync();
+    if (!evalResult) {
+        SPDLOG_DEBUG("Kompute sequence EVAL FAILURE");
        return false;
    }

+    evalResult = this->evalAwait();
+
+    SPDLOG_DEBUG("Kompute sequence EVAL SUCCESS");
+
+    return evalResult;
+}
+
+bool
+Sequence::evalAsync()
+{
+    if (this->isRecording()) {
+        SPDLOG_WARN("Kompute Sequence evalAsync called when still recording");
+        return false;
+    }
+    if (this->mIsRunning) {
+        SPDLOG_WARN("Kompute Sequence evalAsync called when an eval async was "
+                    "called without successful wait");
+        return false;
+    }
+
+    this->mIsRunning = true;
+
    for (size_t i = 0; i < this->mOperations.size(); i++) {
        this->mOperations[i]->preEval();
    }

-    const vk::PipelineStageFlags waitStageMask =
-      vk::PipelineStageFlagBits::eTransfer;
    vk::SubmitInfo submitInfo(
-      0, nullptr, &waitStageMask, 1, this->mCommandBuffer.get());
+      0, nullptr, nullptr, 1, this->mCommandBuffer.get());

-    vk::Fence fence = this->mDevice->createFence(vk::FenceCreateInfo());
+    this->mFence = this->mDevice->createFence(vk::FenceCreateInfo());

    SPDLOG_DEBUG(
      "Kompute sequence submitting command buffer into compute queue");

-    this->mComputeQueue->submit(1, &submitInfo, fence);
-    this->mDevice->waitForFences(1, &fence, VK_TRUE, UINT64_MAX);
-    this->mDevice->destroy(fence);
+    this->mComputeQueue->submit(1, &submitInfo, this->mFence);
+
+    return true;
+}
+
+bool
+Sequence::evalAwait(uint64_t waitFor)
+{
+    if (!this->mIsRunning) {
+        SPDLOG_WARN("Kompute Sequence evalAwait called without existing eval");
+        return false;
+    }
+
+    vk::Result result =
+      this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor);
+    this->mDevice->destroy(this->mFence);
+
+    if (result == vk::Result::eTimeout) {
+        SPDLOG_WARN("Kompute Sequence evalAwait timed out");
+        this->mIsRunning = false;
+        return false;
+    }

    for (size_t i = 0; i < this->mOperations.size(); i++) {
        this->mOperations[i]->postEval();
    }

-    SPDLOG_DEBUG("Kompute sequence EVAL success");
+    this->mIsRunning = false;

    return true;
 }

+bool
+Sequence::isRunning()
+{
+    return this->mIsRunning;
+}
+
 bool
 Sequence::isRecording()
 {
--- a/src/include/kompute/Core.hpp
+++ b/src/include/kompute/Core.hpp
@ -18,10 +18,11 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #ifndef KOMPUTE_VK_API_MINOR_VERSION
 #define KOMPUTE_VK_API_MINOR_VERSION 1
 #endif // KOMPUTE_VK_API_MINOR_VERSION
-#define KOMPUTE_VK_API_VERSION VK_MAKE_VERSION(KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0)
+#define KOMPUTE_VK_API_VERSION                                                 \
+    VK_MAKE_VERSION(                                                           \
+      KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0)
 #endif // KOMPUTE_VK_API_VERSION

-
 // SPDLOG_ACTIVE_LEVEL must be defined before spdlog.h import
 #if !defined(SPDLOG_ACTIVE_LEVEL)
 #if DEBUG
@ -40,7 +41,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #define SPDLOG_DEBUG(message, ...)
 #else
 #if defined(VK_USE_PLATFORM_ANDROID_KHR)
-#define SPDLOG_DEBUG(message, ...) ((void)__android_log_print(ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, message))
+#define SPDLOG_DEBUG(message, ...)                                             \
+    ((void)__android_log_print(ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, message))
 #else
 #define SPDLOG_DEBUG(message, ...)                                             \
    std::cout << "DEBUG: " << message << std::endl
@ -50,7 +52,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #define SPDLOG_INFO(message, ...)
 #else
 #if defined(VK_USE_PLATFORM_ANDROID_KHR)
-#define SPDLOG_INFO(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
+#define SPDLOG_INFO(message, ...)                                              \
+    ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
 #else
 #define SPDLOG_INFO(message, ...) std::cout << "INFO: " << message << std::endl
 #endif // VK_USE_PLATFORM_ANDROID_KHR
@ -59,7 +62,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #define SPDLOG_WARN(message, ...)
 #else
 #if defined(VK_USE_PLATFORM_ANDROID_KHR)
-#define SPDLOG_WARN(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
+#define SPDLOG_WARN(message, ...)                                              \
+    ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
 #else
 #define SPDLOG_WARN(message, ...)                                              \
    std::cout << "WARNING: " << message << std::endl
@ -69,7 +73,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog";
 #define SPDLOG_ERROR(message, ...)
 #else
 #if defined(VK_USE_PLATFORM_ANDROID_KHR)
-#define SPDLOG_ERROR(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
+#define SPDLOG_ERROR(message, ...)                                             \
+    ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message))
 #else
 #define SPDLOG_ERROR(message, ...)                                             \
    std::cout << "ERROR: " << message << std::endl
--- a/src/include/kompute/Manager.hpp
+++ b/src/include/kompute/Manager.hpp
@ -25,19 +25,25 @@ class Manager
    Manager();

    /**
-        Similar to base constructor but allows the user to provide the device
-       they would like to create the resources on.
-    */
-    Manager(uint32_t physicalDeviceIndex);
+     * Similar to base constructor but allows the user to provide the device
+     * they would like to create the resources on.
+     *
+     * @param physicalDeviceIndex The index of the physical device to use
+     * @param familyQueueIndeces (Optional) List of queue indeces to add for
+     * explicit allocation
+     * @param totalQueues The total number of compute queues to create.
+     */
+    Manager(uint32_t physicalDeviceIndex,
+            const std::vector<uint32_t>& familyQueueIndeces = {});

    /**
     * Manager constructor which allows your own vulkan application to integrate
     * with the vulkan kompute use.
     *
     * @param instance Vulkan compute instance to base this application
-     * @physicalDevice Vulkan physical device to use for application
-     * @device Vulkan logical device to use for all base resources
-     * @physicalDeviceIndex Index for vulkan physical device used
+     * @param physicalDevice Vulkan physical device to use for application
+     * @param device Vulkan logical device to use for all base resources
+     * @param physicalDeviceIndex Index for vulkan physical device used
     */
    Manager(std::shared_ptr<vk::Instance> instance,
            std::shared_ptr<vk::PhysicalDevice> physicalDevice,
@ -63,8 +69,18 @@ class Manager
      std::string sequenceName);

    /**
-     * Operation that adds extra operations to existing or new created
-     * sequences.
+     * Create a new managed Kompute sequence so it's available within the
+     * manager.
+     *
+     * @param sequenceName The name for the named sequence to be created
+     * @param queueIndex The queue to use from the available queues
+     * @return Weak pointer to the manager owned sequence resource
+     */
+    std::weak_ptr<Sequence> createManagedSequence(std::string sequenceName,
+                                                  uint32_t queueIndex = 0);
+
+    /**
+     * Function that evaluates operation against named sequence.
     *
     * @param tensors The tensors to be used in the operation recorded
     * @param sequenceName The name of the sequence to be retrieved or created
@ -97,8 +113,7 @@ class Manager
    }

    /**
-     * Operation that adds extra operations to existing or new created
-     * sequences.
+     * Function that evaluates operation against default sequence.
     *
     * @param tensors The tensors to be used in the operation recorded
     * @param TArgs Template parameters that will be used to initialise
@ -113,6 +128,103 @@ class Manager
          tensors, KP_DEFAULT_SESSION, std::forward<TArgs>(params)...);
    }

+    /**
+     * Function that evaluates operation against named sequence asynchronously.
+     *
+     * @param tensors The tensors to be used in the operation recorded
+     * @param sequenceName The name of the sequence to be retrieved or created
+     * @param params Template parameters that will be used to initialise
+     * Operation to allow for extensible configurations on initialisation
+     */
+    template<typename T, typename... TArgs>
+    void evalOpAsync(std::vector<std::shared_ptr<Tensor>> tensors,
+                     std::string sequenceName,
+                     TArgs&&... params)
+    {
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered");
+        std::weak_ptr<Sequence> sqWeakPtr =
+          this->getOrCreateManagedSequence(sequenceName);
+
+        std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator
+          found = this->mManagedSequences.find(sequenceName);
+
+        if (found != this->mManagedSequences.end()) {
+            std::shared_ptr<Sequence> sq = found->second;
+
+            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
+            sq->begin();
+
+            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
+            sq->record<T>(tensors, std::forward<TArgs>(params)...);
+
+            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
+            sq->end();
+
+            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
+            sq->evalAsync();
+        } else {
+            SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found",
+                         sequenceName);
+        }
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
+    }
+
+    /**
+     * Operation that evaluates operation against default sequence asynchronously.
+     *
+     * @param tensors The tensors to be used in the operation recorded
+     * @param params Template parameters that will be used to initialise
+     * Operation to allow for extensible configurations on initialisation
+     */
+    template<typename T, typename... TArgs>
+    void evalOpAsyncDefault(std::vector<std::shared_ptr<Tensor>> tensors,
+                     TArgs&&... params)
+    {
+        SPDLOG_DEBUG("Kompute Manager evalOpAsyncDefault triggered");
+        this->evalOpAsync<T>(
+          tensors, KP_DEFAULT_SESSION, std::forward<TArgs>(params)...);
+    }
+
+    /**
+     * Operation that awaits for named sequence to finish.
+     *
+     * @param sequenceName The name of the sequence to wait for termination
+     * @param waitFor The amount of time to wait before timing out
+     */
+    void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX)
+    {
+        SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered with sequence {}", sequenceName);
+        std::unordered_map<std::string, std::shared_ptr<Sequence>>::iterator
+          found = this->mManagedSequences.find(sequenceName);
+
+        if (found != this->mManagedSequences.end()) {
+            if (std::shared_ptr<kp::Sequence> sq = found->second) {
+                SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence "
+                             "Sequence EVAL AWAIT");
+                if (sq->isRunning()) {
+                    sq->evalAwait(waitFor);
+                }
+            }
+            SPDLOG_DEBUG(
+              "Kompute Manager evalOpAwait running sequence SUCCESS");
+        } else {
+            SPDLOG_ERROR("Kompute Manager evalOpAwait Sequence not found");
+        }
+    }
+
+    /**
+     * Operation that awaits for default sequence to finish.
+     *
+     * @param tensors The tensors to be used in the operation recorded
+     * @param params Template parameters that will be used to initialise
+     * Operation to allow for extensible configurations on initialisation
+     */
+    void evalOpAwaitDefault(uint64_t waitFor = UINT64_MAX)
+    {
+        SPDLOG_DEBUG("Kompute Manager evalOpAwaitDefault triggered");
+        this->evalOpAwait(KP_DEFAULT_SESSION, waitFor);
+    }
+
    /**
     * Function that simplifies the common workflow of tensor creation and
     * initialization. It will take the constructor parameters for a Tensor
@ -146,13 +258,14 @@ class Manager
    uint32_t mPhysicalDeviceIndex = -1;
    std::shared_ptr<vk::Device> mDevice = nullptr;
    bool mFreeDevice = false;
-    uint32_t mComputeQueueFamilyIndex = -1;
-    std::shared_ptr<vk::Queue> mComputeQueue = nullptr;

    // -------------- ALWAYS OWNED RESOURCES
    std::unordered_map<std::string, std::shared_ptr<Sequence>>
      mManagedSequences;

+    std::vector<uint32_t> mComputeQueueFamilyIndeces;
+    std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;
+
 #if DEBUG
 #ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
    vk::DebugReportCallbackEXT mDebugReportCallback;
@ -162,7 +275,7 @@ class Manager

    // Create functions
    void createInstance();
-    void createDevice();
+    void createDevice(const std::vector<uint32_t>& familyQueueIndeces = {});
 };

 } // End namespace kp
--- a/src/include/kompute/Sequence.hpp
+++ b/src/include/kompute/Sequence.hpp
@ -45,19 +45,45 @@ class Sequence
    /**
     * Begins recording commands for commands to be submitted into the command
     * buffer.
+     *
+     * @return Boolean stating whether execution was successful.
     */
    bool begin();
+
    /**
     * Ends the recording and stops recording commands when the record command
     * is sent.
+     *
+     * @return Boolean stating whether execution was successful.
     */
    bool end();
+
    /**
     * Eval sends all the recorded and stored operations in the vector of
     * operations into the gpu as a submit job with a barrier.
+     *
+     * @return Boolean stating whether execution was successful.
     */
    bool eval();

+    /**
+     * Eval Async sends all the recorded and stored operations in the vector of
+     * operations into the gpu as a submit job with a barrier. EvalAwait() must
+     * be called after to ensure the sequence is terminated correctly.
+     *
+     * @return Boolean stating whether execution was successful.
+     */
+    bool evalAsync();
+
+    /**
+     * Eval Await waits for the fence to finish processing and then once it
+     * finishes, it runs the postEval of all operations.
+     *
+     * @param waitFor Number of milliseconds to wait before timing out.
+     * @return Boolean stating whether execution was successful.
+     */
+    bool evalAwait(uint64_t waitFor = UINT64_MAX);
+
    /**
     * Returns true if the sequence is currently in recording activated.
     *
@ -65,6 +91,14 @@ class Sequence
     */
    bool isRecording();

+    /**
+     * Returns true if the sequence is currently running - mostly used for async
+     * workloads.
+     *
+     * @return Boolean stating if currently running.
+     */
+    bool isRunning();
+
    /**
     * Returns true if the sequence has been successfully initialised.
     *
@ -134,12 +168,14 @@ class Sequence
    std::shared_ptr<vk::CommandBuffer> mCommandBuffer = nullptr;
    bool mFreeCommandBuffer = false;

-    // Base op objects
+    // -------------- ALWAYS OWNED RESOURCES
+    vk::Fence mFence;
    std::vector<std::unique_ptr<OpBase>> mOperations;

    // State
    bool mIsInit = false;
    bool mRecording = false;
+    bool mIsRunning = false;

    // Create functions
    void createCommandPool();
--- a/src/include/kompute/operations/OpAlgoBase.hpp
+++ b/src/include/kompute/operations/OpAlgoBase.hpp
@ -162,7 +162,7 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
                           std::vector<std::shared_ptr<Tensor>>& tensors)
  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
 {
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {} , shaderFilePath: {}", tensors.size());
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());

    // The dispatch size is set up based on either explicitly provided template
    // parameters or by default it would take the shape and size of the tensors
--- a/test/TestAsyncOperations.cpp
+++ b/test/TestAsyncOperations.cpp
@ -0,0 +1,164 @@
+
+#include "gtest/gtest.h"
+
+#include <chrono>
+
+#include "kompute/Kompute.hpp"
+
+TEST(TestAsyncOperations, TestManagerParallelExecution)
+{
+    // This test is built for NVIDIA 1650. It assumes:
+    // * Queue family 0 and 2 have compute capabilities
+    // * GPU is able to process parallel shader code across different families
+    uint32_t size = 10;
+
+    uint32_t numParallel = 2;
+
+    std::string shader(R"(
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        layout(set = 0, binding = 0) buffer b { float pb[]; };
+
+        shared uint sharedTotal[1];
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+
+            sharedTotal[0] = 0;
+
+            for (int i = 0; i < 100000000; i++)
+            {
+                atomicAdd(sharedTotal[0], 1);
+            }
+
+            pb[index] = sharedTotal[0];
+        }
+    )");
+
+    std::vector<float> data(size, 0.0);
+    std::vector<float> resultSync(size, 100000000);
+    std::vector<float> resultAsync(size, 100000000);
+
+    kp::Manager mgr;
+
+    std::vector<std::shared_ptr<kp::Tensor>> inputsSyncB;
+
+    for (uint32_t i = 0; i < numParallel; i++) {
+        inputsSyncB.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
+    }
+
+    mgr.evalOpDefault<kp::OpTensorCreate>(inputsSyncB);
+
+    auto startSync = std::chrono::high_resolution_clock::now();
+
+    for (uint32_t i = 0; i < numParallel; i++) {
+        mgr.evalOpDefault<kp::OpAlgoBase<>>(
+          { inputsSyncB[i] },
+          std::vector<char>(shader.begin(), shader.end()));
+    }
+
+    auto endSync = std::chrono::high_resolution_clock::now();
+    auto durationSync =
+      std::chrono::duration_cast<std::chrono::microseconds>(endSync - startSync)
+        .count();
+
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>(inputsSyncB);
+
+    for (uint32_t i = 0; i < numParallel; i++) {
+        EXPECT_EQ(inputsSyncB[i]->data(), resultSync);
+    }
+
+    kp::Manager mgrAsync(0, { 0, 2 });
+
+    std::vector<std::shared_ptr<kp::Tensor>> inputsAsyncB;
+
+    for (uint32_t i = 0; i < numParallel; i++) {
+        inputsAsyncB.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
+    }
+
+    mgrAsync.evalOpDefault<kp::OpTensorCreate>(inputsAsyncB);
+
+    for (uint32_t i = 0; i < numParallel; i++) {
+        mgrAsync.createManagedSequence("async" + std::to_string(i), i);
+    }
+
+    auto startAsync = std::chrono::high_resolution_clock::now();
+
+    for (uint32_t i = 0; i < numParallel; i++) {
+        mgrAsync.evalOpAsync<kp::OpAlgoBase<>>(
+          { inputsAsyncB[i] },
+          "async" + std::to_string(i),
+          std::vector<char>(shader.begin(), shader.end()));
+    }
+
+    for (uint32_t i = 0; i < numParallel; i++) {
+        mgrAsync.evalOpAwait("async" + std::to_string(i));
+    }
+
+    auto endAsync = std::chrono::high_resolution_clock::now();
+    auto durationAsync = std::chrono::duration_cast<std::chrono::microseconds>(
+                           endAsync - startAsync)
+                           .count();
+
+    mgrAsync.evalOpDefault<kp::OpTensorSyncLocal>({ inputsAsyncB });
+
+    for (uint32_t i = 0; i < numParallel; i++) {
+        EXPECT_EQ(inputsAsyncB[i]->data(), resultAsync);
+    }
+
+    // The speedup should be at least 40%
+    EXPECT_LT(durationAsync, durationSync * 0.6);
+}
+
+TEST(TestAsyncOperations, TestManagerAsyncExecution)
+{
+    uint32_t size = 10;
+
+    std::string shader(R"(
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        layout(set = 0, binding = 0) buffer b { float pb[]; };
+
+        shared uint sharedTotal[1];
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+
+            sharedTotal[0] = 0;
+
+            for (int i = 0; i < 100000000; i++)
+            {
+                atomicAdd(sharedTotal[0], 1);
+            }
+
+            pb[index] = sharedTotal[0];
+        }
+    )");
+
+    std::vector<float> data(size, 0.0);
+    std::vector<float> resultAsync(size, 100000000);
+
+    kp::Manager mgrAsync(0);
+
+    std::vector<std::shared_ptr<kp::Tensor>> inputsAsyncB;
+
+    inputsAsyncB.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
+
+    mgrAsync.evalOpAsyncDefault<kp::OpTensorCreate>(inputsAsyncB);
+    mgrAsync.evalOpAwaitDefault();
+
+    mgrAsync.evalOpAsyncDefault<kp::OpAlgoBase<>>(
+      { inputsAsyncB[0] },
+      std::vector<char>(shader.begin(), shader.end()));
+
+    mgrAsync.evalOpAwaitDefault();
+
+    mgrAsync.evalOpAsyncDefault<kp::OpTensorSyncLocal>({ inputsAsyncB });
+    mgrAsync.evalOpAwaitDefault();
+
+    EXPECT_EQ(inputsAsyncB[0]->data(), resultAsync);
+}
--- a/test/TestMultipleAlgoExecutions.cpp
+++ b/test/TestMultipleAlgoExecutions.cpp
@ -258,10 +258,10 @@ TEST(TestMultipleAlgoExecutions, ManagerEvalMultSourceStrOpCreate)
      )");

    mgr.evalOpDefault<kp::OpAlgoBase<>>(
-            { tensorInA, tensorInB, tensorOut },
-            std::vector<char>(shader.begin(), shader.end()));
+      { tensorInA, tensorInB, tensorOut },
+      std::vector<char>(shader.begin(), shader.end()));

-    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorOut});
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorOut });

    EXPECT_EQ(tensorOut->data(), std::vector<float>({ 0.0, 4.0, 12.0 }));
 }
@ -295,10 +295,10 @@ TEST(TestMultipleAlgoExecutions, ManagerEvalMultSourceStrMgrCreate)
      )");

    mgr.evalOpDefault<kp::OpAlgoBase<>>(
-            { tensorInA, tensorInB, tensorOut },
-            std::vector<char>(shader.begin(), shader.end()));
+      { tensorInA, tensorInB, tensorOut },
+      std::vector<char>(shader.begin(), shader.end()));

-    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorOut});
+    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorOut });

    EXPECT_EQ(tensorOut->data(), std::vector<float>({ 0.0, 4.0, 12.0 }));
 }