Merge pull request #88 from EthicalML/pybind_python

Added python bindings with kp as python module
2020-11-03 18:24:52 +00:00 · 2020-11-03 18:24:52 +00:00 · e62dfea2a2
commit e62dfea2a2
parent 9babbc54ee 3811ef2dba
47 changed files with 1338 additions and 1036 deletions
--- a/.ccls
+++ b/.ccls
@ -13,6 +13,7 @@
 -DDEBUG=1
 -DKOMPUTE_INCLUDE_FOR_SYNTAX

+-I./python/pybind11/include/
 -I./external/Vulkan-Headers/include/
 -I./external/googletest/googletest/include/
 -I./external/spdlog/include/
--- a/.gitmodules
+++ b/.gitmodules
@ -10,3 +10,6 @@
 	path = external/spdlog
 	url = https://github.com/gabime/spdlog
 	branch = v1.8.1
+[submodule "python/pybind11"]
+	path = python/pybind11
+	url = https://github.com/pybind/pybind11
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.4.1)
-project(kompute VERSION 0.3.0)
+project(kompute VERSION 0.4.1)

 set(CMAKE_CXX_STANDARD 14)

@ -13,6 +13,7 @@ option(KOMPUTE_OPT_BUILD_SHADERS "Enable if you want to re-build all shader file
 option(KOMPUTE_OPT_BUILD_SINGLE_HEADER "Enable if you want to build the single header file" 0)
 option(KOMPUTE_OPT_INSTALL "Enable if you want to enable installation" 0)
 # Build options
+option(KOMPUTE_OPT_BUILD_PYTHON "Enable if you want to build python bindings" 0)
 option(KOMPUTE_OPT_ENABLE_SPDLOG "Extra compile flags for Kompute, see docs for full list" 0)
 option(KOMPUTE_OPT_REPO_SUBMODULE_BUILD, "Use the submodule repos instead of external package manager" 0)
 option(KOMPUTE_OPT_ANDOID_BUILD "Enable android compilation flags required" 0)
@ -43,12 +44,16 @@ endfunction()

 add_subdirectory(src)

+if(KOMPUTE_OPT_BUILD_TESTS)
+    add_subdirectory(test)
+endif()
+
 if(KOMPUTE_OPT_BUILD_DOCS)
    set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/config" ${CMAKE_MODULE_PATH})
    add_subdirectory(docs)
 endif()

-if(KOMPUTE_OPT_BUILD_TESTS)
-    add_subdirectory(test)
+if(KOMPUTE_OPT_BUILD_PYTHON)
+    add_subdirectory(python)
 endif()

--- a/README.md
+++ b/README.md
@ -1,5 +1,5 @@

-![GitHub](https://img.shields.io/badge/Version-0.4.0-green.svg)
+![GitHub](https://img.shields.io/badge/Version-0.4.1-green.svg)
 ![GitHub](https://img.shields.io/badge/C++-14—20-purple.svg)
 ![GitHub](https://img.shields.io/badge/Build-cmake-red.svg)
 ![GitHub](https://img.shields.io/badge/Python-3.5—3.8-blue.svg)
@ -15,7 +15,7 @@
 <td>

 <h1>Vulkan Kompute</h1>
-<h3>The General Purpose Vulkan Compute Framework.</h3>
+<h3>The General Purpose Vulkan Compute Framework for C++ and Python.</h3>

 </td>

@ -29,10 +29,10 @@

 ## Principles & Features

-* [Single header](#setup) library for simple import to your project
-* [Documentation](https://kompute.cc) leveraging doxygen and sphinx 
-* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) capabilities with multi-queue command submission
-* [Non-Vulkan naming conventions](#architectural-overview) to disambiguate Vulkan vs Kompute components
+* [Single header](#setup) for simple import with flexible build-system configuration
+* Multi-language support with C++ as core SDK as well as [optimized Python bindings](#python-package)
+* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) support through GPU family queues
+* [Mobile enabled](#mobile-enabled) with examples in Android studio across several architectures
 * BYOV: [Bring-your-own-Vulkan design](#motivations) to play nice with existing Vulkan applications
 * Explicit relationships for GPU and host [memory ownership and memory management](https://kompute.cc/overview/memory-management.html)
 * [Short code examples](#simple-examples) showing the core features 
@ -118,7 +118,7 @@ int main() {
    mgr.evalOpAwaitDefault();

    // 5. Create managed sequence to submit batch operations to the GPU
-    std::shared_ptr<kp::Sequence> sq = mgr.getOrCreateManagedSequence("seq").lock();
+    std::shared_ptr<kp::Sequence> sq = mgr.getOrCreateManagedSequence("seq");

    // 5.1. Explicitly begin recording batch commands
    sq->begin();
@ -255,13 +255,79 @@ You can also access the <a href="https://github.com/EthicalML/vulkan-kompute/tre
 </tr>
 </table>

-## Motivations
+## Python Package

-This project started after seeing that a lot of new and renowned ML & DL projects like Pytorch, Tensorflow, Alibaba DNN, Tencent NCNN - among others - have either integrated or are looking to integrate the Vulkan SDK to add mobile (and cross-vendor) GPU support.
+Besides the C++ core SDK you can also use the Python package of Kompute, which exposes the same core functionality, and supports interoperability with Python objects like Lists, Numpy Arrays, etc.

-The Vulkan SDK offers a great low level interface that enables for highly specialized optimizations - however it comes at a cost of highly verbose code which requires 500-2000 lines of code to even begin writing application code. This has resulted in each of these projects having to implement the same baseline to abstract the non-compute related features of Vulkan. This large amount of non-standardised boiler-plate can result in limited knowledge transfer, higher chance of unique framework implementation bugs being introduced, etc.
+You can install from the repository by running:

-We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface (as it's incredibly well designed) but to augment it with a direct focus on Vulkan's GPU computing capabilities. [This article](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) provides a high level overview of the motivations of Kompute, together with a set of hands on examples that introduce both GPU computing as well as the core Vulkan Kompute architecture.
+```
+pip install .
+```
+
+For further details you can read the [Python Package documentation](https://kompute.cc/overview/python-package.html) or the [Python Class Reference documentation](https://kompute.cc/overview/python-reference.html).
+
+### Python Example (Simple)
+
+Then you can interact with it from your interpreter. Below is the same sample as above "Your First Kompute (Simple Version)" but in Python:
+
+```python
+mgr = Manager()
+
+# Can be initialized with List[] or np.Array
+tensor_in_a = Tensor([2, 2, 2])
+tensor_in_b = Tensor([1, 2, 3])
+tensor_out = Tensor([0, 0, 0])
+
+mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+shaderFilePath = "shaders/glsl/opmult.comp"
+mgr.eval_async_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+
+# Alternatively can pass raw string/bytes:
+# shaderFileData = """ shader code here... """
+# mgr.eval_algo_data_def([tensor_in_a, tensor_in_b, tensor_out], list(shaderFileData))
+
+mgr.eval_await_def()
+
+mgr.eval_tensor_sync_local_def([tensor_out])
+
+assert tensor_out.data() == [2.0, 4.0, 6.0]
+```
+
+### Python Example (Extended)
+
+Similarly you can find the same extended example as above:
+
+```python
+mgr = Manager(0, [2])
+
+# Can be initialized with List[] or np.Array
+tensor_in_a = Tensor([2, 2, 2])
+tensor_in_b = Tensor([1, 2, 3])
+tensor_out = Tensor([0, 0, 0])
+
+shaderFilePath = "../../shaders/glsl/opmult.comp"
+
+mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+seq = mgr.create_sequence("op")
+
+mgr.eval_async_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+mgr.eval_await_def()
+
+seq.begin()
+seq.record_tensor_sync_local([tensor_in_a])
+seq.record_tensor_sync_local([tensor_in_b])
+seq.record_tensor_sync_local([tensor_out])
+seq.end()
+
+seq.eval()
+
+assert tensor_out.data() == [2.0, 4.0, 6.0]
+```
+
+For further details you can read the [Python Package documentation](https://kompute.cc/overview/python-package.html) or the [Python Class Reference documentation](https://kompute.cc/overview/python-reference.html).

 ## More examples

@ -281,6 +347,7 @@ We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface
 * [Android NDK Mobile Kompute ML Application](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617)
 * [Game Development Kompute ML in Godot Engine](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0)

+
 ## Build Overview

 The build system provided uses `cmake`, which allows for cross platform builds.
@ -344,3 +411,11 @@ make mk_cmake MK_BUILD_TYPE="Release"
 make mk_run_tests
 ```

+## Motivations
+
+This project started after seeing that a lot of new and renowned ML & DL projects like Pytorch, Tensorflow, Alibaba DNN, Tencent NCNN - among others - have either integrated or are looking to integrate the Vulkan SDK to add mobile (and cross-vendor) GPU support.
+
+The Vulkan SDK offers a great low level interface that enables for highly specialized optimizations - however it comes at a cost of highly verbose code which requires 500-2000 lines of code to even begin writing application code. This has resulted in each of these projects having to implement the same baseline to abstract the non-compute related features of Vulkan. This large amount of non-standardised boiler-plate can result in limited knowledge transfer, higher chance of unique framework implementation bugs being introduced, etc.
+
+We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface (as it's incredibly well designed) but to augment it with a direct focus on Vulkan's GPU computing capabilities. [This article](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) provides a high level overview of the motivations of Kompute, together with a set of hands on examples that introduce both GPU computing as well as the core Vulkan Kompute architecture.
+
--- a/docs/conf.py
+++ b/docs/conf.py
@ -16,13 +16,16 @@


 # -- Project information -----------------------------------------------------
+import sys
+import os
+import kp

 project = 'Vulkan Kompute'
 copyright = '2020, The Institute for Ethical AI & Machine Learning'
 author = 'Alejandro Saucedo'

 # The full version, including alpha/beta/rc tags
-release = '0.4.0'
+release = '0.4.1'


 # -- General configuration ---------------------------------------------------
@ -31,6 +34,7 @@ release = '0.4.0'
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
+    "sphinx.ext.autodoc",
    # Creates .nojekyll config
    'sphinx.ext.githubpages',
    # Integrates with doxygen
--- a/docs/index.rst
+++ b/docs/index.rst
@ -11,13 +11,15 @@ Index
    :maxdepth: 2
    :titlesonly:

-    Class Documentation and C++ Reference <overview/reference>
-    Advanced Examples <overview/advanced-examples>
+    Simple & Advanced Examples <overview/advanced-examples>
+    Python Package Overview <overview/python-package>
    Asynchronous & Parallel Operations <overview/async-parallel>
    Memory Management Principles <overview/memory-management>
    Build System Deep Dive <overview/build-system>
    Converting GLSL/HLSL Shaders to C++ Headers <overview/shaders-to-headers>
    Mobile App Integration (Android) <overview/mobile-android>
    Game Engine Integration (Godot Engine) <overview/game-engine-godot>
+    Python Class Documentation & Reference <overview/python-reference>
+    C++ Class Documentation & Reference <overview/reference>
    Code Index <genindex>

--- a/docs/overview/python-package.rst
+++ b/docs/overview/python-package.rst
@ -0,0 +1,91 @@
+
+Python Package Overview
+========
+
+This section provides an overview of the Python Package from a functionality perspective. If you wish to see all the classes and their respective functions you can find that in the :doc:`Python Class Reference Section <python-reference>`.
+
+Below is a diagram that provides insights on the relationship between Vulkan Kompute objects and Vulkan resources, which primarily encompass ownership of either CPU and/or GPU memory.
+
+.. image:: ../images/kompute-architecture.jpg
+   :width: 70%
+
+Python Components
+^^^^^^^^
+
+The Python package exposes three main classes:
+
+* :class:`kp.Manager` - Manages all high level Vulkan and Kompute resources created
+* :class:`kp.Sequence` - Contains a set of recorded operations that can be reused
+* :class:`kp.Tensor` - Core data component to manage GPU and host data used in operations
+
+One thing that you will notice is that the class :class:`kp::OpBase` and all its relevant operator subclasses are not exposed in Python.
+
+This is primarily because the way to interact with the operations is through the respective :class:`kp.Manager` and :class:`kp.Sequence` functions.
+
+More specifically, it can be through the following functions:
+
+* mgr.eval_<opname> - Runs operation under an existing named sequence
+* mgr.eval_<opname>_def - Runs operation under a new anonymous sequence
+* mgr.eval_async_<opname> - Runs operation asynchronously under an existing named sequence
+* mgr.eval_async_<opname>_def - Runs operation asynchronously under a new anonymous sequence
+* seq.record_<opname> - Records operation in sequence (requires sequence to be in recording mode)
+
+You can see these operations being used in the `Simple Python example <https://kompute.cc/index.html#python-example-simple>`_ and in the `Extended Python Example <https://kompute.cc/index.html#python-example-extended>`_.
+
+Kompute Operation Capabilities
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Handling multiple capabilities of processing can be done by compute shaders being loaded into separate sequences. The example below shows how this can be done:
+
+.. code-block:: python
+    :linenos:
+
+    from kp import Manager
+
+    # We'll assume we have the shader data available
+    from my_spv_shader_data import mult_shader, sum_shader
+
+    mgr = Manager()
+
+    t1 = mgr.build_tensor([2,2,2])
+    t2 = mgr.build_tensor([1,2,3])
+    t3 = mgr.build_tensor([1,2,3])
+
+    # Create multiple separate sequences
+    sq_mult = mgr.create_sequence("SQ_MULT")
+    sq_sum = mgr.create_sequence("SQ_SUM")
+    sq_sync = mgr.create_sequence("SQ_SYNC")
+
+    # Initialize sq_mult
+    sq_mult.begin()
+    sq_mult.record_algo_data([t1, t2, t3], mult_shader)
+    sq_mult.end()
+
+    sq_sum.begin()
+    sq_sum.record_algo_data([t3, t2, t1], sum_shader)
+    sq_sum.end()
+
+    sq_sync.begin()
+    sq_sync.record_tensor_sync_local([t1, t3])
+    sq_sync.end()
+
+    # Run multiple iterations
+    for i in range(10):
+        sq_mult.eval()
+        sq_sum.eval()
+
+    sq_sync.eval()
+
+    print(t1.data(), t2.data(), t3.data())
+
+
+Package Installation 
+^^^^^^^^^^^^^^^^^^^^
+
+The package can be installed through the top level ``setup.py`` by running:
+
+.. code-block:: bash
+
+    pip install .
+
+
+
--- a/docs/overview/python-reference.rst
+++ b/docs/overview/python-reference.rst
@ -0,0 +1,44 @@
+
+
+Python Class Documentation & Reference
+======================================
+
+This section provides a breakdown of the Python classes and what each of their functions provide.
+Below is a diagram that provides insights on the relationship between Vulkan Kompute objects and Vulkan resources, which primarily encompass ownership of either CPU and/or GPU memory.
+
+.. image:: ../images/kompute-architecture.jpg
+   :width: 70%
+
+Manager
+-------
+
+The Kompute Manager provides a high level interface to simplify interaction with underlying :class:`kp.Sequence` of Operations.
+
+.. autoclass:: kp.Manager
+   :members:
+
+
+Sequence
+--------
+
+The Kompute Sequence consists of batches of Kompute Operations, which are executed on a respective GPU queue. The execution of sequences can be synchronous or asynchronous, and it can be coordinated through its respective Vulkan Fence.
+
+.. autoclass:: kp.Sequence
+   :members:
+
+
+Tensor
+-------
+
+The Kompute Tensor is the atomic unit in Kompute, and it is used primarily for handling Host and GPU Device data.
+
+.. autoclass:: kp.Tensor
+   :members:
+
+
+TensorType
+----------
+
+.. automodule:: kp
+   :members:
+
--- a/examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp
+++ b/examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp
@ -44,14 +44,14 @@ void KomputeModelML::train(std::vector<float> yData, std::vector<float> xIData,
        {

            std::shared_ptr<kp::Sequence> sqTensor =
-              mgr.createManagedSequence().lock();
+              mgr.createManagedSequence();

            sqTensor->begin();
            sqTensor->record<kp::OpTensorCreate>(params);
            sqTensor->end();
            sqTensor->eval();

-            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();

            // Record op algo base
            sq->begin();
@ -60,11 +60,11 @@ void KomputeModelML::train(std::vector<float> yData, std::vector<float> xIData,

 #ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
            // Newer versions of Android are able to use shaderc to read raw string
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                    params, std::vector<char>(LR_SHADER.begin(), LR_SHADER.end()));
 #else
            // Older versions of Android require the SPIRV binary directly
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                    params, std::vector<char>(
                            kp::shader_data::shaders_glsl_logisticregression_comp_spv,
                            kp::shader_data::shaders_glsl_logisticregression_comp_spv
--- a/examples/array_multiplication/CMakeLists.txt
+++ b/examples/array_multiplication/CMakeLists.txt
@ -3,20 +3,42 @@ project(kompute_array_mult VERSION 0.1.0)

 set(CMAKE_CXX_STANDARD 14)

+option(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE "Enable if you prefer to use your installed Kompute library" 0)
 option(KOMPUTE_OPT_ENABLE_SPDLOG "Extra compile flags for Kompute, see docs for full list" 0)
 set(KOMPUTE_EXTRA_CXX_FLAGS "" CACHE STRING "Extra compile flags for Kompute, see docs for full list")

+if(KOMPUTE_OPT_ENABLE_SPDLOG)
+    set(KOMPUTE_EXTRA_CXX_FLAGS "${KOMPUTE_EXTRA_CXX_FLAGS} -DKOMPUTE_ENABLE_SPDLOG=1")
+endif()
+
 # It is necessary to pass the DEBUG or RELEASE flag accordingly to Kompute
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DRELEASE=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")

-find_package(kompute REQUIRED)
+if(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE)
+    find_package(kompute REQUIRED)
+else()
+    add_subdirectory(../../ ${CMAKE_CURRENT_BINARY_DIR}/kompute_build)
+endif()
+
 find_package(Vulkan REQUIRED)

+if(KOMPUTE_OPT_ENABLE_SPDLOG)
+    find_package(spdlog REQUIRED)
+endif()
+
 add_executable(kompute_array_mult
    src/Main.cpp)

 target_link_libraries(kompute_array_mult
    kompute::kompute
-    Vulkan::Vulkan
-)
+    Vulkan::Vulkan)
+
+include_directories(
+        ../../single_include/)
+
+if(KOMPUTE_OPT_ENABLE_SPDLOG)
+    target_link_libraries(kompute_array_mult
+        spdlog::spdlog)
+endif()
+
--- a/examples/array_multiplication/README.md
+++ b/examples/array_multiplication/README.md
@ -6,14 +6,32 @@ This example is structured such that you will be able to extend it for your proj

 It contains a cmake build configuration that can be used in your production applications.

+## Building the example
+
+You will notice that it's a standalone project, so you can re-use it for your application.
+
+This project has the option to either import the Kompute dependency relative to the project or use your existing installation of Kompute.
+
+To build you just need to run the cmake command in this folder as follows:
+
+```
+cmake \
+    -Bbuild
+```
+
+You can pass the following optional parameters based on your desired configuration:
+* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`.
+* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter 
+* If you wish to load the shader from a raw GLSL string instead of SPIR-V bytes, pass the preprocessor define through the extra compile flags, e.g. `-DKOMPUTE_EXTRA_CXX_FLAGS="-DKOMPUTE_ANDROID_SHADER_FROM_STRING=1"`
+
 ## Pre-requisites

 In order to run this example, you will need the following dependencies:

 * REQUIRED
-    + Vulkan Kompute library must be accessible
    + The Vulkan SDK must be installed
 * OPTIONAL
+    + Vulkan Kompute library must be accessible (by default it uses the source directory)
    + SPDLOG - for logging
    + FMT - for text formatting

@ -25,50 +43,5 @@ For the other libraries, because they are optional you can just make sure you bu

 Alternatively you can use package managers such as vcpkg to help you install them, although to simplify things you can start without the dependencies first.

-## Set Up Vulkan Kompute Dependency

-You have multiple options to set up Vulkan Kompute. The easiest is to perform a local installation.
-
-For this, you will want to go to the main repo and run the following cmake command, which will configure it without SPDLOG by default.
-
-```
-cmake \
-    -Bbuild
-```
-
-You can pass the following optional parameters based on your desired configuration:
-* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_ENABLE_SPDLOG=1`.
-* If you wish to perform the installation on the local folder instead of in your system you can use `-DCMAKE_INSTALL_PREFIX="build/src/CMakeFiles/Export/"` which will basically ensure that the final files are created in the local directory.
-* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter 
-
-Then you can proceed to run the installation:
-
-* For Windows / Visual Studio you just have to build `INSTALL.vcxproj`
-* For Linux you can just run the `install` target via `make -C build install`
-
-You also have the option to build as `Release` or `Debug` - just make sure that you build your example with the same build/debug flags as required.
-
-## Building the example
-
-Now that you've set up the dependencies / installation of Vulkan Kompute you can build this example.
-
-You will notice that it's a standalone project, so you can re-use it for your application.
-
-To build you just need to run the cmake command in this folder as follows:
-
-```
-cmake \
-    -Bbuild
-```
-
-Make sure to pass the required flags depending on the configuration above:
-* If you built with Debug make sure you build your example with Debug as well
-* If you installed in the local folder, make sure you pass the CMAKE_PREFIX_PATH pointing to the respective folder (e.g. `-DCMAKE_PREFIX_PATH=../../build/src/CMakeFiles/Export/lib/cmake/kompute/` if parent folder is main repo).
-* If you built Vulkan Kompute with spdlog enabled, make sure to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`
-* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter 
-
-Now you just have to build your application as above:
-
-* For Windows / Visual Studio you just have to build and run `kompute_array_mult.vcxproj`
-* For Linux you can just run the `kompute_array_mult` target via `make -C build kompute_array_mult`

--- a/examples/array_multiplication/src/Main.cpp
+++ b/examples/array_multiplication/src/Main.cpp
@ -18,6 +18,7 @@ int main()
    auto tensorInB = mgr.buildTensor({ 0.0, 1.0, 2.0 });
    auto tensorOut = mgr.buildTensor({ 0.0, 0.0, 0.0 });

+#ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
    std::string shader(R"(
        // The version to use 
        #version 450
@ -37,9 +38,17 @@ int main()
        }
      )");

-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
            { tensorInA, tensorInB, tensorOut },
            std::vector<char>(shader.begin(), shader.end()));
+#else
+    mgr.evalOpDefault<kp::OpAlgoBase>(
+            { tensorInA, tensorInB, tensorOut },
+            std::vector<char>(
+            kp::shader_data::shaders_glsl_opmult_comp_spv,
+            kp::shader_data::shaders_glsl_opmult_comp_spv
+                + kp::shader_data::shaders_glsl_opmult_comp_spv_len));
+#endif

    mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorOut});

--- a/examples/godot_examples/custom_module/kompute_summator/KomputeSummatorNode.h
+++ b/examples/godot_examples/custom_module/kompute_summator/KomputeSummatorNode.h
@ -24,7 +24,7 @@ protected:

 private:
    kp::Manager mManager;
-    std::weak_ptr<kp::Sequence> mSequence;
+    std::shared_ptr<kp::Sequence> mSequence;
    std::shared_ptr<kp::Tensor> mPrimaryTensor;
    std::shared_ptr<kp::Tensor> mSecondaryTensor;
 };
--- a/examples/godot_examples/gdnative_shared/src/KomputeSummator.cpp
+++ b/examples/godot_examples/gdnative_shared/src/KomputeSummator.cpp
@ -16,12 +16,7 @@ void KomputeSummator::add(float value) {
    // Set the new data in the local device
    this->mSecondaryTensor->setData({value});
    // Execute recorded sequence
-    if (std::shared_ptr<kp::Sequence> sq = this->mSequence.lock()) {
-        sq->eval();
-    }
-    else {
-        throw std::runtime_error("Sequence pointer no longer available");
-    }
+    this->mSequence->eval();
 }

 void KomputeSummator::reset() {
@ -38,9 +33,7 @@ void KomputeSummator::_init() {
    this->mSequence = this->mManager.getOrCreateManagedSequence("AdditionSeq");

    // We now record the steps in the sequence
-    if (std::shared_ptr<kp::Sequence> sq = this->mSequence.lock())
    {
-
        std::string shader(R"(
            #version 450

@ -55,26 +48,23 @@ void KomputeSummator::_init() {
            }
        )");

-        sq->begin();
+        this->mSequence->begin();

        // First we ensure secondary tensor loads to GPU
        // No need to sync the primary tensor as it should not be changed
-        sq->record<kp::OpTensorSyncDevice>(
+        this->mSequence->record<kp::OpTensorSyncDevice>(
                { this->mSecondaryTensor });

        // Then we run the operation with both tensors
-        sq->record<kp::OpAlgoBase<>>(
+        this->mSequence->record<kp::OpAlgoBase>(
            { this->mPrimaryTensor, this->mSecondaryTensor }, 
            std::vector<char>(shader.begin(), shader.end()));

        // We map the result back to local 
-        sq->record<kp::OpTensorSyncLocal>(
+        this->mSequence->record<kp::OpTensorSyncLocal>(
                { this->mPrimaryTensor });

-        sq->end();
-    }
-    else {
-        throw std::runtime_error("Sequence pointer no longer available");
+        this->mSequence->end();
    }
 }

--- a/examples/godot_examples/gdnative_shared/src/KomputeSummator.hpp
+++ b/examples/godot_examples/gdnative_shared/src/KomputeSummator.hpp
@ -26,7 +26,7 @@ public:

 private:
    kp::Manager mManager;
-    std::weak_ptr<kp::Sequence> mSequence;
+    std::shared_ptr<kp::Sequence> mSequence;
    std::shared_ptr<kp::Tensor> mPrimaryTensor;
    std::shared_ptr<kp::Tensor> mSecondaryTensor;
 };
--- a/examples/godot_logistic_regression/custom_module/kompute_model_ml/KomputeModelMLNode.cpp
+++ b/examples/godot_logistic_regression/custom_module/kompute_model_ml/KomputeModelMLNode.cpp
@ -51,14 +51,14 @@ void KomputeModelMLNode::train(Array yArr, Array xIArr, Array xJArr) {
        kp::Manager mgr;

            std::shared_ptr<kp::Sequence> sqTensor =
-              mgr.createManagedSequence().lock();
+              mgr.createManagedSequence();

            sqTensor->begin();
            sqTensor->record<kp::OpTensorCreate>(params);
            sqTensor->end();
            sqTensor->eval();

-            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();

            // Record op algo base
            sq->begin();
@ -67,11 +67,11 @@ void KomputeModelMLNode::train(Array yArr, Array xIArr, Array xJArr) {

 #ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
            // Newer versions of Android are able to use shaderc to read raw string
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                    params, std::vector<char>(LR_SHADER.begin(), LR_SHADER.end()));
 #else
            // Older versions of Android require the SPIRV binary directly
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                    params, std::vector<char>(
                            kp::shader_data::shaders_glsl_logisticregression_comp_spv,
                            kp::shader_data::shaders_glsl_logisticregression_comp_spv
--- a/examples/godot_logistic_regression/gdnative_shared/src/KomputeModelML.cpp
+++ b/examples/godot_logistic_regression/gdnative_shared/src/KomputeModelML.cpp
@ -56,14 +56,14 @@ void KomputeModelML::train(Array yArr, Array xIArr, Array xJArr) {

        {
            std::shared_ptr<kp::Sequence> sqTensor =
-              mgr.createManagedSequence().lock();
+              mgr.createManagedSequence();

            sqTensor->begin();
            sqTensor->record<kp::OpTensorCreate>(params);
            sqTensor->end();
            sqTensor->eval();

-            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();

            // Record op algo base
            sq->begin();
@ -72,11 +72,11 @@ void KomputeModelML::train(Array yArr, Array xIArr, Array xJArr) {

 #ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
            // Newer versions of Android are able to use shaderc to read raw string
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                    params, std::vector<char>(LR_SHADER.begin(), LR_SHADER.end()));
 #else
            // Older versions of Android require the SPIRV binary directly
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                    params, std::vector<char>(
                            kp::shader_data::shaders_glsl_logisticregression_comp_spv,
                            kp::shader_data::shaders_glsl_logisticregression_comp_spv
--- a/examples/logistic_regression/CMakeLists.txt
+++ b/examples/logistic_regression/CMakeLists.txt
@ -3,6 +3,7 @@ project(kompute_linear_reg VERSION 0.1.0)

 set(CMAKE_CXX_STANDARD 14)

+option(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE "Enable if you prefer to use your installed Kompute library" 0)
 option(KOMPUTE_OPT_ENABLE_SPDLOG "Extra compile flags for Kompute, see docs for full list" 0)
 set(KOMPUTE_EXTRA_CXX_FLAGS "" CACHE STRING "Extra compile flags for Kompute, see docs for full list")

@ -14,12 +15,16 @@ endif()
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DRELEASE=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")

-find_package(kompute REQUIRED)
+if(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE)
+    find_package(kompute REQUIRED)
+else()
+    add_subdirectory(../../ ${CMAKE_CURRENT_BINARY_DIR}/kompute_build)
+endif()
+
 find_package(Vulkan REQUIRED)

 if(KOMPUTE_OPT_ENABLE_SPDLOG)
    find_package(spdlog REQUIRED)
-    find_package(fmt REQUIRED)
 endif()

 add_executable(kompute_linear_reg
@ -30,11 +35,11 @@ target_link_libraries(kompute_linear_reg
    Vulkan::Vulkan
 )

+include_directories(
+        ../../single_include/)
+
 if(KOMPUTE_OPT_ENABLE_SPDLOG)
-    target_link_libraries(kompute_linear_reg
-        kompute::kompute
-        fmt::fmt
-        spdlog::spdlog
-    )
+    target_link_libraries(kompute_linear_reg # the executable this example defines
+        spdlog::spdlog)
 endif()

--- a/examples/logistic_regression/README.md
+++ b/examples/logistic_regression/README.md
@ -6,54 +6,12 @@ This example is structured such that you will be able to extend it for your proj

 It contains a cmake build configuration that can be used in your production applications.

-## Pre-requisites
-
-In order to run this example, you will need the following dependencies:
-
-* REQUIRED
-    + Vulkan Kompute library must be accessible
-    + The Vulkan SDK must be installed
-* OPTIONAL
-    + SPDLOG - for logging
-    + FMT - for text formatting
-
-We will cover how you can install Vulkan Kompute in the next section.
-
-For the Vulkan SDK, the simplest way to install it is through [their website](https://vulkan.lunarg.com/sdk/home). You just have to follow the instructions for the relevant platform.
-
-For the other libraries, because they are optional you can just make sure you build and install Kompute with these disabled (this will be covered in more detail below).
-
-Alternatively you can use package managers such as vcpkg to help you install them, although to simplify things you can start without the dependencies first.
-
-## Set Up Vulkan Kompute Dependency
-
-You have multiple options to set up Vulkan Kompute. The easiest is to perform a local installation.
-
-For this, you will want to go to the main repo and run the following cmake command, which will configure it without SPDLOG by default.
-
-```
-cmake \
-    -Bbuild
-```
-
-You can pass the following optional parameters based on your desired configuration:
-* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_ENABLE_SPDLOG=1`.
-* If you wish to perform the installation on the local folder instead of in your system you can use `-DCMAKE_INSTALL_PREFIX="build/src/CMakeFiles/Export/"` which will basically ensure that the final files are created in the local directory.
-* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter 
-
-Then you can proceed to run the installation:
-
-* For Windows / Visual Studio you just have to build `INSTALL.vcxproj`
-* For Linux you can just run the `install` target via `make -C build install`
-
-You also have the option to build as `Release` or `Debug` - just make sure that you build your example with the same build/debug flags as required.
-
 ## Building the example

-Now that you've set up the dependencies / installation of Vulkan Kompute you can build this example.
-
 You will notice that it's a standalone project, so you can re-use it for your application.

+This project has the option to either import the Kompute dependency relative to the project or use your existing installation of Kompute.
+
 To build you just need to run the cmake command in this folder as follows:

 ```
@ -61,14 +19,19 @@ cmake \
    -Bbuild
 ```

-Make sure to pass the required flags depending on the configuration above:
-* If you built with Debug make sure you build your example with Debug as well
-* If you installed in the local folder, make sure you pass the CMAKE_PREFIX_PATH pointing to the respective folder (e.g. `-DCMAKE_PREFIX_PATH=../../build/src/CMakeFiles/Export/lib/cmake/kompute/` if parent folder is main repo).
-* If you built Vulkan Kompute with spdlog enabled, make sure to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`
+You can pass the following optional parameters based on your desired configuration:
+* If you wish to build with spdlog support you just have to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`.
 * If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter 
+* If you wish to load shader from raw glsl string instead of spirv bytes you can use `-DKOMPUTE_ANDROID_SHADER_FROM_STRING`

-Now you just have to build your application as above:
+## Pre-requisites

-* For Windows / Visual Studio you just have to build and run `kompute_linear_reg.vcxproj`
-* For Linux you can just run the `kompute_linear_reg` target via `make -C build kompute_linear_reg`
+In order to run this example, you will need the following dependencies:
+
+* REQUIRED
+    + The Vulkan SDK must be installed
+* OPTIONAL
+    + An installed Vulkan Kompute library (by default the example builds Kompute from the source directory)
+    + SPDLOG - for logging
+    + FMT - for text formatting

--- a/examples/logistic_regression/src/Main.cpp
+++ b/examples/logistic_regression/src/Main.cpp
@ -36,22 +36,30 @@ int main()
    kp::Manager mgr;

    std::shared_ptr<kp::Sequence> sqTensor =
-      mgr.createManagedSequence().lock();
+      mgr.createManagedSequence();

    sqTensor->begin();
    sqTensor->record<kp::OpTensorCreate>(params);
    sqTensor->end();
    sqTensor->eval();

-    std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+    std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();

    // Record op algo base
    sq->begin();

    sq->record<kp::OpTensorSyncDevice>({ wIn, bIn });

-    sq->record<kp::OpAlgoBase<>>(
+#ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
+    sq->record<kp::OpAlgoBase>(
      params, "shaders/glsl/logistic_regression.comp");
+#else
+    sq->record<kp::OpAlgoBase>(
+        params, std::vector<char>(
+                kp::shader_data::shaders_glsl_logisticregression_comp_spv,
+                kp::shader_data::shaders_glsl_logisticregression_comp_spv
+                    + kp::shader_data::shaders_glsl_logisticregression_comp_spv_len));
+#endif

    sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@ -0,0 +1,11 @@
+
+add_subdirectory(pybind11)
+pybind11_add_module(kp src/main.cpp)
+
+include_directories(
+        ${PROJECT_SOURCE_DIR}/single_include/)
+
+target_link_libraries(
+    kp PRIVATE
+    kompute::kompute)
+
--- a/python/README.md
+++ b/python/README.md
@ -0,0 +1,2 @@
+# Python Bindings for Vulkan Kompute
+
--- a/python/pybind11
+++ b/python/pybind11
@ -0,0 +1 @@
+Subproject commit 06a54018c8a9fd9a7be5f5b56414b5da9259f637
--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@ -0,0 +1,160 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include <kompute/Kompute.hpp>
+
+namespace py = pybind11;
+
+PYBIND11_MODULE(kp, m) {
+
+    py::enum_<kp::Tensor::TensorTypes>(m, "TensorTypes", "Enum with GPU memory types for Tensor.")
+        .value("device", kp::Tensor::TensorTypes::eDevice, "Tensor holding data in GPU memory.")
+        .value("staging", kp::Tensor::TensorTypes::eStaging, "Tensor used for transfer of data to device.")
+        .value("storage", kp::Tensor::TensorTypes::eStorage, "Tensor with host visible gpu memory.")
+        .export_values();
+
+    py::class_<kp::Tensor, std::shared_ptr<kp::Tensor>>(m, "Tensor", "Structured data used in GPU operations.")
+        .def(py::init(
+            [](const std::vector<float>& data) {
+                return std::unique_ptr<kp::Tensor>(new kp::Tensor(data));
+            }), "Initialiser with only list of data components.")
+        .def(py::init(
+            [](const std::vector<float>& data, kp::Tensor::TensorTypes tensorTypes) {
+                return std::unique_ptr<kp::Tensor>(new kp::Tensor(data, tensorTypes));
+            }), "Initialiser with list of data components and tensor GPU memory type.")
+        .def("data", &kp::Tensor::data, "Retrieves the data as a list containing the local Tensor memory data.")
+        .def("size", &kp::Tensor::size, "Retrieves the size of the Tensor data as per the local Tensor memory.")
+        .def("tensor_type", &kp::Tensor::tensorType, "Retrieves the memory type of the tensor.")
+        .def("is_init", &kp::Tensor::isInit, "Checks whether the tensor GPU memory has been initialised.")
+        .def("set_data", &kp::Tensor::setData, "Overrides the data in the local Tensor memory.")
+        .def("map_data_from_host", &kp::Tensor::mapDataFromHostMemory, "Maps data into GPU memory from tensor local data.")
+        .def("map_data_into_host", &kp::Tensor::mapDataIntoHostMemory, "Maps data from GPU memory into tensor local data.");
+
+    py::class_<kp::Sequence, std::shared_ptr<kp::Sequence>>(m, "Sequence")
+        .def("init", &kp::Sequence::init, "Initialises Vulkan resources within sequence using provided device.")
+        // record
+        .def("begin", &kp::Sequence::begin, "Clears previous commands and starts recording commands in sequence which can be run in batch.")
+        .def("end", &kp::Sequence::end, "Stops listening and recording for new commands.")
+        // eval
+        .def("eval", &kp::Sequence::eval, "Executes the currently recorded commands synchronously by waiting on Vulkan Fence.")
+        .def("eval_async", &kp::Sequence::evalAsync, "Executes the currently recorded commands asynchronously.")
+        .def("eval_await", &kp::Sequence::evalAwait, "Waits until the execution finishes using Vulkan Fence.")
+        // status
+        .def("is_running", &kp::Sequence::isRunning, "Checks whether the Sequence operations are currently still executing.")
+        .def("is_rec", &kp::Sequence::isRecording, "Checks whether the Sequence is currently in recording mode.")
+        .def("is_init", &kp::Sequence::isInit, "Checks if the Sequence has been initialized")
+        // record
+        .def("record_tensor_create", &kp::Sequence::record<kp::OpTensorCreate>,
+            "Records operation to create and initialise tensor GPU memory and buffer")
+        .def("record_tensor_copy", &kp::Sequence::record<kp::OpTensorCopy>,
+            "Records operation to copy one tensor to one or many tensors")
+        .def("record_tensor_sync_device", &kp::Sequence::record<kp::OpTensorSyncDevice>,
+            "Records operation to sync tensor from local memory to GPU memory")
+        .def("record_tensor_sync_local", &kp::Sequence::record<kp::OpTensorSyncLocal>,
+            "Records operation to sync tensor(s) from GPU memory to local memory using staging tensors")
+        .def("record_algo_mult", &kp::Sequence::record<kp::OpMult>,
+            "Records operation to run multiplication compute shader to two input tensors and an output tensor")
+        .def("record_algo_file", &kp::Sequence::record<kp::OpAlgoBase, std::string>,
+            "Records an operation using a custom shader provided from a shader path")
+        .def("record_algo_data", &kp::Sequence::record<kp::OpAlgoBase, std::vector<char>>,
+            "Records an operation using a custom shader provided as raw string or spirv bytes")
+        .def("record_algo_lro", &kp::Sequence::record<kp::OpAlgoLhsRhsOut>,
+            "Records operation to run left right out operation with custom shader");
+
+    py::class_<kp::Manager>(m, "Manager")
+        .def(py::init(), "Default initializer uses device 0 and first compute compatible GPU queueFamily")
+        .def(py::init(
+            [](uint32_t physicalDeviceIndex) {
+                return std::unique_ptr<kp::Manager>(new kp::Manager(physicalDeviceIndex));
+            }), "Manager initialiser can provide specified device index but will use first compute compatible GPU queueFamily")
+        .def(py::init(
+            [](uint32_t physicalDeviceIndex, const std::vector<uint32_t>& familyQueueIndices) {
+                return std::unique_ptr<kp::Manager>(new kp::Manager(physicalDeviceIndex, familyQueueIndices));
+            }), "Manager initialiser can provide specified device and array of GPU queueFamilies to load.")
+        .def("get_create_sequence", &kp::Manager::getOrCreateManagedSequence, "Get a Sequence or create a new one with given name")
+        .def("create_sequence", &kp::Manager::createManagedSequence,
+                py::arg("name"), py::arg("queueIndex") = 0, "Create a sequence with specific name and specified index of available queues")
+        .def("build_tensor", &kp::Manager::buildTensor, 
+                py::arg("data"), py::arg("tensorType") = kp::Tensor::TensorTypes::eDevice,
+                "Build and initialise tensor")
+        // Await functions
+        .def("eval_await", &kp::Manager::evalOpAwait,
+                py::arg("sequenceName"), py::arg("waitFor") = UINT64_MAX,
+                "Awaits for asynchronous operation on a named Sequence")
+        .def("eval_await_def", &kp::Manager::evalOpAwaitDefault,
+                py::arg("waitFor") = UINT64_MAX, "Awaits for asynchronous operation on the last anonymous Sequence created")
+        // eval default
+        .def("eval_tensor_create_def", &kp::Manager::evalOpDefault<kp::OpTensorCreate>,
+            "Evaluates operation to create and initialise tensor GPU memory and buffer with new anonymous Sequence")
+        .def("eval_tensor_copy_def", &kp::Manager::evalOpDefault<kp::OpTensorCopy>,
+            "Evaluates operation to copy one tensor to one or many tensors with new anonymous Sequence")
+        .def("eval_tensor_sync_device_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncDevice>,
+            "Evaluates operation to sync tensor from local memory to GPU memory with new anonymous Sequence")
+        .def("eval_tensor_sync_local_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncLocal>,
+            "Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with new anonymous Sequence")
+        .def("eval_algo_mult_def", &kp::Manager::evalOpDefault<kp::OpMult>,
+            "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with new anonymous Sequence")
+        .def("eval_algo_file_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::string>,
+            "Evaluates an operation using a custom shader provided from a shader path with new anonymous Sequence")
+        .def("eval_algo_data_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::vector<char>>,
+            "Evaluates an operation using a custom shader provided as raw string or spirv bytes with new anonymous Sequence")
+        .def("eval_algo_lro_def", &kp::Manager::evalOpDefault<kp::OpAlgoLhsRhsOut>,
+            "Evaluates operation to run left right out operation with custom shader with new anonymous Sequence")
+        // eval
+        .def("eval_tensor_create", &kp::Manager::evalOp<kp::OpTensorCreate>,
+            "Evaluates operation to create and initialise tensor GPU memory and buffer with explicitly named Sequence")
+        .def("eval_tensor_copy", &kp::Manager::evalOp<kp::OpTensorCopy>,
+            "Evaluates operation to copy one tensor to one or many tensors with explicitly named Sequence")
+        .def("eval_tensor_sync_device", &kp::Manager::evalOp<kp::OpTensorSyncDevice>,
+            "Evaluates operation to sync tensor from local memory to GPU memory with explicitly named Sequence")
+        .def("eval_tensor_sync_local", &kp::Manager::evalOp<kp::OpTensorSyncLocal>,
+            "Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence")
+        .def("eval_algo_mult", &kp::Manager::evalOp<kp::OpMult>,
+            "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
+        .def("eval_algo_file", &kp::Manager::evalOp<kp::OpAlgoBase, std::string>,
+            "Evaluates an operation using a custom shader provided from a shader path with explicitly named Sequence")
+        .def("eval_algo_data", &kp::Manager::evalOp<kp::OpAlgoBase, std::vector<char>>,
+            "Evaluates an operation using a custom shader provided as raw string or spirv bytes with explicitly named Sequence")
+        .def("eval_algo_lro", &kp::Manager::evalOp<kp::OpAlgoLhsRhsOut>,
+            "Evaluates operation to run left right out operation with custom shader with explicitly named Sequence")
+        // eval async default
+        .def("eval_async_tensor_create_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCreate>,
+            "Evaluates asynchronously operation to create and initialise tensor GPU memory and buffer with anonymous Sequence")
+        .def("eval_async_tensor_copy_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCopy>,
+            "Evaluates asynchronously operation to copy one tensor to one or many tensors with anonymous Sequence")
+        .def("eval_async_tensor_sync_device_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncDevice>,
+            "Evaluates asynchronously operation to sync tensor from local memory to GPU memory with anonymous Sequence")
+        .def("eval_async_tensor_sync_local_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncLocal>,
+            "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with anonymous Sequence")
+        .def("eval_async_algo_mult_def", &kp::Manager::evalOpAsyncDefault<kp::OpMult>,
+            "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with anonymous Sequence")
+        .def("eval_async_algo_file_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::string>,
+            "Evaluates asynchronously an operation using a custom shader provided from a shader path with anonymous Sequence")
+        .def("eval_async_algo_data_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::vector<char>>,
+            "Evaluates asynchronously an operation using a custom shader provided as raw string or spirv bytes with anonymous Sequence")
+        .def("eval_async_algo_lro_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoLhsRhsOut>,
+            "Evaluates asynchronously operation to run left right out operation with custom shader with anonymous Sequence")
+        // eval async
+        .def("eval_async_tensor_create", &kp::Manager::evalOpAsync<kp::OpTensorCreate>,
+            "Evaluates asynchronously operation to create and initialise tensor GPU memory and buffer with explicitly named Sequence")
+        .def("eval_async_tensor_copy", &kp::Manager::evalOpAsync<kp::OpTensorCopy>,
+            "Evaluates asynchronously operation to copy one tensor to one or many tensors with explicitly named Sequence")
+        .def("eval_async_tensor_sync_device", &kp::Manager::evalOpAsync<kp::OpTensorSyncDevice>,
+            "Evaluates asynchronously operation to sync tensor from local memory to GPU memory with explicitly named Sequence")
+        .def("eval_async_tensor_sync_local", &kp::Manager::evalOpAsync<kp::OpTensorSyncLocal>,
+            "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence")
+        .def("eval_async_algo_mult", &kp::Manager::evalOpAsync<kp::OpMult>,
+            "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
+        .def("eval_async_algo_file", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::string>,
+            "Evaluates asynchronously an operation using a custom shader provided from a shader path with explicitly named Sequence")
+        .def("eval_async_algo_data", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::vector<char>>,
+            "Evaluates asynchronously an operation using a custom shader provided as raw string or spirv bytes with explicitly named Sequence")
+        .def("eval_async_algo_lro", &kp::Manager::evalOpAsync<kp::OpAlgoLhsRhsOut>,
+            "Evaluates asynchronously operation to run left right out operation with custom shader with explicitly named Sequence");
+
+#ifdef VERSION_INFO
+    m.attr("__version__") = VERSION_INFO;
+#else
+    m.attr("__version__") = "dev";
+#endif
+}
--- a/python/test/test_kompute.py
+++ b/python/test/test_kompute.py
@ -0,0 +1,110 @@
+
+from kp import Tensor, Manager, Sequence
+
+def test_opmult():
+    """
+    Test basic OpMult operation
+    """
+
+    tensor_in_a = Tensor([2, 2, 2])
+    tensor_in_b = Tensor([1, 2, 3])
+    tensor_out = Tensor([0, 0, 0])
+
+    mgr = Manager()
+
+    mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+    mgr.eval_algo_mult_def([tensor_in_a, tensor_in_b, tensor_out])
+
+    mgr.eval_tensor_sync_local_def([tensor_out])
+
+    assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+def test_opalgobase_data():
+    """
+    Test OpAlgoBase operation with the shader provided as raw data
+    """
+
+    tensor_in_a = Tensor([2, 2, 2])
+    tensor_in_b = Tensor([1, 2, 3])
+    tensor_out = Tensor([0, 0, 0])
+
+    mgr = Manager()
+
+    shaderData = """
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        // The input tensors bind index is relative to index in parameter passed
+        layout(set = 0, binding = 0) buffer bina { float tina[]; };
+        layout(set = 0, binding = 1) buffer binb { float tinb[]; };
+        layout(set = 0, binding = 2) buffer bout { float tout[]; };
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+            tout[index] = tina[index] * tinb[index];
+        }
+    """
+
+    mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+    mgr.eval_algo_data_def([tensor_in_a, tensor_in_b, tensor_out], list(shaderData))
+
+    mgr.eval_tensor_sync_local_def([tensor_out])
+
+    assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+
+def test_opalgobase_file():
+    """
+    Test OpAlgoBase operation with the shader loaded from a file path
+    """
+
+    tensor_in_a = Tensor([2, 2, 2])
+    tensor_in_b = Tensor([1, 2, 3])
+    tensor_out = Tensor([0, 0, 0])
+
+    mgr = Manager()
+
+    shaderFilePath = "../../shaders/glsl/opmult.comp"
+
+    mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+    mgr.eval_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+
+    mgr.eval_tensor_sync_local_def([tensor_out])
+
+    assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+def test_sequence():
+    """
+    Test recording and evaluating operations with an explicitly created Sequence
+    """
+
+    mgr = Manager(0, [2])
+
+    tensor_in_a = Tensor([2, 2, 2])
+    tensor_in_b = Tensor([1, 2, 3])
+    tensor_out = Tensor([0, 0, 0])
+
+    mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+    seq = mgr.create_sequence("op")
+
+    shaderFilePath = "../../shaders/glsl/opmult.comp"
+    mgr.eval_async_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+    mgr.eval_await_def()
+
+    seq.begin()
+    seq.record_tensor_sync_local([tensor_in_a])
+    seq.record_tensor_sync_local([tensor_in_b])
+    seq.record_tensor_sync_local([tensor_out])
+    seq.end()
+
+    seq.eval()
+
+    assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+if __name__ == "__main__":
+    test_sequence()
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,76 @@
+import os
+import re
+import sys
+import platform
+import subprocess
+
+from setuptools import setup, Extension
+from setuptools.command.build_ext import build_ext
+from distutils.version import LooseVersion
+
+
+class CMakeExtension(Extension):
+    def __init__(self, name, sourcedir=''):
+        Extension.__init__(self, name, sources=[])
+        self.sourcedir = os.path.abspath(sourcedir)
+
+
+class CMakeBuild(build_ext):
+    def run(self):
+        try:
+            out = subprocess.check_output(['cmake', '--version'])
+        except OSError:
+            raise RuntimeError("CMake must be installed to build the following extensions: " +
+                               ", ".join(e.name for e in self.extensions))
+
+        if platform.system() == "Windows":
+            cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1))
+            if cmake_version < '3.1.0':
+                raise RuntimeError("CMake >= 3.1.0 is required on Windows")
+
+        for ext in self.extensions:
+            self.build_extension(ext)
+
+    def build_extension(self, ext):
+        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
+        # required for auto-detection of auxiliary "native" libs
+        if not extdir.endswith(os.path.sep):
+            extdir += os.path.sep
+
+        cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
+                      '-DKOMPUTE_OPT_BUILD_PYTHON=1',
+                      '-DKOMPUTE_OPT_BUILD_SINGLE_HEADER=1',
+                      '-DPYTHON_EXECUTABLE=' + sys.executable]
+
+        cfg = 'Debug' if self.debug else 'Release'
+        build_args = ['--config', cfg]
+
+        if platform.system() == "Windows":
+            cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir)]
+            if sys.maxsize > 2**32:
+                cmake_args += ['-A', 'x64']
+            build_args += ['--', '/m']
+        else:
+            cmake_args += ['-DKOMPUTE_EXTRA_CXX_FLAGS="-fPIC"']
+            cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
+            build_args += ['--', '-j2']
+
+        env = os.environ.copy()
+        env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
+                                                              self.distribution.get_version())
+        if not os.path.exists(self.build_temp):
+            os.makedirs(self.build_temp)
+
+        subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
+        subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
+
+setup(
+    name='kp',
+    version='0.0.1',
+    author='Alejandro Saucedo',
+    description='Vulkan Kompute: Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU processing usecases.',
+    long_description='',
+    ext_modules=[CMakeExtension('kp')],
+    cmdclass=dict(build_ext=CMakeBuild),
+    zip_safe=False,
+)
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@ -1100,6 +1100,12 @@ class Sequence
     */
    bool isInit();

+    /**
+     * Destroys and frees the GPU resources which include the buffer and memory
+     * and sets the sequence as init=False.
+     */
+    void freeMemoryDestroyGPUResources();
+
    /**
     * Record function for operation to be added to the GPU queue in batch. This
     * template requires classes to be derived from the OpBase class. This
@ -1301,9 +1307,9 @@ class Manager
     *
     * @param sequenceName The name for the named sequence to be retrieved or
     * created
-     * @return Weak pointer to the manager owned sequence resource
+     * @return Shared pointer to the manager owned sequence resource
     */
-    std::weak_ptr<Sequence> getOrCreateManagedSequence(
+    std::shared_ptr<Sequence> getOrCreateManagedSequence(
      std::string sequenceName);

    /**
@ -1315,8 +1321,9 @@ class Manager
     * @param queueIndex The queue to use from the available queues
     * @return Shared pointer to the manager owned sequence resource
     */
-    std::weak_ptr<Sequence> createManagedSequence(std::string sequenceName = "",
-                                                  uint32_t queueIndex = 0);
+    std::shared_ptr<Sequence> createManagedSequence(
+      std::string sequenceName = "",
+      uint32_t queueIndex = 0);

    /**
     * Function that evaluates operation against named sequence.
@ -1332,22 +1339,21 @@ class Manager
                TArgs&&... params)
    {
        SPDLOG_DEBUG("Kompute Manager evalOp triggered");
-        std::weak_ptr<Sequence> sqWeakPtr =
+        std::shared_ptr<kp::Sequence> sq =
          this->getOrCreateManagedSequence(sequenceName);

-        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
-            sq->begin();
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
+        sq->begin();

-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
-            sq->record<T>(tensors, std::forward<TArgs>(params)...);
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
+        sq->record<T>(tensors, std::forward<TArgs>(params)...);

-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
-            sq->end();
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
+        sq->end();
+
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
+        sq->eval();

-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
-            sq->eval();
-        }
        SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS");
    }

@ -1385,26 +1391,21 @@ class Manager
    {
        SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered");

-        std::weak_ptr<Sequence> sqWeakPtr =
+        std::shared_ptr<kp::Sequence> sq =
          this->getOrCreateManagedSequence(sequenceName);

-        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
+        sq->begin();

-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
-            sq->begin();
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
+        sq->record<T>(tensors, std::forward<TArgs>(params)...);

-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
-            sq->record<T>(tensors, std::forward<TArgs>(params)...);
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
+        sq->end();

-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
-            sq->end();
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
+        sq->evalAsync();

-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
-            sq->evalAsync();
-        } else {
-            SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found",
-                         sequenceName);
-        }
        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
    }

@ -1620,20 +1621,17 @@ namespace kp {
 * Operation that provides a general abstraction that simplifies the use of 
 * algorithm and parameter components which can be used with shaders.
 * By default it enables the user to provide a dynamic number of tensors
- * which are then passed as inputs. 
- *
- * All of these tensors are expected to be initlaised and this is checked with throw std exception in the init function.
- *
- * See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
- * 
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * which are then passed as inputs.
 */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
 class OpAlgoBase : public OpBase
 {
  public:
+    struct KomputeWorkgroup {
+        uint32_t x;
+        uint32_t y;
+        uint32_t z;
+    };
+
    /**
     *  Base constructor, should not be used unless explicitly intended.
     */
@ -1649,11 +1647,13 @@ class OpAlgoBase : public OpBase
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
     * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
     */
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors);
+           std::vector<std::shared_ptr<Tensor>>& tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());

    /**
     * Constructor that enables a file to be passed to the operation with
@ -1664,13 +1664,15 @@ class OpAlgoBase : public OpBase
     * @param device Vulkan logical device for passing to Algorithm
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
-     * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+     * @param shaderFilePath Parameter to specify the shader to load (either in spirv or raw format)
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
     */
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
           std::vector<std::shared_ptr<Tensor>>& tensors,
-           std::string shaderFilePath);
+           std::string shaderFilePath,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());

    /**
     * Constructor that enables raw shader data to be passed to the main operation
@ -1681,12 +1683,14 @@ class OpAlgoBase : public OpBase
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
     * @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
     */
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
           std::vector<std::shared_ptr<Tensor>>& tensors,
-           const std::vector<char>& shaderDataRaw);
+           const std::vector<char>& shaderDataRaw,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());

    /**
     * Default destructor, which is in charge of destroying the algorithm
@ -1733,9 +1737,7 @@ class OpAlgoBase : public OpBase

    // -------------- ALWAYS OWNED RESOURCES

-    uint32_t mX;
-    uint32_t mY;
-    uint32_t mZ;
+    KomputeWorkgroup mKomputeWorkgroup;

    std::string mShaderFilePath; ///< Optional member variable which can be provided for the OpAlgoBase to find the data automatically and load for processing
    std::vector<char> mShaderDataRaw; ///< Optional member variable which can be provided to contain either the raw shader content or the spirv binary content
@ -1745,177 +1747,6 @@ class OpAlgoBase : public OpBase

 } // End namespace kp

-// Including implementation for template class
-#ifndef OPALGOBASE_IMPL
-#define OPALGOBASE_IMPL
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors)
-  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
-
-    // The dispatch size is set up based on either explicitly provided template
-    // parameters or by default it would take the shape and size of the tensors
-    if (tX > 0) {
-        // If at least the x value is provided we use mainly the parameters
-        // provided
-        this->mX = tX;
-        this->mY = tY > 0 ? tY : 1;
-        this->mZ = tZ > 0 ? tZ : 1;
-    } else {
-        this->mX = tensors[0]->size();
-        this->mY = 1;
-        this->mZ = 1;
-    }
-    SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
-                 this->mX,
-                 this->mY,
-                 this->mZ);
-
-    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           std::string shaderFilePath)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
-
-    this->mShaderFilePath = shaderFilePath;
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           const std::vector<char>& shaderDataRaw)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
-
-    this->mShaderDataRaw = shaderDataRaw;
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase init called");
-
-    if (this->mTensors.size() < 1) {
-        throw std::runtime_error(
-          "Kompute OpAlgoBase called with less than 1 tensor");
-    } 
-
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        if(!tensor->isInit()) {
-            throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised.");
-        }
-    }
-
-    SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        tensor->recordBufferMemoryBarrier(
-          this->mCommandBuffer,
-          vk::AccessFlagBits::eHostWrite,
-          vk::AccessFlagBits::eShaderRead,
-          vk::PipelineStageFlagBits::eHost,
-          vk::PipelineStageFlagBits::eComputeShader);
-    }
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::preEval()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::postEval()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-std::vector<char> OpAlgoBase<tX, tY, tZ>::fetchSpirvBinaryData() 
-{
-    SPDLOG_WARN(
-      "Kompute OpAlgoBase Running shaders directly from spirv file");
-
-    if (this->mShaderFilePath.size()) {
-        std::ifstream fileStream(this->mShaderFilePath,
-                                 std::ios::binary | std::ios::in | std::ios::ate);
-
-        if (!fileStream.good()) {
-            throw std::runtime_error("Error reading file: " + this->mShaderFilePath);
-        }
-
-        size_t shaderFileSize = fileStream.tellg();
-        fileStream.seekg(0, std::ios::beg);
-        char* shaderDataRaw = new char[shaderFileSize];
-        fileStream.read(shaderDataRaw, shaderFileSize);
-        fileStream.close();
-
-        SPDLOG_WARN(
-          "Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
-
-        return std::vector<char>(shaderDataRaw,
-                                 shaderDataRaw + shaderFileSize);
-    }
-    else if (this->mShaderDataRaw.size()) {
-        return this->mShaderDataRaw;
-    }
-    else {
-        throw std::runtime_error("Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither filepath nor data provided");
-    }
-}
-
-}
-
-#endif // #ifndef OPALGOBASE_IMPL
-
 #include <fstream>

 namespace kp {
@ -1924,12 +1755,8 @@ namespace kp {
 * Operation base class to simplify the creation of operations that require
 * right hand and left hand side datapoints together with a single output.
 * The expected data passed is two input tensors and one output tensor.
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
 */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
+class OpAlgoLhsRhsOut : public OpAlgoBase
 {
  public:
    /**
@ -1947,11 +1774,13 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
     * @param freeTensors Whether operation manages the memory of the Tensors
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
     */
    OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors);
+           std::vector<std::shared_ptr<Tensor>> tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());

    /**
     * Default destructor, which is in charge of destroying the algorithm
@ -1982,7 +1811,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
     * of the GPU Device memory into the staging buffer so the output data can
     * be retrieved.
     */
-    virtual void postSubmit() override;
+    virtual void postEval() override;

  protected:
    // -------------- NEVER OWNED RESOURCES
@ -1996,138 +1825,6 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>

 } // End namespace kp

-// Including implementation for template class
-#ifndef OPALGOLHSRHSOUT_CPP
-#define OPALGOLHSRHSOUT_CPP
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>> tensors)
-  // The inheritance is initialised with the copyOutputData to false given that
-  // this depencendant class handles the transfer of data via staging buffers in 
-  // a granular way.
-  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::~OpAlgoLhsRhsOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut destructor started");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut init called");
-
-    if (this->mTensors.size() < 3) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut called with less than 1 tensor");
-    } else if (this->mTensors.size() > 3) {
-        SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
-    }
-
-    this->mTensorLHS = this->mTensors[0];
-    this->mTensorRHS = this->mTensors[1];
-    this->mTensorOutput = this->mTensors[2];
-
-    if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
-          this->mTensorOutput->isInit())) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. LHS: " +
-          std::to_string(this->mTensorLHS->isInit()) +
-          " RHS: " + std::to_string(this->mTensorRHS->isInit()) +
-          " Output: " + std::to_string(this->mTensorOutput->isInit()));
-    }
-
-    if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
-          this->mTensorRHS->size() == this->mTensorOutput->size())) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size LHS: " +
-          std::to_string(this->mTensorLHS->size()) +
-          " RHS: " + std::to_string(this->mTensorRHS->size()) +
-          " Output: " + std::to_string(this->mTensorOutput->size()));
-    }
-
-    this->mTensorOutputStaging = std::make_shared<Tensor>(
-      this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
-
-    this->mTensorOutputStaging->init(
-      this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
-
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    this->mTensorLHS->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eHostWrite,
-      vk::AccessFlagBits::eShaderRead,
-      vk::PipelineStageFlagBits::eHost,
-      vk::PipelineStageFlagBits::eComputeShader);
-    this->mTensorRHS->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eHostWrite,
-      vk::AccessFlagBits::eShaderRead,
-      vk::PipelineStageFlagBits::eHost,
-      vk::PipelineStageFlagBits::eComputeShader);
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-
-    // Barrier to ensure the shader code is executed before buffer read
-    this->mTensorOutput->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eShaderWrite,
-      vk::AccessFlagBits::eTransferRead,
-      vk::PipelineStageFlagBits::eComputeShader,
-      vk::PipelineStageFlagBits::eTransfer);
-
-    this->mTensorOutputStaging->recordCopyFrom(
-            this->mCommandBuffer,
-            this->mTensorOutput,
-            true);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::postSubmit()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");
-
-    this->mTensorOutputStaging->mapDataFromHostMemory();
-
-    this->mTensorOutput->setData(this->mTensorOutputStaging->data());
-}
-
-}
-
-#endif // #ifndef OPALGOLHSRHSOUT_CPP
-
 #include <fstream>

 #if RELEASE
@ -2138,12 +1835,9 @@ namespace kp {

 /**
 * Operation that performs multiplication on two tensors and outpus on third
- * tensor. The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * tensor.
 */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpMult : public OpAlgoBase<tX, tY, tZ>
+class OpMult : public OpAlgoBase
 {
  public:
    /**
@ -2162,13 +1856,14 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
     * @param device Vulkan logical device for passing to Algorithm
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
     */
    OpMult(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors)
-      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
+           std::vector<std::shared_ptr<Tensor>> tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup())
+      : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "", komputeWorkgroup)
    {
        SPDLOG_DEBUG("Kompute OpMult constructor with params");

@ -2179,14 +1874,8 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>

 #if RELEASE
    /**
-     * If release it will be using the static version of the shader which is 
-     * loaded using this file directly.
-     *
-     * @param physicalDevice Vulkan physical device used to find device queues
-     * @param device Vulkan logical device for passing to Algorithm
-     * @param commandBuffer Vulkan Command Buffer to record commands into
-     * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
+     * If RELEASE=1 it will be using the static version of the shader which is 
+     * loaded using this file directly. Otherwise it should not override the function.
     */
    std::vector<char> fetchSpirvBinaryData() override
    {
--- a/src/Algorithm.cpp
+++ b/src/Algorithm.cpp
@ -34,7 +34,9 @@ Algorithm::~Algorithm()
            SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
                         "pipeline but it is null");
        }
-        this->mDevice->destroy(*this->mPipeline, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mPipeline,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
    }

    if (this->mFreePipelineCache) {
@ -43,7 +45,9 @@ Algorithm::~Algorithm()
            SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
                         "pipeline cache but it is null");
        }
-        this->mDevice->destroy(*this->mPipelineCache, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mPipelineCache,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
    }

    if (this->mFreePipelineLayout) {
@ -52,7 +56,9 @@ Algorithm::~Algorithm()
            SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
                         "pipeline layout but it is null");
        }
-        this->mDevice->destroy(*this->mPipelineLayout, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mPipelineLayout,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
    }

    if (this->mFreeShaderModule) {
@ -61,7 +67,9 @@ Algorithm::~Algorithm()
            SPDLOG_ERROR("Kompute Algorithm Error requested to destroy shader "
                         "module but it is null");
        }
-        this->mDevice->destroy(*this->mShaderModule, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mShaderModule,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
    }

    if (this->mFreeDescriptorSet) {
@ -80,7 +88,9 @@ Algorithm::~Algorithm()
            SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
                         "descriptor set layout but it is null");
        }
-        this->mDevice->destroy(*this->mDescriptorSetLayout, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mDescriptorSetLayout,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
    }

    if (this->mFreeDescriptorPool) {
@ -89,7 +99,9 @@ Algorithm::~Algorithm()
            SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
                         "descriptor pool but it is null");
        }
-        this->mDevice->destroy(*this->mDescriptorPool, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mDescriptorPool,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
    }
 }

--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -48,7 +48,8 @@ if(KOMPUTE_OPT_ANDOID_BUILD)
        ${PROJECT_SOURCE_DIR}/vk_ndk_wrapper_include/kompute_vk_ndk_wrapper.cpp)
 endif()

-add_library(kompute
+add_library(
+    kompute STATIC
    ${kompute_CPP})

 target_include_directories(
--- a/src/Manager.cpp
+++ b/src/Manager.cpp
@ -59,13 +59,19 @@ Manager::~Manager()
    }

    if (this->mManagedSequences.size()) {
-        SPDLOG_DEBUG("Releasing managed sequence");
+        SPDLOG_DEBUG("Kompute Manager explicitly running destructor for "
+                     "managed sequences");
+        for (const std::pair<std::string, std::shared_ptr<Sequence>>& sqPair :
+             this->mManagedSequences) {
+            sqPair.second->freeMemoryDestroyGPUResources();
+        }
        this->mManagedSequences.clear();
    }

    if (this->mFreeDevice) {
        SPDLOG_INFO("Destroying device");
-        this->mDevice->destroy((vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        SPDLOG_DEBUG("Kompute Manager Destroyed Device");
    }

@ -86,12 +92,13 @@ Manager::~Manager()
 #endif

    if (this->mFreeInstance) {
-        this->mInstance->destroy((vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mInstance->destroy(
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
        SPDLOG_DEBUG("Kompute Manager Destroyed Instance");
    }
 }

-std::weak_ptr<Sequence>
+std::shared_ptr<Sequence>
 Manager::getOrCreateManagedSequence(std::string sequenceName)
 {
    SPDLOG_DEBUG("Kompute Manager creating Sequence object");
@ -106,7 +113,7 @@ Manager::getOrCreateManagedSequence(std::string sequenceName)
    }
 }

-std::weak_ptr<Sequence>
+std::shared_ptr<Sequence>
 Manager::createManagedSequence(std::string sequenceName, uint32_t queueIndex)
 {

--- a/src/OpAlgoBase.cpp
+++ b/src/OpAlgoBase.cpp
@ -0,0 +1,170 @@
+#pragma once
+
+#include "kompute/operations/OpAlgoBase.hpp"
+
+namespace kp {
+
+/**
+ * Base constructor. It performs no setup of its own beyond emitting a debug
+ * log entry; in particular it does not assign the workgroup or the algorithm.
+ */
+OpAlgoBase::OpAlgoBase()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor base");
+}
+
+/**
+ * Main constructor: forwards the Vulkan handles and tensors to OpBase,
+ * resolves the dispatch workgroup and creates the Algorithm component.
+ *
+ * @param physicalDevice Vulkan physical device, forwarded to OpBase
+ * @param device Vulkan logical device, forwarded to OpBase and Algorithm
+ * @param commandBuffer Vulkan command buffer, forwarded to OpBase and Algorithm
+ * @param tensors Tensors that are to be used in this operation
+ * @param komputeWorkgroup Dispatch size; an x of 0 means "derive the size
+ * from the first tensor"
+ */
+OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                       std::shared_ptr<vk::Device> device,
+                       std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                       std::vector<std::shared_ptr<Tensor>>& tensors,
+                       KomputeWorkgroup komputeWorkgroup)
+  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}",
+                 tensors.size());
+
+    // The dispatch size is set up based on either the explicitly provided
+    // workgroup or, by default, the size of the first tensor
+    if (komputeWorkgroup.x > 0) {
+        // If at least the x value is provided we keep it as given and only
+        // default the y and z values to 1 when they were left at zero
+        this->mKomputeWorkgroup = {
+            komputeWorkgroup.x,
+            komputeWorkgroup.y > 0 ? komputeWorkgroup.y : 1,
+            komputeWorkgroup.z > 0 ? komputeWorkgroup.z : 1
+        };
+    } else {
+        // NOTE(review): tensors[0] is dereferenced without an emptiness
+        // check; the "less than 1 tensor" validation only runs later in
+        // init() - confirm callers never pass an empty vector here.
+        this->mKomputeWorkgroup = { tensors[0]->size(), 1, 1 };
+    }
+    SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
+                this->mKomputeWorkgroup.x,
+                this->mKomputeWorkgroup.y,
+                this->mKomputeWorkgroup.z);
+
+    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
+}
+
+/**
+ * Shader-file constructor: delegates the common setup to the
+ * tensor/workgroup constructor and then stores the path of the shader file.
+ */
+OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                       std::shared_ptr<vk::Device> device,
+                       std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                       std::vector<std::shared_ptr<Tensor>>& tensors,
+                       std::string shaderFilePath,
+                       KomputeWorkgroup komputeWorkgroup)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
+{
+    // Remember where the shader lives; fetchSpirvBinaryData() reads it later
+    this->mShaderFilePath = shaderFilePath;
+
+    SPDLOG_DEBUG(
+      "Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}",
+      shaderFilePath);
+}
+
+/**
+ * Raw-shader constructor: delegates the common setup to the
+ * tensor/workgroup constructor and then keeps a copy of the shader data.
+ */
+OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                       std::shared_ptr<vk::Device> device,
+                       std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                       std::vector<std::shared_ptr<Tensor>>& tensors,
+                       const std::vector<char>& shaderDataRaw,
+                       KomputeWorkgroup komputeWorkgroup)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
+{
+    // Copy the caller's buffer; fetchSpirvBinaryData() returns it later
+    this->mShaderDataRaw = shaderDataRaw;
+
+    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw "
+                 "data length: {}",
+                 shaderDataRaw.size());
+}
+
+/**
+ * Destructor. The body only emits a debug log entry; no explicit cleanup is
+ * performed here.
+ */
+OpAlgoBase::~OpAlgoBase()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
+}
+
+/**
+ * Validates the tensors held by the operation, loads the shader data through
+ * fetchSpirvBinaryData() and initialises the algorithm component with both.
+ *
+ * Throws std::runtime_error when there are no tensors or when any tensor
+ * reports that it is not initialised.
+ */
+void
+OpAlgoBase::init()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase init called");
+
+    if (this->mTensors.empty()) {
+        throw std::runtime_error(
+          "Kompute OpAlgoBase called with less than 1 tensor");
+    }
+
+    // Reject the whole operation on the first uninitialised tensor
+    for (const std::shared_ptr<Tensor>& candidate : this->mTensors) {
+        if (candidate->isInit()) {
+            continue;
+        }
+        throw std::runtime_error(
+          "Kompute OpAlgoBase validation failed; all tensor parameters "
+          "must be initialised.");
+    }
+
+    SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
+
+    std::vector<char> spirvData = this->fetchSpirvBinaryData();
+
+    SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
+
+    this->mAlgorithm->init(spirvData, this->mTensors);
+}
+
+/**
+ * Records a host-write to shader-read buffer memory barrier for every tensor
+ * and then records the dispatch using the stored workgroup dimensions.
+ */
+void
+OpAlgoBase::record()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase record called");
+
+    // Barrier to ensure the data is finished writing to buffer memory
+    for (const std::shared_ptr<Tensor>& boundTensor : this->mTensors) {
+        boundTensor->recordBufferMemoryBarrier(
+          this->mCommandBuffer,
+          vk::AccessFlagBits::eHostWrite,
+          vk::AccessFlagBits::eShaderRead,
+          vk::PipelineStageFlagBits::eHost,
+          vk::PipelineStageFlagBits::eComputeShader);
+    }
+
+    const KomputeWorkgroup& workgroup = this->mKomputeWorkgroup;
+    this->mAlgorithm->recordDispatch(workgroup.x, workgroup.y, workgroup.z);
+}
+
+/**
+ * Pre-evaluation hook. The base implementation only emits a debug log entry.
+ */
+void
+OpAlgoBase::preEval()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
+}
+
+/**
+ * Post-evaluation hook. The base implementation only emits a debug log entry.
+ * NOTE(review): the log text says "postSubmit" although the function is
+ * named postEval - confirm whether the message should be updated.
+ */
+void
+OpAlgoBase::postEval()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
+}
+
+/**
+ * Returns the shader data for this operation. The shader file path takes
+ * precedence: when it is set the file is read in full, otherwise the raw
+ * shader data stored on the operation is returned.
+ *
+ * @return The shader bytes read from the file or copied from the raw data
+ * Throws std::runtime_error when the file cannot be opened, when its size
+ * cannot be determined, or when neither a file path nor raw data is set.
+ */
+std::vector<char>
+OpAlgoBase::fetchSpirvBinaryData()
+{
+    SPDLOG_WARN("Kompute OpAlgoBase Running shaders directly from spirv file");
+
+    if (this->mShaderFilePath.size()) {
+        // Open positioned at the end (ios::ate) so tellg() yields the size
+        std::ifstream fileStream(this->mShaderFilePath,
+                                 std::ios::binary | std::ios::in |
+                                   std::ios::ate);
+
+        if (!fileStream.good()) {
+            throw std::runtime_error("Error reading file: " +
+                                     this->mShaderFilePath);
+        }
+
+        // tellg() reports -1 on failure; converting that straight to size_t
+        // would request an enormous allocation, so reject it explicitly
+        const std::streamoff endOffset = fileStream.tellg();
+        if (endOffset < 0) {
+            throw std::runtime_error("Error reading file: " +
+                                     this->mShaderFilePath);
+        }
+        const size_t shaderFileSize = static_cast<size_t>(endOffset);
+        fileStream.seekg(0, std::ios::beg);
+
+        // Read straight into the returned vector so the buffer is owned by
+        // a container and cannot leak (the manual new[] was never freed)
+        std::vector<char> shaderData(shaderFileSize);
+        fileStream.read(shaderData.data(),
+                        static_cast<std::streamsize>(shaderFileSize));
+        fileStream.close();
+
+        SPDLOG_WARN("Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
+
+        return shaderData;
+    } else if (this->mShaderDataRaw.size()) {
+        return this->mShaderDataRaw;
+    } else {
+        throw std::runtime_error(
+          "Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither "
+          "filepath nor data provided");
+    }
+}
+
+}
--- a/src/OpAlgoLhsRhsOut.cpp
+++ b/src/OpAlgoLhsRhsOut.cpp
@ -0,0 +1,127 @@
+#pragma once
+
+#include "kompute/operations/OpAlgoLhsRhsOut.hpp"
+
+namespace kp {
+
+/**
+ * Base constructor. The body only emits a debug log entry and performs no
+ * further setup of its own.
+ */
+OpAlgoLhsRhsOut::OpAlgoLhsRhsOut()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
+}
+
+/**
+ * Parameterised constructor. All arguments are forwarded unchanged to the
+ * OpAlgoBase constructor; this class adds only a debug log entry here. The
+ * staging buffer used to read the output back is created later, in init().
+ */
+OpAlgoLhsRhsOut::OpAlgoLhsRhsOut(
+  std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+  std::shared_ptr<vk::Device> device,
+  std::shared_ptr<vk::CommandBuffer> commandBuffer,
+  std::vector<std::shared_ptr<Tensor>> tensors,
+  KomputeWorkgroup komputeWorkgroup)
+  : OpAlgoBase(physicalDevice,
+               device,
+               commandBuffer,
+               tensors,
+               komputeWorkgroup)
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
+}
+
+/**
+ * Destructor. The body only emits a debug log entry; no explicit cleanup is
+ * performed here.
+ */
+OpAlgoLhsRhsOut::~OpAlgoLhsRhsOut()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut destructor started");
+}
+
+/**
+ * Validates the tensors, binds the first three as LHS, RHS and output,
+ * creates and initialises a staging tensor for the output, and initialises
+ * the algorithm component with the shader data.
+ *
+ * Throws std::runtime_error when fewer than three tensors are held, when any
+ * of the three is not initialised, or when their sizes differ. Tensors beyond
+ * the third only produce a warning.
+ */
+void
+OpAlgoLhsRhsOut::init()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut init called");
+
+    if (this->mTensors.size() < 3) {
+        // The message now states the real requirement (three tensors)
+        throw std::runtime_error(
+          "Kompute OpAlgoLhsRhsOut called with less than 3 tensors");
+    } else if (this->mTensors.size() > 3) {
+        SPDLOG_WARN(
+          "Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
+    }
+
+    // Positional convention: [0] = left-hand side, [1] = right-hand side,
+    // [2] = output
+    this->mTensorLHS = this->mTensors[0];
+    this->mTensorRHS = this->mTensors[1];
+    this->mTensorOutput = this->mTensors[2];
+
+    if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
+          this->mTensorOutput->isInit())) {
+        throw std::runtime_error(
+          "Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. "
+          "LHS: " +
+          std::to_string(this->mTensorLHS->isInit()) +
+          " RHS: " + std::to_string(this->mTensorRHS->isInit()) +
+          " Output: " + std::to_string(this->mTensorOutput->isInit()));
+    }
+
+    if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
+          this->mTensorRHS->size() == this->mTensorOutput->size())) {
+        throw std::runtime_error(
+          "Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size "
+          "LHS: " +
+          std::to_string(this->mTensorLHS->size()) +
+          " RHS: " + std::to_string(this->mTensorRHS->size()) +
+          " Output: " + std::to_string(this->mTensorOutput->size()));
+    }
+
+    // Staging tensor seeded with the output tensor's data; record() copies
+    // the output tensor into it and postEval() copies it back out
+    this->mTensorOutputStaging = std::make_shared<Tensor>(
+      this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
+
+    this->mTensorOutputStaging->init(this->mPhysicalDevice, this->mDevice);
+
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
+
+    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
+
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut Initialising algorithm component");
+
+    this->mAlgorithm->init(shaderFileData, this->mTensors);
+}
+
+/**
+ * Records, in order: a host-write to shader-read barrier on the LHS and RHS
+ * tensors, the dispatch with the stored workgroup dimensions, a shader-write
+ * to transfer-read barrier on the output tensor, and the copy of the output
+ * tensor into the staging tensor.
+ */
+void
+OpAlgoLhsRhsOut::record()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut record called");
+
+    // Barrier to ensure the data is finished writing to buffer memory;
+    // the same barrier is recorded for the LHS first and then the RHS
+    for (const auto& inputTensor : { this->mTensorLHS, this->mTensorRHS }) {
+        inputTensor->recordBufferMemoryBarrier(
+          this->mCommandBuffer,
+          vk::AccessFlagBits::eHostWrite,
+          vk::AccessFlagBits::eShaderRead,
+          vk::PipelineStageFlagBits::eHost,
+          vk::PipelineStageFlagBits::eComputeShader);
+    }
+
+    const KomputeWorkgroup& workgroup = this->mKomputeWorkgroup;
+    this->mAlgorithm->recordDispatch(workgroup.x, workgroup.y, workgroup.z);
+
+    // Barrier to ensure the shader code is executed before buffer read
+    this->mTensorOutput->recordBufferMemoryBarrier(
+      this->mCommandBuffer,
+      vk::AccessFlagBits::eShaderWrite,
+      vk::AccessFlagBits::eTransferRead,
+      vk::PipelineStageFlagBits::eComputeShader,
+      vk::PipelineStageFlagBits::eTransfer);
+
+    this->mTensorOutputStaging->recordCopyFrom(
+      this->mCommandBuffer, this->mTensorOutput, true);
+}
+
+/**
+ * Post-evaluation hook: maps the staging tensor's data from host memory and
+ * then sets that data on the output tensor.
+ */
+void
+OpAlgoLhsRhsOut::postEval()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");
+
+    const auto& stagingTensor = this->mTensorOutputStaging;
+
+    // Refresh the staging tensor's data first, then hand it to the output
+    stagingTensor->mapDataFromHostMemory();
+    this->mTensorOutput->setData(stagingTensor->data());
+}
+
+}
--- a/src/Sequence.cpp
+++ b/src/Sequence.cpp
@ -27,33 +27,13 @@ Sequence::~Sequence()
 {
    SPDLOG_DEBUG("Kompute Sequence Destructor started");

-    if (!this->mDevice) {
-        SPDLOG_ERROR(
-          "Kompute Sequence destructor reached with null Device pointer");
+    if (!this->mIsInit) {
+        SPDLOG_INFO("Kompute Sequence destructor called but sequence is not "
+                    "initialized so no need to removing GPU resources.");
        return;
    }
-
-    if (this->mFreeCommandBuffer) {
-        SPDLOG_INFO("Freeing CommandBuffer");
-        if (!this->mCommandBuffer) {
-            SPDLOG_ERROR("Kompute Sequence destructor reached with null "
-                         "CommandPool pointer");
-            return;
-        }
-        this->mDevice->freeCommandBuffers(
-          *this->mCommandPool, 1, this->mCommandBuffer.get());
-        SPDLOG_DEBUG("Kompute Sequence Freed CommandBuffer");
-    }
-
-    if (this->mFreeCommandPool) {
-        SPDLOG_INFO("Destroying CommandPool");
-        if (this->mCommandPool == nullptr) {
-            SPDLOG_ERROR("Kompute Sequence destructor reached with null "
-                         "CommandPool pointer");
-            return;
-        }
-        this->mDevice->destroy(*this->mCommandPool, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-        SPDLOG_DEBUG("Kompute Sequence Destroyed CommandPool");
+    else {
+        this->freeMemoryDestroyGPUResources();
    }
 }

@ -186,7 +166,8 @@ Sequence::evalAwait(uint64_t waitFor)

    vk::Result result =
      this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor);
-    this->mDevice->destroy(this->mFence, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+    this->mDevice->destroy(
+      this->mFence, (vk::Optional<const vk::AllocationCallbacks>)nullptr);

    this->mIsRunning = false;

@ -220,6 +201,53 @@ Sequence::isInit()
    return this->mIsInit;
 }

+// Frees the command buffer and destroys the command pool when the matching
+// mFreeCommandBuffer / mFreeCommandPool flag is set, then clears mIsInit.
+void
+Sequence::freeMemoryDestroyGPUResources()
+{
+    if (!this->mIsInit) {
+        SPDLOG_ERROR("Kompute Sequence freeMemoryDestroyGPUResources called "
+            "but Sequence is not initialized so there's no relevant GPU resources.");
+        return;
+    }
+
+    // Every path below leaves the sequence flagged as not initialised.
+    this->mIsInit = false;
+
+    if (!this->mDevice) {
+        SPDLOG_ERROR(
+          "Kompute Sequence freeMemoryDestroyGPUResources called with null Device pointer");
+        return;
+    }
+
+    if (this->mFreeCommandBuffer) {
+        SPDLOG_INFO("Freeing CommandBuffer");
+        // The free call dereferences the pool too, so both must be non-null.
+        if (!this->mCommandBuffer || !this->mCommandPool) {
+            SPDLOG_ERROR("Kompute Sequence freeMemoryDestroyGPUResources called with null "
+                         "CommandBuffer or CommandPool pointer");
+            return;
+        }
+        this->mDevice->freeCommandBuffers(
+          *this->mCommandPool, 1, this->mCommandBuffer.get());
+        SPDLOG_DEBUG("Kompute Sequence Freed CommandBuffer");
+    }
+
+    if (this->mFreeCommandPool) {
+        SPDLOG_INFO("Destroying CommandPool");
+        if (this->mCommandPool == nullptr) {
+            SPDLOG_ERROR("Kompute Sequence freeMemoryDestroyGPUResources called with null "
+                         "CommandPool pointer");
+            return;
+        }
+        this->mDevice->destroy(
+          *this->mCommandPool,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        SPDLOG_DEBUG("Kompute Sequence Destroyed CommandPool");
+    }
+}
+
 void
 Sequence::createCommandPool()
 {
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@ -12,8 +12,9 @@ Tensor::Tensor()
 Tensor::Tensor(const std::vector<float>& data, TensorTypes tensorType)
 {
 #if DEBUG
-    SPDLOG_DEBUG(
-      "Kompute Tensor constructor data length: {}, and type: {}", data.size(), tensorType);
+    SPDLOG_DEBUG("Kompute Tensor constructor data length: {}, and type: {}",
+                 data.size(),
+                 tensorType);
 #endif

    this->mData = data;
@ -350,7 +351,9 @@ Tensor::freeMemoryDestroyGPUResources()
              "Kompose Tensor expected to free buffer but got null buffer");
        } else {
            SPDLOG_DEBUG("Kompose Tensor destroying buffer");
-            this->mDevice->destroy(*this->mBuffer, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+            this->mDevice->destroy(
+              *this->mBuffer,
+              (vk::Optional<const vk::AllocationCallbacks>)nullptr);
            this->mBuffer = nullptr;
        }
    }
@ -361,7 +364,9 @@ Tensor::freeMemoryDestroyGPUResources()
              "Kompose Tensor expected to free buffer but got null memory");
        } else {
            SPDLOG_DEBUG("Kompose Tensor freeing memory");
-            this->mDevice->freeMemory(*this->mMemory, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+            this->mDevice->freeMemory(
+              *this->mMemory,
+              (vk::Optional<const vk::AllocationCallbacks>)nullptr);
            this->mDevice = nullptr;
        }
    }
--- a/src/include/kompute/Manager.hpp
+++ b/src/include/kompute/Manager.hpp
@ -63,9 +63,9 @@ class Manager
     *
     * @param sequenceName The name for the named sequence to be retrieved or
     * created
-     * @return Weak pointer to the manager owned sequence resource
+     * @return Shared pointer to the manager owned sequence resource
     */
-    std::weak_ptr<Sequence> getOrCreateManagedSequence(
+    std::shared_ptr<Sequence> getOrCreateManagedSequence(
      std::string sequenceName);

    /**
@ -77,8 +77,9 @@ class Manager
     * @param queueIndex The queue to use from the available queues
-     * @return Weak pointer to the manager owned sequence resource
+     * @return Shared pointer to the manager owned sequence resource
     */
-    std::weak_ptr<Sequence> createManagedSequence(std::string sequenceName = "",
-                                                  uint32_t queueIndex = 0);
+    std::shared_ptr<Sequence> createManagedSequence(
+      std::string sequenceName = "",
+      uint32_t queueIndex = 0);

    /**
     * Function that evaluates operation against named sequence.
@ -94,22 +95,21 @@ class Manager
                TArgs&&... params)
    {
        SPDLOG_DEBUG("Kompute Manager evalOp triggered");
-        std::weak_ptr<Sequence> sqWeakPtr =
+        std::shared_ptr<kp::Sequence> sq =
          this->getOrCreateManagedSequence(sequenceName);

-        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
-            sq->begin();
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
+        sq->begin();

-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
-            sq->record<T>(tensors, std::forward<TArgs>(params)...);
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
+        sq->record<T>(tensors, std::forward<TArgs>(params)...);

-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
-            sq->end();
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
+        sq->end();
+
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
+        sq->eval();

-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
-            sq->eval();
-        }
        SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS");
    }

@ -147,26 +147,21 @@ class Manager
    {
        SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered");

-        std::weak_ptr<Sequence> sqWeakPtr =
+        std::shared_ptr<kp::Sequence> sq =
          this->getOrCreateManagedSequence(sequenceName);

-        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
+        sq->begin();

-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
-            sq->begin();
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
+        sq->record<T>(tensors, std::forward<TArgs>(params)...);

-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
-            sq->record<T>(tensors, std::forward<TArgs>(params)...);
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
+        sq->end();

-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
-            sq->end();
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
+        sq->evalAsync();

-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
-            sq->evalAsync();
-        } else {
-            SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found",
-                         sequenceName);
-        }
        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
    }

--- a/src/include/kompute/Sequence.hpp
+++ b/src/include/kompute/Sequence.hpp
@ -106,6 +106,12 @@ class Sequence
     */
    bool isInit();

+    /**
+     * Frees the command buffer and destroys the command pool when this
+     * sequence is flagged to release them, then marks it as not initialised.
+     */
+    void freeMemoryDestroyGPUResources();
+
    /**
     * Record function for operation to be added to the GPU queue in batch. This
     * template requires classes to be derived from the OpBase class. This
--- a/src/include/kompute/operations/OpAlgoBase.hpp
+++ b/src/include/kompute/operations/OpAlgoBase.hpp
@ -17,20 +17,17 @@ namespace kp {
 * Operation that provides a general abstraction that simplifies the use of 
 * algorithm and parameter components which can be used with shaders.
 * By default it enables the user to provide a dynamic number of tensors
- * which are then passed as inputs. 
- *
- * All of these tensors are expected to be initlaised and this is checked with throw std exception in the init function.
- *
- * See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
- * 
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * which are then passed as inputs.
 */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
 class OpAlgoBase : public OpBase
 {
  public:
+    struct KomputeWorkgroup {
+        uint32_t x = 0; ///< Value passed as the x argument of recordDispatch
+        uint32_t y = 0; ///< Value passed as the y argument of recordDispatch
+        uint32_t z = 0; ///< Value passed as the z argument of recordDispatch
+    };
+
    /**
     *  Base constructor, should not be used unless explicitly intended.
     */
@ -46,11 +43,13 @@ class OpAlgoBase : public OpBase
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
     * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
     */
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors);
+           std::vector<std::shared_ptr<Tensor>>& tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());

    /**
     * Constructor that enables a file to be passed to the operation with
@ -61,13 +60,15 @@ class OpAlgoBase : public OpBase
     * @param device Vulkan logical device for passing to Algorithm
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
-     * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+     * @param shaderFilePath Parameter to specify the shader to load (either in spirv or raw format)
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
     */
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
           std::vector<std::shared_ptr<Tensor>>& tensors,
-           std::string shaderFilePath);
+           std::string shaderFilePath,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());

    /**
     * Constructor that enables raw shader data to be passed to the main operation
@ -78,12 +79,14 @@ class OpAlgoBase : public OpBase
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
     * @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
     */
    OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
           std::vector<std::shared_ptr<Tensor>>& tensors,
-           const std::vector<char>& shaderDataRaw);
+           const std::vector<char>& shaderDataRaw,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());

    /**
     * Default destructor, which is in charge of destroying the algorithm
@ -131,9 +134,7 @@ class OpAlgoBase : public OpBase

    // -------------- ALWAYS OWNED RESOURCES

-    uint32_t mX;
-    uint32_t mY;
-    uint32_t mZ;
+    KomputeWorkgroup mKomputeWorkgroup;

    std::string mShaderFilePath; ///< Optional member variable which can be provided for the OpAlgoBase to find the data automatically and load for processing
    std::vector<char> mShaderDataRaw; ///< Optional member variable which can be provided to contain either the raw shader content or the spirv binary content
@ -143,174 +144,3 @@ class OpAlgoBase : public OpBase

 } // End namespace kp

-// Including implementation for template class
-#ifndef OPALGOBASE_IMPL
-#define OPALGOBASE_IMPL
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors)
-  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
-
-    // The dispatch size is set up based on either explicitly provided template
-    // parameters or by default it would take the shape and size of the tensors
-    if (tX > 0) {
-        // If at least the x value is provided we use mainly the parameters
-        // provided
-        this->mX = tX;
-        this->mY = tY > 0 ? tY : 1;
-        this->mZ = tZ > 0 ? tZ : 1;
-    } else {
-        this->mX = tensors[0]->size();
-        this->mY = 1;
-        this->mZ = 1;
-    }
-    SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
-                 this->mX,
-                 this->mY,
-                 this->mZ);
-
-    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           std::string shaderFilePath)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
-
-    this->mShaderFilePath = shaderFilePath;
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           const std::vector<char>& shaderDataRaw)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
-
-    this->mShaderDataRaw = shaderDataRaw;
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase init called");
-
-    if (this->mTensors.size() < 1) {
-        throw std::runtime_error(
-          "Kompute OpAlgoBase called with less than 1 tensor");
-    } 
-
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        if(!tensor->isInit()) {
-            throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised.");
-        }
-    }
-
-    SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        tensor->recordBufferMemoryBarrier(
-          this->mCommandBuffer,
-          vk::AccessFlagBits::eHostWrite,
-          vk::AccessFlagBits::eShaderRead,
-          vk::PipelineStageFlagBits::eHost,
-          vk::PipelineStageFlagBits::eComputeShader);
-    }
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::preEval()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::postEval()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-std::vector<char> OpAlgoBase<tX, tY, tZ>::fetchSpirvBinaryData() 
-{
-    SPDLOG_WARN(
-      "Kompute OpAlgoBase Running shaders directly from spirv file");
-
-    if (this->mShaderFilePath.size()) {
-        std::ifstream fileStream(this->mShaderFilePath,
-                                 std::ios::binary | std::ios::in | std::ios::ate);
-
-        if (!fileStream.good()) {
-            throw std::runtime_error("Error reading file: " + this->mShaderFilePath);
-        }
-
-        size_t shaderFileSize = fileStream.tellg();
-        fileStream.seekg(0, std::ios::beg);
-        char* shaderDataRaw = new char[shaderFileSize];
-        fileStream.read(shaderDataRaw, shaderFileSize);
-        fileStream.close();
-
-        SPDLOG_WARN(
-          "Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
-
-        return std::vector<char>(shaderDataRaw,
-                                 shaderDataRaw + shaderFileSize);
-    }
-    else if (this->mShaderDataRaw.size()) {
-        return this->mShaderDataRaw;
-    }
-    else {
-        throw std::runtime_error("Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither filepath nor data provided");
-    }
-}
-
-}
-
-#endif // #ifndef OPALGOBASE_IMPL
-
--- a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
+++ b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
@ -15,12 +15,8 @@ namespace kp {
 * Operation base class to simplify the creation of operations that require
 * right hand and left hand side datapoints together with a single output.
 * The expected data passed is two input tensors and one output tensor.
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
 */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
+class OpAlgoLhsRhsOut : public OpAlgoBase
 {
  public:
    /**
@ -38,11 +34,13 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
     * @param freeTensors Whether operation manages the memory of the Tensors
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
     */
    OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors);
+           std::vector<std::shared_ptr<Tensor>> tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());

    /**
     * Default destructor, which is in charge of destroying the algorithm
@ -73,7 +71,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
     * of the GPU Device memory into the staging buffer so the output data can
     * be retrieved.
     */
-    virtual void postSubmit() override;
+    virtual void postEval() override;

  protected:
    // -------------- NEVER OWNED RESOURCES
@ -87,136 +85,3 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>

 } // End namespace kp

-// Including implementation for template class
-#ifndef OPALGOLHSRHSOUT_CPP
-#define OPALGOLHSRHSOUT_CPP
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>> tensors)
-  // The inheritance is initialised with the copyOutputData to false given that
-  // this depencendant class handles the transfer of data via staging buffers in 
-  // a granular way.
-  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::~OpAlgoLhsRhsOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut destructor started");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut init called");
-
-    if (this->mTensors.size() < 3) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut called with less than 1 tensor");
-    } else if (this->mTensors.size() > 3) {
-        SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
-    }
-
-    this->mTensorLHS = this->mTensors[0];
-    this->mTensorRHS = this->mTensors[1];
-    this->mTensorOutput = this->mTensors[2];
-
-
-    if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
-          this->mTensorOutput->isInit())) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. LHS: " +
-          std::to_string(this->mTensorLHS->isInit()) +
-          " RHS: " + std::to_string(this->mTensorRHS->isInit()) +
-          " Output: " + std::to_string(this->mTensorOutput->isInit()));
-    }
-
-    if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
-          this->mTensorRHS->size() == this->mTensorOutput->size())) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size LHS: " +
-          std::to_string(this->mTensorLHS->size()) +
-          " RHS: " + std::to_string(this->mTensorRHS->size()) +
-          " Output: " + std::to_string(this->mTensorOutput->size()));
-    }
-
-    this->mTensorOutputStaging = std::make_shared<Tensor>(
-      this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
-
-    this->mTensorOutputStaging->init(
-      this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
-
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    this->mTensorLHS->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eHostWrite,
-      vk::AccessFlagBits::eShaderRead,
-      vk::PipelineStageFlagBits::eHost,
-      vk::PipelineStageFlagBits::eComputeShader);
-    this->mTensorRHS->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eHostWrite,
-      vk::AccessFlagBits::eShaderRead,
-      vk::PipelineStageFlagBits::eHost,
-      vk::PipelineStageFlagBits::eComputeShader);
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-
-    // Barrier to ensure the shader code is executed before buffer read
-    this->mTensorOutput->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eShaderWrite,
-      vk::AccessFlagBits::eTransferRead,
-      vk::PipelineStageFlagBits::eComputeShader,
-      vk::PipelineStageFlagBits::eTransfer);
-
-    this->mTensorOutputStaging->recordCopyFrom(
-            this->mCommandBuffer,
-            this->mTensorOutput,
-            true);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::postSubmit()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");
-
-    this->mTensorOutputStaging->mapDataFromHostMemory();
-
-    this->mTensorOutput->setData(this->mTensorOutputStaging->data());
-}
-
-}
-
-#endif // #ifndef OPALGOLHSRHSOUT_CPP
-
--- a/src/include/kompute/operations/OpMult.hpp
+++ b/src/include/kompute/operations/OpMult.hpp
@ -17,12 +17,9 @@ namespace kp {

 /**
 * Operation that performs multiplication on two tensors and outpus on third
- * tensor. The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * tensor.
 */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpMult : public OpAlgoBase<tX, tY, tZ>
+class OpMult : public OpAlgoBase
 {
  public:
    /**
@ -41,13 +38,14 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
     * @param device Vulkan logical device for passing to Algorithm
     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
     */
    OpMult(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors)
-      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
+           std::vector<std::shared_ptr<Tensor>> tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup())
+      : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "", komputeWorkgroup)
    {
        SPDLOG_DEBUG("Kompute OpMult constructor with params");

@ -58,14 +56,8 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>

 #if RELEASE
    /**
-     * If release it will be using the static version of the shader which is 
-     * loaded using this file directly.
-     *
-     * @param physicalDevice Vulkan physical device used to find device queues
-     * @param device Vulkan logical device for passing to Algorithm
-     * @param commandBuffer Vulkan Command Buffer to record commands into
-     * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
+     * If RELEASE=1 it will be using the static version of the shader which is 
+     * loaded using this file directly. Otherwise it should not override the function.
     */
    std::vector<char> fetchSpirvBinaryData() override
    {
--- a/test/TestAsyncOperations.cpp
+++ b/test/TestAsyncOperations.cpp
@ -54,7 +54,7 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
    auto startSync = std::chrono::high_resolution_clock::now();

    for (uint32_t i = 0; i < numParallel; i++) {
-        mgr.evalOpDefault<kp::OpAlgoBase<>>(
+        mgr.evalOpDefault<kp::OpAlgoBase>(
          { inputsSyncB[i] }, std::vector<char>(shader.begin(), shader.end()));
    }

@ -86,7 +86,7 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
    auto startAsync = std::chrono::high_resolution_clock::now();

    for (uint32_t i = 0; i < numParallel; i++) {
-        mgrAsync.evalOpAsync<kp::OpAlgoBase<>>(
+        mgrAsync.evalOpAsync<kp::OpAlgoBase>(
          { inputsAsyncB[i] },
          "async" + std::to_string(i),
          std::vector<char>(shader.begin(), shader.end()));
@ -151,10 +151,10 @@ TEST(TestAsyncOperations, TestManagerAsyncExecution)

    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });

-    mgr.evalOpAsync<kp::OpAlgoBase<>>(
+    mgr.evalOpAsync<kp::OpAlgoBase>(
      { tensorA }, "asyncOne", std::vector<char>(shader.begin(), shader.end()));

-    mgr.evalOpAsync<kp::OpAlgoBase<>>(
+    mgr.evalOpAsync<kp::OpAlgoBase>(
      { tensorB }, "asyncTwo", std::vector<char>(shader.begin(), shader.end()));

    mgr.evalOpAwait("asyncOne");
--- a/test/TestLogisticRegression.cpp
+++ b/test/TestLogisticRegression.cpp
@ -31,22 +31,21 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression)
    {
        kp::Manager mgr;

-        std::shared_ptr<kp::Sequence> sqTensor =
-          mgr.createManagedSequence().lock();
+        std::shared_ptr<kp::Sequence> sqTensor = mgr.createManagedSequence();

        sqTensor->begin();
        sqTensor->record<kp::OpTensorCreate>(params);
        sqTensor->end();
        sqTensor->eval();

-        std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+        std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();

        // Record op algo base
        sq->begin();

        sq->record<kp::OpTensorSyncDevice>({ wIn, bIn });

-        sq->record<kp::OpAlgoBase<>>(
+        sq->record<kp::OpAlgoBase>(
          params, "test/shaders/glsl/test_logistic_regression.comp");

        sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
@ -76,7 +75,7 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression)
    EXPECT_LT(bIn->data()[0], 0.0);
    EXPECT_LT(bIn->data()[0], 0.0);

-    //SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
+    // SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
    //            wIn->data(),
    //            bIn->data(),
    //            lOut->data());
@ -114,20 +113,19 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy)
    {
        kp::Manager mgr;

-        std::shared_ptr<kp::Sequence> sqTensor =
-          mgr.createManagedSequence().lock();
+        std::shared_ptr<kp::Sequence> sqTensor = mgr.createManagedSequence();

        sqTensor->begin();
        sqTensor->record<kp::OpTensorCreate>(params);
        sqTensor->end();
        sqTensor->eval();

-        std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+        std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();

        // Record op algo base
        sq->begin();

-        sq->record<kp::OpAlgoBase<>>(
+        sq->record<kp::OpAlgoBase>(
          params, "test/shaders/glsl/test_logistic_regression.comp");

        sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
@ -158,7 +156,7 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy)
    EXPECT_GT(wIn->data()[1], 1.0);
    EXPECT_LT(bIn->data()[0], 0.0);

-    //SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
+    // SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
    //            wIn->data(),
    //            bIn->data(),
    //            lOut->data());
--- a/test/TestManager.cpp
+++ b/test/TestManager.cpp
@ -17,7 +17,7 @@ TEST(TestManager, EndToEndOpMultFlow)

    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorOutput });

-    mgr.evalOpDefault<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
+    mgr.evalOpDefault<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });

    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorOutput });

@ -35,23 +35,23 @@ TEST(TestManager, OpMultSequenceFlow)

    kp::Manager mgr;

-    std::weak_ptr<kp::Sequence> sqWeakPtr =
-      mgr.getOrCreateManagedSequence("newSequence");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence");
+
        sq->begin();

        sq->record<kp::OpTensorCreate>({ tensorLHS });
        sq->record<kp::OpTensorCreate>({ tensorRHS });
        sq->record<kp::OpTensorCreate>({ tensorOutput });

-        sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
+        sq->record<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });

        sq->record<kp::OpTensorSyncLocal>({ tensorOutput });

        sq->end();
        sq->eval();
    }
-    sqWeakPtr.reset();

    EXPECT_EQ(tensorOutput->data(), std::vector<float>({ 0, 4, 12 }));
 }
@ -60,22 +60,22 @@ TEST(TestManager, TestMultipleSequences)
 {
    kp::Manager mgr;

-    std::weak_ptr<kp::Sequence> sqWeakPtrOne =
+    std::shared_ptr<kp::Sequence> sqOne =
      mgr.getOrCreateManagedSequence("sqOne");

-    std::weak_ptr<kp::Sequence> sqWeakPtrTwo =
+    std::shared_ptr<kp::Sequence> sqTwo =
      mgr.getOrCreateManagedSequence("sqTwo");

-    std::weak_ptr<kp::Sequence> sqWeakPtrOneRef =
+    std::shared_ptr<kp::Sequence> sqOneRef =
      mgr.getOrCreateManagedSequence("sqOne");

-    std::weak_ptr<kp::Sequence> sqWeakPtrTwoRef =
+    std::shared_ptr<kp::Sequence> sqTwoRef =
      mgr.getOrCreateManagedSequence("sqTwo");

-    EXPECT_EQ(sqWeakPtrOne.lock(), sqWeakPtrOneRef.lock());
-    EXPECT_NE(sqWeakPtrTwo.lock(), sqWeakPtrOneRef.lock());
-    EXPECT_EQ(sqWeakPtrTwo.lock(), sqWeakPtrTwoRef.lock());
-    EXPECT_NE(sqWeakPtrOneRef.lock(), sqWeakPtrTwoRef.lock());
+    EXPECT_EQ(sqOne, sqOneRef);
+    EXPECT_NE(sqTwo, sqOneRef);
+    EXPECT_EQ(sqTwo, sqTwoRef);
+    EXPECT_NE(sqOneRef, sqTwoRef);
 }

 TEST(TestManager, TestMultipleTensorsAtOnce)
@ -89,9 +89,10 @@ TEST(TestManager, TestMultipleTensorsAtOnce)

    kp::Manager mgr;

-    std::weak_ptr<kp::Sequence> sqWeakPtr =
+    std::shared_ptr<kp::Sequence> sq =
      mgr.getOrCreateManagedSequence("newSequence");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+
+    {
        sq->begin();

        sq->record<kp::OpTensorCreate>({ tensorLHS, tensorRHS, tensorOutput });
@ -100,14 +101,13 @@ TEST(TestManager, TestMultipleTensorsAtOnce)
        EXPECT_TRUE(tensorRHS->isInit());
        EXPECT_TRUE(tensorOutput->isInit());

-        sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
+        sq->record<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });

        sq->record<kp::OpTensorSyncLocal>({ tensorOutput });

        sq->end();
        sq->eval();
    }
-    sqWeakPtr.reset();

    EXPECT_EQ(tensorOutput->data(), std::vector<float>({ 0, 4, 12 }));
 }
--- a/test/TestMultipleAlgoExecutions.cpp
+++ b/test/TestMultipleAlgoExecutions.cpp
@ -19,18 +19,19 @@ TEST(TestMultipleAlgoExecutions, SingleSequenceRecord)
          pa[index] = pa[index] + 1;
      })");

-    std::weak_ptr<kp::Sequence> sqWeakPtr =
+    std::shared_ptr<kp::Sequence> sq =
      mgr.getOrCreateManagedSequence("newSequence");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+
+    {
        sq->begin();

        sq->record<kp::OpTensorCreate>({ tensorA });

-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
          { tensorA }, std::vector<char>(shader.begin(), shader.end()));
-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
          { tensorA }, std::vector<char>(shader.begin(), shader.end()));
-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
          { tensorA }, std::vector<char>(shader.begin(), shader.end()));

        sq->record<kp::OpTensorSyncLocal>({ tensorA });
@ -38,7 +39,6 @@ TEST(TestMultipleAlgoExecutions, SingleSequenceRecord)
        sq->end();
        sq->eval();
    }
-    sqWeakPtr.reset();

    EXPECT_EQ(tensorA->data(), std::vector<float>({ 3, 3, 3 }));
 }
@ -58,9 +58,9 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords)
          pa[index] = pa[index] + 1;
      })");

-    std::shared_ptr<kp::Sequence> sqTensor = mgr.createManagedSequence().lock();
+    std::shared_ptr<kp::Sequence> sqTensor = mgr.createManagedSequence();

-    std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+    std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();

    // First create the tensor in a separate sequence
    sqTensor->begin();
@ -70,20 +70,20 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords)

    // Then perform the computations
    sq->begin();
-    sq->record<kp::OpAlgoBase<3, 1, 1>>(
-      { tensorA }, std::vector<char>(shader.begin(), shader.end()));
+    sq->record<kp::OpAlgoBase>({ tensorA },
+                               std::vector<char>(shader.begin(), shader.end()));
    sq->end();
    sq->eval();

    sq->begin();
-    sq->record<kp::OpAlgoBase<3, 1, 1>>(
-      { tensorA }, std::vector<char>(shader.begin(), shader.end()));
+    sq->record<kp::OpAlgoBase>({ tensorA },
+                               std::vector<char>(shader.begin(), shader.end()));
    sq->end();
    sq->eval();

    sq->begin();
-    sq->record<kp::OpAlgoBase<3, 1, 1>>(
-      { tensorA }, std::vector<char>(shader.begin(), shader.end()));
+    sq->record<kp::OpAlgoBase>({ tensorA },
+                               std::vector<char>(shader.begin(), shader.end()));
    sq->end();
    sq->eval();

@ -111,47 +111,51 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences)
          pa[index] = pa[index] + 1;
      })");

-    std::weak_ptr<kp::Sequence> sqWeakPtr =
-      mgr.getOrCreateManagedSequence("newSequence");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence");
+
        sq->begin();

        sq->record<kp::OpTensorCreate>({ tensorA });

-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
          { tensorA }, std::vector<char>(shader.begin(), shader.end()));

        sq->end();
        sq->eval();
    }

-    std::weak_ptr<kp::Sequence> sqWeakPtr2 =
-      mgr.getOrCreateManagedSequence("newSequence2");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence2");
+
        sq->begin();

-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
          { tensorA }, std::vector<char>(shader.begin(), shader.end()));

        sq->end();
        sq->eval();
    }

-    std::weak_ptr<kp::Sequence> sqWeakPtr3 =
-      mgr.getOrCreateManagedSequence("newSequence3");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr3.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence3");
+
        sq->begin();

-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
          { tensorA }, std::vector<char>(shader.begin(), shader.end()));

        sq->end();
        sq->eval();
    }

-    std::weak_ptr<kp::Sequence> sqWeakPtr4 =
-      mgr.getOrCreateManagedSequence("newSequence5");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr4.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence5");
+
        sq->begin();

        sq->record<kp::OpTensorSyncLocal>({ tensorA });
@ -179,9 +183,10 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)
          pa[index] = pa[index] + 1;
      })");

-    std::weak_ptr<kp::Sequence> sqWeakPtr =
-      mgr.getOrCreateManagedSequence("newSequence");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence");
+
        sq->begin();

        sq->record<kp::OpTensorCreate>({ tensorA });
@ -190,12 +195,13 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)
        sq->eval();
    }

-    std::weak_ptr<kp::Sequence> sqWeakPtr2 =
-      mgr.getOrCreateManagedSequence("newSequence2");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence2");
+
        sq->begin();

-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
          { tensorA }, std::vector<char>(shader.begin(), shader.end()));

        sq->end();
@ -205,9 +211,10 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)
        sq->eval();
    }

-    std::weak_ptr<kp::Sequence> sqWeakPtr3 =
-      mgr.getOrCreateManagedSequence("newSequence3");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence3");
+
        sq->begin();

        sq->record<kp::OpTensorSyncLocal>({ tensorA });
@ -252,7 +259,7 @@ TEST(TestMultipleAlgoExecutions, ManagerEvalMultSourceStrOpCreate)
        }
      )");

-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
      { tensorInA, tensorInB, tensorOut },
      std::vector<char>(shader.begin(), shader.end()));

@ -289,7 +296,7 @@ TEST(TestMultipleAlgoExecutions, ManagerEvalMultSourceStrMgrCreate)
        }
      )");

-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
      { tensorInA, tensorInB, tensorOut },
      std::vector<char>(shader.begin(), shader.end()));

--- a/test/TestOpAlgoLoopsPassingData.cpp
+++ b/test/TestOpAlgoLoopsPassingData.cpp
@ -30,10 +30,10 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies)
        }
    )");

-    std::weak_ptr<kp::Sequence> sqWeakPtr =
-      mgr.getOrCreateManagedSequence("default");
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("default");

-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
        sq->begin();

        sq->record<kp::OpTensorCreate>({ tensorA, tensorB });
@ -43,13 +43,13 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies)
        sq->eval();
    }

-    std::weak_ptr<kp::Sequence> sqWeakPtr2 =
-      mgr.getOrCreateManagedSequence("run");
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("run");

-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
        sq->begin();

-        sq->record<kp::OpAlgoBase<>>(
+        sq->record<kp::OpAlgoBase>(
          { tensorA, tensorB },
          std::vector<char>(shader.begin(), shader.end()));

@ -61,10 +61,10 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies)
        }
    }

-    std::weak_ptr<kp::Sequence> sqWeakPtr3 =
-      mgr.getOrCreateManagedSequence("export");
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("export");

-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr3.lock()) {
        sq->begin();

        sq->record<kp::OpTensorSyncLocal>({ tensorA, tensorB });
--- a/test/TestOpShadersFromStringAndFile.cpp
+++ b/test/TestOpShadersFromStringAndFile.cpp
@ -28,7 +28,7 @@ TEST(TestOpAlgoBase, ShaderRawDataFromConstructor)
        }
    )");

-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
      { tensorA, tensorB }, std::vector<char>(shader.begin(), shader.end()));

    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
@ -45,7 +45,7 @@ TEST(TestOpAlgoBase, ShaderCompiledDataFromConstructor)
    std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor({ 0, 0, 0 }) };
    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });

-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
      { tensorA, tensorB },
      std::vector<char>(
        kp::shader_data::test_shaders_glsl_test_op_custom_shader_comp_spv,
@ -67,7 +67,7 @@ TEST(TestOpAlgoBase, ShaderRawDataFromFile)
    std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor({ 0, 0, 0 }) };
    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });

-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
      { tensorA, tensorB }, "test/shaders/glsl/test_op_custom_shader.comp");

    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
@ -84,7 +84,7 @@ TEST(TestOpAlgoBase, ShaderCompiledDataFromFile)
    std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor({ 0, 0, 0 }) };
    mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });

-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
      { tensorA, tensorB }, "test/shaders/glsl/test_op_custom_shader.comp.spv");

    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
--- a/test/TestSequence.cpp
+++ b/test/TestSequence.cpp
@ -7,10 +7,10 @@ TEST(TestSequence, CmdBufSequenceBeginEnd)
 {
    kp::Manager mgr;

-    std::weak_ptr<kp::Sequence> sqWeakPtr =
-      mgr.getOrCreateManagedSequence("newSequence");
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence");

-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
        EXPECT_TRUE(sq->eval());
        EXPECT_TRUE(!sq->isRecording());
        EXPECT_TRUE(sq->begin());
@ -24,3 +24,18 @@ TEST(TestSequence, CmdBufSequenceBeginEnd)
        EXPECT_TRUE(sq->eval());
    }
 }
+
+TEST(TestSequence, SequenceDestructorViaManager) // Checks a sequence handle that outlives the manager it came from.
+{
+    std::shared_ptr<kp::Sequence> sq = nullptr; // Declared outside the inner scope so it stays usable after mgr is gone.
+
+    {
+        kp::Manager mgr;
+
+        sq = mgr.getOrCreateManagedSequence("newSequence");
+        // While the manager is still in scope, the sequence it returned reports itself as initialised.
+        EXPECT_TRUE(sq->isInit());
+    }
+    // mgr has left scope here; the surviving handle is expected to report uninitialised (presumably torn down by the manager's destructor; confirm in Manager).
+    EXPECT_FALSE(sq->isInit());
+}
--- a/test/TestTensor.cpp
+++ b/test/TestTensor.cpp
@ -24,7 +24,7 @@ TEST(TestTensor, CopyFromHostData)
    kp::Manager mgr;

    if (std::shared_ptr<kp::Sequence> sq =
-          mgr.getOrCreateManagedSequence("new").lock()) {
+          mgr.getOrCreateManagedSequence("new")) {
        sq->begin();

        sq->record<kp::OpTensorCreate>({ tensorA, tensorB });
				`@ -0,0 +1 @@`
				`Subproject commit 06a54018c8a9fd9a7be5f5b56414b5da9259f637`