diff --git a/.ccls b/.ccls
index 2ce15d72f..f215ea9d1 100644
--- a/.ccls
+++ b/.ccls
@@ -13,6 +13,7 @@
 -DDEBUG=1
 -DKOMPUTE_INCLUDE_FOR_SYNTAX
 
+-I./python/pybind11/include/
 -I./external/Vulkan-Headers/include/
 -I./external/googletest/googletest/include/
 -I./external/spdlog/include/
diff --git a/.gitmodules b/.gitmodules
index 1c5db0adc..33549db54 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,6 @@
 	path = external/spdlog
 	url = https://github.com/gabime/spdlog
 	branch = v1.8.1
+[submodule "python/pybind11"]
+	path = python/pybind11
+	url = https://github.com/pybind/pybind11
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 52e45fcf9..454876d4e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.4.1)
-project(kompute VERSION 0.3.0)
+project(kompute VERSION 0.4.1)
 
 set(CMAKE_CXX_STANDARD 14)
 
@@ -13,6 +13,7 @@ option(KOMPUTE_OPT_BUILD_SHADERS "Enable if you want to re-build all shader file
 option(KOMPUTE_OPT_BUILD_SINGLE_HEADER "Enable if you want to build the single header file" 0)
 option(KOMPUTE_OPT_INSTALL "Enable if you want to enable installation" 0)
 # Build options
+option(KOMPUTE_OPT_BUILD_PYTHON "Enable if you want to build python bindings" 0)
 option(KOMPUTE_OPT_ENABLE_SPDLOG "Extra compile flags for Kompute, see docs for full list" 0)
 option(KOMPUTE_OPT_REPO_SUBMODULE_BUILD, "Use the submodule repos instead of external package manager" 0)
 option(KOMPUTE_OPT_ANDOID_BUILD "Enable android compilation flags required" 0)
@@ -43,12 +44,16 @@ endfunction()
 
 add_subdirectory(src)
 
+if(KOMPUTE_OPT_BUILD_TESTS)
+    add_subdirectory(test)
+endif()
+
 if(KOMPUTE_OPT_BUILD_DOCS)
     set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/config" ${CMAKE_MODULE_PATH})
     add_subdirectory(docs)
 endif()
 
-if(KOMPUTE_OPT_BUILD_TESTS)
-    add_subdirectory(test)
+if(KOMPUTE_OPT_BUILD_PYTHON)
+    add_subdirectory(python)
 endif()
 
diff --git a/README.md b/README.md
index 2ff7e0f9a..b745ccfbf 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 
-![GitHub](https://img.shields.io/badge/Version-0.4.0-green.svg)
+![GitHub](https://img.shields.io/badge/Version-0.4.1-green.svg)
 ![GitHub](https://img.shields.io/badge/C++-14—20-purple.svg)
 ![GitHub](https://img.shields.io/badge/Build-cmake-red.svg)
 ![GitHub](https://img.shields.io/badge/Python-3.5—3.8-blue.svg)
@@ -15,7 +15,7 @@
 <td>
 
 <h1>Vulkan Kompute</h1>
-<h3>The General Purpose Vulkan Compute Framework.</h3>
+<h3>The General Purpose Vulkan Compute Framework for C++ and Python.</h3>
 
 </td>
 
@@ -29,10 +29,10 @@
 
 ## Principles & Features
 
-* [Single header](#setup) library for simple import to your project
-* [Documentation](https://kompute.cc) leveraging doxygen and sphinx 
-* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) capabilities with multi-queue command submission
-* [Non-Vulkan naming conventions](#architectural-overview) to disambiguate Vulkan vs Kompute components
+* [Single header](#setup) for simple import with flexible build-system configuration
+* Multi-language support with C++ as core SDK as well as [optimized Python bindings](#python-package)
+* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) support through GPU family queues
+* [Mobile enabled](#mobile-enabled) with examples in Android Studio across several architectures
 * BYOV: [Bring-your-own-Vulkan design](#motivations) to play nice with existing Vulkan applications
 * Explicit relationships for GPU and host [memory ownership and memory management](https://kompute.cc/overview/memory-management.html)
 * [Short code examples](#simple-examples) showing the core features 
@@ -118,7 +118,7 @@ int main() {
     mgr.evalOpAwaitDefault();
 
     // 5. Create managed sequence to submit batch operations to the CPU
-    std::shared_ptr<kp::Sequence> sq = mgr.getOrCreateManagedSequence("seq").lock();
+    std::shared_ptr<kp::Sequence> sq = mgr.getOrCreateManagedSequence("seq");
 
     // 5.1. Explicitly begin recording batch commands
     sq->begin();
@@ -255,13 +255,79 @@ You can also access the <a href="https://github.com/EthicalML/vulkan-kompute/tre
 </tr>
 </table>
 
-## Motivations
+## Python Package
 
-This project started after seeing that a lot of new and renowned ML & DL projects like Pytorch, Tensorflow, Alibaba DNN, Tencent NCNN - among others - have either integrated or are looking to integrate the Vulkan SDK to add mobile (and cross-vendor) GPU support.
+Besides the C++ core SDK you can also use the Python package of Kompute, which exposes the same core functionality and supports interoperability with Python objects such as lists and NumPy arrays.
 
-The Vulkan SDK offers a great low level interface that enables for highly specialized optimizations - however it comes at a cost of highly verbose code which requires 500-2000 lines of code to even begin writing application code. This has resulted in each of these projects having to implement the same baseline to abstract the non-compute related features of Vulkan. This large amount of non-standardised boiler-plate can result in limited knowledge transfer, higher chance of unique framework implementation bugs being introduced, etc.
+You can install from the repository by running:
 
-We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface (as it's incredibly well designed) but to augment it with a direct focus on Vulkan's GPU computing capabilities. [This article](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) provides a high level overview of the motivations of Kompute, together with a set of hands on examples that introduce both GPU computing as well as the core Vulkan Kompute architecture.
+```
+pip install .
+```
+
+For further details you can read the [Python Package documentation](https://kompute.cc/overview/python-package.html) or the [Python Class Reference documentation](https://kompute.cc/overview/python-reference.html).
+
+### Python Example (Simple)
+
+Then you can interact with it from your interpreter. Below is the same sample as above "Your First Kompute (Simple Version)" but in Python:
+
+```python
+mgr = Manager()
+
+# Can be initialized with List[] or np.Array
+tensor_in_a = Tensor([2, 2, 2])
+tensor_in_b = Tensor([1, 2, 3])
+tensor_out = Tensor([0, 0, 0])
+
+mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+shaderFilePath = "shaders/glsl/opmult.comp"
+mgr.eval_async_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+
+# Alternatively can pass raw string/bytes:
+# shaderFileData = """ shader code here... """
+# mgr.eval_algo_data_def([tensor_in_a, tensor_in_b, tensor_out], list(shaderFileData))
+
+mgr.eval_await_def()
+
+mgr.eval_tensor_sync_local_def([tensor_out])
+
+assert tensor_out.data() == [2.0, 4.0, 6.0]
+```
+
+### Python Example (Extended)
+
+Similarly you can find the same extended example as above:
+
+```python
+mgr = Manager(0, [2])
+
+# Can be initialized with List[] or np.Array
+tensor_in_a = Tensor([2, 2, 2])
+tensor_in_b = Tensor([1, 2, 3])
+tensor_out = Tensor([0, 0, 0])
+
+shaderFilePath = "../../shaders/glsl/opmult.comp"
+
+mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+seq = mgr.create_sequence("op")
+
+mgr.eval_async_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+mgr.eval_await_def()
+
+seq.begin()
+seq.record_tensor_sync_local([tensor_in_a])
+seq.record_tensor_sync_local([tensor_in_b])
+seq.record_tensor_sync_local([tensor_out])
+seq.end()
+
+seq.eval()
+
+assert tensor_out.data() == [2.0, 4.0, 6.0]
+```
+
+For further details you can read the [Python Package documentation](https://kompute.cc/overview/python-package.html) or the [Python Class Reference documentation](https://kompute.cc/overview/python-reference.html).
 
 ## More examples
 
@@ -281,6 +347,7 @@ We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface
 * [Android NDK Mobile Kompute ML Application](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617)
 * [Game Development Kompute ML in Godot Engine](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0)
 
+
 ## Build Overview
 
 The build system provided uses `cmake`, which allows for cross platform builds.
@@ -344,3 +411,11 @@ make mk_cmake MK_BUILD_TYPE="Release"
 make mk_run_tests
 ```
 
+## Motivations
+
+This project started after seeing that a lot of new and renowned ML & DL projects like Pytorch, Tensorflow, Alibaba DNN, Tencent NCNN - among others - have either integrated or are looking to integrate the Vulkan SDK to add mobile (and cross-vendor) GPU support.
+
+The Vulkan SDK offers a great low level interface that enables for highly specialized optimizations - however it comes at a cost of highly verbose code which requires 500-2000 lines of code to even begin writing application code. This has resulted in each of these projects having to implement the same baseline to abstract the non-compute related features of Vulkan. This large amount of non-standardised boiler-plate can result in limited knowledge transfer, higher chance of unique framework implementation bugs being introduced, etc.
+
+We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface (as it's incredibly well designed) but to augment it with a direct focus on Vulkan's GPU computing capabilities. [This article](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) provides a high level overview of the motivations of Kompute, together with a set of hands on examples that introduce both GPU computing as well as the core Vulkan Kompute architecture.
+
diff --git a/docs/conf.py b/docs/conf.py
index 2c6eb74e0..1771846e3 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -16,13 +16,16 @@
 
 
 # -- Project information -----------------------------------------------------
+import sys
+import os
+import kp
 
 project = 'Vulkan Kompute'
 copyright = '2020, The Institute for Ethical AI & Machine Learning'
 author = 'Alejandro Saucedo'
 
 # The full version, including alpha/beta/rc tags
-release = '0.4.0'
+release = '0.4.1'
 
 
 # -- General configuration ---------------------------------------------------
@@ -31,6 +34,7 @@ release = '0.4.0'
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
+    "sphinx.ext.autodoc",
     # Creates .nojekyll config
     'sphinx.ext.githubpages',
     # Integrates with doxygen
diff --git a/docs/index.rst b/docs/index.rst
index 60d01f21b..340b3458b 100755
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -11,13 +11,15 @@ Index
     :maxdepth: 2
     :titlesonly:
 
-    Class Documentation and C++ Reference <overview/reference>
-    Advanced Examples <overview/advanced-examples>
+    Simple & Advanced Examples <overview/advanced-examples>
+    Python Package Overview <overview/python-package>
     Asynchronous & Parallel Operations <overview/async-parallel>
     Memory Management Principles <overview/memory-management>
     Build System Deep Dive <overview/build-system>
     Converting GLSL/HLSL Shaders to C++ Headers <overview/shaders-to-headers>
     Mobile App Integration (Android) <overview/mobile-android>
     Game Engine Integration (Godot Engine) <overview/game-engine-godot>
+    Python Class Documentation & Reference <overview/python-reference>
+    C++ Class Documentation & Reference <overview/reference>
     Code Index <genindex>
 
diff --git a/docs/overview/python-package.rst b/docs/overview/python-package.rst
new file mode 100644
index 000000000..74e0cba91
--- /dev/null
+++ b/docs/overview/python-package.rst
@@ -0,0 +1,91 @@
+
+Python Package Overview
+=======================
+
+This section provides an overview of the Python Package from a functionality perspective. If you wish to see all the classes and their respective functions you can find that in the :doc:`Python Class Reference Section <python-reference>`.
+
+Below is a diagram that provides insights on the relationship between Vulkan Kompute objects and Vulkan resources, which primarily encompass ownership of CPU and/or GPU memory.
+
+.. image:: ../images/kompute-architecture.jpg
+   :width: 70%
+
+Python Components
+^^^^^^^^^^^^^^^^^
+
+The Python package exposes three main classes:
+
+* :class:`kp.Manager` - Manages all high level Vulkan and Kompute resources created
+* :class:`kp.Sequence` - Contains a set of recorded operations that can be reused
+* :class:`kp.Tensor` - Core data component to manage GPU and host data used in operations
+
+One thing that you will notice is that the class :class:`kp::OpBase` and all its relevant operator subclasses are not exposed in Python.
+
+This is primarily because the way to interact with the operations is through the respective :class:`kp.Manager` and :class:`kp.Sequence` functions.
+
+More specifically, it can be through the following functions:
+
+* mgr.eval_<opname> - Runs operation under an existing named sequence
+* mgr.eval_<opname>_def - Runs operation under a new anonymous sequence
+* mgr.eval_async_<opname> - Runs operation asynchronously under an existing named sequence
+* mgr.eval_async_<opname>_def - Runs operation asynchronously under a new anonymous sequence
+* seq.record_<opname> - Records operation in sequence (requires sequence to be in recording mode)
+
+You can see these operations being used in the `Simple Python example <https://kompute.cc/index.html#python-example-simple>`_ and in the `Extended Python Example <https://kompute.cc/index.html#python-example-extended>`_.
+
+Kompute Operation Capabilities
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Handling multiple processing capabilities can be done by loading compute shaders into separate sequences. The example below shows how this can be done:
+
+.. code-block:: python
+    :linenos:
+
+    from kp import Manager
+    # We'll assume we have the shader data available
+    from my_spv_shader_data import mult_shader, sum_shader
+
+    mgr = Manager()
+
+    t1 = mgr.build_tensor([2,2,2])
+    t2 = mgr.build_tensor([1,2,3])
+    t3 = mgr.build_tensor([1,2,3])
+
+    # Create multiple separate sequences
+    sq_mult = mgr.create_sequence("SQ_MULT")
+    sq_sum = mgr.create_sequence("SQ_SUM")
+    sq_sync = mgr.create_sequence("SQ_SYNC")
+
+    # Initialize sq_mult
+    sq_mult.begin()
+    sq_mult.record_algo_data([t1, t2, t3], mult_shader)
+    sq_mult.end()
+
+    sq_sum.begin()
+    sq_sum.record_algo_data([t3, t2, t1], sum_shader)
+    sq_sum.end()
+
+    sq_sync.begin()
+    sq_sync.record_tensor_sync_local([t1, t3])
+    sq_sync.end()
+
+    # Run multiple iterations
+    for i in range(10):
+        sq_mult.eval()
+        sq_sum.eval()
+
+    sq_sync.eval()
+
+    print(t1.data(), t2.data(), t3.data())
+
+
+Package Installation
+^^^^^^^^^^^^^^^^^^^^
+
+The package can be installed through the top level `setup.py` by running:
+
+.. code-block:: bash
+
+    pip install .
+
+
+
diff --git a/docs/overview/python-reference.rst b/docs/overview/python-reference.rst
new file mode 100644
index 000000000..0a8eb7a23
--- /dev/null
+++ b/docs/overview/python-reference.rst
@@ -0,0 +1,44 @@
+
+
+Python Class Documentation & Reference
+======================================
+
+This section provides a breakdown of the Python classes and what each of their functions provides.
+Below is a diagram that provides insights on the relationship between Vulkan Kompute objects and Vulkan resources, which primarily encompass ownership of CPU and/or GPU memory.
+
+.. image:: ../images/kompute-architecture.jpg
+   :width: 70%
+
+Manager
+-------
+
+The Kompute Manager provides a high level interface to simplify interaction with underlying :class:`kp.Sequence` of Operations.
+
+.. autoclass:: kp.Manager
+   :members:
+
+
+Sequence
+--------
+
+The Kompute Sequence consists of batches of Kompute Operations, which are executed on a respective GPU queue. The execution of sequences can be synchronous or asynchronous, and it can be coordinated through its respective Vulkan Fence.
+
+.. autoclass:: kp.Sequence
+   :members:
+
+
+Tensor
+-------
+
+The Kompute Tensor is the atomic unit in Kompute, and it is used primarily for handling Host and GPU Device data.
+
+.. autoclass:: kp.Tensor
+   :members:
+
+
+TensorType
+----------
+
+.. automodule:: kp
+   :members:
+
diff --git a/examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp b/examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp
index a7a18c849..e22f2aa00 100755
--- a/examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp
+++ b/examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp
@@ -44,14 +44,14 @@ void KomputeModelML::train(std::vector<float> yData, std::vector<float> xIData,
         {
 
             std::shared_ptr<kp::Sequence> sqTensor =
-              mgr.createManagedSequence().lock();
+              mgr.createManagedSequence();
 
             sqTensor->begin();
             sqTensor->record<kp::OpTensorCreate>(params);
             sqTensor->end();
             sqTensor->eval();
 
-            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
 
             // Record op algo base
             sq->begin();
@@ -60,11 +60,11 @@ void KomputeModelML::train(std::vector<float> yData, std::vector<float> xIData,
 
 #ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
             // Newer versions of Android are able to use shaderc to read raw string
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                     params, std::vector<char>(LR_SHADER.begin(), LR_SHADER.end()));
 #else
             // Older versions of Android require the SPIRV binary directly
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                     params, std::vector<char>(
                             kp::shader_data::shaders_glsl_logisticregression_comp_spv,
                             kp::shader_data::shaders_glsl_logisticregression_comp_spv
diff --git a/examples/array_multiplication/CMakeLists.txt b/examples/array_multiplication/CMakeLists.txt
index 5aeebb450..0b648382e 100644
--- a/examples/array_multiplication/CMakeLists.txt
+++ b/examples/array_multiplication/CMakeLists.txt
@@ -3,20 +3,42 @@ project(kompute_array_mult VERSION 0.1.0)
 
 set(CMAKE_CXX_STANDARD 14)
 
+option(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE "Enable if you prefer to use your installed Kompute library" 0)
 option(KOMPUTE_OPT_ENABLE_SPDLOG "Extra compile flags for Kompute, see docs for full list" 0)
 set(KOMPUTE_EXTRA_CXX_FLAGS "" CACHE STRING "Extra compile flags for Kompute, see docs for full list")
 
+if(KOMPUTE_OPT_ENABLE_SPDLOG)
+    set(KOMPUTE_EXTRA_CXX_FLAGS "${KOMPUTE_EXTRA_CXX_FLAGS} -DKOMPUTE_ENABLE_SPDLOG=1")
+endif()
+
 # It is necessary to pass the DEBUG or RELEASE flag accordingly to Kompute
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DRELEASE=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
 
-find_package(kompute REQUIRED)
+if(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE)
+    find_package(kompute REQUIRED)
+else()
+    add_subdirectory(../../ ${CMAKE_CURRENT_BINARY_DIR}/kompute_build)
+endif()
+
 find_package(Vulkan REQUIRED)
 
+if(KOMPUTE_OPT_ENABLE_SPDLOG)
+    find_package(spdlog REQUIRED)
+endif()
+
 add_executable(kompute_array_mult
     src/Main.cpp)
 
 target_link_libraries(kompute_array_mult
     kompute::kompute
-    Vulkan::Vulkan
-)
+    Vulkan::Vulkan)
+
+include_directories(
+        ../../single_include/)
+
+if(KOMPUTE_OPT_ENABLE_SPDLOG)
+    target_link_libraries(kompute_array_mult
+        spdlog::spdlog)
+endif()
+
diff --git a/examples/array_multiplication/README.md b/examples/array_multiplication/README.md
index 9838b7217..931c7d639 100644
--- a/examples/array_multiplication/README.md
+++ b/examples/array_multiplication/README.md
@@ -6,14 +6,32 @@ This example is structured such that you will be able to extend it for your proj
 
 It contains a cmake build configuration that can be used in your production applications.
 
+## Building the example
+
+You will notice that it's a standalone project, so you can re-use it for your application.
+
+This project has the option to either import the Kompute dependency relative to the project or use your existing installation of Kompute.
+
+To build you just need to run the cmake command in this folder as follows:
+
+```
+cmake \
+    -Bbuild
+```
+
+You can pass the following optional parameters based on your desired configuration:
+* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`.
+* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter 
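+* If you wish to load the shader from a raw GLSL string instead of SPIR-V bytes, define the `KOMPUTE_ANDROID_SHADER_FROM_STRING` preprocessor macro, e.g. by passing `-DKOMPUTE_EXTRA_CXX_FLAGS="-DKOMPUTE_ANDROID_SHADER_FROM_STRING"`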
+
 ## Pre-requisites
 
 In order to run this example, you will need the following dependencies:
 
 * REQUIRED
-    + Vulkan Kompute library must be accessible
     + The Vulkan SDK must be installed
 * OPTIONAL
+    + An installed Vulkan Kompute library (by default the example builds Kompute from the source directory)
     + SPDLOG - for logging
     + FMT - for text formatting
 
@@ -25,50 +43,5 @@ For the other libraries, because they are optional you can just make sure you bu
 
 Alternatively you can use package managers such as vcpkg to help you install them, although to simplify things you can start without the dependencies first.
 
-## Set Up Vulkan Kompute Dependency
 
-You have multiple options to set up Vulkan Kompute. The easiest is to perform a local installation.
-
-For this, you will want to go to the main repo and run the following cmake command, which will configure it without SPDLOG by default.
-
-```
-cmake \
-    -Bbuild
-```
-
-You can pass the following optional parameters based on your desired configuration:
-* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_ENABLE_SPDLOG=1`.
-* If you wish to perform the installation on the local folder instead of in your system you can use `-DCMAKE_INSTALL_PREFIX="build/src/CMakeFiles/Export/"` which will basically ensure that the final files are created in the local directory.
-* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter 
-
-Then you can proceed to run the installation:
-
-* For Windows / Visual Studio you just have to build `INSTALL.vcxproj`
-* For Linux you can just run the `install` target via `make -C build install`
-
-You also have the option to build as `Release` or `Debug` - just make sure that you build your example with the same build/debug flags as required.
-
-## Building the example
-
-Now that you've set up the dependencies / installation of Vulkan Kompute you can build this example.
-
-You will notice that it's a standalone project, so you can re-use it for your application.
-
-To build you just need to run the cmake command in this folder as follows:
-
-```
-cmake \
-    -Bbuild
-```
-
-Make sure to pass the required flags depending on the configuration above:
-* If you built with Debug make sure you build your example with Debug as well
-* If you installed in the local folder, make sure you pass the CMAKE_PREFIX_PATH pointing to the respective folder (e.g. `-DCMAKE_PREFIX_PATH=../../build/src/CMakeFiles/Export/lib/cmake/kompute/` if parent folder is main repo).
-* If you built Vulkan Kompute with spdlog enabled, make sure to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`
-* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter 
-
-Now you just have to build your application as above:
-
-* For Windows / Visual Studio you just have to build and run `kompute_array_mult.vcxproj`
-* For Linux you can just run the `kompute_array_mult` target via `make -C build kompute_array_mult`
 
diff --git a/examples/array_multiplication/src/Main.cpp b/examples/array_multiplication/src/Main.cpp
index f3587cae8..14b58cba9 100755
--- a/examples/array_multiplication/src/Main.cpp
+++ b/examples/array_multiplication/src/Main.cpp
@@ -18,6 +18,7 @@ int main()
     auto tensorInB = mgr.buildTensor({ 0.0, 1.0, 2.0 });
     auto tensorOut = mgr.buildTensor({ 0.0, 0.0, 0.0 });
 
+#ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
     std::string shader(R"(
         // The version to use 
         #version 450
@@ -37,9 +38,17 @@ int main()
         }
       )");
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
             { tensorInA, tensorInB, tensorOut },
             std::vector<char>(shader.begin(), shader.end()));
+#else
+    mgr.evalOpDefault<kp::OpAlgoBase>(
+            { tensorInA, tensorInB, tensorOut },
+            std::vector<char>(
+            kp::shader_data::shaders_glsl_opmult_comp_spv,
+            kp::shader_data::shaders_glsl_opmult_comp_spv
+                + kp::shader_data::shaders_glsl_opmult_comp_spv_len));
+#endif
 
     mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorOut});
 
diff --git a/examples/godot_examples/custom_module/kompute_summator/KomputeSummatorNode.h b/examples/godot_examples/custom_module/kompute_summator/KomputeSummatorNode.h
index 5bc201a90..1d94da9a5 100644
--- a/examples/godot_examples/custom_module/kompute_summator/KomputeSummatorNode.h
+++ b/examples/godot_examples/custom_module/kompute_summator/KomputeSummatorNode.h
@@ -24,7 +24,7 @@ protected:
 
 private:
     kp::Manager mManager;
-    std::weak_ptr<kp::Sequence> mSequence;
+    std::shared_ptr<kp::Sequence> mSequence;
     std::shared_ptr<kp::Tensor> mPrimaryTensor;
     std::shared_ptr<kp::Tensor> mSecondaryTensor;
 };
diff --git a/examples/godot_examples/gdnative_shared/src/KomputeSummator.cpp b/examples/godot_examples/gdnative_shared/src/KomputeSummator.cpp
index f64e0d088..788486e82 100644
--- a/examples/godot_examples/gdnative_shared/src/KomputeSummator.cpp
+++ b/examples/godot_examples/gdnative_shared/src/KomputeSummator.cpp
@@ -16,12 +16,7 @@ void KomputeSummator::add(float value) {
     // Set the new data in the local device
     this->mSecondaryTensor->setData({value});
     // Execute recorded sequence
-    if (std::shared_ptr<kp::Sequence> sq = this->mSequence.lock()) {
-        sq->eval();
-    }
-    else {
-        throw std::runtime_error("Sequence pointer no longer available");
-    }
+    this->mSequence->eval();
 }
 
 void KomputeSummator::reset() {
@@ -38,9 +33,7 @@ void KomputeSummator::_init() {
     this->mSequence = this->mManager.getOrCreateManagedSequence("AdditionSeq");
 
     // We now record the steps in the sequence
-    if (std::shared_ptr<kp::Sequence> sq = this->mSequence.lock())
     {
-
         std::string shader(R"(
             #version 450
 
@@ -55,26 +48,23 @@ void KomputeSummator::_init() {
             }
         )");
 
-        sq->begin();
+        this->mSequence->begin();
 
         // First we ensure secondary tensor loads to GPU
         // No need to sync the primary tensor as it should not be changed
-        sq->record<kp::OpTensorSyncDevice>(
+        this->mSequence->record<kp::OpTensorSyncDevice>(
                 { this->mSecondaryTensor });
 
         // Then we run the operation with both tensors
-        sq->record<kp::OpAlgoBase<>>(
+        this->mSequence->record<kp::OpAlgoBase>(
             { this->mPrimaryTensor, this->mSecondaryTensor }, 
             std::vector<char>(shader.begin(), shader.end()));
 
         // We map the result back to local 
-        sq->record<kp::OpTensorSyncLocal>(
+        this->mSequence->record<kp::OpTensorSyncLocal>(
                 { this->mPrimaryTensor });
 
-        sq->end();
-    }
-    else {
-        throw std::runtime_error("Sequence pointer no longer available");
+        this->mSequence->end();
     }
 }
 
diff --git a/examples/godot_examples/gdnative_shared/src/KomputeSummator.hpp b/examples/godot_examples/gdnative_shared/src/KomputeSummator.hpp
index 9131e7f57..7f6b42e82 100644
--- a/examples/godot_examples/gdnative_shared/src/KomputeSummator.hpp
+++ b/examples/godot_examples/gdnative_shared/src/KomputeSummator.hpp
@@ -26,7 +26,7 @@ public:
 
 private:
     kp::Manager mManager;
-    std::weak_ptr<kp::Sequence> mSequence;
+    std::shared_ptr<kp::Sequence> mSequence;
     std::shared_ptr<kp::Tensor> mPrimaryTensor;
     std::shared_ptr<kp::Tensor> mSecondaryTensor;
 };
diff --git a/examples/godot_logistic_regression/custom_module/kompute_model_ml/KomputeModelMLNode.cpp b/examples/godot_logistic_regression/custom_module/kompute_model_ml/KomputeModelMLNode.cpp
index fe0a911a5..f583d910f 100644
--- a/examples/godot_logistic_regression/custom_module/kompute_model_ml/KomputeModelMLNode.cpp
+++ b/examples/godot_logistic_regression/custom_module/kompute_model_ml/KomputeModelMLNode.cpp
@@ -51,14 +51,14 @@ void KomputeModelMLNode::train(Array yArr, Array xIArr, Array xJArr) {
         kp::Manager mgr;
 
             std::shared_ptr<kp::Sequence> sqTensor =
-              mgr.createManagedSequence().lock();
+              mgr.createManagedSequence();
 
             sqTensor->begin();
             sqTensor->record<kp::OpTensorCreate>(params);
             sqTensor->end();
             sqTensor->eval();
 
-            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
 
             // Record op algo base
             sq->begin();
@@ -67,11 +67,11 @@ void KomputeModelMLNode::train(Array yArr, Array xIArr, Array xJArr) {
 
 #ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
             // Newer versions of Android are able to use shaderc to read raw string
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                     params, std::vector<char>(LR_SHADER.begin(), LR_SHADER.end()));
 #else
             // Older versions of Android require the SPIRV binary directly
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                     params, std::vector<char>(
                             kp::shader_data::shaders_glsl_logisticregression_comp_spv,
                             kp::shader_data::shaders_glsl_logisticregression_comp_spv
diff --git a/examples/godot_logistic_regression/gdnative_shared/src/KomputeModelML.cpp b/examples/godot_logistic_regression/gdnative_shared/src/KomputeModelML.cpp
index 174398501..4135e83ed 100644
--- a/examples/godot_logistic_regression/gdnative_shared/src/KomputeModelML.cpp
+++ b/examples/godot_logistic_regression/gdnative_shared/src/KomputeModelML.cpp
@@ -56,14 +56,14 @@ void KomputeModelML::train(Array yArr, Array xIArr, Array xJArr) {
 
         {
             std::shared_ptr<kp::Sequence> sqTensor =
-              mgr.createManagedSequence().lock();
+              mgr.createManagedSequence();
 
             sqTensor->begin();
             sqTensor->record<kp::OpTensorCreate>(params);
             sqTensor->end();
             sqTensor->eval();
 
-            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
 
             // Record op algo base
             sq->begin();
@@ -72,11 +72,11 @@ void KomputeModelML::train(Array yArr, Array xIArr, Array xJArr) {
 
 #ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
             // Newer versions of Android are able to use shaderc to read raw string
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                     params, std::vector<char>(LR_SHADER.begin(), LR_SHADER.end()));
 #else
             // Older versions of Android require the SPIRV binary directly
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                     params, std::vector<char>(
                             kp::shader_data::shaders_glsl_logisticregression_comp_spv,
                             kp::shader_data::shaders_glsl_logisticregression_comp_spv
diff --git a/examples/logistic_regression/CMakeLists.txt b/examples/logistic_regression/CMakeLists.txt
index b12e8227f..f918bbf21 100644
--- a/examples/logistic_regression/CMakeLists.txt
+++ b/examples/logistic_regression/CMakeLists.txt
@@ -3,6 +3,7 @@ project(kompute_linear_reg VERSION 0.1.0)
 
 set(CMAKE_CXX_STANDARD 14)
 
+option(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE "Enable if you prefer to use your installed Kompute library" 0)
 option(KOMPUTE_OPT_ENABLE_SPDLOG "Extra compile flags for Kompute, see docs for full list" 0)
 set(KOMPUTE_EXTRA_CXX_FLAGS "" CACHE STRING "Extra compile flags for Kompute, see docs for full list")
 
@@ -14,12 +15,16 @@ endif()
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DRELEASE=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
 
-find_package(kompute REQUIRED)
+if(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE)
+    find_package(kompute REQUIRED)
+else()
+    add_subdirectory(../../ ${CMAKE_CURRENT_BINARY_DIR}/kompute_build)
+endif()
+
 find_package(Vulkan REQUIRED)
 
 if(KOMPUTE_OPT_ENABLE_SPDLOG)
     find_package(spdlog REQUIRED)
-    find_package(fmt REQUIRED)
 endif()
 
 add_executable(kompute_linear_reg
@@ -30,11 +35,11 @@ target_link_libraries(kompute_linear_reg
     Vulkan::Vulkan
 )
 
+include_directories(
+        ../../single_include/)
+
 if(KOMPUTE_OPT_ENABLE_SPDLOG)
-    target_link_libraries(kompute_linear_reg
-        kompute::kompute
-        fmt::fmt
-        spdlog::spdlog
-    )
+    target_link_libraries(kompute_linear_reg # must name this project's add_executable target
+        spdlog::spdlog)
 endif()
 
diff --git a/examples/logistic_regression/README.md b/examples/logistic_regression/README.md
index 29aa89c01..0de7ee30a 100644
--- a/examples/logistic_regression/README.md
+++ b/examples/logistic_regression/README.md
@@ -6,54 +6,12 @@ This example is structured such that you will be able to extend it for your proj
 
 It contains a cmake build configuration that can be used in your production applications.
 
-## Pre-requisites
-
-In order to run this example, you will need the following dependencies:
-
-* REQUIRED
-    + Vulkan Kompute library must be accessible
-    + The Vulkan SDK must be installed
-* OPTIONAL
-    + SPDLOG - for logging
-    + FMT - for text formatting
-
-We will cover how you can install Vulkan Kompute in the next section.
-
-For the Vulkan SDK, the simplest way to install it is through [their website](https://vulkan.lunarg.com/sdk/home). You just have to follow the instructions for the relevant platform.
-
-For the other libraries, because they are optional you can just make sure you build and install Kompute with these disabled (this will be covered in more detail below).
-
-Alternatively you can use package managers such as vcpkg to help you install them, although to simplify things you can start without the dependencies first.
-
-## Set Up Vulkan Kompute Dependency
-
-You have multiple options to set up Vulkan Kompute. The easiest is to perform a local installation.
-
-For this, you will want to go to the main repo and run the following cmake command, which will configure it without SPDLOG by default.
-
-```
-cmake \
-    -Bbuild
-```
-
-You can pass the following optional parameters based on your desired configuration:
-* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_ENABLE_SPDLOG=1`.
-* If you wish to perform the installation on the local folder instead of in your system you can use `-DCMAKE_INSTALL_PREFIX="build/src/CMakeFiles/Export/"` which will basically ensure that the final files are created in the local directory.
-* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter 
-
-Then you can proceed to run the installation:
-
-* For Windows / Visual Studio you just have to build `INSTALL.vcxproj`
-* For Linux you can just run the `install` target via `make -C build install`
-
-You also have the option to build as `Release` or `Debug` - just make sure that you build your example with the same build/debug flags as required.
-
 ## Building the example
 
-Now that you've set up the dependencies / installation of Vulkan Kompute you can build this example.
-
 You will notice that it's a standalone project, so you can re-use it for your application.
 
+This project has the option to either import the Kompute dependency relative to the project or use your existing installation of Kompute.
+
 To build you just need to run the cmake command in this folder as follows:
 
 ```
@@ -61,14 +19,19 @@ cmake \
     -Bbuild
 ```
 
-Make sure to pass the required flags depending on the configuration above:
-* If you built with Debug make sure you build your example with Debug as well
-* If you installed in the local folder, make sure you pass the CMAKE_PREFIX_PATH pointing to the respective folder (e.g. `-DCMAKE_PREFIX_PATH=../../build/src/CMakeFiles/Export/lib/cmake/kompute/` if parent folder is main repo).
-* If you built Vulkan Kompute with spdlog enabled, make sure to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`
+You can pass the following optional parameters based on your desired configuration:
+* If you wish to build with spdlog support, you just have to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`.
 * If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter 
+* If you wish to load the shader from a raw GLSL string instead of SPIR-V bytes, you can use `-DKOMPUTE_ANDROID_SHADER_FROM_STRING`.
 
-Now you just have to build your application as above:
+## Pre-requisites
 
-* For Windows / Visual Studio you just have to build and run `kompute_linear_reg.vcxproj`
-* For Linux you can just run the `kompute_linear_reg` target via `make -C build kompute_linear_reg`
+In order to run this example, you will need the following dependencies:
+
+* REQUIRED
+    + The Vulkan SDK must be installed
+* OPTIONAL
+    + Vulkan Kompute library must be accessible (by default it uses the source directory)
+    + SPDLOG - for logging
+    + FMT - for text formatting
 
diff --git a/examples/logistic_regression/src/Main.cpp b/examples/logistic_regression/src/Main.cpp
index 853fa9d67..d3b8b3557 100755
--- a/examples/logistic_regression/src/Main.cpp
+++ b/examples/logistic_regression/src/Main.cpp
@@ -36,22 +36,30 @@ int main()
     kp::Manager mgr;
 
     std::shared_ptr<kp::Sequence> sqTensor =
-      mgr.createManagedSequence().lock();
+      mgr.createManagedSequence();
 
     sqTensor->begin();
     sqTensor->record<kp::OpTensorCreate>(params);
     sqTensor->end();
     sqTensor->eval();
 
-    std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+    std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
 
     // Record op algo base
     sq->begin();
 
     sq->record<kp::OpTensorSyncDevice>({ wIn, bIn });
 
-    sq->record<kp::OpAlgoBase<>>(
+#ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
+    sq->record<kp::OpAlgoBase>(
       params, "shaders/glsl/logistic_regression.comp");
+#else
+    sq->record<kp::OpAlgoBase>(
+        params, std::vector<char>(
+                kp::shader_data::shaders_glsl_logisticregression_comp_spv,
+                kp::shader_data::shaders_glsl_logisticregression_comp_spv
+                    + kp::shader_data::shaders_glsl_logisticregression_comp_spv_len));
+#endif
 
     sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
 
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
new file mode 100644
index 000000000..5f3036986
--- /dev/null
+++ b/python/CMakeLists.txt
@@ -0,0 +1,11 @@
+
+add_subdirectory(pybind11)
+pybind11_add_module(kp src/main.cpp)
+
+include_directories(
+        ${PROJECT_SOURCE_DIR}/single_include/)
+
+target_link_libraries(
+    kp PRIVATE
+    kompute::kompute)
+
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 000000000..7b0d89f0e
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,2 @@
+# Python Bindings for Vulkan Kompute
+
diff --git a/python/pybind11 b/python/pybind11
new file mode 160000
index 000000000..06a54018c
--- /dev/null
+++ b/python/pybind11
@@ -0,0 +1 @@
+Subproject commit 06a54018c8a9fd9a7be5f5b56414b5da9259f637
diff --git a/python/src/main.cpp b/python/src/main.cpp
new file mode 100644
index 000000000..0f10ea349
--- /dev/null
+++ b/python/src/main.cpp
@@ -0,0 +1,160 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include <kompute/Kompute.hpp>
+
+namespace py = pybind11;
+
+PYBIND11_MODULE(kp, m) {
+
+    py::enum_<kp::Tensor::TensorTypes>(m, "TensorTypes", "Enum with GPU memory types for Tensor.")
+        .value("device", kp::Tensor::TensorTypes::eDevice, "Tensor holding data in GPU memory.")
+        .value("staging", kp::Tensor::TensorTypes::eStaging, "Tensor used for transfer of data to device.")
+        .value("storage", kp::Tensor::TensorTypes::eStorage, "Tensor with host visible gpu memory.")
+        .export_values();
+
+    py::class_<kp::Tensor, std::shared_ptr<kp::Tensor>>(m, "Tensor", "Structured data used in GPU operations.")
+        .def(py::init(
+            [](const std::vector<float>& data) {
+                return std::unique_ptr<kp::Tensor>(new kp::Tensor(data));
+            }), "Initialiser with only list of data components.")
+        .def(py::init(
+            [](const std::vector<float>& data, kp::Tensor::TensorTypes tensorTypes) {
+                return std::unique_ptr<kp::Tensor>(new kp::Tensor(data, tensorTypes));
+            }), "Initialiser with list of data components and tensor GPU memory type.")
+        .def("data", &kp::Tensor::data, "Retrieves the data as a list containing the local Tensor memory data.")
+        .def("size", &kp::Tensor::size, "Retrieves the size of the Tensor data as per the local Tensor memory.")
+        .def("tensor_type", &kp::Tensor::tensorType, "Retreves the memory type of the tensor.")
+        .def("is_init", &kp::Tensor::isInit, "Checks whether the tensor GPU memory has been initialised.")
+        .def("set_data", &kp::Tensor::setData, "Overrides the data in the local Tensor memory.")
+        .def("map_data_from_host", &kp::Tensor::mapDataFromHostMemory, "Maps data into GPU memory from tensor local data.")
+        .def("map_data_into_host", &kp::Tensor::mapDataIntoHostMemory, "Maps data from GPU memory into tensor local data.");
+
+    py::class_<kp::Sequence, std::shared_ptr<kp::Sequence>>(m, "Sequence")
+        .def("init", &kp::Sequence::init, "Initialises Vulkan resources within sequence using provided device.")
+        // recording control (begin / end)
+        .def("begin", &kp::Sequence::begin, "Clears previous commands and starts recording commands in sequence which can be run in batch.")
+        .def("end", &kp::Sequence::end, "Stops listening and recording for new commands.")
+        // eval
+        .def("eval", &kp::Sequence::eval, "Executes the currently recorded commands synchronously by waiting on Vulkan Fence.")
+        .def("eval_async", &kp::Sequence::evalAsync, "Executes the currently recorded commands asynchronously.")
+        .def("eval_await", &kp::Sequence::evalAwait, "Waits until the execution finishes using Vulkan Fence.")
+        // status
+        .def("is_running", &kp::Sequence::isRunning, "Checks whether the Sequence operations are currently still executing.")
+        .def("is_rec", &kp::Sequence::isRecording, "Checks whether the Sequence is currently in recording mode.")
+        .def("is_init", &kp::Sequence::isInit, "Checks if the Sequence has been initialized")
+        // record individual operations
+        .def("record_tensor_create", &kp::Sequence::record<kp::OpTensorCreate>,
+            "Records operation to create and initialise tensor GPU memory and buffer")
+        .def("record_tensor_copy", &kp::Sequence::record<kp::OpTensorCopy>,
+            "Records operation to copy one tensor to one or many tensors")
+        .def("record_tensor_sync_device", &kp::Sequence::record<kp::OpTensorSyncDevice>,
+            "Records operation to sync tensor from local memory to GPU memory")
+        .def("record_tensor_sync_local", &kp::Sequence::record<kp::OpTensorSyncLocal>,
+            "Records operation to sync tensor(s) from GPU memory to local memory using staging tensors")
+        .def("record_algo_mult", &kp::Sequence::record<kp::OpMult>,
+            "Records operation to run multiplication compute shader to two input tensors and an output tensor")
+        .def("record_algo_file", &kp::Sequence::record<kp::OpAlgoBase, std::string>,
+            "Records an operation using a custom shader provided from a shader path")
+        .def("record_algo_data", &kp::Sequence::record<kp::OpAlgoBase, std::vector<char>>,
+            "Records an operation using a custom shader provided as raw string or spirv bytes")
+        .def("record_algo_lro", &kp::Sequence::record<kp::OpAlgoLhsRhsOut>,
+            "Records operation to run left right out operation with custom shader");
+
+    py::class_<kp::Manager>(m, "Manager")
+        .def(py::init(), "Default initializer uses device 0 and first compute compatible GPU queueFamily")
+        .def(py::init(
+            [](uint32_t physicalDeviceIndex) {
+                return std::unique_ptr<kp::Manager>(new kp::Manager(physicalDeviceIndex));
+            }), "Manager initialiser can provide specified device index but will use first compute compatible GPU queueFamily")
+        .def(py::init(
+            [](uint32_t physicalDeviceIndex, const std::vector<uint32_t>& familyQueueIndices) {
+                return std::unique_ptr<kp::Manager>(new kp::Manager(physicalDeviceIndex, familyQueueIndices));
+            }), "Manager initialiser can provide specified device and array of GPU queueFamilies to load.")
+        .def("get_create_sequence", &kp::Manager::getOrCreateManagedSequence, "Get a Sequence or create a new one with given name")
+        .def("create_sequence", &kp::Manager::createManagedSequence,
+                py::arg("name"), py::arg("queueIndex") = 0, "Create a sequence with specific name and specified index of available queues")
+        .def("build_tensor", &kp::Manager::buildTensor, 
+                py::arg("data"), py::arg("tensorType") = kp::Tensor::TensorTypes::eDevice,
+                "Build and initialise tensor")
+        // Await functions
+        .def("eval_await", &kp::Manager::evalOpAwait,
+                py::arg("sequenceName"), py::arg("waitFor") = UINT64_MAX,
+                "Awaits for asynchronous operation on a named Sequence")
+        .def("eval_await_def", &kp::Manager::evalOpAwaitDefault,
+                py::arg("waitFor") = UINT64_MAX, "Awaits for asynchronous operation on the last anonymous Sequence created")
+        // eval default
+        .def("eval_tensor_create_def", &kp::Manager::evalOpDefault<kp::OpTensorCreate>,
+            "Evaluates operation to create and initialise tensor GPU memory and buffer with new anonymous Sequence")
+        .def("eval_tensor_copy_def", &kp::Manager::evalOpDefault<kp::OpTensorCopy>,
+            "Evaluates operation to copy one tensor to one or many tensors with new anonymous Sequence")
+        .def("eval_tensor_sync_device_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncDevice>,
+            "Evaluates operation to sync tensor from local memory to GPU memory with new anonymous Sequence")
+        .def("eval_tensor_sync_local_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncLocal>,
+            "Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with new anonymous Sequence")
+        .def("eval_algo_mult_def", &kp::Manager::evalOpDefault<kp::OpMult>,
+            "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with new anonymous Sequence")
+        .def("eval_algo_file_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::string>,
+            "Evaluates an operation using a custom shader provided from a shader path with new anonymous Sequence")
+        .def("eval_algo_data_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::vector<char>>,
+            "Evaluates an operation using a custom shader provided as raw string or spirv bytes with new anonymous Sequence")
+        .def("eval_algo_lro_def", &kp::Manager::evalOpDefault<kp::OpAlgoLhsRhsOut>,
+            "Evaluates operation to run left right out operation with custom shader with new anonymous Sequence")
+        // eval
+        .def("eval_tensor_create", &kp::Manager::evalOp<kp::OpTensorCreate>,
+            "Evaluates operation to create and initialise tensor GPU memory and buffer with explicitly named Sequence")
+        .def("eval_tensor_copy", &kp::Manager::evalOp<kp::OpTensorCopy>,
+            "Evaluates operation to copy one tensor to one or many tensors with explicitly named Sequence")
+        .def("eval_tensor_sync_device", &kp::Manager::evalOp<kp::OpTensorSyncDevice>,
+            "Evaluates operation to sync tensor from local memory to GPU memory with explicitly named Sequence")
+        .def("eval_tensor_sync_local", &kp::Manager::evalOp<kp::OpTensorSyncLocal>,
+            "Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence")
+        .def("eval_algo_mult", &kp::Manager::evalOp<kp::OpMult>,
+            "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
+        .def("eval_algo_file", &kp::Manager::evalOp<kp::OpAlgoBase, std::string>,
+            "Evaluates an operation using a custom shader provided from a shader path with explicitly named Sequence")
+        .def("eval_algo_data", &kp::Manager::evalOp<kp::OpAlgoBase, std::vector<char>>,
+            "Evaluates an operation using a custom shader provided as raw string or spirv bytes with explicitly named Sequence")
+        .def("eval_algo_lro", &kp::Manager::evalOp<kp::OpAlgoLhsRhsOut>,
+            "Evaluates operation to run left right out operation with custom shader with explicitly named Sequence")
+        // eval async default
+        .def("eval_async_tensor_create_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCreate>,
+            "Evaluates asynchronously operation to create and initialise tensor GPU memory and buffer with anonymous Sequence")
+        .def("eval_async_tensor_copy_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCopy>,
+            "Evaluates asynchronously operation to copy one tensor to one or many tensors with anonymous Sequence")
+        .def("eval_async_tensor_sync_device_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncDevice>,
+            "Evaluates asynchronously operation to sync tensor from local memory to GPU memory with anonymous Sequence")
+        .def("eval_async_tensor_sync_local_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncLocal>,
+            "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with anonymous Sequence")
+        .def("eval_async_algo_mult_def", &kp::Manager::evalOpAsyncDefault<kp::OpMult>,
+            "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with anonymous Sequence")
+        .def("eval_async_algo_file_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::string>,
+            "Evaluates asynchronously an operation using a custom shader provided from a shader path with anonymous Sequence")
+        .def("eval_async_algo_data_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::vector<char>>,
+            "Evaluates asynchronously an operation using a custom shader provided as raw string or spirv bytes with anonymous Sequence")
+        .def("eval_async_algo_lro_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoLhsRhsOut>,
+            "Evaluates asynchronously operation to run left right out operation with custom shader with anonymous Sequence")
+        // eval async
+        .def("eval_async_tensor_create", &kp::Manager::evalOpAsync<kp::OpTensorCreate>,
+            "Evaluates asynchronously operation to create and initialise tensor GPU memory and buffer with explicitly named Sequence")
+        .def("eval_async_tensor_copy", &kp::Manager::evalOpAsync<kp::OpTensorCopy>,
+            "Evaluates asynchronously operation to copy one tensor to one or many tensors with explicitly named Sequence")
+        .def("eval_async_tensor_sync_device", &kp::Manager::evalOpAsync<kp::OpTensorSyncDevice>,
+            "Evaluates asynchronously operation to sync tensor from local memory to GPU memory with explicitly named Sequence")
+        .def("eval_async_tensor_sync_local", &kp::Manager::evalOpAsync<kp::OpTensorSyncLocal>,
+            "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence")
+        .def("eval_async_algo_mult", &kp::Manager::evalOpAsync<kp::OpMult>,
+            "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
+        .def("eval_async_algo_file", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::string>,
+            "Evaluates asynchronously an operation using a custom shader provided from a shader path with explicitly named Sequence")
+        .def("eval_async_algo_data", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::vector<char>>,
+            "Evaluates asynchronously an operation using a custom shader provided as raw string or spirv bytes with explicitly named Sequence")
+        .def("eval_async_algo_lro", &kp::Manager::evalOpAsync<kp::OpAlgoLhsRhsOut>,
+            "Evaluates asynchronously operation to run left right out operation with custom shader with explicitly named Sequence");
+
+#ifdef VERSION_INFO
+    m.attr("__version__") = VERSION_INFO;
+#else
+    m.attr("__version__") = "dev";
+#endif
+}
diff --git a/python/test/test_kompute.py b/python/test/test_kompute.py
new file mode 100644
index 000000000..43baf77d1
--- /dev/null
+++ b/python/test/test_kompute.py
@@ -0,0 +1,110 @@
+
+from kp import Tensor, Manager, Sequence
+
+def test_opmult():
+    """
+    Test basic OpMult operation
+    """
+
+    tensor_in_a = Tensor([2, 2, 2])
+    tensor_in_b = Tensor([1, 2, 3])
+    tensor_out = Tensor([0, 0, 0])
+
+    mgr = Manager()
+
+    mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+    mgr.eval_algo_mult_def([tensor_in_a, tensor_in_b, tensor_out])
+
+    mgr.eval_tensor_sync_local_def([tensor_out])
+
+    assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+def test_opalgobase_data():
+    """
+    Test OpAlgoBase operation with the shader passed as raw GLSL source data
+    """
+
+    tensor_in_a = Tensor([2, 2, 2])
+    tensor_in_b = Tensor([1, 2, 3])
+    tensor_out = Tensor([0, 0, 0])
+
+    mgr = Manager()
+
+    shaderData = """
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        // The input tensors bind index is relative to index in parameter passed
+        layout(set = 0, binding = 0) buffer bina { float tina[]; };
+        layout(set = 0, binding = 1) buffer binb { float tinb[]; };
+        layout(set = 0, binding = 2) buffer bout { float tout[]; };
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+            tout[index] = tina[index] * tinb[index];
+        }
+    """
+
+    mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+    mgr.eval_algo_data_def([tensor_in_a, tensor_in_b, tensor_out], list(shaderData))
+
+    mgr.eval_tensor_sync_local_def([tensor_out])
+
+    assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+
+def test_opalgobase_file():
+    """
+    Test OpAlgoBase operation with the shader loaded from a file path
+    """
+
+    tensor_in_a = Tensor([2, 2, 2])
+    tensor_in_b = Tensor([1, 2, 3])
+    tensor_out = Tensor([0, 0, 0])
+
+    mgr = Manager()
+
+    shaderFilePath = "../../shaders/glsl/opmult.comp"
+
+    mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+    mgr.eval_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+
+    mgr.eval_tensor_sync_local_def([tensor_out])
+
+    assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+def test_sequence():
+    """
+    Test recording tensor sync operations into a named Sequence and evaluating it
+    """
+
+    mgr = Manager(0, [2])
+
+    tensor_in_a = Tensor([2, 2, 2])
+    tensor_in_b = Tensor([1, 2, 3])
+    tensor_out = Tensor([0, 0, 0])
+
+    mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+    seq = mgr.create_sequence("op")
+
+    shaderFilePath = "../../shaders/glsl/opmult.comp"
+    mgr.eval_async_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+    mgr.eval_await_def()
+
+    seq.begin()
+    seq.record_tensor_sync_local([tensor_in_a])
+    seq.record_tensor_sync_local([tensor_in_b])
+    seq.record_tensor_sync_local([tensor_out])
+    seq.end()
+
+    seq.eval()
+
+    assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+if __name__ == "__main__":
+    test_sequence()
diff --git a/setup.py b/setup.py
new file mode 100644
index 000000000..0b5db2f9c
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,76 @@
+import os
+import re
+import sys
+import platform
+import subprocess
+
+from setuptools import setup, Extension
+from setuptools.command.build_ext import build_ext
+from distutils.version import LooseVersion
+
+
+class CMakeExtension(Extension):
+    def __init__(self, name, sourcedir=''):
+        Extension.__init__(self, name, sources=[])
+        self.sourcedir = os.path.abspath(sourcedir)
+
+
+class CMakeBuild(build_ext):
+    def run(self):
+        try:
+            out = subprocess.check_output(['cmake', '--version'])
+        except OSError:
+            raise RuntimeError("CMake must be installed to build the following extensions: " +
+                               ", ".join(e.name for e in self.extensions))
+
+        if platform.system() == "Windows":
+            cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1))
+            if cmake_version < '3.1.0':
+                raise RuntimeError("CMake >= 3.1.0 is required on Windows")
+
+        for ext in self.extensions:
+            self.build_extension(ext)
+
+    def build_extension(self, ext):
+        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
+        # required for auto-detection of auxiliary "native" libs
+        if not extdir.endswith(os.path.sep):
+            extdir += os.path.sep
+
+        cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
+                      '-DKOMPUTE_OPT_BUILD_PYTHON=1',
+                      '-DKOMPUTE_OPT_BUILD_SINGLE_HEADER=1',
+                      '-DPYTHON_EXECUTABLE=' + sys.executable]
+
+        cfg = 'Debug' if self.debug else 'Release'
+        build_args = ['--config', cfg]
+
+        if platform.system() == "Windows":
+            cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir)]
+            if sys.maxsize > 2**32:
+                cmake_args += ['-A', 'x64']
+            build_args += ['--', '/m']
+        else:
+            cmake_args += ['-DKOMPUTE_EXTRA_CXX_FLAGS="-fPIC"']
+            cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
+            build_args += ['--', '-j2']
+
+        env = os.environ.copy()
+        env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
+                                                              self.distribution.get_version())
+        if not os.path.exists(self.build_temp):
+            os.makedirs(self.build_temp)
+
+        subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
+        subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
+
+setup(
+    name='kp',
+    version='0.0.1',
+    author='Alejandro Saucedo',
+    description='Vulkan Kompute: Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU processing usecases.',
+    long_description='',
+    ext_modules=[CMakeExtension('kp')],
+    cmdclass=dict(build_ext=CMakeBuild),
+    zip_safe=False,
+)
diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index 8def06e4a..3ae98b483 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -1100,6 +1100,12 @@ class Sequence
      */
     bool isInit();
 
+    /**
+     * Destroys and frees the GPU resources which include the buffer and memory
+     * and sets the sequence as init=False.
+     */
+    void freeMemoryDestroyGPUResources();
+
     /**
      * Record function for operation to be added to the GPU queue in batch. This
      * template requires classes to be derived from the OpBase class. This
@@ -1301,9 +1307,9 @@ class Manager
      *
      * @param sequenceName The name for the named sequence to be retrieved or
      * created
-     * @return Weak pointer to the manager owned sequence resource
+     * @return Shared pointer to the manager owned sequence resource
      */
-    std::weak_ptr<Sequence> getOrCreateManagedSequence(
+    std::shared_ptr<Sequence> getOrCreateManagedSequence(
       std::string sequenceName);
 
     /**
@@ -1315,8 +1321,9 @@ class Manager
      * @param queueIndex The queue to use from the available queues
      * @return Weak pointer to the manager owned sequence resource
      */
-    std::weak_ptr<Sequence> createManagedSequence(std::string sequenceName = "",
-                                                  uint32_t queueIndex = 0);
+    std::shared_ptr<Sequence> createManagedSequence(
+      std::string sequenceName = "",
+      uint32_t queueIndex = 0);
 
     /**
      * Function that evaluates operation against named sequence.
@@ -1332,22 +1339,21 @@ class Manager
                 TArgs&&... params)
     {
         SPDLOG_DEBUG("Kompute Manager evalOp triggered");
-        std::weak_ptr<Sequence> sqWeakPtr =
+        std::shared_ptr<kp::Sequence> sq =
           this->getOrCreateManagedSequence(sequenceName);
 
-        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
-            sq->begin();
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
+        sq->begin();
 
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
-            sq->record<T>(tensors, std::forward<TArgs>(params)...);
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
+        sq->record<T>(tensors, std::forward<TArgs>(params)...);
 
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
-            sq->end();
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
+        sq->end();
+
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
+        sq->eval();
 
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
-            sq->eval();
-        }
         SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS");
     }
 
@@ -1385,26 +1391,21 @@ class Manager
     {
         SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered");
 
-        std::weak_ptr<Sequence> sqWeakPtr =
+        std::shared_ptr<kp::Sequence> sq =
           this->getOrCreateManagedSequence(sequenceName);
 
-        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
+        sq->begin();
 
-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
-            sq->begin();
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
+        sq->record<T>(tensors, std::forward<TArgs>(params)...);
 
-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
-            sq->record<T>(tensors, std::forward<TArgs>(params)...);
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
+        sq->end();
 
-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
-            sq->end();
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
+        sq->evalAsync();
 
-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
-            sq->evalAsync();
-        } else {
-            SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found",
-                         sequenceName);
-        }
         SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
     }
 
@@ -1620,20 +1621,17 @@ namespace kp {
  * Operation that provides a general abstraction that simplifies the use of 
  * algorithm and parameter components which can be used with shaders.
  * By default it enables the user to provide a dynamic number of tensors
- * which are then passed as inputs. 
- *
- * All of these tensors are expected to be initlaised and this is checked with throw std exception in the init function.
- *
- * See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
- * 
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * which are then passed as inputs.
  */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
 class OpAlgoBase : public OpBase
 {
   public:
+    struct KomputeWorkgroup {
+        uint32_t x; ///< Dispatch size in x; when 0, OpAlgoBase uses the first tensor's size (with y = z = 1)
+        uint32_t y; ///< Dispatch size in y; when 0, OpAlgoBase uses 1
+        uint32_t z; ///< Dispatch size in z; when 0, OpAlgoBase uses 1
+    };
+
     /**
      *  Base constructor, should not be used unless explicitly intended.
      */
@@ -1649,11 +1647,13 @@ class OpAlgoBase : public OpBase
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
      * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors);
+           std::vector<std::shared_ptr<Tensor>>& tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Constructor that enables a file to be passed to the operation with
@@ -1664,13 +1664,15 @@ class OpAlgoBase : public OpBase
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
-     * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+     * @param shaderFilePath Parameter to specify the shader to load (either in spirv or raw format)
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>>& tensors,
-           std::string shaderFilePath);
+           std::string shaderFilePath,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Constructor that enables raw shader data to be passed to the main operation
@@ -1681,12 +1683,14 @@ class OpAlgoBase : public OpBase
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
      * @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>>& tensors,
-           const std::vector<char>& shaderDataRaw);
+           const std::vector<char>& shaderDataRaw,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Default destructor, which is in charge of destroying the algorithm
@@ -1733,9 +1737,7 @@ class OpAlgoBase : public OpBase
 
     // -------------- ALWAYS OWNED RESOURCES
 
-    uint32_t mX;
-    uint32_t mY;
-    uint32_t mZ;
+    KomputeWorkgroup mKomputeWorkgroup;
 
     std::string mShaderFilePath; ///< Optional member variable which can be provided for the OpAlgoBase to find the data automatically and load for processing
     std::vector<char> mShaderDataRaw; ///< Optional member variable which can be provided to contain either the raw shader content or the spirv binary content
@@ -1745,177 +1747,6 @@ class OpAlgoBase : public OpBase
 
 } // End namespace kp
 
-// Including implementation for template class
-#ifndef OPALGOBASE_IMPL
-#define OPALGOBASE_IMPL
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors)
-  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
-
-    // The dispatch size is set up based on either explicitly provided template
-    // parameters or by default it would take the shape and size of the tensors
-    if (tX > 0) {
-        // If at least the x value is provided we use mainly the parameters
-        // provided
-        this->mX = tX;
-        this->mY = tY > 0 ? tY : 1;
-        this->mZ = tZ > 0 ? tZ : 1;
-    } else {
-        this->mX = tensors[0]->size();
-        this->mY = 1;
-        this->mZ = 1;
-    }
-    SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
-                 this->mX,
-                 this->mY,
-                 this->mZ);
-
-    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           std::string shaderFilePath)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
-
-    this->mShaderFilePath = shaderFilePath;
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           const std::vector<char>& shaderDataRaw)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
-
-    this->mShaderDataRaw = shaderDataRaw;
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase init called");
-
-    if (this->mTensors.size() < 1) {
-        throw std::runtime_error(
-          "Kompute OpAlgoBase called with less than 1 tensor");
-    } 
-
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        if(!tensor->isInit()) {
-            throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised.");
-        }
-    }
-
-    SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        tensor->recordBufferMemoryBarrier(
-          this->mCommandBuffer,
-          vk::AccessFlagBits::eHostWrite,
-          vk::AccessFlagBits::eShaderRead,
-          vk::PipelineStageFlagBits::eHost,
-          vk::PipelineStageFlagBits::eComputeShader);
-    }
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::preEval()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::postEval()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-std::vector<char> OpAlgoBase<tX, tY, tZ>::fetchSpirvBinaryData() 
-{
-    SPDLOG_WARN(
-      "Kompute OpAlgoBase Running shaders directly from spirv file");
-
-    if (this->mShaderFilePath.size()) {
-        std::ifstream fileStream(this->mShaderFilePath,
-                                 std::ios::binary | std::ios::in | std::ios::ate);
-
-        if (!fileStream.good()) {
-            throw std::runtime_error("Error reading file: " + this->mShaderFilePath);
-        }
-
-        size_t shaderFileSize = fileStream.tellg();
-        fileStream.seekg(0, std::ios::beg);
-        char* shaderDataRaw = new char[shaderFileSize];
-        fileStream.read(shaderDataRaw, shaderFileSize);
-        fileStream.close();
-
-        SPDLOG_WARN(
-          "Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
-
-        return std::vector<char>(shaderDataRaw,
-                                 shaderDataRaw + shaderFileSize);
-    }
-    else if (this->mShaderDataRaw.size()) {
-        return this->mShaderDataRaw;
-    }
-    else {
-        throw std::runtime_error("Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither filepath nor data provided");
-    }
-}
-
-}
-
-#endif // #ifndef OPALGOBASE_IMPL
-
 #include <fstream>
 
 namespace kp {
@@ -1924,12 +1755,8 @@ namespace kp {
  * Operation base class to simplify the creation of operations that require
  * right hand and left hand side datapoints together with a single output.
  * The expected data passed is two input tensors and one output tensor.
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
  */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
+class OpAlgoLhsRhsOut : public OpAlgoBase
 {
   public:
     /**
@@ -1947,11 +1774,13 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
      * @param freeTensors Whether operation manages the memory of the Tensors
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors);
+           std::vector<std::shared_ptr<Tensor>> tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Default destructor, which is in charge of destroying the algorithm
@@ -1982,7 +1811,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
      * of the GPU Device memory into the staging buffer so the output data can
      * be retrieved.
      */
-    virtual void postSubmit() override;
+    virtual void postEval() override;
 
   protected:
     // -------------- NEVER OWNED RESOURCES
@@ -1996,138 +1825,6 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
 
 } // End namespace kp
 
-// Including implementation for template class
-#ifndef OPALGOLHSRHSOUT_CPP
-#define OPALGOLHSRHSOUT_CPP
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>> tensors)
-  // The inheritance is initialised with the copyOutputData to false given that
-  // this depencendant class handles the transfer of data via staging buffers in 
-  // a granular way.
-  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::~OpAlgoLhsRhsOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut destructor started");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut init called");
-
-    if (this->mTensors.size() < 3) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut called with less than 1 tensor");
-    } else if (this->mTensors.size() > 3) {
-        SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
-    }
-
-    this->mTensorLHS = this->mTensors[0];
-    this->mTensorRHS = this->mTensors[1];
-    this->mTensorOutput = this->mTensors[2];
-
-    if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
-          this->mTensorOutput->isInit())) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. LHS: " +
-          std::to_string(this->mTensorLHS->isInit()) +
-          " RHS: " + std::to_string(this->mTensorRHS->isInit()) +
-          " Output: " + std::to_string(this->mTensorOutput->isInit()));
-    }
-
-    if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
-          this->mTensorRHS->size() == this->mTensorOutput->size())) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size LHS: " +
-          std::to_string(this->mTensorLHS->size()) +
-          " RHS: " + std::to_string(this->mTensorRHS->size()) +
-          " Output: " + std::to_string(this->mTensorOutput->size()));
-    }
-
-    this->mTensorOutputStaging = std::make_shared<Tensor>(
-      this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
-
-    this->mTensorOutputStaging->init(
-      this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
-
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    this->mTensorLHS->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eHostWrite,
-      vk::AccessFlagBits::eShaderRead,
-      vk::PipelineStageFlagBits::eHost,
-      vk::PipelineStageFlagBits::eComputeShader);
-    this->mTensorRHS->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eHostWrite,
-      vk::AccessFlagBits::eShaderRead,
-      vk::PipelineStageFlagBits::eHost,
-      vk::PipelineStageFlagBits::eComputeShader);
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-
-    // Barrier to ensure the shader code is executed before buffer read
-    this->mTensorOutput->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eShaderWrite,
-      vk::AccessFlagBits::eTransferRead,
-      vk::PipelineStageFlagBits::eComputeShader,
-      vk::PipelineStageFlagBits::eTransfer);
-
-    this->mTensorOutputStaging->recordCopyFrom(
-            this->mCommandBuffer,
-            this->mTensorOutput,
-            true);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::postSubmit()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");
-
-    this->mTensorOutputStaging->mapDataFromHostMemory();
-
-    this->mTensorOutput->setData(this->mTensorOutputStaging->data());
-}
-
-}
-
-#endif // #ifndef OPALGOLHSRHSOUT_CPP
-
 #include <fstream>
 
 #if RELEASE
@@ -2138,12 +1835,9 @@ namespace kp {
 
 /**
  * Operation that performs multiplication on two tensors and outpus on third
- * tensor. The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * tensor.
  */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpMult : public OpAlgoBase<tX, tY, tZ>
+class OpMult : public OpAlgoBase
 {
   public:
     /**
@@ -2162,13 +1856,14 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpMult(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors)
-      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
+           std::vector<std::shared_ptr<Tensor>> tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup())
+      : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "", komputeWorkgroup)
     {
         SPDLOG_DEBUG("Kompute OpMult constructor with params");
 
@@ -2179,14 +1874,8 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
 
 #if RELEASE
     /**
-     * If release it will be using the static version of the shader which is 
-     * loaded using this file directly.
-     *
-     * @param physicalDevice Vulkan physical device used to find device queues
-     * @param device Vulkan logical device for passing to Algorithm
-     * @param commandBuffer Vulkan Command Buffer to record commands into
-     * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
+     * If RELEASE=1 it will be using the static version of the shader which is 
+     * loaded using this file directly. Otherwise it should not override the function.
      */
     std::vector<char> fetchSpirvBinaryData() override
     {
diff --git a/src/Algorithm.cpp b/src/Algorithm.cpp
index 70092a3d6..eb0be22a8 100644
--- a/src/Algorithm.cpp
+++ b/src/Algorithm.cpp
@@ -34,7 +34,9 @@ Algorithm::~Algorithm()
             SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
                          "pipeline but it is null");
         }
-        this->mDevice->destroy(*this->mPipeline, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mPipeline,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     }
 
     if (this->mFreePipelineCache) {
@@ -43,7 +45,9 @@ Algorithm::~Algorithm()
             SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
                          "pipeline cache but it is null");
         }
-        this->mDevice->destroy(*this->mPipelineCache, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mPipelineCache,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     }
 
     if (this->mFreePipelineLayout) {
@@ -52,7 +56,9 @@ Algorithm::~Algorithm()
             SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
                          "pipeline layout but it is null");
         }
-        this->mDevice->destroy(*this->mPipelineLayout, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mPipelineLayout,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     }
 
     if (this->mFreeShaderModule) {
@@ -61,7 +67,9 @@ Algorithm::~Algorithm()
             SPDLOG_ERROR("Kompute Algorithm Error requested to destroy shader "
                          "module but it is null");
         }
-        this->mDevice->destroy(*this->mShaderModule, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mShaderModule,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     }
 
     if (this->mFreeDescriptorSet) {
@@ -80,7 +88,9 @@ Algorithm::~Algorithm()
             SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
                          "descriptor set layout but it is null");
         }
-        this->mDevice->destroy(*this->mDescriptorSetLayout, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mDescriptorSetLayout,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     }
 
     if (this->mFreeDescriptorPool) {
@@ -89,7 +99,9 @@ Algorithm::~Algorithm()
             SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
                          "descriptor pool but it is null");
         }
-        this->mDevice->destroy(*this->mDescriptorPool, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mDescriptorPool,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     }
 }
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6161b782b..348c0536a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -48,7 +48,8 @@ if(KOMPUTE_OPT_ANDOID_BUILD)
         ${PROJECT_SOURCE_DIR}/vk_ndk_wrapper_include/kompute_vk_ndk_wrapper.cpp)
 endif()
 
-add_library(kompute
+add_library(
+    kompute STATIC
     ${kompute_CPP})
 
 target_include_directories(
diff --git a/src/Manager.cpp b/src/Manager.cpp
old mode 100644
new mode 100755
index ec86b18ed..df9d64db6
--- a/src/Manager.cpp
+++ b/src/Manager.cpp
@@ -59,13 +59,19 @@ Manager::~Manager()
     }
 
     if (this->mManagedSequences.size()) {
-        SPDLOG_DEBUG("Releasing managed sequence");
+        SPDLOG_DEBUG("Kompute Manager explicitly running destructor for "
+                     "managed sequences");
+        for (const std::pair<std::string, std::shared_ptr<Sequence>>& sqPair :
+             this->mManagedSequences) {
+            sqPair.second->freeMemoryDestroyGPUResources();
+        }
         this->mManagedSequences.clear();
     }
 
     if (this->mFreeDevice) {
         SPDLOG_INFO("Destroying device");
-        this->mDevice->destroy((vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
         SPDLOG_DEBUG("Kompute Manager Destroyed Device");
     }
 
@@ -86,12 +92,13 @@ Manager::~Manager()
 #endif
 
     if (this->mFreeInstance) {
-        this->mInstance->destroy((vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mInstance->destroy(
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
         SPDLOG_DEBUG("Kompute Manager Destroyed Instance");
     }
 }
 
-std::weak_ptr<Sequence>
+std::shared_ptr<Sequence>
 Manager::getOrCreateManagedSequence(std::string sequenceName)
 {
     SPDLOG_DEBUG("Kompute Manager creating Sequence object");
@@ -106,7 +113,7 @@ Manager::getOrCreateManagedSequence(std::string sequenceName)
     }
 }
 
-std::weak_ptr<Sequence>
+std::shared_ptr<Sequence>
 Manager::createManagedSequence(std::string sequenceName, uint32_t queueIndex)
 {
 
diff --git a/src/OpAlgoBase.cpp b/src/OpAlgoBase.cpp
new file mode 100644
index 000000000..68e22de3b
--- /dev/null
+++ b/src/OpAlgoBase.cpp
@@ -0,0 +1,170 @@
+#pragma once
+
+#include "kompute/operations/OpAlgoBase.hpp"
+
+namespace kp {
+
+OpAlgoBase::OpAlgoBase()
+{ // Base constructor: the body only emits a debug log entry.
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor base");
+}
+
+OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                       std::shared_ptr<vk::Device> device,
+                       std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                       std::vector<std::shared_ptr<Tensor>>& tensors,
+                       KomputeWorkgroup komputeWorkgroup)
+  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}",
+                 tensors.size());
+
+    // The dispatch size comes from the komputeWorkgroup argument when its x
+    // value is set; otherwise it defaults to the size of the first tensor
+    if (komputeWorkgroup.x > 0) {
+        // Keep the caller's x as provided; y and z fall back to 1 when left
+        // at 0 so that no dimension is dispatched with a size of 0
+        this->mKomputeWorkgroup = {
+            komputeWorkgroup.x,
+            komputeWorkgroup.y > 0 ? komputeWorkgroup.y : 1,
+            komputeWorkgroup.z > 0 ? komputeWorkgroup.z : 1
+        };
+    } else {
+        this->mKomputeWorkgroup = { tensors[0]->size(), 1, 1 };
+    }
+    SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
+                this->mKomputeWorkgroup.x,
+                this->mKomputeWorkgroup.y,
+                this->mKomputeWorkgroup.z);
+
+    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
+}
+
+OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                       std::shared_ptr<vk::Device> device,
+                       std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                       std::vector<std::shared_ptr<Tensor>>& tensors,
+                       std::string shaderFilePath,
+                       KomputeWorkgroup komputeWorkgroup)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
+{
+    SPDLOG_DEBUG(
+      "Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}",
+      shaderFilePath);
+    // Stored for fetchSpirvBinaryData(), which reads this file when non-empty
+    this->mShaderFilePath = shaderFilePath;
+}
+
+OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                       std::shared_ptr<vk::Device> device,
+                       std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                       std::vector<std::shared_ptr<Tensor>>& tensors,
+                       const std::vector<char>& shaderDataRaw,
+                       KomputeWorkgroup komputeWorkgroup)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase shaderDataRaw constructor with shader raw "
+                 "data length: {}",
+                 shaderDataRaw.size());
+    // Copied so fetchSpirvBinaryData() can return it when no file path is set
+    this->mShaderDataRaw = shaderDataRaw;
+}
+
+OpAlgoBase::~OpAlgoBase()
+{ // The body only logs; no explicit cleanup is performed here.
+    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
+}
+
+void
+OpAlgoBase::init()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase init called");
+
+    // Reject an empty tensor list before anything else is attempted
+    if (this->mTensors.empty()) {
+        throw std::runtime_error(
+          "Kompute OpAlgoBase called with less than 1 tensor");
+    }
+    // Every tensor has to report isInit() before the algorithm is set up
+    for (const std::shared_ptr<Tensor>& current : this->mTensors) {
+        if (current->isInit()) {
+            continue;
+        }
+        throw std::runtime_error(
+          "Kompute OpAlgoBase validation failed; all tensor parameters "
+          "must be initialised.");
+    }
+
+    SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
+    std::vector<char> spirvData = this->fetchSpirvBinaryData();
+
+    SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
+    this->mAlgorithm->init(spirvData, this->mTensors);
+}
+
+void
+OpAlgoBase::record()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase record called");
+
+    // Record a host-write -> shader-read barrier on every tensor so the
+    // data is finished writing to buffer memory before the dispatch
+    for (const std::shared_ptr<Tensor>& current : this->mTensors) {
+        current->recordBufferMemoryBarrier(
+          this->mCommandBuffer,
+          vk::AccessFlagBits::eHostWrite,
+          vk::AccessFlagBits::eShaderRead,
+          vk::PipelineStageFlagBits::eHost,
+          vk::PipelineStageFlagBits::eComputeShader);
+    }
+
+    const KomputeWorkgroup& dispatchSize = this->mKomputeWorkgroup;
+    this->mAlgorithm->recordDispatch(
+      dispatchSize.x, dispatchSize.y, dispatchSize.z);
+}
+
+void
+OpAlgoBase::preEval()
+{ // No work is done here beyond the debug log.
+    SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
+}
+
+void
+OpAlgoBase::postEval()
+{ // No work is done here beyond the debug log.
+    SPDLOG_DEBUG("Kompute OpAlgoBase postEval called");
+}
+
+std::vector<char>
+OpAlgoBase::fetchSpirvBinaryData()
+{
+    SPDLOG_WARN("Kompute OpAlgoBase Running shaders directly from spirv file");
+    // A shader file path, when set, takes precedence over raw shader data
+    if (this->mShaderFilePath.size()) {
+        std::ifstream fileStream(this->mShaderFilePath,
+                                 std::ios::binary | std::ios::in |
+                                   std::ios::ate);
+
+        if (!fileStream.good()) {
+            throw std::runtime_error("Error reading file: " +
+                                     this->mShaderFilePath);
+        }
+        // Opened with std::ios::ate, so tellg() reports the file size
+        size_t shaderFileSize = fileStream.tellg();
+        fileStream.seekg(0, std::ios::beg);
+        // Read into a vector so the buffer is owned and freed automatically
+        std::vector<char> shaderDataRaw(shaderFileSize);
+        fileStream.read(shaderDataRaw.data(), shaderFileSize);
+        fileStream.close();
+
+        SPDLOG_WARN("Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
+        return shaderDataRaw;
+    } else if (this->mShaderDataRaw.size()) {
+        return this->mShaderDataRaw;
+    } else {
+        throw std::runtime_error(
+          "Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither "
+          "filepath nor data provided");
+    }
+}
+
+}
diff --git a/src/OpAlgoLhsRhsOut.cpp b/src/OpAlgoLhsRhsOut.cpp
new file mode 100644
index 000000000..ab759fed8
--- /dev/null
+++ b/src/OpAlgoLhsRhsOut.cpp
@@ -0,0 +1,127 @@
+// Implementation file for kp::OpAlgoLhsRhsOut (no include guard needed here).
+
+#include "kompute/operations/OpAlgoLhsRhsOut.hpp"
+
+namespace kp {
+
+OpAlgoLhsRhsOut::OpAlgoLhsRhsOut()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
+}
+
+OpAlgoLhsRhsOut::OpAlgoLhsRhsOut(
+  std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+  std::shared_ptr<vk::Device> device,
+  std::shared_ptr<vk::CommandBuffer> commandBuffer,
+  std::vector<std::shared_ptr<Tensor>> tensors,
+  KomputeWorkgroup komputeWorkgroup)
+  // All arguments are forwarded unchanged to the OpAlgoBase constructor; the
+  // staging tensor this dependent class uses to read the output back is not
+  // created here but later, in init().
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
+}
+
+OpAlgoLhsRhsOut::~OpAlgoLhsRhsOut()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut destructor started");
+}
+
+void
+OpAlgoLhsRhsOut::init()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut init called");
+
+    if (this->mTensors.size() < 3) {
+        throw std::runtime_error(
+          "Kompute OpAlgoLhsRhsOut called with less than 3 tensors");
+    } else if (this->mTensors.size() > 3) {
+        SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 tensors");
+    }
+
+    // Positional convention: tensors[0] = LHS, [1] = RHS, [2] = output
+    this->mTensorLHS = this->mTensors[0];
+    this->mTensorRHS = this->mTensors[1];
+    this->mTensorOutput = this->mTensors[2];
+
+    if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
+          this->mTensorOutput->isInit())) {
+        throw std::runtime_error(
+          "Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. "
+          "LHS: " +
+          std::to_string(this->mTensorLHS->isInit()) +
+          " RHS: " + std::to_string(this->mTensorRHS->isInit()) +
+          " Output: " + std::to_string(this->mTensorOutput->isInit()));
+    }
+
+    if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
+          this->mTensorRHS->size() == this->mTensorOutput->size())) {
+        throw std::runtime_error(
+          "Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size "
+          "LHS: " +
+          std::to_string(this->mTensorLHS->size()) +
+          " RHS: " + std::to_string(this->mTensorRHS->size()) +
+          " Output: " + std::to_string(this->mTensorOutput->size()));
+    }
+
+    this->mTensorOutputStaging = std::make_shared<Tensor>(
+      this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
+
+    this->mTensorOutputStaging->init(this->mPhysicalDevice, this->mDevice);
+
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
+
+    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
+
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut Initialising algorithm component");
+
+    this->mAlgorithm->init(shaderFileData, this->mTensors);
+}
+
+void
+OpAlgoLhsRhsOut::record()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut record called");
+
+    // Barriers: host writes to LHS/RHS must be visible to compute shader reads
+    this->mTensorLHS->recordBufferMemoryBarrier(
+      this->mCommandBuffer,
+      vk::AccessFlagBits::eHostWrite,
+      vk::AccessFlagBits::eShaderRead,
+      vk::PipelineStageFlagBits::eHost,
+      vk::PipelineStageFlagBits::eComputeShader);
+    this->mTensorRHS->recordBufferMemoryBarrier(
+      this->mCommandBuffer,
+      vk::AccessFlagBits::eHostWrite,
+      vk::AccessFlagBits::eShaderRead,
+      vk::PipelineStageFlagBits::eHost,
+      vk::PipelineStageFlagBits::eComputeShader);
+
+    this->mAlgorithm->recordDispatch(this->mKomputeWorkgroup.x,
+                                     this->mKomputeWorkgroup.y,
+                                     this->mKomputeWorkgroup.z);
+
+    // Barrier: shader writes to the output must be visible to the transfer read
+    this->mTensorOutput->recordBufferMemoryBarrier(
+      this->mCommandBuffer,
+      vk::AccessFlagBits::eShaderWrite,
+      vk::AccessFlagBits::eTransferRead,
+      vk::PipelineStageFlagBits::eComputeShader,
+      vk::PipelineStageFlagBits::eTransfer);
+
+    this->mTensorOutputStaging->recordCopyFrom(
+      this->mCommandBuffer, this->mTensorOutput, true);
+}
+
+void
+OpAlgoLhsRhsOut::postEval()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postEval called");
+
+    this->mTensorOutputStaging->mapDataFromHostMemory();
+
+    this->mTensorOutput->setData(this->mTensorOutputStaging->data());
+}
+
+}
diff --git a/src/Sequence.cpp b/src/Sequence.cpp
index c4446ff37..4f01891c4 100644
--- a/src/Sequence.cpp
+++ b/src/Sequence.cpp
@@ -27,33 +27,13 @@ Sequence::~Sequence()
 {
     SPDLOG_DEBUG("Kompute Sequence Destructor started");
 
-    if (!this->mDevice) {
-        SPDLOG_ERROR(
-          "Kompute Sequence destructor reached with null Device pointer");
+    if (!this->mIsInit) {
+        SPDLOG_INFO("Kompute Sequence destructor called but sequence is not "
+                    "initialized so no need to removing GPU resources.");
         return;
     }
-
-    if (this->mFreeCommandBuffer) {
-        SPDLOG_INFO("Freeing CommandBuffer");
-        if (!this->mCommandBuffer) {
-            SPDLOG_ERROR("Kompute Sequence destructor reached with null "
-                         "CommandPool pointer");
-            return;
-        }
-        this->mDevice->freeCommandBuffers(
-          *this->mCommandPool, 1, this->mCommandBuffer.get());
-        SPDLOG_DEBUG("Kompute Sequence Freed CommandBuffer");
-    }
-
-    if (this->mFreeCommandPool) {
-        SPDLOG_INFO("Destroying CommandPool");
-        if (this->mCommandPool == nullptr) {
-            SPDLOG_ERROR("Kompute Sequence destructor reached with null "
-                         "CommandPool pointer");
-            return;
-        }
-        this->mDevice->destroy(*this->mCommandPool, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-        SPDLOG_DEBUG("Kompute Sequence Destroyed CommandPool");
+    else {
+        this->freeMemoryDestroyGPUResources();
     }
 }
 
@@ -186,7 +166,8 @@ Sequence::evalAwait(uint64_t waitFor)
 
     vk::Result result =
       this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor);
-    this->mDevice->destroy(this->mFence, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+    this->mDevice->destroy(
+      this->mFence, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
 
     this->mIsRunning = false;
 
@@ -220,6 +201,53 @@ Sequence::isInit()
     return this->mIsInit;
 }
 
+void
+Sequence::freeMemoryDestroyGPUResources()
+{
+    if (!this->mIsInit) {
+        SPDLOG_ERROR("Kompute Sequence freeMemoryDestroyGPUResources called "
+            "but Sequence is not initialized so there's no relevant GPU resources.");
+        return;
+    }
+
+    if (!this->mDevice) {
+        SPDLOG_ERROR(
+          "Kompute Sequence freeMemoryDestroyGPUResources called with null Device pointer");
+        this->mIsInit = false;
+        return;
+    }
+
+    if (this->mFreeCommandBuffer) {
+        SPDLOG_INFO("Freeing CommandBuffer");
+        // freeCommandBuffers below dereferences both the pool and the buffer
+        if (!this->mCommandBuffer || !this->mCommandPool) {
+            SPDLOG_ERROR("Kompute Sequence freeMemoryDestroyGPUResources called with null "
+                         "CommandBuffer or CommandPool pointer");
+            this->mIsInit = false;
+            return;
+        }
+        this->mDevice->freeCommandBuffers(
+          *this->mCommandPool, 1, this->mCommandBuffer.get());
+        SPDLOG_DEBUG("Kompute Sequence Freed CommandBuffer");
+    }
+
+    if (this->mFreeCommandPool) {
+        SPDLOG_INFO("Destroying CommandPool");
+        if (this->mCommandPool == nullptr) {
+            SPDLOG_ERROR("Kompute Sequence freeMemoryDestroyGPUResources called with null "
+                         "CommandPool pointer");
+            this->mIsInit = false;
+            return;
+        }
+        this->mDevice->destroy(
+          *this->mCommandPool,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        SPDLOG_DEBUG("Kompute Sequence Destroyed CommandPool");
+    }
+
+    this->mIsInit = false;
+}
+
 void
 Sequence::createCommandPool()
 {
diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index 299622ee4..214ac2eb0 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -12,8 +12,9 @@ Tensor::Tensor()
 Tensor::Tensor(const std::vector<float>& data, TensorTypes tensorType)
 {
 #if DEBUG
-    SPDLOG_DEBUG(
-      "Kompute Tensor constructor data length: {}, and type: {}", data.size(), tensorType);
+    SPDLOG_DEBUG("Kompute Tensor constructor data length: {}, and type: {}",
+                 data.size(),
+                 tensorType);
 #endif
 
     this->mData = data;
@@ -350,7 +351,9 @@ Tensor::freeMemoryDestroyGPUResources()
               "Kompose Tensor expected to free buffer but got null buffer");
         } else {
             SPDLOG_DEBUG("Kompose Tensor destroying buffer");
-            this->mDevice->destroy(*this->mBuffer, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+            this->mDevice->destroy(
+              *this->mBuffer,
+              (vk::Optional<const vk::AllocationCallbacks>)nullptr);
             this->mBuffer = nullptr;
         }
     }
@@ -361,7 +364,9 @@ Tensor::freeMemoryDestroyGPUResources()
               "Kompose Tensor expected to free buffer but got null memory");
         } else {
             SPDLOG_DEBUG("Kompose Tensor freeing memory");
-            this->mDevice->freeMemory(*this->mMemory, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+            this->mDevice->freeMemory(
+              *this->mMemory,
+              (vk::Optional<const vk::AllocationCallbacks>)nullptr);
             this->mDevice = nullptr;
         }
     }
diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp
index 32c04535b..8c689ba57 100644
--- a/src/include/kompute/Manager.hpp
+++ b/src/include/kompute/Manager.hpp
@@ -63,9 +63,9 @@ class Manager
      *
      * @param sequenceName The name for the named sequence to be retrieved or
      * created
-     * @return Weak pointer to the manager owned sequence resource
+     * @return Shared pointer to the manager owned sequence resource
      */
-    std::weak_ptr<Sequence> getOrCreateManagedSequence(
+    std::shared_ptr<Sequence> getOrCreateManagedSequence(
       std::string sequenceName);
 
     /**
@@ -77,8 +77,9 @@ class Manager
      * @param queueIndex The queue to use from the available queues
      * @return Weak pointer to the manager owned sequence resource
      */
-    std::weak_ptr<Sequence> createManagedSequence(std::string sequenceName = "",
-                                                  uint32_t queueIndex = 0);
+    std::shared_ptr<Sequence> createManagedSequence(
+      std::string sequenceName = "",
+      uint32_t queueIndex = 0);
 
     /**
      * Function that evaluates operation against named sequence.
@@ -94,22 +95,21 @@ class Manager
                 TArgs&&... params)
     {
         SPDLOG_DEBUG("Kompute Manager evalOp triggered");
-        std::weak_ptr<Sequence> sqWeakPtr =
+        std::shared_ptr<kp::Sequence> sq =
           this->getOrCreateManagedSequence(sequenceName);
 
-        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
-            sq->begin();
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
+        sq->begin();
 
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
-            sq->record<T>(tensors, std::forward<TArgs>(params)...);
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
+        sq->record<T>(tensors, std::forward<TArgs>(params)...);
 
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
-            sq->end();
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
+        sq->end();
+
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
+        sq->eval();
 
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
-            sq->eval();
-        }
         SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS");
     }
 
@@ -147,26 +147,21 @@ class Manager
     {
         SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered");
 
-        std::weak_ptr<Sequence> sqWeakPtr =
+        std::shared_ptr<kp::Sequence> sq =
           this->getOrCreateManagedSequence(sequenceName);
 
-        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
+        sq->begin();
 
-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
-            sq->begin();
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
+        sq->record<T>(tensors, std::forward<TArgs>(params)...);
 
-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
-            sq->record<T>(tensors, std::forward<TArgs>(params)...);
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
+        sq->end();
 
-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
-            sq->end();
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
+        sq->evalAsync();
 
-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
-            sq->evalAsync();
-        } else {
-            SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found",
-                         sequenceName);
-        }
         SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
     }
 
diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp
index 314de6657..5d483c27a 100644
--- a/src/include/kompute/Sequence.hpp
+++ b/src/include/kompute/Sequence.hpp
@@ -106,6 +106,12 @@ class Sequence
      */
     bool isInit();
 
+    /**
+     * Frees the command buffer and destroys the command pool when the sequence
+     * is flagged to free them, and marks the sequence as not initialised.
+     */
+    void freeMemoryDestroyGPUResources();
+
     /**
      * Record function for operation to be added to the GPU queue in batch. This
      * template requires classes to be derived from the OpBase class. This
diff --git a/src/include/kompute/operations/OpAlgoBase.hpp b/src/include/kompute/operations/OpAlgoBase.hpp
index 653006952..74108d285 100644
--- a/src/include/kompute/operations/OpAlgoBase.hpp
+++ b/src/include/kompute/operations/OpAlgoBase.hpp
@@ -17,20 +17,17 @@ namespace kp {
  * Operation that provides a general abstraction that simplifies the use of 
  * algorithm and parameter components which can be used with shaders.
  * By default it enables the user to provide a dynamic number of tensors
- * which are then passed as inputs. 
- *
- * All of these tensors are expected to be initlaised and this is checked with throw std exception in the init function.
- *
- * See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
- * 
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * which are then passed as inputs.
  */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
 class OpAlgoBase : public OpBase
 {
   public:
+    struct KomputeWorkgroup {
+        uint32_t x = 0; ///< x value forwarded to mAlgorithm->recordDispatch
+        uint32_t y = 0; ///< y value forwarded to mAlgorithm->recordDispatch
+        uint32_t z = 0; ///< z value forwarded to mAlgorithm->recordDispatch
+    };
+
     /**
      *  Base constructor, should not be used unless explicitly intended.
      */
@@ -46,11 +43,13 @@ class OpAlgoBase : public OpBase
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
      * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors);
+           std::vector<std::shared_ptr<Tensor>>& tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Constructor that enables a file to be passed to the operation with
@@ -61,13 +60,15 @@ class OpAlgoBase : public OpBase
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
-     * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+     * @param shaderFilePath Parameter to specify the shader to load (either in spirv or raw format)
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>>& tensors,
-           std::string shaderFilePath);
+           std::string shaderFilePath,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Constructor that enables raw shader data to be passed to the main operation
@@ -78,12 +79,14 @@ class OpAlgoBase : public OpBase
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
      * @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>>& tensors,
-           const std::vector<char>& shaderDataRaw);
+           const std::vector<char>& shaderDataRaw,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Default destructor, which is in charge of destroying the algorithm
@@ -131,9 +134,7 @@ class OpAlgoBase : public OpBase
 
     // -------------- ALWAYS OWNED RESOURCES
 
-    uint32_t mX;
-    uint32_t mY;
-    uint32_t mZ;
+    KomputeWorkgroup mKomputeWorkgroup;
 
     std::string mShaderFilePath; ///< Optional member variable which can be provided for the OpAlgoBase to find the data automatically and load for processing
     std::vector<char> mShaderDataRaw; ///< Optional member variable which can be provided to contain either the raw shader content or the spirv binary content
@@ -143,174 +144,3 @@ class OpAlgoBase : public OpBase
 
 } // End namespace kp
 
-// Including implementation for template class
-#ifndef OPALGOBASE_IMPL
-#define OPALGOBASE_IMPL
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors)
-  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
-
-    // The dispatch size is set up based on either explicitly provided template
-    // parameters or by default it would take the shape and size of the tensors
-    if (tX > 0) {
-        // If at least the x value is provided we use mainly the parameters
-        // provided
-        this->mX = tX;
-        this->mY = tY > 0 ? tY : 1;
-        this->mZ = tZ > 0 ? tZ : 1;
-    } else {
-        this->mX = tensors[0]->size();
-        this->mY = 1;
-        this->mZ = 1;
-    }
-    SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
-                 this->mX,
-                 this->mY,
-                 this->mZ);
-
-    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           std::string shaderFilePath)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
-
-    this->mShaderFilePath = shaderFilePath;
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           const std::vector<char>& shaderDataRaw)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
-
-    this->mShaderDataRaw = shaderDataRaw;
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase init called");
-
-    if (this->mTensors.size() < 1) {
-        throw std::runtime_error(
-          "Kompute OpAlgoBase called with less than 1 tensor");
-    } 
-
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        if(!tensor->isInit()) {
-            throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised.");
-        }
-    }
-
-    SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        tensor->recordBufferMemoryBarrier(
-          this->mCommandBuffer,
-          vk::AccessFlagBits::eHostWrite,
-          vk::AccessFlagBits::eShaderRead,
-          vk::PipelineStageFlagBits::eHost,
-          vk::PipelineStageFlagBits::eComputeShader);
-    }
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::preEval()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::postEval()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-std::vector<char> OpAlgoBase<tX, tY, tZ>::fetchSpirvBinaryData() 
-{
-    SPDLOG_WARN(
-      "Kompute OpAlgoBase Running shaders directly from spirv file");
-
-    if (this->mShaderFilePath.size()) {
-        std::ifstream fileStream(this->mShaderFilePath,
-                                 std::ios::binary | std::ios::in | std::ios::ate);
-
-        if (!fileStream.good()) {
-            throw std::runtime_error("Error reading file: " + this->mShaderFilePath);
-        }
-
-        size_t shaderFileSize = fileStream.tellg();
-        fileStream.seekg(0, std::ios::beg);
-        char* shaderDataRaw = new char[shaderFileSize];
-        fileStream.read(shaderDataRaw, shaderFileSize);
-        fileStream.close();
-
-        SPDLOG_WARN(
-          "Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
-
-        return std::vector<char>(shaderDataRaw,
-                                 shaderDataRaw + shaderFileSize);
-    }
-    else if (this->mShaderDataRaw.size()) {
-        return this->mShaderDataRaw;
-    }
-    else {
-        throw std::runtime_error("Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither filepath nor data provided");
-    }
-}
-
-}
-
-#endif // #ifndef OPALGOBASE_IMPL
-
diff --git a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
index 5c22bdcc6..c826bd324 100644
--- a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
+++ b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
@@ -15,12 +15,8 @@ namespace kp {
  * Operation base class to simplify the creation of operations that require
  * right hand and left hand side datapoints together with a single output.
  * The expected data passed is two input tensors and one output tensor.
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
  */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
+class OpAlgoLhsRhsOut : public OpAlgoBase
 {
   public:
     /**
@@ -38,11 +34,13 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
      * @param freeTensors Whether operation manages the memory of the Tensors
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors);
+           std::vector<std::shared_ptr<Tensor>> tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Default destructor, which is in charge of destroying the algorithm
@@ -73,7 +71,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
      * of the GPU Device memory into the staging buffer so the output data can
      * be retrieved.
      */
-    virtual void postSubmit() override;
+    virtual void postEval() override;
 
   protected:
     // -------------- NEVER OWNED RESOURCES
@@ -87,136 +85,3 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
 
 } // End namespace kp
 
-// Including implementation for template class
-#ifndef OPALGOLHSRHSOUT_CPP
-#define OPALGOLHSRHSOUT_CPP
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>> tensors)
-  // The inheritance is initialised with the copyOutputData to false given that
-  // this depencendant class handles the transfer of data via staging buffers in 
-  // a granular way.
-  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::~OpAlgoLhsRhsOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut destructor started");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut init called");
-
-    if (this->mTensors.size() < 3) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut called with less than 1 tensor");
-    } else if (this->mTensors.size() > 3) {
-        SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
-    }
-
-    this->mTensorLHS = this->mTensors[0];
-    this->mTensorRHS = this->mTensors[1];
-    this->mTensorOutput = this->mTensors[2];
-
-
-    if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
-          this->mTensorOutput->isInit())) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. LHS: " +
-          std::to_string(this->mTensorLHS->isInit()) +
-          " RHS: " + std::to_string(this->mTensorRHS->isInit()) +
-          " Output: " + std::to_string(this->mTensorOutput->isInit()));
-    }
-
-    if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
-          this->mTensorRHS->size() == this->mTensorOutput->size())) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size LHS: " +
-          std::to_string(this->mTensorLHS->size()) +
-          " RHS: " + std::to_string(this->mTensorRHS->size()) +
-          " Output: " + std::to_string(this->mTensorOutput->size()));
-    }
-
-    this->mTensorOutputStaging = std::make_shared<Tensor>(
-      this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
-
-    this->mTensorOutputStaging->init(
-      this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
-
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    this->mTensorLHS->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eHostWrite,
-      vk::AccessFlagBits::eShaderRead,
-      vk::PipelineStageFlagBits::eHost,
-      vk::PipelineStageFlagBits::eComputeShader);
-    this->mTensorRHS->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eHostWrite,
-      vk::AccessFlagBits::eShaderRead,
-      vk::PipelineStageFlagBits::eHost,
-      vk::PipelineStageFlagBits::eComputeShader);
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-
-    // Barrier to ensure the shader code is executed before buffer read
-    this->mTensorOutput->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eShaderWrite,
-      vk::AccessFlagBits::eTransferRead,
-      vk::PipelineStageFlagBits::eComputeShader,
-      vk::PipelineStageFlagBits::eTransfer);
-
-    this->mTensorOutputStaging->recordCopyFrom(
-            this->mCommandBuffer,
-            this->mTensorOutput,
-            true);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::postSubmit()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");
-
-    this->mTensorOutputStaging->mapDataFromHostMemory();
-
-    this->mTensorOutput->setData(this->mTensorOutputStaging->data());
-}
-
-}
-
-#endif // #ifndef OPALGOLHSRHSOUT_CPP
-
diff --git a/src/include/kompute/operations/OpMult.hpp b/src/include/kompute/operations/OpMult.hpp
index ba3cb21a0..f555f8ac1 100644
--- a/src/include/kompute/operations/OpMult.hpp
+++ b/src/include/kompute/operations/OpMult.hpp
@@ -17,12 +17,9 @@ namespace kp {
 
 /**
  * Operation that performs multiplication on two tensors and outpus on third
- * tensor. The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * tensor.
  */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpMult : public OpAlgoBase<tX, tY, tZ>
+class OpMult : public OpAlgoBase
 {
   public:
     /**
@@ -41,13 +38,14 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpMult(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors)
-      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
+           std::vector<std::shared_ptr<Tensor>> tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup())
+      : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "", komputeWorkgroup)
     {
         SPDLOG_DEBUG("Kompute OpMult constructor with params");
 
@@ -58,14 +56,8 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
 
 #if RELEASE
     /**
-     * If release it will be using the static version of the shader which is 
-     * loaded using this file directly.
-     *
-     * @param physicalDevice Vulkan physical device used to find device queues
-     * @param device Vulkan logical device for passing to Algorithm
-     * @param commandBuffer Vulkan Command Buffer to record commands into
-     * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
+     * If RELEASE=1, this override uses the static version of the shader, which
+     * is loaded directly through this file. Otherwise the function is not overridden.
      */
     std::vector<char> fetchSpirvBinaryData() override
     {
diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp
index 11bdee98f..43bccf99b 100644
--- a/test/TestAsyncOperations.cpp
+++ b/test/TestAsyncOperations.cpp
@@ -54,7 +54,7 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
     auto startSync = std::chrono::high_resolution_clock::now();
 
     for (uint32_t i = 0; i < numParallel; i++) {
-        mgr.evalOpDefault<kp::OpAlgoBase<>>(
+        mgr.evalOpDefault<kp::OpAlgoBase>(
           { inputsSyncB[i] }, std::vector<char>(shader.begin(), shader.end()));
     }
 
@@ -86,7 +86,7 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
     auto startAsync = std::chrono::high_resolution_clock::now();
 
     for (uint32_t i = 0; i < numParallel; i++) {
-        mgrAsync.evalOpAsync<kp::OpAlgoBase<>>(
+        mgrAsync.evalOpAsync<kp::OpAlgoBase>(
           { inputsAsyncB[i] },
           "async" + std::to_string(i),
           std::vector<char>(shader.begin(), shader.end()));
@@ -151,10 +151,10 @@ TEST(TestAsyncOperations, TestManagerAsyncExecution)
 
     mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
 
-    mgr.evalOpAsync<kp::OpAlgoBase<>>(
+    mgr.evalOpAsync<kp::OpAlgoBase>(
       { tensorA }, "asyncOne", std::vector<char>(shader.begin(), shader.end()));
 
-    mgr.evalOpAsync<kp::OpAlgoBase<>>(
+    mgr.evalOpAsync<kp::OpAlgoBase>(
       { tensorB }, "asyncTwo", std::vector<char>(shader.begin(), shader.end()));
 
     mgr.evalOpAwait("asyncOne");
diff --git a/test/TestLogisticRegression.cpp b/test/TestLogisticRegression.cpp
index fa8dc7b59..eda6ca635 100644
--- a/test/TestLogisticRegression.cpp
+++ b/test/TestLogisticRegression.cpp
@@ -31,22 +31,21 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression)
     {
         kp::Manager mgr;
 
-        std::shared_ptr<kp::Sequence> sqTensor =
-          mgr.createManagedSequence().lock();
+        std::shared_ptr<kp::Sequence> sqTensor = mgr.createManagedSequence();
 
         sqTensor->begin();
         sqTensor->record<kp::OpTensorCreate>(params);
         sqTensor->end();
         sqTensor->eval();
 
-        std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+        std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
 
         // Record op algo base
         sq->begin();
 
         sq->record<kp::OpTensorSyncDevice>({ wIn, bIn });
 
-        sq->record<kp::OpAlgoBase<>>(
+        sq->record<kp::OpAlgoBase>(
           params, "test/shaders/glsl/test_logistic_regression.comp");
 
         sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
@@ -76,7 +75,7 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression)
     EXPECT_LT(bIn->data()[0], 0.0);
     EXPECT_LT(bIn->data()[0], 0.0);
 
-    //SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
+    // SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
     //            wIn->data(),
     //            bIn->data(),
     //            lOut->data());
@@ -114,20 +113,19 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy)
     {
         kp::Manager mgr;
 
-        std::shared_ptr<kp::Sequence> sqTensor =
-          mgr.createManagedSequence().lock();
+        std::shared_ptr<kp::Sequence> sqTensor = mgr.createManagedSequence();
 
         sqTensor->begin();
         sqTensor->record<kp::OpTensorCreate>(params);
         sqTensor->end();
         sqTensor->eval();
 
-        std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+        std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
 
         // Record op algo base
         sq->begin();
 
-        sq->record<kp::OpAlgoBase<>>(
+        sq->record<kp::OpAlgoBase>(
           params, "test/shaders/glsl/test_logistic_regression.comp");
 
         sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
@@ -158,7 +156,7 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy)
     EXPECT_GT(wIn->data()[1], 1.0);
     EXPECT_LT(bIn->data()[0], 0.0);
 
-    //SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
+    // SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
     //            wIn->data(),
     //            bIn->data(),
     //            lOut->data());
diff --git a/test/TestManager.cpp b/test/TestManager.cpp
index 1550d8efb..3076b2a62 100644
--- a/test/TestManager.cpp
+++ b/test/TestManager.cpp
@@ -17,7 +17,7 @@ TEST(TestManager, EndToEndOpMultFlow)
 
     mgr.evalOpDefault<kp::OpTensorCreate>({ tensorOutput });
 
-    mgr.evalOpDefault<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
+    mgr.evalOpDefault<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });
 
     mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorOutput });
 
@@ -35,23 +35,23 @@ TEST(TestManager, OpMultSequenceFlow)
 
     kp::Manager mgr;
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr =
-      mgr.getOrCreateManagedSequence("newSequence");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence");
+
         sq->begin();
 
         sq->record<kp::OpTensorCreate>({ tensorLHS });
         sq->record<kp::OpTensorCreate>({ tensorRHS });
         sq->record<kp::OpTensorCreate>({ tensorOutput });
 
-        sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
+        sq->record<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });
 
         sq->record<kp::OpTensorSyncLocal>({ tensorOutput });
 
         sq->end();
         sq->eval();
     }
-    sqWeakPtr.reset();
 
     EXPECT_EQ(tensorOutput->data(), std::vector<float>({ 0, 4, 12 }));
 }
@@ -60,22 +60,22 @@ TEST(TestManager, TestMultipleSequences)
 {
     kp::Manager mgr;
 
-    std::weak_ptr<kp::Sequence> sqWeakPtrOne =
+    std::shared_ptr<kp::Sequence> sqOne =
       mgr.getOrCreateManagedSequence("sqOne");
 
-    std::weak_ptr<kp::Sequence> sqWeakPtrTwo =
+    std::shared_ptr<kp::Sequence> sqTwo =
       mgr.getOrCreateManagedSequence("sqTwo");
 
-    std::weak_ptr<kp::Sequence> sqWeakPtrOneRef =
+    std::shared_ptr<kp::Sequence> sqOneRef =
       mgr.getOrCreateManagedSequence("sqOne");
 
-    std::weak_ptr<kp::Sequence> sqWeakPtrTwoRef =
+    std::shared_ptr<kp::Sequence> sqTwoRef =
       mgr.getOrCreateManagedSequence("sqTwo");
 
-    EXPECT_EQ(sqWeakPtrOne.lock(), sqWeakPtrOneRef.lock());
-    EXPECT_NE(sqWeakPtrTwo.lock(), sqWeakPtrOneRef.lock());
-    EXPECT_EQ(sqWeakPtrTwo.lock(), sqWeakPtrTwoRef.lock());
-    EXPECT_NE(sqWeakPtrOneRef.lock(), sqWeakPtrTwoRef.lock());
+    EXPECT_EQ(sqOne, sqOneRef);
+    EXPECT_NE(sqTwo, sqOneRef);
+    EXPECT_EQ(sqTwo, sqTwoRef);
+    EXPECT_NE(sqOneRef, sqTwoRef);
 }
 
 TEST(TestManager, TestMultipleTensorsAtOnce)
@@ -89,9 +89,10 @@ TEST(TestManager, TestMultipleTensorsAtOnce)
 
     kp::Manager mgr;
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr =
+    std::shared_ptr<kp::Sequence> sq =
       mgr.getOrCreateManagedSequence("newSequence");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+
+    {
         sq->begin();
 
         sq->record<kp::OpTensorCreate>({ tensorLHS, tensorRHS, tensorOutput });
@@ -100,14 +101,13 @@ TEST(TestManager, TestMultipleTensorsAtOnce)
         EXPECT_TRUE(tensorRHS->isInit());
         EXPECT_TRUE(tensorOutput->isInit());
 
-        sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
+        sq->record<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });
 
         sq->record<kp::OpTensorSyncLocal>({ tensorOutput });
 
         sq->end();
         sq->eval();
     }
-    sqWeakPtr.reset();
 
     EXPECT_EQ(tensorOutput->data(), std::vector<float>({ 0, 4, 12 }));
 }
diff --git a/test/TestMultipleAlgoExecutions.cpp b/test/TestMultipleAlgoExecutions.cpp
index cdccd62fc..11e94caa4 100644
--- a/test/TestMultipleAlgoExecutions.cpp
+++ b/test/TestMultipleAlgoExecutions.cpp
@@ -19,18 +19,19 @@ TEST(TestMultipleAlgoExecutions, SingleSequenceRecord)
           pa[index] = pa[index] + 1;
       })");
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr =
+    std::shared_ptr<kp::Sequence> sq =
       mgr.getOrCreateManagedSequence("newSequence");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+
+    {
         sq->begin();
 
         sq->record<kp::OpTensorCreate>({ tensorA });
 
-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
           { tensorA }, std::vector<char>(shader.begin(), shader.end()));
-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
           { tensorA }, std::vector<char>(shader.begin(), shader.end()));
-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
           { tensorA }, std::vector<char>(shader.begin(), shader.end()));
 
         sq->record<kp::OpTensorSyncLocal>({ tensorA });
@@ -38,7 +39,6 @@ TEST(TestMultipleAlgoExecutions, SingleSequenceRecord)
         sq->end();
         sq->eval();
     }
-    sqWeakPtr.reset();
 
     EXPECT_EQ(tensorA->data(), std::vector<float>({ 3, 3, 3 }));
 }
@@ -58,9 +58,9 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords)
           pa[index] = pa[index] + 1;
       })");
 
-    std::shared_ptr<kp::Sequence> sqTensor = mgr.createManagedSequence().lock();
+    std::shared_ptr<kp::Sequence> sqTensor = mgr.createManagedSequence();
 
-    std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+    std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
 
     // First create the tensor in a separate sequence
     sqTensor->begin();
@@ -70,20 +70,20 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords)
 
     // Then perform the computations
     sq->begin();
-    sq->record<kp::OpAlgoBase<3, 1, 1>>(
-      { tensorA }, std::vector<char>(shader.begin(), shader.end()));
+    sq->record<kp::OpAlgoBase>({ tensorA },
+                               std::vector<char>(shader.begin(), shader.end()));
     sq->end();
     sq->eval();
 
     sq->begin();
-    sq->record<kp::OpAlgoBase<3, 1, 1>>(
-      { tensorA }, std::vector<char>(shader.begin(), shader.end()));
+    sq->record<kp::OpAlgoBase>({ tensorA },
+                               std::vector<char>(shader.begin(), shader.end()));
     sq->end();
     sq->eval();
 
     sq->begin();
-    sq->record<kp::OpAlgoBase<3, 1, 1>>(
-      { tensorA }, std::vector<char>(shader.begin(), shader.end()));
+    sq->record<kp::OpAlgoBase>({ tensorA },
+                               std::vector<char>(shader.begin(), shader.end()));
     sq->end();
     sq->eval();
 
@@ -111,47 +111,51 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences)
           pa[index] = pa[index] + 1;
       })");
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr =
-      mgr.getOrCreateManagedSequence("newSequence");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence");
+
         sq->begin();
 
         sq->record<kp::OpTensorCreate>({ tensorA });
 
-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
           { tensorA }, std::vector<char>(shader.begin(), shader.end()));
 
         sq->end();
         sq->eval();
     }
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr2 =
-      mgr.getOrCreateManagedSequence("newSequence2");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence2");
+
         sq->begin();
 
-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
           { tensorA }, std::vector<char>(shader.begin(), shader.end()));
 
         sq->end();
         sq->eval();
     }
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr3 =
-      mgr.getOrCreateManagedSequence("newSequence3");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr3.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence3");
+
         sq->begin();
 
-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
           { tensorA }, std::vector<char>(shader.begin(), shader.end()));
 
         sq->end();
         sq->eval();
     }
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr4 =
-      mgr.getOrCreateManagedSequence("newSequence5");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr4.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence5");
+
         sq->begin();
 
         sq->record<kp::OpTensorSyncLocal>({ tensorA });
@@ -179,9 +183,10 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)
           pa[index] = pa[index] + 1;
       })");
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr =
-      mgr.getOrCreateManagedSequence("newSequence");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence");
+
         sq->begin();
 
         sq->record<kp::OpTensorCreate>({ tensorA });
@@ -190,12 +195,13 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)
         sq->eval();
     }
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr2 =
-      mgr.getOrCreateManagedSequence("newSequence2");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence2");
+
         sq->begin();
 
-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
           { tensorA }, std::vector<char>(shader.begin(), shader.end()));
 
         sq->end();
@@ -205,9 +211,10 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)
         sq->eval();
     }
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr3 =
-      mgr.getOrCreateManagedSequence("newSequence3");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence3");
+
         sq->begin();
 
         sq->record<kp::OpTensorSyncLocal>({ tensorA });
@@ -252,7 +259,7 @@ TEST(TestMultipleAlgoExecutions, ManagerEvalMultSourceStrOpCreate)
         }
       )");
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
       { tensorInA, tensorInB, tensorOut },
       std::vector<char>(shader.begin(), shader.end()));
 
@@ -289,7 +296,7 @@ TEST(TestMultipleAlgoExecutions, ManagerEvalMultSourceStrMgrCreate)
         }
       )");
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
       { tensorInA, tensorInB, tensorOut },
       std::vector<char>(shader.begin(), shader.end()));
 
diff --git a/test/TestOpAlgoLoopsPassingData.cpp b/test/TestOpAlgoLoopsPassingData.cpp
index 2c47b0de3..bd7727790 100644
--- a/test/TestOpAlgoLoopsPassingData.cpp
+++ b/test/TestOpAlgoLoopsPassingData.cpp
@@ -30,10 +30,10 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies)
         }
     )");
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr =
-      mgr.getOrCreateManagedSequence("default");
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("default");
 
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
         sq->begin();
 
         sq->record<kp::OpTensorCreate>({ tensorA, tensorB });
@@ -43,13 +43,13 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies)
         sq->eval();
     }
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr2 =
-      mgr.getOrCreateManagedSequence("run");
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("run");
 
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
         sq->begin();
 
-        sq->record<kp::OpAlgoBase<>>(
+        sq->record<kp::OpAlgoBase>(
           { tensorA, tensorB },
           std::vector<char>(shader.begin(), shader.end()));
 
@@ -61,10 +61,10 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies)
         }
     }
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr3 =
-      mgr.getOrCreateManagedSequence("export");
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("export");
 
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr3.lock()) {
         sq->begin();
 
         sq->record<kp::OpTensorSyncLocal>({ tensorA, tensorB });
diff --git a/test/TestOpShadersFromStringAndFile.cpp b/test/TestOpShadersFromStringAndFile.cpp
index 58a361558..273421b26 100644
--- a/test/TestOpShadersFromStringAndFile.cpp
+++ b/test/TestOpShadersFromStringAndFile.cpp
@@ -28,7 +28,7 @@ TEST(TestOpAlgoBase, ShaderRawDataFromConstructor)
         }
     )");
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
       { tensorA, tensorB }, std::vector<char>(shader.begin(), shader.end()));
 
     mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
@@ -45,7 +45,7 @@ TEST(TestOpAlgoBase, ShaderCompiledDataFromConstructor)
     std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor({ 0, 0, 0 }) };
     mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
       { tensorA, tensorB },
       std::vector<char>(
         kp::shader_data::test_shaders_glsl_test_op_custom_shader_comp_spv,
@@ -67,7 +67,7 @@ TEST(TestOpAlgoBase, ShaderRawDataFromFile)
     std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor({ 0, 0, 0 }) };
     mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
       { tensorA, tensorB }, "test/shaders/glsl/test_op_custom_shader.comp");
 
     mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
@@ -84,7 +84,7 @@ TEST(TestOpAlgoBase, ShaderCompiledDataFromFile)
     std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor({ 0, 0, 0 }) };
     mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
       { tensorA, tensorB }, "test/shaders/glsl/test_op_custom_shader.comp.spv");
 
     mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
diff --git a/test/TestSequence.cpp b/test/TestSequence.cpp
index c66dcf43a..882729dcf 100644
--- a/test/TestSequence.cpp
+++ b/test/TestSequence.cpp
@@ -7,10 +7,10 @@ TEST(TestSequence, CmdBufSequenceBeginEnd)
 {
     kp::Manager mgr;
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr =
-      mgr.getOrCreateManagedSequence("newSequence");
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence");
 
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
         EXPECT_TRUE(sq->eval());
         EXPECT_TRUE(!sq->isRecording());
         EXPECT_TRUE(sq->begin());
@@ -24,3 +24,18 @@ TEST(TestSequence, CmdBufSequenceBeginEnd)
         EXPECT_TRUE(sq->eval());
     }
 }
+
+TEST(TestSequence, SequenceDestructorViaManager)
+{
+    std::shared_ptr<kp::Sequence> sq = nullptr;
+    // Handle is declared outside the scope below so it outlives the manager.
+    {
+        kp::Manager mgr;
+
+        sq = mgr.getOrCreateManagedSequence("newSequence");
+
+        EXPECT_TRUE(sq->isInit());
+    }
+    // mgr is out of scope here; the sequence is expected to report not-initialised.
+    EXPECT_FALSE(sq->isInit());
+}
diff --git a/test/TestTensor.cpp b/test/TestTensor.cpp
index 676b9f423..42731bcfe 100644
--- a/test/TestTensor.cpp
+++ b/test/TestTensor.cpp
@@ -24,7 +24,7 @@ TEST(TestTensor, CopyFromHostData)
     kp::Manager mgr;
 
     if (std::shared_ptr<kp::Sequence> sq =
-          mgr.getOrCreateManagedSequence("new").lock()) {
+          mgr.getOrCreateManagedSequence("new")) {
         sq->begin();
 
         sq->record<kp::OpTensorCreate>({ tensorA, tensorB });