From 5d3795b539a7f23d23727abedfe5057ed8543b29 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 31 Oct 2020 17:36:05 +0000
Subject: [PATCH 01/39] Initial exploration of pybind11 for python bindings

---
 pybind/README.md | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 pybind/README.md

diff --git a/pybind/README.md b/pybind/README.md
new file mode 100644
index 000000000..7b0d89f0e
--- /dev/null
+++ b/pybind/README.md
@@ -0,0 +1,2 @@
+# Python Bindings for Vulkan Kompute
+

From 68c119df4c02b4deff0b1596f1a4a21abeaef8e7 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 31 Oct 2020 18:54:10 +0000
Subject: [PATCH 02/39] Added pybind as a submodule

---
 .gitmodules     | 3 +++
 pybind/pybind11 | 1 +
 2 files changed, 4 insertions(+)
 create mode 160000 pybind/pybind11

diff --git a/.gitmodules b/.gitmodules
index 1c5db0adc..7365ba0fd 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,6 @@
 	path = external/spdlog
 	url = https://github.com/gabime/spdlog
 	branch = v1.8.1
+[submodule "pybind/pybind11"]
+	path = pybind/pybind11
+	url = https://github.com/pybind/pybind11
diff --git a/pybind/pybind11 b/pybind/pybind11
new file mode 160000
index 000000000..06a54018c
--- /dev/null
+++ b/pybind/pybind11
@@ -0,0 +1 @@
+Subproject commit 06a54018c8a9fd9a7be5f5b56414b5da9259f637

From ac06761f1bc69f66cd25d7aebc7912f5e3394c01 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 31 Oct 2020 19:00:18 +0000
Subject: [PATCH 03/39] Added basic version of example of python bindings

---
 pybind/CMakeLists.txt |  5 +++
 pybind/setup.py       | 73 +++++++++++++++++++++++++++++++++++++++++++
 pybind/src/main.cpp   | 40 ++++++++++++++++++++++++
 3 files changed, 118 insertions(+)
 create mode 100644 pybind/CMakeLists.txt
 create mode 100644 pybind/setup.py
 create mode 100644 pybind/src/main.cpp

diff --git a/pybind/CMakeLists.txt b/pybind/CMakeLists.txt
new file mode 100644
index 000000000..31449ec1c
--- /dev/null
+++ b/pybind/CMakeLists.txt
@@ -0,0 +1,5 @@
+cmake_minimum_required(VERSION 2.8.12)
+project(cmake_example)
+
+add_subdirectory(pybind11)
+pybind11_add_module(cmake_example src/main.cpp)
diff --git a/pybind/setup.py b/pybind/setup.py
new file mode 100644
index 000000000..bd30b12b7
--- /dev/null
+++ b/pybind/setup.py
@@ -0,0 +1,73 @@
+import os
+import re
+import sys
+import platform
+import subprocess
+
+from setuptools import setup, Extension
+from setuptools.command.build_ext import build_ext
+from distutils.version import LooseVersion
+
+
+class CMakeExtension(Extension):
+    def __init__(self, name, sourcedir=''):
+        Extension.__init__(self, name, sources=[])
+        self.sourcedir = os.path.abspath(sourcedir)
+
+
+class CMakeBuild(build_ext):
+    def run(self):
+        try:
+            out = subprocess.check_output(['cmake', '--version'])
+        except OSError:
+            raise RuntimeError("CMake must be installed to build the following extensions: " +
+                               ", ".join(e.name for e in self.extensions))
+
+        if platform.system() == "Windows":
+            cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1))
+            if cmake_version < '3.1.0':
+                raise RuntimeError("CMake >= 3.1.0 is required on Windows")
+
+        for ext in self.extensions:
+            self.build_extension(ext)
+
+    def build_extension(self, ext):
+        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
+        # required for auto-detection of auxiliary "native" libs
+        if not extdir.endswith(os.path.sep):
+            extdir += os.path.sep
+
+        cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
+                      '-DPYTHON_EXECUTABLE=' + sys.executable]
+
+        cfg = 'Debug' if self.debug else 'Release'
+        build_args = ['--config', cfg]
+
+        if platform.system() == "Windows":
+            cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir)]
+            if sys.maxsize > 2**32:
+                cmake_args += ['-A', 'x64']
+            build_args += ['--', '/m']
+        else:
+            cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
+            build_args += ['--', '-j2']
+
+        env = os.environ.copy()
+        env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
+                                                              self.distribution.get_version())
+        if not os.path.exists(self.build_temp):
+            os.makedirs(self.build_temp)
+        subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
+        subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
+
+setup(
+    name='cmake_example',
+    version='0.0.1',
+    author='Dean Moldovan',
+    author_email='dean0x7d@gmail.com',
+    description='A test project using pybind11 and CMake',
+    long_description='',
+    ext_modules=[CMakeExtension('cmake_example')],
+    cmdclass=dict(build_ext=CMakeBuild),
+    zip_safe=False,
+)
diff --git a/pybind/src/main.cpp b/pybind/src/main.cpp
new file mode 100644
index 000000000..86ab58210
--- /dev/null
+++ b/pybind/src/main.cpp
@@ -0,0 +1,40 @@
+#include <pybind11/pybind11.h>
+
+int add(int i, int j) {
+    return i + j;
+}
+
+namespace py = pybind11;
+
+PYBIND11_MODULE(cmake_example, m) {
+    m.doc() = R"pbdoc(
+        Pybind11 example plugin
+        -----------------------
+
+        .. currentmodule:: cmake_example
+
+        .. autosummary::
+           :toctree: _generate
+
+           add
+           subtract
+    )pbdoc";
+
+    m.def("add", &add, R"pbdoc(
+        Add two numbers
+
+        Some other explanation about the add function.
+    )pbdoc");
+
+    m.def("subtract", [](int i, int j) { return i - j; }, R"pbdoc(
+        Subtract two numbers
+
+        Some other explanation about the subtract function.
+    )pbdoc");
+
+#ifdef VERSION_INFO
+    m.attr("__version__") = VERSION_INFO;
+#else
+    m.attr("__version__") = "dev";
+#endif
+}

From e3e111e07ffd0bf485fda1949a503c4b32c888ff Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 31 Oct 2020 19:09:01 +0000
Subject: [PATCH 04/39] Updated modules

---
 .gitmodules | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitmodules b/.gitmodules
index 7365ba0fd..c16e05825 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,3 +13,4 @@
 [submodule "pybind/pybind11"]
 	path = pybind/pybind11
 	url = https://github.com/pybind/pybind11
+

From 281aabf05b28a1c27622914167264152a88d6b68 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 31 Oct 2020 19:10:09 +0000
Subject: [PATCH 05/39] Updated folder

---
 pybind/pybind11                   | 1 -
 {pybind => python}/CMakeLists.txt | 0
 {pybind => python}/README.md      | 0
 {pybind => python}/setup.py       | 0
 {pybind => python}/src/main.cpp   | 0
 5 files changed, 1 deletion(-)
 delete mode 160000 pybind/pybind11
 rename {pybind => python}/CMakeLists.txt (100%)
 rename {pybind => python}/README.md (100%)
 rename {pybind => python}/setup.py (100%)
 rename {pybind => python}/src/main.cpp (100%)

diff --git a/pybind/pybind11 b/pybind/pybind11
deleted file mode 160000
index 06a54018c..000000000
--- a/pybind/pybind11
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 06a54018c8a9fd9a7be5f5b56414b5da9259f637
diff --git a/pybind/CMakeLists.txt b/python/CMakeLists.txt
similarity index 100%
rename from pybind/CMakeLists.txt
rename to python/CMakeLists.txt
diff --git a/pybind/README.md b/python/README.md
similarity index 100%
rename from pybind/README.md
rename to python/README.md
diff --git a/pybind/setup.py b/python/setup.py
similarity index 100%
rename from pybind/setup.py
rename to python/setup.py
diff --git a/pybind/src/main.cpp b/python/src/main.cpp
similarity index 100%
rename from pybind/src/main.cpp
rename to python/src/main.cpp

From 816c5c7f5d10c4c453b0bbb9950f057311a0d68b Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 31 Oct 2020 19:12:23 +0000
Subject: [PATCH 06/39] Readded pybind module

---
 .gitmodules     | 5 ++---
 python/pybind11 | 1 +
 2 files changed, 3 insertions(+), 3 deletions(-)
 create mode 160000 python/pybind11

diff --git a/.gitmodules b/.gitmodules
index c16e05825..33549db54 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,7 +10,6 @@
 	path = external/spdlog
 	url = https://github.com/gabime/spdlog
 	branch = v1.8.1
-[submodule "pybind/pybind11"]
-	path = pybind/pybind11
+[submodule "python/pybind11"]
+	path = python/pybind11
 	url = https://github.com/pybind/pybind11
-
diff --git a/python/pybind11 b/python/pybind11
new file mode 160000
index 000000000..06a54018c
--- /dev/null
+++ b/python/pybind11
@@ -0,0 +1 @@
+Subproject commit 06a54018c8a9fd9a7be5f5b56414b5da9259f637

From f86e5b1341850ea0606fb5ab74fa8d960765350e Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 06:53:51 +0000
Subject: [PATCH 07/39] Updated setup.py to build base python setup

---
 CMakeLists.txt              | 11 ++++++++---
 python/CMakeLists.txt       |  3 +--
 python/src/main.cpp         | 29 +----------------------------
 python/setup.py => setup.py |  3 +++
 4 files changed, 13 insertions(+), 33 deletions(-)
 rename python/setup.py => setup.py (95%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 52e45fcf9..6f1338b87 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.4.1)
-project(kompute VERSION 0.3.0)
+project(kompute VERSION 0.4.2)
 
 set(CMAKE_CXX_STANDARD 14)
 
@@ -13,6 +13,7 @@ option(KOMPUTE_OPT_BUILD_SHADERS "Enable if you want to re-build all shader file
 option(KOMPUTE_OPT_BUILD_SINGLE_HEADER "Enable if you want to build the single header file" 0)
 option(KOMPUTE_OPT_INSTALL "Enable if you want to enable installation" 0)
 # Build options
+option(KOMPUTE_OPT_BUILD_PYTHON "Enable if you want to build python bindings" 0)
 option(KOMPUTE_OPT_ENABLE_SPDLOG "Extra compile flags for Kompute, see docs for full list" 0)
 option(KOMPUTE_OPT_REPO_SUBMODULE_BUILD, "Use the submodule repos instead of external package manager" 0)
 option(KOMPUTE_OPT_ANDOID_BUILD "Enable android compilation flags required" 0)
@@ -43,12 +44,16 @@ endfunction()
 
 add_subdirectory(src)
 
+if(KOMPUTE_OPT_BUILD_TESTS)
+    add_subdirectory(test)
+endif()
+
 if(KOMPUTE_OPT_BUILD_DOCS)
     set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/config" ${CMAKE_MODULE_PATH})
     add_subdirectory(docs)
 endif()
 
-if(KOMPUTE_OPT_BUILD_TESTS)
-    add_subdirectory(test)
+if(KOMPUTE_OPT_BUILD_PYTHON)
+    add_subdirectory(python)
 endif()
 
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 31449ec1c..f0b4949ac 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1,5 +1,4 @@
-cmake_minimum_required(VERSION 2.8.12)
-project(cmake_example)
 
 add_subdirectory(pybind11)
 pybind11_add_module(cmake_example src/main.cpp)
+
diff --git a/python/src/main.cpp b/python/src/main.cpp
index 86ab58210..1330bab19 100644
--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@@ -1,36 +1,9 @@
 #include <pybind11/pybind11.h>
-
-int add(int i, int j) {
-    return i + j;
-}
+#include "kompute/Kompute.hpp"
 
 namespace py = pybind11;
 
 PYBIND11_MODULE(cmake_example, m) {
-    m.doc() = R"pbdoc(
-        Pybind11 example plugin
-        -----------------------
-
-        .. currentmodule:: cmake_example
-
-        .. autosummary::
-           :toctree: _generate
-
-           add
-           subtract
-    )pbdoc";
-
-    m.def("add", &add, R"pbdoc(
-        Add two numbers
-
-        Some other explanation about the add function.
-    )pbdoc");
-
-    m.def("subtract", [](int i, int j) { return i - j; }, R"pbdoc(
-        Subtract two numbers
-
-        Some other explanation about the subtract function.
-    )pbdoc");
 
 #ifdef VERSION_INFO
     m.attr("__version__") = VERSION_INFO;
diff --git a/python/setup.py b/setup.py
similarity index 95%
rename from python/setup.py
rename to setup.py
index bd30b12b7..07d769b5b 100644
--- a/python/setup.py
+++ b/setup.py
@@ -38,6 +38,8 @@ class CMakeBuild(build_ext):
             extdir += os.path.sep
 
         cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
+                      '-DKOMPUTE_OPT_BUILD_PYTHON=1',
+                      '-DKOMPUTE_OPT_BUILD_SINGLE_HEADER=1',
                       '-DPYTHON_EXECUTABLE=' + sys.executable]
 
         cfg = 'Debug' if self.debug else 'Release'
@@ -57,6 +59,7 @@ class CMakeBuild(build_ext):
                                                               self.distribution.get_version())
         if not os.path.exists(self.build_temp):
             os.makedirs(self.build_temp)
+
         subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
         subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
 

From 9559c79eee1b6a00decb48e5f1d27f59a637e617 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 15:13:33 +0000
Subject: [PATCH 08/39] Updated ccls to include pybind

---
 .ccls | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.ccls b/.ccls
index 2ce15d72f..f215ea9d1 100644
--- a/.ccls
+++ b/.ccls
@@ -13,6 +13,7 @@
 -DDEBUG=1
 -DKOMPUTE_INCLUDE_FOR_SYNTAX
 
+-I./python/pybind11/include/
 -I./external/Vulkan-Headers/include/
 -I./external/googletest/googletest/include/
 -I./external/spdlog/include/

From 0e9ba00b710e060b46ec1c23a4ee4f9542b3b031 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 15:14:22 +0000
Subject: [PATCH 09/39] Added base capabilities for tensor in python

---
 python/CMakeLists.txt |  9 ++++++++-
 python/src/main.cpp   | 22 ++++++++++++++++++++--
 setup.py              | 10 +++++-----
 src/CMakeLists.txt    |  3 ++-
 4 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index f0b4949ac..6ef7fde4b 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1,4 +1,11 @@
 
 add_subdirectory(pybind11)
-pybind11_add_module(cmake_example src/main.cpp)
+pybind11_add_module(komputepy src/main.cpp)
+
+include_directories(
+        ${PROJECT_SOURCE_DIR}/single_include/)
+
+target_link_libraries(
+    komputepy PRIVATE
+    kompute::kompute)
 
diff --git a/python/src/main.cpp b/python/src/main.cpp
index 1330bab19..5fe74f021 100644
--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@@ -1,9 +1,27 @@
 #include <pybind11/pybind11.h>
-#include "kompute/Kompute.hpp"
+#include <pybind11/stl.h>
+
+#include <kompute/Kompute.hpp>
 
 namespace py = pybind11;
 
-PYBIND11_MODULE(cmake_example, m) {
+PYBIND11_MODULE(komputepy, m) {
+
+    py::enum_<kp::Tensor::TensorTypes>(m, "TensorTypes")
+        .value("eDevice", kp::Tensor::TensorTypes::eDevice)
+        .value("eStaging", kp::Tensor::TensorTypes::eStaging)
+        .value("eStorage", kp::Tensor::TensorTypes::eStorage)
+        .export_values();
+
+    py::class_<kp::Tensor>(m, "Tensor")
+        .def(py::init(
+            [](const std::vector<float>& data) {
+                return std::unique_ptr<kp::Tensor>(new kp::Tensor(data));
+            }))
+        .def(py::init(
+            [](const std::vector<float>& data, kp::Tensor::TensorTypes tensorTypes) {
+                return std::unique_ptr<kp::Tensor>(new kp::Tensor(data, tensorTypes));
+            }));
 
 #ifdef VERSION_INFO
     m.attr("__version__") = VERSION_INFO;
diff --git a/setup.py b/setup.py
index 07d769b5b..e09673a97 100644
--- a/setup.py
+++ b/setup.py
@@ -51,6 +51,7 @@ class CMakeBuild(build_ext):
                 cmake_args += ['-A', 'x64']
             build_args += ['--', '/m']
         else:
+            cmake_args += ['-DKOMPUTE_EXTRA_CXX_FLAGS="-fPIC"']
             cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
             build_args += ['--', '-j2']
 
@@ -64,13 +65,12 @@ class CMakeBuild(build_ext):
         subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
 
 setup(
-    name='cmake_example',
+    name='komputepy',
     version='0.0.1',
-    author='Dean Moldovan',
-    author_email='dean0x7d@gmail.com',
-    description='A test project using pybind11 and CMake',
+    author='Alejandro Saucedo',
+    description='Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU processing usecases.',
     long_description='',
-    ext_modules=[CMakeExtension('cmake_example')],
+    ext_modules=[CMakeExtension('komputepy')],
     cmdclass=dict(build_ext=CMakeBuild),
     zip_safe=False,
 )
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6161b782b..348c0536a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -48,7 +48,8 @@ if(KOMPUTE_OPT_ANDOID_BUILD)
         ${PROJECT_SOURCE_DIR}/vk_ndk_wrapper_include/kompute_vk_ndk_wrapper.cpp)
 endif()
 
-add_library(kompute
+add_library(
+    kompute STATIC
     ${kompute_CPP})
 
 target_include_directories(

From 6afe6463c2f737fad2aab39c01e4f5a3732e29b3 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 16:27:59 +0000
Subject: [PATCH 10/39] Updated to add opbase

---
 python/src/main.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/src/main.cpp b/python/src/main.cpp
index 5fe74f021..e3b7fb371 100644
--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@@ -21,7 +21,10 @@ PYBIND11_MODULE(komputepy, m) {
         .def(py::init(
             [](const std::vector<float>& data, kp::Tensor::TensorTypes tensorTypes) {
                 return std::unique_ptr<kp::Tensor>(new kp::Tensor(data, tensorTypes));
-            }));
+            }))
+        .def("data", &kp::Tensor::data);
+
+    py::class_<kp::OpBase>(m, "OpBase");
 
 #ifdef VERSION_INFO
     m.attr("__version__") = VERSION_INFO;

From 3ad5e4d3e780e1bbd78cf698bf081134822d4a06 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 16:28:48 +0000
Subject: [PATCH 11/39] Removed workgroup templates on opalgobase classes

---
 src/OpAlgoBase.cpp                            | 162 ++++++++++++++
 src/OpAlgoLhsRhsOut.cpp                       | 129 +++++++++++
 src/include/kompute/operations/OpAlgoBase.hpp | 206 ++----------------
 .../kompute/operations/OpAlgoLhsRhsOut.hpp    | 145 +-----------
 src/include/kompute/operations/OpMult.hpp     |  24 +-
 5 files changed, 322 insertions(+), 344 deletions(-)
 create mode 100644 src/OpAlgoBase.cpp
 create mode 100644 src/OpAlgoLhsRhsOut.cpp

diff --git a/src/OpAlgoBase.cpp b/src/OpAlgoBase.cpp
new file mode 100644
index 000000000..99e3a9ac1
--- /dev/null
+++ b/src/OpAlgoBase.cpp
@@ -0,0 +1,162 @@
+// Out-of-line definitions for kp::OpAlgoBase (declared in OpAlgoBase.hpp)
+
+#include "kompute/operations/OpAlgoBase.hpp"
+
+namespace kp {
+
+OpAlgoBase::OpAlgoBase()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor base");
+}
+
+OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                           std::shared_ptr<vk::Device> device,
+                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                           std::vector<std::shared_ptr<Tensor>>& tensors,
+                           KomputeWorkgroup komputeWorkgroup)
+  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
+
+    // The dispatch size is taken from the explicitly provided komputeWorkgroup
+    // or, when its x is 0, it defaults to the size of the first tensor
+    if (komputeWorkgroup.x > 0) {
+        // x was provided, so it is used for the dispatch as-is; y and z fall
+        // back to 1 when they were not set (0)
+        this->mKomputeWorkgroup = {
+            komputeWorkgroup.x,
+            komputeWorkgroup.y > 0 ? komputeWorkgroup.y : 1,
+            komputeWorkgroup.z > 0 ? komputeWorkgroup.z : 1
+        };
+    } else {
+        this->mKomputeWorkgroup = {tensors[0]->size(), 1, 1};
+    }
+    SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
+                 this->mKomputeWorkgroup.x,
+                 this->mKomputeWorkgroup.y,
+                 this->mKomputeWorkgroup.z);
+
+    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
+}
+
+OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                           std::shared_ptr<vk::Device> device,
+                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                           std::vector<std::shared_ptr<Tensor>>& tensors,
+                           std::string shaderFilePath,
+                           KomputeWorkgroup komputeWorkgroup)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
+
+    this->mShaderFilePath = shaderFilePath;
+}
+
+OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                           std::shared_ptr<vk::Device> device,
+                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                           std::vector<std::shared_ptr<Tensor>>& tensors,
+                           const std::vector<char>& shaderDataRaw,
+                           KomputeWorkgroup komputeWorkgroup)
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
+
+    this->mShaderDataRaw = shaderDataRaw;
+}
+
+OpAlgoBase::~OpAlgoBase()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
+}
+
+void
+OpAlgoBase::init()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase init called");
+
+    if (this->mTensors.size() < 1) {
+        throw std::runtime_error(
+          "Kompute OpAlgoBase called with less than 1 tensor");
+    } 
+
+    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
+        if(!tensor->isInit()) {
+            throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised.");
+        }
+    }
+
+    SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
+
+    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
+
+    SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
+
+    this->mAlgorithm->init(shaderFileData, this->mTensors);
+}
+
+void
+OpAlgoBase::record()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase record called");
+
+    // Barrier to ensure the data is finished writing to buffer memory
+    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
+        tensor->recordBufferMemoryBarrier(
+          this->mCommandBuffer,
+          vk::AccessFlagBits::eHostWrite,
+          vk::AccessFlagBits::eShaderRead,
+          vk::PipelineStageFlagBits::eHost,
+          vk::PipelineStageFlagBits::eComputeShader);
+    }
+
+    this->mAlgorithm->recordDispatch(this->mKomputeWorkgroup.x, this->mKomputeWorkgroup.y, this->mKomputeWorkgroup.z);
+}
+
+void
+OpAlgoBase::preEval()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
+}
+
+void
+OpAlgoBase::postEval()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
+}
+
+// Returns the SPIR-V bytes from mShaderFilePath if set, else mShaderDataRaw
+std::vector<char> OpAlgoBase::fetchSpirvBinaryData()
+{
+    SPDLOG_WARN("Kompute OpAlgoBase Running shaders directly from spirv file");
+
+    if (this->mShaderFilePath.size()) {
+        std::ifstream fileStream(this->mShaderFilePath,
+                                 std::ios::binary | std::ios::in | std::ios::ate);
+
+        if (!fileStream.good()) {
+            throw std::runtime_error("Error reading file: " + this->mShaderFilePath);
+        }
+
+        // Opened at the end (ios::ate), so tellg() yields the file size
+        size_t shaderFileSize = fileStream.tellg();
+        fileStream.seekg(0, std::ios::beg);
+        // Read into a vector so the buffer is owned and released automatically
+        std::vector<char> shaderDataRaw(shaderFileSize);
+        fileStream.read(shaderDataRaw.data(), shaderFileSize);
+        fileStream.close();
+
+        SPDLOG_WARN("Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
+
+        return shaderDataRaw;
+    }
+    else if (this->mShaderDataRaw.size()) {
+        return this->mShaderDataRaw;
+    }
+    else {
+        throw std::runtime_error("Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither filepath nor data provided");
+    }
+}
+
+}
+
diff --git a/src/OpAlgoLhsRhsOut.cpp b/src/OpAlgoLhsRhsOut.cpp
new file mode 100644
index 000000000..444ec63a3
--- /dev/null
+++ b/src/OpAlgoLhsRhsOut.cpp
@@ -0,0 +1,129 @@
+// Out-of-line definitions for kp::OpAlgoLhsRhsOut (declared in OpAlgoLhsRhsOut.hpp)
+
+#include "kompute/operations/OpAlgoLhsRhsOut.hpp"
+
+namespace kp {
+
+OpAlgoLhsRhsOut::OpAlgoLhsRhsOut()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
+}
+
+OpAlgoLhsRhsOut::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+                           std::shared_ptr<vk::Device> device,
+                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                           std::vector<std::shared_ptr<Tensor>> tensors,
+                           KomputeWorkgroup komputeWorkgroup)
+  // The inheritance is initialised with the copyOutputData to false given that
+  // this derived class handles the transfer of data via staging buffers in
+  // a granular way.
+  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
+}
+
+OpAlgoLhsRhsOut::~OpAlgoLhsRhsOut()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut destructor started");
+}
+
+// Validates the three tensors, then sets up the staging tensor and algorithm
+void
+OpAlgoLhsRhsOut::init()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut init called");
+
+    if (this->mTensors.size() < 3) {
+        throw std::runtime_error(
+          "Kompute OpAlgoLhsRhsOut called with less than 3 tensors");
+    } else if (this->mTensors.size() > 3) {
+        SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 tensors");
+    }
+
+    // Only the first three tensors are used, in LHS, RHS, output order
+    this->mTensorLHS = this->mTensors[0];
+    this->mTensorRHS = this->mTensors[1];
+    this->mTensorOutput = this->mTensors[2];
+
+    if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
+          this->mTensorOutput->isInit())) {
+        throw std::runtime_error(
+          "Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. LHS: " +
+          std::to_string(this->mTensorLHS->isInit()) +
+          " RHS: " + std::to_string(this->mTensorRHS->isInit()) +
+          " Output: " + std::to_string(this->mTensorOutput->isInit()));
+    }
+
+    if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
+          this->mTensorRHS->size() == this->mTensorOutput->size())) {
+        throw std::runtime_error(
+          "Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size LHS: " +
+          std::to_string(this->mTensorLHS->size()) +
+          " RHS: " + std::to_string(this->mTensorRHS->size()) +
+          " Output: " + std::to_string(this->mTensorOutput->size()));
+    }
+
+    // Staging tensor that record() copies the output into and postEval() reads
+    this->mTensorOutputStaging = std::make_shared<Tensor>(
+      this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
+    this->mTensorOutputStaging->init(this->mPhysicalDevice, this->mDevice);
+
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
+
+    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
+
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut Initialising algorithm component");
+
+    this->mAlgorithm->init(shaderFileData, this->mTensors);
+}
+
+void
+OpAlgoLhsRhsOut::record()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut record called");
+
+    // Barrier to ensure the data is finished writing to buffer memory
+    this->mTensorLHS->recordBufferMemoryBarrier(
+      this->mCommandBuffer,
+      vk::AccessFlagBits::eHostWrite,
+      vk::AccessFlagBits::eShaderRead,
+      vk::PipelineStageFlagBits::eHost,
+      vk::PipelineStageFlagBits::eComputeShader);
+    this->mTensorRHS->recordBufferMemoryBarrier(
+      this->mCommandBuffer,
+      vk::AccessFlagBits::eHostWrite,
+      vk::AccessFlagBits::eShaderRead,
+      vk::PipelineStageFlagBits::eHost,
+      vk::PipelineStageFlagBits::eComputeShader);
+
+    this->mAlgorithm->recordDispatch(
+                this->mKomputeWorkgroup.x,
+                this->mKomputeWorkgroup.y,
+                this->mKomputeWorkgroup.z);
+
+    // Barrier to ensure the shader code is executed before buffer read
+    this->mTensorOutput->recordBufferMemoryBarrier(
+      this->mCommandBuffer,
+      vk::AccessFlagBits::eShaderWrite,
+      vk::AccessFlagBits::eTransferRead,
+      vk::PipelineStageFlagBits::eComputeShader,
+      vk::PipelineStageFlagBits::eTransfer);
+
+    this->mTensorOutputStaging->recordCopyFrom(
+            this->mCommandBuffer,
+            this->mTensorOutput,
+            true);
+}
+
+void
+OpAlgoLhsRhsOut::postEval()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");
+
+    this->mTensorOutputStaging->mapDataFromHostMemory();
+
+    this->mTensorOutput->setData(this->mTensorOutputStaging->data());
+}
+
+}
+
diff --git a/src/include/kompute/operations/OpAlgoBase.hpp b/src/include/kompute/operations/OpAlgoBase.hpp
index 653006952..74108d285 100644
--- a/src/include/kompute/operations/OpAlgoBase.hpp
+++ b/src/include/kompute/operations/OpAlgoBase.hpp
@@ -17,20 +17,17 @@ namespace kp {
  * Operation that provides a general abstraction that simplifies the use of 
  * algorithm and parameter components which can be used with shaders.
  * By default it enables the user to provide a dynamic number of tensors
- * which are then passed as inputs. 
- *
- * All of these tensors are expected to be initlaised and this is checked with throw std exception in the init function.
- *
- * See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
- * 
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * which are then passed as inputs.
  */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
 class OpAlgoBase : public OpBase
 {
   public:
+    struct KomputeWorkgroup {
+        uint32_t x;
+        uint32_t y;
+        uint32_t z;
+    };
+
     /**
      *  Base constructor, should not be used unless explicitly intended.
      */
@@ -46,11 +43,13 @@ class OpAlgoBase : public OpBase
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
      * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors);
+           std::vector<std::shared_ptr<Tensor>>& tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Constructor that enables a file to be passed to the operation with
@@ -61,13 +60,15 @@ class OpAlgoBase : public OpBase
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
-     * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+     * @param shaderFilePath Parameter to specify the shader to load (either in spirv or raw format)
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>>& tensors,
-           std::string shaderFilePath);
+           std::string shaderFilePath,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Constructor that enables raw shader data to be passed to the main operation
@@ -78,12 +79,14 @@ class OpAlgoBase : public OpBase
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
      * @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>>& tensors,
-           const std::vector<char>& shaderDataRaw);
+           const std::vector<char>& shaderDataRaw,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Default destructor, which is in charge of destroying the algorithm
@@ -131,9 +134,7 @@ class OpAlgoBase : public OpBase
 
     // -------------- ALWAYS OWNED RESOURCES
 
-    uint32_t mX;
-    uint32_t mY;
-    uint32_t mZ;
+    KomputeWorkgroup mKomputeWorkgroup;
 
     std::string mShaderFilePath; ///< Optional member variable which can be provided for the OpAlgoBase to find the data automatically and load for processing
     std::vector<char> mShaderDataRaw; ///< Optional member variable which can be provided to contain either the raw shader content or the spirv binary content
@@ -143,174 +144,3 @@ class OpAlgoBase : public OpBase
 
 } // End namespace kp
 
-// Including implementation for template class
-#ifndef OPALGOBASE_IMPL
-#define OPALGOBASE_IMPL
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors)
-  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
-
-    // The dispatch size is set up based on either explicitly provided template
-    // parameters or by default it would take the shape and size of the tensors
-    if (tX > 0) {
-        // If at least the x value is provided we use mainly the parameters
-        // provided
-        this->mX = tX;
-        this->mY = tY > 0 ? tY : 1;
-        this->mZ = tZ > 0 ? tZ : 1;
-    } else {
-        this->mX = tensors[0]->size();
-        this->mY = 1;
-        this->mZ = 1;
-    }
-    SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
-                 this->mX,
-                 this->mY,
-                 this->mZ);
-
-    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           std::string shaderFilePath)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
-
-    this->mShaderFilePath = shaderFilePath;
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           const std::vector<char>& shaderDataRaw)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
-
-    this->mShaderDataRaw = shaderDataRaw;
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase init called");
-
-    if (this->mTensors.size() < 1) {
-        throw std::runtime_error(
-          "Kompute OpAlgoBase called with less than 1 tensor");
-    } 
-
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        if(!tensor->isInit()) {
-            throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised.");
-        }
-    }
-
-    SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        tensor->recordBufferMemoryBarrier(
-          this->mCommandBuffer,
-          vk::AccessFlagBits::eHostWrite,
-          vk::AccessFlagBits::eShaderRead,
-          vk::PipelineStageFlagBits::eHost,
-          vk::PipelineStageFlagBits::eComputeShader);
-    }
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::preEval()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::postEval()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-std::vector<char> OpAlgoBase<tX, tY, tZ>::fetchSpirvBinaryData() 
-{
-    SPDLOG_WARN(
-      "Kompute OpAlgoBase Running shaders directly from spirv file");
-
-    if (this->mShaderFilePath.size()) {
-        std::ifstream fileStream(this->mShaderFilePath,
-                                 std::ios::binary | std::ios::in | std::ios::ate);
-
-        if (!fileStream.good()) {
-            throw std::runtime_error("Error reading file: " + this->mShaderFilePath);
-        }
-
-        size_t shaderFileSize = fileStream.tellg();
-        fileStream.seekg(0, std::ios::beg);
-        char* shaderDataRaw = new char[shaderFileSize];
-        fileStream.read(shaderDataRaw, shaderFileSize);
-        fileStream.close();
-
-        SPDLOG_WARN(
-          "Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
-
-        return std::vector<char>(shaderDataRaw,
-                                 shaderDataRaw + shaderFileSize);
-    }
-    else if (this->mShaderDataRaw.size()) {
-        return this->mShaderDataRaw;
-    }
-    else {
-        throw std::runtime_error("Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither filepath nor data provided");
-    }
-}
-
-}
-
-#endif // #ifndef OPALGOBASE_IMPL
-
diff --git a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
index 5c22bdcc6..c826bd324 100644
--- a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
+++ b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp
@@ -15,12 +15,8 @@ namespace kp {
  * Operation base class to simplify the creation of operations that require
  * right hand and left hand side datapoints together with a single output.
  * The expected data passed is two input tensors and one output tensor.
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
  */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
+class OpAlgoLhsRhsOut : public OpAlgoBase
 {
   public:
     /**
@@ -38,11 +34,13 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
      * @param freeTensors Whether operation manages the memory of the Tensors
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors);
+           std::vector<std::shared_ptr<Tensor>> tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Default destructor, which is in charge of destroying the algorithm
@@ -73,7 +71,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
      * of the GPU Device memory into the staging buffer so the output data can
      * be retrieved.
      */
-    virtual void postSubmit() override;
+    virtual void postEval() override;
 
   protected:
     // -------------- NEVER OWNED RESOURCES
@@ -87,136 +85,3 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
 
 } // End namespace kp
 
-// Including implementation for template class
-#ifndef OPALGOLHSRHSOUT_CPP
-#define OPALGOLHSRHSOUT_CPP
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>> tensors)
-  // The inheritance is initialised with the copyOutputData to false given that
-  // this depencendant class handles the transfer of data via staging buffers in 
-  // a granular way.
-  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::~OpAlgoLhsRhsOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut destructor started");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut init called");
-
-    if (this->mTensors.size() < 3) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut called with less than 1 tensor");
-    } else if (this->mTensors.size() > 3) {
-        SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
-    }
-
-    this->mTensorLHS = this->mTensors[0];
-    this->mTensorRHS = this->mTensors[1];
-    this->mTensorOutput = this->mTensors[2];
-
-
-    if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
-          this->mTensorOutput->isInit())) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. LHS: " +
-          std::to_string(this->mTensorLHS->isInit()) +
-          " RHS: " + std::to_string(this->mTensorRHS->isInit()) +
-          " Output: " + std::to_string(this->mTensorOutput->isInit()));
-    }
-
-    if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
-          this->mTensorRHS->size() == this->mTensorOutput->size())) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size LHS: " +
-          std::to_string(this->mTensorLHS->size()) +
-          " RHS: " + std::to_string(this->mTensorRHS->size()) +
-          " Output: " + std::to_string(this->mTensorOutput->size()));
-    }
-
-    this->mTensorOutputStaging = std::make_shared<Tensor>(
-      this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
-
-    this->mTensorOutputStaging->init(
-      this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
-
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    this->mTensorLHS->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eHostWrite,
-      vk::AccessFlagBits::eShaderRead,
-      vk::PipelineStageFlagBits::eHost,
-      vk::PipelineStageFlagBits::eComputeShader);
-    this->mTensorRHS->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eHostWrite,
-      vk::AccessFlagBits::eShaderRead,
-      vk::PipelineStageFlagBits::eHost,
-      vk::PipelineStageFlagBits::eComputeShader);
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-
-    // Barrier to ensure the shader code is executed before buffer read
-    this->mTensorOutput->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eShaderWrite,
-      vk::AccessFlagBits::eTransferRead,
-      vk::PipelineStageFlagBits::eComputeShader,
-      vk::PipelineStageFlagBits::eTransfer);
-
-    this->mTensorOutputStaging->recordCopyFrom(
-            this->mCommandBuffer,
-            this->mTensorOutput,
-            true);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::postSubmit()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");
-
-    this->mTensorOutputStaging->mapDataFromHostMemory();
-
-    this->mTensorOutput->setData(this->mTensorOutputStaging->data());
-}
-
-}
-
-#endif // #ifndef OPALGOLHSRHSOUT_CPP
-
diff --git a/src/include/kompute/operations/OpMult.hpp b/src/include/kompute/operations/OpMult.hpp
index ba3cb21a0..f555f8ac1 100644
--- a/src/include/kompute/operations/OpMult.hpp
+++ b/src/include/kompute/operations/OpMult.hpp
@@ -17,12 +17,9 @@ namespace kp {
 
 /**
  * Operation that performs multiplication on two tensors and outpus on third
- * tensor. The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * tensor.
  */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpMult : public OpAlgoBase<tX, tY, tZ>
+class OpMult : public OpAlgoBase
 {
   public:
     /**
@@ -41,13 +38,14 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpMult(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors)
-      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
+           std::vector<std::shared_ptr<Tensor>> tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup())
+      : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "", komputeWorkgroup)
     {
         SPDLOG_DEBUG("Kompute OpMult constructor with params");
 
@@ -58,14 +56,8 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
 
 #if RELEASE
     /**
-     * If release it will be using the static version of the shader which is 
-     * loaded using this file directly.
-     *
-     * @param physicalDevice Vulkan physical device used to find device queues
-     * @param device Vulkan logical device for passing to Algorithm
-     * @param commandBuffer Vulkan Command Buffer to record commands into
-     * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
+     * If RELEASE=1 it will be using the static version of the shader which is 
+     * loaded using this file directly. Otherwise it should not override the function.
      */
     std::vector<char> fetchSpirvBinaryData() override
     {

From 552a6c051fee47c81e883cef0b295bae78499741 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 16:29:07 +0000
Subject: [PATCH 12/39] Updated tests without templates on opalgobase classes

---
 test/TestAsyncOperations.cpp            |  8 ++++----
 test/TestLogisticRegression.cpp         |  4 ++--
 test/TestManager.cpp                    |  6 +++---
 test/TestMultipleAlgoExecutions.cpp     | 24 ++++++++++++------------
 test/TestOpAlgoLoopsPassingData.cpp     |  2 +-
 test/TestOpShadersFromStringAndFile.cpp |  8 ++++----
 6 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp
index 11bdee98f..43bccf99b 100644
--- a/test/TestAsyncOperations.cpp
+++ b/test/TestAsyncOperations.cpp
@@ -54,7 +54,7 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
     auto startSync = std::chrono::high_resolution_clock::now();
 
     for (uint32_t i = 0; i < numParallel; i++) {
-        mgr.evalOpDefault<kp::OpAlgoBase<>>(
+        mgr.evalOpDefault<kp::OpAlgoBase>(
           { inputsSyncB[i] }, std::vector<char>(shader.begin(), shader.end()));
     }
 
@@ -86,7 +86,7 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
     auto startAsync = std::chrono::high_resolution_clock::now();
 
     for (uint32_t i = 0; i < numParallel; i++) {
-        mgrAsync.evalOpAsync<kp::OpAlgoBase<>>(
+        mgrAsync.evalOpAsync<kp::OpAlgoBase>(
           { inputsAsyncB[i] },
           "async" + std::to_string(i),
           std::vector<char>(shader.begin(), shader.end()));
@@ -151,10 +151,10 @@ TEST(TestAsyncOperations, TestManagerAsyncExecution)
 
     mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
 
-    mgr.evalOpAsync<kp::OpAlgoBase<>>(
+    mgr.evalOpAsync<kp::OpAlgoBase>(
       { tensorA }, "asyncOne", std::vector<char>(shader.begin(), shader.end()));
 
-    mgr.evalOpAsync<kp::OpAlgoBase<>>(
+    mgr.evalOpAsync<kp::OpAlgoBase>(
       { tensorB }, "asyncTwo", std::vector<char>(shader.begin(), shader.end()));
 
     mgr.evalOpAwait("asyncOne");
diff --git a/test/TestLogisticRegression.cpp b/test/TestLogisticRegression.cpp
index fa8dc7b59..9822c08d1 100644
--- a/test/TestLogisticRegression.cpp
+++ b/test/TestLogisticRegression.cpp
@@ -46,7 +46,7 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression)
 
         sq->record<kp::OpTensorSyncDevice>({ wIn, bIn });
 
-        sq->record<kp::OpAlgoBase<>>(
+        sq->record<kp::OpAlgoBase>(
           params, "test/shaders/glsl/test_logistic_regression.comp");
 
         sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
@@ -127,7 +127,7 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy)
         // Record op algo base
         sq->begin();
 
-        sq->record<kp::OpAlgoBase<>>(
+        sq->record<kp::OpAlgoBase>(
           params, "test/shaders/glsl/test_logistic_regression.comp");
 
         sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
diff --git a/test/TestManager.cpp b/test/TestManager.cpp
index 1550d8efb..0cb2a78fd 100644
--- a/test/TestManager.cpp
+++ b/test/TestManager.cpp
@@ -17,7 +17,7 @@ TEST(TestManager, EndToEndOpMultFlow)
 
     mgr.evalOpDefault<kp::OpTensorCreate>({ tensorOutput });
 
-    mgr.evalOpDefault<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
+    mgr.evalOpDefault<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });
 
     mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorOutput });
 
@@ -44,7 +44,7 @@ TEST(TestManager, OpMultSequenceFlow)
         sq->record<kp::OpTensorCreate>({ tensorRHS });
         sq->record<kp::OpTensorCreate>({ tensorOutput });
 
-        sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
+        sq->record<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });
 
         sq->record<kp::OpTensorSyncLocal>({ tensorOutput });
 
@@ -100,7 +100,7 @@ TEST(TestManager, TestMultipleTensorsAtOnce)
         EXPECT_TRUE(tensorRHS->isInit());
         EXPECT_TRUE(tensorOutput->isInit());
 
-        sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
+        sq->record<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });
 
         sq->record<kp::OpTensorSyncLocal>({ tensorOutput });
 
diff --git a/test/TestMultipleAlgoExecutions.cpp b/test/TestMultipleAlgoExecutions.cpp
index cdccd62fc..a0355416c 100644
--- a/test/TestMultipleAlgoExecutions.cpp
+++ b/test/TestMultipleAlgoExecutions.cpp
@@ -26,11 +26,11 @@ TEST(TestMultipleAlgoExecutions, SingleSequenceRecord)
 
         sq->record<kp::OpTensorCreate>({ tensorA });
 
-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
           { tensorA }, std::vector<char>(shader.begin(), shader.end()));
-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
           { tensorA }, std::vector<char>(shader.begin(), shader.end()));
-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
           { tensorA }, std::vector<char>(shader.begin(), shader.end()));
 
         sq->record<kp::OpTensorSyncLocal>({ tensorA });
@@ -70,19 +70,19 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords)
 
     // Then perform the computations
     sq->begin();
-    sq->record<kp::OpAlgoBase<3, 1, 1>>(
+    sq->record<kp::OpAlgoBase>(
       { tensorA }, std::vector<char>(shader.begin(), shader.end()));
     sq->end();
     sq->eval();
 
     sq->begin();
-    sq->record<kp::OpAlgoBase<3, 1, 1>>(
+    sq->record<kp::OpAlgoBase>(
       { tensorA }, std::vector<char>(shader.begin(), shader.end()));
     sq->end();
     sq->eval();
 
     sq->begin();
-    sq->record<kp::OpAlgoBase<3, 1, 1>>(
+    sq->record<kp::OpAlgoBase>(
       { tensorA }, std::vector<char>(shader.begin(), shader.end()));
     sq->end();
     sq->eval();
@@ -118,7 +118,7 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences)
 
         sq->record<kp::OpTensorCreate>({ tensorA });
 
-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
           { tensorA }, std::vector<char>(shader.begin(), shader.end()));
 
         sq->end();
@@ -130,7 +130,7 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences)
     if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
         sq->begin();
 
-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
           { tensorA }, std::vector<char>(shader.begin(), shader.end()));
 
         sq->end();
@@ -142,7 +142,7 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences)
     if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr3.lock()) {
         sq->begin();
 
-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
           { tensorA }, std::vector<char>(shader.begin(), shader.end()));
 
         sq->end();
@@ -195,7 +195,7 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)
     if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
         sq->begin();
 
-        sq->record<kp::OpAlgoBase<3, 1, 1>>(
+        sq->record<kp::OpAlgoBase>(
           { tensorA }, std::vector<char>(shader.begin(), shader.end()));
 
         sq->end();
@@ -252,7 +252,7 @@ TEST(TestMultipleAlgoExecutions, ManagerEvalMultSourceStrOpCreate)
         }
       )");
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
       { tensorInA, tensorInB, tensorOut },
       std::vector<char>(shader.begin(), shader.end()));
 
@@ -289,7 +289,7 @@ TEST(TestMultipleAlgoExecutions, ManagerEvalMultSourceStrMgrCreate)
         }
       )");
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
       { tensorInA, tensorInB, tensorOut },
       std::vector<char>(shader.begin(), shader.end()));
 
diff --git a/test/TestOpAlgoLoopsPassingData.cpp b/test/TestOpAlgoLoopsPassingData.cpp
index 2c47b0de3..9c592e356 100644
--- a/test/TestOpAlgoLoopsPassingData.cpp
+++ b/test/TestOpAlgoLoopsPassingData.cpp
@@ -49,7 +49,7 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies)
     if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
         sq->begin();
 
-        sq->record<kp::OpAlgoBase<>>(
+        sq->record<kp::OpAlgoBase>(
           { tensorA, tensorB },
           std::vector<char>(shader.begin(), shader.end()));
 
diff --git a/test/TestOpShadersFromStringAndFile.cpp b/test/TestOpShadersFromStringAndFile.cpp
index 58a361558..273421b26 100644
--- a/test/TestOpShadersFromStringAndFile.cpp
+++ b/test/TestOpShadersFromStringAndFile.cpp
@@ -28,7 +28,7 @@ TEST(TestOpAlgoBase, ShaderRawDataFromConstructor)
         }
     )");
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
       { tensorA, tensorB }, std::vector<char>(shader.begin(), shader.end()));
 
     mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
@@ -45,7 +45,7 @@ TEST(TestOpAlgoBase, ShaderCompiledDataFromConstructor)
     std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor({ 0, 0, 0 }) };
     mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
       { tensorA, tensorB },
       std::vector<char>(
         kp::shader_data::test_shaders_glsl_test_op_custom_shader_comp_spv,
@@ -67,7 +67,7 @@ TEST(TestOpAlgoBase, ShaderRawDataFromFile)
     std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor({ 0, 0, 0 }) };
     mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
       { tensorA, tensorB }, "test/shaders/glsl/test_op_custom_shader.comp");
 
     mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
@@ -84,7 +84,7 @@ TEST(TestOpAlgoBase, ShaderCompiledDataFromFile)
     std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor({ 0, 0, 0 }) };
     mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
       { tensorA, tensorB }, "test/shaders/glsl/test_op_custom_shader.comp.spv");
 
     mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });

From b0d394a50b6f7f633f41073d75d4774b0bb4fe99 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 16:29:27 +0000
Subject: [PATCH 13/39] Updated single include with non-templated opalgobase
 classes

---
 single_include/kompute/Kompute.hpp | 374 +++--------------------------
 1 file changed, 31 insertions(+), 343 deletions(-)

diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index 8def06e4a..382b7131d 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -1620,20 +1620,17 @@ namespace kp {
  * Operation that provides a general abstraction that simplifies the use of 
  * algorithm and parameter components which can be used with shaders.
  * By default it enables the user to provide a dynamic number of tensors
- * which are then passed as inputs. 
- *
- * All of these tensors are expected to be initlaised and this is checked with throw std exception in the init function.
- *
- * See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
- * 
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * which are then passed as inputs.
  */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
 class OpAlgoBase : public OpBase
 {
   public:
+    struct KomputeWorkgroup {
+        uint32_t x;
+        uint32_t y;
+        uint32_t z;
+    };
+
     /**
      *  Base constructor, should not be used unless explicitly intended.
      */
@@ -1649,11 +1646,13 @@ class OpAlgoBase : public OpBase
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
      * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>>& tensors);
+           std::vector<std::shared_ptr<Tensor>>& tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Constructor that enables a file to be passed to the operation with
@@ -1664,13 +1663,15 @@ class OpAlgoBase : public OpBase
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
-     * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+     * @param shaderFilePath Parameter to specify the shader to load (either in spirv or raw format)
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>>& tensors,
-           std::string shaderFilePath);
+           std::string shaderFilePath,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Constructor that enables raw shader data to be passed to the main operation
@@ -1681,12 +1682,14 @@ class OpAlgoBase : public OpBase
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
      * @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
            std::vector<std::shared_ptr<Tensor>>& tensors,
-           const std::vector<char>& shaderDataRaw);
+           const std::vector<char>& shaderDataRaw,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Default destructor, which is in charge of destroying the algorithm
@@ -1733,9 +1736,7 @@ class OpAlgoBase : public OpBase
 
     // -------------- ALWAYS OWNED RESOURCES
 
-    uint32_t mX;
-    uint32_t mY;
-    uint32_t mZ;
+    KomputeWorkgroup mKomputeWorkgroup;
 
     std::string mShaderFilePath; ///< Optional member variable which can be provided for the OpAlgoBase to find the data automatically and load for processing
     std::vector<char> mShaderDataRaw; ///< Optional member variable which can be provided to contain either the raw shader content or the spirv binary content
@@ -1745,177 +1746,6 @@ class OpAlgoBase : public OpBase
 
 } // End namespace kp
 
-// Including implementation for template class
-#ifndef OPALGOBASE_IMPL
-#define OPALGOBASE_IMPL
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors)
-  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
-
-    // The dispatch size is set up based on either explicitly provided template
-    // parameters or by default it would take the shape and size of the tensors
-    if (tX > 0) {
-        // If at least the x value is provided we use mainly the parameters
-        // provided
-        this->mX = tX;
-        this->mY = tY > 0 ? tY : 1;
-        this->mZ = tZ > 0 ? tZ : 1;
-    } else {
-        this->mX = tensors[0]->size();
-        this->mY = 1;
-        this->mZ = 1;
-    }
-    SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
-                 this->mX,
-                 this->mY,
-                 this->mZ);
-
-    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           std::string shaderFilePath)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
-
-    this->mShaderFilePath = shaderFilePath;
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           const std::vector<char>& shaderDataRaw)
-  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
-
-    this->mShaderDataRaw = shaderDataRaw;
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase init called");
-
-    if (this->mTensors.size() < 1) {
-        throw std::runtime_error(
-          "Kompute OpAlgoBase called with less than 1 tensor");
-    } 
-
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        if(!tensor->isInit()) {
-            throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised.");
-        }
-    }
-
-    SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        tensor->recordBufferMemoryBarrier(
-          this->mCommandBuffer,
-          vk::AccessFlagBits::eHostWrite,
-          vk::AccessFlagBits::eShaderRead,
-          vk::PipelineStageFlagBits::eHost,
-          vk::PipelineStageFlagBits::eComputeShader);
-    }
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::preEval()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoBase<tX, tY, tZ>::postEval()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-std::vector<char> OpAlgoBase<tX, tY, tZ>::fetchSpirvBinaryData() 
-{
-    SPDLOG_WARN(
-      "Kompute OpAlgoBase Running shaders directly from spirv file");
-
-    if (this->mShaderFilePath.size()) {
-        std::ifstream fileStream(this->mShaderFilePath,
-                                 std::ios::binary | std::ios::in | std::ios::ate);
-
-        if (!fileStream.good()) {
-            throw std::runtime_error("Error reading file: " + this->mShaderFilePath);
-        }
-
-        size_t shaderFileSize = fileStream.tellg();
-        fileStream.seekg(0, std::ios::beg);
-        char* shaderDataRaw = new char[shaderFileSize];
-        fileStream.read(shaderDataRaw, shaderFileSize);
-        fileStream.close();
-
-        SPDLOG_WARN(
-          "Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
-
-        return std::vector<char>(shaderDataRaw,
-                                 shaderDataRaw + shaderFileSize);
-    }
-    else if (this->mShaderDataRaw.size()) {
-        return this->mShaderDataRaw;
-    }
-    else {
-        throw std::runtime_error("Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither filepath nor data provided");
-    }
-}
-
-}
-
-#endif // #ifndef OPALGOBASE_IMPL
-
 #include <fstream>
 
 namespace kp {
@@ -1924,12 +1754,8 @@ namespace kp {
  * Operation base class to simplify the creation of operations that require
  * right hand and left hand side datapoints together with a single output.
  * The expected data passed is two input tensors and one output tensor.
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
  */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
+class OpAlgoLhsRhsOut : public OpAlgoBase
 {
   public:
     /**
@@ -1947,11 +1773,13 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
      * @param freeTensors Whether operation manages the memory of the Tensors
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors);
+           std::vector<std::shared_ptr<Tensor>> tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
 
     /**
      * Default destructor, which is in charge of destroying the algorithm
@@ -1982,7 +1810,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
      * of the GPU Device memory into the staging buffer so the output data can
      * be retrieved.
      */
-    virtual void postSubmit() override;
+    virtual void postEval() override;
 
   protected:
     // -------------- NEVER OWNED RESOURCES
@@ -1996,138 +1824,6 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
 
 } // End namespace kp
 
-// Including implementation for template class
-#ifndef OPALGOLHSRHSOUT_CPP
-#define OPALGOLHSRHSOUT_CPP
-
-namespace kp {
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>> tensors)
-  // The inheritance is initialised with the copyOutputData to false given that
-  // this depencendant class handles the transfer of data via staging buffers in 
-  // a granular way.
-  : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-OpAlgoLhsRhsOut<tX, tY, tZ>::~OpAlgoLhsRhsOut()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut destructor started");
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::init()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut init called");
-
-    if (this->mTensors.size() < 3) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut called with less than 1 tensor");
-    } else if (this->mTensors.size() > 3) {
-        SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
-    }
-
-    this->mTensorLHS = this->mTensors[0];
-    this->mTensorRHS = this->mTensors[1];
-    this->mTensorOutput = this->mTensors[2];
-
-    if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
-          this->mTensorOutput->isInit())) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. LHS: " +
-          std::to_string(this->mTensorLHS->isInit()) +
-          " RHS: " + std::to_string(this->mTensorRHS->isInit()) +
-          " Output: " + std::to_string(this->mTensorOutput->isInit()));
-    }
-
-    if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
-          this->mTensorRHS->size() == this->mTensorOutput->size())) {
-        throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size LHS: " +
-          std::to_string(this->mTensorLHS->size()) +
-          " RHS: " + std::to_string(this->mTensorRHS->size()) +
-          " Output: " + std::to_string(this->mTensorOutput->size()));
-    }
-
-    this->mTensorOutputStaging = std::make_shared<Tensor>(
-      this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
-
-    this->mTensorOutputStaging->init(
-      this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
-
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
-
-    std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
-
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut Initialising algorithm component");
-
-    this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::record()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut record called");
-
-    // Barrier to ensure the data is finished writing to buffer memory
-    this->mTensorLHS->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eHostWrite,
-      vk::AccessFlagBits::eShaderRead,
-      vk::PipelineStageFlagBits::eHost,
-      vk::PipelineStageFlagBits::eComputeShader);
-    this->mTensorRHS->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eHostWrite,
-      vk::AccessFlagBits::eShaderRead,
-      vk::PipelineStageFlagBits::eHost,
-      vk::PipelineStageFlagBits::eComputeShader);
-
-    this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-
-    // Barrier to ensure the shader code is executed before buffer read
-    this->mTensorOutput->recordBufferMemoryBarrier(
-      this->mCommandBuffer,
-      vk::AccessFlagBits::eShaderWrite,
-      vk::AccessFlagBits::eTransferRead,
-      vk::PipelineStageFlagBits::eComputeShader,
-      vk::PipelineStageFlagBits::eTransfer);
-
-    this->mTensorOutputStaging->recordCopyFrom(
-            this->mCommandBuffer,
-            this->mTensorOutput,
-            true);
-}
-
-template<uint32_t tX, uint32_t tY, uint32_t tZ>
-void
-OpAlgoLhsRhsOut<tX, tY, tZ>::postSubmit()
-{
-    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");
-
-    this->mTensorOutputStaging->mapDataFromHostMemory();
-
-    this->mTensorOutput->setData(this->mTensorOutputStaging->data());
-}
-
-}
-
-#endif // #ifndef OPALGOLHSRHSOUT_CPP
-
 #include <fstream>
 
 #if RELEASE
@@ -2138,12 +1834,9 @@ namespace kp {
 
 /**
  * Operation that performs multiplication on two tensors and outpus on third
- * tensor. The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * tensor.
  */
-template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
-class OpMult : public OpAlgoBase<tX, tY, tZ>
+class OpMult : public OpAlgoBase
 {
   public:
     /**
@@ -2162,13 +1855,14 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
      * @param device Vulkan logical device for passing to Algorithm
      * @param commandBuffer Vulkan Command Buffer to record commands into
      * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
+     * @param komputeWorkgroup Optional parameter to specify the layout for processing
      */
     OpMult(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
            std::shared_ptr<vk::Device> device,
            std::shared_ptr<vk::CommandBuffer> commandBuffer,
-           std::vector<std::shared_ptr<Tensor>> tensors)
-      : OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
+           std::vector<std::shared_ptr<Tensor>> tensors,
+           KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup())
+      : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "", komputeWorkgroup)
     {
         SPDLOG_DEBUG("Kompute OpMult constructor with params");
 
@@ -2179,14 +1873,8 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
 
 #if RELEASE
     /**
-     * If release it will be using the static version of the shader which is 
-     * loaded using this file directly.
-     *
-     * @param physicalDevice Vulkan physical device used to find device queues
-     * @param device Vulkan logical device for passing to Algorithm
-     * @param commandBuffer Vulkan Command Buffer to record commands into
-     * @param tensors Tensors that are to be used in this operation
-     * @param freeTensors Whether operation manages the memory of the Tensors
+     * If RELEASE=1 it will be using the static version of the shader which is 
+     * loaded using this file directly. Otherwise it should not override the function.
      */
     std::vector<char> fetchSpirvBinaryData() override
     {

From 473031d1f3b424c8d8229a3ed14c40e70a1ddeb4 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 20:25:15 +0000
Subject: [PATCH 14/39] Sequence now exposed via shared_ptr instead of weak_ptr
 and memory release is done through destructor based on the isInit member
 variable

---
 src/Manager.cpp                 |  9 ++++--
 src/Sequence.cpp                | 10 +++++++
 src/include/kompute/Manager.hpp | 50 +++++++++++++++------------------
 3 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/src/Manager.cpp b/src/Manager.cpp
index ec86b18ed..5c7a2d3be 100644
--- a/src/Manager.cpp
+++ b/src/Manager.cpp
@@ -59,7 +59,10 @@ Manager::~Manager()
     }
 
     if (this->mManagedSequences.size()) {
-        SPDLOG_DEBUG("Releasing managed sequence");
+        SPDLOG_DEBUG("Kompute Manager explicitly running destructor for managed sequences");
+        for (const std::pair<std::string, std::shared_ptr<Sequence>> &sqPair : this->mManagedSequences) {
+            sqPair.second->~Sequence();
+        }
         this->mManagedSequences.clear();
     }
 
@@ -91,7 +94,7 @@ Manager::~Manager()
     }
 }
 
-std::weak_ptr<Sequence>
+std::shared_ptr<Sequence>
 Manager::getOrCreateManagedSequence(std::string sequenceName)
 {
     SPDLOG_DEBUG("Kompute Manager creating Sequence object");
@@ -106,7 +109,7 @@ Manager::getOrCreateManagedSequence(std::string sequenceName)
     }
 }
 
-std::weak_ptr<Sequence>
+std::shared_ptr<Sequence>
 Manager::createManagedSequence(std::string sequenceName, uint32_t queueIndex)
 {
 
diff --git a/src/Sequence.cpp b/src/Sequence.cpp
index c4446ff37..0f6eccfd2 100644
--- a/src/Sequence.cpp
+++ b/src/Sequence.cpp
@@ -27,9 +27,15 @@ Sequence::~Sequence()
 {
     SPDLOG_DEBUG("Kompute Sequence Destructor started");
 
+    if (!this->mIsInit) {
+        SPDLOG_WARN("Kompute Sequence destructor called but sequence is not initialized.");
+        return;
+    }
+
     if (!this->mDevice) {
         SPDLOG_ERROR(
           "Kompute Sequence destructor reached with null Device pointer");
+        this->mIsInit = false;
         return;
     }
 
@@ -38,6 +44,7 @@ Sequence::~Sequence()
         if (!this->mCommandBuffer) {
             SPDLOG_ERROR("Kompute Sequence destructor reached with null "
                          "CommandPool pointer");
+            this->mIsInit = false;
             return;
         }
         this->mDevice->freeCommandBuffers(
@@ -50,11 +57,14 @@ Sequence::~Sequence()
         if (this->mCommandPool == nullptr) {
             SPDLOG_ERROR("Kompute Sequence destructor reached with null "
                          "CommandPool pointer");
+            this->mIsInit = false;
             return;
         }
         this->mDevice->destroy(*this->mCommandPool, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
         SPDLOG_DEBUG("Kompute Sequence Destroyed CommandPool");
     }
+
+    this->mIsInit = false;
 }
 
 void
diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp
index 32c04535b..98e8e82c5 100644
--- a/src/include/kompute/Manager.hpp
+++ b/src/include/kompute/Manager.hpp
@@ -63,9 +63,9 @@ class Manager
      *
      * @param sequenceName The name for the named sequence to be retrieved or
      * created
-     * @return Weak pointer to the manager owned sequence resource
+     * @return Shared pointer to the manager owned sequence resource
      */
-    std::weak_ptr<Sequence> getOrCreateManagedSequence(
+    std::shared_ptr<Sequence> getOrCreateManagedSequence(
       std::string sequenceName);
 
     /**
@@ -77,7 +77,7 @@ class Manager
      * @param queueIndex The queue to use from the available queues
-     * @return Weak pointer to the manager owned sequence resource
+     * @return Shared pointer to the manager owned sequence resource
      */
-    std::weak_ptr<Sequence> createManagedSequence(std::string sequenceName = "",
+    std::shared_ptr<Sequence> createManagedSequence(std::string sequenceName = "",
                                                   uint32_t queueIndex = 0);
 
     /**
@@ -94,22 +94,21 @@ class Manager
                 TArgs&&... params)
     {
         SPDLOG_DEBUG("Kompute Manager evalOp triggered");
-        std::weak_ptr<Sequence> sqWeakPtr =
+        std::shared_ptr<kp::Sequence> sq =
           this->getOrCreateManagedSequence(sequenceName);
 
-        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
-            sq->begin();
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
+        sq->begin();
 
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
-            sq->record<T>(tensors, std::forward<TArgs>(params)...);
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
+        sq->record<T>(tensors, std::forward<TArgs>(params)...);
 
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
-            sq->end();
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
+        sq->end();
+
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
+        sq->eval();
 
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
-            sq->eval();
-        }
         SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS");
     }
 
@@ -147,26 +146,21 @@ class Manager
     {
         SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered");
 
-        std::weak_ptr<Sequence> sqWeakPtr =
+        std::shared_ptr<kp::Sequence> sq =
           this->getOrCreateManagedSequence(sequenceName);
 
-        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
+        sq->begin();
 
-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
-            sq->begin();
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
+        sq->record<T>(tensors, std::forward<TArgs>(params)...);
 
-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
-            sq->record<T>(tensors, std::forward<TArgs>(params)...);
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
+        sq->end();
 
-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
-            sq->end();
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
+        sq->evalAsync();
 
-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
-            sq->evalAsync();
-        } else {
-            SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found",
-                         sequenceName);
-        }
         SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
     }
 

From e2f6e876bc376d029fec0a5cd8993fdf6a00f8ae Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 20:25:53 +0000
Subject: [PATCH 15/39] Updated tests to align with new sequence memory
 management workflow

---
 test/TestLogisticRegression.cpp     |  8 ++--
 test/TestManager.cpp                | 30 +++++++--------
 test/TestMultipleAlgoExecutions.cpp | 60 ++++++++++++++++-------------
 test/TestTensor.cpp                 |  2 +-
 4 files changed, 54 insertions(+), 46 deletions(-)

diff --git a/test/TestLogisticRegression.cpp b/test/TestLogisticRegression.cpp
index 9822c08d1..91dd1f430 100644
--- a/test/TestLogisticRegression.cpp
+++ b/test/TestLogisticRegression.cpp
@@ -32,14 +32,14 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression)
         kp::Manager mgr;
 
         std::shared_ptr<kp::Sequence> sqTensor =
-          mgr.createManagedSequence().lock();
+          mgr.createManagedSequence();
 
         sqTensor->begin();
         sqTensor->record<kp::OpTensorCreate>(params);
         sqTensor->end();
         sqTensor->eval();
 
-        std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+        std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
 
         // Record op algo base
         sq->begin();
@@ -115,14 +115,14 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy)
         kp::Manager mgr;
 
         std::shared_ptr<kp::Sequence> sqTensor =
-          mgr.createManagedSequence().lock();
+          mgr.createManagedSequence();
 
         sqTensor->begin();
         sqTensor->record<kp::OpTensorCreate>(params);
         sqTensor->end();
         sqTensor->eval();
 
-        std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+        std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
 
         // Record op algo base
         sq->begin();
diff --git a/test/TestManager.cpp b/test/TestManager.cpp
index 0cb2a78fd..3076b2a62 100644
--- a/test/TestManager.cpp
+++ b/test/TestManager.cpp
@@ -35,9 +35,10 @@ TEST(TestManager, OpMultSequenceFlow)
 
     kp::Manager mgr;
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr =
-      mgr.getOrCreateManagedSequence("newSequence");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence");
+
         sq->begin();
 
         sq->record<kp::OpTensorCreate>({ tensorLHS });
@@ -51,7 +52,6 @@ TEST(TestManager, OpMultSequenceFlow)
         sq->end();
         sq->eval();
     }
-    sqWeakPtr.reset();
 
     EXPECT_EQ(tensorOutput->data(), std::vector<float>({ 0, 4, 12 }));
 }
@@ -60,22 +60,22 @@ TEST(TestManager, TestMultipleSequences)
 {
     kp::Manager mgr;
 
-    std::weak_ptr<kp::Sequence> sqWeakPtrOne =
+    std::shared_ptr<kp::Sequence> sqOne =
       mgr.getOrCreateManagedSequence("sqOne");
 
-    std::weak_ptr<kp::Sequence> sqWeakPtrTwo =
+    std::shared_ptr<kp::Sequence> sqTwo =
       mgr.getOrCreateManagedSequence("sqTwo");
 
-    std::weak_ptr<kp::Sequence> sqWeakPtrOneRef =
+    std::shared_ptr<kp::Sequence> sqOneRef =
       mgr.getOrCreateManagedSequence("sqOne");
 
-    std::weak_ptr<kp::Sequence> sqWeakPtrTwoRef =
+    std::shared_ptr<kp::Sequence> sqTwoRef =
       mgr.getOrCreateManagedSequence("sqTwo");
 
-    EXPECT_EQ(sqWeakPtrOne.lock(), sqWeakPtrOneRef.lock());
-    EXPECT_NE(sqWeakPtrTwo.lock(), sqWeakPtrOneRef.lock());
-    EXPECT_EQ(sqWeakPtrTwo.lock(), sqWeakPtrTwoRef.lock());
-    EXPECT_NE(sqWeakPtrOneRef.lock(), sqWeakPtrTwoRef.lock());
+    EXPECT_EQ(sqOne, sqOneRef);
+    EXPECT_NE(sqTwo, sqOneRef);
+    EXPECT_EQ(sqTwo, sqTwoRef);
+    EXPECT_NE(sqOneRef, sqTwoRef);
 }
 
 TEST(TestManager, TestMultipleTensorsAtOnce)
@@ -89,9 +89,10 @@ TEST(TestManager, TestMultipleTensorsAtOnce)
 
     kp::Manager mgr;
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr =
+    std::shared_ptr<kp::Sequence> sq =
       mgr.getOrCreateManagedSequence("newSequence");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+
+    {
         sq->begin();
 
         sq->record<kp::OpTensorCreate>({ tensorLHS, tensorRHS, tensorOutput });
@@ -107,7 +108,6 @@ TEST(TestManager, TestMultipleTensorsAtOnce)
         sq->end();
         sq->eval();
     }
-    sqWeakPtr.reset();
 
     EXPECT_EQ(tensorOutput->data(), std::vector<float>({ 0, 4, 12 }));
 }
diff --git a/test/TestMultipleAlgoExecutions.cpp b/test/TestMultipleAlgoExecutions.cpp
index a0355416c..f45367313 100644
--- a/test/TestMultipleAlgoExecutions.cpp
+++ b/test/TestMultipleAlgoExecutions.cpp
@@ -19,9 +19,10 @@ TEST(TestMultipleAlgoExecutions, SingleSequenceRecord)
           pa[index] = pa[index] + 1;
       })");
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr =
+    std::shared_ptr<kp::Sequence> sq =
       mgr.getOrCreateManagedSequence("newSequence");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+
+    {
         sq->begin();
 
         sq->record<kp::OpTensorCreate>({ tensorA });
@@ -38,7 +39,6 @@ TEST(TestMultipleAlgoExecutions, SingleSequenceRecord)
         sq->end();
         sq->eval();
     }
-    sqWeakPtr.reset();
 
     EXPECT_EQ(tensorA->data(), std::vector<float>({ 3, 3, 3 }));
 }
@@ -58,9 +58,9 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords)
           pa[index] = pa[index] + 1;
       })");
 
-    std::shared_ptr<kp::Sequence> sqTensor = mgr.createManagedSequence().lock();
+    std::shared_ptr<kp::Sequence> sqTensor = mgr.createManagedSequence();
 
-    std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+    std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
 
     // First create the tensor in a separate sequence
     sqTensor->begin();
@@ -111,9 +111,10 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences)
           pa[index] = pa[index] + 1;
       })");
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr =
-      mgr.getOrCreateManagedSequence("newSequence");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq = 
+          mgr.getOrCreateManagedSequence("newSequence");
+
         sq->begin();
 
         sq->record<kp::OpTensorCreate>({ tensorA });
@@ -125,9 +126,10 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences)
         sq->eval();
     }
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr2 =
-      mgr.getOrCreateManagedSequence("newSequence2");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence2");
+
         sq->begin();
 
         sq->record<kp::OpAlgoBase>(
@@ -137,9 +139,10 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences)
         sq->eval();
     }
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr3 =
-      mgr.getOrCreateManagedSequence("newSequence3");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr3.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence3");
+
         sq->begin();
 
         sq->record<kp::OpAlgoBase>(
@@ -149,9 +152,10 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences)
         sq->eval();
     }
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr4 =
-      mgr.getOrCreateManagedSequence("newSequence5");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr4.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence5");
+
         sq->begin();
 
         sq->record<kp::OpTensorSyncLocal>({ tensorA });
@@ -179,9 +183,10 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)
           pa[index] = pa[index] + 1;
       })");
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr =
-      mgr.getOrCreateManagedSequence("newSequence");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence");
+
         sq->begin();
 
         sq->record<kp::OpTensorCreate>({ tensorA });
@@ -190,9 +195,10 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)
         sq->eval();
     }
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr2 =
-      mgr.getOrCreateManagedSequence("newSequence2");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence2");
+
         sq->begin();
 
         sq->record<kp::OpAlgoBase>(
@@ -205,9 +211,11 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)
         sq->eval();
     }
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr3 =
-      mgr.getOrCreateManagedSequence("newSequence3");
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
+
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence3");
+
         sq->begin();
 
         sq->record<kp::OpTensorSyncLocal>({ tensorA });
diff --git a/test/TestTensor.cpp b/test/TestTensor.cpp
index 676b9f423..42731bcfe 100644
--- a/test/TestTensor.cpp
+++ b/test/TestTensor.cpp
@@ -24,7 +24,7 @@ TEST(TestTensor, CopyFromHostData)
     kp::Manager mgr;
 
     if (std::shared_ptr<kp::Sequence> sq =
-          mgr.getOrCreateManagedSequence("new").lock()) {
+          mgr.getOrCreateManagedSequence("new")) {
         sq->begin();
 
         sq->record<kp::OpTensorCreate>({ tensorA, tensorB });

From ac33cb450a91577624eb73f13829754199ad4d1e Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 20:26:04 +0000
Subject: [PATCH 16/39] Updated tests to align with new sequence memory
 management workflow

---
 test/TestOpAlgoLoopsPassingData.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/test/TestOpAlgoLoopsPassingData.cpp b/test/TestOpAlgoLoopsPassingData.cpp
index 9c592e356..35a08e02a 100644
--- a/test/TestOpAlgoLoopsPassingData.cpp
+++ b/test/TestOpAlgoLoopsPassingData.cpp
@@ -30,10 +30,11 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies)
         }
     )");
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr =
-      mgr.getOrCreateManagedSequence("default");
 
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("default");
+
         sq->begin();
 
         sq->record<kp::OpTensorCreate>({ tensorA, tensorB });
@@ -43,10 +44,10 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies)
         sq->eval();
     }
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr2 =
-      mgr.getOrCreateManagedSequence("run");
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("run");
 
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
         sq->begin();
 
         sq->record<kp::OpAlgoBase>(
@@ -61,10 +62,10 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies)
         }
     }
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr3 =
-      mgr.getOrCreateManagedSequence("export");
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("export");
 
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr3.lock()) {
         sq->begin();
 
         sq->record<kp::OpTensorSyncLocal>({ tensorA, tensorB });

From 81277aa60ef4c4db9408cdd7b355dd3021264e92 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 20:26:29 +0000
Subject: [PATCH 17/39] Added test to verify memory management via isInit
 member variable

---
 test/TestSequence.cpp | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/test/TestSequence.cpp b/test/TestSequence.cpp
index c66dcf43a..882729dcf 100644
--- a/test/TestSequence.cpp
+++ b/test/TestSequence.cpp
@@ -7,10 +7,10 @@ TEST(TestSequence, CmdBufSequenceBeginEnd)
 {
     kp::Manager mgr;
 
-    std::weak_ptr<kp::Sequence> sqWeakPtr =
-      mgr.getOrCreateManagedSequence("newSequence");
+    {
+        std::shared_ptr<kp::Sequence> sq =
+          mgr.getOrCreateManagedSequence("newSequence");
 
-    if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
         EXPECT_TRUE(sq->eval());
         EXPECT_TRUE(!sq->isRecording());
         EXPECT_TRUE(sq->begin());
@@ -24,3 +24,18 @@ TEST(TestSequence, CmdBufSequenceBeginEnd)
         EXPECT_TRUE(sq->eval());
     }
 }
+
+TEST(TestSequence, SequenceDestructorViaManager)
+{
+    std::shared_ptr<kp::Sequence> sq = nullptr;
+
+    {
+        kp::Manager mgr;
+
+        sq = mgr.getOrCreateManagedSequence("newSequence");
+
+        EXPECT_TRUE(sq->isInit());
+    }
+
+    EXPECT_FALSE(sq->isInit());
+}

From a33f65a90b65d83d4d9f558ca967c0535e4017c3 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 20:26:41 +0000
Subject: [PATCH 18/39] Updated single_include header

---
 single_include/kompute/Kompute.hpp | 50 +++++++++++++-----------------
 1 file changed, 22 insertions(+), 28 deletions(-)

diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index 382b7131d..c417182c2 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -1301,9 +1301,9 @@ class Manager
      *
      * @param sequenceName The name for the named sequence to be retrieved or
      * created
-     * @return Weak pointer to the manager owned sequence resource
+     * @return Shared pointer to the manager owned sequence resource
      */
-    std::weak_ptr<Sequence> getOrCreateManagedSequence(
+    std::shared_ptr<Sequence> getOrCreateManagedSequence(
       std::string sequenceName);
 
     /**
@@ -1315,7 +1315,7 @@ class Manager
      * @param queueIndex The queue to use from the available queues
      * @return Weak pointer to the manager owned sequence resource
      */
-    std::weak_ptr<Sequence> createManagedSequence(std::string sequenceName = "",
+    std::shared_ptr<Sequence> createManagedSequence(std::string sequenceName = "",
                                                   uint32_t queueIndex = 0);
 
     /**
@@ -1332,22 +1332,21 @@ class Manager
                 TArgs&&... params)
     {
         SPDLOG_DEBUG("Kompute Manager evalOp triggered");
-        std::weak_ptr<Sequence> sqWeakPtr =
+        std::shared_ptr<kp::Sequence> sq =
           this->getOrCreateManagedSequence(sequenceName);
 
-        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
-            sq->begin();
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
+        sq->begin();
 
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
-            sq->record<T>(tensors, std::forward<TArgs>(params)...);
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
+        sq->record<T>(tensors, std::forward<TArgs>(params)...);
 
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
-            sq->end();
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
+        sq->end();
+
+        SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
+        sq->eval();
 
-            SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
-            sq->eval();
-        }
         SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS");
     }
 
@@ -1385,26 +1384,21 @@ class Manager
     {
         SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered");
 
-        std::weak_ptr<Sequence> sqWeakPtr =
+        std::shared_ptr<kp::Sequence> sq =
           this->getOrCreateManagedSequence(sequenceName);
 
-        if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
+        sq->begin();
 
-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
-            sq->begin();
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
+        sq->record<T>(tensors, std::forward<TArgs>(params)...);
 
-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
-            sq->record<T>(tensors, std::forward<TArgs>(params)...);
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
+        sq->end();
 
-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
-            sq->end();
+        SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
+        sq->evalAsync();
 
-            SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
-            sq->evalAsync();
-        } else {
-            SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found",
-                         sequenceName);
-        }
         SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
     }
 

From 8ce3b669de2cc7c620ea96ad7cb5795012233ba6 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 20:27:02 +0000
Subject: [PATCH 19/39] Added functioning python bindings for Kompute

---
 python/src/main.cpp | 72 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 70 insertions(+), 2 deletions(-)

diff --git a/python/src/main.cpp b/python/src/main.cpp
index e3b7fb371..9573b8074 100644
--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@@ -13,7 +13,7 @@ PYBIND11_MODULE(komputepy, m) {
         .value("eStorage", kp::Tensor::TensorTypes::eStorage)
         .export_values();
 
-    py::class_<kp::Tensor>(m, "Tensor")
+    py::class_<kp::Tensor, std::shared_ptr<kp::Tensor>>(m, "Tensor")
         .def(py::init(
             [](const std::vector<float>& data) {
                 return std::unique_ptr<kp::Tensor>(new kp::Tensor(data));
@@ -24,7 +24,75 @@ PYBIND11_MODULE(komputepy, m) {
             }))
         .def("data", &kp::Tensor::data);
 
-    py::class_<kp::OpBase>(m, "OpBase");
+    py::class_<kp::Sequence, std::shared_ptr<kp::Sequence>>(m, "Sequence")
+        .def("init", &kp::Sequence::init)
+        .def("begin", &kp::Sequence::begin)
+        .def("end", &kp::Sequence::end)
+        .def("eval", &kp::Sequence::eval)
+        .def("evalAsync", &kp::Sequence::evalAsync)
+        .def("evalAwait", &kp::Sequence::evalAwait)
+        .def("isRunning", &kp::Sequence::isRunning)
+        .def("isRecording", &kp::Sequence::isRecording)
+        .def("isInit", &kp::Sequence::isInit)
+        .def("recordOpTensorCreate", &kp::Sequence::record<kp::OpTensorCreate>)
+        .def("recordOpTensorCopy", &kp::Sequence::record<kp::OpTensorCopy>)
+        .def("recordOpTensorSyncDevice", &kp::Sequence::record<kp::OpTensorSyncDevice>)
+        .def("recordOpTensorSyncLocal", &kp::Sequence::record<kp::OpTensorSyncLocal>)
+        .def("recordOpAlgoMult", &kp::Sequence::record<kp::OpMult>)
+        .def("recordOpAlgoBaseFile", &kp::Sequence::record<kp::OpAlgoBase, std::string>)
+        .def("recordOpAlgoBaseData", &kp::Sequence::record<kp::OpAlgoBase, std::vector<char>>)
+        .def("recordOpAlgoLhsRhsOut", &kp::Sequence::record<kp::OpAlgoLhsRhsOut>);
+
+    py::class_<kp::Manager>(m, "Manager")
+        .def(py::init())
+        .def(py::init(
+            [](uint32_t physicalDeviceIndex) {
+                return std::unique_ptr<kp::Manager>(new kp::Manager(physicalDeviceIndex));
+            }))
+        .def(py::init(
+            [](uint32_t physicalDeviceIndex, const std::vector<uint32_t>& familyQueueIndices) {
+                return std::unique_ptr<kp::Manager>(new kp::Manager(physicalDeviceIndex, familyQueueIndices));
+            }))
+        .def("getOrCreateManagedSequence", &kp::Manager::getOrCreateManagedSequence)
+        .def("createManagedSequence", &kp::Manager::createManagedSequence,
+                py::arg("name"), py::arg("queueIndex") = 0)
+        .def("buildTensor", &kp::Manager::buildTensor, 
+                py::arg("data"), py::arg("tensorType") = kp::Tensor::TensorTypes::eDevice)
+        .def("evalOpAsync", &kp::Manager::evalOpAsync<kp::OpMult>)
+        .def("evalOpAsyncDefault", &kp::Manager::evalOpAsyncDefault<kp::OpMult>)
+        .def("evalOpDefaultTensorCreate", &kp::Manager::evalOpDefault<kp::OpTensorCreate>)
+        .def("evalOpDefaultTensorCopy", &kp::Manager::evalOpDefault<kp::OpTensorCopy>)
+        .def("evalOpDefaultTensorSyncDevice", &kp::Manager::evalOpDefault<kp::OpTensorSyncDevice>)
+        .def("evalOpDefaultTensorSyncLocal", &kp::Manager::evalOpDefault<kp::OpTensorSyncLocal>)
+        .def("evalOpDefaultAlgoMult", &kp::Manager::evalOpDefault<kp::OpMult>)
+        .def("evalOpDefaultAlgoBaseFile", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::string>)
+        .def("evalOpDefaultAlgoBaseData", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::vector<char>>)
+        .def("evalOpDefaultAlgoLhsRhsOut", &kp::Manager::evalOpDefault<kp::OpAlgoLhsRhsOut>)
+        .def("evalOpTensorCreate", &kp::Manager::evalOp<kp::OpTensorCreate>)
+        .def("evalOpTensorCopy", &kp::Manager::evalOp<kp::OpTensorCopy>)
+        .def("evalOpTensorSyncDevice", &kp::Manager::evalOp<kp::OpTensorSyncDevice>)
+        .def("evalOpTensorSyncLocal", &kp::Manager::evalOp<kp::OpTensorSyncLocal>)
+        .def("evalOpAlgoMult", &kp::Manager::evalOp<kp::OpMult>)
+        .def("evalOpAlgoBaseFile", &kp::Manager::evalOp<kp::OpAlgoBase, std::string>)
+        .def("evalOpAlgoBaseData", &kp::Manager::evalOp<kp::OpAlgoBase, std::vector<char>>)
+        .def("evalOpAlgoLhsRhsOut", &kp::Manager::evalOp<kp::OpAlgoLhsRhsOut>)
+        .def("evalOpAsyncDefaultTensorCreate", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCreate>)
+        .def("evalOpAsyncDefaultTensorCopy", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCopy>)
+        .def("evalOpAsyncDefaultTensorSyncDevice", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncDevice>)
+        .def("evalOpAsyncDefaultTensorSyncLocal", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncLocal>)
+        .def("evalOpAsyncDefaultAlgoMult", &kp::Manager::evalOpAsyncDefault<kp::OpMult>)
+        .def("evalOpAsyncDefaultAlgoBaseFile", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::string>)
+        .def("evalOpAsyncDefaultAlgoBaseData", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::vector<char>>)
+        .def("evalOpAsyncDefaultAlgoLhsRhsOut", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoLhsRhsOut>)
+        .def("evalOpAsyncTensorCreate", &kp::Manager::evalOpAsync<kp::OpTensorCreate>)
+        .def("evalOpAsyncTensorCopy", &kp::Manager::evalOpAsync<kp::OpTensorCopy>)
+        .def("evalOpAsyncTensorSyncDevice", &kp::Manager::evalOpAsync<kp::OpTensorSyncDevice>)
+        .def("evalOpAsyncTensorSyncLocal", &kp::Manager::evalOpAsync<kp::OpTensorSyncLocal>)
+        .def("evalOpAsync", &kp::Manager::evalOpAsync<kp::OpMult>)
+        .def("evalOpAsyncAlgoBaseFile", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::string>)
+        .def("evalOpAsyncAlgoBase", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::vector<char>>)
+        .def("evalOpAsyncAlgoLhsRhsOut", &kp::Manager::evalOpAsync<kp::OpAlgoLhsRhsOut>);
+
 
 #ifdef VERSION_INFO
     m.attr("__version__") = VERSION_INFO;

From 3036cbd95f448d1139f409327f73ef1b9364721f Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 20:27:17 +0000
Subject: [PATCH 20/39] Added tests for python bindings in python

---
 python/test/test_kompute.py | 108 ++++++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 python/test/test_kompute.py

diff --git a/python/test/test_kompute.py b/python/test/test_kompute.py
new file mode 100644
index 000000000..058e906f2
--- /dev/null
+++ b/python/test/test_kompute.py
@@ -0,0 +1,108 @@
+
+from komputepy import Tensor, Manager, Sequence
+
+def test_opmult():
+    """
+    Test basic OpMult operation
+    """
+
+    tensor_in_a = Tensor([2, 2, 2])
+    tensor_in_b = Tensor([1, 2, 3])
+    tensor_out = Tensor([0, 0, 0])
+
+    mgr = Manager()
+
+    mgr.evalOpDefaultTensorCreate([tensor_in_a, tensor_in_b, tensor_out])
+
+    mgr.evalOpDefaultAlgoMult([tensor_in_a, tensor_in_b, tensor_out])
+
+    mgr.evalOpDefaultTensorSyncLocal([tensor_out])
+
+    assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+def test_opalgobase_data():
+    """
+    Test basic OpAlgoBase operation with the shader passed as raw data
+    """
+
+    tensor_in_a = Tensor([2, 2, 2])
+    tensor_in_b = Tensor([1, 2, 3])
+    tensor_out = Tensor([0, 0, 0])
+
+    mgr = Manager()
+
+    shaderData = """
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        // The input tensors bind index is relative to index in parameter passed
+        layout(set = 0, binding = 0) buffer bina { float tina[]; };
+        layout(set = 0, binding = 1) buffer binb { float tinb[]; };
+        layout(set = 0, binding = 2) buffer bout { float tout[]; };
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+            tout[index] = tina[index] * tinb[index];
+        }
+    """
+
+    mgr.evalOpDefaultTensorCreate([tensor_in_a, tensor_in_b, tensor_out])
+
+    mgr.evalOpDefaultAlgoBaseData([tensor_in_a, tensor_in_b, tensor_out], list(shaderData))
+
+    mgr.evalOpDefaultTensorSyncLocal([tensor_out])
+
+    assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+
+def test_opalgobase_file():
+    """
+    Test basic OpAlgoBase operation with the shader loaded from a file path
+    """
+
+    tensor_in_a = Tensor([2, 2, 2])
+    tensor_in_b = Tensor([1, 2, 3])
+    tensor_out = Tensor([0, 0, 0])
+
+    mgr = Manager()
+
+    shaderFilePath = "../../shaders/glsl/opmult.comp"
+
+    mgr.evalOpDefaultTensorCreate([tensor_in_a, tensor_in_b, tensor_out])
+
+    mgr.evalOpDefaultAlgoBaseFile([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+
+    mgr.evalOpDefaultTensorSyncLocal([tensor_out])
+
+    assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+def test_sequence():
+    """
+    Test recording an OpAlgoBase operation on a managed Sequence and evaluating it
+    """
+
+    tensor_in_a = Tensor([2, 2, 2])
+    tensor_in_b = Tensor([1, 2, 3])
+    tensor_out = Tensor([0, 0, 0])
+
+    mgr = Manager()
+
+    shaderFilePath = "../../shaders/glsl/opmult.comp"
+
+    mgr.evalOpDefaultTensorCreate([tensor_in_a, tensor_in_b, tensor_out])
+
+    seq = mgr.createManagedSequence("op")
+
+    seq.begin()
+    seq.recordOpAlgoBaseFile([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+    seq.end()
+
+    seq.eval()
+
+    mgr.evalOpDefaultTensorSyncLocal([tensor_out])
+
+    assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+if __name__ == "__main__":
+    test_sequence()

From 1f614a87e44feaff866d05639734b0be9739b847 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 20:56:03 +0000
Subject: [PATCH 21/39] Reformatted

---
 single_include/kompute/Kompute.hpp  |  5 +-
 src/Algorithm.cpp                   | 24 ++++++--
 src/Manager.cpp                     | 12 ++--
 src/OpAlgoBase.cpp                  | 88 ++++++++++++++++-------------
 src/OpAlgoLhsRhsOut.cpp             | 38 ++++++-------
 src/Sequence.cpp                    | 10 +++-
 src/Tensor.cpp                      | 13 +++--
 src/include/kompute/Manager.hpp     |  5 +-
 test/TestLogisticRegression.cpp     | 10 ++--
 test/TestMultipleAlgoExecutions.cpp | 15 +++--
 test/TestOpAlgoLoopsPassingData.cpp |  1 -
 11 files changed, 125 insertions(+), 96 deletions(-)

diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index c417182c2..932375cd4 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -1315,8 +1315,9 @@ class Manager
      * @param queueIndex The queue to use from the available queues
      * @return Weak pointer to the manager owned sequence resource
      */
-    std::shared_ptr<Sequence> createManagedSequence(std::string sequenceName = "",
-                                                  uint32_t queueIndex = 0);
+    std::shared_ptr<Sequence> createManagedSequence(
+      std::string sequenceName = "",
+      uint32_t queueIndex = 0);
 
     /**
      * Function that evaluates operation against named sequence.
diff --git a/src/Algorithm.cpp b/src/Algorithm.cpp
index 70092a3d6..eb0be22a8 100644
--- a/src/Algorithm.cpp
+++ b/src/Algorithm.cpp
@@ -34,7 +34,9 @@ Algorithm::~Algorithm()
             SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
                          "pipeline but it is null");
         }
-        this->mDevice->destroy(*this->mPipeline, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mPipeline,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     }
 
     if (this->mFreePipelineCache) {
@@ -43,7 +45,9 @@ Algorithm::~Algorithm()
             SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
                          "pipeline cache but it is null");
         }
-        this->mDevice->destroy(*this->mPipelineCache, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mPipelineCache,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     }
 
     if (this->mFreePipelineLayout) {
@@ -52,7 +56,9 @@ Algorithm::~Algorithm()
             SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
                          "pipeline layout but it is null");
         }
-        this->mDevice->destroy(*this->mPipelineLayout, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mPipelineLayout,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     }
 
     if (this->mFreeShaderModule) {
@@ -61,7 +67,9 @@ Algorithm::~Algorithm()
             SPDLOG_ERROR("Kompute Algorithm Error requested to destroy shader "
                          "module but it is null");
         }
-        this->mDevice->destroy(*this->mShaderModule, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mShaderModule,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     }
 
     if (this->mFreeDescriptorSet) {
@@ -80,7 +88,9 @@ Algorithm::~Algorithm()
             SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
                          "descriptor set layout but it is null");
         }
-        this->mDevice->destroy(*this->mDescriptorSetLayout, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mDescriptorSetLayout,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     }
 
     if (this->mFreeDescriptorPool) {
@@ -89,7 +99,9 @@ Algorithm::~Algorithm()
             SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
                          "descriptor pool but it is null");
         }
-        this->mDevice->destroy(*this->mDescriptorPool, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mDescriptorPool,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     }
 }
 
diff --git a/src/Manager.cpp b/src/Manager.cpp
index 5c7a2d3be..b763f2eb0 100644
--- a/src/Manager.cpp
+++ b/src/Manager.cpp
@@ -59,8 +59,10 @@ Manager::~Manager()
     }
 
     if (this->mManagedSequences.size()) {
-        SPDLOG_DEBUG("Kompute Manager explicitly running destructor for managed sequences");
-        for (const std::pair<std::string, std::shared_ptr<Sequence>> &sqPair : this->mManagedSequences) {
+        SPDLOG_DEBUG("Kompute Manager explicitly running destructor for "
+                     "managed sequences");
+        for (const std::pair<std::string, std::shared_ptr<Sequence>>& sqPair :
+             this->mManagedSequences) {
             sqPair.second->~Sequence();
         }
         this->mManagedSequences.clear();
@@ -68,7 +70,8 @@ Manager::~Manager()
 
     if (this->mFreeDevice) {
         SPDLOG_INFO("Destroying device");
-        this->mDevice->destroy((vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
         SPDLOG_DEBUG("Kompute Manager Destroyed Device");
     }
 
@@ -89,7 +92,8 @@ Manager::~Manager()
 #endif
 
     if (this->mFreeInstance) {
-        this->mInstance->destroy((vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mInstance->destroy(
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
         SPDLOG_DEBUG("Kompute Manager Destroyed Instance");
     }
 }
diff --git a/src/OpAlgoBase.cpp b/src/OpAlgoBase.cpp
index 99e3a9ac1..68e22de3b 100644
--- a/src/OpAlgoBase.cpp
+++ b/src/OpAlgoBase.cpp
@@ -10,13 +10,14 @@ OpAlgoBase::OpAlgoBase()
 }
 
 OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           KomputeWorkgroup komputeWorkgroup)
+                       std::shared_ptr<vk::Device> device,
+                       std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                       std::vector<std::shared_ptr<Tensor>>& tensors,
+                       KomputeWorkgroup komputeWorkgroup)
   : OpBase(physicalDevice, device, commandBuffer, tensors, false)
 {
-    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
+    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}",
+                 tensors.size());
 
     // The dispatch size is set up based on either explicitly provided template
     // parameters or by default it would take the shape and size of the tensors
@@ -29,38 +30,42 @@ OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
             komputeWorkgroup.z > 0 ? komputeWorkgroup.z : 1
         };
     } else {
-        this->mKomputeWorkgroup = {tensors[0]->size(), 1, 1};
+        this->mKomputeWorkgroup = { tensors[0]->size(), 1, 1 };
     }
     SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
-                 this->mKomputeWorkgroup.x,
-                 this->mKomputeWorkgroup.y,
-                 this->mKomputeWorkgroup.z);
+                this->mKomputeWorkgroup.x,
+                this->mKomputeWorkgroup.y,
+                this->mKomputeWorkgroup.z);
 
     this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
 }
 
 OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           std::string shaderFilePath,
-                           KomputeWorkgroup komputeWorkgroup)
+                       std::shared_ptr<vk::Device> device,
+                       std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                       std::vector<std::shared_ptr<Tensor>>& tensors,
+                       std::string shaderFilePath,
+                       KomputeWorkgroup komputeWorkgroup)
   : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
 {
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
+    SPDLOG_DEBUG(
+      "Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}",
+      shaderFilePath);
 
     this->mShaderFilePath = shaderFilePath;
 }
 
 OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>>& tensors,
-                           const std::vector<char>& shaderDataRaw,
-                           KomputeWorkgroup komputeWorkgroup)
+                       std::shared_ptr<vk::Device> device,
+                       std::shared_ptr<vk::CommandBuffer> commandBuffer,
+                       std::vector<std::shared_ptr<Tensor>>& tensors,
+                       const std::vector<char>& shaderDataRaw,
+                       KomputeWorkgroup komputeWorkgroup)
   : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
 {
-    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
+    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw "
+                 "data length: {}",
+                 shaderDataRaw.size());
 
     this->mShaderDataRaw = shaderDataRaw;
 }
@@ -78,11 +83,13 @@ OpAlgoBase::init()
     if (this->mTensors.size() < 1) {
         throw std::runtime_error(
           "Kompute OpAlgoBase called with less than 1 tensor");
-    } 
+    }
 
     for (std::shared_ptr<Tensor> tensor : this->mTensors) {
-        if(!tensor->isInit()) {
-            throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised.");
+        if (!tensor->isInit()) {
+            throw std::runtime_error(
+              "Kompute OpAlgoBase validation failed; all tensor parameters "
+              "must be initialised.");
         }
     }
 
@@ -110,7 +117,9 @@ OpAlgoBase::record()
           vk::PipelineStageFlagBits::eComputeShader);
     }
 
-    this->mAlgorithm->recordDispatch(this->mKomputeWorkgroup.x, this->mKomputeWorkgroup.y, this->mKomputeWorkgroup.z);
+    this->mAlgorithm->recordDispatch(this->mKomputeWorkgroup.x,
+                                     this->mKomputeWorkgroup.y,
+                                     this->mKomputeWorkgroup.z);
 }
 
 void
@@ -125,17 +134,19 @@ OpAlgoBase::postEval()
     SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
 }
 
-std::vector<char> OpAlgoBase::fetchSpirvBinaryData() 
+std::vector<char>
+OpAlgoBase::fetchSpirvBinaryData()
 {
-    SPDLOG_WARN(
-      "Kompute OpAlgoBase Running shaders directly from spirv file");
+    SPDLOG_WARN("Kompute OpAlgoBase Running shaders directly from spirv file");
 
     if (this->mShaderFilePath.size()) {
         std::ifstream fileStream(this->mShaderFilePath,
-                                 std::ios::binary | std::ios::in | std::ios::ate);
+                                 std::ios::binary | std::ios::in |
+                                   std::ios::ate);
 
         if (!fileStream.good()) {
-            throw std::runtime_error("Error reading file: " + this->mShaderFilePath);
+            throw std::runtime_error("Error reading file: " +
+                                     this->mShaderFilePath);
         }
 
         size_t shaderFileSize = fileStream.tellg();
@@ -144,19 +155,16 @@ std::vector<char> OpAlgoBase::fetchSpirvBinaryData()
         fileStream.read(shaderDataRaw, shaderFileSize);
         fileStream.close();
 
-        SPDLOG_WARN(
-          "Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
+        SPDLOG_WARN("Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
 
-        return std::vector<char>(shaderDataRaw,
-                                 shaderDataRaw + shaderFileSize);
-    }
-    else if (this->mShaderDataRaw.size()) {
+        return std::vector<char>(shaderDataRaw, shaderDataRaw + shaderFileSize);
+    } else if (this->mShaderDataRaw.size()) {
         return this->mShaderDataRaw;
-    }
-    else {
-        throw std::runtime_error("Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither filepath nor data provided");
+    } else {
+        throw std::runtime_error(
+          "Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither "
+          "filepath nor data provided");
     }
 }
 
 }
-
diff --git a/src/OpAlgoLhsRhsOut.cpp b/src/OpAlgoLhsRhsOut.cpp
index 444ec63a3..ab759fed8 100644
--- a/src/OpAlgoLhsRhsOut.cpp
+++ b/src/OpAlgoLhsRhsOut.cpp
@@ -9,13 +9,14 @@ OpAlgoLhsRhsOut::OpAlgoLhsRhsOut()
     SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
 }
 
-OpAlgoLhsRhsOut::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
-                           std::shared_ptr<vk::Device> device,
-                           std::shared_ptr<vk::CommandBuffer> commandBuffer,
-                           std::vector<std::shared_ptr<Tensor>> tensors,
-                           KomputeWorkgroup komputeWorkgroup)
+OpAlgoLhsRhsOut::OpAlgoLhsRhsOut(
+  std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+  std::shared_ptr<vk::Device> device,
+  std::shared_ptr<vk::CommandBuffer> commandBuffer,
+  std::vector<std::shared_ptr<Tensor>> tensors,
+  KomputeWorkgroup komputeWorkgroup)
   // The inheritance is initialised with the copyOutputData to false given that
-  // this depencendant class handles the transfer of data via staging buffers in 
+  // this dependent class handles the transfer of data via staging buffers in
   // a granular way.
   : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
 {
@@ -36,18 +37,19 @@ OpAlgoLhsRhsOut::init()
         throw std::runtime_error(
           "Kompute OpAlgoLhsRhsOut called with less than 1 tensor");
     } else if (this->mTensors.size() > 3) {
-        SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
+        SPDLOG_WARN(
+          "Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
     }
 
     this->mTensorLHS = this->mTensors[0];
     this->mTensorRHS = this->mTensors[1];
     this->mTensorOutput = this->mTensors[2];
 
-
     if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
           this->mTensorOutput->isInit())) {
         throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. LHS: " +
+          "Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. "
+          "LHS: " +
           std::to_string(this->mTensorLHS->isInit()) +
           " RHS: " + std::to_string(this->mTensorRHS->isInit()) +
           " Output: " + std::to_string(this->mTensorOutput->isInit()));
@@ -56,7 +58,8 @@ OpAlgoLhsRhsOut::init()
     if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
           this->mTensorRHS->size() == this->mTensorOutput->size())) {
         throw std::runtime_error(
-          "Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size LHS: " +
+          "Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size "
+          "LHS: " +
           std::to_string(this->mTensorLHS->size()) +
           " RHS: " + std::to_string(this->mTensorRHS->size()) +
           " Output: " + std::to_string(this->mTensorOutput->size()));
@@ -65,8 +68,7 @@ OpAlgoLhsRhsOut::init()
     this->mTensorOutputStaging = std::make_shared<Tensor>(
       this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
 
-    this->mTensorOutputStaging->init(
-      this->mPhysicalDevice, this->mDevice);
+    this->mTensorOutputStaging->init(this->mPhysicalDevice, this->mDevice);
 
     SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
 
@@ -96,10 +98,9 @@ OpAlgoLhsRhsOut::record()
       vk::PipelineStageFlagBits::eHost,
       vk::PipelineStageFlagBits::eComputeShader);
 
-    this->mAlgorithm->recordDispatch(
-                this->mKomputeWorkgroup.x,
-                this->mKomputeWorkgroup.y,
-                this->mKomputeWorkgroup.z);
+    this->mAlgorithm->recordDispatch(this->mKomputeWorkgroup.x,
+                                     this->mKomputeWorkgroup.y,
+                                     this->mKomputeWorkgroup.z);
 
     // Barrier to ensure the shader code is executed before buffer read
     this->mTensorOutput->recordBufferMemoryBarrier(
@@ -110,9 +111,7 @@ OpAlgoLhsRhsOut::record()
       vk::PipelineStageFlagBits::eTransfer);
 
     this->mTensorOutputStaging->recordCopyFrom(
-            this->mCommandBuffer,
-            this->mTensorOutput,
-            true);
+      this->mCommandBuffer, this->mTensorOutput, true);
 }
 
 void
@@ -126,4 +125,3 @@ OpAlgoLhsRhsOut::postEval()
 }
 
 }
-
diff --git a/src/Sequence.cpp b/src/Sequence.cpp
index 0f6eccfd2..b27c547be 100644
--- a/src/Sequence.cpp
+++ b/src/Sequence.cpp
@@ -28,7 +28,8 @@ Sequence::~Sequence()
     SPDLOG_DEBUG("Kompute Sequence Destructor started");
 
     if (!this->mIsInit) {
-        SPDLOG_WARN("Kompute Sequence destructor called but sequence is not initialized.");
+        SPDLOG_WARN("Kompute Sequence destructor called but sequence is not "
+                    "initialized.");
         return;
     }
 
@@ -60,7 +61,9 @@ Sequence::~Sequence()
             this->mIsInit = false;
             return;
         }
-        this->mDevice->destroy(*this->mCommandPool, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        this->mDevice->destroy(
+          *this->mCommandPool,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
         SPDLOG_DEBUG("Kompute Sequence Destroyed CommandPool");
     }
 
@@ -196,7 +199,8 @@ Sequence::evalAwait(uint64_t waitFor)
 
     vk::Result result =
       this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor);
-    this->mDevice->destroy(this->mFence, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+    this->mDevice->destroy(
+      this->mFence, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
 
     this->mIsRunning = false;
 
diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index 299622ee4..214ac2eb0 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -12,8 +12,9 @@ Tensor::Tensor()
 Tensor::Tensor(const std::vector<float>& data, TensorTypes tensorType)
 {
 #if DEBUG
-    SPDLOG_DEBUG(
-      "Kompute Tensor constructor data length: {}, and type: {}", data.size(), tensorType);
+    SPDLOG_DEBUG("Kompute Tensor constructor data length: {}, and type: {}",
+                 data.size(),
+                 tensorType);
 #endif
 
     this->mData = data;
@@ -350,7 +351,9 @@ Tensor::freeMemoryDestroyGPUResources()
               "Kompose Tensor expected to free buffer but got null buffer");
         } else {
             SPDLOG_DEBUG("Kompose Tensor destroying buffer");
-            this->mDevice->destroy(*this->mBuffer, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+            this->mDevice->destroy(
+              *this->mBuffer,
+              (vk::Optional<const vk::AllocationCallbacks>)nullptr);
             this->mBuffer = nullptr;
         }
     }
@@ -361,7 +364,9 @@ Tensor::freeMemoryDestroyGPUResources()
               "Kompose Tensor expected to free buffer but got null memory");
         } else {
             SPDLOG_DEBUG("Kompose Tensor freeing memory");
-            this->mDevice->freeMemory(*this->mMemory, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+            this->mDevice->freeMemory(
+              *this->mMemory,
+              (vk::Optional<const vk::AllocationCallbacks>)nullptr);
             this->mDevice = nullptr;
         }
     }
diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp
index 98e8e82c5..8c689ba57 100644
--- a/src/include/kompute/Manager.hpp
+++ b/src/include/kompute/Manager.hpp
@@ -77,8 +77,9 @@ class Manager
      * @param queueIndex The queue to use from the available queues
      * @return Weak pointer to the manager owned sequence resource
      */
-    std::shared_ptr<Sequence> createManagedSequence(std::string sequenceName = "",
-                                                  uint32_t queueIndex = 0);
+    std::shared_ptr<Sequence> createManagedSequence(
+      std::string sequenceName = "",
+      uint32_t queueIndex = 0);
 
     /**
      * Function that evaluates operation against named sequence.
diff --git a/test/TestLogisticRegression.cpp b/test/TestLogisticRegression.cpp
index 91dd1f430..eda6ca635 100644
--- a/test/TestLogisticRegression.cpp
+++ b/test/TestLogisticRegression.cpp
@@ -31,8 +31,7 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression)
     {
         kp::Manager mgr;
 
-        std::shared_ptr<kp::Sequence> sqTensor =
-          mgr.createManagedSequence();
+        std::shared_ptr<kp::Sequence> sqTensor = mgr.createManagedSequence();
 
         sqTensor->begin();
         sqTensor->record<kp::OpTensorCreate>(params);
@@ -76,7 +75,7 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression)
     EXPECT_LT(bIn->data()[0], 0.0);
     EXPECT_LT(bIn->data()[0], 0.0);
 
-    //SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
+    // SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
     //            wIn->data(),
     //            bIn->data(),
     //            lOut->data());
@@ -114,8 +113,7 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy)
     {
         kp::Manager mgr;
 
-        std::shared_ptr<kp::Sequence> sqTensor =
-          mgr.createManagedSequence();
+        std::shared_ptr<kp::Sequence> sqTensor = mgr.createManagedSequence();
 
         sqTensor->begin();
         sqTensor->record<kp::OpTensorCreate>(params);
@@ -158,7 +156,7 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy)
     EXPECT_GT(wIn->data()[1], 1.0);
     EXPECT_LT(bIn->data()[0], 0.0);
 
-    //SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
+    // SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
     //            wIn->data(),
     //            bIn->data(),
     //            lOut->data());
diff --git a/test/TestMultipleAlgoExecutions.cpp b/test/TestMultipleAlgoExecutions.cpp
index f45367313..11e94caa4 100644
--- a/test/TestMultipleAlgoExecutions.cpp
+++ b/test/TestMultipleAlgoExecutions.cpp
@@ -70,20 +70,20 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords)
 
     // Then perform the computations
     sq->begin();
-    sq->record<kp::OpAlgoBase>(
-      { tensorA }, std::vector<char>(shader.begin(), shader.end()));
+    sq->record<kp::OpAlgoBase>({ tensorA },
+                               std::vector<char>(shader.begin(), shader.end()));
     sq->end();
     sq->eval();
 
     sq->begin();
-    sq->record<kp::OpAlgoBase>(
-      { tensorA }, std::vector<char>(shader.begin(), shader.end()));
+    sq->record<kp::OpAlgoBase>({ tensorA },
+                               std::vector<char>(shader.begin(), shader.end()));
     sq->end();
     sq->eval();
 
     sq->begin();
-    sq->record<kp::OpAlgoBase>(
-      { tensorA }, std::vector<char>(shader.begin(), shader.end()));
+    sq->record<kp::OpAlgoBase>({ tensorA },
+                               std::vector<char>(shader.begin(), shader.end()));
     sq->end();
     sq->eval();
 
@@ -112,7 +112,7 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences)
       })");
 
     {
-        std::shared_ptr<kp::Sequence> sq = 
+        std::shared_ptr<kp::Sequence> sq =
           mgr.getOrCreateManagedSequence("newSequence");
 
         sq->begin();
@@ -211,7 +211,6 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)
         sq->eval();
     }
 
-
     {
         std::shared_ptr<kp::Sequence> sq =
           mgr.getOrCreateManagedSequence("newSequence3");
diff --git a/test/TestOpAlgoLoopsPassingData.cpp b/test/TestOpAlgoLoopsPassingData.cpp
index 35a08e02a..bd7727790 100644
--- a/test/TestOpAlgoLoopsPassingData.cpp
+++ b/test/TestOpAlgoLoopsPassingData.cpp
@@ -30,7 +30,6 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies)
         }
     )");
 
-
     {
         std::shared_ptr<kp::Sequence> sq =
           mgr.getOrCreateManagedSequence("default");

From 96cd1e3c92fefedba7d2b57b51076e052bed1a04 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 21:24:20 +0000
Subject: [PATCH 22/39] Updated function names

---
 python/src/main.cpp | 125 +++++++++++++++++++++++++-------------------
 1 file changed, 71 insertions(+), 54 deletions(-)

diff --git a/python/src/main.cpp b/python/src/main.cpp
index 9573b8074..3cc3e214e 100644
--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@@ -8,9 +8,9 @@ namespace py = pybind11;
 PYBIND11_MODULE(komputepy, m) {
 
     py::enum_<kp::Tensor::TensorTypes>(m, "TensorTypes")
-        .value("eDevice", kp::Tensor::TensorTypes::eDevice)
-        .value("eStaging", kp::Tensor::TensorTypes::eStaging)
-        .value("eStorage", kp::Tensor::TensorTypes::eStorage)
+        .value("device", kp::Tensor::TensorTypes::eDevice)
+        .value("staging", kp::Tensor::TensorTypes::eStaging)
+        .value("storage", kp::Tensor::TensorTypes::eStorage)
         .export_values();
 
     py::class_<kp::Tensor, std::shared_ptr<kp::Tensor>>(m, "Tensor")
@@ -22,26 +22,36 @@ PYBIND11_MODULE(komputepy, m) {
             [](const std::vector<float>& data, kp::Tensor::TensorTypes tensorTypes) {
                 return std::unique_ptr<kp::Tensor>(new kp::Tensor(data, tensorTypes));
             }))
-        .def("data", &kp::Tensor::data);
+        .def("data", &kp::Tensor::data)
+        .def("size", &kp::Tensor::size)
+        .def("tensor_type", &kp::Tensor::tensorType)
+        .def("is_init", &kp::Tensor::isInit)
+        .def("set_data", &kp::Tensor::setData)
+        .def("map_data_from_host", &kp::Tensor::mapDataFromHostMemory)
+        .def("map_data_into_host", &kp::Tensor::mapDataIntoHostMemory);
 
     py::class_<kp::Sequence, std::shared_ptr<kp::Sequence>>(m, "Sequence")
         .def("init", &kp::Sequence::init)
+        // record
         .def("begin", &kp::Sequence::begin)
         .def("end", &kp::Sequence::end)
+        // eval
         .def("eval", &kp::Sequence::eval)
-        .def("evalAsync", &kp::Sequence::evalAsync)
-        .def("evalAwait", &kp::Sequence::evalAwait)
-        .def("isRunning", &kp::Sequence::isRunning)
-        .def("isRecording", &kp::Sequence::isRecording)
-        .def("isInit", &kp::Sequence::isInit)
-        .def("recordOpTensorCreate", &kp::Sequence::record<kp::OpTensorCreate>)
-        .def("recordOpTensorCopy", &kp::Sequence::record<kp::OpTensorCopy>)
-        .def("recordOpTensorSyncDevice", &kp::Sequence::record<kp::OpTensorSyncDevice>)
-        .def("recordOpTensorSyncLocal", &kp::Sequence::record<kp::OpTensorSyncLocal>)
-        .def("recordOpAlgoMult", &kp::Sequence::record<kp::OpMult>)
-        .def("recordOpAlgoBaseFile", &kp::Sequence::record<kp::OpAlgoBase, std::string>)
-        .def("recordOpAlgoBaseData", &kp::Sequence::record<kp::OpAlgoBase, std::vector<char>>)
-        .def("recordOpAlgoLhsRhsOut", &kp::Sequence::record<kp::OpAlgoLhsRhsOut>);
+        .def("eval_async", &kp::Sequence::evalAsync)
+        .def("eval_await", &kp::Sequence::evalAwait)
+        // status
+        .def("is_running", &kp::Sequence::isRunning)
+        .def("is_rec", &kp::Sequence::isRecording)
+        .def("is_init", &kp::Sequence::isInit)
+        // record
+        .def("record_tensor_create", &kp::Sequence::record<kp::OpTensorCreate>)
+        .def("record_tensor_copy", &kp::Sequence::record<kp::OpTensorCopy>)
+        .def("record_tensor_sync_device", &kp::Sequence::record<kp::OpTensorSyncDevice>)
+        .def("record_tensor_sync_local", &kp::Sequence::record<kp::OpTensorSyncLocal>)
+        .def("record_algo_mult", &kp::Sequence::record<kp::OpMult>)
+        .def("record_algo_file", &kp::Sequence::record<kp::OpAlgoBase, std::string>)
+        .def("record_algo_data", &kp::Sequence::record<kp::OpAlgoBase, std::vector<char>>)
+        .def("record_algo_lro", &kp::Sequence::record<kp::OpAlgoLhsRhsOut>);
 
     py::class_<kp::Manager>(m, "Manager")
         .def(py::init())
@@ -53,45 +63,52 @@ PYBIND11_MODULE(komputepy, m) {
             [](uint32_t physicalDeviceIndex, const std::vector<uint32_t>& familyQueueIndices) {
                 return std::unique_ptr<kp::Manager>(new kp::Manager(physicalDeviceIndex, familyQueueIndices));
             }))
-        .def("getOrCreateManagedSequence", &kp::Manager::getOrCreateManagedSequence)
-        .def("createManagedSequence", &kp::Manager::createManagedSequence,
+        .def("get_create_sequence", &kp::Manager::getOrCreateManagedSequence)
+        .def("create_sequence", &kp::Manager::createManagedSequence,
                 py::arg("name"), py::arg("queueIndex") = 0)
-        .def("buildTensor", &kp::Manager::buildTensor, 
+        .def("build_tensor", &kp::Manager::buildTensor, 
                 py::arg("data"), py::arg("tensorType") = kp::Tensor::TensorTypes::eDevice)
-        .def("evalOpAsync", &kp::Manager::evalOpAsync<kp::OpMult>)
-        .def("evalOpAsyncDefault", &kp::Manager::evalOpAsyncDefault<kp::OpMult>)
-        .def("evalOpDefaultTensorCreate", &kp::Manager::evalOpDefault<kp::OpTensorCreate>)
-        .def("evalOpDefaultTensorCopy", &kp::Manager::evalOpDefault<kp::OpTensorCopy>)
-        .def("evalOpDefaultTensorSyncDevice", &kp::Manager::evalOpDefault<kp::OpTensorSyncDevice>)
-        .def("evalOpDefaultTensorSyncLocal", &kp::Manager::evalOpDefault<kp::OpTensorSyncLocal>)
-        .def("evalOpDefaultAlgoMult", &kp::Manager::evalOpDefault<kp::OpMult>)
-        .def("evalOpDefaultAlgoBaseFile", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::string>)
-        .def("evalOpDefaultAlgoBaseData", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::vector<char>>)
-        .def("evalOpDefaultAlgoLhsRhsOut", &kp::Manager::evalOpDefault<kp::OpAlgoLhsRhsOut>)
-        .def("evalOpTensorCreate", &kp::Manager::evalOp<kp::OpTensorCreate>)
-        .def("evalOpTensorCopy", &kp::Manager::evalOp<kp::OpTensorCopy>)
-        .def("evalOpTensorSyncDevice", &kp::Manager::evalOp<kp::OpTensorSyncDevice>)
-        .def("evalOpTensorSyncLocal", &kp::Manager::evalOp<kp::OpTensorSyncLocal>)
-        .def("evalOpAlgoMult", &kp::Manager::evalOp<kp::OpMult>)
-        .def("evalOpAlgoBaseFile", &kp::Manager::evalOp<kp::OpAlgoBase, std::string>)
-        .def("evalOpAlgoBaseData", &kp::Manager::evalOp<kp::OpAlgoBase, std::vector<char>>)
-        .def("evalOpAlgoLhsRhsOut", &kp::Manager::evalOp<kp::OpAlgoLhsRhsOut>)
-        .def("evalOpAsyncDefaultTensorCreate", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCreate>)
-        .def("evalOpAsyncDefaultTensorCopy", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCopy>)
-        .def("evalOpAsyncDefaultTensorSyncDevice", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncDevice>)
-        .def("evalOpAsyncDefaultTensorSyncLocal", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncLocal>)
-        .def("evalOpAsyncDefaultAlgoMult", &kp::Manager::evalOpAsyncDefault<kp::OpMult>)
-        .def("evalOpAsyncDefaultAlgoBaseFile", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::string>)
-        .def("evalOpAsyncDefaultAlgoBaseData", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::vector<char>>)
-        .def("evalOpAsyncDefaultAlgoLhsRhsOut", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoLhsRhsOut>)
-        .def("evalOpAsyncTensorCreate", &kp::Manager::evalOpAsync<kp::OpTensorCreate>)
-        .def("evalOpAsyncTensorCopy", &kp::Manager::evalOpAsync<kp::OpTensorCopy>)
-        .def("evalOpAsyncTensorSyncDevice", &kp::Manager::evalOpAsync<kp::OpTensorSyncDevice>)
-        .def("evalOpAsyncTensorSyncLocal", &kp::Manager::evalOpAsync<kp::OpTensorSyncLocal>)
-        .def("evalOpAsync", &kp::Manager::evalOpAsync<kp::OpMult>)
-        .def("evalOpAsyncAlgoBaseFile", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::string>)
-        .def("evalOpAsyncAlgoBase", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::vector<char>>)
-        .def("evalOpAsyncAlgoLhsRhsOut", &kp::Manager::evalOpAsync<kp::OpAlgoLhsRhsOut>);
+        // Await functions
+        .def("eval_await", &kp::Manager::evalOpAwait,
+                py::arg("sequenceName"), py::arg("waitFor") = UINT64_MAX)
+        .def("eval_await_def", &kp::Manager::evalOpAwaitDefault,
+                py::arg("waitFor") = UINT64_MAX)
+        // eval default
+        .def("eval_tensor_create_def", &kp::Manager::evalOpDefault<kp::OpTensorCreate>)
+        .def("eval_tensor_copy_def", &kp::Manager::evalOpDefault<kp::OpTensorCopy>)
+        .def("eval_tensor_sync_device_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncDevice>)
+        .def("eval_tensor_sync_local_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncLocal>)
+        .def("eval_algo_mult_def", &kp::Manager::evalOpDefault<kp::OpMult>)
+        .def("eval_algo_file_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::string>)
+        .def("eval_algo_data_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::vector<char>>)
+        .def("eval_algo_lro_def", &kp::Manager::evalOpDefault<kp::OpAlgoLhsRhsOut>)
+        // eval
+        .def("eval_tensor_create", &kp::Manager::evalOp<kp::OpTensorCreate>)
+        .def("eval_tensor_copy", &kp::Manager::evalOp<kp::OpTensorCopy>)
+        .def("eval_tensor_sync_device", &kp::Manager::evalOp<kp::OpTensorSyncDevice>)
+        .def("eval_tensor_sync_local", &kp::Manager::evalOp<kp::OpTensorSyncLocal>)
+        .def("eval_algo_mult", &kp::Manager::evalOp<kp::OpMult>)
+        .def("eval_algo_file", &kp::Manager::evalOp<kp::OpAlgoBase, std::string>)
+        .def("eval_algo_data", &kp::Manager::evalOp<kp::OpAlgoBase, std::vector<char>>)
+        .def("eval_algo_lro", &kp::Manager::evalOp<kp::OpAlgoLhsRhsOut>)
+        // eval async default
+        .def("eval_async_tensor_create_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCreate>)
+        .def("eval_async_tensor_copy_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCopy>)
+        .def("eval_async_tensor_sync_device_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncDevice>)
+        .def("eval_async_tensor_sync_local_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncLocal>)
+        .def("eval_async_algo_mult_def", &kp::Manager::evalOpAsyncDefault<kp::OpMult>)
+        .def("eval_async_algo_file_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::string>)
+        .def("eval_async_algo_data_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::vector<char>>)
+        .def("eval_async_algo_lro_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoLhsRhsOut>)
+        // eval async (eval_async_ prefix keeps these distinct from the sync eval_* bindings, which would otherwise shadow them)
+        .def("eval_async_tensor_create", &kp::Manager::evalOpAsync<kp::OpTensorCreate>)
+        .def("eval_async_tensor_copy", &kp::Manager::evalOpAsync<kp::OpTensorCopy>)
+        .def("eval_async_tensor_sync_device", &kp::Manager::evalOpAsync<kp::OpTensorSyncDevice>)
+        .def("eval_async_tensor_sync_local", &kp::Manager::evalOpAsync<kp::OpTensorSyncLocal>)
+        .def("eval_async_algo_mult", &kp::Manager::evalOpAsync<kp::OpMult>)
+        .def("eval_async_algo_file", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::string>)
+        .def("eval_async_algo_data", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::vector<char>>)
+        .def("eval_async_algo_lro", &kp::Manager::evalOpAsync<kp::OpAlgoLhsRhsOut>);
 
 
 #ifdef VERSION_INFO

From 85b39baf1944599b0d386bed077a4797e63abc7e Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 21:24:28 +0000
Subject: [PATCH 23/39] Updated tests

---
 python/test/test_kompute.py | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/python/test/test_kompute.py b/python/test/test_kompute.py
index 058e906f2..7b85de47b 100644
--- a/python/test/test_kompute.py
+++ b/python/test/test_kompute.py
@@ -12,11 +12,11 @@ def test_opmult():
 
     mgr = Manager()
 
-    mgr.evalOpDefaultTensorCreate([tensor_in_a, tensor_in_b, tensor_out])
+    mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
 
-    mgr.evalOpDefaultAlgoMult([tensor_in_a, tensor_in_b, tensor_out])
+    mgr.eval_algo_mult_def([tensor_in_a, tensor_in_b, tensor_out])
 
-    mgr.evalOpDefaultTensorSyncLocal([tensor_out])
+    mgr.eval_tensor_sync_local_def([tensor_out])
 
     assert tensor_out.data() == [2.0, 4.0, 6.0]
 
@@ -47,11 +47,11 @@ def test_opalgobase_data():
         }
     """
 
-    mgr.evalOpDefaultTensorCreate([tensor_in_a, tensor_in_b, tensor_out])
+    mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
 
-    mgr.evalOpDefaultAlgoBaseData([tensor_in_a, tensor_in_b, tensor_out], list(shaderData))
+    mgr.eval_algo_data_def([tensor_in_a, tensor_in_b, tensor_out], list(shaderData))
 
-    mgr.evalOpDefaultTensorSyncLocal([tensor_out])
+    mgr.eval_tensor_sync_local_def([tensor_out])
 
     assert tensor_out.data() == [2.0, 4.0, 6.0]
 
@@ -69,11 +69,11 @@ def test_opalgobase_file():
 
     shaderFilePath = "../../shaders/glsl/opmult.comp"
 
-    mgr.evalOpDefaultTensorCreate([tensor_in_a, tensor_in_b, tensor_out])
+    mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
 
-    mgr.evalOpDefaultAlgoBaseFile([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+    mgr.eval_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
 
-    mgr.evalOpDefaultTensorSyncLocal([tensor_out])
+    mgr.eval_tensor_sync_local_def([tensor_out])
 
     assert tensor_out.data() == [2.0, 4.0, 6.0]
 
@@ -82,26 +82,28 @@ def test_sequence():
     Test basic OpAlgoBase operation
     """
 
+    mgr = Manager(0, [2])
+
     tensor_in_a = Tensor([2, 2, 2])
     tensor_in_b = Tensor([1, 2, 3])
     tensor_out = Tensor([0, 0, 0])
 
-    mgr = Manager()
+    mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+    seq = mgr.create_sequence("op")
 
     shaderFilePath = "../../shaders/glsl/opmult.comp"
-
-    mgr.evalOpDefaultTensorCreate([tensor_in_a, tensor_in_b, tensor_out])
-
-    seq = mgr.createManagedSequence("op")
+    mgr.eval_async_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+    mgr.eval_await_def()
 
     seq.begin()
-    seq.recordOpAlgoBaseFile([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+    seq.record_tensor_sync_local([tensor_in_a])
+    seq.record_tensor_sync_local([tensor_in_b])
+    seq.record_tensor_sync_local([tensor_out])
     seq.end()
 
     seq.eval()
 
-    mgr.evalOpDefaultTensorSyncLocal([tensor_out])
-
     assert tensor_out.data() == [2.0, 4.0, 6.0]
 
 if __name__ == "__main__":

From 02406d46ca43d90be8c8a45cb04387533abcb92d Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 1 Nov 2020 21:24:39 +0000
Subject: [PATCH 24/39] Updated readme to reflect python example

---
 README.md | 85 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 75 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 2ff7e0f9a..f75f466bc 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@
 <td>
 
 <h1>Vulkan Kompute</h1>
-<h3>The General Purpose Vulkan Compute Framework.</h3>
+<h3>The General Purpose Vulkan Compute Framework for C++ and Python.</h3>
 
 </td>
 
@@ -29,10 +29,10 @@
 
 ## Principles & Features
 
-* [Single header](#setup) library for simple import to your project
-* [Documentation](https://kompute.cc) leveraging doxygen and sphinx 
-* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) capabilities with multi-queue command submission
-* [Non-Vulkan naming conventions](#architectural-overview) to disambiguate Vulkan vs Kompute components
+* [Single header](#setup) for simple import with flexible build-system configuration
+* Multi-language support with C++ as core SDK as well as [optimized Python bindings](#python-package)
+* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) support through GPU family queues
+* [Mobile enabled](#mobile-enabled) with examples in Android studio across several architectures
 * BYOV: [Bring-your-own-Vulkan design](#motivations) to play nice with existing Vulkan applications
 * Explicit relationships for GPU and host [memory ownership and memory management](https://kompute.cc/overview/memory-management.html)
 * [Short code examples](#simple-examples) showing the core features 
@@ -118,7 +118,7 @@ int main() {
     mgr.evalOpAwaitDefault();
 
     // 5. Create managed sequence to submit batch operations to the CPU
-    std::shared_ptr<kp::Sequence> sq = mgr.getOrCreateManagedSequence("seq").lock();
+    std::shared_ptr<kp::Sequence> sq = mgr.getOrCreateManagedSequence("seq");
 
     // 5.1. Explicitly begin recording batch commands
     sq->begin();
@@ -255,13 +255,69 @@ You can also access the <a href="https://github.com/EthicalML/vulkan-kompute/tre
 </tr>
 </table>
 
-## Motivations
+## Python Package
 
-This project started after seeing that a lot of new and renowned ML & DL projects like Pytorch, Tensorflow, Alibaba DNN, Tencent NCNN - among others - have either integrated or are looking to integrate the Vulkan SDK to add mobile (and cross-vendor) GPU support.
+Besides the C++ core SDK you can also use the Python package of Kompute, which exposes the same core functionality.
 
-The Vulkan SDK offers a great low level interface that enables for highly specialized optimizations - however it comes at a cost of highly verbose code which requires 500-2000 lines of code to even begin writing application code. This has resulted in each of these projects having to implement the same baseline to abstract the non-compute related features of Vulkan. This large amount of non-standardised boiler-plate can result in limited knowledge transfer, higher chance of unique framework implementation bugs being introduced, etc.
+You can install from the repository by running:
 
-We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface (as it's incredibly well designed) but to augment it with a direct focus on Vulkan's GPU computing capabilities. [This article](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) provides a high level overview of the motivations of Kompute, together with a set of hands on examples that introduce both GPU computing as well as the core Vulkan Kompute architecture.
+```
+pip install .
+```
+
+Then you can interact with it from your interpreter. Below is the same sample as above "Your First Kompute (Simple Version)" but in Python:
+
+```python
+tensor_in_a = Tensor([2, 2, 2])
+tensor_in_b = Tensor([1, 2, 3])
+tensor_out = Tensor([0, 0, 0])
+
+mgr = Manager()
+
+mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+shaderFilePath = "shaders/glsl/opmult.comp"
+mgr.eval_async_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+
+# Alternatively can pass raw string/bytes:
+# shaderFileData = """ shader code here... """
+# mgr.eval_algo_data_def([tensor_in_a, tensor_in_b, tensor_out], list(shaderFileData))
+
+mgr.eval_await_def()
+
+mgr.eval_tensor_sync_local_def([tensor_out])
+
+assert tensor_out.data() == [2.0, 4.0, 6.0]
+```
+
+Similarly you can find the same extended example as above:
+
+```python
+mgr = Manager(0, [2])
+
+tensor_in_a = Tensor([2, 2, 2])
+tensor_in_b = Tensor([1, 2, 3])
+tensor_out = Tensor([0, 0, 0])
+
+shaderFilePath = "../../shaders/glsl/opmult.comp"
+
+mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+seq = mgr.create_sequence("op")
+
+mgr.eval_async_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+mgr.eval_await_def()
+
+seq.begin()
+seq.record_tensor_sync_local([tensor_in_a])
+seq.record_tensor_sync_local([tensor_in_b])
+seq.record_tensor_sync_local([tensor_out])
+seq.end()
+
+seq.eval()
+
+assert tensor_out.data() == [2.0, 4.0, 6.0]
+```
 
 ## More examples
 
@@ -281,6 +337,7 @@ We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface
 * [Android NDK Mobile Kompute ML Application](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617)
 * [Game Development Kompute ML in Godot Engine](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0)
 
+
 ## Build Overview
 
 The build system provided uses `cmake`, which allows for cross platform builds.
@@ -344,3 +401,11 @@ make mk_cmake MK_BUILD_TYPE="Release"
 make mk_run_tests
 ```
 
+## Motivations
+
+This project started after seeing that a lot of new and renowned ML & DL projects like Pytorch, Tensorflow, Alibaba DNN, Tencent NCNN - among others - have either integrated or are looking to integrate the Vulkan SDK to add mobile (and cross-vendor) GPU support.
+
+The Vulkan SDK offers a great low level interface that enables highly specialized optimizations - however it comes at a cost of highly verbose code which requires 500-2000 lines of code to even begin writing application code. This has resulted in each of these projects having to implement the same baseline to abstract the non-compute related features of Vulkan. This large amount of non-standardised boiler-plate can result in limited knowledge transfer, higher chance of unique framework implementation bugs being introduced, etc.
+
+We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface (as it's incredibly well designed) but to augment it with a direct focus on Vulkan's GPU computing capabilities. [This article](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) provides a high level overview of the motivations of Kompute, together with a set of hands on examples that introduce both GPU computing as well as the core Vulkan Kompute architecture.
+

From 159504f20f3a7e79021bc53ffc2d9b8cd806e499 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Mon, 2 Nov 2020 21:47:05 +0000
Subject: [PATCH 25/39] Updated documentation to include python

---
 docs/conf.py                     |  4 +++
 docs/index.rst                   |  5 ++--
 docs/overview/python-package.rst | 44 ++++++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 2 deletions(-)
 create mode 100644 docs/overview/python-package.rst

diff --git a/docs/conf.py b/docs/conf.py
index 2c6eb74e0..2daab8833 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -16,6 +16,9 @@
 
 
 # -- Project information -----------------------------------------------------
+import sys
+import os
+import komputepy
 
 project = 'Vulkan Kompute'
 copyright = '2020, The Institute for Ethical AI & Machine Learning'
@@ -31,6 +34,7 @@ release = '0.4.0'
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
+    "sphinx.ext.autodoc",
     # Creates .nojekyll config
     'sphinx.ext.githubpages',
     # Integrates with doxygen
diff --git a/docs/index.rst b/docs/index.rst
index 60d01f21b..57f1a1271 100755
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -11,13 +11,14 @@ Index
     :maxdepth: 2
     :titlesonly:
 
-    Class Documentation and C++ Reference <overview/reference>
-    Advanced Examples <overview/advanced-examples>
+    Simple & Advanced Examples <overview/advanced-examples>
     Asynchronous & Parallel Operations <overview/async-parallel>
     Memory Management Principles <overview/memory-management>
     Build System Deep Dive <overview/build-system>
     Converting GLSL/HLSL Shaders to C++ Headers <overview/shaders-to-headers>
     Mobile App Integration (Android) <overview/mobile-android>
     Game Engine Integration (Godot Engine) <overview/game-engine-godot>
+    Python Class Documentation & Reference <overview/python-package>
+    C++ Class Documentation & Reference <overview/reference>
     Code Index <genindex>
 
diff --git a/docs/overview/python-package.rst b/docs/overview/python-package.rst
new file mode 100644
index 000000000..92dc8ca38
--- /dev/null
+++ b/docs/overview/python-package.rst
@@ -0,0 +1,44 @@
+
+
+Python Class Documentation & Reference
+========
+
+This section provides a breakdown of the Python classes and what each of their functions provide.
+Below is a diagram that provides insights on the relationship between Vulkan Kompute objects and Vulkan resources, which primarily encompass ownership of either CPU and/or GPU memory.
+
+.. image:: ../images/kompute-architecture.jpg
+   :width: 70%
+
+Manager
+-------
+
+The Kompute Manager provides a high level interface to simplify interaction with underlying :class:`komputepy.Sequence` of Operations.
+
+.. autoclass:: komputepy.Manager
+   :members:
+
+
+Sequence
+-------
+
+The Kompute Sequence consists of batches of Kompute Operations, which are executed on a respective GPU queue. The execution of sequences can be synchronous or asynchronous, and it can be coordinated through its respective Vulkan Fence.
+
+.. autoclass:: komputepy.Sequence
+   :members:
+
+
+Tensor
+-------
+
+The Kompute Tensor is the atomic unit in Kompute, and it is used primarily for handling Host and GPU Device data.
+
+.. autoclass:: komputepy.Tensor
+   :members:
+
+
+TensorType
+-------
+
+.. automodule:: komputepy
+   :members:
+

From b23e04e1a497d3afc3e5506f5c775c613bbd40c6 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Mon, 2 Nov 2020 21:47:26 +0000
Subject: [PATCH 26/39] Added documentation to reference python

---
 python/src/main.cpp | 185 +++++++++++++++++++++++++++-----------------
 1 file changed, 113 insertions(+), 72 deletions(-)

diff --git a/python/src/main.cpp b/python/src/main.cpp
index 3cc3e214e..34a0e6bbc 100644
--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@@ -7,109 +7,150 @@ namespace py = pybind11;
 
 PYBIND11_MODULE(komputepy, m) {
 
-    py::enum_<kp::Tensor::TensorTypes>(m, "TensorTypes")
-        .value("device", kp::Tensor::TensorTypes::eDevice)
-        .value("staging", kp::Tensor::TensorTypes::eStaging)
-        .value("storage", kp::Tensor::TensorTypes::eStorage)
+    py::enum_<kp::Tensor::TensorTypes>(m, "TensorTypes", "Enum with GPU memory types for Tensor.")
+        .value("device", kp::Tensor::TensorTypes::eDevice, "Tensor holding data in GPU memory.")
+        .value("staging", kp::Tensor::TensorTypes::eStaging, "Tensor used for transfer of data to device.")
+        .value("storage", kp::Tensor::TensorTypes::eStorage, "Tensor with host visible gpu memory.")
         .export_values();
 
-    py::class_<kp::Tensor, std::shared_ptr<kp::Tensor>>(m, "Tensor")
+    py::class_<kp::Tensor, std::shared_ptr<kp::Tensor>>(m, "Tensor", "Structured data used in GPU operations.")
         .def(py::init(
             [](const std::vector<float>& data) {
                 return std::unique_ptr<kp::Tensor>(new kp::Tensor(data));
-            }))
+            }), "Initialiser with only list of data components.")
         .def(py::init(
             [](const std::vector<float>& data, kp::Tensor::TensorTypes tensorTypes) {
                 return std::unique_ptr<kp::Tensor>(new kp::Tensor(data, tensorTypes));
-            }))
-        .def("data", &kp::Tensor::data)
-        .def("size", &kp::Tensor::size)
-        .def("tensor_type", &kp::Tensor::tensorType)
-        .def("is_init", &kp::Tensor::isInit)
-        .def("set_data", &kp::Tensor::setData)
-        .def("map_data_from_host", &kp::Tensor::mapDataFromHostMemory)
-        .def("map_data_into_host", &kp::Tensor::mapDataIntoHostMemory);
+            }), "Initialiser with list of data components and tensor GPU memory type.")
+        .def("data", &kp::Tensor::data, "Retrieves the data as a list containing the local Tensor memory data.")
+        .def("size", &kp::Tensor::size, "Retrieves the size of the Tensor data as per the local Tensor memory.")
+        .def("tensor_type", &kp::Tensor::tensorType, "Retrieves the memory type of the tensor.")
+        .def("is_init", &kp::Tensor::isInit, "Checks whether the tensor GPU memory has been initialised.")
+        .def("set_data", &kp::Tensor::setData, "Overrides the data in the local Tensor memory.")
+        .def("map_data_from_host", &kp::Tensor::mapDataFromHostMemory, "Maps data into GPU memory from tensor local data.")
+        .def("map_data_into_host", &kp::Tensor::mapDataIntoHostMemory, "Maps data from GPU memory into tensor local data.");
 
     py::class_<kp::Sequence, std::shared_ptr<kp::Sequence>>(m, "Sequence")
-        .def("init", &kp::Sequence::init)
+        .def("init", &kp::Sequence::init, "Initialises Vulkan resources within sequence using provided device.")
         // record
-        .def("begin", &kp::Sequence::begin)
-        .def("end", &kp::Sequence::end)
+        .def("begin", &kp::Sequence::begin, "Clears previous commands and starts recording commands in sequence which can be run in batch.")
+        .def("end", &kp::Sequence::end, "Stops listening and recording for new commands.")
         // eval
-        .def("eval", &kp::Sequence::eval)
-        .def("eval_async", &kp::Sequence::evalAsync)
-        .def("eval_await", &kp::Sequence::evalAwait)
+        .def("eval", &kp::Sequence::eval, "Executes the currently recorded commands synchronously by waiting on Vulkan Fence.")
+        .def("eval_async", &kp::Sequence::evalAsync, "Executes the currently recorded commands asynchronously.")
+        .def("eval_await", &kp::Sequence::evalAwait, "Waits until the execution finishes using Vulkan Fence.")
         // status
-        .def("is_running", &kp::Sequence::isRunning)
-        .def("is_rec", &kp::Sequence::isRecording)
-        .def("is_init", &kp::Sequence::isInit)
+        .def("is_running", &kp::Sequence::isRunning, "Checks whether the Sequence operations are currently still executing.")
+        .def("is_rec", &kp::Sequence::isRecording, "Checks whether the Sequence is currently in recording mode.")
+        .def("is_init", &kp::Sequence::isInit, "Checks if the Sequence has been initialized")
         // record
-        .def("record_tensor_create", &kp::Sequence::record<kp::OpTensorCreate>)
-        .def("record_tensor_copy", &kp::Sequence::record<kp::OpTensorCopy>)
-        .def("record_tensor_sync_device", &kp::Sequence::record<kp::OpTensorSyncDevice>)
-        .def("record_tensor_sync_local", &kp::Sequence::record<kp::OpTensorSyncLocal>)
-        .def("record_algo_mult", &kp::Sequence::record<kp::OpMult>)
-        .def("record_algo_file", &kp::Sequence::record<kp::OpAlgoBase, std::string>)
-        .def("record_algo_data", &kp::Sequence::record<kp::OpAlgoBase, std::vector<char>>)
-        .def("record_algo_lro", &kp::Sequence::record<kp::OpAlgoLhsRhsOut>);
+        .def("record_tensor_create", &kp::Sequence::record<kp::OpTensorCreate>,
+            "Records operation to create and initialise tensor GPU memory and buffer")
+        .def("record_tensor_copy", &kp::Sequence::record<kp::OpTensorCopy>,
+            "Records operation to copy one tensor to one or many tensors")
+        .def("record_tensor_sync_device", &kp::Sequence::record<kp::OpTensorSyncDevice>,
+            "Records operation to sync tensor from local memory to GPU memory")
+        .def("record_tensor_sync_local", &kp::Sequence::record<kp::OpTensorSyncLocal>,
+            "Records operation to sync tensor(s) from GPU memory to local memory using staging tensors")
+        .def("record_algo_mult", &kp::Sequence::record<kp::OpMult>,
+            "Records operation to run multiplication compute shader to two input tensors and an output tensor")
+        .def("record_algo_file", &kp::Sequence::record<kp::OpAlgoBase, std::string>,
+            "Records an operation using a custom shader provided from a shader path")
+        .def("record_algo_data", &kp::Sequence::record<kp::OpAlgoBase, std::vector<char>>,
+            "Records an operation using a custom shader provided as raw string or spirv bytes")
+        .def("record_algo_lro", &kp::Sequence::record<kp::OpAlgoLhsRhsOut>,
+            "Records operation to run left right out operation with custom shader");
 
     py::class_<kp::Manager>(m, "Manager")
-        .def(py::init())
+        .def(py::init(), "Default initializer uses device 0 and first compute compatible GPU queueFamily")
         .def(py::init(
             [](uint32_t physicalDeviceIndex) {
                 return std::unique_ptr<kp::Manager>(new kp::Manager(physicalDeviceIndex));
-            }))
+            }), "Manager initialiser can provide specified device index but will use first compute compatible GPU queueFamily")
         .def(py::init(
             [](uint32_t physicalDeviceIndex, const std::vector<uint32_t>& familyQueueIndices) {
                 return std::unique_ptr<kp::Manager>(new kp::Manager(physicalDeviceIndex, familyQueueIndices));
-            }))
-        .def("get_create_sequence", &kp::Manager::getOrCreateManagedSequence)
+            }), "Manager initialiser can provide specified device and array of GPU queueFamilies to load.")
+        .def("get_create_sequence", &kp::Manager::getOrCreateManagedSequence, "Get a Sequence or create a new one with given name")
         .def("create_sequence", &kp::Manager::createManagedSequence,
-                py::arg("name"), py::arg("queueIndex") = 0)
+                py::arg("name"), py::arg("queueIndex") = 0, "Create a sequence with specific name and specified index of available queues")
         .def("build_tensor", &kp::Manager::buildTensor, 
-                py::arg("data"), py::arg("tensorType") = kp::Tensor::TensorTypes::eDevice)
+                py::arg("data"), py::arg("tensorType") = kp::Tensor::TensorTypes::eDevice,
+                "Build and initialise tensor")
         // Await functions
         .def("eval_await", &kp::Manager::evalOpAwait,
-                py::arg("sequenceName"), py::arg("waitFor") = UINT64_MAX)
+                py::arg("sequenceName"), py::arg("waitFor") = UINT64_MAX,
+                "Awaits for asynchronous operation on a named Sequence")
         .def("eval_await_def", &kp::Manager::evalOpAwaitDefault,
-                py::arg("waitFor") = UINT64_MAX)
+                py::arg("waitFor") = UINT64_MAX, "Awaits for asynchronous operation on the last anonymous Sequence created")
         // eval default
-        .def("eval_tensor_create_def", &kp::Manager::evalOpDefault<kp::OpTensorCreate>)
-        .def("eval_tensor_copy_def", &kp::Manager::evalOpDefault<kp::OpTensorCopy>)
-        .def("eval_tensor_sync_device_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncDevice>)
-        .def("eval_tensor_sync_local_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncLocal>)
-        .def("eval_algo_mult_def", &kp::Manager::evalOpDefault<kp::OpMult>)
-        .def("eval_algo_file_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::string>)
-        .def("eval_algo_data_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::vector<char>>)
-        .def("eval_algo_lro_def", &kp::Manager::evalOpDefault<kp::OpAlgoLhsRhsOut>)
+        .def("eval_tensor_create_def", &kp::Manager::evalOpDefault<kp::OpTensorCreate>,
+            "Evaluates operation to create and initialise tensor GPU memory and buffer with new anonymous Sequence")
+        .def("eval_tensor_copy_def", &kp::Manager::evalOpDefault<kp::OpTensorCopy>,
+            "Evaluates operation to copy one tensor to one or many tensors with new anonymous Sequence")
+        .def("eval_tensor_sync_device_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncDevice>,
+            "Evaluates operation to sync tensor from local memory to GPU memory with new anonymous Sequence")
+        .def("eval_tensor_sync_local_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncLocal>,
+            "Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with new anonymous Sequence")
+        .def("eval_algo_mult_def", &kp::Manager::evalOpDefault<kp::OpMult>,
+            "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with new anonymous Sequence")
+        .def("eval_algo_file_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::string>,
+            "Evaluates an operation using a custom shader provided from a shader path with new anonymous Sequence")
+        .def("eval_algo_data_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::vector<char>>,
+            "Evaluates an operation using a custom shader provided as raw string or spirv bytes with new anonymous Sequence")
+        .def("eval_algo_lro_def", &kp::Manager::evalOpDefault<kp::OpAlgoLhsRhsOut>,
+            "Evaluates operation to run left right out operation with custom shader with new anonymous Sequence")
         // eval
-        .def("eval_tensor_create", &kp::Manager::evalOp<kp::OpTensorCreate>)
-        .def("eval_tensor_copy", &kp::Manager::evalOp<kp::OpTensorCopy>)
-        .def("eval_tensor_sync_device", &kp::Manager::evalOp<kp::OpTensorSyncDevice>)
-        .def("eval_tensor_sync_local", &kp::Manager::evalOp<kp::OpTensorSyncLocal>)
-        .def("eval_algo_mult", &kp::Manager::evalOp<kp::OpMult>)
-        .def("eval_algo_file", &kp::Manager::evalOp<kp::OpAlgoBase, std::string>)
-        .def("eval_algo_data", &kp::Manager::evalOp<kp::OpAlgoBase, std::vector<char>>)
-        .def("eval_algo_lro", &kp::Manager::evalOp<kp::OpAlgoLhsRhsOut>)
+        .def("eval_tensor_create", &kp::Manager::evalOp<kp::OpTensorCreate>,
+            "Evaluates operation to create and initialise tensor GPU memory and buffer with explicitly named Sequence")
+        .def("eval_tensor_copy", &kp::Manager::evalOp<kp::OpTensorCopy>,
+            "Evaluates operation to copy one tensor to one or many tensors with explicitly named Sequence")
+        .def("eval_tensor_sync_device", &kp::Manager::evalOp<kp::OpTensorSyncDevice>,
+            "Evaluates operation to sync tensor from local memory to GPU memory with explicitly named Sequence")
+        .def("eval_tensor_sync_local", &kp::Manager::evalOp<kp::OpTensorSyncLocal>,
+            "Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence")
+        .def("eval_algo_mult", &kp::Manager::evalOp<kp::OpMult>,
+            "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
+        .def("eval_algo_file", &kp::Manager::evalOp<kp::OpAlgoBase, std::string>,
+            "Evaluates an operation using a custom shader provided from a shader path with explicitly named Sequence")
+        .def("eval_algo_data", &kp::Manager::evalOp<kp::OpAlgoBase, std::vector<char>>,
+            "Evaluates an operation using a custom shader provided as raw string or spirv bytes with explicitly named Sequence")
+        .def("eval_algo_lro", &kp::Manager::evalOp<kp::OpAlgoLhsRhsOut>,
+            "Evaluates operation to run left right out operation with custom shader with explicitly named Sequence")
         // eval async default
-        .def("eval_async_tensor_create_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCreate>)
-        .def("eval_async_tensor_copy_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCopy>)
-        .def("eval_async_tensor_sync_device_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncDevice>)
-        .def("eval_async_tensor_sync_local_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncLocal>)
-        .def("eval_async_algo_mult_def", &kp::Manager::evalOpAsyncDefault<kp::OpMult>)
-        .def("eval_async_algo_file_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::string>)
-        .def("eval_async_algo_data_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::vector<char>>)
-        .def("eval_async_algo_lro_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoLhsRhsOut>)
+        .def("eval_async_tensor_create_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCreate>,
+            "Evaluates asynchronously operation to create and initialise tensor GPU memory and buffer with anonymous Sequence")
+        .def("eval_async_tensor_copy_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCopy>,
+            "Evaluates asynchronously operation to copy one tensor to one or many tensors with anonymous Sequence")
+        .def("eval_async_tensor_sync_device_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncDevice>,
+            "Evaluates asynchronously operation to sync tensor from local memory to GPU memory with anonymous Sequence")
+        .def("eval_async_tensor_sync_local_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncLocal>,
+            "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with anonymous Sequence")
+        .def("eval_async_algo_mult_def", &kp::Manager::evalOpAsyncDefault<kp::OpMult>,
+            "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with anonymous Sequence")
+        .def("eval_async_algo_file_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::string>,
+            "Evaluates asynchronously an operation using a custom shader provided from a shader path with anonymous Sequence")
+        .def("eval_async_algo_data_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::vector<char>>,
+            "Evaluates asynchronously an operation using a custom shader provided as raw string or spirv bytes with anonymous Sequence")
+        .def("eval_async_algo_lro_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoLhsRhsOut>,
+            "Evaluates asynchronously operation to run left right out operation with custom shader with anonymous Sequence")
         // eval async
-        .def("eval_tensor_create", &kp::Manager::evalOpAsync<kp::OpTensorCreate>)
-        .def("eval_tensor_copy", &kp::Manager::evalOpAsync<kp::OpTensorCopy>)
-        .def("eval_tensor_sync_device", &kp::Manager::evalOpAsync<kp::OpTensorSyncDevice>)
-        .def("eval_tensor_sync_local", &kp::Manager::evalOpAsync<kp::OpTensorSyncLocal>)
-        .def("eval_algo_mult", &kp::Manager::evalOpAsync<kp::OpMult>)
-        .def("eval_algo_file", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::string>)
-        .def("eval_algo_data", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::vector<char>>)
-        .def("eval_algo_lro", &kp::Manager::evalOpAsync<kp::OpAlgoLhsRhsOut>);
-
+        .def("eval_async_tensor_create", &kp::Manager::evalOpAsync<kp::OpTensorCreate>,
+            "Evaluates asynchronously operation to create and initialise tensor GPU memory and buffer with explicitly named Sequence")
+        .def("eval_async_tensor_copy", &kp::Manager::evalOpAsync<kp::OpTensorCopy>,
+            "Evaluates asynchronously operation to copy one tensor to one or many tensors with explicitly named Sequence")
+        .def("eval_async_tensor_sync_device", &kp::Manager::evalOpAsync<kp::OpTensorSyncDevice>,
+            "Evaluates asynchronously operation to sync tensor from local memory to GPU memory with explicitly named Sequence")
+        .def("eval_async_tensor_sync_local", &kp::Manager::evalOpAsync<kp::OpTensorSyncLocal>,
+            "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence")
+        .def("eval_async_algo_mult", &kp::Manager::evalOpAsync<kp::OpMult>,
+            "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
+        .def("eval_async_algo_file", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::string>,
+            "Evaluates asynchronously an operation using a custom shader provided from a shader path with explicitly named Sequence")
+        .def("eval_async_algo_data", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::vector<char>>,
+            "Evaluates asynchronously an operation using a custom shader provided as raw string or spirv bytes with explicitly named Sequence")
+        .def("eval_async_algo_lro", &kp::Manager::evalOpAsync<kp::OpAlgoLhsRhsOut>,
+            "Evaluates asynchronously operation to run left right out operation with custom shader with explicitly named Sequence");
 
 #ifdef VERSION_INFO
     m.attr("__version__") = VERSION_INFO;

From 991cfdcbcc0f0de2fa51a792d541b3aba2370d90 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Mon, 2 Nov 2020 21:57:10 +0000
Subject: [PATCH 27/39] Updated to use kp instead of komputepy for module name

---
 docs/conf.py                     |  2 +-
 docs/overview/python-package.rst | 10 +++++-----
 python/src/main.cpp              |  2 +-
 python/test/test_kompute.py      |  2 +-
 setup.py                         |  6 +++---
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index 2daab8833..f1255e3d8 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -18,7 +18,7 @@
 # -- Project information -----------------------------------------------------
 import sys
 import os
-import komputepy
+import kp
 
 project = 'Vulkan Kompute'
 copyright = '2020, The Institute for Ethical AI & Machine Learning'
diff --git a/docs/overview/python-package.rst b/docs/overview/python-package.rst
index 92dc8ca38..0a8eb7a23 100644
--- a/docs/overview/python-package.rst
+++ b/docs/overview/python-package.rst
@@ -12,9 +12,9 @@ Below is a diagram that provides insights on the relationship between Vulkan Kom
 Manager
 -------
 
-The Kompute Manager provides a high level interface to simplify interaction with underlying :class:`komputepy.Sequence` of Operations.
+The Kompute Manager provides a high level interface to simplify interaction with underlying :class:`kp.Sequence` of Operations.
 
-.. autoclass:: komputepy.Manager
+.. autoclass:: kp.Manager
    :members:
 
 
@@ -23,7 +23,7 @@ Sequence
 
 The Kompute Sequence consists of batches of Kompute Operations, which are executed on a respective GPU queue. The execution of sequences can be synchronous or asynchronous, and it can be coordinated through its respective Vulkan Fence.
 
-.. autoclass:: komputepy.Sequence
+.. autoclass:: kp.Sequence
    :members:
 
 
@@ -32,13 +32,13 @@ Tensor
 
 The Kompute Tensor is the atomic unit in Kompute, and it is used primarily for handling Host and GPU Device data.
 
-.. autoclass:: komputepy.Tensor
+.. autoclass:: kp.Tensor
    :members:
 
 
 TensorType
 -------
 
-.. automodule:: komputepy
+.. automodule:: kp
    :members:
 
diff --git a/python/src/main.cpp b/python/src/main.cpp
index 34a0e6bbc..0f10ea349 100644
--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@@ -5,7 +5,7 @@
 
 namespace py = pybind11;
 
-PYBIND11_MODULE(komputepy, m) {
+PYBIND11_MODULE(kp, m) {
 
     py::enum_<kp::Tensor::TensorTypes>(m, "TensorTypes", "Enum with GPU memory types for Tensor.")
         .value("device", kp::Tensor::TensorTypes::eDevice, "Tensor holding data in GPU memory.")
diff --git a/python/test/test_kompute.py b/python/test/test_kompute.py
index 7b85de47b..43baf77d1 100644
--- a/python/test/test_kompute.py
+++ b/python/test/test_kompute.py
@@ -1,5 +1,5 @@
 
-from komputepy import Tensor, Manager, Sequence
+from kp import Tensor, Manager, Sequence
 
 def test_opmult():
     """
diff --git a/setup.py b/setup.py
index e09673a97..0b5db2f9c 100644
--- a/setup.py
+++ b/setup.py
@@ -65,12 +65,12 @@ class CMakeBuild(build_ext):
         subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
 
 setup(
-    name='komputepy',
+    name='kp',
     version='0.0.1',
     author='Alejandro Saucedo',
-    description='Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU processing usecases.',
+    description='Vulkan Kompute: Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU processing usecases.',
     long_description='',
-    ext_modules=[CMakeExtension('komputepy')],
+    ext_modules=[CMakeExtension('kp')],
     cmdclass=dict(build_ext=CMakeBuild),
     zip_safe=False,
 )

From 214a43ad59a51c7fc61d2001e1e6ff232624ab68 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Tue, 3 Nov 2020 08:00:20 +0000
Subject: [PATCH 28/39] Updated build to rename kp

---
 python/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 6ef7fde4b..5f3036986 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1,11 +1,11 @@
 
 add_subdirectory(pybind11)
-pybind11_add_module(komputepy src/main.cpp)
+pybind11_add_module(kp src/main.cpp)
 
 include_directories(
         ${PROJECT_SOURCE_DIR}/single_include/)
 
 target_link_libraries(
-    komputepy PRIVATE
+    kp PRIVATE
     kompute::kompute)
 

From 958bf3f3c97a57a88cc648e47e93d05fc2a2ff96 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Tue, 3 Nov 2020 08:00:38 +0000
Subject: [PATCH 29/39] Added python overview to documentation

---
 docs/index.rst                     |   3 +-
 docs/overview/python-package.rst   | 101 +++++++++++++++++++++--------
 docs/overview/python-reference.rst |  44 +++++++++++++
 3 files changed, 120 insertions(+), 28 deletions(-)
 create mode 100644 docs/overview/python-reference.rst

diff --git a/docs/index.rst b/docs/index.rst
index 57f1a1271..340b3458b 100755
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -12,13 +12,14 @@ Index
     :titlesonly:
 
     Simple & Advanced Examples <overview/advanced-examples>
+    Python Package Overview <overview/python-package>
     Asynchronous & Parallel Operations <overview/async-parallel>
     Memory Management Principles <overview/memory-management>
     Build System Deep Dive <overview/build-system>
     Converting GLSL/HLSL Shaders to C++ Headers <overview/shaders-to-headers>
     Mobile App Integration (Android) <overview/mobile-android>
     Game Engine Integration (Godot Engine) <overview/game-engine-godot>
-    Python Class Documentation & Reference <overview/python-package>
+    Python Class Documentation & Reference <overview/python-reference>
     C++ Class Documentation & Reference <overview/reference>
     Code Index <genindex>
 
diff --git a/docs/overview/python-package.rst b/docs/overview/python-package.rst
index 0a8eb7a23..74e0cba91 100644
--- a/docs/overview/python-package.rst
+++ b/docs/overview/python-package.rst
@@ -1,44 +1,91 @@
 
-
-Python Class Documentation & Reference
+Python Package Overview
 ========
 
-This section provides a breakdown of the Python classes and what each of their functions provide.
+This section provides an overview of the Python Package from a functionality perspective. If you wish to see all the classes and their respective functions you can find that in the :doc:`Python Class Reference Section <python-reference>`.
+
 Below is a diagram that provides insights on the relationship between Vulkan Kompute objects and Vulkan resources, which primarily encompass ownership of either CPU and/or GPU memory.
 
 .. image:: ../images/kompute-architecture.jpg
    :width: 70%
 
-Manager
--------
+Python Components
+^^^^^^^^^^^^^^^^^
 
-The Kompute Manager provides a high level interface to simplify interaction with underlying :class:`kp.Sequence` of Operations.
+The Python package exposes three main classes:
 
-.. autoclass:: kp.Manager
-   :members:
+* :class:`kp.Manager` - Manages all high level Vulkan and Kompute resources created
+* :class:`kp.Sequence` - Contains a set of recorded operations that can be reused
+* :class:`kp.Tensor` - Core data component to manage GPU and host data used in operations
+
+One thing that you will notice is that the class :class:`kp::OpBase` and all its relevant operator subclasses are not exposed in Python.
+
+This is primarily because the way to interact with the operations is through the respective :class:`kp.Manager` and :class:`kp.Sequence` functions.
+
+More specifically, it can be through the following functions:
+
+* mgr.eval_<opname> - Runs operation under an existing named sequence
+* mgr.eval_<opname>_def - Runs operation under a new anonymous sequence
+* mgr.eval_async_<opname> - Runs operation asynchronously under an existing named sequence
+* mgr.eval_async_<opname>_def - Runs operation asynchronously under a new anonymous sequence
+* seq.record_<opname> - Records operation in sequence (requires sequence to be in recording mode)
+
+You can see these operations being used in the `Simple Python example <https://kompute.cc/index.html#python-example-simple>`_ and in the `Extended Python Example <https://kompute.cc/index.html#python-example-extended>`_.
+
+Kompute Operation Capabilities
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Handling multiple capabilities of processing can be done by compute shaders being loaded into separate sequences. The example below shows how this can be done:
+
+.. code-block:: python
+    :linenos:
+
+    from kp import Manager
+    # We'll assume we have the shader data available
+    from my_spv_shader_data import mult_shader, sum_shader
+
+    mgr = Manager()
+
+    t1 = mgr.build_tensor([2,2,2])
+    t2 = mgr.build_tensor([1,2,3])
+    t3 = mgr.build_tensor([1,2,3])
+
+    # Create multiple separate sequences
+    sq_mult = mgr.create_sequence("SQ_MULT")
+    sq_sum = mgr.create_sequence("SQ_SUM")
+    sq_sync = mgr.create_sequence("SQ_SYNC")
+
+    # Initialize sq_mult
+    sq_mult.begin()
+    sq_mult.record_algo_data([t1, t2, t3], mult_shader)
+    sq_mult.end()
+
+    sq_sum.begin()
+    sq_sum.record_algo_data([t3, t2, t1], sum_shader)
+    sq_sum.end()
+
+    sq_sync.begin()
+    sq_sync.record_tensor_sync_local([t1, t3])
+    sq_sync.end()
+
+    # Run multiple iterations
+    for i in range(10):
+        sq_mult.eval()
+        sq_sum.eval()
+
+    sq_sync.eval()
+
+    print(t1.data(), t2.data(), t3.data())
 
 
-Sequence
--------
+Package Installation
+^^^^^^^^^^^^^^^^^^^^
 
-The Kompute Sequence consists of batches of Kompute Operations, which are executed on a respective GPU queue. The execution of sequences can be synchronous or asynchronous, and it can be coordinated through its respective Vulkan Fence.
+The package can be installed through the top level `setup.py` by running:
 
-.. autoclass:: kp.Sequence
-   :members:
+.. code-block:: bash
+
+    pip install .
 
 
-Tensor
--------
-
-The Kompute Tensor is the atomic unit in Kompute, and it is used primarily for handling Host and GPU Device data.
-
-.. autoclass:: kp.Tensor
-   :members:
-
-
-TensorType
--------
-
-.. automodule:: kp
-   :members:
 
diff --git a/docs/overview/python-reference.rst b/docs/overview/python-reference.rst
new file mode 100644
index 000000000..0a8eb7a23
--- /dev/null
+++ b/docs/overview/python-reference.rst
@@ -0,0 +1,44 @@
+
+
+Python Class Documentation & Reference
+======================================
+
+This section provides a breakdown of the Python classes and what each of their functions provide.
+Below is a diagram that provides insights on the relationship between Vulkan Kompute objects and Vulkan resources, which primarily encompass ownership of either CPU and/or GPU memory.
+
+.. image:: ../images/kompute-architecture.jpg
+   :width: 70%
+
+Manager
+-------
+
+The Kompute Manager provides a high level interface to simplify interaction with underlying :class:`kp.Sequence` of Operations.
+
+.. autoclass:: kp.Manager
+   :members:
+
+
+Sequence
+--------
+
+The Kompute Sequence consists of batches of Kompute Operations, which are executed on a respective GPU queue. The execution of sequences can be synchronous or asynchronous, and it can be coordinated through its respective Vulkan Fence.
+
+.. autoclass:: kp.Sequence
+   :members:
+
+
+Tensor
+-------
+
+The Kompute Tensor is the atomic unit in Kompute, and it is used primarily for handling Host and GPU Device data.
+
+.. autoclass:: kp.Tensor
+   :members:
+
+
+TensorType
+----------
+
+.. automodule:: kp
+   :members:
+

From e78b425f6662b50c6e6c62a099742811ef3f7396 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Tue, 3 Nov 2020 08:02:19 +0000
Subject: [PATCH 30/39] Added readme for python documentation

---
 README.md | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index f75f466bc..9e63007d4 100644
--- a/README.md
+++ b/README.md
@@ -257,7 +257,7 @@ You can also access the <a href="https://github.com/EthicalML/vulkan-kompute/tre
 
 ## Python Package
 
-Besides the C++ core SDK you can also use the Python package of Kompute, which exposes the same core functionality.
+Besides the C++ core SDK you can also use the Python package of Kompute, which exposes the same core functionality, and supports interoperability with Python objects like Lists, Numpy Arrays, etc.
 
 You can install from the repository by running:
 
@@ -265,15 +265,20 @@ You can install from the repository by running:
 pip install .
 ```
 
+For further details you can read the [Python Package documentation](https://kompute.cc/overview/python-package.html) or the [Python Class Reference documentation](https://kompute.cc/overview/python-reference.html).
+
+### Python Example (Simple)
+
 Then you can interact with it from your interpreter. Below is the same sample as above "Your First Kompute (Simple Version)" but in Python:
 
 ```python
+mgr = Manager()
+
+# Can be initialized with List[] or np.Array
 tensor_in_a = Tensor([2, 2, 2])
 tensor_in_b = Tensor([1, 2, 3])
 tensor_out = Tensor([0, 0, 0])
 
-mgr = Manager()
-
 mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
 
 shaderFilePath = "shaders/glsl/opmult.comp"
@@ -290,11 +295,14 @@ mgr.eval_tensor_sync_local_def([tensor_out])
 assert tensor_out.data() == [2.0, 4.0, 6.0]
 ```
 
+### Python Example (Extended)
+
 Similarly you can find the same extended example as above:
 
 ```python
 mgr = Manager(0, [2])
 
+# Can be initialized with List[] or np.Array
 tensor_in_a = Tensor([2, 2, 2])
 tensor_in_b = Tensor([1, 2, 3])
 tensor_out = Tensor([0, 0, 0])
@@ -319,6 +327,8 @@ seq.eval()
 assert tensor_out.data() == [2.0, 4.0, 6.0]
 ```
 
+For further details you can read the [Python Package documentation](https://kompute.cc/overview/python-package.html) or the [Python Class Reference documentation](https://kompute.cc/overview/python-reference.html).
+
 ## More examples
 
 ### Simple examples

From 0d33dc018d812cc1349e5140454eeec300b6dce0 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Tue, 3 Nov 2020 08:04:37 +0000
Subject: [PATCH 31/39] Updated version to 0.4.1

---
 CMakeLists.txt | 2 +-
 README.md      | 2 +-
 docs/conf.py   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6f1338b87..454876d4e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.4.1)
-project(kompute VERSION 0.4.2)
+project(kompute VERSION 0.4.1)
 
 set(CMAKE_CXX_STANDARD 14)
 
diff --git a/README.md b/README.md
index 9e63007d4..b745ccfbf 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 
-![GitHub](https://img.shields.io/badge/Version-0.4.0-green.svg)
+![GitHub](https://img.shields.io/badge/Version-0.4.1-green.svg)
 ![GitHub](https://img.shields.io/badge/C++-14—20-purple.svg)
 ![GitHub](https://img.shields.io/badge/Build-cmake-red.svg)
 ![GitHub](https://img.shields.io/badge/Python-3.5—3.8-blue.svg)
diff --git a/docs/conf.py b/docs/conf.py
index f1255e3d8..1771846e3 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -25,7 +25,7 @@ copyright = '2020, The Institute for Ethical AI & Machine Learning'
 author = 'Alejandro Saucedo'
 
 # The full version, including alpha/beta/rc tags
-release = '0.4.0'
+release = '0.4.1'
 
 
 # -- General configuration ---------------------------------------------------

From b636a80d069cf334403dbd3b9d7d0e8421284dca Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Tue, 3 Nov 2020 08:42:27 +0000
Subject: [PATCH 32/39] Updated array example

---
 examples/array_multiplication/CMakeLists.txt | 24 +++++++++++--
 examples/array_multiplication/README.md      | 38 +++-----------------
 examples/array_multiplication/src/Main.cpp   | 11 +++++-
 src/Manager.cpp                              |  0
 4 files changed, 36 insertions(+), 37 deletions(-)
 mode change 100644 => 100755 src/Manager.cpp

diff --git a/examples/array_multiplication/CMakeLists.txt b/examples/array_multiplication/CMakeLists.txt
index 5aeebb450..63c58a842 100644
--- a/examples/array_multiplication/CMakeLists.txt
+++ b/examples/array_multiplication/CMakeLists.txt
@@ -3,6 +3,7 @@ project(kompute_array_mult VERSION 0.1.0)
 
 set(CMAKE_CXX_STANDARD 14)
 
+option(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE "Enable if you prefer to use your installed Kompute library" 0)
 option(KOMPUTE_OPT_ENABLE_SPDLOG "Extra compile flags for Kompute, see docs for full list" 0)
 set(KOMPUTE_EXTRA_CXX_FLAGS "" CACHE STRING "Extra compile flags for Kompute, see docs for full list")
 
@@ -10,13 +11,30 @@ set(KOMPUTE_EXTRA_CXX_FLAGS "" CACHE STRING "Extra compile flags for Kompute, se
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DRELEASE=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
 
-find_package(kompute REQUIRED)
+if(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE)
+    find_package(kompute REQUIRED)
+else()
+    add_subdirectory(../../ ${CMAKE_CURRENT_BINARY_DIR}/kompute_build)
+endif()
+
 find_package(Vulkan REQUIRED)
 
+if(KOMPUTE_OPT_ENABLE_SPDLOG)
+    find_package(spdlog REQUIRED)
+endif()
+
 add_executable(kompute_array_mult
     src/Main.cpp)
 
 target_link_libraries(kompute_array_mult
     kompute::kompute
-    Vulkan::Vulkan
-)
+    Vulkan::Vulkan)
+
+include_directories(
+        ../../single_include/)
+
+if(KOMPUTE_OPT_ENABLE_SPDLOG)
+    target_link_libraries(kompute_array_mult
+        spdlog::spdlog)
+endif()
+
diff --git a/examples/array_multiplication/README.md b/examples/array_multiplication/README.md
index 9838b7217..2a1ab8ae1 100644
--- a/examples/array_multiplication/README.md
+++ b/examples/array_multiplication/README.md
@@ -25,35 +25,12 @@ For the other libraries, because they are optional you can just make sure you bu
 
 Alternatively you can use package managers such as vcpkg to help you install them, although to simplify things you can start without the dependencies first.
 
-## Set Up Vulkan Kompute Dependency
-
-You have multiple options to set up Vulkan Kompute. The easiest is to perform a local installation.
-
-For this, you will want to go to the main repo and run the following cmake command, which will configure it without SPDLOG by default.
-
-```
-cmake \
-    -Bbuild
-```
-
-You can pass the following optional parameters based on your desired configuration:
-* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_ENABLE_SPDLOG=1`.
-* If you wish to perform the installation on the local folder instead of in your system you can use `-DCMAKE_INSTALL_PREFIX="build/src/CMakeFiles/Export/"` which will basically ensure that the final files are created in the local directory.
-* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter 
-
-Then you can proceed to run the installation:
-
-* For Windows / Visual Studio you just have to build `INSTALL.vcxproj`
-* For Linux you can just run the `install` target via `make -C build install`
-
-You also have the option to build as `Release` or `Debug` - just make sure that you build your example with the same build/debug flags as required.
-
 ## Building the example
 
-Now that you've set up the dependencies / installation of Vulkan Kompute you can build this example.
-
 You will notice that it's a standalone project, so you can re-use it for your application.
 
+This project has the option to either import the Kompute dependency relative to the project or use your existing installation of Kompute.
+
 To build you just need to run the cmake command in this folder as follows:
 
 ```
@@ -61,14 +38,9 @@ cmake \
     -Bbuild
 ```
 
-Make sure to pass the required flags depending on the configuration above:
-* If you built with Debug make sure you build your example with Debug as well
-* If you installed in the local folder, make sure you pass the CMAKE_PREFIX_PATH pointing to the respective folder (e.g. `-DCMAKE_PREFIX_PATH=../../build/src/CMakeFiles/Export/lib/cmake/kompute/` if parent folder is main repo).
-* If you built Vulkan Kompute with spdlog enabled, make sure to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`
+You can pass the following optional parameters based on your desired configuration:
+* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`.
 * If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter 
+* If you wish to load shader from raw glsl string instead of spirv bytes you can use `-DKOMPUTE_ANDROID_SHADER_FROM_STRING`
 
-Now you just have to build your application as above:
-
-* For Windows / Visual Studio you just have to build and run `kompute_array_mult.vcxproj`
-* For Linux you can just run the `kompute_array_mult` target via `make -C build kompute_array_mult`
 
diff --git a/examples/array_multiplication/src/Main.cpp b/examples/array_multiplication/src/Main.cpp
index f3587cae8..14b58cba9 100755
--- a/examples/array_multiplication/src/Main.cpp
+++ b/examples/array_multiplication/src/Main.cpp
@@ -18,6 +18,7 @@ int main()
     auto tensorInB = mgr.buildTensor({ 0.0, 1.0, 2.0 });
     auto tensorOut = mgr.buildTensor({ 0.0, 0.0, 0.0 });
 
+#ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
     std::string shader(R"(
         // The version to use 
         #version 450
@@ -37,9 +38,17 @@ int main()
         }
       )");
 
-    mgr.evalOpDefault<kp::OpAlgoBase<>>(
+    mgr.evalOpDefault<kp::OpAlgoBase>(
             { tensorInA, tensorInB, tensorOut },
             std::vector<char>(shader.begin(), shader.end()));
+#else
+    mgr.evalOpDefault<kp::OpAlgoBase>(
+            { tensorInA, tensorInB, tensorOut },
+            std::vector<char>(
+            kp::shader_data::shaders_glsl_opmult_comp_spv,
+            kp::shader_data::shaders_glsl_opmult_comp_spv
+                + kp::shader_data::shaders_glsl_opmult_comp_spv_len));
+#endif
 
     mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorOut});
 
diff --git a/src/Manager.cpp b/src/Manager.cpp
old mode 100644
new mode 100755

From 5fbb4ce6f6c00fc72dfd2f91b016a72e2a374516 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Tue, 3 Nov 2020 08:59:32 +0000
Subject: [PATCH 33/39] Added function in sequence
 freeMemoryDestroyGPUResources to de-init

---
 single_include/kompute/Kompute.hpp |  5 ++
 src/Manager.cpp                    |  2 +-
 src/Sequence.cpp                   | 88 +++++++++++++++++-------------
 src/include/kompute/Sequence.hpp   |  5 ++
 4 files changed, 62 insertions(+), 38 deletions(-)

diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index 932375cd4..c1dfd8762 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -1100,6 +1100,11 @@ class Sequence
      */
     bool isInit();
 
+    /**
+     * Destroys and frees the GPU resources which include the buffer and memory.
+     */
+    void freeMemoryDestroyGPUResources();
+
     /**
      * Record function for operation to be added to the GPU queue in batch. This
      * template requires classes to be derived from the OpBase class. This
diff --git a/src/Manager.cpp b/src/Manager.cpp
index b763f2eb0..df9d64db6 100755
--- a/src/Manager.cpp
+++ b/src/Manager.cpp
@@ -63,7 +63,7 @@ Manager::~Manager()
                      "managed sequences");
         for (const std::pair<std::string, std::shared_ptr<Sequence>>& sqPair :
              this->mManagedSequences) {
-            sqPair.second->~Sequence();
+            sqPair.second->freeMemoryDestroyGPUResources();
         }
         this->mManagedSequences.clear();
     }
diff --git a/src/Sequence.cpp b/src/Sequence.cpp
index b27c547be..4f01891c4 100644
--- a/src/Sequence.cpp
+++ b/src/Sequence.cpp
@@ -28,46 +28,13 @@ Sequence::~Sequence()
     SPDLOG_DEBUG("Kompute Sequence Destructor started");
 
     if (!this->mIsInit) {
-        SPDLOG_WARN("Kompute Sequence destructor called but sequence is not "
-                    "initialized.");
+        SPDLOG_INFO("Kompute Sequence destructor called but sequence is not "
+                    "initialized so no need to removing GPU resources.");
         return;
     }
-
-    if (!this->mDevice) {
-        SPDLOG_ERROR(
-          "Kompute Sequence destructor reached with null Device pointer");
-        this->mIsInit = false;
-        return;
+    else {
+        this->freeMemoryDestroyGPUResources();
     }
-
-    if (this->mFreeCommandBuffer) {
-        SPDLOG_INFO("Freeing CommandBuffer");
-        if (!this->mCommandBuffer) {
-            SPDLOG_ERROR("Kompute Sequence destructor reached with null "
-                         "CommandPool pointer");
-            this->mIsInit = false;
-            return;
-        }
-        this->mDevice->freeCommandBuffers(
-          *this->mCommandPool, 1, this->mCommandBuffer.get());
-        SPDLOG_DEBUG("Kompute Sequence Freed CommandBuffer");
-    }
-
-    if (this->mFreeCommandPool) {
-        SPDLOG_INFO("Destroying CommandPool");
-        if (this->mCommandPool == nullptr) {
-            SPDLOG_ERROR("Kompute Sequence destructor reached with null "
-                         "CommandPool pointer");
-            this->mIsInit = false;
-            return;
-        }
-        this->mDevice->destroy(
-          *this->mCommandPool,
-          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-        SPDLOG_DEBUG("Kompute Sequence Destroyed CommandPool");
-    }
-
-    this->mIsInit = false;
 }
 
 void
@@ -234,6 +201,53 @@ Sequence::isInit()
     return this->mIsInit;
 }
 
+// Frees the command buffer / pool if mFreeCommandBuffer / mFreeCommandPool are set; clears mIsInit.
+void
+Sequence::freeMemoryDestroyGPUResources()
+{
+    if (!this->mIsInit) {
+        SPDLOG_ERROR("Kompute Sequence freeMemoryDestroyGPUResources called "
+            "but Sequence is not initialized so there's no relevant GPU resources.");
+        return;
+    }
+
+    if (!this->mDevice) {
+        SPDLOG_ERROR(
+          "Kompute Sequence freeMemoryDestroyGPUResources called with null Device pointer");
+        this->mIsInit = false;
+        return;
+    }
+
+    if (this->mFreeCommandBuffer) {
+        SPDLOG_INFO("Freeing CommandBuffer");
+        if (!this->mCommandBuffer) {
+            SPDLOG_ERROR("Kompute Sequence freeMemoryDestroyGPUResources called with null "
+                         "CommandBuffer pointer");
+            this->mIsInit = false;
+            return;
+        }
+        this->mDevice->freeCommandBuffers(
+          *this->mCommandPool, 1, this->mCommandBuffer.get());
+        SPDLOG_DEBUG("Kompute Sequence Freed CommandBuffer");
+    }
+
+    if (this->mFreeCommandPool) {
+        SPDLOG_INFO("Destroying CommandPool");
+        if (this->mCommandPool == nullptr) {
+            SPDLOG_ERROR("Kompute Sequence freeMemoryDestroyGPUResources called with null "
+                         "CommandPool pointer");
+            this->mIsInit = false;
+            return;
+        }
+        this->mDevice->destroy(
+          *this->mCommandPool,
+          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        SPDLOG_DEBUG("Kompute Sequence Destroyed CommandPool");
+    }
+
+    this->mIsInit = false;
+}
+
 void
 Sequence::createCommandPool()
 {
diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp
index 314de6657..09247fe3f 100644
--- a/src/include/kompute/Sequence.hpp
+++ b/src/include/kompute/Sequence.hpp
@@ -106,6 +106,11 @@ class Sequence
      */
     bool isInit();
 
+    /**
+     * Destroys and frees the GPU resources which include the buffer and memory.
+     */
+    void freeMemoryDestroyGPUResources();
+
     /**
      * Record function for operation to be added to the GPU queue in batch. This
      * template requires classes to be derived from the OpBase class. This

From 157b6592dd82a55d6cd728bf5eb2aef3b5c179bf Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Tue, 3 Nov 2020 09:00:03 +0000
Subject: [PATCH 34/39] Updated docstring

---
 src/include/kompute/Sequence.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp
index 09247fe3f..5d483c27a 100644
--- a/src/include/kompute/Sequence.hpp
+++ b/src/include/kompute/Sequence.hpp
@@ -107,7 +107,8 @@ class Sequence
     bool isInit();
 
     /**
-     * Destroys and frees the GPU resources which include the buffer and memory.
+     * Destroys and frees the GPU resources which include the buffer and memory
+     * and sets the sequence as init=False.
      */
     void freeMemoryDestroyGPUResources();
 

From 5822850ef2085330e89ce1bde8f6126fe9cfa6d1 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Tue, 3 Nov 2020 18:04:03 +0000
Subject: [PATCH 35/39] Updated logistic regression model

---
 examples/logistic_regression/CMakeLists.txt | 19 +++---
 examples/logistic_regression/README.md      | 65 +++++----------------
 examples/logistic_regression/src/Main.cpp   | 14 ++++-
 3 files changed, 37 insertions(+), 61 deletions(-)

diff --git a/examples/logistic_regression/CMakeLists.txt b/examples/logistic_regression/CMakeLists.txt
index b12e8227f..f918bbf21 100644
--- a/examples/logistic_regression/CMakeLists.txt
+++ b/examples/logistic_regression/CMakeLists.txt
@@ -3,6 +3,7 @@ project(kompute_linear_reg VERSION 0.1.0)
 
 set(CMAKE_CXX_STANDARD 14)
 
+option(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE "Enable if you prefer to use your installed Kompute library" 0)
 option(KOMPUTE_OPT_ENABLE_SPDLOG "Extra compile flags for Kompute, see docs for full list" 0)
 set(KOMPUTE_EXTRA_CXX_FLAGS "" CACHE STRING "Extra compile flags for Kompute, see docs for full list")
 
@@ -14,12 +15,16 @@ endif()
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DRELEASE=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
 
-find_package(kompute REQUIRED)
+if(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE)
+    find_package(kompute REQUIRED)
+else()
+    add_subdirectory(../../ ${CMAKE_CURRENT_BINARY_DIR}/kompute_build)
+endif()
+
 find_package(Vulkan REQUIRED)
 
 if(KOMPUTE_OPT_ENABLE_SPDLOG)
     find_package(spdlog REQUIRED)
-    find_package(fmt REQUIRED)
 endif()
 
 add_executable(kompute_linear_reg
@@ -30,11 +35,11 @@ target_link_libraries(kompute_linear_reg
     Vulkan::Vulkan
 )
 
+include_directories(
+        ../../single_include/)
+
 if(KOMPUTE_OPT_ENABLE_SPDLOG)
-    target_link_libraries(kompute_linear_reg
-        kompute::kompute
-        fmt::fmt
-        spdlog::spdlog
-    )
+    target_link_libraries(kompute_linear_reg
+        spdlog::spdlog)
 endif()
 
diff --git a/examples/logistic_regression/README.md b/examples/logistic_regression/README.md
index 29aa89c01..0de7ee30a 100644
--- a/examples/logistic_regression/README.md
+++ b/examples/logistic_regression/README.md
@@ -6,54 +6,12 @@ This example is structured such that you will be able to extend it for your proj
 
 It contains a cmake build configuration that can be used in your production applications.
 
-## Pre-requisites
-
-In order to run this example, you will need the following dependencies:
-
-* REQUIRED
-    + Vulkan Kompute library must be accessible
-    + The Vulkan SDK must be installed
-* OPTIONAL
-    + SPDLOG - for logging
-    + FMT - for text formatting
-
-We will cover how you can install Vulkan Kompute in the next section.
-
-For the Vulkan SDK, the simplest way to install it is through [their website](https://vulkan.lunarg.com/sdk/home). You just have to follow the instructions for the relevant platform.
-
-For the other libraries, because they are optional you can just make sure you build and install Kompute with these disabled (this will be covered in more detail below).
-
-Alternatively you can use package managers such as vcpkg to help you install them, although to simplify things you can start without the dependencies first.
-
-## Set Up Vulkan Kompute Dependency
-
-You have multiple options to set up Vulkan Kompute. The easiest is to perform a local installation.
-
-For this, you will want to go to the main repo and run the following cmake command, which will configure it without SPDLOG by default.
-
-```
-cmake \
-    -Bbuild
-```
-
-You can pass the following optional parameters based on your desired configuration:
-* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_ENABLE_SPDLOG=1`.
-* If you wish to perform the installation on the local folder instead of in your system you can use `-DCMAKE_INSTALL_PREFIX="build/src/CMakeFiles/Export/"` which will basically ensure that the final files are created in the local directory.
-* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter 
-
-Then you can proceed to run the installation:
-
-* For Windows / Visual Studio you just have to build `INSTALL.vcxproj`
-* For Linux you can just run the `install` target via `make -C build install`
-
-You also have the option to build as `Release` or `Debug` - just make sure that you build your example with the same build/debug flags as required.
-
 ## Building the example
 
-Now that you've set up the dependencies / installation of Vulkan Kompute you can build this example.
-
 You will notice that it's a standalone project, so you can re-use it for your application.
 
+This project has the option to either import the Kompute dependency relative to the project or use your existing installation of Kompute.
+
 To build you just need to run the cmake command in this folder as follows:
 
 ```
@@ -61,14 +19,19 @@ cmake \
     -Bbuild
 ```
 
-Make sure to pass the required flags depending on the configuration above:
-* If you built with Debug make sure you build your example with Debug as well
-* If you installed in the local folder, make sure you pass the CMAKE_PREFIX_PATH pointing to the respective folder (e.g. `-DCMAKE_PREFIX_PATH=../../build/src/CMakeFiles/Export/lib/cmake/kompute/` if parent folder is main repo).
-* If you built Vulkan Kompute with spdlog enabled, make sure to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`
+You can pass the following optional parameters based on your desired configuration:
+* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`.
 * If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter 
+* If you wish to load shader from raw glsl string instead of spirv bytes you can use `-DKOMPUTE_ANDROID_SHADER_FROM_STRING`
 
-Now you just have to build your application as above:
+## Pre-requisites
 
-* For Windows / Visual Studio you just have to build and run `kompute_linear_reg.vcxproj`
-* For Linux you can just run the `kompute_linear_reg` target via `make -C build kompute_linear_reg`
+In order to run this example, you will need the following dependencies:
+
+* REQUIRED
+    + The Vulkan SDK must be installed
+* OPTIONAL
+    + Vulkan Kompute library must be accessible (by default it uses the source directory)
+    + SPDLOG - for logging
+    + FMT - for text formatting
 
diff --git a/examples/logistic_regression/src/Main.cpp b/examples/logistic_regression/src/Main.cpp
index 853fa9d67..d3b8b3557 100755
--- a/examples/logistic_regression/src/Main.cpp
+++ b/examples/logistic_regression/src/Main.cpp
@@ -36,22 +36,30 @@ int main()
     kp::Manager mgr;
 
     std::shared_ptr<kp::Sequence> sqTensor =
-      mgr.createManagedSequence().lock();
+      mgr.createManagedSequence();
 
     sqTensor->begin();
     sqTensor->record<kp::OpTensorCreate>(params);
     sqTensor->end();
     sqTensor->eval();
 
-    std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+    std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
 
     // Record op algo base
     sq->begin();
 
     sq->record<kp::OpTensorSyncDevice>({ wIn, bIn });
 
-    sq->record<kp::OpAlgoBase<>>(
+#ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
+    sq->record<kp::OpAlgoBase>(
       params, "shaders/glsl/logistic_regression.comp");
+#else
+    sq->record<kp::OpAlgoBase>(
+        params, std::vector<char>(
+                kp::shader_data::shaders_glsl_logisticregression_comp_spv,
+                kp::shader_data::shaders_glsl_logisticregression_comp_spv
+                    + kp::shader_data::shaders_glsl_logisticregression_comp_spv_len));
+#endif
 
     sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
 

From 8285f2f878222893bdd4d07ad86d2079685ffd99 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Tue, 3 Nov 2020 18:04:29 +0000
Subject: [PATCH 36/39] Updated logistic regression model

---
 .../kompute_summator/KomputeSummatorNode.h    |  2 +-
 .../gdnative_shared/src/KomputeSummator.cpp   | 22 +++++--------------
 .../gdnative_shared/src/KomputeSummator.hpp   |  2 +-
 .../kompute_model_ml/KomputeModelMLNode.cpp   |  8 +++----
 .../gdnative_shared/src/KomputeModelML.cpp    |  8 +++----
 5 files changed, 16 insertions(+), 26 deletions(-)

diff --git a/examples/godot_examples/custom_module/kompute_summator/KomputeSummatorNode.h b/examples/godot_examples/custom_module/kompute_summator/KomputeSummatorNode.h
index 5bc201a90..1d94da9a5 100644
--- a/examples/godot_examples/custom_module/kompute_summator/KomputeSummatorNode.h
+++ b/examples/godot_examples/custom_module/kompute_summator/KomputeSummatorNode.h
@@ -24,7 +24,7 @@ protected:
 
 private:
     kp::Manager mManager;
-    std::weak_ptr<kp::Sequence> mSequence;
+    std::shared_ptr<kp::Sequence> mSequence;
     std::shared_ptr<kp::Tensor> mPrimaryTensor;
     std::shared_ptr<kp::Tensor> mSecondaryTensor;
 };
diff --git a/examples/godot_examples/gdnative_shared/src/KomputeSummator.cpp b/examples/godot_examples/gdnative_shared/src/KomputeSummator.cpp
index f64e0d088..788486e82 100644
--- a/examples/godot_examples/gdnative_shared/src/KomputeSummator.cpp
+++ b/examples/godot_examples/gdnative_shared/src/KomputeSummator.cpp
@@ -16,12 +16,7 @@ void KomputeSummator::add(float value) {
     // Set the new data in the local device
     this->mSecondaryTensor->setData({value});
     // Execute recorded sequence
-    if (std::shared_ptr<kp::Sequence> sq = this->mSequence.lock()) {
-        sq->eval();
-    }
-    else {
-        throw std::runtime_error("Sequence pointer no longer available");
-    }
+    this->mSequence->eval();
 }
 
 void KomputeSummator::reset() {
@@ -38,9 +33,7 @@ void KomputeSummator::_init() {
     this->mSequence = this->mManager.getOrCreateManagedSequence("AdditionSeq");
 
     // We now record the steps in the sequence
-    if (std::shared_ptr<kp::Sequence> sq = this->mSequence.lock())
     {
-
         std::string shader(R"(
             #version 450
 
@@ -55,26 +48,23 @@ void KomputeSummator::_init() {
             }
         )");
 
-        sq->begin();
+        this->mSequence->begin();
 
         // First we ensure secondary tensor loads to GPU
         // No need to sync the primary tensor as it should not be changed
-        sq->record<kp::OpTensorSyncDevice>(
+        this->mSequence->record<kp::OpTensorSyncDevice>(
                 { this->mSecondaryTensor });
 
         // Then we run the operation with both tensors
-        sq->record<kp::OpAlgoBase<>>(
+        this->mSequence->record<kp::OpAlgoBase>(
             { this->mPrimaryTensor, this->mSecondaryTensor }, 
             std::vector<char>(shader.begin(), shader.end()));
 
         // We map the result back to local 
-        sq->record<kp::OpTensorSyncLocal>(
+        this->mSequence->record<kp::OpTensorSyncLocal>(
                 { this->mPrimaryTensor });
 
-        sq->end();
-    }
-    else {
-        throw std::runtime_error("Sequence pointer no longer available");
+        this->mSequence->end();
     }
 }
 
diff --git a/examples/godot_examples/gdnative_shared/src/KomputeSummator.hpp b/examples/godot_examples/gdnative_shared/src/KomputeSummator.hpp
index 9131e7f57..7f6b42e82 100644
--- a/examples/godot_examples/gdnative_shared/src/KomputeSummator.hpp
+++ b/examples/godot_examples/gdnative_shared/src/KomputeSummator.hpp
@@ -26,7 +26,7 @@ public:
 
 private:
     kp::Manager mManager;
-    std::weak_ptr<kp::Sequence> mSequence;
+    std::shared_ptr<kp::Sequence> mSequence;
     std::shared_ptr<kp::Tensor> mPrimaryTensor;
     std::shared_ptr<kp::Tensor> mSecondaryTensor;
 };
diff --git a/examples/godot_logistic_regression/custom_module/kompute_model_ml/KomputeModelMLNode.cpp b/examples/godot_logistic_regression/custom_module/kompute_model_ml/KomputeModelMLNode.cpp
index fe0a911a5..f583d910f 100644
--- a/examples/godot_logistic_regression/custom_module/kompute_model_ml/KomputeModelMLNode.cpp
+++ b/examples/godot_logistic_regression/custom_module/kompute_model_ml/KomputeModelMLNode.cpp
@@ -51,14 +51,14 @@ void KomputeModelMLNode::train(Array yArr, Array xIArr, Array xJArr) {
         kp::Manager mgr;
 
             std::shared_ptr<kp::Sequence> sqTensor =
-              mgr.createManagedSequence().lock();
+              mgr.createManagedSequence();
 
             sqTensor->begin();
             sqTensor->record<kp::OpTensorCreate>(params);
             sqTensor->end();
             sqTensor->eval();
 
-            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
 
             // Record op algo base
             sq->begin();
@@ -67,11 +67,11 @@ void KomputeModelMLNode::train(Array yArr, Array xIArr, Array xJArr) {
 
 #ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
             // Newer versions of Android are able to use shaderc to read raw string
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                     params, std::vector<char>(LR_SHADER.begin(), LR_SHADER.end()));
 #else
             // Older versions of Android require the SPIRV binary directly
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                     params, std::vector<char>(
                             kp::shader_data::shaders_glsl_logisticregression_comp_spv,
                             kp::shader_data::shaders_glsl_logisticregression_comp_spv
diff --git a/examples/godot_logistic_regression/gdnative_shared/src/KomputeModelML.cpp b/examples/godot_logistic_regression/gdnative_shared/src/KomputeModelML.cpp
index 174398501..4135e83ed 100644
--- a/examples/godot_logistic_regression/gdnative_shared/src/KomputeModelML.cpp
+++ b/examples/godot_logistic_regression/gdnative_shared/src/KomputeModelML.cpp
@@ -56,14 +56,14 @@ void KomputeModelML::train(Array yArr, Array xIArr, Array xJArr) {
 
         {
             std::shared_ptr<kp::Sequence> sqTensor =
-              mgr.createManagedSequence().lock();
+              mgr.createManagedSequence();
 
             sqTensor->begin();
             sqTensor->record<kp::OpTensorCreate>(params);
             sqTensor->end();
             sqTensor->eval();
 
-            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
 
             // Record op algo base
             sq->begin();
@@ -72,11 +72,11 @@ void KomputeModelML::train(Array yArr, Array xIArr, Array xJArr) {
 
 #ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
             // Newer versions of Android are able to use shaderc to read raw string
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                     params, std::vector<char>(LR_SHADER.begin(), LR_SHADER.end()));
 #else
             // Older versions of Android require the SPIRV binary directly
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                     params, std::vector<char>(
                             kp::shader_data::shaders_glsl_logisticregression_comp_spv,
                             kp::shader_data::shaders_glsl_logisticregression_comp_spv

From 88df1b312da492330f3a3d752aeb7e1bf5b79c53 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Tue, 3 Nov 2020 18:04:39 +0000
Subject: [PATCH 37/39] Updated logistic regression model

---
 .../android-simple/app/src/main/cpp/KomputeModelML.cpp    | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp b/examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp
index a7a18c849..e22f2aa00 100755
--- a/examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp
+++ b/examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp
@@ -44,14 +44,14 @@ void KomputeModelML::train(std::vector<float> yData, std::vector<float> xIData,
         {
 
             std::shared_ptr<kp::Sequence> sqTensor =
-              mgr.createManagedSequence().lock();
+              mgr.createManagedSequence();
 
             sqTensor->begin();
             sqTensor->record<kp::OpTensorCreate>(params);
             sqTensor->end();
             sqTensor->eval();
 
-            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
+            std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
 
             // Record op algo base
             sq->begin();
@@ -60,11 +60,11 @@ void KomputeModelML::train(std::vector<float> yData, std::vector<float> xIData,
 
 #ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
             // Newer versions of Android are able to use shaderc to read raw string
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                     params, std::vector<char>(LR_SHADER.begin(), LR_SHADER.end()));
 #else
             // Older versions of Android require the SPIRV binary directly
-            sq->record<kp::OpAlgoBase<>>(
+            sq->record<kp::OpAlgoBase>(
                     params, std::vector<char>(
                             kp::shader_data::shaders_glsl_logisticregression_comp_spv,
                             kp::shader_data::shaders_glsl_logisticregression_comp_spv

From 53e1a3aa54c467a4be4b93e125fe5bd54b943dd9 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Tue, 3 Nov 2020 18:04:54 +0000
Subject: [PATCH 38/39] Updated array example

---
 examples/array_multiplication/CMakeLists.txt |  4 ++
 examples/array_multiplication/README.md      | 39 ++++++++++----------
 2 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/examples/array_multiplication/CMakeLists.txt b/examples/array_multiplication/CMakeLists.txt
index 63c58a842..0b648382e 100644
--- a/examples/array_multiplication/CMakeLists.txt
+++ b/examples/array_multiplication/CMakeLists.txt
@@ -7,6 +7,10 @@ option(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE "Enable if you prefer to use your insta
 option(KOMPUTE_OPT_ENABLE_SPDLOG "Extra compile flags for Kompute, see docs for full list" 0)
 set(KOMPUTE_EXTRA_CXX_FLAGS "" CACHE STRING "Extra compile flags for Kompute, see docs for full list")
 
+if(KOMPUTE_OPT_ENABLE_SPDLOG)
+    set(KOMPUTE_EXTRA_CXX_FLAGS "${KOMPUTE_EXTRA_CXX_FLAGS} -DKOMPUTE_ENABLE_SPDLOG=1")
+endif()
+
 # It is necessary to pass the DEBUG or RELEASE flag accordingly to Kompute
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DRELEASE=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
diff --git a/examples/array_multiplication/README.md b/examples/array_multiplication/README.md
index 2a1ab8ae1..931c7d639 100644
--- a/examples/array_multiplication/README.md
+++ b/examples/array_multiplication/README.md
@@ -6,25 +6,6 @@ This example is structured such that you will be able to extend it for your proj
 
 It contains a cmake build configuration that can be used in your production applications.
 
-## Pre-requisites
-
-In order to run this example, you will need the following dependencies:
-
-* REQUIRED
-    + Vulkan Kompute library must be accessible
-    + The Vulkan SDK must be installed
-* OPTIONAL
-    + SPDLOG - for logging
-    + FMT - for text formatting
-
-We will cover how you can install Vulkan Kompute in the next section.
-
-For the Vulkan SDK, the simplest way to install it is through [their website](https://vulkan.lunarg.com/sdk/home). You just have to follow the instructions for the relevant platform.
-
-For the other libraries, because they are optional you can just make sure you build and install Kompute with these disabled (this will be covered in more detail below).
-
-Alternatively you can use package managers such as vcpkg to help you install them, although to simplify things you can start without the dependencies first.
-
 ## Building the example
 
 You will notice that it's a standalone project, so you can re-use it for your application.
@@ -43,4 +24,24 @@ You can pass the following optional parameters based on your desired configurati
 * If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter 
 * If you wish to load shader from raw glsl string instead of spirv bytes you can use `-DKOMPUTE_ANDROID_SHADER_FROM_STRING`
 
+## Pre-requisites
+
+In order to run this example, you will need the following dependencies:
+
+* REQUIRED
+    + The Vulkan SDK must be installed
+* OPTIONAL
+    + An installed Vulkan Kompute library (by default the example uses the Kompute source directory instead)
+    + SPDLOG - for logging
+    + FMT - for text formatting
+
+How to build the example against Vulkan Kompute is covered in the "Building the example" section above.
+
+For the Vulkan SDK, the simplest way to install it is through [their website](https://vulkan.lunarg.com/sdk/home). You just have to follow the instructions for the relevant platform.
+
+For the other libraries, because they are optional, you can simply build and install Kompute with them disabled (see the optional build parameters described above).
+
+Alternatively you can use package managers such as vcpkg to help you install them, although to simplify things you can start without the dependencies first.
+
+
 

From 3811ef2dba4c8453b893b8570867f7b94779c8ff Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Tue, 3 Nov 2020 18:05:37 +0000
Subject: [PATCH 39/39] Updated docstrings

---
 single_include/kompute/Kompute.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index c1dfd8762..3ae98b483 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -1101,7 +1101,8 @@ class Sequence
     bool isInit();
 
     /**
-     * Destroys and frees the GPU resources which include the buffer and memory.
+     * Destroys and frees the GPU resources, which include the buffer and
+     * memory, and marks the sequence as no longer initialized.
      */
     void freeMemoryDestroyGPUResources();