From 5133ffe5488aac3fb9e381f21a8ce47044fac79c Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 7 Nov 2020 16:52:56 +0000
Subject: [PATCH 01/11] Added automated generated documentation

---
 python/src/docstrings.hpp | 846 ++++++++++++++++++++++++++++++++++++++
 python/src/main.cpp       |  10 +-
 2 files changed, 852 insertions(+), 4 deletions(-)
 create mode 100644 python/src/docstrings.hpp

diff --git a/python/src/docstrings.hpp b/python/src/docstrings.hpp
new file mode 100644
index 000000000..37f3ff785
--- /dev/null
+++ b/python/src/docstrings.hpp
@@ -0,0 +1,846 @@
+/*
+  This file contains docstrings for use in the Python bindings.
+  Do not edit! They were automatically extracted by pybind11_mkdoc.
+ */
+// DOC(a, b, ...) counts its 1-7 name parts and pastes them into the identifier __doc_a_b_... (e.g. DOC(kp, Manager) -> __doc_kp_Manager).
+#define __EXPAND(x)                                      x
+#define __COUNT(_1, _2, _3, _4, _5, _6, _7, COUNT, ...)  COUNT
+#define __VA_SIZE(...)                                   __EXPAND(__COUNT(__VA_ARGS__, 7, 6, 5, 4, 3, 2, 1))
+#define __CAT1(a, b)                                     a ## b
+#define __CAT2(a, b)                                     __CAT1(a, b)
+#define __DOC1(n1)                                       __doc_##n1
+#define __DOC2(n1, n2)                                   __doc_##n1##_##n2
+#define __DOC3(n1, n2, n3)                               __doc_##n1##_##n2##_##n3
+#define __DOC4(n1, n2, n3, n4)                           __doc_##n1##_##n2##_##n3##_##n4
+#define __DOC5(n1, n2, n3, n4, n5)                       __doc_##n1##_##n2##_##n3##_##n4##_##n5
+#define __DOC6(n1, n2, n3, n4, n5, n6)                   __doc_##n1##_##n2##_##n3##_##n4##_##n5##_##n6
+#define __DOC7(n1, n2, n3, n4, n5, n6, n7)               __doc_##n1##_##n2##_##n3##_##n4##_##n5##_##n6##_##n7
+#define DOC(...)                                         __EXPAND(__EXPAND(__CAT2(__DOC, __VA_SIZE(__VA_ARGS__)))(__VA_ARGS__))
+// Under GNU-compatible C++ compilers (__GNUG__), silence -Wunused-variable for the docstring constants below; NOTE(review): the matching "diagnostic pop" is presumably at the end of the file - confirm.
+#if defined(__GNUG__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#endif
+
+
+static const char *__doc_kp_Algorithm =
+R"doc(Abstraction for compute shaders that are run on top of tensors grouped
+via ParameterGroups (which group descriptorsets))doc";
+
+static const char *__doc_kp_Algorithm_Algorithm =
+R"doc(Base constructor for Algorithm. Should not be used unless explicitly
+intended.)doc";
+
+static const char *__doc_kp_Algorithm_Algorithm_2 =
+R"doc(Default constructor for Algorithm
+
+@param device The Vulkan device to use for creating resources @param
+commandBuffer The vulkan command buffer to bind the pipeline and
+shaders)doc";
+
+static const char *__doc_kp_Algorithm_createDescriptorPool = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_createParameters = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_createPipeline = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_createShaderModule = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_init =
+R"doc(Initialiser for the shader data provided to the algorithm as well as
+tensor parameters that will be used in shader.
+
+@param shaderFileData The bytes in spir-v format of the shader
+@param tensorParams The Tensors to be used in the Algorithm / shader for
+processing)doc";
+
+static const char *__doc_kp_Algorithm_mCommandBuffer = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_mDescriptorPool = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_mDescriptorSet = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_mDescriptorSetLayout = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_mDevice = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_mFreeDescriptorPool = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_mFreeDescriptorSet = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_mFreeDescriptorSetLayout = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_mFreePipeline = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_mFreePipelineCache = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_mFreePipelineLayout = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_mFreeShaderModule = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_mPipeline = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_mPipelineCache = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_mPipelineLayout = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_mShaderModule = R"doc()doc";
+
+static const char *__doc_kp_Algorithm_recordDispatch =
+R"doc(Records the dispatch function with the provided template parameters or
+alternatively using the size of the tensor by default.
+
+@param x Layout X dispatch value @param y Layout Y dispatch value
+@param z Layout Z dispatch value)doc";
+
+static const char *__doc_kp_Manager =
+R"doc(Base orchestrator which creates and manages device and child
+components)doc";
+
+static const char *__doc_kp_Manager_Manager =
+R"doc(Base constructor and default used which creates the base resources
+including choosing the device 0 by default.)doc";
+
+static const char *__doc_kp_Manager_Manager_2 =
+R"doc(Similar to base constructor but allows the user to provide the device
+they would like to create the resources on.
+
+@param physicalDeviceIndex The index of the physical device to use
+@param familyQueueIndices (Optional) List of queue indices to add for
+explicit allocation @param totalQueues The total number of compute
+queues to create.)doc";
+
+static const char *__doc_kp_Manager_Manager_3 =
+R"doc(Manager constructor which allows your own vulkan application to
+integrate with the vulkan kompute use.
+
+@param instance Vulkan compute instance to base this application
+@param physicalDevice Vulkan physical device to use for application
+@param device Vulkan logical device to use for all base resources
+@param physicalDeviceIndex Index for vulkan physical device used)doc";
+
+static const char *__doc_kp_Manager_buildTensor =
+R"doc(Function that simplifies the common workflow of tensor creation and
+initialization. It will take the constructor parameters for a Tensor
+and will use them to create a new Tensor and then initialize it using
+the OpTensorCreate command.
+
+@param data The data to initialize the tensor with @param tensorType
+The type of tensor to initialize @returns Initialized Tensor with
+memory synced to GPU device)doc";
+
+static const char *__doc_kp_Manager_createDevice = R"doc()doc";
+
+static const char *__doc_kp_Manager_createInstance = R"doc()doc";
+
+static const char *__doc_kp_Manager_createManagedSequence =
+R"doc(Create a new managed Kompute sequence so it's available within the
+manager.
+
+@param sequenceName The name for the named sequence to be created, if
+empty then default indexed value is used @param queueIndex The queue
+to use from the available queues @return Weak pointer to the manager
+owned sequence resource)doc";
+
+static const char *__doc_kp_Manager_evalOp =
+R"doc(Function that evaluates operation against named sequence.
+
+@param tensors The tensors to be used in the operation recorded @param
+sequenceName The name of the sequence to be retrieved or created
+@param TArgs Template parameters that will be used to initialise
+Operation to allow for extensible configurations on initialisation)doc";
+
+static const char *__doc_kp_Manager_evalOpAsync =
+R"doc(Function that evaluates operation against named sequence
+asynchronously.
+
+@param tensors The tensors to be used in the operation recorded @param
+sequenceName The name of the sequence to be retrieved or created
+@param params Template parameters that will be used to initialise
+Operation to allow for extensible configurations on initialisation)doc";
+
+static const char *__doc_kp_Manager_evalOpAsyncDefault =
+R"doc(Operation that evaluates operation against default sequence
+asynchronously.
+
+@param tensors The tensors to be used in the operation recorded @param
+params Template parameters that will be used to initialise Operation
+to allow for extensible configurations on initialisation)doc";
+
+static const char *__doc_kp_Manager_evalOpAwait =
+R"doc(Operation that awaits for named sequence to finish.
+
+@param sequenceName The name of the sequence to wait for termination
+@param waitFor The amount of time to wait before timing out)doc";
+
+static const char *__doc_kp_Manager_evalOpAwaitDefault =
+R"doc(Operation that awaits for default sequence to finish.
+
+@param tensors The tensors to be used in the operation recorded @param
+params Template parameters that will be used to initialise Operation
+to allow for extensible configurations on initialisation)doc";
+
+static const char *__doc_kp_Manager_evalOpDefault =
+R"doc(Function that evaluates operation against a newly created sequence.
+
+@param tensors The tensors to be used in the operation recorded @param
+TArgs Template parameters that will be used to initialise Operation to
+allow for extensible configurations on initialisation)doc";
+
+static const char *__doc_kp_Manager_getOrCreateManagedSequence =
+R"doc(Get or create a managed Sequence that will be contained by this
+manager. If the named sequence does not currently exist, it would be
+created and initialised.
+
+@param sequenceName The name for the named sequence to be retrieved or
+created @return Shared pointer to the manager owned sequence resource)doc";
+
+static const char *__doc_kp_Manager_mComputeQueueFamilyIndices = R"doc()doc";
+
+static const char *__doc_kp_Manager_mComputeQueues = R"doc()doc";
+
+static const char *__doc_kp_Manager_mCurrentSequenceIndex = R"doc()doc";
+
+static const char *__doc_kp_Manager_mDevice = R"doc()doc";
+
+static const char *__doc_kp_Manager_mFreeDevice = R"doc()doc";
+
+static const char *__doc_kp_Manager_mFreeInstance = R"doc()doc";
+
+static const char *__doc_kp_Manager_mInstance = R"doc()doc";
+
+static const char *__doc_kp_Manager_mManagedSequences = R"doc()doc";
+
+static const char *__doc_kp_Manager_mPhysicalDevice = R"doc()doc";
+
+static const char *__doc_kp_Manager_mPhysicalDeviceIndex = R"doc()doc";
+
+static const char *__doc_kp_OpAlgoBase =
+R"doc(Operation that provides a general abstraction that simplifies the use
+of algorithm and parameter components which can be used with shaders.
+By default it enables the user to provide a dynamic number of tensors
+which are then passed as inputs.)doc";
+
+static const char *__doc_kp_OpAlgoBase_KomputeWorkgroup = R"doc()doc";
+
+static const char *__doc_kp_OpAlgoBase_KomputeWorkgroup_x = R"doc()doc";
+
+static const char *__doc_kp_OpAlgoBase_KomputeWorkgroup_y = R"doc()doc";
+
+static const char *__doc_kp_OpAlgoBase_KomputeWorkgroup_z = R"doc()doc";
+
+static const char *__doc_kp_OpAlgoBase_OpAlgoBase = R"doc(Base constructor, should not be used unless explicitly intended.)doc";
+
+static const char *__doc_kp_OpAlgoBase_OpAlgoBase_2 =
+R"doc(Default constructor with parameters that provides the bare minimum
+requirements for the operations to be able to create and manage their
+sub-components.
+
+@param physicalDevice Vulkan physical device used to find device
+queues @param device Vulkan logical device for passing to Algorithm
+@param commandBuffer Vulkan Command Buffer to record commands into
+@param tensors Tensors that are to be used in this operation @param
+shaderFilePath Optional parameter to specify the shader to load
+(either in spirv or raw format) @param komputeWorkgroup Optional
+parameter to specify the layout for processing)doc";
+
+static const char *__doc_kp_OpAlgoBase_OpAlgoBase_3 =
+R"doc(Constructor that enables a file to be passed to the operation with the
+contents of the shader. This can be either in raw format or in
+compiled SPIR-V binary format.
+
+@param physicalDevice Vulkan physical device used to find device
+queues @param device Vulkan logical device for passing to Algorithm
+@param commandBuffer Vulkan Command Buffer to record commands into
+@param tensors Tensors that are to be used in this operation @param
+shaderFilePath Parameter to specify the shader to load (either in
+spirv or raw format) @param komputeWorkgroup Optional parameter to
+specify the layout for processing)doc";
+
+static const char *__doc_kp_OpAlgoBase_OpAlgoBase_4 =
+R"doc(Constructor that enables raw shader data to be passed to the main
+operation which can be either in raw shader glsl code or in compiled
+SPIR-V binary.
+
+@param physicalDevice Vulkan physical device used to find device
+queues @param device Vulkan logical device for passing to Algorithm
+@param commandBuffer Vulkan Command Buffer to record commands into
+@param tensors Tensors that are to be used in this operation @param
+shaderDataRaw Optional parameter to specify the shader data either in
+binary or raw form @param komputeWorkgroup Optional parameter to
+specify the layout for processing)doc";
+
+static const char *__doc_kp_OpAlgoBase_fetchSpirvBinaryData = R"doc()doc";
+
+static const char *__doc_kp_OpAlgoBase_init =
+R"doc(The init function is responsible for the initialisation of the
+algorithm component based on the parameters specified, and allows for
+extensibility on the options provided. Further dependent classes can
+perform more specific checks such as ensuring tensors provided are
+initialised, etc.)doc";
+
+static const char *__doc_kp_OpAlgoBase_mAlgorithm = R"doc()doc";
+
+static const char *__doc_kp_OpAlgoBase_mFreeAlgorithm = R"doc()doc";
+
+static const char *__doc_kp_OpAlgoBase_mKomputeWorkgroup = R"doc()doc";
+
+static const char *__doc_kp_OpAlgoBase_mShaderDataRaw =
+R"doc(< Optional member variable which can be provided to contain either the
+raw shader content or the spirv binary content)doc";
+
+static const char *__doc_kp_OpAlgoBase_mShaderFilePath =
+R"doc(< Optional member variable which can be provided for the OpAlgoBase to
+find the data automatically and load for processing)doc";
+
+static const char *__doc_kp_OpAlgoBase_postEval =
+R"doc(Executes after the recorded commands are submitted, and performs a
+copy of the GPU Device memory into the staging buffer so the output
+data can be retrieved.)doc";
+
+static const char *__doc_kp_OpAlgoBase_preEval = R"doc(Does not perform any preEval commands.)doc";
+
+static const char *__doc_kp_OpAlgoBase_record =
+R"doc(This records the commands that are to be sent to the GPU. This
+includes the barriers that ensure the memory has been copied before
+going in and out of the shader, as well as the dispatch operation that
+sends the shader processing to the gpu. This function also records the
+GPU memory copy of the output data for the staging buffer so it can be
+read by the host.)doc";
+
+static const char *__doc_kp_OpAlgoLhsRhsOut =
+R"doc(Operation base class to simplify the creation of operations that
+require right hand and left hand side datapoints together with a
+single output. The expected data passed is two input tensors and one
+output tensor.)doc";
+
+static const char *__doc_kp_OpAlgoLhsRhsOut_OpAlgoLhsRhsOut = R"doc(Base constructor, should not be used unless explicitly intended.)doc";
+
+static const char *__doc_kp_OpAlgoLhsRhsOut_OpAlgoLhsRhsOut_2 =
+R"doc(Default constructor with parameters that provides the bare minimum
+requirements for the operations to be able to create and manage their
+sub-components.
+
+@param physicalDevice Vulkan physical device used to find device
+queues @param device Vulkan logical device for passing to Algorithm
+@param commandBuffer Vulkan Command Buffer to record commands into
+@param tensors Tensors that are to be used in this operation @param
+freeTensors Whether operation manages the memory of the Tensors @param
+komputeWorkgroup Optional parameter to specify the layout for
+processing)doc";
+
+static const char *__doc_kp_OpAlgoLhsRhsOut_init =
+R"doc(The init function is responsible for ensuring that all of the tensors
+provided are aligned with requirements such as LHS, RHS and Output
+tensors, and creates the algorithm component which processes the
+computation.)doc";
+
+static const char *__doc_kp_OpAlgoLhsRhsOut_mTensorLHS =
+R"doc(< Reference to the parameter used in the left hand side equation of
+the shader)doc";
+
+static const char *__doc_kp_OpAlgoLhsRhsOut_mTensorOutput =
+R"doc(< Reference to the parameter used in the output of the shader and will
+be copied with a staging vector)doc";
+
+static const char *__doc_kp_OpAlgoLhsRhsOut_mTensorOutputStaging = R"doc(< Staging temporary tensor used to copy the output of the tensor)doc";
+
+static const char *__doc_kp_OpAlgoLhsRhsOut_mTensorRHS =
+R"doc(< Reference to the parameter used in the right hand side equation of
+the shader)doc";
+
+static const char *__doc_kp_OpAlgoLhsRhsOut_postEval =
+R"doc(Executes after the recorded commands are submitted, and performs a
+copy of the GPU Device memory into the staging buffer so the output
+data can be retrieved.)doc";
+
+static const char *__doc_kp_OpAlgoLhsRhsOut_record =
+R"doc(This records the commands that are to be sent to the GPU. This
+includes the barriers that ensure the memory has been copied before
+going in and out of the shader, as well as the dispatch operation that
+sends the shader processing to the gpu. This function also records the
+GPU memory copy of the output data for the staging buffer so it can be
+read by the host.)doc";
+
+static const char *__doc_kp_OpBase =
+R"doc(Base Operation which provides the high level interface that Kompute
+operations implement in order to perform a set of actions in the GPU.
+
+Operations can perform actions on tensors, and optionally can also own
+an Algorithm with respective parameters. kp::Operations with
+kp::Algorithms would inherit from kp::OpAlgoBase.)doc";
+
+static const char *__doc_kp_OpBase_OpBase = R"doc(Base constructor, should not be used unless explicitly intended.)doc";
+
+static const char *__doc_kp_OpBase_OpBase_2 =
+R"doc(Default constructor with parameters that provides the bare minimum
+requirements for the operations to be able to create and manage their
+sub-components.
+
+@param physicalDevice Vulkan physical device used to find device
+queues @param device Vulkan logical device for passing to Algorithm
+@param commandBuffer Vulkan Command Buffer to record commands into
+@param tensors Tensors that are to be used in this operation @param
+freeTensors Whether operation manages the memory of the Tensors)doc";
+
+static const char *__doc_kp_OpBase_init =
+R"doc(The init function is responsible for setting up all the resources and
+should be called after the Operation has been created.)doc";
+
+static const char *__doc_kp_OpBase_mCommandBuffer = R"doc(< Vulkan Command Buffer)doc";
+
+static const char *__doc_kp_OpBase_mDevice = R"doc(< Vulkan Logical Device)doc";
+
+static const char *__doc_kp_OpBase_mFreeTensors =
+R"doc(< Explicit boolean that specifies whether the < tensors are freed (if
+they are managed))doc";
+
+static const char *__doc_kp_OpBase_mPhysicalDevice = R"doc(< Vulkan Physical Device)doc";
+
+static const char *__doc_kp_OpBase_mTensors =
+R"doc(< Tensors referenced by operation that can be managed < optionally by
+operation)doc";
+
+static const char *__doc_kp_OpBase_postEval =
+R"doc(Post eval is called after the Sequence has called eval and submitted
+the commands to the GPU for processing, and can be used to perform any
+tear-down steps required as the computation iteration finishes. It's
+worth noting that there are situations where eval can be called
+multiple times, so the resources that are destroyed should not require
+a re-init unless explicitly provided by the user.)doc";
+
+static const char *__doc_kp_OpBase_preEval =
+R"doc(Pre eval is called before the Sequence has called eval and submitted
+the commands to the GPU for processing, and can be used to perform any
+per-eval setup steps required as the computation iteration begins.
+It's worth noting that there are situations where eval can be called
+multiple times, so the resources that are created should be idempotent
+in case it's called multiple times in a row.)doc";
+
+static const char *__doc_kp_OpBase_record =
+R"doc(The record function is intended to only send a record command or run
+commands that are expected to record operations that are to be
+submitted as a batch into the GPU.)doc";
+
+static const char *__doc_kp_OpMult =
+R"doc(Operation that performs multiplication on two tensors and outputs on
+third tensor.)doc";
+
+static const char *__doc_kp_OpMult_OpMult = R"doc(Base constructor, should not be used unless explicitly intended.)doc";
+
+static const char *__doc_kp_OpMult_OpMult_2 =
+R"doc(Default constructor with parameters that provides the bare minimum
+requirements for the operations to be able to create and manage their
+sub-components.
+
+@param physicalDevice Vulkan physical device used to find device
+queues @param device Vulkan logical device for passing to Algorithm
+@param commandBuffer Vulkan Command Buffer to record commands into
+@param tensors Tensors that are to be used in this operation @param
+komputeWorkgroup Optional parameter to specify the layout for
+processing)doc";
+
+static const char *__doc_kp_OpTensorCopy =
+R"doc(Operation that copies the data from the first tensor to the rest of
+the tensors provided, using a record command for all the vectors. This
+operation does not own/manage the memory of the tensors passed to it.
+The operation must not receive tensors of type TensorTypes::eStorage.)doc";
+
+static const char *__doc_kp_OpTensorCopy_OpTensorCopy = R"doc()doc";
+
+static const char *__doc_kp_OpTensorCopy_OpTensorCopy_2 =
+R"doc(Default constructor with parameters that provides the core vulkan
+resources and the tensors that will be used in the operation.
+
+@param physicalDevice Vulkan physical device used to find device
+queues @param device Vulkan logical device for passing to Algorithm
+@param commandBuffer Vulkan Command Buffer to record commands into
+@param tensors Tensors that will be used to create in operation.)doc";
+
+static const char *__doc_kp_OpTensorCopy_init =
+R"doc(Performs basic checks such as ensuring there are at least two tensors
+provided, that they are initialised and that they are not of type
+TensorTypes::eStorage.)doc";
+
+static const char *__doc_kp_OpTensorCopy_postEval =
+R"doc(Copies the local vectors for all the tensors to sync the data with the
+gpu.)doc";
+
+static const char *__doc_kp_OpTensorCopy_preEval = R"doc(Does not perform any preEval commands.)doc";
+
+static const char *__doc_kp_OpTensorCopy_record =
+R"doc(Records the copy commands from the first tensor into all the other
+tensors provided. Also optionally records a barrier.)doc";
+
+static const char *__doc_kp_OpTensorCreate =
+R"doc(Operation that creates tensor and manages the memory of the components
+created)doc";
+
+static const char *__doc_kp_OpTensorCreate_OpTensorCreate = R"doc()doc";
+
+static const char *__doc_kp_OpTensorCreate_OpTensorCreate_2 =
+R"doc(Default constructor with parameters that provides the bare minimum
+requirements for the operations to be able to create and manage their
+sub-components.
+
+@param physicalDevice Vulkan physical device used to find device
+queues @param device Vulkan logical device for passing to Algorithm
+@param commandBuffer Vulkan Command Buffer to record commands into
+@param tensors Tensors that will be used to create in operation.
+@param freeTensors Whether operation manages the memory of the Tensors)doc";
+
+static const char *__doc_kp_OpTensorCreate_init =
+R"doc(In charge of initialising the primary Tensor as well as the staging
+tensor as required. It will only initialise a staging tensor if the
+Primary tensor is of type Device. For staging tensors it performs a
+mapDataIntoHostMemory which would perform immediately as opposed to on
+sequence eval/submission.)doc";
+
+static const char *__doc_kp_OpTensorCreate_mStagingTensors = R"doc()doc";
+
+static const char *__doc_kp_OpTensorCreate_postEval =
+R"doc(Performs a copy back into the main tensor to ensure that the data
+contained is the one that is now being stored in the GPU.)doc";
+
+static const char *__doc_kp_OpTensorCreate_preEval = R"doc(Does not perform any preEval commands.)doc";
+
+static const char *__doc_kp_OpTensorCreate_record =
+R"doc(Record runs the core actions to create the tensors. For device tensors
+it records a copyCommand to move the data from the staging tensor to
+the device tensor. The mapping for staging tensors happens in the init
+function not in the record function.)doc";
+
+static const char *__doc_kp_OpTensorSyncDevice =
+R"doc(Operation that syncs tensor's device by mapping local data into the
+device memory. For TensorTypes::eDevice it will use a staging tensor
+to perform the copy. For TensorTypes::eStaging it will only copy the
+data and perform a map, which will be executed during the record (as
+opposed to during the sequence eval/submit). This function cannot be
+carried out for TensorTypes::eStorage.)doc";
+
+static const char *__doc_kp_OpTensorSyncDevice_OpTensorSyncDevice = R"doc()doc";
+
+static const char *__doc_kp_OpTensorSyncDevice_OpTensorSyncDevice_2 =
+R"doc(Default constructor with parameters that provides the core vulkan
+resources and the tensors that will be used in the operation. The
+tensors provided cannot be of type TensorTypes::eStorage.
+
+@param physicalDevice Vulkan physical device used to find device
+queues @param device Vulkan logical device for passing to Algorithm
+@param commandBuffer Vulkan Command Buffer to record commands into
+@param tensors Tensors that will be used to create in operation.)doc";
+
+static const char *__doc_kp_OpTensorSyncDevice_init =
+R"doc(Performs basic checks such as ensuring that there is at least one
+tensor provided, that they are initialized and that they are not of
+type TensorTypes::eStorage. For staging tensors in host memory, the map
+is performed during the init function.)doc";
+
+static const char *__doc_kp_OpTensorSyncDevice_mStagingTensors = R"doc()doc";
+
+static const char *__doc_kp_OpTensorSyncDevice_postEval = R"doc(Does not perform any postEval commands.)doc";
+
+static const char *__doc_kp_OpTensorSyncDevice_preEval = R"doc(Does not perform any preEval commands.)doc";
+
+static const char *__doc_kp_OpTensorSyncDevice_record =
+R"doc(For device tensors, it records the copy command to the device tensor
+from the temporary staging tensor.)doc";
+
+static const char *__doc_kp_OpTensorSyncLocal =
+R"doc(Operation that syncs tensor's local data by mapping the data from
+device memory into the local vector. For TensorTypes::eDevice it will
+use a staging tensor to perform the copy. For TensorTypes::eStaging it
+will only copy the data and perform a map, which will be executed
+during the postSubmit (there will be no copy during the sequence
+eval/submit). This function cannot be carried out for
+TensorTypes::eStorage.)doc";
+
+static const char *__doc_kp_OpTensorSyncLocal_OpTensorSyncLocal = R"doc()doc";
+
+static const char *__doc_kp_OpTensorSyncLocal_OpTensorSyncLocal_2 =
+R"doc(Default constructor with parameters that provides the core vulkan
+resources and the tensors that will be used in the operation. The
+tensors provided cannot be of type TensorTypes::eStorage.
+
+@param physicalDevice Vulkan physical device used to find device
+queues @param device Vulkan logical device for passing to Algorithm
+@param commandBuffer Vulkan Command Buffer to record commands into
+@param tensors Tensors that will be used to create in operation.)doc";
+
+static const char *__doc_kp_OpTensorSyncLocal_init =
+R"doc(Performs basic checks such as ensuring that there is at least one
+tensor provided, that they are initialized and that they are not of
+type TensorTypes::eStorage.)doc";
+
+static const char *__doc_kp_OpTensorSyncLocal_mStagingTensors = R"doc()doc";
+
+static const char *__doc_kp_OpTensorSyncLocal_postEval =
+R"doc(For host tensors it performs the map command from the host memory into
+local memory.)doc";
+
+static const char *__doc_kp_OpTensorSyncLocal_preEval = R"doc(Does not perform any preEval commands.)doc";
+
+static const char *__doc_kp_OpTensorSyncLocal_record =
+R"doc(For device tensors, it records the copy command into the staging
+tensor from the device tensor.)doc";
+
+static const char *__doc_kp_Sequence = R"doc(Container of operations that can be sent to GPU as batch)doc";
+
+static const char *__doc_kp_Sequence_Sequence =
+R"doc(Base constructor for Sequence. Should not be used unless explicitly
+intended.)doc";
+
+static const char *__doc_kp_Sequence_Sequence_2 =
+R"doc(Main constructor for sequence which requires core vulkan components to
+generate all dependent resources.
+
+@param physicalDevice Vulkan physical device @param device Vulkan
+logical device @param computeQueue Vulkan compute queue @param
+queueIndex Vulkan compute queue index in device)doc";
+
+static const char *__doc_kp_Sequence_begin =
+R"doc(Begins recording commands for commands to be submitted into the
+command buffer.
+
+@return Boolean stating whether execution was successful.)doc";
+
+static const char *__doc_kp_Sequence_createCommandBuffer = R"doc()doc";
+
+static const char *__doc_kp_Sequence_createCommandPool = R"doc()doc";
+
+static const char *__doc_kp_Sequence_end =
+R"doc(Ends the recording and stops recording commands when the record
+command is sent.
+
+@return Boolean stating whether execution was successful.)doc";
+
+static const char *__doc_kp_Sequence_eval =
+R"doc(Eval sends all the recorded and stored operations in the vector of
+operations into the gpu as a submit job with a barrier.
+
+@return Boolean stating whether execution was successful.)doc";
+
+static const char *__doc_kp_Sequence_evalAsync =
+R"doc(Eval Async sends all the recorded and stored operations in the vector
+of operations into the gpu as a submit job with a barrier. EvalAwait()
+must be called after to ensure the sequence is terminated correctly.
+
+@return Boolean stating whether execution was successful.)doc";
+
+static const char *__doc_kp_Sequence_evalAwait =
+R"doc(Eval Await waits for the fence to finish processing and then once it
+finishes, it runs the postEval of all operations.
+
+@param waitFor Number of milliseconds to wait before timing out.
+@return Boolean stating whether execution was successful.)doc";
+
+static const char *__doc_kp_Sequence_freeMemoryDestroyGPUResources =
+R"doc(Destroys and frees the GPU resources which include the buffer and
+memory and sets the sequence as init=False.)doc";
+
+static const char *__doc_kp_Sequence_init =
+R"doc(Initialises sequence including the creation of the command pool and
+the command buffer.)doc";
+
+static const char *__doc_kp_Sequence_isInit =
+R"doc(Returns true if the sequence has been successfully initialised.
+
+@return Boolean stating if sequence has been initialised.)doc";
+
+static const char *__doc_kp_Sequence_isRecording =
+R"doc(Returns true if the sequence is currently recording.
+
+@return Boolean stating if recording ongoing.)doc";
+
+static const char *__doc_kp_Sequence_isRunning =
+R"doc(Returns true if the sequence is currently running - mostly used for
+async workloads.
+
+@return Boolean stating if currently running.)doc";
+
+static const char *__doc_kp_Sequence_mCommandBuffer = R"doc()doc";
+
+static const char *__doc_kp_Sequence_mCommandPool = R"doc()doc";
+
+static const char *__doc_kp_Sequence_mComputeQueue = R"doc()doc";
+
+static const char *__doc_kp_Sequence_mDevice = R"doc()doc";
+
+static const char *__doc_kp_Sequence_mFence = R"doc()doc";
+
+static const char *__doc_kp_Sequence_mFreeCommandBuffer = R"doc()doc";
+
+static const char *__doc_kp_Sequence_mFreeCommandPool = R"doc()doc";
+
+static const char *__doc_kp_Sequence_mIsInit = R"doc()doc";
+
+static const char *__doc_kp_Sequence_mIsRunning = R"doc()doc";
+
+static const char *__doc_kp_Sequence_mOperations = R"doc()doc";
+
+static const char *__doc_kp_Sequence_mPhysicalDevice = R"doc()doc";
+
+static const char *__doc_kp_Sequence_mQueueIndex = R"doc()doc";
+
+static const char *__doc_kp_Sequence_mRecording = R"doc()doc";
+
+static const char *__doc_kp_Sequence_record =
+R"doc(Record function for operation to be added to the GPU queue in batch.
+This template requires classes to be derived from the OpBase class.
+This function also requires the Sequence to be recording, otherwise it
+will not be able to add the operation.
+
+@param tensors Vector of tensors to use for the operation @param TArgs
+Template parameters that are used to initialise operation which allows
+for extensible configurations on initialisation.)doc";
+
+static const char *__doc_kp_Tensor =
+R"doc(Structured data used in GPU operations.
+
+Tensors are the base building block in Kompute to perform operations
+across GPUs. Each tensor would have a respective Vulkan memory and
+buffer, which would be used to store their respective data. The
+tensors can be used for GPU data storage or transfer.)doc";
+
+static const char *__doc_kp_Tensor_Tensor = R"doc(Base constructor, should not be used unless explicitly intended.)doc";
+
+static const char *__doc_kp_Tensor_Tensor_2 =
+R"doc(Default constructor with data provided which would be used to create
+the respective vulkan buffer and memory.
+
+@param data Vector of data that will be used by the tensor @param
+tensorType Type for the tensor which is of type TensorTypes)doc";
+
+static const char *__doc_kp_Tensor_TensorTypes =
+R"doc(Type for tensors created: Device allows memory to be transferred from
+staging buffers. Staging are host memory visible. Storage are device
+visible but are not set up to transfer or receive data (only for
+shader storage).)doc";
+
+static const char *__doc_kp_Tensor_TensorTypes_eDevice = R"doc(< Type is device memory, source and destination)doc";
+
+static const char *__doc_kp_Tensor_TensorTypes_eStaging = R"doc(< Type is host memory, source and destination)doc";
+
+static const char *__doc_kp_Tensor_TensorTypes_eStorage = R"doc(< Type is Device memory (only))doc";
+
+static const char *__doc_kp_Tensor_constructDescriptorBufferInfo =
+R"doc(Constructs a vulkan descriptor buffer info which can be used to
+specify and reference the underlying buffer component of the tensor
+without exposing it.
+
+@return Descriptor buffer info with own buffer)doc";
+
+static const char *__doc_kp_Tensor_createBuffer = R"doc()doc";
+
+static const char *__doc_kp_Tensor_data =
+R"doc(Returns the vector of data currently contained by the Tensor. It is
+important to ensure that there is no out-of-sync data with the GPU
+memory.
+
+@return Reference to vector of elements representing the data in the
+tensor.)doc";
+
+static const char *__doc_kp_Tensor_freeMemoryDestroyGPUResources =
+R"doc(Destroys and frees the GPU resources which include the buffer and
+memory.)doc";
+
+static const char *__doc_kp_Tensor_getBufferUsageFlags = R"doc()doc";
+
+static const char *__doc_kp_Tensor_getMemoryPropertyFlags = R"doc()doc";
+
+static const char *__doc_kp_Tensor_init =
+R"doc(Initialiser which calls the initialisation for all the respective
+tensors as well as creates the respective staging tensors. The staging
+tensors would only be created for the tensors of type
+TensorType::eDevice as otherwise there is no need to copy from host
+memory.)doc";
+
+static const char *__doc_kp_Tensor_isInit =
+R"doc(Returns true if the tensor initialisation function has been carried
+out successful, which would mean that the buffer and memory will have
+been provisioned.)doc";
+
+static const char *__doc_kp_Tensor_mBuffer = R"doc()doc";
+
+static const char *__doc_kp_Tensor_mData = R"doc()doc";
+
+static const char *__doc_kp_Tensor_mDevice = R"doc()doc";
+
+static const char *__doc_kp_Tensor_mFreeBuffer = R"doc()doc";
+
+static const char *__doc_kp_Tensor_mFreeMemory = R"doc()doc";
+
+static const char *__doc_kp_Tensor_mIsInit = R"doc()doc";
+
+static const char *__doc_kp_Tensor_mMemory = R"doc()doc";
+
+static const char *__doc_kp_Tensor_mPhysicalDevice = R"doc()doc";
+
+static const char *__doc_kp_Tensor_mShape = R"doc()doc";
+
+static const char *__doc_kp_Tensor_mTensorType = R"doc()doc";
+
+static const char *__doc_kp_Tensor_mapDataFromHostMemory =
+R"doc(Maps data from the Host Visible GPU memory into the data vector. It
+requires the Tensor to be of staging type for it to work.)doc";
+
+static const char *__doc_kp_Tensor_mapDataIntoHostMemory =
+R"doc(Maps data from the data vector into the Host Visible GPU memory. It
+requires the tensor to be of staging type for it to work.)doc";
+
+static const char *__doc_kp_Tensor_memorySize = R"doc()doc";
+
+static const char *__doc_kp_Tensor_operator_array =
+R"doc(Overrides the subscript operator to expose the underlying data's
+subscript operator which in this case would be its underlying
+vector's.
+
+@param i The index where the element will be returned from. @return
+Returns the element in the position requested.)doc";
+
+static const char *__doc_kp_Tensor_recordBufferMemoryBarrier =
+R"doc(Records the buffer memory barrier into the command buffer which
+ensures that relevant data transfers are carried out correctly.
+
+@param commandBuffer Vulkan Command Buffer to record the commands into
+@param srcAccessMask Access flags for source access mask @param
+dstAccessMask Access flags for destination access mask @param
+scrStageMask Pipeline stage flags for source stage mask @param
+dstStageMask Pipeline stage flags for destination stage mask)doc";
+
+static const char *__doc_kp_Tensor_recordCopyFrom =
+R"doc(Records a copy from the memory of the tensor provided to the current
+thensor. This is intended to pass memory into a processing, to perform
+a staging buffer transfer, or to gather output (between others).
+
+@param commandBuffer Vulkan Command Buffer to record the commands into
+@param copyFromTensor Tensor to copy the data from @param
+createBarrier Whether to create a barrier that ensures the data is
+copied before further operations. Default is true.)doc";
+
+static const char *__doc_kp_Tensor_setData =
+R"doc(Sets / resets the vector data of the tensor. This function does not
+perform any copies into GPU memory and is only performed on the host.)doc";
+
+static const char *__doc_kp_Tensor_shape =
+R"doc(Returns the shape of the tensor, which includes the number of
+dimensions and the size per dimension.
+
+@return Array containing the sizes for each dimension. Zero means
+respective dimension is not active.)doc";
+
+static const char *__doc_kp_Tensor_size =
+R"doc(Returns the size/magnitude of the Tensor, which will be the total
+number of elements across all dimensions
+
+@return Unsigned integer representing the total number of elements)doc";
+
+static const char *__doc_kp_Tensor_tensorType =
+R"doc(Retrieve the tensor type of the Tensor
+
+@return Tensor type of tensor)doc";
+
+#if defined(__GNUG__)
+#pragma GCC diagnostic pop
+#endif
+
diff --git a/python/src/main.cpp b/python/src/main.cpp
index e50ec7945..f368d77ae 100644
--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@@ -3,6 +3,8 @@
 
 #include <kompute/Kompute.hpp>
 
+#include "docstrings.hpp"
+
 namespace py = pybind11;
 
 PYBIND11_MODULE(kp, m) {
@@ -21,22 +23,22 @@ PYBIND11_MODULE(kp, m) {
 #endif
         });
 
-    py::enum_<kp::Tensor::TensorTypes>(m, "TensorTypes", "Enum with GPU memory types for Tensor.")
+    py::enum_<kp::Tensor::TensorTypes>(m, "TensorTypes", DOC(kp, Tensor, TensorTypes))
         .value("device", kp::Tensor::TensorTypes::eDevice, "Tensor holding data in GPU memory.")
         .value("staging", kp::Tensor::TensorTypes::eStaging, "Tensor used for transfer of data to device.")
         .value("storage", kp::Tensor::TensorTypes::eStorage, "Tensor with host visible gpu memory.")
         .export_values();
 
-    py::class_<kp::Tensor, std::shared_ptr<kp::Tensor>>(m, "Tensor", "Structured data used in GPU operations.")
+    py::class_<kp::Tensor, std::shared_ptr<kp::Tensor>>(m, "Tensor", DOC(kp, Tensor))
         .def(py::init(
             [](const std::vector<float>& data) {
                 return std::unique_ptr<kp::Tensor>(new kp::Tensor(data));
-            }), "Initialiser with only list of data components.")
+            }), DOC(kp, Tensor, Tensor, 2))
         .def(py::init(
             [](const std::vector<float>& data, kp::Tensor::TensorTypes tensorTypes) {
                 return std::unique_ptr<kp::Tensor>(new kp::Tensor(data, tensorTypes));
             }), "Initialiser with list of data components and tensor GPU memory type.")
-        .def("data", &kp::Tensor::data, "Retrieves the data as a list containing the local Tensor memory data.")
+        .def("data", &kp::Tensor::data, DOC(kp, Tensor, data))
         .def("size", &kp::Tensor::size, "Retrieves the size of the Tensor data as per the local Tensor memory.")
         .def("tensor_type", &kp::Tensor::tensorType, "Retreves the memory type of the tensor.")
         .def("is_init", &kp::Tensor::isInit, "Checks whether the tensor GPU memory has been initialised.")

From a4523338be45d78806f1a5e35cdc2a1a69e9b169 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sat, 7 Nov 2020 18:42:51 +0000
Subject: [PATCH 02/11] Updated python algo data functions to use py::bytes

---
 python/src/main.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/src/main.cpp b/python/src/main.cpp
index f368d77ae..52b39eb9a 100644
--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@@ -72,7 +72,7 @@ PYBIND11_MODULE(kp, m) {
             "Records operation to run multiplication compute shader to two input tensors and an output tensor")
         .def("record_algo_file", &kp::Sequence::record<kp::OpAlgoBase, std::string>,
             "Records an operation using a custom shader provided from a shader path")
-        .def("record_algo_data", &kp::Sequence::record<kp::OpAlgoBase, std::vector<char>>,
+        .def("record_algo_data", &kp::Sequence::record<kp::OpAlgoBase, py::bytes>,
             "Records an operation using a custom shader provided as raw string or spirv bytes")
         .def("record_algo_lro", &kp::Sequence::record<kp::OpAlgoLhsRhsOut>,
             "Records operation to run left right out operation with custom shader");
@@ -112,7 +112,7 @@ PYBIND11_MODULE(kp, m) {
             "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with new anonymous Sequence")
         .def("eval_algo_file_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::string>,
             "Evaluates an operation using a custom shader provided from a shader path with new anonymous Sequence")
-        .def("eval_algo_data_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::vector<char>>,
+        .def("eval_algo_data_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, py::bytes>,
             "Evaluates an operation using a custom shader provided as raw string or spirv bytes with new anonymous Sequence")
         .def("eval_algo_lro_def", &kp::Manager::evalOpDefault<kp::OpAlgoLhsRhsOut>,
             "Evaluates operation to run left right out operation with custom shader with new anonymous Sequence")
@@ -129,7 +129,7 @@ PYBIND11_MODULE(kp, m) {
             "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
         .def("eval_algo_file", &kp::Manager::evalOp<kp::OpAlgoBase, std::string>,
             "Evaluates an operation using a custom shader provided from a shader path with explicitly named Sequence")
-        .def("eval_algo_data", &kp::Manager::evalOp<kp::OpAlgoBase, std::vector<char>>,
+        .def("eval_algo_data", &kp::Manager::evalOp<kp::OpAlgoBase, py::bytes>,
             "Evaluates an operation using a custom shader provided as raw string or spirv bytes with explicitly named Sequence")
         .def("eval_algo_lro", &kp::Manager::evalOp<kp::OpAlgoLhsRhsOut>,
             "Evaluates operation to run left right out operation with custom shader with explicitly named Sequence")
@@ -146,7 +146,7 @@ PYBIND11_MODULE(kp, m) {
             "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with anonymous Sequence")
         .def("eval_async_algo_file_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::string>,
             "Evaluates asynchronously an operation using a custom shader provided from a shader path with anonymous Sequence")
-        .def("eval_async_algo_data_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::vector<char>>,
+        .def("eval_async_algo_data_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, py::bytes>,
             "Evaluates asynchronously an operation using a custom shader provided as raw string or spirv bytes with anonymous Sequence")
         .def("eval_async_algo_lro_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoLhsRhsOut>,
             "Evaluates asynchronously operation to run left right out operation with custom shader with anonymous Sequence")
@@ -163,7 +163,7 @@ PYBIND11_MODULE(kp, m) {
             "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
         .def("eval_async_algo_file", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::string>,
             "Evaluates asynchronously an operation using a custom shader provided from a shader path with explicitly named Sequence")
-        .def("eval_async_algo_data", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::vector<char>>,
+        .def("eval_async_algo_data", &kp::Manager::evalOpAsync<kp::OpAlgoBase, py::bytes>,
             "Evaluates asynchronously an operation using a custom shader provided as raw string or spirv bytes with explicitly named Sequence")
         .def("eval_async_algo_lro", &kp::Manager::evalOpAsync<kp::OpAlgoLhsRhsOut>,
             "Evaluates asynchronously operation to run left right out operation with custom shader with explicitly named Sequence");

From 6c6132942247bcc7a04144b9f29e12eacc58b74e Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 8 Nov 2020 11:20:12 +0000
Subject: [PATCH 03/11] Updated to add separate bytes load and str load
 functions

---
 python/src/main.cpp | 76 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 69 insertions(+), 7 deletions(-)

diff --git a/python/src/main.cpp b/python/src/main.cpp
index 52b39eb9a..265df8f52 100644
--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@@ -72,7 +72,17 @@ PYBIND11_MODULE(kp, m) {
             "Records operation to run multiplication compute shader to two input tensors and an output tensor")
         .def("record_algo_file", &kp::Sequence::record<kp::OpAlgoBase, std::string>,
             "Records an operation using a custom shader provided from a shader path")
-        .def("record_algo_data", &kp::Sequence::record<kp::OpAlgoBase, py::bytes>,
+        .def("record_algo_data", [](kp::Sequence &self,
+                                    std::vector<std::shared_ptr<kp::Tensor>> tensors,
+                                    py::bytes &bytes) {
+                // Bytes have to be converted into std::vector
+                py::buffer_info info(py::buffer(bytes).request());
+                const char *data = reinterpret_cast<const char *>(info.ptr);
+                size_t length = static_cast<size_t>(info.size);
+                self.record<kp::OpAlgoBase>(
+                    tensors,
+                    std::vector<char>(data, data + length));
+            },
             "Records an operation using a custom shader provided as raw string or spirv bytes")
         .def("record_algo_lro", &kp::Sequence::record<kp::OpAlgoLhsRhsOut>,
             "Records operation to run left right out operation with custom shader");
@@ -112,8 +122,20 @@ PYBIND11_MODULE(kp, m) {
             "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with new anonymous Sequence")
         .def("eval_algo_file_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::string>,
             "Evaluates an operation using a custom shader provided from a shader path with new anonymous Sequence")
-        .def("eval_algo_data_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, py::bytes>,
-            "Evaluates an operation using a custom shader provided as raw string or spirv bytes with new anonymous Sequence")
+        .def("eval_algo_str_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::vector<char>>,
+            "Evaluates an operation using a custom shader provided as string provided as list of characters with new anonymous Sequence")
+        .def("eval_algo_data_def", [](kp::Manager &self,
+                                    std::vector<std::shared_ptr<kp::Tensor>> tensors,
+                                    py::bytes &bytes) {
+                // Bytes have to be converted into std::vector
+                py::buffer_info info(py::buffer(bytes).request());
+                const char *data = reinterpret_cast<const char *>(info.ptr);
+                size_t length = static_cast<size_t>(info.size);
+                self.evalOpDefault<kp::OpAlgoBase>(
+                    tensors,
+                    std::vector<char>(data, data + length));
+            },
+            "Evaluates an operation using a custom shader provided as spirv bytes with new anonymous Sequence")
         .def("eval_algo_lro_def", &kp::Manager::evalOpDefault<kp::OpAlgoLhsRhsOut>,
             "Evaluates operation to run left right out operation with custom shader with new anonymous Sequence")
         // eval
@@ -129,8 +151,22 @@ PYBIND11_MODULE(kp, m) {
             "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
         .def("eval_algo_file", &kp::Manager::evalOp<kp::OpAlgoBase, std::string>,
             "Evaluates an operation using a custom shader provided from a shader path with explicitly named Sequence")
-        .def("eval_algo_data", &kp::Manager::evalOp<kp::OpAlgoBase, py::bytes>,
-            "Evaluates an operation using a custom shader provided as raw string or spirv bytes with explicitly named Sequence")
+        .def("eval_algo_str", &kp::Manager::evalOp<kp::OpAlgoBase, std::vector<char>>,
+            "Evaluates an operation using a custom shader provided as string provided as list of characters with explicitly named Sequence")
+        .def("eval_algo_data", [](kp::Manager &self,
+                                    std::vector<std::shared_ptr<kp::Tensor>> tensors,
+                                    std::string sequenceName,
+                                    py::bytes &bytes) {
+                // Bytes have to be converted into std::vector
+                py::buffer_info info(py::buffer(bytes).request());
+                const char *data = reinterpret_cast<const char *>(info.ptr);
+                size_t length = static_cast<size_t>(info.size);
+                self.evalOp<kp::OpAlgoBase>(
+                    tensors,
+                    sequenceName,
+                    std::vector<char>(data, data + length));
+            },
+            "Evaluates an operation using a custom shader provided as spirv bytes with explicitly named Sequence")
         .def("eval_algo_lro", &kp::Manager::evalOp<kp::OpAlgoLhsRhsOut>,
             "Evaluates operation to run left right out operation with custom shader with explicitly named Sequence")
         // eval async default
@@ -146,7 +182,19 @@ PYBIND11_MODULE(kp, m) {
             "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with anonymous Sequence")
         .def("eval_async_algo_file_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::string>,
             "Evaluates asynchronously an operation using a custom shader provided from a shader path with anonymous Sequence")
-        .def("eval_async_algo_data_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, py::bytes>,
+        .def("eval_async_algo_str_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::vector<char>>,
+            "Evaluates Asynchronously an operation using a custom shader provided as string provided as list of characters with new anonymous Sequence")
+        .def("eval_async_algo_data_def", [](kp::Manager &self,
+                                    std::vector<std::shared_ptr<kp::Tensor>> tensors,
+                                    py::bytes &bytes) {
+                // Bytes have to be converted into std::vector
+                py::buffer_info info(py::buffer(bytes).request());
+                const char *data = reinterpret_cast<const char *>(info.ptr);
+                size_t length = static_cast<size_t>(info.size);
+                self.evalOpAsyncDefault<kp::OpAlgoBase>(
+                    tensors,
+                    std::vector<char>(data, data + length));
+            },
             "Evaluates asynchronously an operation using a custom shader provided as raw string or spirv bytes with anonymous Sequence")
         .def("eval_async_algo_lro_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoLhsRhsOut>,
             "Evaluates asynchronously operation to run left right out operation with custom shader with anonymous Sequence")
@@ -163,7 +211,21 @@ PYBIND11_MODULE(kp, m) {
             "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
         .def("eval_async_algo_file", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::string>,
             "Evaluates asynchronously an operation using a custom shader provided from a shader path with explicitly named Sequence")
-        .def("eval_async_algo_data", &kp::Manager::evalOpAsync<kp::OpAlgoBase, py::bytes>,
+        .def("eval_async_algo_str", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::vector<char>>,
+            "Evaluates Asynchronous an operation using a custom shader provided as string provided as list of characters with explicitly named Sequence")
+        .def("eval_async_algo_data", [](kp::Manager &self,
+                                    std::vector<std::shared_ptr<kp::Tensor>> tensors,
+                                    std::string sequenceName,
+                                    py::bytes &bytes) {
+                // Bytes have to be converted into std::vector
+                py::buffer_info info(py::buffer(bytes).request());
+                const char *data = reinterpret_cast<const char *>(info.ptr);
+                size_t length = static_cast<size_t>(info.size);
+                self.evalOpAsync<kp::OpAlgoBase>(
+                    tensors,
+                    sequenceName,
+                    std::vector<char>(data, data + length));
+            },
             "Evaluates asynchronously an operation using a custom shader provided as raw string or spirv bytes with explicitly named Sequence")
         .def("eval_async_algo_lro", &kp::Manager::evalOpAsync<kp::OpAlgoLhsRhsOut>,
             "Evaluates asynchronously operation to run left right out operation with custom shader with explicitly named Sequence");

From 65b52f3023a5c4bed54be4d5c6efea078985e1ad Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 8 Nov 2020 13:18:13 +0000
Subject: [PATCH 04/11] Updated tests to cover str and data load, one of the
 tests leveraging pyshader

---
 python/test/requirements-dev.txt |  1 +
 python/test/test_kompute.py      | 31 +++++++++++++++++++++++--------
 2 files changed, 24 insertions(+), 8 deletions(-)
 create mode 100644 python/test/requirements-dev.txt

diff --git a/python/test/requirements-dev.txt b/python/test/requirements-dev.txt
new file mode 100644
index 000000000..5718a0210
--- /dev/null
+++ b/python/test/requirements-dev.txt
@@ -0,0 +1 @@
+pyshader==0.7.0
diff --git a/python/test/test_kompute.py b/python/test/test_kompute.py
index 43baf77d1..8c95f1f70 100644
--- a/python/test/test_kompute.py
+++ b/python/test/test_kompute.py
@@ -49,7 +49,7 @@ def test_opalgobase_data():
 
     mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
 
-    mgr.eval_algo_data_def([tensor_in_a, tensor_in_b, tensor_out], list(shaderData))
+    mgr.eval_algo_str_def([tensor_in_a, tensor_in_b, tensor_out], list(shaderData))
 
     mgr.eval_tensor_sync_local_def([tensor_out])
 
@@ -81,28 +81,43 @@ def test_sequence():
     """
     Test basic OpAlgoBase operation
     """
-
     mgr = Manager(0, [2])
-
     tensor_in_a = Tensor([2, 2, 2])
     tensor_in_b = Tensor([1, 2, 3])
     tensor_out = Tensor([0, 0, 0])
-
     mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
-
     seq = mgr.create_sequence("op")
-
     shaderFilePath = "../../shaders/glsl/opmult.comp"
     mgr.eval_async_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
     mgr.eval_await_def()
-
     seq.begin()
     seq.record_tensor_sync_local([tensor_in_a])
     seq.record_tensor_sync_local([tensor_in_b])
     seq.record_tensor_sync_local([tensor_out])
     seq.end()
-
     seq.eval()
+    assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+def test_pyshader_generated():
+    from pyshader import python2shader, f32, ivec3, Array
+
+    @python2shader
+    def compute_shader_multiply(index: ("input", "GlobalInvocationId", ivec3),
+                                data1: ("buffer", 0, Array(f32)),
+                                data2: ("buffer", 1, Array(f32)),
+                                data3: ("buffer", 2, Array(f32))):
+        i = index.x
+        data3[i] = data1[i] * data2[i]
+
+    tensor_in_a = Tensor([2, 2, 2])
+    tensor_in_b = Tensor([1, 2, 3])
+    tensor_out = Tensor([0, 0, 0])
+
+    mgr = Manager()
+
+    mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+    mgr.eval_algo_data_def([tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv())
+    mgr.eval_tensor_sync_local_def([tensor_out])
 
     assert tensor_out.data() == [2.0, 4.0, 6.0]
 

From 9af9cb7a50336a992f6ad9f4e6280eeac0a5ddee Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 8 Nov 2020 15:38:18 +0000
Subject: [PATCH 05/11] Implemented logistic regression in python (naive
 version without sequence) and added to test

---
 python/test/test_kompute.py | 109 ++++++++++++++++++++++++++++++++++--
 1 file changed, 103 insertions(+), 6 deletions(-)

diff --git a/python/test/test_kompute.py b/python/test/test_kompute.py
index 8c95f1f70..fd6611550 100644
--- a/python/test/test_kompute.py
+++ b/python/test/test_kompute.py
@@ -1,4 +1,7 @@
 
+from pyshader import python2shader, f32, ivec3, Array
+from pyshader.stdlib import exp, log
+
 from kp import Tensor, Manager, Sequence
 
 def test_opmult():
@@ -98,14 +101,13 @@ def test_sequence():
     seq.eval()
     assert tensor_out.data() == [2.0, 4.0, 6.0]
 
-def test_pyshader_generated():
-    from pyshader import python2shader, f32, ivec3, Array
+def test_pyshader_pyshader():
 
     @python2shader
-    def compute_shader_multiply(index: ("input", "GlobalInvocationId", ivec3),
-                                data1: ("buffer", 0, Array(f32)),
-                                data2: ("buffer", 1, Array(f32)),
-                                data3: ("buffer", 2, Array(f32))):
+    def compute_shader_multiply(index=("input", "GlobalInvocationId", ivec3),
+                                data1=("buffer", 0, Array(f32)),
+                                data2=("buffer", 1, Array(f32)),
+                                data3=("buffer", 2, Array(f32))):
         i = index.x
         data3[i] = data1[i] * data2[i]
 
@@ -121,5 +123,100 @@ def test_pyshader_generated():
 
     assert tensor_out.data() == [2.0, 4.0, 6.0]
 
+def test_logistic_regression_pyshader():
+    @python2shader
+    def compute_shader(
+            index   = ("input", "GlobalInvocationId", ivec3),
+            x_i     = ("buffer", 0, Array(f32)),
+            x_j     = ("buffer", 1, Array(f32)),
+            y       = ("buffer", 2, Array(f32)),
+            w_in    = ("buffer", 3, Array(f32)),
+            w_out_i = ("buffer", 4, Array(f32)),
+            w_out_j = ("buffer", 5, Array(f32)),
+            b_in    = ("buffer", 6, Array(f32)),
+            b_out   = ("buffer", 7, Array(f32)),
+            l_out   = ("buffer", 8, Array(f32)),
+            M       = ("buffer", 9, Array(f32))):
+
+        i = index.x
+
+        m = M[0]
+
+        w_curr = vec2(w_in[0], w_in[1])
+        b_curr = b_in[0]
+
+        x_curr = vec2(x_i[i], x_j[i])
+        y_curr = y[i]
+
+        z_dot = w_curr @ x_curr
+        z = z_dot + b_curr
+        y_hat = 1.0 / (1.0 + exp(-z))
+
+        d_z = y_hat - y_curr
+        d_w = (1.0 / m) * x_curr * d_z
+        d_b = (1.0 / m) * d_z
+
+        loss = -((y_curr * log(y_hat)) + ((1.0 + y_curr) * log(1.0 - y_hat)))
+
+        w_out_i[i] = d_w.x
+        w_out_j[i] = d_w.y
+        b_out[i] = d_b
+        l_out[i] = loss
+
+
+    # First we create input and output tensors for shader
+    tensor_x_i = Tensor([0.0, 1.0, 1.0, 1.0, 1.0])
+    tensor_x_j = Tensor([0.0, 0.0, 0.0, 1.0, 1.0])
+
+    tensor_y = Tensor([0.0, 0.0, 0.0, 1.0, 1.0])
+
+    tensor_w_in = Tensor([0.001, 0.001])
+    tensor_w_out_i = Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
+    tensor_w_out_j = Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
+
+    tensor_b_in = Tensor([0.0])
+    tensor_b_out = Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
+
+    tensor_l_out = Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
+
+    tensor_m = Tensor([ 5.0 ])
+
+    # We store them in an array for easier interaction
+    params = [tensor_x_i, tensor_x_j, tensor_y, tensor_w_in, tensor_w_out_i,
+        tensor_w_out_j, tensor_b_in, tensor_b_out, tensor_l_out, tensor_m]
+
+    mgr = Manager()
+
+    mgr.eval_tensor_create_def(params)
+
+    ITERATIONS = 100
+    learning_rate = 0.1
+
+    # Perform machine learning training and inference across all input X and Y
+    for i_iter in range(ITERATIONS):
+        mgr.eval_tensor_sync_device_def([tensor_w_in, tensor_b_in])
+        mgr.eval_algo_data_def(params, compute_shader.to_spirv())
+        mgr.eval_tensor_sync_local_def([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out])
+
+        # Calculate the parameters based on the respective derivatives calculated
+        w_in_i_val = tensor_w_in.data()[0]
+        w_in_j_val = tensor_w_in.data()[1]
+        b_in_val = tensor_b_in.data()[0]
+
+        for j_iter in range(tensor_b_out.size()):
+            w_in_i_val -= learning_rate * tensor_w_out_i.data()[j_iter]
+            w_in_j_val -= learning_rate * tensor_w_out_j.data()[j_iter]
+            b_in_val -= learning_rate * tensor_b_out.data()[j_iter]
+
+        # Update the parameters to process inference again
+        tensor_w_in.set_data([w_in_i_val, w_in_j_val])
+        tensor_b_in.set_data([b_in_val])
+
+    assert tensor_w_in.data()[0] < 0.01
+    assert tensor_w_in.data()[0] > 0.0
+    assert tensor_w_in.data()[1] > 1.5
+    assert tensor_b_in.data()[0] < 0.7
+
+
 if __name__ == "__main__":
     test_sequence()

From 2ba3c8eadb7fec89fb8392fde29261729a53ee6b Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 8 Nov 2020 15:38:38 +0000
Subject: [PATCH 06/11] Updated lr cpp test to print without fmt

---
 test/TestLogisticRegression.cpp | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/test/TestLogisticRegression.cpp b/test/TestLogisticRegression.cpp
index eda6ca635..c360542b6 100644
--- a/test/TestLogisticRegression.cpp
+++ b/test/TestLogisticRegression.cpp
@@ -1,7 +1,6 @@
 
 #include "gtest/gtest.h"
 
-//#include <spdlog/fmt/bundled/ranges.h>
 #include "kompute/Kompute.hpp"
 
 TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression)
@@ -73,12 +72,11 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression)
     EXPECT_LT(wIn->data()[0], 0.01);
     EXPECT_GT(wIn->data()[1], 1.0);
     EXPECT_LT(bIn->data()[0], 0.0);
-    EXPECT_LT(bIn->data()[0], 0.0);
 
-    // SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
-    //            wIn->data(),
-    //            bIn->data(),
-    //            lOut->data());
+    SPDLOG_WARN("Result wIn i: {}, wIn j: {}, bIn: {}",
+               wIn->data()[0],
+               wIn->data()[1],
+               bIn->data()[0]);
 }
 
 TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy)
@@ -156,8 +154,8 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy)
     EXPECT_GT(wIn->data()[1], 1.0);
     EXPECT_LT(bIn->data()[0], 0.0);
 
-    // SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
-    //            wIn->data(),
-    //            bIn->data(),
-    //            lOut->data());
+    SPDLOG_WARN("Result wIn i: {}, wIn j: {}, bIn: {}",
+               wIn->data()[0],
+               wIn->data()[1],
+               bIn->data()[0]);
 }

From 93e03ae46312e2706c152bfe6b54dec7742c7963 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 8 Nov 2020 15:54:41 +0000
Subject: [PATCH 07/11] Updated function create_sequence to have default param
 for create_sequence name to empty string

---
 python/src/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/src/main.cpp b/python/src/main.cpp
index 265df8f52..59d16abbb 100644
--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@@ -99,7 +99,7 @@ PYBIND11_MODULE(kp, m) {
             }), "Manager initialiser can provide specified device and array of GPU queueFamilies to load.")
         .def("get_create_sequence", &kp::Manager::getOrCreateManagedSequence, "Get a Sequence or create a new one with given name")
         .def("create_sequence", &kp::Manager::createManagedSequence,
-                py::arg("name"), py::arg("queueIndex") = 0, "Create a sequence with specific name and specified index of available queues")
+                py::arg("name") = "", py::arg("queueIndex") = 0, "Create a sequence with specific name and specified index of available queues")
         .def("build_tensor", &kp::Manager::buildTensor, 
                 py::arg("data"), py::arg("tensorType") = kp::Tensor::TensorTypes::eDevice,
                 "Build and initialise tensor")

From 358f496549a8bf9d2102f3fd592bdaee40ebad38 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 8 Nov 2020 15:56:43 +0000
Subject: [PATCH 08/11] Updated python lr impl to use sequence for more
 efficient management of sequences

---
 python/test/test_kompute.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/python/test/test_kompute.py b/python/test/test_kompute.py
index fd6611550..559600eba 100644
--- a/python/test/test_kompute.py
+++ b/python/test/test_kompute.py
@@ -189,14 +189,20 @@ def test_logistic_regression_pyshader():
 
     mgr.eval_tensor_create_def(params)
 
+    # Record commands for efficient evaluation
+    sq = mgr.create_sequence()
+    sq.begin()
+    sq.record_tensor_sync_device([tensor_w_in, tensor_b_in])
+    sq.record_algo_data(params, compute_shader.to_spirv())
+    sq.record_tensor_sync_local([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out])
+    sq.end()
+
     ITERATIONS = 100
     learning_rate = 0.1
 
     # Perform machine learning training and inference across all input X and Y
     for i_iter in range(ITERATIONS):
-        mgr.eval_tensor_sync_device_def([tensor_w_in, tensor_b_in])
-        mgr.eval_algo_data_def(params, compute_shader.to_spirv())
-        mgr.eval_tensor_sync_local_def([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out])
+        sq.eval()
 
         # Calculate the parameters based on the respective derivatives calculated
         w_in_i_val = tensor_w_in.data()[0]

From 13503e763975c1803d089e7925378c04a22b87c2 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 8 Nov 2020 15:58:43 +0000
Subject: [PATCH 09/11] Removed last line from py tests

---
 python/test/test_kompute.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/python/test/test_kompute.py b/python/test/test_kompute.py
index 559600eba..ea82799e8 100644
--- a/python/test/test_kompute.py
+++ b/python/test/test_kompute.py
@@ -223,6 +223,3 @@ def test_logistic_regression_pyshader():
     assert tensor_w_in.data()[1] > 1.5
     assert tensor_b_in.data()[0] < 0.7
 
-
-if __name__ == "__main__":
-    test_sequence()

From b68446beeb73cc5aac4e6fe2f6483bcbe3112a06 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 8 Nov 2020 16:04:05 +0000
Subject: [PATCH 10/11] Updated readme for python example

---
 README.md | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 43b3b8511..4facb1137 100644
--- a/README.md
+++ b/README.md
@@ -306,8 +306,18 @@ tensor_out = Tensor([0, 0, 0])
 
 mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
 
-shaderFilePath = "shaders/glsl/opmult.comp"
-mgr.eval_async_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+# Define the function via PyShader or directly as glsl string or spirv bytes
+@python2shader
+def compute_shader_multiply(index=("input", "GlobalInvocationId", ivec3),
+                            data1=("buffer", 0, Array(f32)),
+                            data2=("buffer", 1, Array(f32)),
+                            data3=("buffer", 2, Array(f32))):
+    i = index.x
+    data3[i] = data1[i] * data2[i]
+
+# Run shader operation synchronously
+mgr.eval_algo_data_def(
+    [tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv())
 
 # Alternatively can pass raw string/bytes:
 # shaderFileData = """ shader code here... """
@@ -332,13 +342,22 @@ tensor_in_a = Tensor([2, 2, 2])
 tensor_in_b = Tensor([1, 2, 3])
 tensor_out = Tensor([0, 0, 0])
 
-shaderFilePath = "../../shaders/glsl/opmult.comp"
-
 mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
 
 seq = mgr.create_sequence("op")
 
-mgr.eval_async_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+# Define the function via PyShader or directly as glsl string or spirv bytes
+@python2shader
+def compute_shader_multiply(index=("input", "GlobalInvocationId", ivec3),
+                            data1=("buffer", 0, Array(f32)),
+                            data2=("buffer", 1, Array(f32)),
+                            data3=("buffer", 2, Array(f32))):
+    i = index.x
+    data3[i] = data1[i] * data2[i]
+
+# Run shader operation asynchronously and then await
+mgr.eval_async_algo_data_def(
+    [tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv())
 mgr.eval_await_def()
 
 seq.begin()

From 3b540d00e15bb7af1ec2c41593f186d9dd458099 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 8 Nov 2020 16:27:01 +0000
Subject: [PATCH 11/11] Updated python package documentation

---
 docs/overview/python-package.rst   | 200 ++++++++++++++++++++++++++++-
 docs/overview/python-reference.rst |   3 -
 2 files changed, 197 insertions(+), 6 deletions(-)

diff --git a/docs/overview/python-package.rst b/docs/overview/python-package.rst
index ffe5b272e..004f16a56 100644
--- a/docs/overview/python-package.rst
+++ b/docs/overview/python-package.rst
@@ -9,7 +9,7 @@ Below is a diagram that provides insights on the relationship between Vulkan Kom
 .. image:: ../images/kompute-architecture.jpg
    :width: 70%
 
-Python Components
+Core Python Components
 ^^^^^^^^
 
 The Python package exposes three main classes:
@@ -30,7 +30,89 @@ More specifically, it can be through the following functions:
 * mgr.eval_async_<opname>_def - Runs operation asynchronously under a new anonymous sequence
 * seq.record_<opname> - Records operation in sequence (requires sequence to be in recording mode)
 
-You can see these operations being used in the `Simple Python example <https://kompute.cc/index.html#python-example-simple>`_ and in the `Extended Python Example <https://kompute.cc/index.html#python-example-extended>`_.
+Python Example (Simple)
+^^^^^^^^^^^^^^^^^^^^^^^
+
+You can interact with the package directly from a Python interpreter. Below is the same sample as the "Your First Kompute (Simple Version)" example, but in Python:
+
+.. code-block:: python
+   :linenos:
+
+   mgr = Manager()
+
+   # Can be initialized with List[] or np.Array
+   tensor_in_a = Tensor([2, 2, 2])
+   tensor_in_b = Tensor([1, 2, 3])
+   tensor_out = Tensor([0, 0, 0])
+
+   mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+   # Define the function via PyShader or directly as glsl string or spirv bytes
+   @python2shader
+   def compute_shader_multiply(index=("input", "GlobalInvocationId", ivec3),
+                               data1=("buffer", 0, Array(f32)),
+                               data2=("buffer", 1, Array(f32)),
+                               data3=("buffer", 2, Array(f32))):
+       i = index.x
+       data3[i] = data1[i] * data2[i]
+
+   # Run shader operation synchronously
+   mgr.eval_algo_data_def(
+       [tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv())
+
+   # Alternatively can pass raw string/bytes:
+   # shaderFileData = """ shader code here... """
+   # mgr.eval_algo_data_def([tensor_in_a, tensor_in_b, tensor_out], list(shaderFileData))
+
+   mgr.eval_await_def()
+
+   mgr.eval_tensor_sync_local_def([tensor_out])
+
+   assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+
+Python Example (Extended)
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Similarly, below is the Python version of the extended example:
+
+.. code-block:: python
+   :linenos:
+
+    mgr = Manager(0, [2])
+
+    # Can be initialized with List[] or np.Array
+    tensor_in_a = Tensor([2, 2, 2])
+    tensor_in_b = Tensor([1, 2, 3])
+    tensor_out = Tensor([0, 0, 0])
+
+    mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+    seq = mgr.create_sequence("op")
+
+    # Define the function via PyShader or directly as glsl string or spirv bytes
+    @python2shader
+    def compute_shader_multiply(index=("input", "GlobalInvocationId", ivec3),
+                                data1=("buffer", 0, Array(f32)),
+                                data2=("buffer", 1, Array(f32)),
+                                data3=("buffer", 2, Array(f32))):
+        i = index.x
+        data3[i] = data1[i] * data2[i]
+
+    # Run shader operation asynchronously and then await
+    mgr.eval_async_algo_data_def(
+        [tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv())
+    mgr.eval_await_def()
+
+    seq.begin()
+    seq.record_tensor_sync_local([tensor_in_a])
+    seq.record_tensor_sync_local([tensor_in_b])
+    seq.record_tensor_sync_local([tensor_out])
+    seq.end()
+
+    seq.eval()
+
+    assert tensor_out.data() == [2.0, 4.0, 6.0]
 
 Kompute Operation Capabilities
 ^^^^^
@@ -38,7 +120,8 @@ Kompute Operation Capabilities
 Handling multiple capabilites of processing can be done by compute shaders being loaded into separate sequences. The example below shows how this can be done:
 
 .. code-block:: python
-    :linenos:
+   :linenos:
+
     from kp import Manager
 
     # We'll assume we have the shader data available
@@ -77,6 +160,117 @@ Handling multiple capabilites of processing can be done by compute shaders being
 
     print(t1.data(), t2.data(), t3.data())
 
+Machine Learning Logistic Regression Implementation
+^^^^^^
+
+Similar to the logistic regression implementation in the C++ examples section, below you can find the Python implementation of the Logistic Regression algorithm.
+
+.. code-block:: python
+   :linenos:
+
+    @python2shader
+    def compute_shader(
+            index   = ("input", "GlobalInvocationId", ivec3),
+            x_i     = ("buffer", 0, Array(f32)),
+            x_j     = ("buffer", 1, Array(f32)),
+            y       = ("buffer", 2, Array(f32)),
+            w_in    = ("buffer", 3, Array(f32)),
+            w_out_i = ("buffer", 4, Array(f32)),
+            w_out_j = ("buffer", 5, Array(f32)),
+            b_in    = ("buffer", 6, Array(f32)),
+            b_out   = ("buffer", 7, Array(f32)),
+            l_out   = ("buffer", 8, Array(f32)),
+            M       = ("buffer", 9, Array(f32))):
+
+        i = index.x
+
+        m = M[0]
+
+        w_curr = vec2(w_in[0], w_in[1])
+        b_curr = b_in[0]
+
+        x_curr = vec2(x_i[i], x_j[i])
+        y_curr = y[i]
+
+        z_dot = w_curr @ x_curr
+        z = z_dot + b_curr
+        y_hat = 1.0 / (1.0 + exp(-z))
+
+        d_z = y_hat - y_curr
+        d_w = (1.0 / m) * x_curr * d_z
+        d_b = (1.0 / m) * d_z
+
+        loss = -((y_curr * log(y_hat)) + ((1.0 - y_curr) * log(1.0 - y_hat)))
+
+        w_out_i[i] = d_w.x
+        w_out_j[i] = d_w.y
+        b_out[i] = d_b
+        l_out[i] = loss
+
+
+    # First we create input and output tensors for shader
+    tensor_x_i = Tensor([0.0, 1.0, 1.0, 1.0, 1.0])
+    tensor_x_j = Tensor([0.0, 0.0, 0.0, 1.0, 1.0])
+
+    tensor_y = Tensor([0.0, 0.0, 0.0, 1.0, 1.0])
+
+    tensor_w_in = Tensor([0.001, 0.001])
+    tensor_w_out_i = Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
+    tensor_w_out_j = Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
+
+    tensor_b_in = Tensor([0.0])
+    tensor_b_out = Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
+
+    tensor_l_out = Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
+
+    tensor_m = Tensor([ 5.0 ])
+
+    # We store them in an array for easier interaction
+    params = [tensor_x_i, tensor_x_j, tensor_y, tensor_w_in, tensor_w_out_i,
+        tensor_w_out_j, tensor_b_in, tensor_b_out, tensor_l_out, tensor_m]
+
+    mgr = Manager()
+
+    mgr.eval_tensor_create_def(params)
+
+    # Record commands for efficient evaluation
+    sq = mgr.create_sequence()
+    sq.begin()
+    sq.record_tensor_sync_device([tensor_w_in, tensor_b_in])
+    sq.record_algo_data(params, compute_shader.to_spirv())
+    sq.record_tensor_sync_local([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out])
+    sq.end()
+
+    ITERATIONS = 100
+    learning_rate = 0.1
+
+    # Perform machine learning training and inference across all input X and Y
+    for i_iter in range(ITERATIONS):
+        sq.eval()
+
+        # Calculate the parameters based on the respective derivatives calculated
+        w_in_i_val = tensor_w_in.data()[0]
+        w_in_j_val = tensor_w_in.data()[1]
+        b_in_val = tensor_b_in.data()[0]
+
+        for j_iter in range(tensor_b_out.size()):
+            w_in_i_val -= learning_rate * tensor_w_out_i.data()[j_iter]
+            w_in_j_val -= learning_rate * tensor_w_out_j.data()[j_iter]
+            b_in_val -= learning_rate * tensor_b_out.data()[j_iter]
+
+        # Update the parameters to process inference again
+        tensor_w_in.set_data([w_in_i_val, w_in_j_val])
+        tensor_b_in.set_data([b_in_val])
+
+    assert tensor_w_in.data()[0] < 0.01
+    assert tensor_w_in.data()[0] > 0.0
+    assert tensor_w_in.data()[1] > 1.5
+    assert tensor_b_in.data()[0] < 0.7
+
+    # Print outputs
+    print(tensor_w_in.data())
+    print(tensor_b_in.data())
+
 
 Package Installation 
 ^^^^^^^^^
diff --git a/docs/overview/python-reference.rst b/docs/overview/python-reference.rst
index 0a8eb7a23..89b426ce0 100644
--- a/docs/overview/python-reference.rst
+++ b/docs/overview/python-reference.rst
@@ -6,9 +6,6 @@ Python Class Documentation & Reference
 This section provides a breakdown of the Python classes and what each of their functions provide.
 Below is a diagram that provides insights on the relationship between Vulkan Kompute objects and Vulkan resources, which primarily encompass ownership of either CPU and/or GPU memory.
 
-.. image:: ../images/kompute-architecture.jpg
-   :width: 70%
-
 Manager
 -------