Merge pull request #177 from EthicalML/add_tensor_types

Add support for bool, double, int32, uint32 and float32 on Tensors via TensorT
2021-03-07 14:25:19 +00:00 · 2021-03-07 14:25:19 +00:00 · 1d2d33b269
commit 1d2d33b269
parent cc1ec748a7 2e1022410b
41 changed files with 1262 additions and 690 deletions
--- a/README.md
+++ b/README.md
@ -55,10 +55,13 @@ void kompute(const std::string& shader) {
    kp::Manager mgr; 

    // 2. Create and initialise Kompute Tensors through manager
+
+    // Default tensor constructor simplifies creation of float values
    auto tensorInA = mgr.tensor({ 2., 2., 2. });
    auto tensorInB = mgr.tensor({ 1., 2., 3. });
-    auto tensorOutA = mgr.tensor({ 0., 0., 0. });
-    auto tensorOutB = mgr.tensor({ 0., 0., 0. });
+    // Explicit type constructor supports uint32, int32, double, float and bool
+    auto tensorOutA = mgr.tensorT<uint32_t>({ 0, 0, 0 });
+    auto tensorOutB = mgr.tensorT<uint32_t>({ 0, 0, 0 });

    std::vector<std::shared_ptr<kp::Tensor>> params = {tensorInA, tensorInB, tensorOutA, tensorOutB};

@ -109,8 +112,8 @@ int main() {
        // The input tensors bind index is relative to index in parameter passed
        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
-        layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
-        layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; };
+        layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
+        layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };

        // Kompute supports push constants updated on dispatch
        layout(push_constant) uniform PushConstants {
@ -122,8 +125,8 @@ int main() {

        void main() {
            uint index = gl_GlobalInvocationID.x;
-            out_a[index] += in_a[index] * in_b[index];
-            out_b[index] += const_one * push_const.val;
+            out_a[index] += uint( in_a[index] * in_b[index] );
+            out_b[index] += uint( const_one * push_const.val );
        }
    )");

@ -144,10 +147,13 @@ def kompute(shader):
    mgr = kp.Manager()

    # 2. Create and initialise Kompute Tensors through manager
+
+    # Default tensor constructor simplifies creation of float values
    tensor_in_a = mgr.tensor([2, 2, 2])
    tensor_in_b = mgr.tensor([1, 2, 3])
-    tensor_out_a = mgr.tensor([0, 0, 0])
-    tensor_out_b = mgr.tensor([0, 0, 0])
+    # Explicit type constructor supports uint32, int32, double, float and bool
+    tensor_out_a = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))
+    tensor_out_b = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))

    params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b]

@ -194,8 +200,8 @@ if __name__ == "__main__":
        // The input tensors bind index is relative to index in parameter passed
        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
-        layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
-        layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; };
+        layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
+        layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };

        // Kompute supports push constants updated on dispatch
        layout(push_constant) uniform PushConstants {
@ -207,8 +213,8 @@ if __name__ == "__main__":

        void main() {
            uint index = gl_GlobalInvocationID.x;
-            out_a[index] += in_a[index] * in_b[index];
-            out_b[index] += const_one * push_const.val;
+            out_a[index] += uint( in_a[index] * in_b[index] );
+            out_b[index] += uint( const_one * push_const.val );
        }
    """

--- a/examples/array_multiplication/CMakeLists.txt
+++ b/examples/array_multiplication/CMakeLists.txt
@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.17.0)
+cmake_minimum_required(VERSION 3.4.1)
 project(kompute_array_mult VERSION 0.1.0)

 set(CMAKE_CXX_STANDARD 14)
@ -23,10 +23,6 @@ endif()

 find_package(Vulkan REQUIRED)

-if(KOMPUTE_OPT_ENABLE_SPDLOG)
-    find_package(spdlog REQUIRED)
-endif()
-
 add_executable(kompute_array_mult
    src/Main.cpp)

--- a/examples/array_multiplication/README.md
+++ b/examples/array_multiplication/README.md
@ -15,8 +15,11 @@ This project has the option to either import the Kompute dependency relative to
 To build you just need to run the cmake command in this folder as follows:

 ```
-cmake \
-    -Bbuild
+cmake -Bbuild/ \
+          -DCMAKE_BUILD_TYPE=Debug                   \
+          -DKOMPUTE_OPT_INSTALL=0                    \
+          -DKOMPUTE_OPT_REPO_SUBMODULE_BUILD=1       \
+          -DKOMPUTE_OPT_ENABLE_SPDLOG=1
 ```

 You can pass the following optional parameters based on your desired configuration:
--- a/examples/array_multiplication/src/Main.cpp
+++ b/examples/array_multiplication/src/Main.cpp
@ -39,16 +39,17 @@ int main()

    std::vector<std::shared_ptr<kp::Tensor>> params = { tensorInA, tensorInB, tensorOut };

-    std::shared_ptr<kp::Algorithm> algo = mgr.algorithm(params, kp::Shader::compile_source(shader));
+    std::shared_ptr<kp::Algorithm> algo = mgr.algorithm(params, kp::Shader::compileSource(shader));

    mgr.sequence()
        ->record<kp::OpTensorSyncDevice>(params)
        ->record<kp::OpAlgoDispatch>(algo)
-        ->record<kp::OpTensorSyncLocal>(params);
+        ->record<kp::OpTensorSyncLocal>(params)
+        ->eval();

    // prints "Output: {  0  4  12  }"
    std::cout<< "Output: {  ";
-    for (const float& elem : tensorOut->data()) {
+    for (const float& elem : tensorOut->vector()) {
      std::cout << elem << "  ";
    }
    std::cout << "}" << std::endl;
--- a/examples/godot_examples/custom_module/kompute_summator/KomputeSummatorNode.cpp
+++ b/examples/godot_examples/custom_module/kompute_summator/KomputeSummatorNode.cpp
@ -54,7 +54,7 @@ void KomputeSummatorNode::_init() {
        std::shared_ptr<kp::Algorithm> algo =
          mgr.algorithm(
                { this->mPrimaryTensor, this->mSecondaryTensor },
-                kp::Shader::compile_source(shader));
+                kp::Shader::compileSource(shader));


        // First we ensure secondary tensor loads to GPU
--- a/examples/godot_examples/gdnative_shared/src/KomputeSummator.cpp
+++ b/examples/godot_examples/gdnative_shared/src/KomputeSummator.cpp
@ -58,7 +58,7 @@ void KomputeSummator::_init() {
        // Then we run the operation with both tensors
        this->mSequence->record<kp::OpAlgoCreate>(
            { this->mPrimaryTensor, this->mSecondaryTensor }, 
-            kp::Shader::compile_source(shader));
+            kp::Shader::compileSource(shader));

        // We map the result back to local 
        this->mSequence->record<kp::OpTensorSyncLocal>(
--- a/examples/logistic_regression/CMakeLists.txt
+++ b/examples/logistic_regression/CMakeLists.txt
@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.17.0)
+cmake_minimum_required(VERSION 3.4.1)
 project(kompute_linear_reg VERSION 0.1.0)

 set(CMAKE_CXX_STANDARD 14)
@ -23,10 +23,6 @@ endif()

 find_package(Vulkan REQUIRED)

-if(KOMPUTE_OPT_ENABLE_SPDLOG)
-    find_package(spdlog REQUIRED)
-endif()
-
 add_executable(kompute_linear_reg
    src/Main.cpp)

@ -39,7 +35,7 @@ include_directories(
        ../../single_include/)

 if(KOMPUTE_OPT_ENABLE_SPDLOG)
-    target_link_libraries(kompute_array_mult
+    target_link_libraries(kompute_linear_reg
        spdlog::spdlog)
 endif()

--- a/examples/logistic_regression/README.md
+++ b/examples/logistic_regression/README.md
@ -15,8 +15,11 @@ This project has the option to either import the Kompute dependency relative to
 To build you just need to run the cmake command in this folder as follows:

 ```
-cmake \
-    -Bbuild
+cmake -Bbuild/ \
+          -DCMAKE_BUILD_TYPE=Debug                   \
+          -DKOMPUTE_OPT_INSTALL=0                    \
+          -DKOMPUTE_OPT_REPO_SUBMODULE_BUILD=1       \
+          -DKOMPUTE_OPT_ENABLE_SPDLOG=1
 ```

 You can pass the following optional parameters based on your desired configuration:
--- a/examples/logistic_regression/src/Main.cpp
+++ b/examples/logistic_regression/src/Main.cpp
@ -17,19 +17,19 @@ int main()

    kp::Manager mgr;

-    std::shared_ptr<kp::Tensor> xI = mgr.tensor({ 0, 1, 1, 1, 1 });
-    std::shared_ptr<kp::Tensor> xJ = mgr.tensor({ 0, 0, 0, 1, 1 });
+    auto xI = mgr.tensor({ 0, 1, 1, 1, 1 });
+    auto xJ = mgr.tensor({ 0, 0, 0, 1, 1 });

-    std::shared_ptr<kp::Tensor> y = mgr.tensor({ 0, 0, 0, 1, 1 });
+    auto y = mgr.tensor({ 0, 0, 0, 1, 1 });

-    std::shared_ptr<kp::Tensor> wIn = mgr.tensor({ 0.001, 0.001 });
-    std::shared_ptr<kp::Tensor> wOutI = mgr.tensor({ 0, 0, 0, 0, 0 });
-    std::shared_ptr<kp::Tensor> wOutJ = mgr.tensor({ 0, 0, 0, 0, 0 });
+    auto wIn = mgr.tensor({ 0.001, 0.001 });
+    auto wOutI = mgr.tensor({ 0, 0, 0, 0, 0 });
+    auto wOutJ = mgr.tensor({ 0, 0, 0, 0, 0 });

-    std::shared_ptr<kp::Tensor> bIn = mgr.tensor({ 0 });
-    std::shared_ptr<kp::Tensor> bOut = mgr.tensor({ 0, 0, 0, 0, 0 });
+    auto bIn = mgr.tensor({ 0 });
+    auto bOut = mgr.tensor({ 0, 0, 0, 0, 0 });

-    std::shared_ptr<kp::Tensor> lOut = mgr.tensor({ 0, 0, 0, 0, 0 });
+    auto lOut = mgr.tensor({ 0, 0, 0, 0, 0 });

    std::vector<std::shared_ptr<kp::Tensor>> params = { xI,  xJ,    y,
                                                        wIn, wOutI, wOutJ,
@ -40,7 +40,8 @@ int main()
                (uint32_t*)(kp::shader_data::shaders_glsl_logisticregression_comp_spv
                    + kp::shader_data::shaders_glsl_logisticregression_comp_spv_len));

-    std::shared_ptr<kp::Algorithm> algo = mgr.algorithm(params, spirv);
+    std::shared_ptr<kp::Algorithm> algo = mgr.algorithm(
+            params, spirv, kp::Workgroup({ 5 }), kp::Constants({ 5.0 }));

    mgr.sequence()->eval<kp::OpTensorSyncDevice>(params);

--- a/python/src/docstrings.hpp
+++ b/python/src/docstrings.hpp
@ -247,10 +247,16 @@ static const char *__doc_kp_Manager_sequence =
 R"doc(Create a managed sequence that will be destroyed by this manager if it
 hasn't been destroyed by its reference count going to zero.

-@param queueIndex The queue to use from the available queues @returns
-Shared pointer with initialised sequence)doc";
+@param queueIndex The queue to use from the available queues @param
+nrOfTimestamps The maximum number of timestamps to allocate. If zero
+(default), disables latching of timestamps. @returns Shared pointer
+with initialised sequence)doc";

-static const char *__doc_kp_Manager_tensor =
+static const char *__doc_kp_Manager_tensor = R"doc()doc";
+
+static const char *__doc_kp_Manager_tensor_2 = R"doc()doc";
+
+static const char *__doc_kp_Manager_tensorT =
 R"doc(Create a managed tensor that will be destroyed by this manager if it
 hasn't been destroyed by its reference count going to zero.

@ -264,18 +270,26 @@ of algorithm and parameter components which can be used with shaders.
 By default it enables the user to provide a dynamic number of tensors
 which are then passed as inputs.)doc";

-static const char *__doc_kp_OpAlgoDispatch_OpAlgoDispatch = R"doc()doc";
+static const char *__doc_kp_OpAlgoDispatch_OpAlgoDispatch =
+R"doc(Constructor that stores the algorithm to use as well as the relevant
+push constants to override when recording.
+
+@param algorithm The algorithm object to use for dispatch @param
+pushConstants The push constants to use for override)doc";

 static const char *__doc_kp_OpAlgoDispatch_mAlgorithm = R"doc()doc";

 static const char *__doc_kp_OpAlgoDispatch_mPushConstants = R"doc()doc";

 static const char *__doc_kp_OpAlgoDispatch_postEval =
-R"doc(Executes after the recorded commands are submitted, and performs a
-copy of the GPU Device memory into the staging buffer so the output
-data can be retrieved.)doc";
+R"doc(Does not perform any postEval commands.

-static const char *__doc_kp_OpAlgoDispatch_preEval = R"doc(Does not perform any preEval commands.)doc";
+@param commandBuffer The command buffer to record the command into.)doc";
+
+static const char *__doc_kp_OpAlgoDispatch_preEval =
+R"doc(Does not perform any preEval commands.
+
+@param commandBuffer The command buffer to record the command into.)doc";

 static const char *__doc_kp_OpAlgoDispatch_record =
 R"doc(This records the commands that are to be sent to the GPU. This
@ -283,7 +297,9 @@ includes the barriers that ensure the memory has been copied before
 going in and out of the shader, as well as the dispatch operation that
 sends the shader processing to the gpu. This function also records the
 GPU memory copy of the output data for the staging buffer so it can be
-read by the host.)doc";
+read by the host.
+
+@param commandBuffer The command buffer to record the command into.)doc";

 static const char *__doc_kp_OpBase =
 R"doc(Base Operation which provides the high level interface that Kompute
@ -299,7 +315,9 @@ the commands to the GPU for processing, and can be used to perform any
 tear-down steps required as the computation iteration finishes. It's
 worth noting that there are situations where eval can be called
 multiple times, so the resources that are destroyed should not require
-a re-init unless explicitly provided by the user.)doc";
+a re-init unless explicitly provided by the user.
+
+@param commandBuffer The command buffer to record the command into.)doc";

 static const char *__doc_kp_OpBase_preEval =
 R"doc(Pre eval is called before the Sequence has called eval and submitted
@ -307,12 +325,16 @@ the commands to the GPU for processing, and can be used to perform any
 per-eval setup steps required as the computation iteration begins.
 It's worth noting that there are situations where eval can be called
 multiple times, so the resources that are created should be idempotent
-in case it's called multiple times in a row.)doc";
+in case it's called multiple times in a row.
+
+@param commandBuffer The command buffer to record the command into.)doc";

 static const char *__doc_kp_OpBase_record =
 R"doc(The record function is intended to only send a record command or run
 commands that are expected to record operations that are to be
-submitted as a batch into the GPU.)doc";
+submitted as a batch into the GPU.
+
+@param commandBuffer The command buffer to record the command into.)doc";

 static const char *__doc_kp_OpMult =
 R"doc(Operation that performs multiplication on two tensors and outpus on
@ -323,12 +345,9 @@ R"doc(Default constructor with parameters that provides the bare minimum
 requirements for the operations to be able to create and manage their
 sub-components.

-@param physicalDevice Vulkan physical device used to find device
-queues @param device Vulkan logical device for passing to Algorithm
-@param commandBuffer Vulkan Command Buffer to record commands into
@param tensors Tensors that are to be used in this operation @param
-komputeWorkgroup Optional parameter to specify the layout for
-processing)doc";
+algorithm An algorithm that will be overridden with the OpMult shader
+data and the tensors provided which are expected to be 3)doc";

 static const char *__doc_kp_OpTensorCopy =
 R"doc(Operation that copies the data from the first tensor to the rest of
@ -340,84 +359,95 @@ static const char *__doc_kp_OpTensorCopy_OpTensorCopy =
 R"doc(Default constructor with parameters that provides the core vulkan
 resources and the tensors that will be used in the operation.

-@param physicalDevice Vulkan physical device used to find device
-queues @param device Vulkan logical device for passing to Algorithm
-@param commandBuffer Vulkan Command Buffer to record commands into
@param tensors Tensors that will be used to create in operation.)doc";

 static const char *__doc_kp_OpTensorCopy_mTensors = R"doc()doc";

 static const char *__doc_kp_OpTensorCopy_postEval =
 R"doc(Copies the local vectors for all the tensors to sync the data with the
-gpu.)doc";
+gpu.

-static const char *__doc_kp_OpTensorCopy_preEval = R"doc(Does not perform any preEval commands.)doc";
+@param commandBuffer The command buffer to record the command into.)doc";
+
+static const char *__doc_kp_OpTensorCopy_preEval =
+R"doc(Does not perform any preEval commands.
+
+@param commandBuffer The command buffer to record the command into.)doc";

 static const char *__doc_kp_OpTensorCopy_record =
 R"doc(Records the copy commands from the first tensor into all the other
-tensors provided. Also optionally records a barrier.)doc";
+tensors provided. Also optionally records a barrier.
+
+@param commandBuffer The command buffer to record the command into.)doc";

 static const char *__doc_kp_OpTensorSyncDevice =
 R"doc(Operation that syncs tensor's device by mapping local data into the
 device memory. For TensorTypes::eDevice it will use a record operation
 for the memory to be syncd into GPU memory which means that the
 operation will be done in sync with GPU commands. For
-TensorTypes::eStaging it will only map the data into host memory which
+TensorTypes::eHost it will only map the data into host memory which
 will happen during preEval before the recorded commands are
-dispatched. This operation won't have any effect on
-TensorTypes::eStaging.)doc";
+dispatched.)doc";

 static const char *__doc_kp_OpTensorSyncDevice_OpTensorSyncDevice =
 R"doc(Default constructor with parameters that provides the core vulkan
 resources and the tensors that will be used in the operation. The
 tensos provided cannot be of type TensorTypes::eStorage.

-@param physicalDevice Vulkan physical device used to find device
-queues @param device Vulkan logical device for passing to Algorithm
-@param commandBuffer Vulkan Command Buffer to record commands into
@param tensors Tensors that will be used to create in operation.)doc";

 static const char *__doc_kp_OpTensorSyncDevice_mTensors = R"doc()doc";

-static const char *__doc_kp_OpTensorSyncDevice_postEval = R"doc(Does not perform any postEval commands.)doc";
+static const char *__doc_kp_OpTensorSyncDevice_postEval =
+R"doc(Does not perform any postEval commands.

-static const char *__doc_kp_OpTensorSyncDevice_preEval = R"doc(Does not perform any preEval commands.)doc";
+@param commandBuffer The command buffer to record the command into.)doc";
+
+static const char *__doc_kp_OpTensorSyncDevice_preEval =
+R"doc(Does not perform any preEval commands.
+
+@param commandBuffer The command buffer to record the command into.)doc";

 static const char *__doc_kp_OpTensorSyncDevice_record =
 R"doc(For device tensors, it records the copy command for the tensor to copy
-the data from its staging to device memory.)doc";
+the data from its staging to device memory.
+
+@param commandBuffer The command buffer to record the command into.)doc";

 static const char *__doc_kp_OpTensorSyncLocal =
 R"doc(Operation that syncs tensor's local memory by mapping device data into
 the local CPU memory. For TensorTypes::eDevice it will use a record
 operation for the memory to be syncd into GPU memory which means that
 the operation will be done in sync with GPU commands. For
-TensorTypes::eStaging it will only map the data into host memory which
+TensorTypes::eHost it will only map the data into host memory which
 will happen during preEval before the recorded commands are
-dispatched. This operation won't have any effect on
-TensorTypes::eStaging.)doc";
+dispatched.)doc";

 static const char *__doc_kp_OpTensorSyncLocal_OpTensorSyncLocal =
 R"doc(Default constructor with parameters that provides the core vulkan
 resources and the tensors that will be used in the operation. The
 tensors provided cannot be of type TensorTypes::eStorage.

-@param physicalDevice Vulkan physical device used to find device
-queues @param device Vulkan logical device for passing to Algorithm
-@param commandBuffer Vulkan Command Buffer to record commands into
@param tensors Tensors that will be used to create in operation.)doc";

 static const char *__doc_kp_OpTensorSyncLocal_mTensors = R"doc()doc";

 static const char *__doc_kp_OpTensorSyncLocal_postEval =
 R"doc(For host tensors it performs the map command from the host memory into
-local memory.)doc";
+local memory.

-static const char *__doc_kp_OpTensorSyncLocal_preEval = R"doc(Does not perform any preEval commands.)doc";
+@param commandBuffer The command buffer to record the command into.)doc";
+
+static const char *__doc_kp_OpTensorSyncLocal_preEval =
+R"doc(Does not perform any preEval commands.
+
+@param commandBuffer The command buffer to record the command into.)doc";

 static const char *__doc_kp_OpTensorSyncLocal_record =
 R"doc(For device tensors, it records the copy command for the tensor to copy
-the data from its device to staging memory.)doc";
+the data from its device to staging memory.
+
+@param commandBuffer The command buffer to record the command into.)doc";

 static const char *__doc_kp_Sequence = R"doc(Container of operations that can be sent to GPU as batch)doc";

@ -427,7 +457,8 @@ generate all dependent resources.

@param physicalDevice Vulkan physical device @param device Vulkan
 logical device @param computeQueue Vulkan compute queue @param
-queueIndex Vulkan compute queue index in device)doc";
+queueIndex Vulkan compute queue index in device @param totalTimestamps
+Maximum number of timestamps to allocate)doc";

 static const char *__doc_kp_Sequence_begin =
 R"doc(Begins recording commands for commands to be submitted into the
@ -443,6 +474,8 @@ static const char *__doc_kp_Sequence_createCommandBuffer = R"doc()doc";

 static const char *__doc_kp_Sequence_createCommandPool = R"doc()doc";

+static const char *__doc_kp_Sequence_createTimestampQueryPool = R"doc()doc";
+
 static const char *__doc_kp_Sequence_destroy =
 R"doc(Destroys and frees the GPU resources which include the buffer and
 memory and sets the sequence as init=False.)doc";
@ -528,6 +561,10 @@ finishes, it runs the postEval of all operations.
@param waitFor Number of milliseconds to wait before timing out.
@return shared_ptr<Sequence> of the Sequence class itself)doc";

+static const char *__doc_kp_Sequence_getTimestamps =
+R"doc(Return the timestamps that were latched at the beginning and after
+each operation during the last eval() call.)doc";
+
 static const char *__doc_kp_Sequence_isInit =
 R"doc(Returns true if the sequence has been initialised, and it's based on
 the GPU resources being refrenced.
@ -607,9 +644,11 @@ R"doc(Clears command buffer and triggers re-record of all the current
 operations saved, which is useful if the underlying kp::Tensors or
 kp::Algorithms are modified and need to be re-recorded.)doc";

+static const char *__doc_kp_Sequence_timestampQueryPool = R"doc()doc";
+
 static const char *__doc_kp_Shader = R"doc(Shader utily class with functions to compile and process glsl files.)doc";

-static const char *__doc_kp_Shader_compile_source =
+static const char *__doc_kp_Shader_compileSource =
 R"doc(Compile a single glslang source from string value. Currently this
 function uses the glslang C++ interface which is not thread safe so
 this funciton should not be called from multiple threads concurrently.
@ -622,7 +661,7 @@ List of pairs containing key value definitions @param resourcesLimit A
 list that contains the resource limits for the GLSL compiler @return
 The compiled SPIR-V binary in unsigned int32 format)doc";

-static const char *__doc_kp_Shader_compile_sources =
+static const char *__doc_kp_Shader_compileSources =
 R"doc(Compile multiple sources with optional filenames. Currently this
 function uses the glslang C++ interface which is not thread safe so
 this funciton should not be called from multiple threads concurrently.
@ -644,14 +683,42 @@ across GPUs. Each tensor would have a respective Vulkan memory and
 buffer, which would be used to store their respective data. The
 tensors can be used for GPU data storage or transfer.)doc";

-static const char *__doc_kp_Tensor_Tensor =
-R"doc(Default constructor with data provided which would be used to create
-the respective vulkan buffer and memory.
+static const char *__doc_kp_TensorT = R"doc()doc";

+static const char *__doc_kp_TensorT_TensorT = R"doc()doc";
+
+static const char *__doc_kp_TensorT_data = R"doc()doc";
+
+static const char *__doc_kp_TensorT_dataType = R"doc()doc";
+
+static const char *__doc_kp_TensorT_operator_array = R"doc()doc";
+
+static const char *__doc_kp_TensorT_setData = R"doc()doc";
+
+static const char *__doc_kp_TensorT_vector = R"doc()doc";
+
+static const char *__doc_kp_Tensor_Tensor =
+R"doc(Constructor with data provided which would be used to create the
+respective vulkan buffer and memory.
+
+@param physicalDevice The physical device to use to fetch properties
+@param device The device to use to create the buffer and memory from
@param data Non-zero-sized vector of data that will be used by the
-tensor @param tensorType Type for the tensor which is of type
+tensor @param tensorTypes Type for the tensor which is of type
 TensorTypes)doc";

+static const char *__doc_kp_Tensor_TensorDataTypes = R"doc()doc";
+
+static const char *__doc_kp_Tensor_TensorDataTypes_eBool = R"doc()doc";
+
+static const char *__doc_kp_Tensor_TensorDataTypes_eDouble = R"doc()doc";
+
+static const char *__doc_kp_Tensor_TensorDataTypes_eFloat = R"doc()doc";
+
+static const char *__doc_kp_Tensor_TensorDataTypes_eInt = R"doc()doc";
+
+static const char *__doc_kp_Tensor_TensorDataTypes_eUnsignedInt = R"doc()doc";
+
 static const char *__doc_kp_Tensor_TensorTypes =
 R"doc(Type for tensors created: Device allows memory to be transferred from
 staging buffers. Staging are host memory visible. Storage are device
@ -677,13 +744,14 @@ without exposing it.

 static const char *__doc_kp_Tensor_createBuffer = R"doc()doc";

-static const char *__doc_kp_Tensor_data =
-R"doc(Returns the vector of data currently contained by the Tensor. It is
-important to ensure that there is no out-of-sync data with the GPU
-memory.
+static const char *__doc_kp_Tensor_data = R"doc()doc";

-@return Reference to vector of elements representing the data in the
-tensor.)doc";
+static const char *__doc_kp_Tensor_dataType =
+R"doc(Retrieve the underlying data type of the Tensor
+
+@return Data type of tensor of type kp::Tensor::TensorDataTypes)doc";
+
+static const char *__doc_kp_Tensor_dataTypeMemorySize = R"doc()doc";

 static const char *__doc_kp_Tensor_destroy =
 R"doc(Destroys and frees the GPU resources which include the buffer and
@ -697,9 +765,15 @@ static const char *__doc_kp_Tensor_getStagingBufferUsageFlags = R"doc()doc";

 static const char *__doc_kp_Tensor_getStagingMemoryPropertyFlags = R"doc()doc";

-static const char *__doc_kp_Tensor_isInit = R"doc()doc";
+static const char *__doc_kp_Tensor_isInit =
+R"doc(Check whether tensor is initialized based on the created gpu
+resources.

-static const char *__doc_kp_Tensor_mData = R"doc()doc";
+@returns Boolean stating whether tensor is initialized)doc";
+
+static const char *__doc_kp_Tensor_mDataType = R"doc()doc";
+
+static const char *__doc_kp_Tensor_mDataTypeMemorySize = R"doc()doc";

 static const char *__doc_kp_Tensor_mDevice = R"doc()doc";

@ -717,36 +791,28 @@ static const char *__doc_kp_Tensor_mPrimaryBuffer = R"doc()doc";

 static const char *__doc_kp_Tensor_mPrimaryMemory = R"doc()doc";

+static const char *__doc_kp_Tensor_mRawData = R"doc()doc";
+
+static const char *__doc_kp_Tensor_mSize = R"doc()doc";
+
 static const char *__doc_kp_Tensor_mStagingBuffer = R"doc()doc";

 static const char *__doc_kp_Tensor_mStagingMemory = R"doc()doc";

 static const char *__doc_kp_Tensor_mTensorType = R"doc()doc";

-static const char *__doc_kp_Tensor_mapDataFromHostMemory =
-R"doc(Maps data from the Host Visible GPU memory into the data vector. It
-requires the Tensor to be of staging type for it to work.)doc";
-
-static const char *__doc_kp_Tensor_mapDataIntoHostMemory =
-R"doc(Maps data from the data vector into the Host Visible GPU memory. It
-requires the tensor to be of staging type for it to work.)doc";
+static const char *__doc_kp_Tensor_mapRawData = R"doc()doc";

 static const char *__doc_kp_Tensor_memorySize = R"doc()doc";

-static const char *__doc_kp_Tensor_operator_array =
-R"doc(Overrides the subscript operator to expose the underlying data's
-subscript operator which in this case would be its underlying
-vector's.
-
-@param i The index where the element will be returned from. @return
-Returns the element in the position requested.)doc";
+static const char *__doc_kp_Tensor_rawData = R"doc()doc";

 static const char *__doc_kp_Tensor_rebuild =
-R"doc(Initialiser which calls the initialisation for all the respective
-tensors as well as creates the respective staging tensors. The staging
-tensors would only be created for the tensors of type
-TensorType::eDevice as otherwise there is no need to copy from host
-memory.)doc";
+R"doc(Function to trigger reinitialisation of the tensor buffer and memory
+with new data as well as new potential device type.
+
+@param data Vector of data to use to initialise vector from @param
+tensorType The type to use for the tensor)doc";

 static const char *__doc_kp_Tensor_recordBufferMemoryBarrier =
 R"doc(Records the buffer memory barrier into the command buffer which
@ -788,7 +854,7 @@ would only be relevant for kp::Tensors of type eDevice.
@param createBarrier Whether to create a barrier that ensures the data
 is copied before further operations. Default is true.)doc";

-static const char *__doc_kp_Tensor_setData =
+static const char *__doc_kp_Tensor_setRawData =
 R"doc(Sets / resets the vector data of the tensor. This function does not
 perform any copies into GPU memory and is only performed on the host.)doc";

@ -803,6 +869,10 @@ R"doc(Retrieve the tensor type of the Tensor

@return Tensor type of tensor)doc";

+static const char *__doc_kp_Tensor_unmapRawData = R"doc()doc";
+
+static const char *__doc_kp_Tensor_vector = R"doc()doc";
+
 #if defined(__GNUG__)
 #pragma GCC diagnostic pop
 #endif
--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@ -26,9 +26,9 @@ PYBIND11_MODULE(kp, m) {
    py::module_ np = py::module_::import("numpy");

    py::enum_<kp::Tensor::TensorTypes>(m, "TensorTypes")
-        .value("device", kp::Tensor::TensorTypes::eDevice, "Tensor holding data in GPU memory.")
-        .value("host", kp::Tensor::TensorTypes::eHost, "Tensor used for CPU visible GPU data.")
-        .value("storage", kp::Tensor::TensorTypes::eStorage, "Tensor with host visible gpu memory.")
+        .value("device", kp::Tensor::TensorTypes::eDevice, DOC(kp, Tensor, TensorTypes, eDevice))
+        .value("host", kp::Tensor::TensorTypes::eHost, DOC(kp, Tensor, TensorTypes, eHost))
+        .value("storage", kp::Tensor::TensorTypes::eStorage, DOC(kp, Tensor, TensorTypes, eStorage))
        .export_values();

 #if !defined(KOMPUTE_DISABLE_SHADER_UTILS) || !KOMPUTE_DISABLE_SHADER_UTILS
@ -37,119 +37,168 @@ PYBIND11_MODULE(kp, m) {
                                    const std::string& source,
                                    const std::string& entryPoint,
                                    const std::vector<std::pair<std::string,std::string>>& definitions) {
-                std::vector<uint32_t> spirv = kp::Shader::compile_source(source, entryPoint, definitions);
+                std::vector<uint32_t> spirv = kp::Shader::compileSource(source, entryPoint, definitions);
                return py::bytes((const char*)spirv.data(), spirv.size() * sizeof(uint32_t));
            },
-            "Compiles string source provided and returns the value in bytes",
-            py::arg("source"), py::arg("entryPoint") = "main", py::arg("definitions") = std::vector<std::pair<std::string,std::string>>() )
+            DOC(kp, Shader, compileSource),
+            py::arg("source"),
+            py::arg("entryPoint") = "main",
+            py::arg("definitions") = std::vector<std::pair<std::string,std::string>>() )
        .def_static("compile_sources", [](
                                    const std::vector<std::string>& source,
                                    const std::vector<std::string>& files,
                                    const std::string& entryPoint,
                                    const std::vector<std::pair<std::string,std::string>>& definitions) {
-                std::vector<uint32_t> spirv = kp::Shader::compile_sources(source, files, entryPoint, definitions);
+                std::vector<uint32_t> spirv = kp::Shader::compileSources(source, files, entryPoint, definitions);
                return py::bytes((const char*)spirv.data(), spirv.size() * sizeof(uint32_t));
            },
-            "Compiles sources provided with file names and returns the value in bytes",
-            py::arg("sources"), py::arg("files") = std::vector<std::string>(), py::arg("entryPoint") = "main", py::arg("definitions") = std::vector<std::pair<std::string,std::string>>() );
+            DOC(kp, Shader, compileSources),
+            py::arg("sources"),
+            py::arg("files") = std::vector<std::string>(),
+            py::arg("entryPoint") = "main",
+            py::arg("definitions") = std::vector<std::pair<std::string,std::string>>() );
 #endif // KOMPUTE_DISABLE_SHADER_UTILS

-    py::class_<kp::OpBase, std::shared_ptr<kp::OpBase>>(m, "OpBase");
+    py::class_<kp::OpBase, std::shared_ptr<kp::OpBase>>(m, "OpBase", DOC(kp, OpBase));

-    py::class_<kp::OpTensorSyncDevice, std::shared_ptr<kp::OpTensorSyncDevice>>(m, "OpTensorSyncDevice", py::base<kp::OpBase>())
-        .def(py::init<const std::vector<std::shared_ptr<kp::Tensor>>&>());
+    py::class_<kp::OpTensorSyncDevice, std::shared_ptr<kp::OpTensorSyncDevice>>(
+            m, "OpTensorSyncDevice", py::base<kp::OpBase>(), DOC(kp, OpTensorSyncDevice))
+        .def(py::init<const std::vector<std::shared_ptr<kp::Tensor>>&>(), DOC(kp, OpTensorSyncDevice, OpTensorSyncDevice));

-    py::class_<kp::OpTensorSyncLocal, std::shared_ptr<kp::OpTensorSyncLocal>>(m, "OpTensorSyncLocal", py::base<kp::OpBase>())
-        .def(py::init<const std::vector<std::shared_ptr<kp::Tensor>>&>());
+    py::class_<kp::OpTensorSyncLocal, std::shared_ptr<kp::OpTensorSyncLocal>>(
+            m, "OpTensorSyncLocal", py::base<kp::OpBase>(), DOC(kp, OpTensorSyncLocal))
+        .def(py::init<const std::vector<std::shared_ptr<kp::Tensor>>&>(), DOC(kp, OpTensorSyncLocal, OpTensorSyncLocal));

-    py::class_<kp::OpTensorCopy, std::shared_ptr<kp::OpTensorCopy>>(m, "OpTensorCopy", py::base<kp::OpBase>())
-        .def(py::init<const std::vector<std::shared_ptr<kp::Tensor>>&>());
+    py::class_<kp::OpTensorCopy, std::shared_ptr<kp::OpTensorCopy>>(
+            m, "OpTensorCopy", py::base<kp::OpBase>(), DOC(kp, OpTensorCopy))
+        .def(py::init<const std::vector<std::shared_ptr<kp::Tensor>>&>(), DOC(kp, OpTensorCopy, OpTensorCopy));

-    py::class_<kp::OpAlgoDispatch, std::shared_ptr<kp::OpAlgoDispatch>>(m, "OpAlgoDispatch", py::base<kp::OpBase>())
+    py::class_<kp::OpAlgoDispatch, std::shared_ptr<kp::OpAlgoDispatch>>(
+            m, "OpAlgoDispatch", py::base<kp::OpBase>(), DOC(kp, OpAlgoDispatch))
        .def(py::init<const std::shared_ptr<kp::Algorithm>&,const kp::Constants&>(),
+                DOC(kp, OpAlgoDispatch, OpAlgoDispatch),
                py::arg("algorithm"), py::arg("push_consts") = kp::Constants());

-    py::class_<kp::OpMult, std::shared_ptr<kp::OpMult>>(m, "OpMult", py::base<kp::OpBase>())
-        .def(py::init<const std::vector<std::shared_ptr<kp::Tensor>>&,const std::shared_ptr<kp::Algorithm>&>());
+    py::class_<kp::OpMult, std::shared_ptr<kp::OpMult>>(
+            m, "OpMult", py::base<kp::OpBase>(), DOC(kp, OpMult))
+        .def(py::init<const std::vector<std::shared_ptr<kp::Tensor>>&,const std::shared_ptr<kp::Algorithm>&>(),
+                DOC(kp, OpMult, OpMult));

-    py::class_<kp::Algorithm, std::shared_ptr<kp::Algorithm>>(m, "Algorithm")
-        .def("get_tensors", &kp::Algorithm::getTensors)
-        .def("destroy", &kp::Algorithm::destroy)
-        .def("get_spec_consts", &kp::Algorithm::getSpecializationConstants)
-        .def("is_init", &kp::Algorithm::isInit);
+    // Class-level docstring: use DOC(kp, Algorithm) like the other py::class_ bindings (not the constructor doc)
+    py::class_<kp::Algorithm, std::shared_ptr<kp::Algorithm>>(m, "Algorithm", DOC(kp, Algorithm))
+        .def("get_tensors", &kp::Algorithm::getTensors, DOC(kp, Algorithm, getTensors))
+        .def("destroy", &kp::Algorithm::destroy, DOC(kp, Algorithm, destroy))
+        .def("get_spec_consts", &kp::Algorithm::getSpecializationConstants, DOC(kp, Algorithm, getSpecializationConstants))
+        .def("is_init", &kp::Algorithm::isInit, DOC(kp, Algorithm, isInit));

    py::class_<kp::Tensor, std::shared_ptr<kp::Tensor>>(m, "Tensor", DOC(kp, Tensor))
        .def("data", [](kp::Tensor& self) {
-                return py::array(self.data().size(), self.data().data());
-            }, "Returns stored data as a new numpy array.")
-        .def("__getitem__", [](kp::Tensor &self, size_t index) -> float { return self.data()[index]; },
-                "When only an index is necessary")
-        .def("__setitem__", [](kp::Tensor &self, size_t index, float value) {
-                self.data()[index] = value; })
-        .def("set_data", [np](kp::Tensor &self, const py::array_t<float> data){
-                const py::array_t<float> flatdata = np.attr("ravel")(data);
-                const py::buffer_info info        = flatdata.request();
-                const float* ptr                  = (float*) info.ptr;
-                self.setData(std::vector<float>(ptr, ptr+flatdata.size()));
-            }, "Overrides the data in the local Tensor memory.")
-        .def("__iter__", [](kp::Tensor &self) {
-                return py::make_iterator(self.data().begin(), self.data().end());
-            }, py::keep_alive<0, 1>(), // Required to keep alive iterator while exists
-            "Iterator to enable looping within data structure as required.")
-        .def("__contains__", [](kp::Tensor &self, float v) {
-                for (size_t i = 0; i < self.data().size(); ++i) {
-                    if (v == self.data()[i]) {
-                            return true;
-                        }
-                    }
-                return false;
-            })
-        .def("__reversed__", [](kp::Tensor &self) { 
-                size_t size = self.data().size();
-                std::vector<float> reversed(size);
-                for (size_t i = 0; i < size; i++) {
-                    reversed[size - i - 1] = self.data()[i];
+                // Non-owning container exposing the underlying pointer
+                py::str dummyDataOwner; // Explicitly request data to not be owned by np
+                switch (self.dataType()) {
+                case kp::Tensor::TensorDataTypes::eFloat:
+                    return py::array(self.size(), self.data<float>(), dummyDataOwner);
+                case kp::Tensor::TensorDataTypes::eUnsignedInt:
+                    return py::array(self.size(), self.data<uint32_t>(), dummyDataOwner);
+                case kp::Tensor::TensorDataTypes::eInt:
+                    return py::array(self.size(), self.data<int32_t>(), dummyDataOwner);
+                case kp::Tensor::TensorDataTypes::eDouble:
+                    return py::array(self.size(), self.data<double>(), dummyDataOwner);
+                case kp::Tensor::TensorDataTypes::eBool:
+                    return py::array(self.size(), self.data<bool>(), dummyDataOwner);
+                default:
+                    throw std::runtime_error("Kompute Python data type not supported");
                }
-                return reversed;
-            })
-        .def("size", &kp::Tensor::size, "Retrieves the size of the Tensor data as per the local Tensor memory.")
-        .def("__len__", &kp::Tensor::size, "Retrieves the size of the Tensor data as per the local Tensor memory.")
-        .def("tensor_type", &kp::Tensor::tensorType, "Retreves the memory type of the tensor.")
-        .def("is_init", &kp::Tensor::isInit, "Checks whether the tensor GPU memory has been initialised.")
-        .def("destroy", &kp::Tensor::destroy, "Destroy tensor GPU resources.");
+            }, DOC(kp, Tensor, data))
+        .def("size", &kp::Tensor::size, DOC(kp, Tensor, size))
+        .def("__len__", &kp::Tensor::size, DOC(kp, Tensor, size))
+        .def("tensor_type", &kp::Tensor::tensorType, DOC(kp, Tensor, tensorType))
+        .def("data_type", &kp::Tensor::dataType, DOC(kp, Tensor, dataType))
+        .def("is_init", &kp::Tensor::isInit, DOC(kp, Tensor, isInit))
+        .def("destroy", &kp::Tensor::destroy, DOC(kp, Tensor, destroy));

    py::class_<kp::Sequence, std::shared_ptr<kp::Sequence>>(m, "Sequence")
-        .def("record", [](kp::Sequence& self, std::shared_ptr<kp::OpBase> op) { return self.record(op); })
-        .def("eval", [](kp::Sequence& self) { return self.eval(); })
-        .def("eval", [](kp::Sequence& self, std::shared_ptr<kp::OpBase> op) { return self.eval(op); })
-        .def("eval_async", [](kp::Sequence& self) { return self.eval(); })
-        .def("eval_async", [](kp::Sequence& self, std::shared_ptr<kp::OpBase> op) { return self.evalAsync(op); })
-        .def("eval_await", [](kp::Sequence& self) { return self.evalAwait(); })
-        .def("eval_await", [](kp::Sequence& self, uint32_t wait) { return self.evalAwait(wait); })
-        .def("is_recording", &kp::Sequence::isRecording)
-        .def("is_running", &kp::Sequence::isRunning)
-        .def("is_init", &kp::Sequence::isInit)
-        .def("get_timestamps", &kp::Sequence::getTimestamps)
-        .def("clear", &kp::Sequence::clear)
-        .def("destroy", &kp::Sequence::destroy);
+        .def("record", [](kp::Sequence& self, std::shared_ptr<kp::OpBase> op) { return self.record(op); },
+                DOC(kp, Sequence, record))
+        .def("eval", [](kp::Sequence& self) { return self.eval(); },
+                DOC(kp, Sequence, eval))
+        .def("eval", [](kp::Sequence& self, std::shared_ptr<kp::OpBase> op) { return self.eval(op); },
+                DOC(kp, Sequence, eval_2))
+        .def("eval_async", [](kp::Sequence& self) { return self.eval(); },
+                DOC(kp, Sequence, evalAwait))
+        .def("eval_async", [](kp::Sequence& self, std::shared_ptr<kp::OpBase> op) { return self.evalAsync(op); },
+                DOC(kp, Sequence, evalAsync))
+        .def("eval_await", [](kp::Sequence& self) { return self.evalAwait(); },
+                DOC(kp, Sequence, evalAwait))
+        .def("eval_await", [](kp::Sequence& self, uint32_t wait) { return self.evalAwait(wait); },
+                DOC(kp, Sequence, evalAwait))
+        .def("is_recording", &kp::Sequence::isRecording,
+                DOC(kp, Sequence, isRecording))
+        .def("is_running", &kp::Sequence::isRunning,
+                DOC(kp, Sequence, isRunning))
+        .def("is_init", &kp::Sequence::isInit,
+                DOC(kp, Sequence, isInit))
+        .def("clear", &kp::Sequence::clear,
+                DOC(kp, Sequence, clear))
+        .def("rerecord", &kp::Sequence::rerecord,
+                DOC(kp, Sequence, rerecord))
+        .def("get_timestamps", &kp::Sequence::getTimestamps,
+            DOC(kp, Sequence, getTimestamps))
+        .def("destroy", &kp::Sequence::destroy,
+                DOC(kp, Sequence, destroy));

-    py::class_<kp::Manager, std::shared_ptr<kp::Manager>>(m, "Manager")
-        .def(py::init())
-        .def(py::init<uint32_t>())
+    py::class_<kp::Manager, std::shared_ptr<kp::Manager>>(m, "Manager", DOC(kp, Manager))
+        .def(py::init(), DOC(kp, Manager, Manager))
+        .def(py::init<uint32_t>(), DOC(kp, Manager, Manager_2))
        .def(py::init<uint32_t,const std::vector<uint32_t>&,const std::vector<std::string>&>(),
+                DOC(kp, Manager, Manager_2),
                py::arg("device") = 0,
                py::arg("family_queue_indices") = std::vector<uint32_t>(),
                py::arg("desired_extensions") = std::vector<std::string>())
-        .def("sequence", &kp::Manager::sequence, py::arg("queue_index") = 0, py::arg("total_timestamps") = 0)
+        .def("sequence", &kp::Manager::sequence, DOC(kp, Manager, sequence),
+                py::arg("queue_index") = 0, py::arg("total_timestamps") = 0)
        .def("tensor", [np](kp::Manager& self,
-                            const py::array_t<float> data,
+                            const py::array_t<float>& data,
                            kp::Tensor::TensorTypes tensor_type) {
-                const py::array_t<float> flatdata = np.attr("ravel")(data);
+                const py::array_t<float>& flatdata = np.attr("ravel")(data);
                const py::buffer_info info        = flatdata.request();
-                const float* ptr                  = (float*) info.ptr;
-                return self.tensor(std::vector<float>(ptr, ptr+flatdata.size()), tensor_type);
+                KP_LOG_DEBUG("Kompute Python Manager tensor() creating tensor float with data size {}", flatdata.size());
+                return self.tensor(
+                        info.ptr,
+                        flatdata.size(),
+                        sizeof(float),
+                        kp::Tensor::TensorDataTypes::eFloat,
+                        tensor_type);
            },
-            "Tensor initialisation function with data and tensor type",
+            DOC(kp, Manager, tensor),
+            py::arg("data"), py::arg("tensor_type") = kp::Tensor::TensorTypes::eDevice)
+        // Creates a tensor whose element type is selected from the numpy dtype of `data`.
+        // The array is flattened with numpy.ravel and its buffer pointer, element count and
+        // element size are handed to Manager::tensor; unsupported dtypes raise a runtime error.
+        .def("tensor_t", [np](kp::Manager& self,
+                            const py::array& data,
+                            kp::Tensor::TensorTypes tensor_type) {
+                // TODO: Support strides in numpy format
+                const py::array& flatdata = np.attr("ravel")(data);
+                const py::buffer_info info        = flatdata.request();
+                KP_LOG_DEBUG("Kompute Python Manager creating tensor_T with data size {} dtype {}",
+                        flatdata.size(), std::string(py::str(flatdata.dtype())));
+                // Compare against plain float/double: std::float_t and std::double_t may be wider
+                // types depending on FLT_EVAL_METHOD, which would not match the sizeof() passed below.
+                if (flatdata.dtype() == py::dtype::of<float>()) {
+                    return self.tensor(
+                            info.ptr, flatdata.size(), sizeof(float), kp::Tensor::TensorDataTypes::eFloat, tensor_type);
+                } else if (flatdata.dtype() == py::dtype::of<std::uint32_t>()) {
+                    return self.tensor(
+                            info.ptr, flatdata.size(), sizeof(uint32_t), kp::Tensor::TensorDataTypes::eUnsignedInt, tensor_type);
+                } else if (flatdata.dtype() == py::dtype::of<std::int32_t>()) {
+                    return self.tensor(
+                            info.ptr, flatdata.size(), sizeof(int32_t), kp::Tensor::TensorDataTypes::eInt, tensor_type);
+                } else if (flatdata.dtype() == py::dtype::of<double>()) {
+                    return self.tensor(
+                            info.ptr, flatdata.size(), sizeof(double), kp::Tensor::TensorDataTypes::eDouble, tensor_type);
+                } else if (flatdata.dtype() == py::dtype::of<bool>()) {
+                    return self.tensor(
+                            info.ptr, flatdata.size(), sizeof(bool), kp::Tensor::TensorDataTypes::eBool, tensor_type);
+                } else {
+                    throw std::runtime_error("Kompute Python no valid dtype supported");
+                }
+            },
+            DOC(kp, Manager, tensorT),
             py::arg("data"), py::arg("tensor_type") = kp::Tensor::TensorTypes::eDevice)
        .def("algorithm", [](kp::Manager& self,
                             const std::vector<std::shared_ptr<kp::Tensor>>& tensors,
@ -163,8 +212,12 @@ PYBIND11_MODULE(kp, m) {
                    std::vector<uint32_t> spirvVec((uint32_t*)data, (uint32_t*)(data + length));
                    return self.algorithm(tensors, spirvVec, workgroup, spec_consts, push_consts);
                },
-            "Algorithm initialisation function",
-            py::arg("tensors"), py::arg("spirv"), py::arg("workgroup") = kp::Workgroup(), py::arg("spec_consts") = kp::Constants(), py::arg("push_consts") = kp::Constants());
+            DOC(kp, Manager, algorithm),
+            py::arg("tensors"),
+            py::arg("spirv"),
+            py::arg("workgroup") = kp::Workgroup(),
+            py::arg("spec_consts") = kp::Constants(),
+            py::arg("push_consts") = kp::Constants());

 #ifdef VERSION_INFO
    m.attr("__version__") = VERSION_INFO;
--- a/python/test/test_array_multiplication.py
+++ b/python/test/test_array_multiplication.py
@ -9,9 +9,9 @@ def test_array_multiplication():
    mgr = kp.Manager()

    # 2. Create Kompute Tensors to hold data
-    tensor_in_a = mgr.tensor([2, 2, 2])
-    tensor_in_b = mgr.tensor([1, 2, 3])
-    tensor_out = mgr.tensor([0, 0, 0])
+    tensor_in_a = mgr.tensor(np.array([2, 2, 2]))
+    tensor_in_b = mgr.tensor(np.array([1, 2, 3]))
+    tensor_out = mgr.tensor(np.array([0, 0, 0]))

    params = [tensor_in_a, tensor_in_b, tensor_out]

--- a/python/test/test_kompute.py
+++ b/python/test/test_kompute.py
@ -9,35 +9,15 @@ DIRNAME = os.path.dirname(os.path.abspath(__file__))

 kp_log = logging.getLogger("kp")

-# TODO: Add example with file
-#def test_opalgobase_file():
-#    """
-#    Test basic OpMult operation
-#    """
-#
-#    tensor_in_a = kp.Tensor([2, 2, 2])
-#    tensor_in_b = kp.Tensor([1, 2, 3])
-#    tensor_out = kp.Tensor([0, 0, 0])
-#
-#    mgr = kp.Manager()
-#    mgr.rebuild([tensor_in_a, tensor_in_b, tensor_out])
-#
-#    shader_path = os.path.join(DIRNAME, "../../shaders/glsl/opmult.comp.spv")
-#
-#    mgr.eval_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shader_path)
-#
-#    mgr.eval_tensor_sync_local_def([tensor_out])
-#
-#    assert tensor_out.data() == [2.0, 4.0, 6.0]
-
 def test_end_to_end():

    mgr = kp.Manager()

    tensor_in_a = mgr.tensor([2, 2, 2])
    tensor_in_b = mgr.tensor([1, 2, 3])
-    tensor_out_a = mgr.tensor([0, 0, 0])
-    tensor_out_b = mgr.tensor([0, 0, 0])
+    # Explicit type constructor supports uint32, int32, double, float and bool
+    tensor_out_a = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))
+    tensor_out_b = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))

    params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b]

@ -49,8 +29,8 @@ def test_end_to_end():
        // The input tensors bind index is relative to index in parameter passed
        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
-        layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
-        layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; };
+        layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
+        layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };

        // Kompute supports push constants updated on dispatch
        layout(push_constant) uniform PushConstants {
@ -62,8 +42,8 @@ def test_end_to_end():

        void main() {
            uint index = gl_GlobalInvocationID.x;
-            out_a[index] += in_a[index] * in_b[index];
-            out_b[index] += const_one * push_const.val;
+            out_a[index] += uint( in_a[index] * in_b[index] );
+            out_b[index] += uint( const_one * push_const.val );
        }
    """

--- a/python/test/test_logistic_regression.py
+++ b/python/test/test_logistic_regression.py
@ -1,4 +1,5 @@
 import pyshader as ps
+import numpy as np
 import kp

 def test_logistic_regression():
@ -46,21 +47,21 @@ def test_logistic_regression():
    mgr = kp.Manager(0)

     # First we create input and output tensors for shader
-    tensor_x_i = mgr.tensor([0.0, 1.0, 1.0, 1.0, 1.0])
-    tensor_x_j = mgr.tensor([0.0, 0.0, 0.0, 1.0, 1.0])
+    tensor_x_i = mgr.tensor(np.array([0.0, 1.0, 1.0, 1.0, 1.0]))
+    tensor_x_j = mgr.tensor(np.array([0.0, 0.0, 0.0, 1.0, 1.0]))

-    tensor_y = mgr.tensor([0.0, 0.0, 0.0, 1.0, 1.0])
+    tensor_y = mgr.tensor(np.array([0.0, 0.0, 0.0, 1.0, 1.0]))

-    tensor_w_in = mgr.tensor([0.001, 0.001])
-    tensor_w_out_i = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0])
-    tensor_w_out_j = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0])
+    tensor_w_in = mgr.tensor(np.array([0.001, 0.001]))
+    tensor_w_out_i = mgr.tensor(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))
+    tensor_w_out_j = mgr.tensor(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))

-    tensor_b_in = mgr.tensor([0.0])
-    tensor_b_out = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0])
+    tensor_b_in = mgr.tensor(np.array([0.0]))
+    tensor_b_out = mgr.tensor(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))

-    tensor_l_out = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0])
+    tensor_l_out = mgr.tensor(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))

-    tensor_m = mgr.tensor([ tensor_y.size() ])
+    tensor_m = mgr.tensor(np.array([ tensor_y.size() ]))

    # We store them in an array for easier interaction
    params = [tensor_x_i, tensor_x_j, tensor_y, tensor_w_in, tensor_w_out_i,
@ -91,9 +92,9 @@ def test_logistic_regression():

        # Calculate the parameters based on the respective derivatives calculated
        for j_iter in range(tensor_b_out.size()):
-            tensor_w_in[0] -= learning_rate * tensor_w_out_i.data()[j_iter]
-            tensor_w_in[1] -= learning_rate * tensor_w_out_j.data()[j_iter]
-            tensor_b_in[0] -= learning_rate * tensor_b_out.data()[j_iter]
+            tensor_w_in.data()[0] -= learning_rate * tensor_w_out_i.data()[j_iter]
+            tensor_w_in.data()[1] -= learning_rate * tensor_w_out_j.data()[j_iter]
+            tensor_b_in.data()[0] -= learning_rate * tensor_b_out.data()[j_iter]

    assert tensor_w_in.data()[0] < 0.01
    assert tensor_w_in.data()[0] > 0.0
--- a/python/test/test_tensor_types.py
+++ b/python/test/test_tensor_types.py
@ -0,0 +1,206 @@
+import pyshader as ps
+import os
+import pytest
+import kp
+import numpy as np
+
+
+def test_type_float():
+
+    shader = """
+        #version 450
+        layout(set = 0, binding = 0) buffer tensorLhs {float valuesLhs[];};
+        layout(set = 0, binding = 1) buffer tensorRhs {float valuesRhs[];};
+        layout(set = 0, binding = 2) buffer tensorOutput { float valuesOutput[];};
+        layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+
+        void main()
+        {
+            uint index = gl_GlobalInvocationID.x;
+            valuesOutput[index] = valuesLhs[index] * valuesRhs[index];
+        }
+    """
+
+    spirv = kp.Shader.compile_source(shader)
+
+    arr_in_a = np.array([123., 153., 231.], dtype=np.float32)
+    arr_in_b = np.array([9482, 1208, 1238], dtype=np.float32)
+    arr_out = np.array([0, 0, 0], dtype=np.float32)
+
+    mgr = kp.Manager()
+
+    tensor_in_a = mgr.tensor(arr_in_a)
+    tensor_in_b = mgr.tensor(arr_in_b)
+    tensor_out = mgr.tensor(arr_out)
+
+    params = [tensor_in_a, tensor_in_b, tensor_out]
+
+    (mgr.sequence()
+        .record(kp.OpTensorSyncDevice(params))
+        .record(kp.OpAlgoDispatch(mgr.algorithm(params, spirv)))
+        .record(kp.OpTensorSyncLocal([tensor_out]))
+        .eval())
+
+    assert np.all(tensor_out.data() == arr_in_a * arr_in_b)
+
+
+def test_type_float_double_incorrect():
+    """
+    Check that mismatched tensor data types produce incorrect results.
+
+    The shader declares all three buffers as float, but the second input
+    tensor is created from a uint32 array (despite the "double" in the test
+    name). The test passes when every output element differs from the
+    NumPy product of the two input arrays.
+    """
+
+    shader = """
+        #version 450
+        layout(set = 0, binding = 0) buffer tensorLhs {float valuesLhs[];};
+        layout(set = 0, binding = 1) buffer tensorRhs {float valuesRhs[];};
+        layout(set = 0, binding = 2) buffer tensorOutput { float valuesOutput[];};
+        layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+
+        void main()
+        {
+            uint index = gl_GlobalInvocationID.x;
+            valuesOutput[index] = valuesLhs[index] * valuesRhs[index];
+        }
+    """
+
+    spirv = kp.Shader.compile_source(shader)
+
+    arr_in_a = np.array([123., 153., 231.], dtype=np.float32)
+    # Deliberately uint32 while the shader reads this buffer as float
+    arr_in_b = np.array([9482, 1208, 1238], dtype=np.uint32)
+    arr_out = np.array([0, 0, 0], dtype=np.float32)
+
+    mgr = kp.Manager()
+
+    tensor_in_a = mgr.tensor_t(arr_in_a)
+    tensor_in_b = mgr.tensor_t(arr_in_b)
+    tensor_out = mgr.tensor_t(arr_out)
+
+    params = [tensor_in_a, tensor_in_b, tensor_out]
+
+    (mgr.sequence()
+        .record(kp.OpTensorSyncDevice(params))
+        .record(kp.OpAlgoDispatch(mgr.algorithm(params, spirv)))
+        .record(kp.OpTensorSyncLocal([tensor_out]))
+        .eval())
+
+    # Expect a mismatch on every element
+    assert np.all(tensor_out.data() != arr_in_a * arr_in_b)
+
+# Default to "" so the check does not raise TypeError when VK_ICD_FILENAMES is unset
+@pytest.mark.skipif("swiftshader" in os.environ.get("VK_ICD_FILENAMES", ""),
+                    reason="Swiftshader doesn't support double")
+def test_type_double():
+    """
+    Multiply two float64 tensors element-wise in a shader that declares its
+    buffers as double, and compare the result with the NumPy product.
+
+    Skipped when the VK_ICD_FILENAMES environment variable mentions
+    swiftshader.
+    """
+
+    shader = """
+        #version 450
+        layout(set = 0, binding = 0) buffer tensorLhs { double valuesLhs[]; };
+        layout(set = 0, binding = 1) buffer tensorRhs { double valuesRhs[]; };
+        layout(set = 0, binding = 2) buffer tensorOutput { double valuesOutput[]; };
+        layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+
+        void main()
+        {
+            uint index = gl_GlobalInvocationID.x;
+            valuesOutput[index] = valuesLhs[index] * valuesRhs[index];
+        }
+    """
+
+    spirv = kp.Shader.compile_source(shader)
+
+    arr_in_a = np.array([123., 153., 231.], dtype=np.float64)
+    arr_in_b = np.array([9482, 1208, 1238], dtype=np.float64)
+    arr_out = np.array([0, 0, 0], dtype=np.float64)
+
+    mgr = kp.Manager()
+
+    tensor_in_a = mgr.tensor_t(arr_in_a)
+    tensor_in_b = mgr.tensor_t(arr_in_b)
+    tensor_out = mgr.tensor_t(arr_out)
+
+    params = [tensor_in_a, tensor_in_b, tensor_out]
+
+    (mgr.sequence()
+        .record(kp.OpTensorSyncDevice(params))
+        .record(kp.OpAlgoDispatch(mgr.algorithm(params, spirv)))
+        .record(kp.OpTensorSyncLocal([tensor_out]))
+        .eval())
+
+    print(f"Dtype value {tensor_out.data().dtype}")
+
+    assert np.all(tensor_out.data() == arr_in_a * arr_in_b)
+
+def test_type_int():
+
+    shader = """
+        #version 450
+        layout(set = 0, binding = 0) buffer tensorLhs { int valuesLhs[]; };
+        layout(set = 0, binding = 1) buffer tensorRhs { int valuesRhs[]; };
+        layout(set = 0, binding = 2) buffer tensorOutput { int valuesOutput[]; };
+        layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+
+        void main()
+        {
+            uint index = gl_GlobalInvocationID.x;
+            valuesOutput[index] = valuesLhs[index] * valuesRhs[index];
+        }
+    """
+
+    spirv = kp.Shader.compile_source(shader)
+
+    arr_in_a = np.array([123, 153, 231], dtype=np.int32)
+    arr_in_b = np.array([9482, 1208, 1238], dtype=np.int32)
+    arr_out = np.array([0, 0, 0], dtype=np.int32)
+
+    mgr = kp.Manager()
+
+    tensor_in_a = mgr.tensor_t(arr_in_a)
+    tensor_in_b = mgr.tensor_t(arr_in_b)
+    tensor_out = mgr.tensor_t(arr_out)
+
+    params = [tensor_in_a, tensor_in_b, tensor_out]
+
+    (mgr.sequence()
+        .record(kp.OpTensorSyncDevice(params))
+        .record(kp.OpAlgoDispatch(mgr.algorithm(params, spirv)))
+        .record(kp.OpTensorSyncLocal([tensor_out]))
+        .eval())
+
+    print(f"Dtype value {tensor_out.data().dtype}")
+
+    assert np.all(tensor_out.data() == arr_in_a * arr_in_b)
+
+def test_type_unsigned_int():
+
+    shader = """
+        #version 450
+        layout(set = 0, binding = 0) buffer tensorLhs { uint valuesLhs[]; };
+        layout(set = 0, binding = 1) buffer tensorRhs { uint valuesRhs[]; };
+        layout(set = 0, binding = 2) buffer tensorOutput { uint valuesOutput[]; };
+        layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+
+        void main()
+        {
+            uint index = gl_GlobalInvocationID.x;
+            valuesOutput[index] = valuesLhs[index] * valuesRhs[index];
+        }
+    """
+
+    spirv = kp.Shader.compile_source(shader)
+
+    arr_in_a = np.array([123, 153, 231], dtype=np.uint32)
+    arr_in_b = np.array([9482, 1208, 1238], dtype=np.uint32)
+    arr_out = np.array([0, 0, 0], dtype=np.uint32)
+
+    mgr = kp.Manager()
+
+    tensor_in_a = mgr.tensor_t(arr_in_a)
+    tensor_in_b = mgr.tensor_t(arr_in_b)
+    tensor_out = mgr.tensor_t(arr_out)
+
+    params = [tensor_in_a, tensor_in_b, tensor_out]
+
+    (mgr.sequence()
+        .record(kp.OpTensorSyncDevice(params))
+        .record(kp.OpAlgoDispatch(mgr.algorithm(params, spirv)))
+        .record(kp.OpTensorSyncLocal([tensor_out]))
+        .eval())
+
+    print(f"Dtype value {tensor_out.data().dtype}")
+
+    assert np.all(tensor_out.data() == arr_in_a * arr_in_b)
+
--- a/setup.py
+++ b/setup.py
@ -57,7 +57,7 @@ class CMakeBuild(build_ext):
        else:
            cmake_args += ['-DKOMPUTE_EXTRA_CXX_FLAGS="-fPIC"']
            cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
-            build_args += ['--', '-j2']
+            build_args += ['--', '-j']

        env = os.environ.copy()
        env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@ -762,7 +762,7 @@ class Shader
     * GLSL compiler
     * @return The compiled SPIR-V binary in unsigned int32 format
     */
-    static std::vector<uint32_t> compile_sources(
+    static std::vector<uint32_t> compileSources(
      const std::vector<std::string>& sources,
      const std::vector<std::string>& files = {},
      const std::string& entryPoint = "main",
@ -783,7 +783,7 @@ class Shader
     * GLSL compiler
     * @return The compiled SPIR-V binary in unsigned int32 format
     */
-    static std::vector<uint32_t> compile_source(
+    static std::vector<uint32_t> compileSource(
      const std::string& source,
      const std::string& entryPoint = "main",
      std::vector<std::pair<std::string, std::string>> definitions = {},
@ -818,6 +818,14 @@ class Tensor
        eHost = 1,    ///< Type is host memory, source and destination
        eStorage = 2, ///< Type is Device memory (only)
    };
+    /// Element data type stored in the tensor's memory.
+    enum class TensorDataTypes
+    {
+        eBool = 0,        ///< Elements are bool
+        eInt = 1,         ///< Elements are signed 32-bit integers (int32)
+        eUnsignedInt = 2, ///< Elements are unsigned 32-bit integers (uint32)
+        eFloat = 3,       ///< Elements are single precision floats
+        eDouble = 4,      ///< Elements are double precision floats
+    };

    /**
     *  Constructor with data provided which would be used to create the
@ -831,14 +839,17 @@ class Tensor
     */
    Tensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
-           const std::vector<float>& data,
+           void* data,
+           uint32_t elementTotalCount,
+           uint32_t elementMemorySize,
+           const TensorDataTypes& dataType,
           const TensorTypes& tensorType = TensorTypes::eDevice);

    /**
     * Destructor which is in charge of freeing vulkan resources unless they
     * have been provided externally.
     */
-    ~Tensor();
+    virtual ~Tensor();

    /**
     * Function to trigger reinitialisation of the tensor buffer and memory with
@ -847,8 +858,9 @@ class Tensor
     * @param data Vector of data to use to initialise vector from
     * @param tensorType The type to use for the tensor
     */
-    void rebuild(const std::vector<float>& data,
-                 TensorTypes tensorType = TensorTypes::eDevice);
+    void rebuild(void* data,
+                 uint32_t elementTotalCount,
+                 uint32_t elementMemorySize);

    /**
     * Destroys and frees the GPU resources which include the buffer and memory.
@ -862,32 +874,6 @@ class Tensor
     */
    bool isInit();

-    /**
-     * Returns the vector of data currently contained by the Tensor. It is
-     * important to ensure that there is no out-of-sync data with the GPU
-     * memory.
-     *
-     * @return Reference to vector of elements representing the data in the
-     * tensor.
-     */
-    std::vector<float>& data();
-    /**
-     * Overrides the subscript operator to expose the underlying data's
-     * subscript operator which in this case would be its underlying
-     * vector's.
-     *
-     * @param i The index where the element will be returned from.
-     * @return Returns the element in the position requested.
-     */
-    float& operator[](int index);
-    /**
-     * Returns the size/magnitude of the Tensor, which will be the total number
-     * of elements across all dimensions
-     *
-     * @return Unsigned integer representing the total number of elements
-     */
-    uint32_t size();
-
    /**
     * Retrieve the tensor type of the Tensor
     *
@ -895,12 +881,6 @@ class Tensor
     */
    TensorTypes tensorType();

-    /**
-     * Sets / resets the vector data of the tensor. This function does not
-     * perform any copies into GPU memory and is only performed on the host.
-     */
-    void setData(const std::vector<float>& data);
-
    /**
     * Records a copy from the memory of the tensor provided to the current
     * tensor. This is intended to pass memory into a processing, to perform
@ -963,18 +943,118 @@ class Tensor
     * @return Descriptor buffer info with own buffer
     */
    vk::DescriptorBufferInfo constructDescriptorBufferInfo();
+
    /**
-     * Maps data from the Host Visible GPU memory into the data vector. It
-     * requires the Tensor to be of staging type for it to work.
+     * Returns the size/magnitude of the Tensor, which will be the total number
+     * of elements across all dimensions
+     *
+     * @return Unsigned integer representing the total number of elements
     */
-    void mapDataFromHostMemory();
+    // TODO: move to cpp
+    uint32_t size() {
+        return this->mSize;
+    }
+
+    // TODO: move to cpp
+    uint32_t dataTypeMemorySize() {
+        return this->mDataTypeMemorySize;
+    }
+
+    // TODO: move to cpp
+    uint32_t memorySize() {
+        return this->mSize * this->mDataTypeMemorySize;
+    }
+
    /**
-     * Maps data from the data vector into the Host Visible GPU memory. It
-     * requires the tensor to be of staging type for it to work.
+     * Retrieve the underlying data type of the Tensor
+     *
+     * @return Data type of tensor of type kp::Tensor::TensorDataTypes
     */
-    void mapDataIntoHostMemory();
+    TensorDataTypes dataType() {
+        return this->mDataType;
+    }
+
+    void* rawData() {
+        return this->mRawData;
+    }
+
+    // TODO: move to cpp
+    template <typename T>
+    T* data() {
+        return (T*)this->mRawData;
+    }
+
+    template <typename T>
+    std::vector<T> vector() {
+        return { (T*)this->mRawData, ((T*)this->mRawData) + this->size() };
+    }
+
+    /**
+     * Sets / resets the vector data of the tensor. This function does not
+     * perform any copies into GPU memory and is only performed on the host.
+     */
+    void setRawData(const void* data) 
+    {
+        // Copy data 
+        memcpy(this->mRawData, data, this->memorySize());
+    }
+
+  protected:
+    // -------------- ALWAYS OWNED RESOURCES
+    TensorTypes mTensorType;
+    TensorDataTypes mDataType;
+    uint32_t mSize;
+    uint32_t mDataTypeMemorySize;
+    void* mRawData;

  private:
+    void mapRawData() {
+
+        KP_LOG_DEBUG("Kompute Tensor mapping data from host buffer");
+
+        std::shared_ptr<vk::DeviceMemory> hostVisibleMemory = nullptr;
+
+        if (this->mTensorType == TensorTypes::eHost) {
+            hostVisibleMemory = this->mPrimaryMemory;
+        } else if (this->mTensorType == TensorTypes::eDevice) {
+            hostVisibleMemory = this->mStagingMemory;
+        } else {
+            KP_LOG_WARN(
+              "Kompute Tensor mapping data not supported on storage tensor");
+            return;
+        }
+
+        vk::DeviceSize bufferSize = this->memorySize();
+
+        // Given we request coherent host memory we don't need to invalidate / flush
+        this->mRawData = this->mDevice->mapMemory(
+          *hostVisibleMemory, 0, bufferSize, vk::MemoryMapFlags());
+
+        vk::MappedMemoryRange mappedMemoryRange(*hostVisibleMemory, 0, bufferSize);
+    }
+
+    void unmapRawData() {
+
+        KP_LOG_DEBUG("Kompute Tensor mapping data from host buffer");
+
+        std::shared_ptr<vk::DeviceMemory> hostVisibleMemory = nullptr;
+
+        if (this->mTensorType == TensorTypes::eHost) {
+            hostVisibleMemory = this->mPrimaryMemory;
+        } else if (this->mTensorType == TensorTypes::eDevice) {
+            hostVisibleMemory = this->mStagingMemory;
+        } else {
+            KP_LOG_WARN(
+              "Kompute Tensor mapping data not supported on storage tensor");
+            return;
+        }
+
+        vk::DeviceSize bufferSize = this->memorySize();
+        vk::MappedMemoryRange mappedRange(*hostVisibleMemory, 0, bufferSize);
+        this->mDevice->flushMappedMemoryRanges(1, &mappedRange);
+        this->mDevice->unmapMemory(*hostVisibleMemory);
+    }
+
    // -------------- NEVER OWNED RESOURCES
    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice;
    std::shared_ptr<vk::Device> mDevice;
@ -989,11 +1069,6 @@ class Tensor
    std::shared_ptr<vk::DeviceMemory> mStagingMemory;
    bool mFreeStagingMemory = false;

-    // -------------- ALWAYS OWNED RESOURCES
-    std::vector<float> mData;
-
-    TensorTypes mTensorType = TensorTypes::eDevice;
-
    void allocateMemoryCreateGPUResources(); // Creates the vulkan buffer
    void createBuffer(std::shared_ptr<vk::Buffer> buffer,
                      vk::BufferUsageFlags bufferUsageFlags);
@ -1012,7 +1087,60 @@ class Tensor
    vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags();
    vk::BufferUsageFlags getStagingBufferUsageFlags();
    vk::MemoryPropertyFlags getStagingMemoryPropertyFlags();
-    uint64_t memorySize();
+
+};
+
+// TODO: Limit T to be only float, bool, double, etc
+template <typename T>
+class TensorT: public Tensor
+{
+
+  public:
+    TensorT(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+           std::shared_ptr<vk::Device> device,
+           const std::vector<T>& data,
+           const TensorTypes& tensorType = TensorTypes::eDevice)
+        : Tensor(physicalDevice,
+                 device,
+                 (void*)data.data(),
+                 data.size(),
+                 sizeof(T),
+                 this->dataType(),
+                 tensorType)
+    {
+        KP_LOG_DEBUG("Kompute TensorT constructor with data size {}", data.size());
+    }
+
+    ~TensorT() {
+        KP_LOG_DEBUG("Kompute TensorT destructor");
+    }
+
+    T* data() {
+        return (T*)this->mRawData;
+    }
+
+    std::vector<T> vector() {
+        return { (T*)this->mRawData, ((T*)this->mRawData) + this->size() };
+    }
+
+    T& operator[](int index) {
+        return *(((T*)this->mRawData) + index);
+    }
+
+    void setData(const std::vector<T>& data) {
+
+        KP_LOG_DEBUG("Kompute TensorT setting data with data size {}", data.size());
+
+        if (data.size() != this->mSize) {
+            throw std::runtime_error(
+              "Kompute TensorT Cannot set data of different sizes");
+        }
+
+        Tensor::setRawData(data.data());
+    }
+
+    TensorDataTypes dataType();
+
 };

 } // End namespace kp
@ -1873,7 +2001,7 @@ class Manager
     * If zero (default), disables latching of timestamps.
     * @returns Shared pointer with initialised sequence
     */
-    std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0, uint32_t nrOfTimestamps = 0);
+    std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0, uint32_t totalTimestamps = 0);

    /**
     * Create a managed tensor that will be destroyed by this manager
@ -1883,9 +2011,46 @@ class Manager
     * @param tensorType The type of tensor to initialize
     * @returns Shared pointer with initialised tensor
     */
-    std::shared_ptr<Tensor> tensor(
+    template <typename T>
+    std::shared_ptr<TensorT<T>> tensorT(
+      const std::vector<T>& data,
+      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
+    {
+        KP_LOG_DEBUG("Kompute Manager tensor creation triggered");
+
+        std::shared_ptr<TensorT<T>> tensor{ new kp::TensorT<T>(
+          this->mPhysicalDevice, this->mDevice, data, tensorType) };
+
+        if (this->mManageResources) {
+            this->mManagedTensors.push_back(tensor);
+        }
+
+        return tensor;
+    }
+
+    std::shared_ptr<TensorT<float>> tensor(
      const std::vector<float>& data,
-      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice);
+      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
+    {
+        return this->tensorT<float>(data, tensorType);
+    }
+
+    std::shared_ptr<Tensor> tensor(
+      void* data,
+      uint32_t elementTotalCount,
+      uint32_t elementMemorySize,
+      const Tensor::TensorDataTypes& dataType,
+      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
+    {
+        std::shared_ptr<Tensor> tensor{ new kp::Tensor(
+          this->mPhysicalDevice, this->mDevice, data, elementTotalCount, elementMemorySize, dataType, tensorType) };
+
+        if (this->mManageResources) {
+            this->mManagedTensors.push_back(tensor);
+        }
+
+        return tensor;
+    }

    /**
     * Create a managed algorithm that will be destroyed by this manager
--- a/src/Manager.cpp
+++ b/src/Manager.cpp
@ -395,21 +395,6 @@ Manager::createDevice(const std::vector<uint32_t>& familyQueueIndices,
    KP_LOG_DEBUG("Kompute Manager compute queue obtained");
 }

-std::shared_ptr<Tensor>
-Manager::tensor(const std::vector<float>& data, Tensor::TensorTypes tensorType)
-{
-    KP_LOG_DEBUG("Kompute Manager tensor creation triggered");
-
-    std::shared_ptr<Tensor> tensor{ new kp::Tensor(
-      this->mPhysicalDevice, this->mDevice, data, tensorType) };
-
-    if (this->mManageResources) {
-        this->mManagedTensors.push_back(tensor);
-    }
-
-    return tensor;
-}
-
 std::shared_ptr<Algorithm>
 Manager::algorithm(const std::vector<std::shared_ptr<Tensor>>& tensors,
                   const std::vector<uint32_t>& spirv,
--- a/src/OpTensorCopy.cpp
+++ b/src/OpTensorCopy.cpp
@ -13,6 +13,20 @@ OpTensorCopy::OpTensorCopy(const std::vector<std::shared_ptr<Tensor>>& tensors)
        throw std::runtime_error(
          "Kompute OpTensorCopy called with less than 2 tensor");
    }
+
+    kp::Tensor::TensorDataTypes dataType = this->mTensors[0]->dataType();
+    uint32_t size = this->mTensors[0]->size();
+    for (const std::shared_ptr<Tensor>& tensor : tensors) {
+        if (tensor->dataType() != dataType) {
+            throw std::runtime_error(fmt::format("Attempting to copy tensors of different types from {} to {}",
+                        dataType, tensor->dataType()));
+        }
+        if (tensor->size() != size) {
+            throw std::runtime_error(fmt::format("Attempting to copy tensors of different sizes from {} to {}",
+                        size, tensor->size()));
+
+        }
+    }
 }

 OpTensorCopy::~OpTensorCopy()
@ -43,9 +57,15 @@ OpTensorCopy::postEval(const vk::CommandBuffer& commandBuffer)
 {
    KP_LOG_DEBUG("Kompute OpTensorCopy postEval called");

+    // TODO: Simplify with a copyRawData
+    uint32_t size = this->mTensors[0]->size();
+    uint32_t dataTypeMemSize = this->mTensors[0]->dataTypeMemorySize();
+    uint32_t memSize = size * dataTypeMemSize;
+    void* data = this->mTensors[0]->rawData();
+
    // Copy the data from the first tensor into all the tensors
    for (size_t i = 1; i < this->mTensors.size(); i++) {
-        this->mTensors[i]->setData(this->mTensors[0]->data());
+        this->mTensors[i]->setRawData(data);
    }
 }

--- a/src/OpTensorSyncDevice.cpp
+++ b/src/OpTensorSyncDevice.cpp
@ -41,12 +41,6 @@ OpTensorSyncDevice::preEval(const vk::CommandBuffer& commandBuffer)
 {
    KP_LOG_DEBUG("Kompute OpTensorSyncDevice preEval called");

-    // Performing sync of data as eval can be called multiple times with same op
-    for (size_t i = 0; i < this->mTensors.size(); i++) {
-        if (this->mTensors[i]->tensorType() != Tensor::TensorTypes::eStorage) {
-            this->mTensors[i]->mapDataIntoHostMemory();
-        }
-    }
 }

 void
--- a/src/OpTensorSyncLocal.cpp
+++ b/src/OpTensorSyncLocal.cpp
@ -48,11 +48,6 @@ OpTensorSyncLocal::postEval(const vk::CommandBuffer& commandBuffer)
    KP_LOG_DEBUG("Kompute OpTensorSyncLocal postEval called");

    KP_LOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local");
-    for (size_t i = 0; i < this->mTensors.size(); i++) {
-        if (this->mTensors[i]->tensorType() != Tensor::TensorTypes::eStorage) {
-            this->mTensors[i]->mapDataFromHostMemory();
-        }
-    }
 }

 }
--- a/src/Shader.cpp
+++ b/src/Shader.cpp
@ -5,7 +5,7 @@
 namespace kp {

 std::vector<uint32_t>
-Shader::compile_sources(
+Shader::compileSources(
  const std::vector<std::string>& sources,
  const std::vector<std::string>& files,
  const std::string& entryPoint,
@ -92,13 +92,13 @@ Shader::compile_sources(
 }

 std::vector<uint32_t>
-Shader::compile_source(
+Shader::compileSource(
  const std::string& source,
  const std::string& entryPoint,
  std::vector<std::pair<std::string, std::string>> definitions,
  const TBuiltInResource& resource)
 {
-    return compile_sources({ source },
+    return compileSources({ source },
                           std::vector<std::string>({}),
                           entryPoint,
                           definitions,
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@ -5,17 +5,22 @@ namespace kp {

 Tensor::Tensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
               std::shared_ptr<vk::Device> device,
-               const std::vector<float>& data,
+               void* data,
+               uint32_t elementTotalCount,
+               uint32_t elementMemorySize,
+               const TensorDataTypes& dataType,
               const TensorTypes& tensorType)
 {
    KP_LOG_DEBUG("Kompute Tensor constructor data length: {}, and type: {}",
-                 data.size(),
+                 elementTotalCount,
                 tensorType);

    this->mPhysicalDevice = physicalDevice;
    this->mDevice = device;
+    this->mDataType = dataType;
+    this->mTensorType = tensorType;

-    this->rebuild(data, tensorType);
+    this->rebuild(data, elementTotalCount, elementMemorySize);
 }

 Tensor::~Tensor()
@ -29,12 +34,14 @@ Tensor::~Tensor()
 }

 void
-Tensor::rebuild(const std::vector<float>& data, TensorTypes tensorType)
+Tensor::rebuild(void* data,
+                uint32_t elementTotalCount,
+                uint32_t elementMemorySize)
 {
-    KP_LOG_DEBUG("Kompute Tensor rebuilding with size {}", data.size());
+    KP_LOG_DEBUG("Kompute Tensor rebuilding with size {}", elementTotalCount);

-    this->mData = data;
-    this->mTensorType = tensorType;
+    this->mSize = elementTotalCount;
+    this->mDataTypeMemorySize = elementMemorySize;

    if (this->mPrimaryBuffer || this->mPrimaryMemory) {
        KP_LOG_DEBUG(
@ -43,30 +50,9 @@ Tensor::rebuild(const std::vector<float>& data, TensorTypes tensorType)
    }

    this->allocateMemoryCreateGPUResources();
-}
+    this->mapRawData();

-std::vector<float>&
-Tensor::data()
-{
-    return this->mData;
-}
-
-float&
-Tensor::operator[](int index)
-{
-    return this->mData[index];
-}
-
-uint64_t
-Tensor::memorySize()
-{
-    return this->size() * sizeof(float);
-}
-
-uint32_t
-Tensor::size()
-{
-    return static_cast<uint32_t>(this->mData.size());
+    memcpy(this->mRawData, data, this->memorySize());
 }

 Tensor::TensorTypes
@ -78,18 +64,12 @@ Tensor::tensorType()
 bool
 Tensor::isInit()
 {
-    return this->mDevice && this->mPrimaryBuffer && this->mPrimaryMemory;
+    return this->mDevice
+        && this->mPrimaryBuffer
+        && this->mPrimaryMemory
+        && this->mRawData;
 }

-void
-Tensor::setData(const std::vector<float>& data)
-{
-    if (data.size() != this->mData.size()) {
-        throw std::runtime_error(
-          "Kompute Tensor Cannot set data of different sizes");
-    }
-    this->mData = data;
-}

 void
 Tensor::recordCopyFrom(const vk::CommandBuffer& commandBuffer,
@ -195,66 +175,13 @@ Tensor::recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer,
 vk::DescriptorBufferInfo
 Tensor::constructDescriptorBufferInfo()
 {
+    KP_LOG_DEBUG("Kompute Tensor construct descriptor buffer info size {}", this->memorySize());
    vk::DeviceSize bufferSize = this->memorySize();
    return vk::DescriptorBufferInfo(*this->mPrimaryBuffer,
                                    0, // offset
                                    bufferSize);
 }

-void
-Tensor::mapDataFromHostMemory()
-{
-    KP_LOG_DEBUG("Kompute Tensor mapping data from host buffer");
-
-    std::shared_ptr<vk::DeviceMemory> hostVisibleMemory = nullptr;
-
-    if (this->mTensorType == TensorTypes::eHost) {
-        hostVisibleMemory = this->mPrimaryMemory;
-    } else if (this->mTensorType == TensorTypes::eDevice) {
-        hostVisibleMemory = this->mStagingMemory;
-    } else {
-        KP_LOG_WARN(
-          "Kompute Tensor mapping data not supported on storage tensor");
-        return;
-    }
-
-    vk::DeviceSize bufferSize = this->memorySize();
-    void* mapped = this->mDevice->mapMemory(
-      *hostVisibleMemory, 0, bufferSize, vk::MemoryMapFlags());
-    vk::MappedMemoryRange mappedMemoryRange(*hostVisibleMemory, 0, bufferSize);
-    this->mDevice->invalidateMappedMemoryRanges(mappedMemoryRange);
-    memcpy(this->mData.data(), mapped, bufferSize);
-    this->mDevice->unmapMemory(*hostVisibleMemory);
-}
-
-void
-Tensor::mapDataIntoHostMemory()
-{
-
-    KP_LOG_DEBUG("Kompute Tensor local mapping tensor data to host buffer");
-
-    std::shared_ptr<vk::DeviceMemory> hostVisibleMemory = nullptr;
-
-    if (this->mTensorType == TensorTypes::eHost) {
-        hostVisibleMemory = this->mPrimaryMemory;
-    } else if (this->mTensorType == TensorTypes::eDevice) {
-        hostVisibleMemory = this->mStagingMemory;
-    } else {
-        KP_LOG_WARN(
-          "Kompute Tensor mapping data not supported on storage tensor");
-        return;
-    }
-
-    vk::DeviceSize bufferSize = this->memorySize();
-
-    void* mapped = this->mDevice->mapMemory(
-      *hostVisibleMemory, 0, bufferSize, vk::MemoryMapFlags());
-    memcpy(mapped, this->mData.data(), bufferSize);
-    vk::MappedMemoryRange mappedRange(*hostVisibleMemory, 0, bufferSize);
-    this->mDevice->flushMappedMemoryRanges(1, &mappedRange);
-    this->mDevice->unmapMemory(*hostVisibleMemory);
-}
-
 vk::BufferUsageFlags
 Tensor::getPrimaryBufferUsageFlags()
 {
@ -285,7 +212,8 @@ Tensor::getPrimaryMemoryPropertyFlags()
            return vk::MemoryPropertyFlagBits::eDeviceLocal;
            break;
        case TensorTypes::eHost:
-            return vk::MemoryPropertyFlagBits::eHostVisible;
+            return vk::MemoryPropertyFlagBits::eHostVisible |
+                vk::MemoryPropertyFlagBits::eHostCoherent;
            break;
        case TensorTypes::eStorage:
            return vk::MemoryPropertyFlagBits::eDeviceLocal;
@ -435,12 +363,20 @@ Tensor::destroy()
 {
    KP_LOG_DEBUG("Kompute Tensor started destroy()");

+    // Setting raw data to null regardless of whether the device is available, to invalidate the Tensor
+    this->mRawData = nullptr;
+    this->mSize = 0;
+    this->mDataTypeMemorySize = 0;
+
    if (!this->mDevice) {
        KP_LOG_WARN(
          "Kompute Tensor destructor reached with null Device pointer");
        return;
    }

+    // Unmap the current memory data
+    this->unmapRawData();
+
    if (this->mFreePrimaryBuffer) {
        if (!this->mPrimaryBuffer) {
            KP_LOG_WARN("Kompose Tensor expected to destroy primary buffer "
@ -504,4 +440,34 @@ Tensor::destroy()
    KP_LOG_DEBUG("Kompute Tensor successful destroy()");
 }

+template<>
+Tensor::TensorDataTypes
+TensorT<bool>::dataType() {
+    return Tensor::TensorDataTypes::eBool;
+}
+
+template<>
+Tensor::TensorDataTypes
+TensorT<int32_t>::dataType() {
+    return Tensor::TensorDataTypes::eInt;
+}
+
+template<>
+Tensor::TensorDataTypes
+TensorT<uint32_t>::dataType() {
+    return Tensor::TensorDataTypes::eUnsignedInt;
+}
+
+template<>
+Tensor::TensorDataTypes
+TensorT<float>::dataType() {
+    return Tensor::TensorDataTypes::eFloat;
+}
+
+template<>
+Tensor::TensorDataTypes
+TensorT<double>::dataType() {
+    return Tensor::TensorDataTypes::eDouble;
+}
+
 }
--- a/src/include/kompute/Manager.hpp
+++ b/src/include/kompute/Manager.hpp
@ -74,9 +74,46 @@ class Manager
     * @param tensorType The type of tensor to initialize
     * @returns Shared pointer with initialised tensor
     */
-    std::shared_ptr<Tensor> tensor(
+    template <typename T>
+    std::shared_ptr<TensorT<T>> tensorT(
+      const std::vector<T>& data,
+      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
+    {
+        KP_LOG_DEBUG("Kompute Manager tensor creation triggered");
+
+        std::shared_ptr<TensorT<T>> tensor{ new kp::TensorT<T>(
+          this->mPhysicalDevice, this->mDevice, data, tensorType) };
+
+        if (this->mManageResources) {
+            this->mManagedTensors.push_back(tensor);
+        }
+
+        return tensor;
+    }
+
+    std::shared_ptr<TensorT<float>> tensor(
      const std::vector<float>& data,
-      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice);
+      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
+    {
+        return this->tensorT<float>(data, tensorType);
+    }
+
+    std::shared_ptr<Tensor> tensor(
+      void* data,
+      uint32_t elementTotalCount,
+      uint32_t elementMemorySize,
+      const Tensor::TensorDataTypes& dataType,
+      Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
+    {
+        std::shared_ptr<Tensor> tensor{ new kp::Tensor(
+          this->mPhysicalDevice, this->mDevice, data, elementTotalCount, elementMemorySize, dataType, tensorType) };
+
+        if (this->mManageResources) {
+            this->mManagedTensors.push_back(tensor);
+        }
+
+        return tensor;
+    }

    /**
     * Create a managed algorithm that will be destroyed by this manager
--- a/src/include/kompute/Shader.hpp
+++ b/src/include/kompute/Shader.hpp
@ -39,7 +39,7 @@ class Shader
     * GLSL compiler
     * @return The compiled SPIR-V binary in unsigned int32 format
     */
-    static std::vector<uint32_t> compile_sources(
+    static std::vector<uint32_t> compileSources(
      const std::vector<std::string>& sources,
      const std::vector<std::string>& files = {},
      const std::string& entryPoint = "main",
@ -60,7 +60,7 @@ class Shader
     * GLSL compiler
     * @return The compiled SPIR-V binary in unsigned int32 format
     */
-    static std::vector<uint32_t> compile_source(
+    static std::vector<uint32_t> compileSource(
      const std::string& source,
      const std::string& entryPoint = "main",
      std::vector<std::pair<std::string, std::string>> definitions = {},
--- a/src/include/kompute/Tensor.hpp
+++ b/src/include/kompute/Tensor.hpp
@ -27,6 +27,14 @@ class Tensor
        eHost = 1,    ///< Type is host memory, source and destination
        eStorage = 2, ///< Type is Device memory (only)
    };
+    enum class TensorDataTypes
+    {
+        eBool = 0,
+        eInt = 1,
+        eUnsignedInt = 2,
+        eFloat = 3,
+        eDouble = 4,
+    };

    /**
     *  Constructor with data provided which would be used to create the
@ -40,14 +48,17 @@ class Tensor
     */
    Tensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
-           const std::vector<float>& data,
+           void* data,
+           uint32_t elementTotalCount,
+           uint32_t elementMemorySize,
+           const TensorDataTypes& dataType,
           const TensorTypes& tensorType = TensorTypes::eDevice);

    /**
     * Destructor which is in charge of freeing vulkan resources unless they
     * have been provided externally.
     */
-    ~Tensor();
+    virtual ~Tensor();

    /**
     * Function to trigger reinitialisation of the tensor buffer and memory with
@ -56,8 +67,9 @@ class Tensor
     * @param data Vector of data to use to initialise vector from
     * @param tensorType The type to use for the tensor
     */
-    void rebuild(const std::vector<float>& data,
-                 TensorTypes tensorType = TensorTypes::eDevice);
+    void rebuild(void* data,
+                 uint32_t elementTotalCount,
+                 uint32_t elementMemorySize);

    /**
     * Destroys and frees the GPU resources which include the buffer and memory.
@ -71,32 +83,6 @@ class Tensor
     */
    bool isInit();

-    /**
-     * Returns the vector of data currently contained by the Tensor. It is
-     * important to ensure that there is no out-of-sync data with the GPU
-     * memory.
-     *
-     * @return Reference to vector of elements representing the data in the
-     * tensor.
-     */
-    std::vector<float>& data();
-    /**
-     * Overrides the subscript operator to expose the underlying data's
-     * subscript operator which in this case would be its underlying
-     * vector's.
-     *
-     * @param i The index where the element will be returned from.
-     * @return Returns the element in the position requested.
-     */
-    float& operator[](int index);
-    /**
-     * Returns the size/magnitude of the Tensor, which will be the total number
-     * of elements across all dimensions
-     *
-     * @return Unsigned integer representing the total number of elements
-     */
-    uint32_t size();
-
    /**
     * Retrieve the tensor type of the Tensor
     *
@ -104,12 +90,6 @@ class Tensor
     */
    TensorTypes tensorType();

-    /**
-     * Sets / resets the vector data of the tensor. This function does not
-     * perform any copies into GPU memory and is only performed on the host.
-     */
-    void setData(const std::vector<float>& data);
-
    /**
     * Records a copy from the memory of the tensor provided to the current
     * tensor. This is intended to pass memory into a processing, to perform
@ -172,18 +152,118 @@ class Tensor
     * @return Descriptor buffer info with own buffer
     */
    vk::DescriptorBufferInfo constructDescriptorBufferInfo();
+
    /**
-     * Maps data from the Host Visible GPU memory into the data vector. It
-     * requires the Tensor to be of staging type for it to work.
+     * Returns the size/magnitude of the Tensor, which will be the total number
+     * of elements across all dimensions
+     *
+     * @return Unsigned integer representing the total number of elements
     */
-    void mapDataFromHostMemory();
+    // TODO: move to cpp
+    uint32_t size() {
+        return this->mSize;
+    }
+
+    // TODO: move to cpp
+    uint32_t dataTypeMemorySize() {
+        return this->mDataTypeMemorySize;
+    }
+
+    // TODO: move to cpp
+    uint32_t memorySize() {
+        return this->mSize * this->mDataTypeMemorySize;
+    }
+
    /**
-     * Maps data from the data vector into the Host Visible GPU memory. It
-     * requires the tensor to be of staging type for it to work.
+     * Retrieve the underlying data type of the Tensor
+     *
+     * @return Data type of tensor of type kp::Tensor::TensorDataTypes
     */
-    void mapDataIntoHostMemory();
+    TensorDataTypes dataType() {
+        return this->mDataType;
+    }
+
+    void* rawData() {
+        return this->mRawData;
+    }
+
+    // TODO: move to cpp
+    template <typename T>
+    T* data() {
+        return (T*)this->mRawData;
+    }
+
+    template <typename T>
+    std::vector<T> vector() {
+        return { (T*)this->mRawData, ((T*)this->mRawData) + this->size() };
+    }
+
+    /**
+     * Sets / resets the vector data of the tensor. This function does not
+     * perform any copies into GPU memory and is only performed on the host.
+     */
+    void setRawData(const void* data) 
+    {
+        // Copy data 
+        memcpy(this->mRawData, data, this->memorySize());
+    }
+
+  protected:
+    // -------------- ALWAYS OWNED RESOURCES
+    TensorTypes mTensorType;
+    TensorDataTypes mDataType;
+    uint32_t mSize;
+    uint32_t mDataTypeMemorySize;
+    void* mRawData;

  private:
+    void mapRawData() {
+
+        KP_LOG_DEBUG("Kompute Tensor mapping data from host buffer");
+
+        std::shared_ptr<vk::DeviceMemory> hostVisibleMemory = nullptr;
+
+        if (this->mTensorType == TensorTypes::eHost) {
+            hostVisibleMemory = this->mPrimaryMemory;
+        } else if (this->mTensorType == TensorTypes::eDevice) {
+            hostVisibleMemory = this->mStagingMemory;
+        } else {
+            KP_LOG_WARN(
+              "Kompute Tensor mapping data not supported on storage tensor");
+            return;
+        }
+
+        vk::DeviceSize bufferSize = this->memorySize();
+
+        // Given we request coherent host memory we don't need to invalidate / flush
+        this->mRawData = this->mDevice->mapMemory(
+          *hostVisibleMemory, 0, bufferSize, vk::MemoryMapFlags());
+
+        vk::MappedMemoryRange mappedMemoryRange(*hostVisibleMemory, 0, bufferSize);
+    }
+
+    void unmapRawData() {
+
+        KP_LOG_DEBUG("Kompute Tensor mapping data from host buffer");
+
+        std::shared_ptr<vk::DeviceMemory> hostVisibleMemory = nullptr;
+
+        if (this->mTensorType == TensorTypes::eHost) {
+            hostVisibleMemory = this->mPrimaryMemory;
+        } else if (this->mTensorType == TensorTypes::eDevice) {
+            hostVisibleMemory = this->mStagingMemory;
+        } else {
+            KP_LOG_WARN(
+              "Kompute Tensor mapping data not supported on storage tensor");
+            return;
+        }
+
+        vk::DeviceSize bufferSize = this->memorySize();
+        vk::MappedMemoryRange mappedRange(*hostVisibleMemory, 0, bufferSize);
+        this->mDevice->flushMappedMemoryRanges(1, &mappedRange);
+        this->mDevice->unmapMemory(*hostVisibleMemory);
+    }
+
    // -------------- NEVER OWNED RESOURCES
    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice;
    std::shared_ptr<vk::Device> mDevice;
@ -198,11 +278,6 @@ class Tensor
    std::shared_ptr<vk::DeviceMemory> mStagingMemory;
    bool mFreeStagingMemory = false;

-    // -------------- ALWAYS OWNED RESOURCES
-    std::vector<float> mData;
-
-    TensorTypes mTensorType = TensorTypes::eDevice;
-
    void allocateMemoryCreateGPUResources(); // Creates the vulkan buffer
    void createBuffer(std::shared_ptr<vk::Buffer> buffer,
                      vk::BufferUsageFlags bufferUsageFlags);
@ -221,7 +296,60 @@ class Tensor
    vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags();
    vk::BufferUsageFlags getStagingBufferUsageFlags();
    vk::MemoryPropertyFlags getStagingMemoryPropertyFlags();
-    uint64_t memorySize();
+
+};
+
+/**
+ * Typed wrapper over Tensor that exposes the tensor's raw mapped memory as
+ * elements of type T.
+ *
+ * All storage is handled by the Tensor base class; this class only adds
+ * typed accessors on top of the untyped raw data pointer.
+ *
+ * TODO: Limit T to be only float, bool, double, etc
+ */
+template <typename T>
+class TensorT: public Tensor
+{
+
+  public:
+    /**
+     * Builds the underlying Tensor from a vector of T.
+     *
+     * Forwards to the Tensor constructor the pointer to the vector's
+     * elements, the element count, sizeof(T), the data type reported by
+     * dataType() and the tensor type.
+     *
+     * @param physicalDevice Vulkan physical device forwarded to Tensor
+     * @param device Vulkan logical device forwarded to Tensor
+     * @param data Elements used to initialise the tensor
+     * @param tensorType Kind of tensor to create; defaults to eDevice
+     */
+    TensorT(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
+           std::shared_ptr<vk::Device> device,
+           const std::vector<T>& data,
+           const TensorTypes& tensorType = TensorTypes::eDevice)
+        : Tensor(physicalDevice,
+                 device,
+                 // The base constructor takes a non-const void*; the named
+                 // casts make the dropped const explicit.
+                 static_cast<void*>(const_cast<T*>(data.data())),
+                 data.size(),
+                 sizeof(T),
+                 // NOTE(review): dataType() is a non-static member invoked
+                 // before the Tensor base is initialised - confirm its
+                 // definitions never touch object state, or make it static.
+                 this->dataType(),
+                 tensorType)
+    {
+        KP_LOG_DEBUG("Kompute TensorT constructor with data size {}", data.size());
+    }
+
+    /**
+     * Logs destruction; no resources are released here.
+     */
+    ~TensorT() {
+        KP_LOG_DEBUG("Kompute TensorT destructor");
+    }
+
+    /**
+     * @return The tensor's raw data pointer viewed as a pointer to T.
+     */
+    T* data() {
+        return static_cast<T*>(this->mRawData);
+    }
+
+    /**
+     * Copies the tensor contents into a new vector.
+     *
+     * @return A vector holding size() elements read from the raw data
+     * pointer.
+     */
+    std::vector<T> vector() {
+        T* first = static_cast<T*>(this->mRawData);
+        // Parenthesised construction always selects the iterator-range
+        // constructor; a braced list of two pointers could be taken as an
+        // initializer_list of elements when T is bool.
+        return std::vector<T>(first, first + this->size());
+    }
+
+    /**
+     * Unchecked element access into the raw data.
+     *
+     * @param index Position of the element; no bounds check is performed
+     * @return Reference to the element at the given position
+     */
+    T& operator[](int index) {
+        return static_cast<T*>(this->mRawData)[index];
+    }
+
+    /**
+     * Replaces the tensor contents with the provided elements.
+     *
+     * @param data New elements; must hold exactly as many elements as the
+     * tensor already has
+     * @throws std::runtime_error if the element count differs from the
+     * tensor's current size
+     */
+    void setData(const std::vector<T>& data) {
+
+        KP_LOG_DEBUG("Kompute TensorT setting data with data size {}", data.size());
+
+        if (data.size() != this->mSize) {
+            throw std::runtime_error(
+              "Kompute TensorT Cannot set data of different sizes");
+        }
+
+        Tensor::setRawData(data.data());
+    }
+
+    /**
+     * @return The TensorDataTypes value associated with T. Only declared
+     * here; the definitions live outside this class body.
+     */
+    TensorDataTypes dataType();
+
 };

 } // End namespace kp
--- a/test/TestAsyncOperations.cpp
+++ b/test/TestAsyncOperations.cpp
@ -37,7 +37,7 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
        }
    )");

-    std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
+    std::vector<uint32_t> spirv = kp::Shader::compileSource(shader);

    std::vector<float> data(size, 0.0);
    std::vector<float> resultSync(size, 100000000);
@ -73,7 +73,7 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
    sq->eval<kp::OpTensorSyncLocal>(inputsSyncB);

    for (uint32_t i = 0; i < numParallel; i++) {
-        EXPECT_EQ(inputsSyncB[i]->data(), resultSync);
+        EXPECT_EQ(inputsSyncB[i]->vector<float>(), resultSync);
    }

    kp::Manager mgrAsync(0, { 0, 2 });
@ -111,7 +111,7 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
    sq->eval<kp::OpTensorSyncLocal>({ inputsAsyncB });

    for (uint32_t i = 0; i < numParallel; i++) {
-        EXPECT_EQ(inputsAsyncB[i]->data(), resultAsync);
+        EXPECT_EQ((inputsAsyncB[i]->vector<float>()), resultAsync);
    }

    // The speedup should be at least 40%
@ -145,15 +145,15 @@ TEST(TestAsyncOperations, TestManagerAsyncExecution)
        }
    )");

-    std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
+    std::vector<uint32_t> spirv = kp::Shader::compileSource(shader);

    std::vector<float> data(size, 0.0);
    std::vector<float> resultAsync(size, 100000000);

    kp::Manager mgr;

-    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor(data);
-    std::shared_ptr<kp::Tensor> tensorB = mgr.tensor(data);
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor(data);
+    std::shared_ptr<kp::TensorT<float>> tensorB = mgr.tensor(data);

    std::shared_ptr<kp::Sequence> sq1 = mgr.sequence();
    std::shared_ptr<kp::Sequence> sq2 = mgr.sequence();
@ -172,6 +172,6 @@ TEST(TestAsyncOperations, TestManagerAsyncExecution)
    sq1->evalAsync<kp::OpTensorSyncLocal>({ tensorA, tensorB });
    sq1->evalAwait();

-    EXPECT_EQ(tensorA->data(), resultAsync);
-    EXPECT_EQ(tensorB->data(), resultAsync);
+    EXPECT_EQ(tensorA->vector(), resultAsync);
+    EXPECT_EQ(tensorB->vector(), resultAsync);
 }
--- a/test/TestDestroy.cpp
+++ b/test/TestDestroy.cpp
@ -5,9 +5,9 @@

 TEST(TestDestroy, TestDestroyTensorSingle)
 {
-    std::shared_ptr<kp::Tensor> tensorA = nullptr;
+    std::shared_ptr<kp::TensorT<float>> tensorA = nullptr;

-    std::string shader(R"(
+        std::string shader(R"(
      #version 450
      layout (local_size_x = 1) in;
      layout(set = 0, binding = 0) buffer a { float pa[]; };
@ -16,7 +16,7 @@ TEST(TestDestroy, TestDestroyTensorSingle)
          pa[index] = pa[index] + 1;
      })");

-    std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
+    std::vector<uint32_t> spirv = kp::Shader::compileSource(shader);

    {
        std::shared_ptr<kp::Sequence> sq = nullptr;
@ -34,18 +34,19 @@ TEST(TestDestroy, TestDestroyTensorSingle)
              ->eval()
              ->eval<kp::OpTensorSyncLocal>(algo->getTensors());

+            EXPECT_EQ(tensorA->vector(), std::vector<float>({ 1, 1, 1 }));
+
            tensorA->destroy();
            EXPECT_FALSE(tensorA->isInit());
        }
        EXPECT_FALSE(tensorA->isInit());
    }
-    EXPECT_EQ(tensorA->data(), std::vector<float>({ 1, 1, 1 }));
 }

 TEST(TestDestroy, TestDestroyTensorVector)
 {
-    std::shared_ptr<kp::Tensor> tensorA = nullptr;
-    std::shared_ptr<kp::Tensor> tensorB = nullptr;
+    std::shared_ptr<kp::TensorT<float>> tensorA = nullptr;
+    std::shared_ptr<kp::TensorT<float>> tensorB = nullptr;

    std::string shader(R"(
      #version 450
@ -57,7 +58,7 @@ TEST(TestDestroy, TestDestroyTensorVector)
          pa[index] = pa[index] + 1;
          pb[index] = pb[index] + 2;
      })");
-    std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
+    std::vector<uint32_t> spirv = kp::Shader::compileSource(shader);

    {
        std::shared_ptr<kp::Sequence> sq = nullptr;
@ -77,6 +78,9 @@ TEST(TestDestroy, TestDestroyTensorVector)
              ->record<kp::OpTensorSyncLocal>(algo->getTensors())
              ->eval();

+            EXPECT_EQ(tensorA->vector(), std::vector<float>({ 2, 2, 2 }));
+            EXPECT_EQ(tensorB->vector(), std::vector<float>({ 3, 3, 3 }));
+
            tensorA->destroy();
            tensorB->destroy();

@ -84,13 +88,11 @@ TEST(TestDestroy, TestDestroyTensorVector)
            EXPECT_FALSE(tensorB->isInit());
        }
    }
-    EXPECT_EQ(tensorA->data(), std::vector<float>({ 2, 2, 2 }));
-    EXPECT_EQ(tensorB->data(), std::vector<float>({ 3, 3, 3 }));
 }

 TEST(TestDestroy, TestDestroySequenceSingle)
 {
-    std::shared_ptr<kp::Tensor> tensorA = nullptr;
+    std::shared_ptr<kp::TensorT<float>> tensorA = nullptr;

    std::string shader(R"(
      #version 450
@ -101,7 +103,7 @@ TEST(TestDestroy, TestDestroySequenceSingle)
          pa[index] = pa[index] + 1;
      })");

-    std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
+    std::vector<uint32_t> spirv = kp::Shader::compileSource(shader);

    {
        std::shared_ptr<kp::Sequence> sq = nullptr;
@ -121,7 +123,8 @@ TEST(TestDestroy, TestDestroySequenceSingle)
            sq->destroy();

            EXPECT_FALSE(sq->isInit());
+
+            EXPECT_EQ(tensorA->vector(), std::vector<float>({ 1, 1, 1 }));
        }
    }
-    EXPECT_EQ(tensorA->data(), std::vector<float>({ 1, 1, 1 }));
 }
--- a/test/TestLogisticRegression.cpp
+++ b/test/TestLogisticRegression.cpp
@ -14,19 +14,19 @@ TEST(TestLogisticRegression, TestMainLogisticRegression)
    {
        kp::Manager mgr;

-        std::shared_ptr<kp::Tensor> xI = mgr.tensor({ 0, 1, 1, 1, 1 });
-        std::shared_ptr<kp::Tensor> xJ = mgr.tensor({ 0, 0, 0, 1, 1 });
+        std::shared_ptr<kp::TensorT<float>> xI = mgr.tensor({ 0, 1, 1, 1, 1 });
+        std::shared_ptr<kp::TensorT<float>> xJ = mgr.tensor({ 0, 0, 0, 1, 1 });

-        std::shared_ptr<kp::Tensor> y = mgr.tensor({ 0, 0, 0, 1, 1 });
+        std::shared_ptr<kp::TensorT<float>> y = mgr.tensor({ 0, 0, 0, 1, 1 });

-        std::shared_ptr<kp::Tensor> wIn = mgr.tensor({ 0.001, 0.001 });
-        std::shared_ptr<kp::Tensor> wOutI = mgr.tensor({ 0, 0, 0, 0, 0 });
-        std::shared_ptr<kp::Tensor> wOutJ = mgr.tensor({ 0, 0, 0, 0, 0 });
+        std::shared_ptr<kp::TensorT<float>> wIn = mgr.tensor({ 0.001, 0.001 });
+        std::shared_ptr<kp::TensorT<float>> wOutI = mgr.tensor({ 0, 0, 0, 0, 0 });
+        std::shared_ptr<kp::TensorT<float>> wOutJ = mgr.tensor({ 0, 0, 0, 0, 0 });

-        std::shared_ptr<kp::Tensor> bIn = mgr.tensor({ 0 });
-        std::shared_ptr<kp::Tensor> bOut = mgr.tensor({ 0, 0, 0, 0, 0 });
+        std::shared_ptr<kp::TensorT<float>> bIn = mgr.tensor({ 0 });
+        std::shared_ptr<kp::TensorT<float>> bOut = mgr.tensor({ 0, 0, 0, 0, 0 });

-        std::shared_ptr<kp::Tensor> lOut = mgr.tensor({ 0, 0, 0, 0, 0 });
+        std::shared_ptr<kp::TensorT<float>> lOut = mgr.tensor({ 0, 0, 0, 0, 0 });

        std::vector<std::shared_ptr<kp::Tensor>> params = { xI,  xJ,    y,
                                                            wIn, wOutI, wOutJ,
@ -88,21 +88,21 @@ TEST(TestLogisticRegression, TestMainLogisticRegressionManualCopy)
    {
        kp::Manager mgr;

-        std::shared_ptr<kp::Tensor> xI = mgr.tensor({ 0, 1, 1, 1, 1 });
-        std::shared_ptr<kp::Tensor> xJ = mgr.tensor({ 0, 0, 0, 1, 1 });
+        std::shared_ptr<kp::TensorT<float>> xI = mgr.tensor({ 0, 1, 1, 1, 1 });
+        std::shared_ptr<kp::TensorT<float>> xJ = mgr.tensor({ 0, 0, 0, 1, 1 });

-        std::shared_ptr<kp::Tensor> y = mgr.tensor({ 0, 0, 0, 1, 1 });
+        std::shared_ptr<kp::TensorT<float>> y = mgr.tensor({ 0, 0, 0, 1, 1 });

-        std::shared_ptr<kp::Tensor> wIn =
+        std::shared_ptr<kp::TensorT<float>> wIn =
          mgr.tensor({ 0.001, 0.001 }, kp::Tensor::TensorTypes::eHost);
-        std::shared_ptr<kp::Tensor> wOutI = mgr.tensor({ 0, 0, 0, 0, 0 });
-        std::shared_ptr<kp::Tensor> wOutJ = mgr.tensor({ 0, 0, 0, 0, 0 });
+        std::shared_ptr<kp::TensorT<float>> wOutI = mgr.tensor({ 0, 0, 0, 0, 0 });
+        std::shared_ptr<kp::TensorT<float>> wOutJ = mgr.tensor({ 0, 0, 0, 0, 0 });

-        std::shared_ptr<kp::Tensor> bIn =
+        std::shared_ptr<kp::TensorT<float>> bIn =
          mgr.tensor({ 0 }, kp::Tensor::TensorTypes::eHost);
-        std::shared_ptr<kp::Tensor> bOut = mgr.tensor({ 0, 0, 0, 0, 0 });
+        std::shared_ptr<kp::TensorT<float>> bOut = mgr.tensor({ 0, 0, 0, 0, 0 });

-        std::shared_ptr<kp::Tensor> lOut = mgr.tensor({ 0, 0, 0, 0, 0 });
+        std::shared_ptr<kp::TensorT<float>> lOut = mgr.tensor({ 0, 0, 0, 0, 0 });

        std::vector<std::shared_ptr<kp::Tensor>> params = { xI,  xJ,    y,
                                                            wIn, wOutI, wOutJ,
@ -136,8 +136,6 @@ TEST(TestLogisticRegression, TestMainLogisticRegressionManualCopy)
                wIn->data()[1] -= learningRate * wOutJ->data()[j];
                bIn->data()[0] -= learningRate * bOut->data()[j];
            }
-            wIn->mapDataIntoHostMemory();
-            bIn->mapDataIntoHostMemory();
        }

        // Based on the inputs the outputs should be at least:
--- a/test/TestManager.cpp
+++ b/test/TestManager.cpp
@ -7,9 +7,9 @@ TEST(TestManager, EndToEndOpMultEvalFlow)
 {
    kp::Manager mgr;

-    std::shared_ptr<kp::Tensor> tensorLHS = mgr.tensor({ 0, 1, 2 });
-    std::shared_ptr<kp::Tensor> tensorRHS = mgr.tensor({ 2, 4, 6 });
-    std::shared_ptr<kp::Tensor> tensorOutput = mgr.tensor({ 0, 0, 0 });
+    std::shared_ptr<kp::TensorT<float>> tensorLHS = mgr.tensor({ 0, 1, 2 });
+    std::shared_ptr<kp::TensorT<float>> tensorRHS = mgr.tensor({ 2, 4, 6 });
+    std::shared_ptr<kp::TensorT<float>> tensorOutput = mgr.tensor({ 0, 0, 0 });

    std::vector<std::shared_ptr<kp::Tensor>> params = { tensorLHS,
                                                        tensorRHS,
@ -20,16 +20,16 @@ TEST(TestManager, EndToEndOpMultEvalFlow)
      ->eval<kp::OpMult>(params, mgr.algorithm())
      ->eval<kp::OpTensorSyncLocal>(params);

-    EXPECT_EQ(tensorOutput->data(), std::vector<float>({ 0, 4, 12 }));
+    EXPECT_EQ(tensorOutput->vector(), std::vector<float>({ 0, 4, 12 }));
 }

 TEST(TestManager, EndToEndOpMultSeqFlow)
 {
    kp::Manager mgr;

-    std::shared_ptr<kp::Tensor> tensorLHS = mgr.tensor({ 0, 1, 2 });
-    std::shared_ptr<kp::Tensor> tensorRHS = mgr.tensor({ 2, 4, 6 });
-    std::shared_ptr<kp::Tensor> tensorOutput = mgr.tensor({ 0, 0, 0 });
+    std::shared_ptr<kp::TensorT<float>> tensorLHS = mgr.tensor({ 0, 1, 2 });
+    std::shared_ptr<kp::TensorT<float>> tensorRHS = mgr.tensor({ 2, 4, 6 });
+    std::shared_ptr<kp::TensorT<float>> tensorOutput = mgr.tensor({ 0, 0, 0 });

    std::vector<std::shared_ptr<kp::Tensor>> params = { tensorLHS,
                                                        tensorRHS,
@ -41,16 +41,16 @@ TEST(TestManager, EndToEndOpMultSeqFlow)
      ->record<kp::OpTensorSyncLocal>(params)
      ->eval();

-    EXPECT_EQ(tensorOutput->data(), std::vector<float>({ 0, 4, 12 }));
+    EXPECT_EQ(tensorOutput->vector(), std::vector<float>({ 0, 4, 12 }));
 }

 TEST(TestManager, TestMultipleSequences)
 {
    kp::Manager mgr;

-    std::shared_ptr<kp::Tensor> tensorLHS = mgr.tensor({ 0, 1, 2 });
-    std::shared_ptr<kp::Tensor> tensorRHS = mgr.tensor({ 2, 4, 6 });
-    std::shared_ptr<kp::Tensor> tensorOutput = mgr.tensor({ 0, 0, 0 });
+    std::shared_ptr<kp::TensorT<float>> tensorLHS = mgr.tensor({ 0, 1, 2 });
+    std::shared_ptr<kp::TensorT<float>> tensorRHS = mgr.tensor({ 2, 4, 6 });
+    std::shared_ptr<kp::TensorT<float>> tensorOutput = mgr.tensor({ 0, 0, 0 });

    std::vector<std::shared_ptr<kp::Tensor>> params = { tensorLHS,
                                                        tensorRHS,
@ -60,5 +60,5 @@ TEST(TestManager, TestMultipleSequences)
    mgr.sequence()->eval<kp::OpMult>(params, mgr.algorithm());
    mgr.sequence()->eval<kp::OpTensorSyncLocal>(params);

-    EXPECT_EQ(tensorOutput->data(), std::vector<float>({ 0, 4, 12 }));
+    EXPECT_EQ(tensorOutput->vector(), std::vector<float>({ 0, 4, 12 }));
 }
--- a/test/TestMultipleAlgoExecutions.cpp
+++ b/test/TestMultipleAlgoExecutions.cpp
@ -8,10 +8,12 @@ TEST(TestMultipleAlgoExecutions, TestEndToEndFunctionality)

    kp::Manager mgr;

+    // Default tensor constructor simplifies creation of float values
    auto tensorInA = mgr.tensor({ 2., 2., 2. });
    auto tensorInB = mgr.tensor({ 1., 2., 3. });
-    auto tensorOutA = mgr.tensor({ 0., 0., 0. });
-    auto tensorOutB = mgr.tensor({ 0., 0., 0. });
+    // Explicit type constructor supports uint32, int32, double, float and bool
+    auto tensorOutA = mgr.tensorT<uint32_t>({ 0, 0, 0 });
+    auto tensorOutB = mgr.tensorT<uint32_t>({ 0, 0, 0 });

    std::string shader = (R"(
        #version 450
@ -21,8 +23,8 @@ TEST(TestMultipleAlgoExecutions, TestEndToEndFunctionality)
        // The input tensors bind index is relative to index in parameter passed
        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
-        layout(set = 0, binding = 2) buffer buf_out_a { float out_a[]; };
-        layout(set = 0, binding = 3) buffer buf_out_b { float out_b[]; };
+        layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
+        layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };

        // Kompute supports push constants updated on dispatch
        layout(push_constant) uniform PushConstants {
@ -34,8 +36,8 @@ TEST(TestMultipleAlgoExecutions, TestEndToEndFunctionality)

        void main() {
            uint index = gl_GlobalInvocationID.x;
-            out_a[index] += in_a[index] * in_b[index];
-            out_b[index] += const_one * push_const.val;
+            out_a[index] += uint( in_a[index] * in_b[index] );
+            out_b[index] += uint( const_one * push_const.val );
        }
    )");

@ -49,7 +51,7 @@ TEST(TestMultipleAlgoExecutions, TestEndToEndFunctionality)
    kp::Constants pushConstsB({ 3.0 });

    auto algorithm = mgr.algorithm(
-      params, kp::Shader::compile_source(shader), workgroup, specConsts, pushConstsA);
+      params, kp::Shader::compileSource(shader), workgroup, specConsts, pushConstsA);

    // 3. Run operation with string shader synchronously
    mgr.sequence()
@ -64,8 +66,8 @@ TEST(TestMultipleAlgoExecutions, TestEndToEndFunctionality)

    sq->evalAwait();

-    EXPECT_EQ(tensorOutA->data(), std::vector<float>({ 4, 8, 12 }));
-    EXPECT_EQ(tensorOutB->data(), std::vector<float>({ 10, 10, 10 }));
+    EXPECT_EQ(tensorOutA->vector(), std::vector<uint32_t>({ 4, 8, 12 }));
+    EXPECT_EQ(tensorOutB->vector(), std::vector<uint32_t>({ 10, 10, 10 }));
 }

 TEST(TestMultipleAlgoExecutions, SingleSequenceRecord)
@ -73,7 +75,7 @@ TEST(TestMultipleAlgoExecutions, SingleSequenceRecord)

    kp::Manager mgr;

-    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor({ 0, 0, 0 });
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor({ 0, 0, 0 });

    std::string shader(R"(
      #version 450
@ -84,7 +86,7 @@ TEST(TestMultipleAlgoExecutions, SingleSequenceRecord)
          pa[index] = pa[index] + 1;
      })");

-    std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
+    std::vector<uint32_t> spirv = kp::Shader::compileSource(shader);

    {
        mgr.sequence()
@ -96,14 +98,14 @@ TEST(TestMultipleAlgoExecutions, SingleSequenceRecord)
          ->eval();
    }

-    EXPECT_EQ(tensorA->data(), std::vector<float>({ 3, 3, 3 }));
+    EXPECT_EQ(tensorA->vector(), std::vector<float>({ 3, 3, 3 }));
 }

 TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords)
 {
    kp::Manager mgr;

-    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor({ 0, 0, 0 });
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor({ 0, 0, 0 });

    std::string shader(R"(
      #version 450
@ -114,7 +116,7 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords)
          pa[index] = pa[index] + 1;
      })");

-    std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
+    std::vector<uint32_t> spirv = kp::Shader::compileSource(shader);

    std::shared_ptr<kp::Algorithm> algorithm =
      mgr.algorithm({ tensorA }, spirv);
@ -131,7 +133,7 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords)

    mgr.sequence()->record<kp::OpTensorSyncLocal>({ tensorA })->eval();

-    EXPECT_EQ(tensorA->data(), std::vector<float>({ 3, 3, 3 }));
+    EXPECT_EQ(tensorA->vector(), std::vector<float>({ 3, 3, 3 }));
 }

 TEST(TestMultipleAlgoExecutions, MultipleSequences)
@ -139,7 +141,7 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences)

    kp::Manager mgr;

-    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor({ 0, 0, 0 });
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor({ 0, 0, 0 });

    std::string shader(R"(
      #version 450
@ -150,7 +152,7 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences)
          pa[index] = pa[index] + 1;
      })");

-    std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
+    std::vector<uint32_t> spirv = kp::Shader::compileSource(shader);

    std::shared_ptr<kp::Algorithm> algorithm =
      mgr.algorithm({ tensorA }, spirv);
@ -167,14 +169,14 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences)

    sq->record<kp::OpTensorSyncLocal>({ tensorA })->eval();

-    EXPECT_EQ(tensorA->data(), std::vector<float>({ 3, 3, 3 }));
+    EXPECT_EQ(tensorA->vector(), std::vector<float>({ 3, 3, 3 }));
 }

 TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)
 {
    kp::Manager mgr;

-    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor({ 0, 0, 0 });
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor({ 0, 0, 0 });

    std::string shader(R"(
      #version 450
@ -185,7 +187,7 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)
          pa[index] = pa[index] + 1;
      })");

-    std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
+    std::vector<uint32_t> spirv = kp::Shader::compileSource(shader);

    std::shared_ptr<kp::Algorithm> algorithm =
      mgr.algorithm({ tensorA }, spirv);
@ -198,43 +200,6 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)

    sq->record<kp::OpTensorSyncLocal>({ tensorA })->eval();

-    EXPECT_EQ(tensorA->data(), std::vector<float>({ 3, 3, 3 }));
+    EXPECT_EQ(tensorA->vector(), std::vector<float>({ 3, 3, 3 }));
 }

-TEST(TestMultipleAlgoExecutions, SequenceAlgoDestroyOutsideManagerScope)
-{
-    std::shared_ptr<kp::Tensor> tensorA = nullptr;
-
-    {
-        std::shared_ptr<kp::Sequence> sq = nullptr;
-        {
-            kp::Manager mgr;
-
-            tensorA = mgr.tensor({ 0, 0, 0 });
-
-            std::string shader(R"(
-              #version 450
-              layout (local_size_x = 1) in;
-              layout(set = 0, binding = 0) buffer a { float pa[]; };
-              void main() {
-                  uint index = gl_GlobalInvocationID.x;
-                  pa[index] = pa[index] + 1;
-              })");
-
-            std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
-
-            std::shared_ptr<kp::Algorithm> algorithm =
-              mgr.algorithm({ tensorA }, spirv);
-
-            sq = mgr.sequence();
-
-            sq->record<kp::OpTensorSyncDevice>({ tensorA })->eval();
-
-            sq->record<kp::OpAlgoDispatch>(algorithm)->eval()->eval()->eval();
-
-            sq->record<kp::OpTensorSyncLocal>({ tensorA })->eval();
-        }
-    }
-
-    EXPECT_EQ(tensorA->data(), std::vector<float>({ 3, 3, 3 }));
-}
--- a/test/TestOpShadersFromStringAndFile.cpp
+++ b/test/TestOpShadersFromStringAndFile.cpp
@ -9,8 +9,8 @@ TEST(TestOpAlgoCreate, ShaderRawDataFromConstructor)
 {
    kp::Manager mgr;

-    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor({ 3, 4, 5 });
-    std::shared_ptr<kp::Tensor> tensorB = mgr.tensor({ 0, 0, 0 });
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor({ 3, 4, 5 });
+    std::shared_ptr<kp::TensorT<float>> tensorB = mgr.tensor({ 0, 0, 0 });

    std::string shader(R"(
        #version 450
@ -27,7 +27,7 @@ TEST(TestOpAlgoCreate, ShaderRawDataFromConstructor)
        }
    )");

-    std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
+    std::vector<uint32_t> spirv = kp::Shader::compileSource(shader);

    std::vector<std::shared_ptr<kp::Tensor>> params = { tensorA, tensorB };

@ -36,16 +36,16 @@ TEST(TestOpAlgoCreate, ShaderRawDataFromConstructor)
      ->eval<kp::OpAlgoDispatch>(mgr.algorithm(params, spirv))
      ->eval<kp::OpTensorSyncLocal>(params);

-    EXPECT_EQ(tensorA->data(), std::vector<float>({ 0, 1, 2 }));
-    EXPECT_EQ(tensorB->data(), std::vector<float>({ 3, 4, 5 }));
+    EXPECT_EQ(tensorA->vector(), std::vector<float>({ 0, 1, 2 }));
+    EXPECT_EQ(tensorB->vector(), std::vector<float>({ 3, 4, 5 }));
 }

 TEST(TestOpAlgoCreate, ShaderCompiledDataFromConstructor)
 {
    kp::Manager mgr;

-    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor({ 3, 4, 5 });
-    std::shared_ptr<kp::Tensor> tensorB = mgr.tensor({ 0, 0, 0 });
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor({ 3, 4, 5 });
+    std::shared_ptr<kp::TensorT<float>> tensorB = mgr.tensor({ 0, 0, 0 });

    std::vector<uint32_t> spirv = std::vector<uint32_t>(
      (uint32_t*)
@ -62,8 +62,8 @@ TEST(TestOpAlgoCreate, ShaderCompiledDataFromConstructor)
      ->eval<kp::OpAlgoDispatch>(mgr.algorithm(params, spirv))
      ->eval<kp::OpTensorSyncLocal>(params);

-    EXPECT_EQ(tensorA->data(), std::vector<float>({ 0, 1, 2 }));
-    EXPECT_EQ(tensorB->data(), std::vector<float>({ 3, 4, 5 }));
+    EXPECT_EQ(tensorA->vector(), std::vector<float>({ 0, 1, 2 }));
+    EXPECT_EQ(tensorB->vector(), std::vector<float>({ 3, 4, 5 }));
 }

 // TODO: Add support to read from file for shader
@ -71,8 +71,8 @@ TEST(TestOpAlgoCreate, ShaderCompiledDataFromConstructor)
 //{
 //    kp::Manager mgr;
 //
-//    std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor({ 3, 4, 5 }) };
-//    std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor({ 0, 0, 0 }) };
+//    std::shared_ptr<kp::TensorT<float>> tensorA{ new kp::Tensor({ 3, 4, 5 }) };
+//    std::shared_ptr<kp::TensorT<float>> tensorB{ new kp::Tensor({ 0, 0, 0 }) };
 //    mgr.rebuild({ tensorA, tensorB });
 //
 //    mgr.evalOpDefault<kp::OpAlgoCreate>(
@ -81,6 +81,6 @@ TEST(TestOpAlgoCreate, ShaderCompiledDataFromConstructor)
 //
 //    mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
 //
-//    EXPECT_EQ(tensorA->data(), std::vector<float>({ 0, 1, 2 }));
-//    EXPECT_EQ(tensorB->data(), std::vector<float>({ 3, 4, 5 }));
+//    EXPECT_EQ(tensorA->vector(), std::vector<float>({ 0, 1, 2 }));
+//    EXPECT_EQ(tensorB->vector(), std::vector<float>({ 3, 4, 5 }));
 //}
--- a/test/TestOpTensorCopy.cpp
+++ b/test/TestOpTensorCopy.cpp
@ -11,8 +11,8 @@ TEST(TestOpTensorCopy, CopyDeviceToDeviceTensor)
    std::vector<float> testVecA{ 1, 2, 3 };
    std::vector<float> testVecB{ 0, 0, 0 };

-    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor(testVecA);
-    std::shared_ptr<kp::Tensor> tensorB = mgr.tensor(testVecB);
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor(testVecA);
+    std::shared_ptr<kp::TensorT<float>> tensorB = mgr.tensor(testVecB);

    EXPECT_TRUE(tensorA->isInit());
    EXPECT_TRUE(tensorB->isInit());
@ -22,8 +22,8 @@ TEST(TestOpTensorCopy, CopyDeviceToDeviceTensor)
      ->eval<kp::OpTensorCopy>({ tensorA, tensorB })
      ->eval<kp::OpTensorSyncLocal>({ tensorA, tensorB });

-    // Making sure the GPU holds the same data
-    EXPECT_EQ(tensorA->data(), tensorB->data());
+    // Making sure the GPU holds the same vector
+    EXPECT_EQ(tensorA->vector(), tensorB->vector());
 }

 TEST(TestOpTensorCopy, CopyDeviceToDeviceTensorMulti)
@ -35,9 +35,9 @@ TEST(TestOpTensorCopy, CopyDeviceToDeviceTensorMulti)
    std::vector<float> testVecB{ 0, 0, 0 };
    std::vector<float> testVecC{ 0, 0, 0 };

-    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor(testVecA);
-    std::shared_ptr<kp::Tensor> tensorB = mgr.tensor(testVecB);
-    std::shared_ptr<kp::Tensor> tensorC = mgr.tensor(testVecC);
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor(testVecA);
+    std::shared_ptr<kp::TensorT<float>> tensorB = mgr.tensor(testVecB);
+    std::shared_ptr<kp::TensorT<float>> tensorC = mgr.tensor(testVecC);

    EXPECT_TRUE(tensorA->isInit());
    EXPECT_TRUE(tensorB->isInit());
@ -47,14 +47,14 @@ TEST(TestOpTensorCopy, CopyDeviceToDeviceTensorMulti)
      ->eval<kp::OpTensorSyncLocal>({ tensorA, tensorB, tensorC })
      ->eval<kp::OpTensorCopy>({ tensorA, tensorB, tensorC });

-    EXPECT_EQ(tensorA->data(), tensorB->data());
-    EXPECT_EQ(tensorA->data(), tensorC->data());
+    EXPECT_EQ(tensorA->vector(), tensorB->vector());
+    EXPECT_EQ(tensorA->vector(), tensorC->vector());

-    // Making sure the GPU holds the same data
+    // Making sure the GPU holds the same vector
    mgr.sequence()->eval<kp::OpTensorSyncLocal>({ tensorB, tensorC });

-    EXPECT_EQ(tensorA->data(), tensorB->data());
-    EXPECT_EQ(tensorA->data(), tensorC->data());
+    EXPECT_EQ(tensorA->vector(), tensorB->vector());
+    EXPECT_EQ(tensorA->vector(), tensorC->vector());
 }

 TEST(TestOpTensorCopy, CopyDeviceToHostTensor)
@ -65,8 +65,8 @@ TEST(TestOpTensorCopy, CopyDeviceToHostTensor)
    std::vector<float> testVecA{ 3, 4, 5 };
    std::vector<float> testVecB{ 0, 0, 0 };

-    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor(testVecA);
-    std::shared_ptr<kp::Tensor> tensorB =
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor(testVecA);
+    std::shared_ptr<kp::TensorT<float>> tensorB =
      mgr.tensor(testVecB, kp::Tensor::TensorTypes::eHost);

    //  Only calling sync on device type tensor
@ -77,11 +77,11 @@ TEST(TestOpTensorCopy, CopyDeviceToHostTensor)

    mgr.sequence()->eval<kp::OpTensorCopy>({ tensorA, tensorB });

-    EXPECT_EQ(tensorA->data(), tensorB->data());
+    EXPECT_EQ(tensorA->vector(), tensorB->vector());

-    // Making sure the GPU holds the same data
+    // Making sure the GPU holds the same vector
    mgr.sequence()->eval<kp::OpTensorSyncLocal>({ tensorB });
-    EXPECT_EQ(tensorA->data(), tensorB->data());
+    EXPECT_EQ(tensorA->vector(), tensorB->vector());
 }

 TEST(TestOpTensorCopy, CopyHostToDeviceTensor)
@ -92,9 +92,9 @@ TEST(TestOpTensorCopy, CopyHostToDeviceTensor)
    std::vector<float> testVecA{ 4, 5, 6 };
    std::vector<float> testVecB{ 0, 0, 0 };

-    std::shared_ptr<kp::Tensor> tensorA =
+    std::shared_ptr<kp::TensorT<float>> tensorA =
      mgr.tensor(testVecA, kp::Tensor::TensorTypes::eHost);
-    std::shared_ptr<kp::Tensor> tensorB = mgr.tensor(testVecB);
+    std::shared_ptr<kp::TensorT<float>> tensorB = mgr.tensor(testVecB);

    //  Only calling sync on device type tensor
    mgr.sequence()->eval<kp::OpTensorSyncDevice>({ tensorA, tensorB });
@ -104,11 +104,11 @@ TEST(TestOpTensorCopy, CopyHostToDeviceTensor)

    mgr.sequence()->eval<kp::OpTensorCopy>({ tensorA, tensorB });

-    EXPECT_EQ(tensorA->data(), tensorB->data());
+    EXPECT_EQ(tensorA->vector(), tensorB->vector());

-    // Making sure the GPU holds the same data
+    // Making sure the GPU holds the same vector
    mgr.sequence()->eval<kp::OpTensorSyncLocal>({ tensorB });
-    EXPECT_EQ(tensorA->data(), tensorB->data());
+    EXPECT_EQ(tensorA->vector(), tensorB->vector());
 }

 TEST(TestOpTensorCopy, CopyHostToHostTensor)
@ -119,9 +119,9 @@ TEST(TestOpTensorCopy, CopyHostToHostTensor)
    std::vector<float> testVecA{ 5, 6, 7 };
    std::vector<float> testVecB{ 0, 0, 0 };

-    std::shared_ptr<kp::Tensor> tensorA =
+    std::shared_ptr<kp::TensorT<float>> tensorA =
      mgr.tensor(testVecA, kp::Tensor::TensorTypes::eHost);
-    std::shared_ptr<kp::Tensor> tensorB =
+    std::shared_ptr<kp::TensorT<float>> tensorB =
      mgr.tensor(testVecB, kp::Tensor::TensorTypes::eHost);

    EXPECT_TRUE(tensorA->isInit());
@ -131,11 +131,11 @@ TEST(TestOpTensorCopy, CopyHostToHostTensor)
      ->eval<kp::OpTensorSyncDevice>({ tensorA })
      ->eval<kp::OpTensorCopy>({ tensorA, tensorB });

-    EXPECT_EQ(tensorA->data(), tensorB->data());
+    EXPECT_EQ(tensorA->vector(), tensorB->vector());

-    // Making sure the GPU holds the same data
+    // Making sure the GPU holds the same vector
    mgr.sequence()->eval<kp::OpTensorSyncLocal>({ tensorB });
-    EXPECT_EQ(tensorA->data(), tensorB->data());
+    EXPECT_EQ(tensorA->vector(), tensorB->vector());
 }

 TEST(TestOpTensorCopy, SingleTensorShouldFail)
@ -145,7 +145,7 @@ TEST(TestOpTensorCopy, SingleTensorShouldFail)

    std::vector<float> testVecA{ 6, 7, 8 };

-    std::shared_ptr<kp::Tensor> tensorA =
+    std::shared_ptr<kp::TensorT<float>> tensorA =
      mgr.tensor(testVecA, kp::Tensor::TensorTypes::eHost);

    EXPECT_TRUE(tensorA->isInit());
--- a/test/TestOpTensorCreate.cpp
+++ b/test/TestOpTensorCreate.cpp
@ -6,7 +6,7 @@
 TEST(TestOpTensorCreate, CreateSingleTensorSingleOp)
 {
    std::vector<float> testVecA{ 9, 8, 7 };
-    std::shared_ptr<kp::Tensor> tensorA = nullptr;
+    std::shared_ptr<kp::TensorT<float>> tensorA = nullptr;

    {
        kp::Manager mgr;
@ -15,7 +15,7 @@ TEST(TestOpTensorCreate, CreateSingleTensorSingleOp)

        EXPECT_TRUE(tensorA->isInit());

-        EXPECT_EQ(tensorA->data(), testVecA);
+        EXPECT_EQ(tensorA->vector(), testVecA);
    }

    EXPECT_FALSE(tensorA->isInit());
@ -29,11 +29,11 @@ TEST(TestOpTensorCreate, NoErrorIfTensorFreedBefore)

    kp::Manager mgr;

-    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor(testVecA);
-    std::shared_ptr<kp::Tensor> tensorB = mgr.tensor(testVecB);
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor(testVecA);
+    std::shared_ptr<kp::TensorT<float>> tensorB = mgr.tensor(testVecB);

-    EXPECT_EQ(tensorA->data(), testVecA);
-    EXPECT_EQ(tensorB->data(), testVecB);
+    EXPECT_EQ(tensorA->vector(), testVecA);
+    EXPECT_EQ(tensorB->vector(), testVecB);

    tensorA->destroy();
    tensorB->destroy();
@ -49,7 +49,7 @@ TEST(TestOpTensorCreate, ExceptionOnZeroSizeTensor)
    kp::Manager mgr;

    try {
-        std::shared_ptr<kp::Tensor> tensorA = mgr.tensor(testVecA);
+        std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor(testVecA);
    } catch (const std::runtime_error& err) {
        // check exception
        ASSERT_TRUE(std::string(err.what()).find("zero-sized") !=
--- a/test/TestOpTensorSync.cpp
+++ b/test/TestOpTensorSync.cpp
@ -11,7 +11,7 @@ TEST(TestOpTensorSync, SyncToDeviceMemorySingleTensor)
    std::vector<float> testVecPreA{ 0, 0, 0 };
    std::vector<float> testVecPostA{ 9, 8, 7 };

-    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor(testVecPreA);
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor(testVecPreA);

    EXPECT_TRUE(tensorA->isInit());

@ -21,7 +21,7 @@ TEST(TestOpTensorSync, SyncToDeviceMemorySingleTensor)

    mgr.sequence()->eval<kp::OpTensorSyncLocal>({ tensorA });

-    EXPECT_EQ(tensorA->data(), testVecPostA);
+    EXPECT_EQ(tensorA->vector(), testVecPostA);
 }

 TEST(TestOpTensorSync, SyncToDeviceMemoryMultiTensor)
@ -31,9 +31,9 @@ TEST(TestOpTensorSync, SyncToDeviceMemoryMultiTensor)

    std::vector<float> testVec{ 9, 8, 7 };

-    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor({ 0, 0, 0 });
-    std::shared_ptr<kp::Tensor> tensorB = mgr.tensor({ 0, 0, 0 });
-    std::shared_ptr<kp::Tensor> tensorC = mgr.tensor({ 0, 0, 0 });
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor({ 0, 0, 0 });
+    std::shared_ptr<kp::TensorT<float>> tensorB = mgr.tensor({ 0, 0, 0 });
+    std::shared_ptr<kp::TensorT<float>> tensorC = mgr.tensor({ 0, 0, 0 });

    EXPECT_TRUE(tensorA->isInit());
    EXPECT_TRUE(tensorB->isInit());
@ -47,7 +47,7 @@ TEST(TestOpTensorSync, SyncToDeviceMemoryMultiTensor)

    mgr.sequence()->eval<kp::OpTensorSyncLocal>({ tensorA, tensorB, tensorC });

-    EXPECT_EQ(tensorA->data(), testVec);
-    EXPECT_EQ(tensorB->data(), testVec);
-    EXPECT_EQ(tensorC->data(), testVec);
+    EXPECT_EQ(tensorA->vector(), testVec);
+    EXPECT_EQ(tensorB->vector(), testVec);
+    EXPECT_EQ(tensorC->vector(), testVec);
 }
--- a/test/TestPushConstant.cpp
+++ b/test/TestPushConstant.cpp
@ -22,14 +22,14 @@ TEST(TestPushConstants, TestConstantsAlgoDispatchOverride)
              pa[2] += pcs.z;
          })");

-        std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
+        std::vector<uint32_t> spirv = kp::Shader::compileSource(shader);

        std::shared_ptr<kp::Sequence> sq = nullptr;

        {
            kp::Manager mgr;

-            std::shared_ptr<kp::Tensor> tensor = mgr.tensor({ 0, 0, 0 });
+            std::shared_ptr<kp::TensorT<float>> tensor = mgr.tensor({ 0, 0, 0 });

            std::shared_ptr<kp::Algorithm> algo =
              mgr.algorithm({ tensor }, spirv, kp::Workgroup({ 1 }), {}, { 0.0, 0.0, 0.0 });
@ -42,7 +42,7 @@ TEST(TestPushConstants, TestConstantsAlgoDispatchOverride)
            sq->eval<kp::OpAlgoDispatch>(algo, kp::Constants{ 0.3, 0.2, 0.1 });
            sq->eval<kp::OpTensorSyncLocal>({ tensor });

-            EXPECT_EQ(tensor->data(), kp::Constants({ 0.4, 0.4, 0.4 }));
+            EXPECT_EQ(tensor->vector(), kp::Constants({ 0.4, 0.4, 0.4 }));
        }
    }
 }
@ -65,14 +65,14 @@ TEST(TestPushConstants, TestConstantsAlgoDispatchNoOverride)
              pa[2] += pcs.z;
          })");

-        std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
+        std::vector<uint32_t> spirv = kp::Shader::compileSource(shader);

        std::shared_ptr<kp::Sequence> sq = nullptr;

        {
            kp::Manager mgr;

-            std::shared_ptr<kp::Tensor> tensor = mgr.tensor({ 0, 0, 0 });
+            std::shared_ptr<kp::TensorT<float>> tensor = mgr.tensor({ 0, 0, 0 });

            std::shared_ptr<kp::Algorithm> algo =
              mgr.algorithm({ tensor }, spirv, kp::Workgroup({ 1 }), {}, { 0.1, 0.2, 0.3 });
@ -85,7 +85,7 @@ TEST(TestPushConstants, TestConstantsAlgoDispatchNoOverride)
            sq->eval<kp::OpAlgoDispatch>(algo, kp::Constants{ 0.3, 0.2, 0.1 });
            sq->eval<kp::OpTensorSyncLocal>({ tensor });

-            EXPECT_EQ(tensor->data(), kp::Constants({ 0.4, 0.4, 0.4 }));
+            EXPECT_EQ(tensor->vector(), kp::Constants({ 0.4, 0.4, 0.4 }));
        }
    }
 }
@ -108,14 +108,14 @@ TEST(TestPushConstants, TestConstantsWrongSize)
              pa[2] += pcs.z;
          })");

-        std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
+        std::vector<uint32_t> spirv = kp::Shader::compileSource(shader);

        std::shared_ptr<kp::Sequence> sq = nullptr;

        {
            kp::Manager mgr;

-            std::shared_ptr<kp::Tensor> tensor = mgr.tensor({ 0, 0, 0 });
+            std::shared_ptr<kp::TensorT<float>> tensor = mgr.tensor({ 0, 0, 0 });

            std::shared_ptr<kp::Algorithm> algo =
              mgr.algorithm({ tensor }, spirv, kp::Workgroup({ 1 }), {}, { 0.0 });
--- a/test/TestSequence.cpp
+++ b/test/TestSequence.cpp
@ -60,13 +60,13 @@ TEST(TestSequence, RerecordSequence)

    std::shared_ptr<kp::Sequence> sq = mgr.sequence();

-    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor({1, 2, 3});
-    std::shared_ptr<kp::Tensor> tensorB = mgr.tensor({2, 2, 2});
-    std::shared_ptr<kp::Tensor> tensorOut = mgr.tensor({0, 0, 0});
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor({1, 2, 3});
+    std::shared_ptr<kp::TensorT<float>> tensorB = mgr.tensor({2, 2, 2});
+    std::shared_ptr<kp::TensorT<float>> tensorOut = mgr.tensor({0, 0, 0});

    sq->eval<kp::OpTensorSyncDevice>({ tensorA, tensorB, tensorOut });

-    std::vector<uint32_t> spirv = kp::Shader::compile_source(R"(
+    std::vector<uint32_t> spirv = kp::Shader::compileSource(R"(
        #version 450

        layout (local_size_x = 1) in;
@ -90,7 +90,7 @@ TEST(TestSequence, RerecordSequence)

    sq->eval();

-    EXPECT_EQ(tensorOut->data(), std::vector<float>({2, 4, 6}));
+    EXPECT_EQ(tensorOut->vector(), std::vector<float>({2, 4, 6}));

    algo->rebuild({tensorOut, tensorA, tensorB}, spirv);

@ -98,7 +98,7 @@ TEST(TestSequence, RerecordSequence)
    sq->rerecord();
    sq->eval();

-    EXPECT_EQ(tensorB->data(), std::vector<float>({2, 8, 18}));
+    EXPECT_EQ(tensorB->vector(), std::vector<float>({2, 8, 18}));
 }


@ -117,7 +117,7 @@ TEST(TestSequence, SequenceTimestamps)
          pa[index] = pa[index] + 1;
      })");

-    std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
+    std::vector<uint32_t> spirv = kp::Shader::compileSource(shader);
    
    auto seq = mgr.sequence(0, 100); //100 timestamps
    seq->record<kp::OpTensorSyncDevice>({ tensorA })
--- a/test/TestShaderResources.cpp
+++ b/test/TestShaderResources.cpp
@ -25,7 +25,7 @@ static const std::string shaderString = (R"(
 )");

 void compileShaderWithGivenResources(const std::string shaderString, const TBuiltInResource resources) {
-    kp::Shader::compile_source(shaderString,  std::string("main"), std::vector<std::pair<std::string,std::string>>({}), resources);
+    kp::Shader::compileSource(shaderString,  std::string("main"), std::vector<std::pair<std::string,std::string>>({}), resources);
 }


--- a/test/TestSpecializationConstant.cpp
+++ b/test/TestSpecializationConstant.cpp
@ -18,15 +18,15 @@ TEST(TestSpecializationConstants, TestTwoConstants)
              pb[index] = cTwo;
          })");

-        std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
+        std::vector<uint32_t> spirv = kp::Shader::compileSource(shader);

        std::shared_ptr<kp::Sequence> sq = nullptr;

        {
            kp::Manager mgr;

-            std::shared_ptr<kp::Tensor> tensorA = mgr.tensor({ 0, 0, 0 });
-            std::shared_ptr<kp::Tensor> tensorB = mgr.tensor({ 0, 0, 0 });
+            std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor({ 0, 0, 0 });
+            std::shared_ptr<kp::TensorT<float>> tensorB = mgr.tensor({ 0, 0, 0 });

            std::vector<std::shared_ptr<kp::Tensor>> params = { tensorA,
                                                                tensorB };
@ -42,8 +42,8 @@ TEST(TestSpecializationConstants, TestTwoConstants)
                   ->record<kp::OpTensorSyncLocal>(params)
                   ->eval();

-            EXPECT_EQ(tensorA->data(), std::vector<float>({ 5, 5, 5 }));
-            EXPECT_EQ(tensorB->data(), std::vector<float>({ 0.3, 0.3, 0.3 }));
+            EXPECT_EQ(tensorA->vector(), std::vector<float>({ 5, 5, 5 }));
+            EXPECT_EQ(tensorB->vector(), std::vector<float>({ 0.3, 0.3, 0.3 }));
        }
    }
 }
--- a/test/TestTensor.cpp
+++ b/test/TestTensor.cpp
@ -7,7 +7,7 @@ TEST(TestTensor, ConstructorData)
 {
    kp::Manager mgr;
    std::vector<float> vec{ 0, 1, 2 };
-    std::shared_ptr<kp::Tensor> tensor = mgr.tensor(vec);
+    std::shared_ptr<kp::TensorT<float>> tensor = mgr.tensor(vec);
    EXPECT_EQ(tensor->size(), vec.size());
-    EXPECT_EQ(tensor->data(), vec);
+    EXPECT_EQ(tensor->vector(), vec);
 }
--- a/test/TestWorkgroup.cpp
+++ b/test/TestWorkgroup.cpp
@ -7,8 +7,8 @@

 TEST(TestWorkgroup, TestSimpleWorkgroup)
 {
-    std::shared_ptr<kp::Tensor> tensorA = nullptr;
-    std::shared_ptr<kp::Tensor> tensorB = nullptr;
+    std::shared_ptr<kp::TensorT<float>> tensorA = nullptr;
+    std::shared_ptr<kp::TensorT<float>> tensorB = nullptr;
    {
        std::shared_ptr<kp::Sequence> sq = nullptr;

@ -39,29 +39,29 @@ TEST(TestWorkgroup, TestSimpleWorkgroup)
            sq->record<kp::OpAlgoDispatch>(algorithm);
            sq->record<kp::OpTensorSyncLocal>(params);
            sq->eval();
+
+            std::vector<float> expectedA = {
+                0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,
+                2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,
+                4,  4,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  5,
+                6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,
+                8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,
+                10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11,
+                12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13,
+                14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15
+            };
+
+            std::vector<float> expectedB = {
+                0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5,
+                6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3,
+                4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1,
+                2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
+                0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5,
+                6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
+            };
+
+            EXPECT_EQ(tensorA->vector(), expectedA);
+            EXPECT_EQ(tensorB->vector(), expectedB);
        }
    }
-
-    std::vector<float> expectedA = {
-        0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,
-        2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,
-        4,  4,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  5,
-        6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,
-        8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,
-        10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11,
-        12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13,
-        14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15
-    };
-
-    std::vector<float> expectedB = {
-        0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5,
-        6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3,
-        4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1,
-        2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
-        0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5,
-        6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
-    };
-
-    EXPECT_EQ(tensorA->data(), expectedA);
-    EXPECT_EQ(tensorB->data(), expectedB);
 }