diff --git a/.ccls b/.ccls
index 2ce15d72f..f215ea9d1 100644
--- a/.ccls
+++ b/.ccls
@@ -13,6 +13,7 @@
-DDEBUG=1
-DKOMPUTE_INCLUDE_FOR_SYNTAX
+-I./python/pybind11/include/
-I./external/Vulkan-Headers/include/
-I./external/googletest/googletest/include/
-I./external/spdlog/include/
diff --git a/.gitmodules b/.gitmodules
index 1c5db0adc..33549db54 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,6 @@
path = external/spdlog
url = https://github.com/gabime/spdlog
branch = v1.8.1
+[submodule "python/pybind11"]
+ path = python/pybind11
+ url = https://github.com/pybind/pybind11
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 52e45fcf9..454876d4e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.4.1)
-project(kompute VERSION 0.3.0)
+project(kompute VERSION 0.4.1)
set(CMAKE_CXX_STANDARD 14)
@@ -13,6 +13,7 @@ option(KOMPUTE_OPT_BUILD_SHADERS "Enable if you want to re-build all shader file
option(KOMPUTE_OPT_BUILD_SINGLE_HEADER "Enable if you want to build the single header file" 0)
option(KOMPUTE_OPT_INSTALL "Enable if you want to enable installation" 0)
# Build options
+option(KOMPUTE_OPT_BUILD_PYTHON "Enable if you want to build python bindings" 0)
option(KOMPUTE_OPT_ENABLE_SPDLOG "Extra compile flags for Kompute, see docs for full list" 0)
option(KOMPUTE_OPT_REPO_SUBMODULE_BUILD, "Use the submodule repos instead of external package manager" 0)
option(KOMPUTE_OPT_ANDOID_BUILD "Enable android compilation flags required" 0)
@@ -43,12 +44,16 @@ endfunction()
add_subdirectory(src)
+if(KOMPUTE_OPT_BUILD_TESTS)
+ add_subdirectory(test)
+endif()
+
if(KOMPUTE_OPT_BUILD_DOCS)
set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/config" ${CMAKE_MODULE_PATH})
add_subdirectory(docs)
endif()
-if(KOMPUTE_OPT_BUILD_TESTS)
- add_subdirectory(test)
+if(KOMPUTE_OPT_BUILD_PYTHON)
+ add_subdirectory(python)
endif()
diff --git a/README.md b/README.md
index 2ff7e0f9a..b745ccfbf 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
-
+



@@ -15,7 +15,7 @@
Vulkan Kompute
-The General Purpose Vulkan Compute Framework.
+The General Purpose Vulkan Compute Framework for C++ and Python.
|
@@ -29,10 +29,10 @@
## Principles & Features
-* [Single header](#setup) library for simple import to your project
-* [Documentation](https://kompute.cc) leveraging doxygen and sphinx
-* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) capabilities with multi-queue command submission
-* [Non-Vulkan naming conventions](#architectural-overview) to disambiguate Vulkan vs Kompute components
+* [Single header](#setup) for simple import with flexible build-system configuration
+* Multi-language support with C++ as core SDK as well as [optimized Python bindings](#python-package)
+* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) support through GPU family queues
+* [Mobile enabled](#mobile-enabled) with examples in Android studio across several architectures
* BYOV: [Bring-your-own-Vulkan design](#motivations) to play nice with existing Vulkan applications
* Explicit relationships for GPU and host [memory ownership and memory management](https://kompute.cc/overview/memory-management.html)
* [Short code examples](#simple-examples) showing the core features
@@ -118,7 +118,7 @@ int main() {
mgr.evalOpAwaitDefault();
// 5. Create managed sequence to submit batch operations to the CPU
- std::shared_ptr sq = mgr.getOrCreateManagedSequence("seq").lock();
+ std::shared_ptr sq = mgr.getOrCreateManagedSequence("seq");
// 5.1. Explicitly begin recording batch commands
sq->begin();
@@ -255,13 +255,79 @@ You can also access the
- Advanced Examples
+ Simple & Advanced Examples
+ Python Package Overview
Asynchronous & Parallel Operations
Memory Management Principles
Build System Deep Dive
Converting GLSL/HLSL Shaders to C++ Headers
Mobile App Integration (Android)
Game Engine Integration (Godot Engine)
+ Python Class Documentation & Reference
+ C++ Class Documentation & Reference
Code Index
diff --git a/docs/overview/python-package.rst b/docs/overview/python-package.rst
new file mode 100644
index 000000000..74e0cba91
--- /dev/null
+++ b/docs/overview/python-package.rst
@@ -0,0 +1,91 @@
+
+Python Package Overview
+=======================
+
+This section provides an overview of the Python Package from a functionality perspective. If you wish to see all the classes and their respective functions you can find that in the `Python Class Reference Section `_.
+
+Below is a diagram that provides insights on the relationship between Vulkan Kompute objects and Vulkan resources, which primarily encompass ownership of either CPU and/or GPU memory.
+
+.. image:: ../images/kompute-architecture.jpg
+ :width: 70%
+
+Python Components
+^^^^^^^^^^^^^^^^^
+
+The Python package exposes three main classes:
+
+* :class:`kp.Manager` - Manages all high level Vulkan and Kompute resources created
+* :class:`kp.Sequence` - Contains a set of recorded operations that can be reused
+* :class:`kp.Tensor` - Core data component to manage GPU and host data used in operations
+
+One thing that you will notice is that the class :class:`kp::OpBase` and all its relevant operator subclasses are not exposed in Python.
+
+This is primarily because the way to interact with the operations is through the respective :class:`kp.Manager` and :class:`kp.Sequence` functions.
+
+More specifically, it can be through the following functions:
+
+* mgr.eval_<opname> - Runs operation under an existing named sequence
+* mgr.eval_<opname>_def - Runs operation under a new anonymous sequence
+* mgr.eval_async_<opname> - Runs operation asynchronously under an existing named sequence
+* mgr.eval_async_<opname>_def - Runs operation asynchronously under a new anonymous sequence
+* seq.record_<opname> - Records operation in sequence (requires sequence to be in recording mode)
+
+You can see these operations being used in the `Simple Python example `_ and in the `Extended Python Example `_.
+
+Kompute Operation Capabilities
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Handling multiple processing capabilities can be done by loading compute shaders into separate sequences. The example below shows how this can be done:
+
+.. code-block:: python
+ :linenos:
+
+ from kp import Manager
+ # We'll assume we have the shader data available
+ from my_spv_shader_data import mult_shader, sum_shader
+
+ mgr = Manager()
+
+ t1 = mgr.build_tensor([2,2,2])
+ t2 = mgr.build_tensor([1,2,3])
+ t3 = mgr.build_tensor([1,2,3])
+
+ # Create multiple separate sequences
+ sq_mult = mgr.create_sequence("SQ_MULT")
+ sq_sum = mgr.create_sequence("SQ_SUM")
+ sq_sync = mgr.create_sequence("SQ_SYNC")
+
+ # Initialize sq_mult
+ sq_mult.begin()
+ sq_mult.record_algo_data([t1, t2, t3], mult_shader)
+ sq_mult.end()
+
+ sq_sum.begin()
+ sq_sum.record_algo_data([t3, t2, t1], sum_shader)
+ sq_sum.end()
+
+ sq_sync.begin()
+ sq_sync.record_tensor_sync_local([t1, t3])
+ sq_sync.end()
+
+ # Run multiple iterations
+ for i in range(10):
+ sq_mult.eval()
+ sq_sum.eval()
+
+ sq_sync.eval()
+
+ print(t1.data(), t2.data(), t3.data())
+
+
+Package Installation
+^^^^^^^^^^^^^^^^^^^^
+
+The package can be installed through the top level `setup.py` by running:
+
+.. code-block:: bash
+
+ pip install .
+
+
+
diff --git a/docs/overview/python-reference.rst b/docs/overview/python-reference.rst
new file mode 100644
index 000000000..0a8eb7a23
--- /dev/null
+++ b/docs/overview/python-reference.rst
@@ -0,0 +1,44 @@
+
+
+Python Class Documentation & Reference
+======================================
+
+This section provides a breakdown of the Python classes and what each of their functions provide.
+Below is a diagram that provides insights on the relationship between Vulkan Kompute objects and Vulkan resources, which primarily encompass ownership of either CPU and/or GPU memory.
+
+.. image:: ../images/kompute-architecture.jpg
+ :width: 70%
+
+Manager
+-------
+
+The Kompute Manager provides a high level interface to simplify interaction with underlying :class:`kp.Sequence` of Operations.
+
+.. autoclass:: kp.Manager
+ :members:
+
+
+Sequence
+--------
+
+The Kompute Sequence consists of batches of Kompute Operations, which are executed on a respective GPU queue. The execution of sequences can be synchronous or asynchronous, and it can be coordinated through its respective Vulkan Fence.
+
+.. autoclass:: kp.Sequence
+ :members:
+
+
+Tensor
+-------
+
+The Kompute Tensor is the atomic unit in Kompute, and it is used primarily for handling Host and GPU Device data.
+
+.. autoclass:: kp.Tensor
+ :members:
+
+
+TensorTypes
+-----------
+
+.. autoclass:: kp.TensorTypes
+ :members:
+
diff --git a/examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp b/examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp
index a7a18c849..e22f2aa00 100755
--- a/examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp
+++ b/examples/android/android-simple/app/src/main/cpp/KomputeModelML.cpp
@@ -44,14 +44,14 @@ void KomputeModelML::train(std::vector yData, std::vector xIData,
{
std::shared_ptr sqTensor =
- mgr.createManagedSequence().lock();
+ mgr.createManagedSequence();
sqTensor->begin();
sqTensor->record(params);
sqTensor->end();
sqTensor->eval();
- std::shared_ptr sq = mgr.createManagedSequence().lock();
+ std::shared_ptr sq = mgr.createManagedSequence();
// Record op algo base
sq->begin();
@@ -60,11 +60,11 @@ void KomputeModelML::train(std::vector yData, std::vector xIData,
#ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
// Newer versions of Android are able to use shaderc to read raw string
- sq->record>(
+ sq->record(
params, std::vector(LR_SHADER.begin(), LR_SHADER.end()));
#else
// Older versions of Android require the SPIRV binary directly
- sq->record>(
+ sq->record(
params, std::vector(
kp::shader_data::shaders_glsl_logisticregression_comp_spv,
kp::shader_data::shaders_glsl_logisticregression_comp_spv
diff --git a/examples/array_multiplication/CMakeLists.txt b/examples/array_multiplication/CMakeLists.txt
index 5aeebb450..0b648382e 100644
--- a/examples/array_multiplication/CMakeLists.txt
+++ b/examples/array_multiplication/CMakeLists.txt
@@ -3,20 +3,42 @@ project(kompute_array_mult VERSION 0.1.0)
set(CMAKE_CXX_STANDARD 14)
+option(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE "Enable if you prefer to use your installed Kompute library" 0)
option(KOMPUTE_OPT_ENABLE_SPDLOG "Extra compile flags for Kompute, see docs for full list" 0)
set(KOMPUTE_EXTRA_CXX_FLAGS "" CACHE STRING "Extra compile flags for Kompute, see docs for full list")
+if(KOMPUTE_OPT_ENABLE_SPDLOG)
+ set(KOMPUTE_EXTRA_CXX_FLAGS "${KOMPUTE_EXTRA_CXX_FLAGS} -DKOMPUTE_ENABLE_SPDLOG=1")
+endif()
+
# It is necessary to pass the DEBUG or RELEASE flag accordingly to Kompute
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DRELEASE=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
-find_package(kompute REQUIRED)
+if(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE)
+ find_package(kompute REQUIRED)
+else()
+ add_subdirectory(../../ ${CMAKE_CURRENT_BINARY_DIR}/kompute_build)
+endif()
+
find_package(Vulkan REQUIRED)
+if(KOMPUTE_OPT_ENABLE_SPDLOG)
+ find_package(spdlog REQUIRED)
+endif()
+
add_executable(kompute_array_mult
src/Main.cpp)
target_link_libraries(kompute_array_mult
kompute::kompute
- Vulkan::Vulkan
-)
+ Vulkan::Vulkan)
+
+include_directories(
+ ../../single_include/)
+
+if(KOMPUTE_OPT_ENABLE_SPDLOG)
+ target_link_libraries(kompute_array_mult
+ spdlog::spdlog)
+endif()
+
diff --git a/examples/array_multiplication/README.md b/examples/array_multiplication/README.md
index 9838b7217..931c7d639 100644
--- a/examples/array_multiplication/README.md
+++ b/examples/array_multiplication/README.md
@@ -6,14 +6,32 @@ This example is structured such that you will be able to extend it for your proj
It contains a cmake build configuration that can be used in your production applications.
+## Building the example
+
+You will notice that it's a standalone project, so you can re-use it for your application.
+
+This project has the option to either import the Kompute dependency relative to the project or use your existing installation of Kompute.
+
+To build you just need to run the cmake command in this folder as follows:
+
+```
+cmake \
+ -Bbuild
+```
+
+You can pass the following optional parameters based on your desired configuration:
+* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`.
+* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter
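+* If you wish to load the shader from a raw glsl string instead of spirv bytes, the `KOMPUTE_ANDROID_SHADER_FROM_STRING` macro must reach the compiler, e.g. `-DKOMPUTE_EXTRA_CXX_FLAGS="-DKOMPUTE_ANDROID_SHADER_FROM_STRING=1"`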
+
## Pre-requisites
In order to run this example, you will need the following dependencies:
* REQUIRED
- + Vulkan Kompute library must be accessible
+ The Vulkan SDK must be installed
* OPTIONAL
+ + Vulkan Kompute installation - only needed if you pass `-DKOMPUTE_ARR_OPT_INSTALLED_KOMPUTE=1` (by default the example builds Kompute from the source directory)
+ SPDLOG - for logging
+ FMT - for text formatting
@@ -25,50 +43,5 @@ For the other libraries, because they are optional you can just make sure you bu
Alternatively you can use package managers such as vcpkg to help you install them, although to simplify things you can start without the dependencies first.
-## Set Up Vulkan Kompute Dependency
-You have multiple options to set up Vulkan Kompute. The easiest is to perform a local installation.
-
-For this, you will want to go to the main repo and run the following cmake command, which will configure it without SPDLOG by default.
-
-```
-cmake \
- -Bbuild
-```
-
-You can pass the following optional parameters based on your desired configuration:
-* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_ENABLE_SPDLOG=1`.
-* If you wish to perform the installation on the local folder instead of in your system you can use `-DCMAKE_INSTALL_PREFIX="build/src/CMakeFiles/Export/"` which will basically ensure that the final files are created in the local directory.
-* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter
-
-Then you can proceed to run the installation:
-
-* For Windows / Visual Studio you just have to build `INSTALL.vcxproj`
-* For Linux you can just run the `install` target via `make -C build install`
-
-You also have the option to build as `Release` or `Debug` - just make sure that you build your example with the same build/debug flags as required.
-
-## Building the example
-
-Now that you've set up the dependencies / installation of Vulkan Kompute you can build this example.
-
-You will notice that it's a standalone project, so you can re-use it for your application.
-
-To build you just need to run the cmake command in this folder as follows:
-
-```
-cmake \
- -Bbuild
-```
-
-Make sure to pass the required flags depending on the configuration above:
-* If you built with Debug make sure you build your example with Debug as well
-* If you installed in the local folder, make sure you pass the CMAKE_PREFIX_PATH pointing to the respective folder (e.g. `-DCMAKE_PREFIX_PATH=../../build/src/CMakeFiles/Export/lib/cmake/kompute/` if parent folder is main repo).
-* If you built Vulkan Kompute with spdlog enabled, make sure to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`
-* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter
-
-Now you just have to build your application as above:
-
-* For Windows / Visual Studio you just have to build and run `kompute_array_mult.vcxproj`
-* For Linux you can just run the `kompute_array_mult` target via `make -C build kompute_array_mult`
diff --git a/examples/array_multiplication/src/Main.cpp b/examples/array_multiplication/src/Main.cpp
index f3587cae8..14b58cba9 100755
--- a/examples/array_multiplication/src/Main.cpp
+++ b/examples/array_multiplication/src/Main.cpp
@@ -18,6 +18,7 @@ int main()
auto tensorInB = mgr.buildTensor({ 0.0, 1.0, 2.0 });
auto tensorOut = mgr.buildTensor({ 0.0, 0.0, 0.0 });
+#ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
std::string shader(R"(
// The version to use
#version 450
@@ -37,9 +38,17 @@ int main()
}
)");
- mgr.evalOpDefault>(
+ mgr.evalOpDefault(
{ tensorInA, tensorInB, tensorOut },
std::vector(shader.begin(), shader.end()));
+#else
+ mgr.evalOpDefault(
+ { tensorInA, tensorInB, tensorOut },
+ std::vector(
+ kp::shader_data::shaders_glsl_opmult_comp_spv,
+ kp::shader_data::shaders_glsl_opmult_comp_spv
+ + kp::shader_data::shaders_glsl_opmult_comp_spv_len));
+#endif
mgr.evalOpDefault({tensorOut});
diff --git a/examples/godot_examples/custom_module/kompute_summator/KomputeSummatorNode.h b/examples/godot_examples/custom_module/kompute_summator/KomputeSummatorNode.h
index 5bc201a90..1d94da9a5 100644
--- a/examples/godot_examples/custom_module/kompute_summator/KomputeSummatorNode.h
+++ b/examples/godot_examples/custom_module/kompute_summator/KomputeSummatorNode.h
@@ -24,7 +24,7 @@ protected:
private:
kp::Manager mManager;
- std::weak_ptr mSequence;
+ std::shared_ptr mSequence;
std::shared_ptr mPrimaryTensor;
std::shared_ptr mSecondaryTensor;
};
diff --git a/examples/godot_examples/gdnative_shared/src/KomputeSummator.cpp b/examples/godot_examples/gdnative_shared/src/KomputeSummator.cpp
index f64e0d088..788486e82 100644
--- a/examples/godot_examples/gdnative_shared/src/KomputeSummator.cpp
+++ b/examples/godot_examples/gdnative_shared/src/KomputeSummator.cpp
@@ -16,12 +16,7 @@ void KomputeSummator::add(float value) {
// Set the new data in the local device
this->mSecondaryTensor->setData({value});
// Execute recorded sequence
- if (std::shared_ptr sq = this->mSequence.lock()) {
- sq->eval();
- }
- else {
- throw std::runtime_error("Sequence pointer no longer available");
- }
+ this->mSequence->eval();
}
void KomputeSummator::reset() {
@@ -38,9 +33,7 @@ void KomputeSummator::_init() {
this->mSequence = this->mManager.getOrCreateManagedSequence("AdditionSeq");
// We now record the steps in the sequence
- if (std::shared_ptr sq = this->mSequence.lock())
{
-
std::string shader(R"(
#version 450
@@ -55,26 +48,23 @@ void KomputeSummator::_init() {
}
)");
- sq->begin();
+ this->mSequence->begin();
// First we ensure secondary tensor loads to GPU
// No need to sync the primary tensor as it should not be changed
- sq->record(
+ this->mSequence->record(
{ this->mSecondaryTensor });
// Then we run the operation with both tensors
- sq->record>(
+ this->mSequence->record(
{ this->mPrimaryTensor, this->mSecondaryTensor },
std::vector(shader.begin(), shader.end()));
// We map the result back to local
- sq->record(
+ this->mSequence->record(
{ this->mPrimaryTensor });
- sq->end();
- }
- else {
- throw std::runtime_error("Sequence pointer no longer available");
+ this->mSequence->end();
}
}
diff --git a/examples/godot_examples/gdnative_shared/src/KomputeSummator.hpp b/examples/godot_examples/gdnative_shared/src/KomputeSummator.hpp
index 9131e7f57..7f6b42e82 100644
--- a/examples/godot_examples/gdnative_shared/src/KomputeSummator.hpp
+++ b/examples/godot_examples/gdnative_shared/src/KomputeSummator.hpp
@@ -26,7 +26,7 @@ public:
private:
kp::Manager mManager;
- std::weak_ptr mSequence;
+ std::shared_ptr mSequence;
std::shared_ptr mPrimaryTensor;
std::shared_ptr mSecondaryTensor;
};
diff --git a/examples/godot_logistic_regression/custom_module/kompute_model_ml/KomputeModelMLNode.cpp b/examples/godot_logistic_regression/custom_module/kompute_model_ml/KomputeModelMLNode.cpp
index fe0a911a5..f583d910f 100644
--- a/examples/godot_logistic_regression/custom_module/kompute_model_ml/KomputeModelMLNode.cpp
+++ b/examples/godot_logistic_regression/custom_module/kompute_model_ml/KomputeModelMLNode.cpp
@@ -51,14 +51,14 @@ void KomputeModelMLNode::train(Array yArr, Array xIArr, Array xJArr) {
kp::Manager mgr;
std::shared_ptr sqTensor =
- mgr.createManagedSequence().lock();
+ mgr.createManagedSequence();
sqTensor->begin();
sqTensor->record(params);
sqTensor->end();
sqTensor->eval();
- std::shared_ptr sq = mgr.createManagedSequence().lock();
+ std::shared_ptr sq = mgr.createManagedSequence();
// Record op algo base
sq->begin();
@@ -67,11 +67,11 @@ void KomputeModelMLNode::train(Array yArr, Array xIArr, Array xJArr) {
#ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
// Newer versions of Android are able to use shaderc to read raw string
- sq->record>(
+ sq->record(
params, std::vector(LR_SHADER.begin(), LR_SHADER.end()));
#else
// Older versions of Android require the SPIRV binary directly
- sq->record>(
+ sq->record(
params, std::vector(
kp::shader_data::shaders_glsl_logisticregression_comp_spv,
kp::shader_data::shaders_glsl_logisticregression_comp_spv
diff --git a/examples/godot_logistic_regression/gdnative_shared/src/KomputeModelML.cpp b/examples/godot_logistic_regression/gdnative_shared/src/KomputeModelML.cpp
index 174398501..4135e83ed 100644
--- a/examples/godot_logistic_regression/gdnative_shared/src/KomputeModelML.cpp
+++ b/examples/godot_logistic_regression/gdnative_shared/src/KomputeModelML.cpp
@@ -56,14 +56,14 @@ void KomputeModelML::train(Array yArr, Array xIArr, Array xJArr) {
{
std::shared_ptr sqTensor =
- mgr.createManagedSequence().lock();
+ mgr.createManagedSequence();
sqTensor->begin();
sqTensor->record(params);
sqTensor->end();
sqTensor->eval();
- std::shared_ptr sq = mgr.createManagedSequence().lock();
+ std::shared_ptr sq = mgr.createManagedSequence();
// Record op algo base
sq->begin();
@@ -72,11 +72,11 @@ void KomputeModelML::train(Array yArr, Array xIArr, Array xJArr) {
#ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
// Newer versions of Android are able to use shaderc to read raw string
- sq->record>(
+ sq->record(
params, std::vector(LR_SHADER.begin(), LR_SHADER.end()));
#else
// Older versions of Android require the SPIRV binary directly
- sq->record>(
+ sq->record(
params, std::vector(
kp::shader_data::shaders_glsl_logisticregression_comp_spv,
kp::shader_data::shaders_glsl_logisticregression_comp_spv
diff --git a/examples/logistic_regression/CMakeLists.txt b/examples/logistic_regression/CMakeLists.txt
index b12e8227f..f918bbf21 100644
--- a/examples/logistic_regression/CMakeLists.txt
+++ b/examples/logistic_regression/CMakeLists.txt
@@ -3,6 +3,7 @@ project(kompute_linear_reg VERSION 0.1.0)
set(CMAKE_CXX_STANDARD 14)
+option(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE "Enable if you prefer to use your installed Kompute library" 0)
option(KOMPUTE_OPT_ENABLE_SPDLOG "Extra compile flags for Kompute, see docs for full list" 0)
set(KOMPUTE_EXTRA_CXX_FLAGS "" CACHE STRING "Extra compile flags for Kompute, see docs for full list")
@@ -14,12 +15,16 @@ endif()
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DRELEASE=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
-find_package(kompute REQUIRED)
+if(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE)
+ find_package(kompute REQUIRED)
+else()
+ add_subdirectory(../../ ${CMAKE_CURRENT_BINARY_DIR}/kompute_build)
+endif()
+
find_package(Vulkan REQUIRED)
if(KOMPUTE_OPT_ENABLE_SPDLOG)
find_package(spdlog REQUIRED)
- find_package(fmt REQUIRED)
endif()
add_executable(kompute_linear_reg
@@ -30,11 +35,11 @@ target_link_libraries(kompute_linear_reg
Vulkan::Vulkan
)
+include_directories(
+ ../../single_include/)
+
if(KOMPUTE_OPT_ENABLE_SPDLOG)
- target_link_libraries(kompute_linear_reg
- kompute::kompute
- fmt::fmt
- spdlog::spdlog
- )
+ target_link_libraries(kompute_linear_reg
+ spdlog::spdlog)
endif()
diff --git a/examples/logistic_regression/README.md b/examples/logistic_regression/README.md
index 29aa89c01..0de7ee30a 100644
--- a/examples/logistic_regression/README.md
+++ b/examples/logistic_regression/README.md
@@ -6,54 +6,12 @@ This example is structured such that you will be able to extend it for your proj
It contains a cmake build configuration that can be used in your production applications.
-## Pre-requisites
-
-In order to run this example, you will need the following dependencies:
-
-* REQUIRED
- + Vulkan Kompute library must be accessible
- + The Vulkan SDK must be installed
-* OPTIONAL
- + SPDLOG - for logging
- + FMT - for text formatting
-
-We will cover how you can install Vulkan Kompute in the next section.
-
-For the Vulkan SDK, the simplest way to install it is through [their website](https://vulkan.lunarg.com/sdk/home). You just have to follow the instructions for the relevant platform.
-
-For the other libraries, because they are optional you can just make sure you build and install Kompute with these disabled (this will be covered in more detail below).
-
-Alternatively you can use package managers such as vcpkg to help you install them, although to simplify things you can start without the dependencies first.
-
-## Set Up Vulkan Kompute Dependency
-
-You have multiple options to set up Vulkan Kompute. The easiest is to perform a local installation.
-
-For this, you will want to go to the main repo and run the following cmake command, which will configure it without SPDLOG by default.
-
-```
-cmake \
- -Bbuild
-```
-
-You can pass the following optional parameters based on your desired configuration:
-* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_ENABLE_SPDLOG=1`.
-* If you wish to perform the installation on the local folder instead of in your system you can use `-DCMAKE_INSTALL_PREFIX="build/src/CMakeFiles/Export/"` which will basically ensure that the final files are created in the local directory.
-* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter
-
-Then you can proceed to run the installation:
-
-* For Windows / Visual Studio you just have to build `INSTALL.vcxproj`
-* For Linux you can just run the `install` target via `make -C build install`
-
-You also have the option to build as `Release` or `Debug` - just make sure that you build your example with the same build/debug flags as required.
-
## Building the example
-Now that you've set up the dependencies / installation of Vulkan Kompute you can build this example.
-
You will notice that it's a standalone project, so you can re-use it for your application.
+This project has the option to either import the Kompute dependency relative to the project or use your existing installation of Kompute.
+
To build you just need to run the cmake command in this folder as follows:
```
@@ -61,14 +19,19 @@ cmake \
-Bbuild
```
-Make sure to pass the required flags depending on the configuration above:
-* If you built with Debug make sure you build your example with Debug as well
-* If you installed in the local folder, make sure you pass the CMAKE_PREFIX_PATH pointing to the respective folder (e.g. `-DCMAKE_PREFIX_PATH=../../build/src/CMakeFiles/Export/lib/cmake/kompute/` if parent folder is main repo).
-* If you built Vulkan Kompute with spdlog enabled, make sure to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`
+You can pass the following optional parameters based on your desired configuration:
+* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`.
* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter
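+* If you wish to load the shader from a raw glsl string instead of spirv bytes, the `KOMPUTE_ANDROID_SHADER_FROM_STRING` macro must reach the compiler, e.g. `-DKOMPUTE_EXTRA_CXX_FLAGS="-DKOMPUTE_ANDROID_SHADER_FROM_STRING=1"`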
-Now you just have to build your application as above:
+## Pre-requisites
-* For Windows / Visual Studio you just have to build and run `kompute_linear_reg.vcxproj`
-* For Linux you can just run the `kompute_linear_reg` target via `make -C build kompute_linear_reg`
+In order to run this example, you will need the following dependencies:
+
+* REQUIRED
+ + The Vulkan SDK must be installed
+* OPTIONAL
+ + Vulkan Kompute installation - only needed if you pass `-DKOMPUTE_ARR_OPT_INSTALLED_KOMPUTE=1` (by default the example builds Kompute from the source directory)
+ + SPDLOG - for logging
+ + FMT - for text formatting
diff --git a/examples/logistic_regression/src/Main.cpp b/examples/logistic_regression/src/Main.cpp
index 853fa9d67..d3b8b3557 100755
--- a/examples/logistic_regression/src/Main.cpp
+++ b/examples/logistic_regression/src/Main.cpp
@@ -36,22 +36,30 @@ int main()
kp::Manager mgr;
std::shared_ptr sqTensor =
- mgr.createManagedSequence().lock();
+ mgr.createManagedSequence();
sqTensor->begin();
sqTensor->record(params);
sqTensor->end();
sqTensor->eval();
- std::shared_ptr sq = mgr.createManagedSequence().lock();
+ std::shared_ptr sq = mgr.createManagedSequence();
// Record op algo base
sq->begin();
sq->record({ wIn, bIn });
- sq->record>(
+#ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
+ sq->record(
params, "shaders/glsl/logistic_regression.comp");
+#else
+ sq->record(
+ params, std::vector(
+ kp::shader_data::shaders_glsl_logisticregression_comp_spv,
+ kp::shader_data::shaders_glsl_logisticregression_comp_spv
+ + kp::shader_data::shaders_glsl_logisticregression_comp_spv_len));
+#endif
sq->record({ wOutI, wOutJ, bOut, lOut });
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
new file mode 100644
index 000000000..5f3036986
--- /dev/null
+++ b/python/CMakeLists.txt
@@ -0,0 +1,11 @@
+
+add_subdirectory(pybind11)
+pybind11_add_module(kp src/main.cpp)
+
+include_directories(
+ ${PROJECT_SOURCE_DIR}/single_include/)
+
+target_link_libraries(
+ kp PRIVATE
+ kompute::kompute)
+
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 000000000..7b0d89f0e
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,2 @@
+# Python Bindings for Vulkan Kompute
+
diff --git a/python/pybind11 b/python/pybind11
new file mode 160000
index 000000000..06a54018c
--- /dev/null
+++ b/python/pybind11
@@ -0,0 +1 @@
+Subproject commit 06a54018c8a9fd9a7be5f5b56414b5da9259f637
diff --git a/python/src/main.cpp b/python/src/main.cpp
new file mode 100644
index 000000000..0f10ea349
--- /dev/null
+++ b/python/src/main.cpp
@@ -0,0 +1,160 @@
+#include
+#include
+
+#include
+
+namespace py = pybind11;
+
+PYBIND11_MODULE(kp, m) {
+ // Tensor memory-type enum; export_values() also exposes the entries in the parent scope.
+ py::enum_(m, "TensorTypes", "Enum with GPU memory types for Tensor.")
+ .value("device", kp::Tensor::TensorTypes::eDevice, "Tensor holding data in GPU memory.")
+ .value("staging", kp::Tensor::TensorTypes::eStaging, "Tensor used for transfer of data to device.")
+ .value("storage", kp::Tensor::TensorTypes::eStorage, "Tensor with host visible gpu memory.")
+ .export_values();
+ // Tensor bindings: two initialisers plus accessors for the local data and the GPU state.
+ py::class_>(m, "Tensor", "Structured data used in GPU operations.")
+ .def(py::init(
+ [](const std::vector& data) {
+ return std::unique_ptr(new kp::Tensor(data));
+ }), "Initialiser with only list of data components.")
+ .def(py::init(
+ [](const std::vector& data, kp::Tensor::TensorTypes tensorTypes) {
+ return std::unique_ptr(new kp::Tensor(data, tensorTypes));
+ }), "Initialiser with list of data components and tensor GPU memory type.")
+ .def("data", &kp::Tensor::data, "Retrieves the data as a list containing the local Tensor memory data.")
+ .def("size", &kp::Tensor::size, "Retrieves the size of the Tensor data as per the local Tensor memory.")
+ .def("tensor_type", &kp::Tensor::tensorType, "Retrieves the memory type of the tensor.")
+ .def("is_init", &kp::Tensor::isInit, "Checks whether the tensor GPU memory has been initialised.")
+ .def("set_data", &kp::Tensor::setData, "Overrides the data in the local Tensor memory.")
+ .def("map_data_from_host", &kp::Tensor::mapDataFromHostMemory, "Maps data into GPU memory from tensor local data.")
+ .def("map_data_into_host", &kp::Tensor::mapDataIntoHostMemory, "Maps data from GPU memory into tensor local data.");
+ // Sequence bindings: init, recording lifecycle, evaluation, status and record_* methods.
+ py::class_>(m, "Sequence")
+ .def("init", &kp::Sequence::init, "Initialises Vulkan resources within sequence using provided device.")
+ // recording lifecycle: begin/end delimit the batch of recorded commands
+ .def("begin", &kp::Sequence::begin, "Clears previous commands and starts recording commands in sequence which can be run in batch.")
+ .def("end", &kp::Sequence::end, "Stops listening and recording for new commands.")
+ // eval: run the recorded commands synchronously or asynchronously
+ .def("eval", &kp::Sequence::eval, "Executes the currently recorded commands synchronously by waiting on Vulkan Fence.")
+ .def("eval_async", &kp::Sequence::evalAsync, "Executes the currently recorded commands asynchronously.")
+ .def("eval_await", &kp::Sequence::evalAwait, "Waits until the execution finishes using Vulkan Fence.")
+ // status
+ .def("is_running", &kp::Sequence::isRunning, "Checks whether the Sequence operations are currently still executing.")
+ .def("is_rec", &kp::Sequence::isRecording, "Checks whether the Sequence is currently in recording mode.")
+ .def("is_init", &kp::Sequence::isInit, "Checks if the Sequence has been initialized")
+ // record: one Python method per operation type
+ .def("record_tensor_create", &kp::Sequence::record,
+ "Records operation to create and initialise tensor GPU memory and buffer")
+ .def("record_tensor_copy", &kp::Sequence::record,
+ "Records operation to copy one tensor to one or many tensors")
+ .def("record_tensor_sync_device", &kp::Sequence::record,
+ "Records operation to sync tensor from local memory to GPU memory")
+ .def("record_tensor_sync_local", &kp::Sequence::record,
+ "Records operation to sync tensor(s) from GPU memory to local memory using staging tensors")
+ .def("record_algo_mult", &kp::Sequence::record,
+ "Records operation to run multiplication compute shader to two input tensors and an output tensor")
+ .def("record_algo_file", &kp::Sequence::record,
+ "Records an operation using a custom shader provided from a shader path")
+ .def("record_algo_data", &kp::Sequence::record>,
+ "Records an operation using a custom shader provided as raw string or spirv bytes")
+ .def("record_algo_lro", &kp::Sequence::record,
+ "Records operation to run left right out operation with custom shader");
+ // Manager bindings: constructors, sequence/tensor factories and the eval_* helpers.
+ py::class_(m, "Manager")
+ .def(py::init(), "Default initializer uses device 0 and first compute compatible GPU queueFamily")
+ .def(py::init(
+ [](uint32_t physicalDeviceIndex) {
+ return std::unique_ptr(new kp::Manager(physicalDeviceIndex));
+ }), "Manager initialiser can provide specified device index but will use first compute compatible GPU queueFamily")
+ .def(py::init(
+ [](uint32_t physicalDeviceIndex, const std::vector& familyQueueIndices) {
+ return std::unique_ptr(new kp::Manager(physicalDeviceIndex, familyQueueIndices));
+ }), "Manager initialiser can provide specified device and array of GPU queueFamilies to load.")
+ .def("get_create_sequence", &kp::Manager::getOrCreateManagedSequence, "Get a Sequence or create a new one with given name")
+ .def("create_sequence", &kp::Manager::createManagedSequence,
+ py::arg("name"), py::arg("queueIndex") = 0, "Create a sequence with specific name and specified index of available queues")
+ .def("build_tensor", &kp::Manager::buildTensor,
+ py::arg("data"), py::arg("tensorType") = kp::Tensor::TensorTypes::eDevice,
+ "Build and initialise tensor")
+ // await: wait for asynchronous evaluations (waitFor defaults to UINT64_MAX)
+ .def("eval_await", &kp::Manager::evalOpAwait,
+ py::arg("sequenceName"), py::arg("waitFor") = UINT64_MAX,
+ "Awaits for asynchronous operation on a named Sequence")
+ .def("eval_await_def", &kp::Manager::evalOpAwaitDefault,
+ py::arg("waitFor") = UINT64_MAX, "Awaits for asynchronous operation on the last anonymous Sequence created")
+ // eval default: evaluate on a new anonymous Sequence
+ .def("eval_tensor_create_def", &kp::Manager::evalOpDefault,
+ "Evaluates operation to create and initialise tensor GPU memory and buffer with new anonymous Sequence")
+ .def("eval_tensor_copy_def", &kp::Manager::evalOpDefault,
+ "Evaluates operation to copy one tensor to one or many tensors with new anonymous Sequence")
+ .def("eval_tensor_sync_device_def", &kp::Manager::evalOpDefault,
+ "Evaluates operation to sync tensor from local memory to GPU memory with new anonymous Sequence")
+ .def("eval_tensor_sync_local_def", &kp::Manager::evalOpDefault,
+ "Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with new anonymous Sequence")
+ .def("eval_algo_mult_def", &kp::Manager::evalOpDefault,
+ "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with new anonymous Sequence")
+ .def("eval_algo_file_def", &kp::Manager::evalOpDefault,
+ "Evaluates an operation using a custom shader provided from a shader path with new anonymous Sequence")
+ .def("eval_algo_data_def", &kp::Manager::evalOpDefault>,
+ "Evaluates an operation using a custom shader provided as raw string or spirv bytes with new anonymous Sequence")
+ .def("eval_algo_lro_def", &kp::Manager::evalOpDefault,
+ "Evaluates operation to run left right out operation with custom shader with new anonymous Sequence")
+ // eval: evaluate on an explicitly named Sequence
+ .def("eval_tensor_create", &kp::Manager::evalOp,
+ "Evaluates operation to create and initialise tensor GPU memory and buffer with explicitly named Sequence")
+ .def("eval_tensor_copy", &kp::Manager::evalOp,
+ "Evaluates operation to copy one tensor to one or many tensors with explicitly named Sequence")
+ .def("eval_tensor_sync_device", &kp::Manager::evalOp,
+ "Evaluates operation to sync tensor from local memory to GPU memory with explicitly named Sequence")
+ .def("eval_tensor_sync_local", &kp::Manager::evalOp,
+ "Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence")
+ .def("eval_algo_mult", &kp::Manager::evalOp,
+ "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
+ .def("eval_algo_file", &kp::Manager::evalOp,
+ "Evaluates an operation using a custom shader provided from a shader path with explicitly named Sequence")
+ .def("eval_algo_data", &kp::Manager::evalOp>,
+ "Evaluates an operation using a custom shader provided as raw string or spirv bytes with explicitly named Sequence")
+ .def("eval_algo_lro", &kp::Manager::evalOp,
+ "Evaluates operation to run left right out operation with custom shader with explicitly named Sequence")
+ // eval async default: evaluate asynchronously on an anonymous Sequence
+ .def("eval_async_tensor_create_def", &kp::Manager::evalOpAsyncDefault,
+ "Evaluates asynchronously operation to create and initialise tensor GPU memory and buffer with anonymous Sequence")
+ .def("eval_async_tensor_copy_def", &kp::Manager::evalOpAsyncDefault,
+ "Evaluates asynchronously operation to copy one tensor to one or many tensors with anonymous Sequence")
+ .def("eval_async_tensor_sync_device_def", &kp::Manager::evalOpAsyncDefault,
+ "Evaluates asynchronously operation to sync tensor from local memory to GPU memory with anonymous Sequence")
+ .def("eval_async_tensor_sync_local_def", &kp::Manager::evalOpAsyncDefault,
+ "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with anonymous Sequence")
+ .def("eval_async_algo_mult_def", &kp::Manager::evalOpAsyncDefault,
+ "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with anonymous Sequence")
+ .def("eval_async_algo_file_def", &kp::Manager::evalOpAsyncDefault,
+ "Evaluates asynchronously an operation using a custom shader provided from a shader path with anonymous Sequence")
+ .def("eval_async_algo_data_def", &kp::Manager::evalOpAsyncDefault>,
+ "Evaluates asynchronously an operation using a custom shader provided as raw string or spirv bytes with anonymous Sequence")
+ .def("eval_async_algo_lro_def", &kp::Manager::evalOpAsyncDefault,
+ "Evaluates asynchronously operation to run left right out operation with custom shader with anonymous Sequence")
+ // eval async: evaluate asynchronously on an explicitly named Sequence
+ .def("eval_async_tensor_create", &kp::Manager::evalOpAsync,
+ "Evaluates asynchronously operation to create and initialise tensor GPU memory and buffer with explicitly named Sequence")
+ .def("eval_async_tensor_copy", &kp::Manager::evalOpAsync,
+ "Evaluates asynchronously operation to copy one tensor to one or many tensors with explicitly named Sequence")
+ .def("eval_async_tensor_sync_device", &kp::Manager::evalOpAsync,
+ "Evaluates asynchronously operation to sync tensor from local memory to GPU memory with explicitly named Sequence")
+ .def("eval_async_tensor_sync_local", &kp::Manager::evalOpAsync,
+ "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence")
+ .def("eval_async_algo_mult", &kp::Manager::evalOpAsync,
+ "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
+ .def("eval_async_algo_file", &kp::Manager::evalOpAsync,
+ "Evaluates asynchronously an operation using a custom shader provided from a shader path with explicitly named Sequence")
+ .def("eval_async_algo_data", &kp::Manager::evalOpAsync>,
+ "Evaluates asynchronously an operation using a custom shader provided as raw string or spirv bytes with explicitly named Sequence")
+ .def("eval_async_algo_lro", &kp::Manager::evalOpAsync,
+ "Evaluates asynchronously operation to run left right out operation with custom shader with explicitly named Sequence");
+ // Module version: VERSION_INFO when it is defined at compile time, otherwise "dev".
+#ifdef VERSION_INFO
+ m.attr("__version__") = VERSION_INFO;
+#else
+ m.attr("__version__") = "dev";
+#endif
+}
diff --git a/python/test/test_kompute.py b/python/test/test_kompute.py
new file mode 100644
index 000000000..43baf77d1
--- /dev/null
+++ b/python/test/test_kompute.py
@@ -0,0 +1,110 @@
+
+from kp import Tensor, Manager, Sequence
+
+def test_opmult():
+ """
+ Multiply two tensors with the built-in mult algorithm and check the synced result.
+ """
+
+ tensor_in_a = Tensor([2, 2, 2])
+ tensor_in_b = Tensor([1, 2, 3])
+ tensor_out = Tensor([0, 0, 0])
+
+ mgr = Manager()
+ # Create and initialise the GPU memory of all three tensors first.
+ mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+ mgr.eval_algo_mult_def([tensor_in_a, tensor_in_b, tensor_out])
+ # Sync the output from GPU memory back into the tensor's local data.
+ mgr.eval_tensor_sync_local_def([tensor_out])
+
+ assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+def test_opalgobase_data():
+ """
+ Run a multiplication shader passed as raw GLSL source and check the synced result.
+ """
+
+ tensor_in_a = Tensor([2, 2, 2])
+ tensor_in_b = Tensor([1, 2, 3])
+ tensor_out = Tensor([0, 0, 0])
+
+ mgr = Manager()
+
+ shaderData = """
+ #version 450
+
+ layout (local_size_x = 1) in;
+
+ // The input tensors bind index is relative to index in parameter passed
+ layout(set = 0, binding = 0) buffer bina { float tina[]; };
+ layout(set = 0, binding = 1) buffer binb { float tinb[]; };
+ layout(set = 0, binding = 2) buffer bout { float tout[]; };
+
+ void main() {
+ uint index = gl_GlobalInvocationID.x;
+ tout[index] = tina[index] * tinb[index];
+ }
+ """
+
+ mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+ # list() splits the shader source into a list of characters before it is passed on.
+ mgr.eval_algo_data_def([tensor_in_a, tensor_in_b, tensor_out], list(shaderData))
+
+ mgr.eval_tensor_sync_local_def([tensor_out])
+
+ assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+
+def test_opalgobase_file():
+ """
+ Run the opmult shader loaded from a file path and check the synced result.
+ """
+
+ tensor_in_a = Tensor([2, 2, 2])
+ tensor_in_b = Tensor([1, 2, 3])
+ tensor_out = Tensor([0, 0, 0])
+
+ mgr = Manager()
+ # NOTE(review): relative path, presumably resolved against the working directory; confirm.
+ shaderFilePath = "../../shaders/glsl/opmult.comp"
+
+ mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+ mgr.eval_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+
+ mgr.eval_tensor_sync_local_def([tensor_out])
+
+ assert tensor_out.data() == [2.0, 4.0, 6.0]
+
+def test_sequence():
+ """
+ Evaluate a shader asynchronously, then sync tensors back through a named Sequence.
+ """
+ # NOTE(review): queue family index 2 is presumably hardware-specific; confirm it is portable.
+ mgr = Manager(0, [2])
+
+ tensor_in_a = Tensor([2, 2, 2])
+ tensor_in_b = Tensor([1, 2, 3])
+ tensor_out = Tensor([0, 0, 0])
+
+ mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
+
+ seq = mgr.create_sequence("op")
+
+ shaderFilePath = "../../shaders/glsl/opmult.comp"
+ mgr.eval_async_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
+ mgr.eval_await_def()
+ # Record the three sync-to-local operations as one batch on the named sequence.
+ seq.begin()
+ seq.record_tensor_sync_local([tensor_in_a])
+ seq.record_tensor_sync_local([tensor_in_b])
+ seq.record_tensor_sync_local([tensor_out])
+ seq.end()
+
+ seq.eval()
+
+ assert tensor_out.data() == [2.0, 4.0, 6.0]
+# Running this file directly executes only test_sequence.
+if __name__ == "__main__":
+ test_sequence()
diff --git a/setup.py b/setup.py
new file mode 100644
index 000000000..0b5db2f9c
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,76 @@
+import os
+import re
+import sys
+import platform
+import subprocess
+
+from setuptools import setup, Extension
+from setuptools.command.build_ext import build_ext
+from distutils.version import LooseVersion
+
+# Extension with no sources of its own; it records the source directory handed to CMake.
+class CMakeExtension(Extension):
+ def __init__(self, name, sourcedir=''):
+ Extension.__init__(self, name, sources=[])
+ self.sourcedir = os.path.abspath(sourcedir)
+
+
+class CMakeBuild(build_ext):
+ def run(self):
+ try:
+ out = subprocess.check_output(['cmake', '--version'])
+ except OSError:
+ raise RuntimeError("CMake must be installed to build the following extensions: " +
+ ", ".join(e.name for e in self.extensions))
+
+ if platform.system() == "Windows":
+ cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1))
+ if cmake_version < '3.1.0':
+ raise RuntimeError("CMake >= 3.1.0 is required on Windows")
+
+ for ext in self.extensions:
+ self.build_extension(ext)
+
+ def build_extension(self, ext):
+ extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
+ # required for auto-detection of auxiliary "native" libs
+ if not extdir.endswith(os.path.sep):
+ extdir += os.path.sep
+
+ cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
+ '-DKOMPUTE_OPT_BUILD_PYTHON=1',
+ '-DKOMPUTE_OPT_BUILD_SINGLE_HEADER=1',
+ '-DPYTHON_EXECUTABLE=' + sys.executable]
+
+ cfg = 'Debug' if self.debug else 'Release'
+ build_args = ['--config', cfg]
+
+ if platform.system() == "Windows":
+ cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir)]
+ if sys.maxsize > 2**32:
+ cmake_args += ['-A', 'x64']
+ build_args += ['--', '/m']
+ else:
+ cmake_args += ['-DKOMPUTE_EXTRA_CXX_FLAGS="-fPIC"']
+ cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
+ build_args += ['--', '-j2']
+
+ env = os.environ.copy()
+ env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
+ self.distribution.get_version())
+ if not os.path.exists(self.build_temp):
+ os.makedirs(self.build_temp)
+
+ subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
+ subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
+# NOTE(review): version 0.0.1 differs from the CMake project version (0.4.1); confirm intent.
+setup(
+ name='kp',
+ version='0.0.1',
+ author='Alejandro Saucedo',
+ description='Vulkan Kompute: Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU processing usecases.',
+ long_description='',
+ ext_modules=[CMakeExtension('kp')],
+ cmdclass=dict(build_ext=CMakeBuild),
+ zip_safe=False,
+)
diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index 8def06e4a..3ae98b483 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -1100,6 +1100,12 @@ class Sequence
*/
bool isInit();
+ /**
+ * Destroys and frees the GPU resources which include the buffer and memory
+ * and sets the sequence as init=False.
+ */
+ void freeMemoryDestroyGPUResources();
+
/**
* Record function for operation to be added to the GPU queue in batch. This
* template requires classes to be derived from the OpBase class. This
@@ -1301,9 +1307,9 @@ class Manager
*
* @param sequenceName The name for the named sequence to be retrieved or
* created
- * @return Weak pointer to the manager owned sequence resource
+ * @return Shared pointer to the manager owned sequence resource
*/
- std::weak_ptr getOrCreateManagedSequence(
+ std::shared_ptr getOrCreateManagedSequence(
std::string sequenceName);
/**
@@ -1315,8 +1321,9 @@ class Manager
* @param queueIndex The queue to use from the available queues
* @return Weak pointer to the manager owned sequence resource
*/
- std::weak_ptr createManagedSequence(std::string sequenceName = "",
- uint32_t queueIndex = 0);
+ std::shared_ptr createManagedSequence(
+ std::string sequenceName = "",
+ uint32_t queueIndex = 0);
/**
* Function that evaluates operation against named sequence.
@@ -1332,22 +1339,21 @@ class Manager
TArgs&&... params)
{
SPDLOG_DEBUG("Kompute Manager evalOp triggered");
- std::weak_ptr sqWeakPtr =
+ std::shared_ptr sq =
this->getOrCreateManagedSequence(sequenceName);
- if (std::shared_ptr sq = sqWeakPtr.lock()) {
- SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
- sq->begin();
+ SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
+ sq->begin();
- SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
- sq->record(tensors, std::forward(params)...);
+ SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
+ sq->record(tensors, std::forward(params)...);
- SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
- sq->end();
+ SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
+ sq->end();
+
+ SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
+ sq->eval();
- SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
- sq->eval();
- }
SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS");
}
@@ -1385,26 +1391,21 @@ class Manager
{
SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered");
- std::weak_ptr sqWeakPtr =
+ std::shared_ptr sq =
this->getOrCreateManagedSequence(sequenceName);
- if (std::shared_ptr sq = sqWeakPtr.lock()) {
+ SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
+ sq->begin();
- SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
- sq->begin();
+ SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
+ sq->record(tensors, std::forward(params)...);
- SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
- sq->record(tensors, std::forward(params)...);
+ SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
+ sq->end();
- SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
- sq->end();
+ SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
+ sq->evalAsync();
- SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
- sq->evalAsync();
- } else {
- SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found",
- sequenceName);
- }
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
}
@@ -1620,20 +1621,17 @@ namespace kp {
* Operation that provides a general abstraction that simplifies the use of
* algorithm and parameter components which can be used with shaders.
* By default it enables the user to provide a dynamic number of tensors
- * which are then passed as inputs.
- *
- * All of these tensors are expected to be initlaised and this is checked with throw std exception in the init function.
- *
- * See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
- *
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * which are then passed as inputs.
*/
-template
class OpAlgoBase : public OpBase
{
public:
+ struct KomputeWorkgroup {
+     uint32_t x = 0; ///< x component of the processing layout; 0 unless set explicitly
+     uint32_t y = 0; ///< y component of the processing layout; 0 unless set explicitly
+     uint32_t z = 0; ///< z component of the processing layout; 0 unless set explicitly
+ };
+
/**
* Base constructor, should not be used unless explicitly intended.
*/
@@ -1649,11 +1647,13 @@ class OpAlgoBase : public OpBase
* @param commandBuffer Vulkan Command Buffer to record commands into
* @param tensors Tensors that are to be used in this operation
* @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+ * @param komputeWorkgroup Optional parameter to specify the layout for processing
*/
OpAlgoBase(std::shared_ptr physicalDevice,
std::shared_ptr device,
std::shared_ptr commandBuffer,
- std::vector>& tensors);
+ std::vector>& tensors,
+ KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
/**
* Constructor that enables a file to be passed to the operation with
@@ -1664,13 +1664,15 @@ class OpAlgoBase : public OpBase
* @param device Vulkan logical device for passing to Algorithm
* @param commandBuffer Vulkan Command Buffer to record commands into
* @param tensors Tensors that are to be used in this operation
- * @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
+ * @param shaderFilePath Parameter to specify the shader to load (either in spirv or raw format)
+ * @param komputeWorkgroup Optional parameter to specify the layout for processing
*/
OpAlgoBase(std::shared_ptr physicalDevice,
std::shared_ptr device,
std::shared_ptr commandBuffer,
std::vector>& tensors,
- std::string shaderFilePath);
+ std::string shaderFilePath,
+ KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
/**
* Constructor that enables raw shader data to be passed to the main operation
@@ -1681,12 +1683,14 @@ class OpAlgoBase : public OpBase
* @param commandBuffer Vulkan Command Buffer to record commands into
* @param tensors Tensors that are to be used in this operation
* @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
+ * @param komputeWorkgroup Optional parameter to specify the layout for processing
*/
OpAlgoBase(std::shared_ptr physicalDevice,
std::shared_ptr device,
std::shared_ptr commandBuffer,
std::vector>& tensors,
- const std::vector& shaderDataRaw);
+ const std::vector& shaderDataRaw,
+ KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
/**
* Default destructor, which is in charge of destroying the algorithm
@@ -1733,9 +1737,7 @@ class OpAlgoBase : public OpBase
// -------------- ALWAYS OWNED RESOURCES
- uint32_t mX;
- uint32_t mY;
- uint32_t mZ;
+ KomputeWorkgroup mKomputeWorkgroup;
std::string mShaderFilePath; ///< Optional member variable which can be provided for the OpAlgoBase to find the data automatically and load for processing
std::vector mShaderDataRaw; ///< Optional member variable which can be provided to contain either the raw shader content or the spirv binary content
@@ -1745,177 +1747,6 @@ class OpAlgoBase : public OpBase
} // End namespace kp
-// Including implementation for template class
-#ifndef OPALGOBASE_IMPL
-#define OPALGOBASE_IMPL
-
-namespace kp {
-
-template
-OpAlgoBase::OpAlgoBase()
-{
- SPDLOG_DEBUG("Kompute OpAlgoBase constructor base");
-}
-
-template
-OpAlgoBase::OpAlgoBase(std::shared_ptr physicalDevice,
- std::shared_ptr device,
- std::shared_ptr commandBuffer,
- std::vector>& tensors)
- : OpBase(physicalDevice, device, commandBuffer, tensors, false)
-{
- SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
-
- // The dispatch size is set up based on either explicitly provided template
- // parameters or by default it would take the shape and size of the tensors
- if (tX > 0) {
- // If at least the x value is provided we use mainly the parameters
- // provided
- this->mX = tX;
- this->mY = tY > 0 ? tY : 1;
- this->mZ = tZ > 0 ? tZ : 1;
- } else {
- this->mX = tensors[0]->size();
- this->mY = 1;
- this->mZ = 1;
- }
- SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
- this->mX,
- this->mY,
- this->mZ);
-
- this->mAlgorithm = std::make_shared(device, commandBuffer);
-}
-
-template
-OpAlgoBase::OpAlgoBase(std::shared_ptr physicalDevice,
- std::shared_ptr device,
- std::shared_ptr commandBuffer,
- std::vector>& tensors,
- std::string shaderFilePath)
- : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
- SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
-
- this->mShaderFilePath = shaderFilePath;
-}
-
-template
-OpAlgoBase::OpAlgoBase(std::shared_ptr physicalDevice,
- std::shared_ptr device,
- std::shared_ptr commandBuffer,
- std::vector>& tensors,
- const std::vector& shaderDataRaw)
- : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
- SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
-
- this->mShaderDataRaw = shaderDataRaw;
-}
-
-template
-OpAlgoBase::~OpAlgoBase()
-{
- SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
-}
-
-template
-void
-OpAlgoBase::init()
-{
- SPDLOG_DEBUG("Kompute OpAlgoBase init called");
-
- if (this->mTensors.size() < 1) {
- throw std::runtime_error(
- "Kompute OpAlgoBase called with less than 1 tensor");
- }
-
- for (std::shared_ptr tensor : this->mTensors) {
- if(!tensor->isInit()) {
- throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised.");
- }
- }
-
- SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
-
- std::vector shaderFileData = this->fetchSpirvBinaryData();
-
- SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
-
- this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template
-void
-OpAlgoBase::record()
-{
- SPDLOG_DEBUG("Kompute OpAlgoBase record called");
-
- // Barrier to ensure the data is finished writing to buffer memory
- for (std::shared_ptr tensor : this->mTensors) {
- tensor->recordBufferMemoryBarrier(
- this->mCommandBuffer,
- vk::AccessFlagBits::eHostWrite,
- vk::AccessFlagBits::eShaderRead,
- vk::PipelineStageFlagBits::eHost,
- vk::PipelineStageFlagBits::eComputeShader);
- }
-
- this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-}
-
-template
-void
-OpAlgoBase::preEval()
-{
- SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
-}
-
-template
-void
-OpAlgoBase::postEval()
-{
- SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
-}
-
-template
-std::vector OpAlgoBase::fetchSpirvBinaryData()
-{
- SPDLOG_WARN(
- "Kompute OpAlgoBase Running shaders directly from spirv file");
-
- if (this->mShaderFilePath.size()) {
- std::ifstream fileStream(this->mShaderFilePath,
- std::ios::binary | std::ios::in | std::ios::ate);
-
- if (!fileStream.good()) {
- throw std::runtime_error("Error reading file: " + this->mShaderFilePath);
- }
-
- size_t shaderFileSize = fileStream.tellg();
- fileStream.seekg(0, std::ios::beg);
- char* shaderDataRaw = new char[shaderFileSize];
- fileStream.read(shaderDataRaw, shaderFileSize);
- fileStream.close();
-
- SPDLOG_WARN(
- "Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
-
- return std::vector(shaderDataRaw,
- shaderDataRaw + shaderFileSize);
- }
- else if (this->mShaderDataRaw.size()) {
- return this->mShaderDataRaw;
- }
- else {
- throw std::runtime_error("Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither filepath nor data provided");
- }
-}
-
-}
-
-#endif // #ifndef OPALGOBASE_IMPL
-
#include
namespace kp {
@@ -1924,12 +1755,8 @@ namespace kp {
* Operation base class to simplify the creation of operations that require
* right hand and left hand side datapoints together with a single output.
* The expected data passed is two input tensors and one output tensor.
- * The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
*/
-template
-class OpAlgoLhsRhsOut : public OpAlgoBase
+class OpAlgoLhsRhsOut : public OpAlgoBase
{
public:
/**
@@ -1947,11 +1774,13 @@ class OpAlgoLhsRhsOut : public OpAlgoBase
* @param commandBuffer Vulkan Command Buffer to record commands into
* @param tensors Tensors that are to be used in this operation
* @param freeTensors Whether operation manages the memory of the Tensors
+ * @param komputeWorkgroup Optional parameter to specify the layout for processing
*/
OpAlgoLhsRhsOut(std::shared_ptr physicalDevice,
std::shared_ptr device,
std::shared_ptr commandBuffer,
- std::vector> tensors);
+ std::vector> tensors,
+ KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
/**
* Default destructor, which is in charge of destroying the algorithm
@@ -1982,7 +1811,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase
* of the GPU Device memory into the staging buffer so the output data can
* be retrieved.
*/
- virtual void postSubmit() override;
+ virtual void postEval() override;
protected:
// -------------- NEVER OWNED RESOURCES
@@ -1996,138 +1825,6 @@ class OpAlgoLhsRhsOut : public OpAlgoBase
} // End namespace kp
-// Including implementation for template class
-#ifndef OPALGOLHSRHSOUT_CPP
-#define OPALGOLHSRHSOUT_CPP
-
-namespace kp {
-
-template
-OpAlgoLhsRhsOut::OpAlgoLhsRhsOut()
-{
- SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
-}
-
-template
-OpAlgoLhsRhsOut::OpAlgoLhsRhsOut(std::shared_ptr physicalDevice,
- std::shared_ptr device,
- std::shared_ptr commandBuffer,
- std::vector> tensors)
- // The inheritance is initialised with the copyOutputData to false given that
- // this depencendant class handles the transfer of data via staging buffers in
- // a granular way.
- : OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
-{
- SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
-}
-
-template
-OpAlgoLhsRhsOut::~OpAlgoLhsRhsOut()
-{
- SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut destructor started");
-}
-
-template
-void
-OpAlgoLhsRhsOut::init()
-{
- SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut init called");
-
- if (this->mTensors.size() < 3) {
- throw std::runtime_error(
- "Kompute OpAlgoLhsRhsOut called with less than 1 tensor");
- } else if (this->mTensors.size() > 3) {
- SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
- }
-
- this->mTensorLHS = this->mTensors[0];
- this->mTensorRHS = this->mTensors[1];
- this->mTensorOutput = this->mTensors[2];
-
- if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
- this->mTensorOutput->isInit())) {
- throw std::runtime_error(
- "Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. LHS: " +
- std::to_string(this->mTensorLHS->isInit()) +
- " RHS: " + std::to_string(this->mTensorRHS->isInit()) +
- " Output: " + std::to_string(this->mTensorOutput->isInit()));
- }
-
- if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
- this->mTensorRHS->size() == this->mTensorOutput->size())) {
- throw std::runtime_error(
- "Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size LHS: " +
- std::to_string(this->mTensorLHS->size()) +
- " RHS: " + std::to_string(this->mTensorRHS->size()) +
- " Output: " + std::to_string(this->mTensorOutput->size()));
- }
-
- this->mTensorOutputStaging = std::make_shared(
- this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
-
- this->mTensorOutputStaging->init(
- this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
-
- SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
-
- std::vector shaderFileData = this->fetchSpirvBinaryData();
-
- SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut Initialising algorithm component");
-
- this->mAlgorithm->init(shaderFileData, this->mTensors);
-}
-
-template
-void
-OpAlgoLhsRhsOut::record()
-{
- SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut record called");
-
- // Barrier to ensure the data is finished writing to buffer memory
- this->mTensorLHS->recordBufferMemoryBarrier(
- this->mCommandBuffer,
- vk::AccessFlagBits::eHostWrite,
- vk::AccessFlagBits::eShaderRead,
- vk::PipelineStageFlagBits::eHost,
- vk::PipelineStageFlagBits::eComputeShader);
- this->mTensorRHS->recordBufferMemoryBarrier(
- this->mCommandBuffer,
- vk::AccessFlagBits::eHostWrite,
- vk::AccessFlagBits::eShaderRead,
- vk::PipelineStageFlagBits::eHost,
- vk::PipelineStageFlagBits::eComputeShader);
-
- this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
-
- // Barrier to ensure the shader code is executed before buffer read
- this->mTensorOutput->recordBufferMemoryBarrier(
- this->mCommandBuffer,
- vk::AccessFlagBits::eShaderWrite,
- vk::AccessFlagBits::eTransferRead,
- vk::PipelineStageFlagBits::eComputeShader,
- vk::PipelineStageFlagBits::eTransfer);
-
- this->mTensorOutputStaging->recordCopyFrom(
- this->mCommandBuffer,
- this->mTensorOutput,
- true);
-}
-
-template
-void
-OpAlgoLhsRhsOut::postSubmit()
-{
- SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");
-
- this->mTensorOutputStaging->mapDataFromHostMemory();
-
- this->mTensorOutput->setData(this->mTensorOutputStaging->data());
-}
-
-}
-
-#endif // #ifndef OPALGOLHSRHSOUT_CPP
-
#include
#if RELEASE
@@ -2138,12 +1835,9 @@ namespace kp {
/**
* Operation that performs multiplication on two tensors and outpus on third
- * tensor. The template parameters specify the processing GPU layout number of
- * iterations for each x, y, z parameter. More specifically, this will be the
- * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
+ * tensor.
*/
-template
-class OpMult : public OpAlgoBase
+class OpMult : public OpAlgoBase
{
public:
/**
@@ -2162,13 +1856,14 @@ class OpMult : public OpAlgoBase
* @param device Vulkan logical device for passing to Algorithm
* @param commandBuffer Vulkan Command Buffer to record commands into
* @param tensors Tensors that are to be used in this operation
- * @param freeTensors Whether operation manages the memory of the Tensors
+ * @param komputeWorkgroup Optional parameter to specify the layout for processing
*/
OpMult(std::shared_ptr physicalDevice,
std::shared_ptr