Merge pull request #88 from EthicalML/pybind_python
Added python bindings with kp as python module
This commit is contained in:
commit
e62dfea2a2
47 changed files with 1338 additions and 1036 deletions
1
.ccls
1
.ccls
|
|
@ -13,6 +13,7 @@
|
|||
-DDEBUG=1
|
||||
-DKOMPUTE_INCLUDE_FOR_SYNTAX
|
||||
|
||||
-I./python/pybind11/include/
|
||||
-I./external/Vulkan-Headers/include/
|
||||
-I./external/googletest/googletest/include/
|
||||
-I./external/spdlog/include/
|
||||
|
|
|
|||
3
.gitmodules
vendored
3
.gitmodules
vendored
|
|
@ -10,3 +10,6 @@
|
|||
path = external/spdlog
|
||||
url = https://github.com/gabime/spdlog
|
||||
branch = v1.8.1
|
||||
[submodule "python/pybind11"]
|
||||
path = python/pybind11
|
||||
url = https://github.com/pybind/pybind11
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
cmake_minimum_required(VERSION 3.4.1)
|
||||
project(kompute VERSION 0.3.0)
|
||||
project(kompute VERSION 0.4.1)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 14)
|
||||
|
||||
|
|
@ -13,6 +13,7 @@ option(KOMPUTE_OPT_BUILD_SHADERS "Enable if you want to re-build all shader file
|
|||
option(KOMPUTE_OPT_BUILD_SINGLE_HEADER "Enable if you want to build the single header file" 0)
|
||||
option(KOMPUTE_OPT_INSTALL "Enable if you want to enable installation" 0)
|
||||
# Build options
|
||||
option(KOMPUTE_OPT_BUILD_PYTHON "Enable if you want to build python bindings" 0)
|
||||
option(KOMPUTE_OPT_ENABLE_SPDLOG "Extra compile flags for Kompute, see docs for full list" 0)
|
||||
option(KOMPUTE_OPT_REPO_SUBMODULE_BUILD, "Use the submodule repos instead of external package manager" 0)
|
||||
option(KOMPUTE_OPT_ANDOID_BUILD "Enable android compilation flags required" 0)
|
||||
|
|
@ -43,12 +44,16 @@ endfunction()
|
|||
|
||||
add_subdirectory(src)
|
||||
|
||||
if(KOMPUTE_OPT_BUILD_TESTS)
|
||||
add_subdirectory(test)
|
||||
endif()
|
||||
|
||||
if(KOMPUTE_OPT_BUILD_DOCS)
|
||||
set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/config" ${CMAKE_MODULE_PATH})
|
||||
add_subdirectory(docs)
|
||||
endif()
|
||||
|
||||
if(KOMPUTE_OPT_BUILD_TESTS)
|
||||
add_subdirectory(test)
|
||||
if(KOMPUTE_OPT_BUILD_PYTHON)
|
||||
add_subdirectory(python)
|
||||
endif()
|
||||
|
||||
|
|
|
|||
97
README.md
97
README.md
|
|
@ -1,5 +1,5 @@
|
|||
|
||||

|
||||

|
||||

|
||||

|
||||

|
||||
|
|
@ -15,7 +15,7 @@
|
|||
<td>
|
||||
|
||||
<h1>Vulkan Kompute</h1>
|
||||
<h3>The General Purpose Vulkan Compute Framework.</h3>
|
||||
<h3>The General Purpose Vulkan Compute Framework for C++ and Python.</h3>
|
||||
|
||||
</td>
|
||||
|
||||
|
|
@ -29,10 +29,10 @@
|
|||
|
||||
## Principles & Features
|
||||
|
||||
* [Single header](#setup) library for simple import to your project
|
||||
* [Documentation](https://kompute.cc) leveraging doxygen and sphinx
|
||||
* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) capabilities with multi-queue command submission
|
||||
* [Non-Vulkan naming conventions](#architectural-overview) to disambiguate Vulkan vs Kompute components
|
||||
* [Single header](#setup) for simple import with flexible build-system configuration
|
||||
* Multi-language support with C++ as core SDK as well as [optimized Python bindings](#python-package)
|
||||
* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) support through GPU family queues
|
||||
* [Mobile enabled](#mobile-enabled) with examples in Android studio across several architectures
|
||||
* BYOV: [Bring-your-own-Vulkan design](#motivations) to play nice with existing Vulkan applications
|
||||
* Explicit relationships for GPU and host [memory ownership and memory management](https://kompute.cc/overview/memory-management.html)
|
||||
* [Short code examples](#simple-examples) showing the core features
|
||||
|
|
@ -118,7 +118,7 @@ int main() {
|
|||
mgr.evalOpAwaitDefault();
|
||||
|
||||
// 5. Create managed sequence to submit batch operations to the GPU
|
||||
std::shared_ptr<kp::Sequence> sq = mgr.getOrCreateManagedSequence("seq").lock();
|
||||
std::shared_ptr<kp::Sequence> sq = mgr.getOrCreateManagedSequence("seq");
|
||||
|
||||
// 5.1. Explicitly begin recording batch commands
|
||||
sq->begin();
|
||||
|
|
@ -255,13 +255,79 @@ You can also access the <a href="https://github.com/EthicalML/vulkan-kompute/tre
|
|||
</tr>
|
||||
</table>
|
||||
|
||||
## Motivations
|
||||
## Python Package
|
||||
|
||||
This project started after seeing that a lot of new and renowned ML & DL projects like Pytorch, Tensorflow, Alibaba DNN, Tencent NCNN - among others - have either integrated or are looking to integrate the Vulkan SDK to add mobile (and cross-vendor) GPU support.
|
||||
Besides the C++ core SDK you can also use the Python package of Kompute, which exposes the same core functionality, and supports interoperability with Python objects like Lists, Numpy Arrays, etc.
|
||||
|
||||
The Vulkan SDK offers a great low-level interface that enables highly specialized optimizations - however it comes at a cost of highly verbose code which requires 500-2000 lines of code to even begin writing application code. This has resulted in each of these projects having to implement the same baseline to abstract the non-compute related features of Vulkan. This large amount of non-standardised boiler-plate can result in limited knowledge transfer, higher chance of unique framework implementation bugs being introduced, etc.
|
||||
You can install from the repository by running:
|
||||
|
||||
We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface (as it's incredibly well designed) but to augment it with a direct focus on Vulkan's GPU computing capabilities. [This article](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) provides a high level overview of the motivations of Kompute, together with a set of hands on examples that introduce both GPU computing as well as the core Vulkan Kompute architecture.
|
||||
```
|
||||
pip install .
|
||||
```
|
||||
|
||||
For further details you can read the [Python Package documentation](https://kompute.cc/overview/python-package.html) or the [Python Class Reference documentation](https://kompute.cc/overview/python-reference.html).
|
||||
|
||||
### Python Example (Simple)
|
||||
|
||||
Then you can interact with it from your interpreter. Below is the same sample as above "Your First Kompute (Simple Version)" but in Python:
|
||||
|
||||
```python
|
||||
mgr = Manager()
|
||||
|
||||
# Can be initialized with List[] or np.Array
|
||||
tensor_in_a = Tensor([2, 2, 2])
|
||||
tensor_in_b = Tensor([1, 2, 3])
|
||||
tensor_out = Tensor([0, 0, 0])
|
||||
|
||||
mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
|
||||
|
||||
shaderFilePath = "shaders/glsl/opmult.comp"
|
||||
mgr.eval_async_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
|
||||
|
||||
# Alternatively can pass raw string/bytes:
|
||||
# shaderFileData = """ shader code here... """
|
||||
# mgr.eval_algo_data_def([tensor_in_a, tensor_in_b, tensor_out], list(shaderFileData))
|
||||
|
||||
mgr.eval_await_def()
|
||||
|
||||
mgr.eval_tensor_sync_local_def([tensor_out])
|
||||
|
||||
assert tensor_out.data() == [2.0, 4.0, 6.0]
|
||||
```
|
||||
|
||||
### Python Example (Extended)
|
||||
|
||||
Similarly you can find the same extended example as above:
|
||||
|
||||
```python
|
||||
mgr = Manager(0, [2])
|
||||
|
||||
# Can be initialized with List[] or np.Array
|
||||
tensor_in_a = Tensor([2, 2, 2])
|
||||
tensor_in_b = Tensor([1, 2, 3])
|
||||
tensor_out = Tensor([0, 0, 0])
|
||||
|
||||
shaderFilePath = "../../shaders/glsl/opmult.comp"
|
||||
|
||||
mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
|
||||
|
||||
seq = mgr.create_sequence("op")
|
||||
|
||||
mgr.eval_async_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
|
||||
mgr.eval_await_def()
|
||||
|
||||
seq.begin()
|
||||
seq.record_tensor_sync_local([tensor_in_a])
|
||||
seq.record_tensor_sync_local([tensor_in_b])
|
||||
seq.record_tensor_sync_local([tensor_out])
|
||||
seq.end()
|
||||
|
||||
seq.eval()
|
||||
|
||||
assert tensor_out.data() == [2.0, 4.0, 6.0]
|
||||
```
|
||||
|
||||
For further details you can read the [Python Package documentation](https://kompute.cc/overview/python-package.html) or the [Python Class Reference documentation](https://kompute.cc/overview/python-reference.html).
|
||||
|
||||
## More examples
|
||||
|
||||
|
|
@ -281,6 +347,7 @@ We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface
|
|||
* [Android NDK Mobile Kompute ML Application](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617)
|
||||
* [Game Development Kompute ML in Godot Engine](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0)
|
||||
|
||||
|
||||
## Build Overview
|
||||
|
||||
The build system provided uses `cmake`, which allows for cross platform builds.
|
||||
|
|
@ -344,3 +411,11 @@ make mk_cmake MK_BUILD_TYPE="Release"
|
|||
make mk_run_tests
|
||||
```
|
||||
|
||||
## Motivations
|
||||
|
||||
This project started after seeing that a lot of new and renowned ML & DL projects like Pytorch, Tensorflow, Alibaba DNN, Tencent NCNN - among others - have either integrated or are looking to integrate the Vulkan SDK to add mobile (and cross-vendor) GPU support.
|
||||
|
||||
The Vulkan SDK offers a great low-level interface that enables highly specialized optimizations - however it comes at a cost of highly verbose code which requires 500-2000 lines of code to even begin writing application code. This has resulted in each of these projects having to implement the same baseline to abstract the non-compute related features of Vulkan. This large amount of non-standardised boiler-plate can result in limited knowledge transfer, higher chance of unique framework implementation bugs being introduced, etc.
|
||||
|
||||
We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface (as it's incredibly well designed) but to augment it with a direct focus on Vulkan's GPU computing capabilities. [This article](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) provides a high level overview of the motivations of Kompute, together with a set of hands on examples that introduce both GPU computing as well as the core Vulkan Kompute architecture.
|
||||
|
||||
|
|
|
|||
|
|
@ -16,13 +16,16 @@
|
|||
|
||||
|
||||
# -- Project information -----------------------------------------------------
|
||||
import sys
|
||||
import os
|
||||
import kp
|
||||
|
||||
project = 'Vulkan Kompute'
|
||||
copyright = '2020, The Institute for Ethical AI & Machine Learning'
|
||||
author = 'Alejandro Saucedo'
|
||||
|
||||
# The full version, including alpha/beta/rc tags
|
||||
release = '0.4.0'
|
||||
release = '0.4.1'
|
||||
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
|
@ -31,6 +34,7 @@ release = '0.4.0'
|
|||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
# ones.
|
||||
extensions = [
|
||||
"sphinx.ext.autodoc",
|
||||
# Creates .nojekyll config
|
||||
'sphinx.ext.githubpages',
|
||||
# Integrates with doxygen
|
||||
|
|
|
|||
|
|
@ -11,13 +11,15 @@ Index
|
|||
:maxdepth: 2
|
||||
:titlesonly:
|
||||
|
||||
Class Documentation and C++ Reference <overview/reference>
|
||||
Advanced Examples <overview/advanced-examples>
|
||||
Simple & Advanced Examples <overview/advanced-examples>
|
||||
Python Package Overview <overview/python-package>
|
||||
Asynchronous & Parallel Operations <overview/async-parallel>
|
||||
Memory Management Principles <overview/memory-management>
|
||||
Build System Deep Dive <overview/build-system>
|
||||
Converting GLSL/HLSL Shaders to C++ Headers <overview/shaders-to-headers>
|
||||
Mobile App Integration (Android) <overview/mobile-android>
|
||||
Game Engine Integration (Godot Engine) <overview/game-engine-godot>
|
||||
Python Class Documentation & Reference <overview/python-reference>
|
||||
C++ Class Documentation & Reference <overview/reference>
|
||||
Code Index <genindex>
|
||||
|
||||
|
|
|
|||
91
docs/overview/python-package.rst
Normal file
91
docs/overview/python-package.rst
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
|
||||
Python Package Overview
|
||||
=======================
|
||||
|
||||
This section provides an overview of the Python Package from a functionality perspective. If you wish to see all the classes and their respective functions you can find that in the `Python Class Reference Section <python-reference>`_.
|
||||
|
||||
Below is a diagram that provides insights on the relationship between Vulkan Kompute objects and Vulkan resources, which primarily encompass ownership of CPU and/or GPU memory.
|
||||
|
||||
.. image:: ../images/kompute-architecture.jpg
|
||||
:width: 70%
|
||||
|
||||
Python Components
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
The Python package exposes three main classes:
|
||||
|
||||
* :class:`kp.Manager` - Manages all high level Vulkan and Kompute resources created
|
||||
* :class:`kp.Sequence` - Contains a set of recorded operations that can be reused
|
||||
* :class:`kp.Tensor` - Core data component to manage GPU and host data used in operations
|
||||
|
||||
One thing that you will notice is that the class :class:`kp::OpBase` and all its relevant operator subclasses are not exposed in Python.
|
||||
|
||||
This is primarily because the way to interact with the operations is through the respective :class:`kp.Manager` and :class:`kp.Sequence` functions.
|
||||
|
||||
More specifically, it can be through the following functions:
|
||||
|
||||
* mgr.eval_<opname> - Runs operation under an existing named sequence
|
||||
* mgr.eval_<opname>_def - Runs operation under a new anonymous sequence
|
||||
* mgr.eval_async_<opname> - Runs operation asynchronously under an existing named sequence
|
||||
* mgr.eval_async_<opname>_def - Runs operation asynchronously under a new anonymous sequence
|
||||
* seq.record_<opname> - Records operation in sequence (requires sequence to be in recording mode)
|
||||
|
||||
You can see these operations being used in the `Simple Python example <https://kompute.cc/index.html#python-example-simple>`_ and in the `Extended Python Example <https://kompute.cc/index.html#python-example-extended>`_.
|
||||
|
||||
Kompute Operation Capabilities
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Handling multiple processing capabilities can be done by loading compute shaders into separate sequences. The example below shows how this can be done:
|
||||
|
||||
.. code-block:: python
|
||||
:linenos:
|
||||
from kp import Manager
|
||||
|
||||
# We'll assume we have the shader data available
|
||||
from my_spv_shader_data import mult_shader, sum_shader
|
||||
|
||||
mgr = Manager()
|
||||
|
||||
t1 = mgr.build_tensor([2,2,2])
|
||||
t2 = mgr.build_tensor([1,2,3])
|
||||
t3 = mgr.build_tensor([1,2,3])
|
||||
|
||||
# Create multiple separate sequences
|
||||
sq_mult = mgr.create_sequence("SQ_MULT")
|
||||
sq_sum = mgr.create_sequence("SQ_SUM")
|
||||
sq_sync = mgr.create_sequence("SQ_SYNC")
|
||||
|
||||
# Initialize sq_mult
|
||||
sq_mult.begin()
|
||||
sq_mult.record_algo_data([t1, t2, t3], mult_shader)
|
||||
sq_mult.end()
|
||||
|
||||
sq_sum.begin()
|
||||
sq_sum.record_algo_data([t3, t2, t1], sum_shader)
|
||||
sq_sum.end()
|
||||
|
||||
sq_sync.begin()
|
||||
sq_sync.record_tensor_sync_local([t1, t3])
|
||||
sq_sync.end()
|
||||
|
||||
# Run multiple iterations
|
||||
for i in range(10):
|
||||
sq_mult.eval()
|
||||
sq_sum.eval()
|
||||
|
||||
sq_sync.eval()
|
||||
|
||||
print(t1.data(), t2.data(), t3.data())
|
||||
|
||||
|
||||
Package Installation
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The package can be installed through the top level `setup.py` by running:
|
||||
|
||||
```
|
||||
pip install .
|
||||
```
|
||||
|
||||
|
||||
|
||||
44
docs/overview/python-reference.rst
Normal file
44
docs/overview/python-reference.rst
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
|
||||
|
||||
Python Class Documentation & Reference
|
||||
======================================
|
||||
|
||||
This section provides a breakdown of the Python classes and what each of their functions provides.
|
||||
Below is a diagram that provides insights on the relationship between Vulkan Kompute objects and Vulkan resources, which primarily encompass ownership of CPU and/or GPU memory.
|
||||
|
||||
.. image:: ../images/kompute-architecture.jpg
|
||||
:width: 70%
|
||||
|
||||
Manager
|
||||
-------
|
||||
|
||||
The Kompute Manager provides a high level interface to simplify interaction with underlying :class:`kp.Sequence` of Operations.
|
||||
|
||||
.. autoclass:: kp.Manager
|
||||
:members:
|
||||
|
||||
|
||||
Sequence
|
||||
--------
|
||||
|
||||
The Kompute Sequence consists of batches of Kompute Operations, which are executed on a respective GPU queue. The execution of sequences can be synchronous or asynchronous, and it can be coordinated through its respective Vulkan Fence.
|
||||
|
||||
.. autoclass:: kp.Sequence
|
||||
:members:
|
||||
|
||||
|
||||
Tensor
|
||||
-------
|
||||
|
||||
The Kompute Tensor is the atomic unit in Kompute, and it is used primarily for handling Host and GPU Device data.
|
||||
|
||||
.. autoclass:: kp.Tensor
|
||||
:members:
|
||||
|
||||
|
||||
TensorType
|
||||
----------
|
||||
|
||||
.. automodule:: kp
|
||||
:members:
|
||||
|
||||
|
|
@ -44,14 +44,14 @@ void KomputeModelML::train(std::vector<float> yData, std::vector<float> xIData,
|
|||
{
|
||||
|
||||
std::shared_ptr<kp::Sequence> sqTensor =
|
||||
mgr.createManagedSequence().lock();
|
||||
mgr.createManagedSequence();
|
||||
|
||||
sqTensor->begin();
|
||||
sqTensor->record<kp::OpTensorCreate>(params);
|
||||
sqTensor->end();
|
||||
sqTensor->eval();
|
||||
|
||||
std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
|
||||
std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
|
||||
|
||||
// Record op algo base
|
||||
sq->begin();
|
||||
|
|
@ -60,11 +60,11 @@ void KomputeModelML::train(std::vector<float> yData, std::vector<float> xIData,
|
|||
|
||||
#ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
|
||||
// Newer versions of Android are able to use shaderc to read raw string
|
||||
sq->record<kp::OpAlgoBase<>>(
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
params, std::vector<char>(LR_SHADER.begin(), LR_SHADER.end()));
|
||||
#else
|
||||
// Older versions of Android require the SPIRV binary directly
|
||||
sq->record<kp::OpAlgoBase<>>(
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
params, std::vector<char>(
|
||||
kp::shader_data::shaders_glsl_logisticregression_comp_spv,
|
||||
kp::shader_data::shaders_glsl_logisticregression_comp_spv
|
||||
|
|
|
|||
|
|
@ -3,20 +3,42 @@ project(kompute_array_mult VERSION 0.1.0)
|
|||
|
||||
set(CMAKE_CXX_STANDARD 14)
|
||||
|
||||
option(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE "Enable if you prefer to use your installed Kompute library" 0)
|
||||
option(KOMPUTE_OPT_ENABLE_SPDLOG "Extra compile flags for Kompute, see docs for full list" 0)
|
||||
set(KOMPUTE_EXTRA_CXX_FLAGS "" CACHE STRING "Extra compile flags for Kompute, see docs for full list")
|
||||
|
||||
if(KOMPUTE_OPT_ENABLE_SPDLOG)
|
||||
set(KOMPUTE_EXTRA_CXX_FLAGS "${KOMPUTE_EXTRA_CXX_FLAGS} -DKOMPUTE_ENABLE_SPDLOG=1")
|
||||
endif()
|
||||
|
||||
# It is necessary to pass the DEBUG or RELEASE flag accordingly to Kompute
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DRELEASE=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
|
||||
|
||||
find_package(kompute REQUIRED)
|
||||
if(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE)
|
||||
find_package(kompute REQUIRED)
|
||||
else()
|
||||
add_subdirectory(../../ ${CMAKE_CURRENT_BINARY_DIR}/kompute_build)
|
||||
endif()
|
||||
|
||||
find_package(Vulkan REQUIRED)
|
||||
|
||||
if(KOMPUTE_OPT_ENABLE_SPDLOG)
|
||||
find_package(spdlog REQUIRED)
|
||||
endif()
|
||||
|
||||
add_executable(kompute_array_mult
|
||||
src/Main.cpp)
|
||||
|
||||
target_link_libraries(kompute_array_mult
|
||||
kompute::kompute
|
||||
Vulkan::Vulkan
|
||||
)
|
||||
Vulkan::Vulkan)
|
||||
|
||||
include_directories(
|
||||
../../single_include/)
|
||||
|
||||
if(KOMPUTE_OPT_ENABLE_SPDLOG)
|
||||
target_link_libraries(kompute_array_mult
|
||||
spdlog::spdlog)
|
||||
endif()
|
||||
|
||||
|
|
|
|||
|
|
@ -6,14 +6,32 @@ This example is structured such that you will be able to extend it for your proj
|
|||
|
||||
It contains a cmake build configuration that can be used in your production applications.
|
||||
|
||||
## Building the example
|
||||
|
||||
You will notice that it's a standalone project, so you can re-use it for your application.
|
||||
|
||||
This project has the option to either import the Kompute dependency relative to the project or use your existing installation of Kompute.
|
||||
|
||||
To build you just need to run the cmake command in this folder as follows:
|
||||
|
||||
```
|
||||
cmake \
|
||||
-Bbuild
|
||||
```
|
||||
|
||||
You can pass the following optional parameters based on your desired configuration:
|
||||
* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`.
|
||||
* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter
|
||||
* If you wish to load shader from raw glsl string instead of spirv bytes you can use `-DKOMPUTE_ANDROID_SHADER_FROM_STRING`
|
||||
|
||||
## Pre-requisites
|
||||
|
||||
In order to run this example, you will need the following dependencies:
|
||||
|
||||
* REQUIRED
|
||||
+ Vulkan Kompute library must be accessible
|
||||
+ The Vulkan SDK must be installed
|
||||
* OPTIONAL
|
||||
+ Vulkan Kompute library must be accessible (by default it uses the source directory)
|
||||
+ SPDLOG - for logging
|
||||
+ FMT - for text formatting
|
||||
|
||||
|
|
@ -25,50 +43,5 @@ For the other libraries, because they are optional you can just make sure you bu
|
|||
|
||||
Alternatively you can use package managers such as vcpkg to help you install them, although to simplify things you can start without the dependencies first.
|
||||
|
||||
## Set Up Vulkan Kompute Dependency
|
||||
|
||||
You have multiple options to set up Vulkan Kompute. The easiest is to perform a local installation.
|
||||
|
||||
For this, you will want to go to the main repo and run the following cmake command, which will configure it without SPDLOG by default.
|
||||
|
||||
```
|
||||
cmake \
|
||||
-Bbuild
|
||||
```
|
||||
|
||||
You can pass the following optional parameters based on your desired configuration:
|
||||
* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_ENABLE_SPDLOG=1`.
|
||||
* If you wish to perform the installation on the local folder instead of in your system you can use `-DCMAKE_INSTALL_PREFIX="build/src/CMakeFiles/Export/"` which will basically ensure that the final files are created in the local directory.
|
||||
* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter
|
||||
|
||||
Then you can proceed to run the installation:
|
||||
|
||||
* For Windows / Visual Studio you just have to build `INSTALL.vcxproj`
|
||||
* For Linux you can just run the `install` target via `make -C build install`
|
||||
|
||||
You also have the option to build as `Release` or `Debug` - just make sure that you build your example with the same build/debug flags as required.
|
||||
|
||||
## Building the example
|
||||
|
||||
Now that you've set up the dependencies / installation of Vulkan Kompute you can build this example.
|
||||
|
||||
You will notice that it's a standalone project, so you can re-use it for your application.
|
||||
|
||||
To build you just need to run the cmake command in this folder as follows:
|
||||
|
||||
```
|
||||
cmake \
|
||||
-Bbuild
|
||||
```
|
||||
|
||||
Make sure to pass the required flags depending on the configuration above:
|
||||
* If you built with Debug make sure you build your example with Debug as well
|
||||
* If you installed in the local folder, make sure you pass the CMAKE_PREFIX_PATH pointing to the respective folder (e.g. `-DCMAKE_PREFIX_PATH=../../build/src/CMakeFiles/Export/lib/cmake/kompute/` if parent folder is main repo).
|
||||
* If you built Vulkan Kompute with spdlog enabled, make sure to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`
|
||||
* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter
|
||||
|
||||
Now you just have to build your application as above:
|
||||
|
||||
* For Windows / Visual Studio you just have to build and run `kompute_array_mult.vcxproj`
|
||||
* For Linux you can just run the `kompute_array_mult` target via `make -C build kompute_array_mult`
|
||||
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ int main()
|
|||
auto tensorInB = mgr.buildTensor({ 0.0, 1.0, 2.0 });
|
||||
auto tensorOut = mgr.buildTensor({ 0.0, 0.0, 0.0 });
|
||||
|
||||
#ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
|
||||
std::string shader(R"(
|
||||
// The version to use
|
||||
#version 450
|
||||
|
|
@ -37,9 +38,17 @@ int main()
|
|||
}
|
||||
)");
|
||||
|
||||
mgr.evalOpDefault<kp::OpAlgoBase<>>(
|
||||
mgr.evalOpDefault<kp::OpAlgoBase>(
|
||||
{ tensorInA, tensorInB, tensorOut },
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
#else
|
||||
mgr.evalOpDefault<kp::OpAlgoBase>(
|
||||
{ tensorInA, tensorInB, tensorOut },
|
||||
std::vector<char>(
|
||||
kp::shader_data::shaders_glsl_opmult_comp_spv,
|
||||
kp::shader_data::shaders_glsl_opmult_comp_spv
|
||||
+ kp::shader_data::shaders_glsl_opmult_comp_spv_len));
|
||||
#endif
|
||||
|
||||
mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorOut});
|
||||
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ protected:
|
|||
|
||||
private:
|
||||
kp::Manager mManager;
|
||||
std::weak_ptr<kp::Sequence> mSequence;
|
||||
std::shared_ptr<kp::Sequence> mSequence;
|
||||
std::shared_ptr<kp::Tensor> mPrimaryTensor;
|
||||
std::shared_ptr<kp::Tensor> mSecondaryTensor;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -16,12 +16,7 @@ void KomputeSummator::add(float value) {
|
|||
// Set the new data in the local device
|
||||
this->mSecondaryTensor->setData({value});
|
||||
// Execute recorded sequence
|
||||
if (std::shared_ptr<kp::Sequence> sq = this->mSequence.lock()) {
|
||||
sq->eval();
|
||||
}
|
||||
else {
|
||||
throw std::runtime_error("Sequence pointer no longer available");
|
||||
}
|
||||
this->mSequence->eval();
|
||||
}
|
||||
|
||||
void KomputeSummator::reset() {
|
||||
|
|
@ -38,9 +33,7 @@ void KomputeSummator::_init() {
|
|||
this->mSequence = this->mManager.getOrCreateManagedSequence("AdditionSeq");
|
||||
|
||||
// We now record the steps in the sequence
|
||||
if (std::shared_ptr<kp::Sequence> sq = this->mSequence.lock())
|
||||
{
|
||||
|
||||
std::string shader(R"(
|
||||
#version 450
|
||||
|
||||
|
|
@ -55,26 +48,23 @@ void KomputeSummator::_init() {
|
|||
}
|
||||
)");
|
||||
|
||||
sq->begin();
|
||||
this->mSequence->begin();
|
||||
|
||||
// First we ensure secondary tensor loads to GPU
|
||||
// No need to sync the primary tensor as it should not be changed
|
||||
sq->record<kp::OpTensorSyncDevice>(
|
||||
this->mSequence->record<kp::OpTensorSyncDevice>(
|
||||
{ this->mSecondaryTensor });
|
||||
|
||||
// Then we run the operation with both tensors
|
||||
sq->record<kp::OpAlgoBase<>>(
|
||||
this->mSequence->record<kp::OpAlgoBase>(
|
||||
{ this->mPrimaryTensor, this->mSecondaryTensor },
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
// We map the result back to local
|
||||
sq->record<kp::OpTensorSyncLocal>(
|
||||
this->mSequence->record<kp::OpTensorSyncLocal>(
|
||||
{ this->mPrimaryTensor });
|
||||
|
||||
sq->end();
|
||||
}
|
||||
else {
|
||||
throw std::runtime_error("Sequence pointer no longer available");
|
||||
this->mSequence->end();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ public:
|
|||
|
||||
private:
|
||||
kp::Manager mManager;
|
||||
std::weak_ptr<kp::Sequence> mSequence;
|
||||
std::shared_ptr<kp::Sequence> mSequence;
|
||||
std::shared_ptr<kp::Tensor> mPrimaryTensor;
|
||||
std::shared_ptr<kp::Tensor> mSecondaryTensor;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -51,14 +51,14 @@ void KomputeModelMLNode::train(Array yArr, Array xIArr, Array xJArr) {
|
|||
kp::Manager mgr;
|
||||
|
||||
std::shared_ptr<kp::Sequence> sqTensor =
|
||||
mgr.createManagedSequence().lock();
|
||||
mgr.createManagedSequence();
|
||||
|
||||
sqTensor->begin();
|
||||
sqTensor->record<kp::OpTensorCreate>(params);
|
||||
sqTensor->end();
|
||||
sqTensor->eval();
|
||||
|
||||
std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
|
||||
std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
|
||||
|
||||
// Record op algo base
|
||||
sq->begin();
|
||||
|
|
@ -67,11 +67,11 @@ void KomputeModelMLNode::train(Array yArr, Array xIArr, Array xJArr) {
|
|||
|
||||
#ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
|
||||
// Newer versions of Android are able to use shaderc to read raw string
|
||||
sq->record<kp::OpAlgoBase<>>(
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
params, std::vector<char>(LR_SHADER.begin(), LR_SHADER.end()));
|
||||
#else
|
||||
// Older versions of Android require the SPIRV binary directly
|
||||
sq->record<kp::OpAlgoBase<>>(
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
params, std::vector<char>(
|
||||
kp::shader_data::shaders_glsl_logisticregression_comp_spv,
|
||||
kp::shader_data::shaders_glsl_logisticregression_comp_spv
|
||||
|
|
|
|||
|
|
@ -56,14 +56,14 @@ void KomputeModelML::train(Array yArr, Array xIArr, Array xJArr) {
|
|||
|
||||
{
|
||||
std::shared_ptr<kp::Sequence> sqTensor =
|
||||
mgr.createManagedSequence().lock();
|
||||
mgr.createManagedSequence();
|
||||
|
||||
sqTensor->begin();
|
||||
sqTensor->record<kp::OpTensorCreate>(params);
|
||||
sqTensor->end();
|
||||
sqTensor->eval();
|
||||
|
||||
std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
|
||||
std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
|
||||
|
||||
// Record op algo base
|
||||
sq->begin();
|
||||
|
|
@ -72,11 +72,11 @@ void KomputeModelML::train(Array yArr, Array xIArr, Array xJArr) {
|
|||
|
||||
#ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
|
||||
// Newer versions of Android are able to use shaderc to read raw string
|
||||
sq->record<kp::OpAlgoBase<>>(
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
params, std::vector<char>(LR_SHADER.begin(), LR_SHADER.end()));
|
||||
#else
|
||||
// Older versions of Android require the SPIRV binary directly
|
||||
sq->record<kp::OpAlgoBase<>>(
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
params, std::vector<char>(
|
||||
kp::shader_data::shaders_glsl_logisticregression_comp_spv,
|
||||
kp::shader_data::shaders_glsl_logisticregression_comp_spv
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ project(kompute_linear_reg VERSION 0.1.0)
|
|||
|
||||
set(CMAKE_CXX_STANDARD 14)
|
||||
|
||||
option(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE "Enable if you prefer to use your installed Kompute library" 0)
|
||||
option(KOMPUTE_OPT_ENABLE_SPDLOG "Extra compile flags for Kompute, see docs for full list" 0)
|
||||
set(KOMPUTE_EXTRA_CXX_FLAGS "" CACHE STRING "Extra compile flags for Kompute, see docs for full list")
|
||||
|
||||
|
|
@ -14,12 +15,16 @@ endif()
|
|||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DRELEASE=1 ${KOMPUTE_EXTRA_CXX_FLAGS}")
|
||||
|
||||
find_package(kompute REQUIRED)
|
||||
if(KOMPUTE_ARR_OPT_INSTALLED_KOMPUTE)
|
||||
find_package(kompute REQUIRED)
|
||||
else()
|
||||
add_subdirectory(../../ ${CMAKE_CURRENT_BINARY_DIR}/kompute_build)
|
||||
endif()
|
||||
|
||||
find_package(Vulkan REQUIRED)
|
||||
|
||||
if(KOMPUTE_OPT_ENABLE_SPDLOG)
|
||||
find_package(spdlog REQUIRED)
|
||||
find_package(fmt REQUIRED)
|
||||
endif()
|
||||
|
||||
add_executable(kompute_linear_reg
|
||||
|
|
@ -30,11 +35,11 @@ target_link_libraries(kompute_linear_reg
|
|||
Vulkan::Vulkan
|
||||
)
|
||||
|
||||
include_directories(
|
||||
../../single_include/)
|
||||
|
||||
if(KOMPUTE_OPT_ENABLE_SPDLOG)
|
||||
target_link_libraries(kompute_linear_reg
|
||||
kompute::kompute
|
||||
fmt::fmt
|
||||
spdlog::spdlog
|
||||
)
|
||||
# Link spdlog into this example's own executable. The target created by
# add_executable() in this file is kompute_linear_reg.
target_link_libraries(kompute_linear_reg
    spdlog::spdlog)
|
||||
endif()
|
||||
|
||||
|
|
|
|||
|
|
@ -6,54 +6,12 @@ This example is structured such that you will be able to extend it for your proj
|
|||
|
||||
It contains a cmake build configuration that can be used in your production applications.
|
||||
|
||||
## Pre-requisites
|
||||
|
||||
In order to run this example, you will need the following dependencies:
|
||||
|
||||
* REQUIRED
|
||||
+ Vulkan Kompute library must be accessible
|
||||
+ The Vulkan SDK must be installed
|
||||
* OPTIONAL
|
||||
+ SPDLOG - for logging
|
||||
+ FMT - for text formatting
|
||||
|
||||
We will cover how you can install Vulkan Kompute in the next section.
|
||||
|
||||
For the Vulkan SDK, the simplest way to install it is through [their website](https://vulkan.lunarg.com/sdk/home). You just have to follow the instructions for the relevant platform.
|
||||
|
||||
For the other libraries, because they are optional you can just make sure you build and install Kompute with these disabled (this will be covered in more detail below).
|
||||
|
||||
Alternatively you can use package managers such as vcpkg to help you install them, although to simplify things you can start without the dependencies first.
|
||||
|
||||
## Set Up Vulkan Kompute Dependency
|
||||
|
||||
You have multiple options to set up Vulkan Kompute. The easiest is to perform a local installation.
|
||||
|
||||
For this, you will want to go to the main repo and run the following cmake command, which will configure it without SPDLOG by default.
|
||||
|
||||
```
|
||||
cmake \
|
||||
-Bbuild
|
||||
```
|
||||
|
||||
You can pass the following optional parameters based on your desired configuration:
|
||||
* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_ENABLE_SPDLOG=1`.
|
||||
* If you wish to perform the installation on the local folder instead of in your system you can use `-DCMAKE_INSTALL_PREFIX="build/src/CMakeFiles/Export/"` which will basically ensure that the final files are created in the local directory.
|
||||
* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter
|
||||
|
||||
Then you can proceed to run the installation:
|
||||
|
||||
* For Windows / Visual Studio you just have to build `INSTALL.vcxproj`
|
||||
* For Linux you can just run the `install` target via `make -C build install`
|
||||
|
||||
You also have the option to build as `Release` or `Debug` - just make sure that you build your example with the same build/debug flags as required.
|
||||
|
||||
## Building the example
|
||||
|
||||
Now that you've set up the dependencies / installation of Vulkan Kompute you can build this example.
|
||||
|
||||
You will notice that it's a standalone project, so you can re-use it for your application.
|
||||
|
||||
This project has the option to either import the Kompute dependency relative to the project or use your existing installation of Kompute.
|
||||
|
||||
To build you just need to run the cmake command in this folder as follows:
|
||||
|
||||
```
|
||||
|
|
@ -61,14 +19,19 @@ cmake \
|
|||
-Bbuild
|
||||
```
|
||||
|
||||
Make sure to pass the required flags depending on the configuration above:
|
||||
* If you built with Debug make sure you build your example with Debug as well
|
||||
* If you installed in the local folder, make sure you pass the CMAKE_PREFIX_PATH pointing to the respective folder (e.g. `-DCMAKE_PREFIX_PATH=../../build/src/CMakeFiles/Export/lib/cmake/kompute/` if parent folder is main repo).
|
||||
* If you built Vulkan Kompute with spdlog enabled, make sure to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`
|
||||
You can pass the following optional parameters based on your desired configuration:
|
||||
* If you wish to install with spdlog support you just have to pass `-DKOMPUTE_OPT_ENABLE_SPDLOG=1`.
|
||||
* If you are using a package manager such as `vcpkg` make sure you pass the `-DCMAKE_TOOLCHAIN_FILE=` parameter
|
||||
* If you wish to load the shader from a raw GLSL string instead of SPIR-V bytes, you can use `-DKOMPUTE_ANDROID_SHADER_FROM_STRING`
|
||||
|
||||
Now you just have to build your application as above:
|
||||
## Pre-requisites
|
||||
|
||||
* For Windows / Visual Studio you just have to build and run `kompute_linear_reg.vcxproj`
|
||||
* For Linux you can just run the `kompute_linear_reg` target via `make -C build kompute_linear_reg`
|
||||
In order to run this example, you will need the following dependencies:
|
||||
|
||||
* REQUIRED
|
||||
+ The Vulkan SDK must be installed
|
||||
* OPTIONAL
|
||||
+ Vulkan Kompute library must be accessible (by default it uses the source directory)
|
||||
+ SPDLOG - for logging
|
||||
+ FMT - for text formatting
|
||||
|
||||
|
|
|
|||
|
|
@ -36,22 +36,30 @@ int main()
|
|||
kp::Manager mgr;
|
||||
|
||||
std::shared_ptr<kp::Sequence> sqTensor =
|
||||
mgr.createManagedSequence().lock();
|
||||
mgr.createManagedSequence();
|
||||
|
||||
sqTensor->begin();
|
||||
sqTensor->record<kp::OpTensorCreate>(params);
|
||||
sqTensor->end();
|
||||
sqTensor->eval();
|
||||
|
||||
std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
|
||||
std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
|
||||
|
||||
// Record op algo base
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorSyncDevice>({ wIn, bIn });
|
||||
|
||||
sq->record<kp::OpAlgoBase<>>(
|
||||
#ifdef KOMPUTE_ANDROID_SHADER_FROM_STRING
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
params, "shaders/glsl/logistic_regression.comp");
|
||||
#else
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
params, std::vector<char>(
|
||||
kp::shader_data::shaders_glsl_logisticregression_comp_spv,
|
||||
kp::shader_data::shaders_glsl_logisticregression_comp_spv
|
||||
+ kp::shader_data::shaders_glsl_logisticregression_comp_spv_len));
|
||||
#endif
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
|
||||
|
||||
|
|
|
|||
11
python/CMakeLists.txt
Normal file
11
python/CMakeLists.txt
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
|
||||
# Build the `kp` Python extension module from the pybind11 sources.

# pybind11 is vendored as a git submodule in this directory; adding it
# provides the pybind11_add_module() helper used below.
add_subdirectory(pybind11)

pybind11_add_module(kp src/main.cpp)

# Scope the single-header include path to the kp target only, instead of
# applying it to every target declared in this directory.
target_include_directories(
    kp PRIVATE
    ${PROJECT_SOURCE_DIR}/single_include/)

target_link_libraries(
    kp PRIVATE
    kompute::kompute)
|
||||
|
||||
2
python/README.md
Normal file
2
python/README.md
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
# Python Bindings for Vulkan Kompute
|
||||
|
||||
1
python/pybind11
Submodule
1
python/pybind11
Submodule
|
|
@ -0,0 +1 @@
|
|||
Subproject commit 06a54018c8a9fd9a7be5f5b56414b5da9259f637
|
||||
160
python/src/main.cpp
Normal file
160
python/src/main.cpp
Normal file
|
|
@ -0,0 +1,160 @@
|
|||
#include <pybind11/pybind11.h>
|
||||
#include <pybind11/stl.h>
|
||||
|
||||
#include <kompute/Kompute.hpp>
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
// Python module `kp`: exposes kp::Tensor, kp::Sequence and kp::Manager.
//
// Templated C++ entry points (Sequence::record<T>, Manager::evalOp<T>, ...)
// cannot be exposed generically, so one Python method is registered per
// operation type. The naming scheme is:
//   record_<op>            -> Sequence::record<Op>
//   eval_<op>_def          -> Manager::evalOpDefault<Op>
//   eval_<op>              -> Manager::evalOp<Op>
//   eval_async_<op>_def    -> Manager::evalOpAsyncDefault<Op>
//   eval_async_<op>        -> Manager::evalOpAsync<Op>
PYBIND11_MODULE(kp, m) {

    py::enum_<kp::Tensor::TensorTypes>(m, "TensorTypes", "Enum with GPU memory types for Tensor.")
        .value("device", kp::Tensor::TensorTypes::eDevice, "Tensor holding data in GPU memory.")
        .value("staging", kp::Tensor::TensorTypes::eStaging, "Tensor used for transfer of data to device.")
        .value("storage", kp::Tensor::TensorTypes::eStorage, "Tensor with host visible gpu memory.")
        .export_values();

    // Tensors are held by std::shared_ptr so the same object can be shared
    // between Python and the operations that receive it.
    py::class_<kp::Tensor, std::shared_ptr<kp::Tensor>>(m, "Tensor", "Structured data used in GPU operations.")
        .def(py::init(
            [](const std::vector<float>& data) {
                return std::make_unique<kp::Tensor>(data);
            }), "Initialiser with only list of data components.")
        .def(py::init(
            [](const std::vector<float>& data, kp::Tensor::TensorTypes tensorTypes) {
                return std::make_unique<kp::Tensor>(data, tensorTypes);
            }), "Initialiser with list of data components and tensor GPU memory type.")
        .def("data", &kp::Tensor::data, "Retrieves the data as a list containing the local Tensor memory data.")
        .def("size", &kp::Tensor::size, "Retrieves the size of the Tensor data as per the local Tensor memory.")
        .def("tensor_type", &kp::Tensor::tensorType, "Retrieves the memory type of the tensor.")
        .def("is_init", &kp::Tensor::isInit, "Checks whether the tensor GPU memory has been initialised.")
        .def("set_data", &kp::Tensor::setData, "Overrides the data in the local Tensor memory.")
        .def("map_data_from_host", &kp::Tensor::mapDataFromHostMemory, "Maps data into GPU memory from tensor local data.")
        .def("map_data_into_host", &kp::Tensor::mapDataIntoHostMemory, "Maps data from GPU memory into tensor local data.");

    // No constructor is bound for Sequence here; the Manager bindings below
    // (create_sequence / get_create_sequence) return Sequence objects.
    py::class_<kp::Sequence, std::shared_ptr<kp::Sequence>>(m, "Sequence")
        .def("init", &kp::Sequence::init, "Initialises Vulkan resources within sequence using provided device.")
        // record
        .def("begin", &kp::Sequence::begin, "Clears previous commands and starts recording commands in sequence which can be run in batch.")
        .def("end", &kp::Sequence::end, "Stops listening and recording for new commands.")
        // eval
        .def("eval", &kp::Sequence::eval, "Executes the currently recorded commands synchronously by waiting on Vulkan Fence.")
        .def("eval_async", &kp::Sequence::evalAsync, "Executes the currently recorded commands asynchronously.")
        .def("eval_await", &kp::Sequence::evalAwait, "Waits until the execution finishes using Vulkan Fence.")
        // status
        .def("is_running", &kp::Sequence::isRunning, "Checks whether the Sequence operations are currently still executing.")
        .def("is_rec", &kp::Sequence::isRecording, "Checks whether the Sequence is currently in recording mode.")
        .def("is_init", &kp::Sequence::isInit, "Checks if the Sequence has been initialized")
        // record
        .def("record_tensor_create", &kp::Sequence::record<kp::OpTensorCreate>,
            "Records operation to create and initialise tensor GPU memory and buffer")
        .def("record_tensor_copy", &kp::Sequence::record<kp::OpTensorCopy>,
            "Records operation to copy one tensor to one or many tensors")
        .def("record_tensor_sync_device", &kp::Sequence::record<kp::OpTensorSyncDevice>,
            "Records operation to sync tensor from local memory to GPU memory")
        .def("record_tensor_sync_local", &kp::Sequence::record<kp::OpTensorSyncLocal>,
            "Records operation to sync tensor(s) from GPU memory to local memory using staging tensors")
        .def("record_algo_mult", &kp::Sequence::record<kp::OpMult>,
            "Records operation to run multiplication compute shader to two input tensors and an output tensor")
        .def("record_algo_file", &kp::Sequence::record<kp::OpAlgoBase, std::string>,
            "Records an operation using a custom shader provided from a shader path")
        .def("record_algo_data", &kp::Sequence::record<kp::OpAlgoBase, std::vector<char>>,
            "Records an operation using a custom shader provided as raw string or spirv bytes")
        .def("record_algo_lro", &kp::Sequence::record<kp::OpAlgoLhsRhsOut>,
            "Records operation to run left right out operation with custom shader");

    py::class_<kp::Manager>(m, "Manager")
        .def(py::init(), "Default initializer uses device 0 and first compute compatible GPU queueFamily")
        .def(py::init(
            [](uint32_t physicalDeviceIndex) {
                return std::make_unique<kp::Manager>(physicalDeviceIndex);
            }), "Manager initialiser can provide specified device index but will use first compute compatible GPU queueFamily")
        .def(py::init(
            [](uint32_t physicalDeviceIndex, const std::vector<uint32_t>& familyQueueIndices) {
                return std::make_unique<kp::Manager>(physicalDeviceIndex, familyQueueIndices);
            }), "Manager initialiser can provide specified device and array of GPU queueFamilies to load.")
        .def("get_create_sequence", &kp::Manager::getOrCreateManagedSequence, "Get a Sequence or create a new one with given name")
        // Both arguments carry defaults, mirroring the C++ declaration
        // createManagedSequence(std::string sequenceName = "", uint32_t queueIndex = 0).
        .def("create_sequence", &kp::Manager::createManagedSequence,
            py::arg("name") = "", py::arg("queueIndex") = 0, "Create a sequence with specific name and specified index of available queues")
        .def("build_tensor", &kp::Manager::buildTensor,
            py::arg("data"), py::arg("tensorType") = kp::Tensor::TensorTypes::eDevice,
            "Build and initialise tensor")
        // Await functions
        .def("eval_await", &kp::Manager::evalOpAwait,
            py::arg("sequenceName"), py::arg("waitFor") = UINT64_MAX,
            "Awaits for asynchronous operation on a named Sequence")
        .def("eval_await_def", &kp::Manager::evalOpAwaitDefault,
            py::arg("waitFor") = UINT64_MAX, "Awaits for asynchronous operation on the last anonymous Sequence created")
        // eval default
        .def("eval_tensor_create_def", &kp::Manager::evalOpDefault<kp::OpTensorCreate>,
            "Evaluates operation to create and initialise tensor GPU memory and buffer with new anonymous Sequence")
        .def("eval_tensor_copy_def", &kp::Manager::evalOpDefault<kp::OpTensorCopy>,
            "Evaluates operation to copy one tensor to one or many tensors with new anonymous Sequence")
        .def("eval_tensor_sync_device_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncDevice>,
            "Evaluates operation to sync tensor from local memory to GPU memory with new anonymous Sequence")
        .def("eval_tensor_sync_local_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncLocal>,
            "Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with new anonymous Sequence")
        .def("eval_algo_mult_def", &kp::Manager::evalOpDefault<kp::OpMult>,
            "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with new anonymous Sequence")
        .def("eval_algo_file_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::string>,
            "Evaluates an operation using a custom shader provided from a shader path with new anonymous Sequence")
        .def("eval_algo_data_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::vector<char>>,
            "Evaluates an operation using a custom shader provided as raw string or spirv bytes with new anonymous Sequence")
        .def("eval_algo_lro_def", &kp::Manager::evalOpDefault<kp::OpAlgoLhsRhsOut>,
            "Evaluates operation to run left right out operation with custom shader with new anonymous Sequence")
        // eval
        .def("eval_tensor_create", &kp::Manager::evalOp<kp::OpTensorCreate>,
            "Evaluates operation to create and initialise tensor GPU memory and buffer with explicitly named Sequence")
        .def("eval_tensor_copy", &kp::Manager::evalOp<kp::OpTensorCopy>,
            "Evaluates operation to copy one tensor to one or many tensors with explicitly named Sequence")
        .def("eval_tensor_sync_device", &kp::Manager::evalOp<kp::OpTensorSyncDevice>,
            "Evaluates operation to sync tensor from local memory to GPU memory with explicitly named Sequence")
        .def("eval_tensor_sync_local", &kp::Manager::evalOp<kp::OpTensorSyncLocal>,
            "Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence")
        .def("eval_algo_mult", &kp::Manager::evalOp<kp::OpMult>,
            "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
        .def("eval_algo_file", &kp::Manager::evalOp<kp::OpAlgoBase, std::string>,
            "Evaluates an operation using a custom shader provided from a shader path with explicitly named Sequence")
        .def("eval_algo_data", &kp::Manager::evalOp<kp::OpAlgoBase, std::vector<char>>,
            "Evaluates an operation using a custom shader provided as raw string or spirv bytes with explicitly named Sequence")
        .def("eval_algo_lro", &kp::Manager::evalOp<kp::OpAlgoLhsRhsOut>,
            "Evaluates operation to run left right out operation with custom shader with explicitly named Sequence")
        // eval async default
        .def("eval_async_tensor_create_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCreate>,
            "Evaluates asynchronously operation to create and initialise tensor GPU memory and buffer with anonymous Sequence")
        .def("eval_async_tensor_copy_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCopy>,
            "Evaluates asynchronously operation to copy one tensor to one or many tensors with anonymous Sequence")
        .def("eval_async_tensor_sync_device_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncDevice>,
            "Evaluates asynchronously operation to sync tensor from local memory to GPU memory with anonymous Sequence")
        .def("eval_async_tensor_sync_local_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncLocal>,
            "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with anonymous Sequence")
        .def("eval_async_algo_mult_def", &kp::Manager::evalOpAsyncDefault<kp::OpMult>,
            "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with anonymous Sequence")
        .def("eval_async_algo_file_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::string>,
            "Evaluates asynchronously an operation using a custom shader provided from a shader path with anonymous Sequence")
        .def("eval_async_algo_data_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::vector<char>>,
            "Evaluates asynchronously an operation using a custom shader provided as raw string or spirv bytes with anonymous Sequence")
        .def("eval_async_algo_lro_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoLhsRhsOut>,
            "Evaluates asynchronously operation to run left right out operation with custom shader with anonymous Sequence")
        // eval async
        .def("eval_async_tensor_create", &kp::Manager::evalOpAsync<kp::OpTensorCreate>,
            "Evaluates asynchronously operation to create and initialise tensor GPU memory and buffer with explicitly named Sequence")
        .def("eval_async_tensor_copy", &kp::Manager::evalOpAsync<kp::OpTensorCopy>,
            "Evaluates asynchronously operation to copy one tensor to one or many tensors with explicitly named Sequence")
        .def("eval_async_tensor_sync_device", &kp::Manager::evalOpAsync<kp::OpTensorSyncDevice>,
            "Evaluates asynchronously operation to sync tensor from local memory to GPU memory with explicitly named Sequence")
        .def("eval_async_tensor_sync_local", &kp::Manager::evalOpAsync<kp::OpTensorSyncLocal>,
            "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence")
        .def("eval_async_algo_mult", &kp::Manager::evalOpAsync<kp::OpMult>,
            "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
        .def("eval_async_algo_file", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::string>,
            "Evaluates asynchronously an operation using a custom shader provided from a shader path with explicitly named Sequence")
        .def("eval_async_algo_data", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::vector<char>>,
            "Evaluates asynchronously an operation using a custom shader provided as raw string or spirv bytes with explicitly named Sequence")
        .def("eval_async_algo_lro", &kp::Manager::evalOpAsync<kp::OpAlgoLhsRhsOut>,
            "Evaluates asynchronously operation to run left right out operation with custom shader with explicitly named Sequence");

    // VERSION_INFO is injected through CXXFLAGS by setup.py; builds that do
    // not define it report "dev".
#ifdef VERSION_INFO
    m.attr("__version__") = VERSION_INFO;
#else
    m.attr("__version__") = "dev";
#endif
}
|
||||
110
python/test/test_kompute.py
Normal file
110
python/test/test_kompute.py
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
|
||||
from kp import Tensor, Manager, Sequence
|
||||
|
||||
def test_opmult():
    """
    Multiply two tensors element-wise with the built-in multiplication
    operation and verify the values synced back from the GPU.
    """
    lhs = Tensor([2, 2, 2])
    rhs = Tensor([1, 2, 3])
    product = Tensor([0, 0, 0])

    mgr = Manager()

    # Every tensor taking part in the computation is initialised first.
    mgr.eval_tensor_create_def([lhs, rhs, product])

    mgr.eval_algo_mult_def([lhs, rhs, product])

    # Only the output needs to be brought back into local memory.
    mgr.eval_tensor_sync_local_def([product])

    expected = [2.0, 4.0, 6.0]
    assert product.data() == expected
|
||||
|
||||
def test_opalgobase_data():
    """
    Run a custom multiplication shader, supplied inline as raw shader text,
    and verify the values synced back from the GPU.
    """
    lhs = Tensor([2, 2, 2])
    rhs = Tensor([1, 2, 3])
    product = Tensor([0, 0, 0])

    mgr = Manager()

    shader_source = """
#version 450

layout (local_size_x = 1) in;

// The input tensors bind index is relative to index in parameter passed
layout(set = 0, binding = 0) buffer bina { float tina[]; };
layout(set = 0, binding = 1) buffer binb { float tinb[]; };
layout(set = 0, binding = 2) buffer bout { float tout[]; };

void main() {
uint index = gl_GlobalInvocationID.x;
tout[index] = tina[index] * tinb[index];
}
"""

    mgr.eval_tensor_create_def([lhs, rhs, product])

    # The binding is declared with std::vector<char>, so the shader text is
    # passed as a list of single characters.
    shader_chars = list(shader_source)
    mgr.eval_algo_data_def([lhs, rhs, product], shader_chars)

    mgr.eval_tensor_sync_local_def([product])

    expected = [2.0, 4.0, 6.0]
    assert product.data() == expected
|
||||
|
||||
|
||||
def test_opalgobase_file():
    """
    Test OpAlgoBase with a custom shader loaded from a file path.
    """
    import os

    tensor_in_a = Tensor([2, 2, 2])
    tensor_in_b = Tensor([1, 2, 3])
    tensor_out = Tensor([0, 0, 0])

    mgr = Manager()

    # Resolve the shader relative to this test file rather than the current
    # working directory, so the test also passes when launched from another
    # directory (e.g. the repository root).
    test_dir = os.path.dirname(os.path.abspath(__file__))
    shaderFilePath = os.path.normpath(
        os.path.join(test_dir, "..", "..", "shaders", "glsl", "opmult.comp"))

    mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])

    mgr.eval_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)

    mgr.eval_tensor_sync_local_def([tensor_out])

    assert tensor_out.data() == [2.0, 4.0, 6.0]
|
||||
|
||||
def test_sequence():
    """
    Test an explicitly created Sequence: run a shader asynchronously through
    the Manager, then record and evaluate sync operations on a named Sequence.
    """
    import os

    # NOTE(review): queue family index 2 is hardware specific and may not
    # exist on every GPU - confirm this is intended for the test machines.
    mgr = Manager(0, [2])

    tensor_in_a = Tensor([2, 2, 2])
    tensor_in_b = Tensor([1, 2, 3])
    tensor_out = Tensor([0, 0, 0])

    mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])

    seq = mgr.create_sequence("op")

    # Resolve the shader relative to this test file rather than the current
    # working directory, so the test also passes when launched from another
    # directory (e.g. the repository root).
    test_dir = os.path.dirname(os.path.abspath(__file__))
    shaderFilePath = os.path.normpath(
        os.path.join(test_dir, "..", "..", "shaders", "glsl", "opmult.comp"))

    mgr.eval_async_algo_file_def([tensor_in_a, tensor_in_b, tensor_out], shaderFilePath)
    mgr.eval_await_def()

    # Record the three sync operations as one batch on the named sequence.
    seq.begin()
    seq.record_tensor_sync_local([tensor_in_a])
    seq.record_tensor_sync_local([tensor_in_b])
    seq.record_tensor_sync_local([tensor_out])
    seq.end()

    seq.eval()

    assert tensor_out.data() == [2.0, 4.0, 6.0]
|
||||
|
||||
# When executed directly as a script, only the sequence test is run.
if "__main__" == __name__:
    test_sequence()
|
||||
76
setup.py
Normal file
76
setup.py
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
import os
|
||||
import re
|
||||
import sys
|
||||
import platform
|
||||
import subprocess
|
||||
|
||||
from setuptools import setup, Extension
|
||||
from setuptools.command.build_ext import build_ext
|
||||
from distutils.version import LooseVersion
|
||||
|
||||
|
||||
class CMakeExtension(Extension):
    """Extension placeholder whose sources are compiled by CMake.

    Args:
        name: Name of the extension module.
        sourcedir: Directory passed to CMake as the source tree; stored as an
            absolute path in ``self.sourcedir``.
    """

    def __init__(self, name, sourcedir=''):
        # The source list is left empty: CMakeBuild invokes CMake on
        # ``sourcedir`` instead of compiling listed files.
        Extension.__init__(self, name, sources=[])
        absolute_sourcedir = os.path.abspath(sourcedir)
        self.sourcedir = absolute_sourcedir
|
||||
|
||||
|
||||
class CMakeBuild(build_ext):
    """``build_ext`` command that configures and builds extensions with CMake."""

    def run(self):
        """Check that CMake is usable, then build every registered extension.

        Raises:
            RuntimeError: if the ``cmake`` executable cannot be run, or, on
                Windows, if the detected CMake version is older than 3.1.0.
        """
        try:
            out = subprocess.check_output(['cmake', '--version'])
        except OSError:
            raise RuntimeError("CMake must be installed to build the following extensions: " +
                               ", ".join(e.name for e in self.extensions))

        if platform.system() == "Windows":
            cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1))
            if cmake_version < '3.1.0':
                raise RuntimeError("CMake >= 3.1.0 is required on Windows")

        for ext in self.extensions:
            self.build_extension(ext)

    def build_extension(self, ext):
        """Configure and build a single extension in ``self.build_temp``.

        Args:
            ext: Extension to build; its ``sourcedir`` is handed to CMake as
                the source tree and its ``name`` determines the output folder.

        Raises:
            subprocess.CalledProcessError: if the CMake configure or build
                step exits with a non-zero status.
        """
        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
        # required for auto-detection of auxiliary "native" libs
        if not extdir.endswith(os.path.sep):
            extdir += os.path.sep

        cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
                      '-DKOMPUTE_OPT_BUILD_PYTHON=1',
                      '-DKOMPUTE_OPT_BUILD_SINGLE_HEADER=1',
                      '-DPYTHON_EXECUTABLE=' + sys.executable]

        cfg = 'Debug' if self.debug else 'Release'
        build_args = ['--config', cfg]

        if platform.system() == "Windows":
            cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir)]
            if sys.maxsize > 2**32:
                cmake_args += ['-A', 'x64']
            build_args += ['--', '/m']
        else:
            # The arguments are passed as an argv list (no shell), so the
            # value must not be wrapped in quote characters: they would be
            # stored literally in the CMake cache entry.
            cmake_args += ['-DKOMPUTE_EXTRA_CXX_FLAGS=-fPIC']
            cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
            build_args += ['--', '-j2']

        # Expose the package version to the C++ sources as VERSION_INFO.
        env = os.environ.copy()
        env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
                                                              self.distribution.get_version())
        if not os.path.exists(self.build_temp):
            os.makedirs(self.build_temp)

        subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
        subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
|
||||
|
||||
# Package metadata. The single `kp` extension is built through CMake by the
# custom build_ext command registered in `cmdclass`.
setup(
    name='kp',
    version='0.0.1',
    author='Alejandro Saucedo',
    description='Vulkan Kompute: Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU processing usecases.',
    long_description='',
    ext_modules=[CMakeExtension('kp')],
    cmdclass={'build_ext': CMakeBuild},
    zip_safe=False,
)
|
||||
|
|
@ -1100,6 +1100,12 @@ class Sequence
|
|||
*/
|
||||
bool isInit();
|
||||
|
||||
/**
|
||||
* Destroys and frees the GPU resources which include the buffer and memory
|
||||
* and sets the sequence as init=False.
|
||||
*/
|
||||
void freeMemoryDestroyGPUResources();
|
||||
|
||||
/**
|
||||
* Record function for operation to be added to the GPU queue in batch. This
|
||||
* template requires classes to be derived from the OpBase class. This
|
||||
|
|
@ -1301,9 +1307,9 @@ class Manager
|
|||
*
|
||||
* @param sequenceName The name for the named sequence to be retrieved or
|
||||
* created
|
||||
* @return Weak pointer to the manager owned sequence resource
|
||||
* @return Shared pointer to the manager owned sequence resource
|
||||
*/
|
||||
std::weak_ptr<Sequence> getOrCreateManagedSequence(
|
||||
std::shared_ptr<Sequence> getOrCreateManagedSequence(
|
||||
std::string sequenceName);
|
||||
|
||||
/**
|
||||
|
|
@ -1315,8 +1321,9 @@ class Manager
|
|||
* @param queueIndex The queue to use from the available queues
|
||||
* @return Shared pointer to the manager owned sequence resource
|
||||
*/
|
||||
std::weak_ptr<Sequence> createManagedSequence(std::string sequenceName = "",
|
||||
uint32_t queueIndex = 0);
|
||||
std::shared_ptr<Sequence> createManagedSequence(
|
||||
std::string sequenceName = "",
|
||||
uint32_t queueIndex = 0);
|
||||
|
||||
/**
|
||||
* Function that evaluates operation against named sequence.
|
||||
|
|
@ -1332,22 +1339,21 @@ class Manager
|
|||
TArgs&&... params)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp triggered");
|
||||
std::weak_ptr<Sequence> sqWeakPtr =
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
this->getOrCreateManagedSequence(sequenceName);
|
||||
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
|
||||
sq->begin();
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
|
||||
sq->begin();
|
||||
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
|
||||
sq->record<T>(tensors, std::forward<TArgs>(params)...);
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
|
||||
sq->record<T>(tensors, std::forward<TArgs>(params)...);
|
||||
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
|
||||
sq->end();
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
|
||||
sq->end();
|
||||
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
|
||||
sq->eval();
|
||||
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
|
||||
sq->eval();
|
||||
}
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS");
|
||||
}
|
||||
|
||||
|
|
@ -1385,26 +1391,21 @@ class Manager
|
|||
{
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered");
|
||||
|
||||
std::weak_ptr<Sequence> sqWeakPtr =
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
this->getOrCreateManagedSequence(sequenceName);
|
||||
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
|
||||
sq->begin();
|
||||
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
|
||||
sq->begin();
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
|
||||
sq->record<T>(tensors, std::forward<TArgs>(params)...);
|
||||
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
|
||||
sq->record<T>(tensors, std::forward<TArgs>(params)...);
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
|
||||
sq->end();
|
||||
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
|
||||
sq->end();
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
|
||||
sq->evalAsync();
|
||||
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
|
||||
sq->evalAsync();
|
||||
} else {
|
||||
SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found",
|
||||
sequenceName);
|
||||
}
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
|
||||
}
|
||||
|
||||
|
|
@ -1620,20 +1621,17 @@ namespace kp {
|
|||
* Operation that provides a general abstraction that simplifies the use of
|
||||
* algorithm and parameter components which can be used with shaders.
|
||||
* By default it enables the user to provide a dynamic number of tensors
|
||||
* which are then passed as inputs.
|
||||
*
|
||||
* All of these tensors are expected to be initlaised and this is checked with throw std exception in the init function.
|
||||
*
|
||||
* See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
|
||||
*
|
||||
* The template parameters specify the processing GPU layout number of
|
||||
* iterations for each x, y, z parameter. More specifically, this will be the
|
||||
* input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
|
||||
* which are then passed as inputs.
|
||||
*/
|
||||
template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
|
||||
class OpAlgoBase : public OpBase
|
||||
{
|
||||
public:
|
||||
struct KomputeWorkgroup {
|
||||
uint32_t x;
|
||||
uint32_t y;
|
||||
uint32_t z;
|
||||
};
|
||||
|
||||
/**
|
||||
* Base constructor, should not be used unless explicitly intended.
|
||||
*/
|
||||
|
|
@ -1649,11 +1647,13 @@ class OpAlgoBase : public OpBase
|
|||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
|
||||
* @param komputeWorkgroup Optional parameter to specify the layout for processing
|
||||
*/
|
||||
OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors);
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
|
||||
|
||||
/**
|
||||
* Constructor that enables a file to be passed to the operation with
|
||||
|
|
@ -1664,13 +1664,15 @@ class OpAlgoBase : public OpBase
|
|||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
|
||||
* @param shaderFilePath Parameter to specify the shader to load (either in spirv or raw format)
|
||||
* @param komputeWorkgroup Optional parameter to specify the layout for processing
|
||||
*/
|
||||
OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
std::string shaderFilePath);
|
||||
std::string shaderFilePath,
|
||||
KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
|
||||
|
||||
/**
|
||||
* Constructor that enables raw shader data to be passed to the main operation
|
||||
|
|
@ -1681,12 +1683,14 @@ class OpAlgoBase : public OpBase
|
|||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
|
||||
* @param komputeWorkgroup Optional parameter to specify the layout for processing
|
||||
*/
|
||||
OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
const std::vector<char>& shaderDataRaw);
|
||||
const std::vector<char>& shaderDataRaw,
|
||||
KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
|
||||
|
||||
/**
|
||||
* Default destructor, which is in charge of destroying the algorithm
|
||||
|
|
@ -1733,9 +1737,7 @@ class OpAlgoBase : public OpBase
|
|||
|
||||
// -------------- ALWAYS OWNED RESOURCES
|
||||
|
||||
uint32_t mX;
|
||||
uint32_t mY;
|
||||
uint32_t mZ;
|
||||
KomputeWorkgroup mKomputeWorkgroup;
|
||||
|
||||
std::string mShaderFilePath; ///< Optional member variable which can be provided for the OpAlgoBase to find the data automatically and load for processing
|
||||
std::vector<char> mShaderDataRaw; ///< Optional member variable which can be provided to contain either the raw shader content or the spirv binary content
|
||||
|
|
@ -1745,177 +1747,6 @@ class OpAlgoBase : public OpBase
|
|||
|
||||
} // End namespace kp
|
||||
|
||||
// Including implementation for template class
|
||||
#ifndef OPALGOBASE_IMPL
|
||||
#define OPALGOBASE_IMPL
|
||||
|
||||
namespace kp {
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoBase<tX, tY, tZ>::OpAlgoBase()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase constructor base");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors)
|
||||
: OpBase(physicalDevice, device, commandBuffer, tensors, false)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
|
||||
|
||||
// The dispatch size is set up based on either explicitly provided template
|
||||
// parameters or by default it would take the shape and size of the tensors
|
||||
if (tX > 0) {
|
||||
// If at least the x value is provided we use mainly the parameters
|
||||
// provided
|
||||
this->mX = tX;
|
||||
this->mY = tY > 0 ? tY : 1;
|
||||
this->mZ = tZ > 0 ? tZ : 1;
|
||||
} else {
|
||||
this->mX = tensors[0]->size();
|
||||
this->mY = 1;
|
||||
this->mZ = 1;
|
||||
}
|
||||
SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
|
||||
this->mX,
|
||||
this->mY,
|
||||
this->mZ);
|
||||
|
||||
this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
std::string shaderFilePath)
|
||||
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
|
||||
|
||||
this->mShaderFilePath = shaderFilePath;
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
const std::vector<char>& shaderDataRaw)
|
||||
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
|
||||
|
||||
this->mShaderDataRaw = shaderDataRaw;
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoBase<tX, tY, tZ>::init()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase init called");
|
||||
|
||||
if (this->mTensors.size() < 1) {
|
||||
throw std::runtime_error(
|
||||
"Kompute OpAlgoBase called with less than 1 tensor");
|
||||
}
|
||||
|
||||
for (std::shared_ptr<Tensor> tensor : this->mTensors) {
|
||||
if(!tensor->isInit()) {
|
||||
throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised.");
|
||||
}
|
||||
}
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
|
||||
|
||||
std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
|
||||
|
||||
this->mAlgorithm->init(shaderFileData, this->mTensors);
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoBase<tX, tY, tZ>::record()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase record called");
|
||||
|
||||
// Barrier to ensure the data is finished writing to buffer memory
|
||||
for (std::shared_ptr<Tensor> tensor : this->mTensors) {
|
||||
tensor->recordBufferMemoryBarrier(
|
||||
this->mCommandBuffer,
|
||||
vk::AccessFlagBits::eHostWrite,
|
||||
vk::AccessFlagBits::eShaderRead,
|
||||
vk::PipelineStageFlagBits::eHost,
|
||||
vk::PipelineStageFlagBits::eComputeShader);
|
||||
}
|
||||
|
||||
this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoBase<tX, tY, tZ>::preEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoBase<tX, tY, tZ>::postEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
std::vector<char> OpAlgoBase<tX, tY, tZ>::fetchSpirvBinaryData()
|
||||
{
|
||||
SPDLOG_WARN(
|
||||
"Kompute OpAlgoBase Running shaders directly from spirv file");
|
||||
|
||||
if (this->mShaderFilePath.size()) {
|
||||
std::ifstream fileStream(this->mShaderFilePath,
|
||||
std::ios::binary | std::ios::in | std::ios::ate);
|
||||
|
||||
if (!fileStream.good()) {
|
||||
throw std::runtime_error("Error reading file: " + this->mShaderFilePath);
|
||||
}
|
||||
|
||||
size_t shaderFileSize = fileStream.tellg();
|
||||
fileStream.seekg(0, std::ios::beg);
|
||||
char* shaderDataRaw = new char[shaderFileSize];
|
||||
fileStream.read(shaderDataRaw, shaderFileSize);
|
||||
fileStream.close();
|
||||
|
||||
SPDLOG_WARN(
|
||||
"Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
|
||||
|
||||
return std::vector<char>(shaderDataRaw,
|
||||
shaderDataRaw + shaderFileSize);
|
||||
}
|
||||
else if (this->mShaderDataRaw.size()) {
|
||||
return this->mShaderDataRaw;
|
||||
}
|
||||
else {
|
||||
throw std::runtime_error("Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither filepath nor data provided");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif // #ifndef OPALGOBASE_IMPL
|
||||
|
||||
#include <fstream>
|
||||
|
||||
namespace kp {
|
||||
|
|
@ -1924,12 +1755,8 @@ namespace kp {
|
|||
* Operation base class to simplify the creation of operations that require
|
||||
* right hand and left hand side datapoints together with a single output.
|
||||
* The expected data passed is two input tensors and one output tensor.
|
||||
* The template parameters specify the processing GPU layout number of
|
||||
* iterations for each x, y, z parameter. More specifically, this will be the
|
||||
* input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
|
||||
*/
|
||||
template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
|
||||
class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
|
||||
class OpAlgoLhsRhsOut : public OpAlgoBase
|
||||
{
|
||||
public:
|
||||
/**
|
||||
|
|
@ -1947,11 +1774,13 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
|
|||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param freeTensors Whether operation manages the memory of the Tensors
|
||||
* @param komputeWorkgroup Optional parameter to specify the layout for processing
|
||||
*/
|
||||
OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>> tensors);
|
||||
std::vector<std::shared_ptr<Tensor>> tensors,
|
||||
KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
|
||||
|
||||
/**
|
||||
* Default destructor, which is in charge of destroying the algorithm
|
||||
|
|
@ -1982,7 +1811,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
|
|||
* of the GPU Device memory into the staging buffer so the output data can
|
||||
* be retrieved.
|
||||
*/
|
||||
virtual void postSubmit() override;
|
||||
virtual void postEval() override;
|
||||
|
||||
protected:
|
||||
// -------------- NEVER OWNED RESOURCES
|
||||
|
|
@ -1996,138 +1825,6 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
|
|||
|
||||
} // End namespace kp
|
||||
|
||||
// Including implementation for template class
|
||||
#ifndef OPALGOLHSRHSOUT_CPP
|
||||
#define OPALGOLHSRHSOUT_CPP
|
||||
|
||||
namespace kp {
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>> tensors)
|
||||
// The inheritance is initialised with the copyOutputData to false given that
|
||||
// this depencendant class handles the transfer of data via staging buffers in
|
||||
// a granular way.
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoLhsRhsOut<tX, tY, tZ>::~OpAlgoLhsRhsOut()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut destructor started");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoLhsRhsOut<tX, tY, tZ>::init()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut init called");
|
||||
|
||||
if (this->mTensors.size() < 3) {
|
||||
throw std::runtime_error(
|
||||
"Kompute OpAlgoLhsRhsOut called with less than 1 tensor");
|
||||
} else if (this->mTensors.size() > 3) {
|
||||
SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
|
||||
}
|
||||
|
||||
this->mTensorLHS = this->mTensors[0];
|
||||
this->mTensorRHS = this->mTensors[1];
|
||||
this->mTensorOutput = this->mTensors[2];
|
||||
|
||||
if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
|
||||
this->mTensorOutput->isInit())) {
|
||||
throw std::runtime_error(
|
||||
"Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. LHS: " +
|
||||
std::to_string(this->mTensorLHS->isInit()) +
|
||||
" RHS: " + std::to_string(this->mTensorRHS->isInit()) +
|
||||
" Output: " + std::to_string(this->mTensorOutput->isInit()));
|
||||
}
|
||||
|
||||
if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
|
||||
this->mTensorRHS->size() == this->mTensorOutput->size())) {
|
||||
throw std::runtime_error(
|
||||
"Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size LHS: " +
|
||||
std::to_string(this->mTensorLHS->size()) +
|
||||
" RHS: " + std::to_string(this->mTensorRHS->size()) +
|
||||
" Output: " + std::to_string(this->mTensorOutput->size()));
|
||||
}
|
||||
|
||||
this->mTensorOutputStaging = std::make_shared<Tensor>(
|
||||
this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
|
||||
|
||||
this->mTensorOutputStaging->init(
|
||||
this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
|
||||
|
||||
std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut Initialising algorithm component");
|
||||
|
||||
this->mAlgorithm->init(shaderFileData, this->mTensors);
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoLhsRhsOut<tX, tY, tZ>::record()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut record called");
|
||||
|
||||
// Barrier to ensure the data is finished writing to buffer memory
|
||||
this->mTensorLHS->recordBufferMemoryBarrier(
|
||||
this->mCommandBuffer,
|
||||
vk::AccessFlagBits::eHostWrite,
|
||||
vk::AccessFlagBits::eShaderRead,
|
||||
vk::PipelineStageFlagBits::eHost,
|
||||
vk::PipelineStageFlagBits::eComputeShader);
|
||||
this->mTensorRHS->recordBufferMemoryBarrier(
|
||||
this->mCommandBuffer,
|
||||
vk::AccessFlagBits::eHostWrite,
|
||||
vk::AccessFlagBits::eShaderRead,
|
||||
vk::PipelineStageFlagBits::eHost,
|
||||
vk::PipelineStageFlagBits::eComputeShader);
|
||||
|
||||
this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
|
||||
|
||||
// Barrier to ensure the shader code is executed before buffer read
|
||||
this->mTensorOutput->recordBufferMemoryBarrier(
|
||||
this->mCommandBuffer,
|
||||
vk::AccessFlagBits::eShaderWrite,
|
||||
vk::AccessFlagBits::eTransferRead,
|
||||
vk::PipelineStageFlagBits::eComputeShader,
|
||||
vk::PipelineStageFlagBits::eTransfer);
|
||||
|
||||
this->mTensorOutputStaging->recordCopyFrom(
|
||||
this->mCommandBuffer,
|
||||
this->mTensorOutput,
|
||||
true);
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoLhsRhsOut<tX, tY, tZ>::postSubmit()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");
|
||||
|
||||
this->mTensorOutputStaging->mapDataFromHostMemory();
|
||||
|
||||
this->mTensorOutput->setData(this->mTensorOutputStaging->data());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif // #ifndef OPALGOLHSRHSOUT_CPP
|
||||
|
||||
#include <fstream>
|
||||
|
||||
#if RELEASE
|
||||
|
|
@ -2138,12 +1835,9 @@ namespace kp {
|
|||
|
||||
/**
|
||||
* Operation that performs multiplication on two tensors and outputs on a third
|
||||
* tensor. The template parameters specify the processing GPU layout number of
|
||||
* iterations for each x, y, z parameter. More specifically, this will be the
|
||||
* input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
|
||||
* tensor.
|
||||
*/
|
||||
template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
|
||||
class OpMult : public OpAlgoBase<tX, tY, tZ>
|
||||
class OpMult : public OpAlgoBase
|
||||
{
|
||||
public:
|
||||
/**
|
||||
|
|
@ -2162,13 +1856,14 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
|
|||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param freeTensors Whether operation manages the memory of the Tensors
|
||||
* @param komputeWorkgroup Optional parameter to specify the layout for processing
|
||||
*/
|
||||
OpMult(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>> tensors)
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
|
||||
std::vector<std::shared_ptr<Tensor>> tensors,
|
||||
KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup())
|
||||
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "", komputeWorkgroup)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpMult constructor with params");
|
||||
|
||||
|
|
@ -2179,14 +1874,8 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
|
|||
|
||||
#if RELEASE
|
||||
/**
|
||||
* If release it will be using the static version of the shader which is
|
||||
* loaded using this file directly.
|
||||
*
|
||||
* @param physicalDevice Vulkan physical device used to find device queues
|
||||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param freeTensors Whether operation manages the memory of the Tensors
|
||||
* If RELEASE=1 it will be using the static version of the shader which is
|
||||
* loaded using this file directly. Otherwise it should not override the function.
|
||||
*/
|
||||
std::vector<char> fetchSpirvBinaryData() override
|
||||
{
|
||||
|
|
|
|||
|
|
@ -34,7 +34,9 @@ Algorithm::~Algorithm()
|
|||
SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
|
||||
"pipeline but it is null");
|
||||
}
|
||||
this->mDevice->destroy(*this->mPipeline, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mDevice->destroy(
|
||||
*this->mPipeline,
|
||||
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
}
|
||||
|
||||
if (this->mFreePipelineCache) {
|
||||
|
|
@ -43,7 +45,9 @@ Algorithm::~Algorithm()
|
|||
SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
|
||||
"pipeline cache but it is null");
|
||||
}
|
||||
this->mDevice->destroy(*this->mPipelineCache, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mDevice->destroy(
|
||||
*this->mPipelineCache,
|
||||
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
}
|
||||
|
||||
if (this->mFreePipelineLayout) {
|
||||
|
|
@ -52,7 +56,9 @@ Algorithm::~Algorithm()
|
|||
SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
|
||||
"pipeline layout but it is null");
|
||||
}
|
||||
this->mDevice->destroy(*this->mPipelineLayout, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mDevice->destroy(
|
||||
*this->mPipelineLayout,
|
||||
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
}
|
||||
|
||||
if (this->mFreeShaderModule) {
|
||||
|
|
@ -61,7 +67,9 @@ Algorithm::~Algorithm()
|
|||
SPDLOG_ERROR("Kompute Algorithm Error requested to destroy shader "
|
||||
"module but it is null");
|
||||
}
|
||||
this->mDevice->destroy(*this->mShaderModule, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mDevice->destroy(
|
||||
*this->mShaderModule,
|
||||
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
}
|
||||
|
||||
if (this->mFreeDescriptorSet) {
|
||||
|
|
@ -80,7 +88,9 @@ Algorithm::~Algorithm()
|
|||
SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
|
||||
"descriptor set layout but it is null");
|
||||
}
|
||||
this->mDevice->destroy(*this->mDescriptorSetLayout, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mDevice->destroy(
|
||||
*this->mDescriptorSetLayout,
|
||||
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
}
|
||||
|
||||
if (this->mFreeDescriptorPool) {
|
||||
|
|
@ -89,7 +99,9 @@ Algorithm::~Algorithm()
|
|||
SPDLOG_ERROR("Kompute Algorithm Error requested to destroy "
|
||||
"descriptor pool but it is null");
|
||||
}
|
||||
this->mDevice->destroy(*this->mDescriptorPool, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mDevice->destroy(
|
||||
*this->mDescriptorPool,
|
||||
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -48,7 +48,8 @@ if(KOMPUTE_OPT_ANDOID_BUILD)
|
|||
${PROJECT_SOURCE_DIR}/vk_ndk_wrapper_include/kompute_vk_ndk_wrapper.cpp)
|
||||
endif()
|
||||
|
||||
add_library(kompute
|
||||
add_library(
|
||||
kompute STATIC
|
||||
${kompute_CPP})
|
||||
|
||||
target_include_directories(
|
||||
|
|
|
|||
17
src/Manager.cpp
Normal file → Executable file
17
src/Manager.cpp
Normal file → Executable file
|
|
@ -59,13 +59,19 @@ Manager::~Manager()
|
|||
}
|
||||
|
||||
if (this->mManagedSequences.size()) {
|
||||
SPDLOG_DEBUG("Releasing managed sequence");
|
||||
SPDLOG_DEBUG("Kompute Manager explicitly running destructor for "
|
||||
"managed sequences");
|
||||
for (const std::pair<std::string, std::shared_ptr<Sequence>>& sqPair :
|
||||
this->mManagedSequences) {
|
||||
sqPair.second->freeMemoryDestroyGPUResources();
|
||||
}
|
||||
this->mManagedSequences.clear();
|
||||
}
|
||||
|
||||
if (this->mFreeDevice) {
|
||||
SPDLOG_INFO("Destroying device");
|
||||
this->mDevice->destroy((vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mDevice->destroy(
|
||||
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
SPDLOG_DEBUG("Kompute Manager Destroyed Device");
|
||||
}
|
||||
|
||||
|
|
@ -86,12 +92,13 @@ Manager::~Manager()
|
|||
#endif
|
||||
|
||||
if (this->mFreeInstance) {
|
||||
this->mInstance->destroy((vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mInstance->destroy(
|
||||
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
SPDLOG_DEBUG("Kompute Manager Destroyed Instance");
|
||||
}
|
||||
}
|
||||
|
||||
std::weak_ptr<Sequence>
|
||||
std::shared_ptr<Sequence>
|
||||
Manager::getOrCreateManagedSequence(std::string sequenceName)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute Manager creating Sequence object");
|
||||
|
|
@ -106,7 +113,7 @@ Manager::getOrCreateManagedSequence(std::string sequenceName)
|
|||
}
|
||||
}
|
||||
|
||||
std::weak_ptr<Sequence>
|
||||
std::shared_ptr<Sequence>
|
||||
Manager::createManagedSequence(std::string sequenceName, uint32_t queueIndex)
|
||||
{
|
||||
|
||||
|
|
|
|||
170
src/OpAlgoBase.cpp
Normal file
170
src/OpAlgoBase.cpp
Normal file
|
|
@ -0,0 +1,170 @@
|
|||
#pragma once
|
||||
|
||||
#include "kompute/operations/OpAlgoBase.hpp"
|
||||
|
||||
namespace kp {
|
||||
|
||||
/**
 * Base constructor. It only emits a debug log entry; no member is
 * explicitly assigned here.
 */
OpAlgoBase::OpAlgoBase()
{
    SPDLOG_DEBUG("Kompute OpAlgoBase constructor base");
}
|
||||
|
||||
/**
 * Constructor taking the Vulkan components, the tensors of the operation
 * and an optional workgroup layout.
 *
 * Forwards the components to OpBase (with the trailing flag set to false),
 * resolves the workgroup used for the dispatch and creates the Algorithm
 * component.
 *
 * Workgroup resolution:
 *  - komputeWorkgroup.x > 0: x is used as given; y and z are used when
 *    greater than 0 and default to 1 otherwise.
 *  - komputeWorkgroup.x == 0: x is the size of the first tensor, y and z
 *    are 1.
 *
 * The explicit x value used to be replaced by a literal 0, which made every
 * explicitly configured operation dispatch zero groups on X.
 *
 * @param physicalDevice Vulkan physical device, forwarded to OpBase
 * @param device Vulkan logical device, also handed to the Algorithm
 * @param commandBuffer Vulkan command buffer, also handed to the Algorithm
 * @param tensors Tensors used by this operation
 * @param komputeWorkgroup Optional layout for the dispatch
 * @throws std::runtime_error if no workgroup x is given and tensors is
 * empty, since the size cannot be derived (previously an unchecked
 * tensors[0] access).
 */
OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
                       std::shared_ptr<vk::Device> device,
                       std::shared_ptr<vk::CommandBuffer> commandBuffer,
                       std::vector<std::shared_ptr<Tensor>>& tensors,
                       KomputeWorkgroup komputeWorkgroup)
  : OpBase(physicalDevice, device, commandBuffer, tensors, false)
{
    SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}",
                 tensors.size());

    // The dispatch size is set up based on either the explicitly provided
    // workgroup or by default it would take the size of the first tensor
    if (komputeWorkgroup.x > 0) {
        // If at least the x value is provided we use mainly the parameters
        // provided
        this->mKomputeWorkgroup = {
            komputeWorkgroup.x,
            komputeWorkgroup.y > 0 ? komputeWorkgroup.y : 1,
            komputeWorkgroup.z > 0 ? komputeWorkgroup.z : 1
        };
    } else {
        if (tensors.empty()) {
            throw std::runtime_error(
              "Kompute OpAlgoBase requires at least one tensor when no "
              "workgroup x is provided");
        }
        this->mKomputeWorkgroup = { tensors[0]->size(), 1, 1 };
    }
    SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
                this->mKomputeWorkgroup.x,
                this->mKomputeWorkgroup.y,
                this->mKomputeWorkgroup.z);

    this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
}
|
||||
|
||||
/**
 * Constructor that additionally records the path of the shader file.
 *
 * Delegates to the workgroup constructor and then stores the path in
 * mShaderFilePath.
 *
 * @param physicalDevice Vulkan physical device
 * @param device Vulkan logical device
 * @param commandBuffer Vulkan command buffer
 * @param tensors Tensors used by this operation
 * @param shaderFilePath Path of the shader file to load
 * @param komputeWorkgroup Layout for the dispatch, forwarded as is
 */
OpAlgoBase::OpAlgoBase(
  std::shared_ptr<vk::PhysicalDevice> physicalDevice,
  std::shared_ptr<vk::Device> device,
  std::shared_ptr<vk::CommandBuffer> commandBuffer,
  std::vector<std::shared_ptr<Tensor>>& tensors,
  std::string shaderFilePath,
  KomputeWorkgroup komputeWorkgroup)
  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
{
    SPDLOG_DEBUG(
      "Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}",
      shaderFilePath);

    mShaderFilePath = shaderFilePath;
}
|
||||
|
||||
/**
 * Constructor that additionally records raw shader data.
 *
 * Delegates to the workgroup constructor and then copies the data into
 * mShaderDataRaw.
 *
 * @param physicalDevice Vulkan physical device
 * @param device Vulkan logical device
 * @param commandBuffer Vulkan command buffer
 * @param tensors Tensors used by this operation
 * @param shaderDataRaw Shader content to keep for later loading
 * @param komputeWorkgroup Layout for the dispatch, forwarded as is
 */
OpAlgoBase::OpAlgoBase(
  std::shared_ptr<vk::PhysicalDevice> physicalDevice,
  std::shared_ptr<vk::Device> device,
  std::shared_ptr<vk::CommandBuffer> commandBuffer,
  std::vector<std::shared_ptr<Tensor>>& tensors,
  const std::vector<char>& shaderDataRaw,
  KomputeWorkgroup komputeWorkgroup)
  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
{
    SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw "
                 "data length: {}",
                 shaderDataRaw.size());

    mShaderDataRaw = shaderDataRaw;
}
|
||||
|
||||
/**
 * Destructor. It only emits a debug log entry; nothing is released
 * explicitly in this body.
 */
OpAlgoBase::~OpAlgoBase()
{
    SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
}
|
||||
|
||||
void
|
||||
OpAlgoBase::init()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase init called");
|
||||
|
||||
if (this->mTensors.size() < 1) {
|
||||
throw std::runtime_error(
|
||||
"Kompute OpAlgoBase called with less than 1 tensor");
|
||||
}
|
||||
|
||||
for (std::shared_ptr<Tensor> tensor : this->mTensors) {
|
||||
if (!tensor->isInit()) {
|
||||
throw std::runtime_error(
|
||||
"Kompute OpAlgoBase validation failed; all tensor parameters "
|
||||
"must be initialised.");
|
||||
}
|
||||
}
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
|
||||
|
||||
std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
|
||||
|
||||
this->mAlgorithm->init(shaderFileData, this->mTensors);
|
||||
}
|
||||
|
||||
void
|
||||
OpAlgoBase::record()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase record called");
|
||||
|
||||
// Barrier to ensure the data is finished writing to buffer memory
|
||||
for (std::shared_ptr<Tensor> tensor : this->mTensors) {
|
||||
tensor->recordBufferMemoryBarrier(
|
||||
this->mCommandBuffer,
|
||||
vk::AccessFlagBits::eHostWrite,
|
||||
vk::AccessFlagBits::eShaderRead,
|
||||
vk::PipelineStageFlagBits::eHost,
|
||||
vk::PipelineStageFlagBits::eComputeShader);
|
||||
}
|
||||
|
||||
this->mAlgorithm->recordDispatch(this->mKomputeWorkgroup.x,
|
||||
this->mKomputeWorkgroup.y,
|
||||
this->mKomputeWorkgroup.z);
|
||||
}
|
||||
|
||||
void
|
||||
OpAlgoBase::preEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
|
||||
}
|
||||
|
||||
void
|
||||
OpAlgoBase::postEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
|
||||
}
|
||||
|
||||
std::vector<char>
|
||||
OpAlgoBase::fetchSpirvBinaryData()
|
||||
{
|
||||
SPDLOG_WARN("Kompute OpAlgoBase Running shaders directly from spirv file");
|
||||
|
||||
if (this->mShaderFilePath.size()) {
|
||||
std::ifstream fileStream(this->mShaderFilePath,
|
||||
std::ios::binary | std::ios::in |
|
||||
std::ios::ate);
|
||||
|
||||
if (!fileStream.good()) {
|
||||
throw std::runtime_error("Error reading file: " +
|
||||
this->mShaderFilePath);
|
||||
}
|
||||
|
||||
size_t shaderFileSize = fileStream.tellg();
|
||||
fileStream.seekg(0, std::ios::beg);
|
||||
char* shaderDataRaw = new char[shaderFileSize];
|
||||
fileStream.read(shaderDataRaw, shaderFileSize);
|
||||
fileStream.close();
|
||||
|
||||
SPDLOG_WARN("Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
|
||||
|
||||
return std::vector<char>(shaderDataRaw, shaderDataRaw + shaderFileSize);
|
||||
} else if (this->mShaderDataRaw.size()) {
|
||||
return this->mShaderDataRaw;
|
||||
} else {
|
||||
throw std::runtime_error(
|
||||
"Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither "
|
||||
"filepath nor data provided");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
127
src/OpAlgoLhsRhsOut.cpp
Normal file
127
src/OpAlgoLhsRhsOut.cpp
Normal file
|
|
@ -0,0 +1,127 @@
|
|||
#pragma once
|
||||
|
||||
#include "kompute/operations/OpAlgoLhsRhsOut.hpp"
|
||||
|
||||
namespace kp {
|
||||
|
||||
/**
 * Base constructor. It only emits a debug log entry; no member is
 * explicitly assigned here.
 */
OpAlgoLhsRhsOut::OpAlgoLhsRhsOut()
{
    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
}
|
||||
|
||||
/**
 * Constructor taking the Vulkan components, the tensors of the operation
 * and the workgroup layout. All arguments are forwarded unchanged to the
 * OpAlgoBase constructor.
 *
 * @param physicalDevice Vulkan physical device
 * @param device Vulkan logical device
 * @param commandBuffer Vulkan command buffer
 * @param tensors Tensors used by this operation
 * @param komputeWorkgroup Layout for the dispatch
 */
OpAlgoLhsRhsOut::OpAlgoLhsRhsOut(
  std::shared_ptr<vk::PhysicalDevice> physicalDevice,
  std::shared_ptr<vk::Device> device,
  std::shared_ptr<vk::CommandBuffer> commandBuffer,
  std::vector<std::shared_ptr<Tensor>> tensors,
  KomputeWorkgroup komputeWorkgroup)
  // The base class is initialised with copyOutputData set to false because
  // this dependent class handles the transfer of data via staging buffers
  // in a granular way.
  : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, komputeWorkgroup)
{
    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
}
|
||||
|
||||
/**
 * Destructor. Only emits a debug log entry; no resources are released
 * explicitly here.
 */
OpAlgoLhsRhsOut::~OpAlgoLhsRhsOut()
{
    SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut destructor started");
}
|
||||
|
||||
void
|
||||
OpAlgoLhsRhsOut::init()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut init called");
|
||||
|
||||
if (this->mTensors.size() < 3) {
|
||||
throw std::runtime_error(
|
||||
"Kompute OpAlgoLhsRhsOut called with less than 1 tensor");
|
||||
} else if (this->mTensors.size() > 3) {
|
||||
SPDLOG_WARN(
|
||||
"Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
|
||||
}
|
||||
|
||||
this->mTensorLHS = this->mTensors[0];
|
||||
this->mTensorRHS = this->mTensors[1];
|
||||
this->mTensorOutput = this->mTensors[2];
|
||||
|
||||
if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
|
||||
this->mTensorOutput->isInit())) {
|
||||
throw std::runtime_error(
|
||||
"Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. "
|
||||
"LHS: " +
|
||||
std::to_string(this->mTensorLHS->isInit()) +
|
||||
" RHS: " + std::to_string(this->mTensorRHS->isInit()) +
|
||||
" Output: " + std::to_string(this->mTensorOutput->isInit()));
|
||||
}
|
||||
|
||||
if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
|
||||
this->mTensorRHS->size() == this->mTensorOutput->size())) {
|
||||
throw std::runtime_error(
|
||||
"Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size "
|
||||
"LHS: " +
|
||||
std::to_string(this->mTensorLHS->size()) +
|
||||
" RHS: " + std::to_string(this->mTensorRHS->size()) +
|
||||
" Output: " + std::to_string(this->mTensorOutput->size()));
|
||||
}
|
||||
|
||||
this->mTensorOutputStaging = std::make_shared<Tensor>(
|
||||
this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
|
||||
|
||||
this->mTensorOutputStaging->init(this->mPhysicalDevice, this->mDevice);
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
|
||||
|
||||
std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut Initialising algorithm component");
|
||||
|
||||
this->mAlgorithm->init(shaderFileData, this->mTensors);
|
||||
}
|
||||
|
||||
void
|
||||
OpAlgoLhsRhsOut::record()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut record called");
|
||||
|
||||
// Barrier to ensure the data is finished writing to buffer memory
|
||||
this->mTensorLHS->recordBufferMemoryBarrier(
|
||||
this->mCommandBuffer,
|
||||
vk::AccessFlagBits::eHostWrite,
|
||||
vk::AccessFlagBits::eShaderRead,
|
||||
vk::PipelineStageFlagBits::eHost,
|
||||
vk::PipelineStageFlagBits::eComputeShader);
|
||||
this->mTensorRHS->recordBufferMemoryBarrier(
|
||||
this->mCommandBuffer,
|
||||
vk::AccessFlagBits::eHostWrite,
|
||||
vk::AccessFlagBits::eShaderRead,
|
||||
vk::PipelineStageFlagBits::eHost,
|
||||
vk::PipelineStageFlagBits::eComputeShader);
|
||||
|
||||
this->mAlgorithm->recordDispatch(this->mKomputeWorkgroup.x,
|
||||
this->mKomputeWorkgroup.y,
|
||||
this->mKomputeWorkgroup.z);
|
||||
|
||||
// Barrier to ensure the shader code is executed before buffer read
|
||||
this->mTensorOutput->recordBufferMemoryBarrier(
|
||||
this->mCommandBuffer,
|
||||
vk::AccessFlagBits::eShaderWrite,
|
||||
vk::AccessFlagBits::eTransferRead,
|
||||
vk::PipelineStageFlagBits::eComputeShader,
|
||||
vk::PipelineStageFlagBits::eTransfer);
|
||||
|
||||
this->mTensorOutputStaging->recordCopyFrom(
|
||||
this->mCommandBuffer, this->mTensorOutput, true);
|
||||
}
|
||||
|
||||
void
|
||||
OpAlgoLhsRhsOut::postEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");
|
||||
|
||||
this->mTensorOutputStaging->mapDataFromHostMemory();
|
||||
|
||||
this->mTensorOutput->setData(this->mTensorOutputStaging->data());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -27,33 +27,13 @@ Sequence::~Sequence()
|
|||
{
|
||||
SPDLOG_DEBUG("Kompute Sequence Destructor started");
|
||||
|
||||
if (!this->mDevice) {
|
||||
SPDLOG_ERROR(
|
||||
"Kompute Sequence destructor reached with null Device pointer");
|
||||
if (!this->mIsInit) {
|
||||
SPDLOG_INFO("Kompute Sequence destructor called but sequence is not "
|
||||
"initialized so no need to removing GPU resources.");
|
||||
return;
|
||||
}
|
||||
|
||||
if (this->mFreeCommandBuffer) {
|
||||
SPDLOG_INFO("Freeing CommandBuffer");
|
||||
if (!this->mCommandBuffer) {
|
||||
SPDLOG_ERROR("Kompute Sequence destructor reached with null "
|
||||
"CommandPool pointer");
|
||||
return;
|
||||
}
|
||||
this->mDevice->freeCommandBuffers(
|
||||
*this->mCommandPool, 1, this->mCommandBuffer.get());
|
||||
SPDLOG_DEBUG("Kompute Sequence Freed CommandBuffer");
|
||||
}
|
||||
|
||||
if (this->mFreeCommandPool) {
|
||||
SPDLOG_INFO("Destroying CommandPool");
|
||||
if (this->mCommandPool == nullptr) {
|
||||
SPDLOG_ERROR("Kompute Sequence destructor reached with null "
|
||||
"CommandPool pointer");
|
||||
return;
|
||||
}
|
||||
this->mDevice->destroy(*this->mCommandPool, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
SPDLOG_DEBUG("Kompute Sequence Destroyed CommandPool");
|
||||
else {
|
||||
this->freeMemoryDestroyGPUResources();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -186,7 +166,8 @@ Sequence::evalAwait(uint64_t waitFor)
|
|||
|
||||
vk::Result result =
|
||||
this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor);
|
||||
this->mDevice->destroy(this->mFence, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mDevice->destroy(
|
||||
this->mFence, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
|
||||
this->mIsRunning = false;
|
||||
|
||||
|
|
@ -220,6 +201,53 @@ Sequence::isInit()
|
|||
return this->mIsInit;
|
||||
}
|
||||
|
||||
void
|
||||
Sequence::freeMemoryDestroyGPUResources()
|
||||
{
|
||||
if (!this->mIsInit) {
|
||||
SPDLOG_ERROR("Kompute Sequence freeMemoryDestroyGPUResources called "
|
||||
"but Sequence is not initialized so there's no relevant GPU resources.");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!this->mDevice) {
|
||||
SPDLOG_ERROR(
|
||||
"Kompute Sequence freeMemoryDestroyGPUResources called with null Device pointer");
|
||||
this->mIsInit = false;
|
||||
return;
|
||||
}
|
||||
|
||||
if (this->mFreeCommandBuffer) {
|
||||
SPDLOG_INFO("Freeing CommandBuffer");
|
||||
if (!this->mCommandBuffer) {
|
||||
SPDLOG_ERROR("Kompute Sequence freeMemoryDestroyGPUResources called with null "
|
||||
"CommandPool pointer");
|
||||
this->mIsInit = false;
|
||||
return;
|
||||
}
|
||||
this->mDevice->freeCommandBuffers(
|
||||
*this->mCommandPool, 1, this->mCommandBuffer.get());
|
||||
SPDLOG_DEBUG("Kompute Sequence Freed CommandBuffer");
|
||||
}
|
||||
|
||||
if (this->mFreeCommandPool) {
|
||||
SPDLOG_INFO("Destroying CommandPool");
|
||||
if (this->mCommandPool == nullptr) {
|
||||
SPDLOG_ERROR("Kompute Sequence freeMemoryDestroyGPUResources called with null "
|
||||
"CommandPool pointer");
|
||||
this->mIsInit = false;
|
||||
return;
|
||||
}
|
||||
this->mDevice->destroy(
|
||||
*this->mCommandPool,
|
||||
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
SPDLOG_DEBUG("Kompute Sequence Destroyed CommandPool");
|
||||
}
|
||||
|
||||
this->mIsInit = false;
|
||||
|
||||
}
|
||||
|
||||
void
|
||||
Sequence::createCommandPool()
|
||||
{
|
||||
|
|
|
|||
|
|
@ -12,8 +12,9 @@ Tensor::Tensor()
|
|||
Tensor::Tensor(const std::vector<float>& data, TensorTypes tensorType)
|
||||
{
|
||||
#if DEBUG
|
||||
SPDLOG_DEBUG(
|
||||
"Kompute Tensor constructor data length: {}, and type: {}", data.size(), tensorType);
|
||||
SPDLOG_DEBUG("Kompute Tensor constructor data length: {}, and type: {}",
|
||||
data.size(),
|
||||
tensorType);
|
||||
#endif
|
||||
|
||||
this->mData = data;
|
||||
|
|
@ -350,7 +351,9 @@ Tensor::freeMemoryDestroyGPUResources()
|
|||
"Kompose Tensor expected to free buffer but got null buffer");
|
||||
} else {
|
||||
SPDLOG_DEBUG("Kompose Tensor destroying buffer");
|
||||
this->mDevice->destroy(*this->mBuffer, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mDevice->destroy(
|
||||
*this->mBuffer,
|
||||
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mBuffer = nullptr;
|
||||
}
|
||||
}
|
||||
|
|
@ -361,7 +364,9 @@ Tensor::freeMemoryDestroyGPUResources()
|
|||
"Kompose Tensor expected to free buffer but got null memory");
|
||||
} else {
|
||||
SPDLOG_DEBUG("Kompose Tensor freeing memory");
|
||||
this->mDevice->freeMemory(*this->mMemory, (vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mDevice->freeMemory(
|
||||
*this->mMemory,
|
||||
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mDevice = nullptr;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -63,9 +63,9 @@ class Manager
|
|||
*
|
||||
* @param sequenceName The name for the named sequence to be retrieved or
|
||||
* created
|
||||
* @return Weak pointer to the manager owned sequence resource
|
||||
* @return Shared pointer to the manager owned sequence resource
|
||||
*/
|
||||
std::weak_ptr<Sequence> getOrCreateManagedSequence(
|
||||
std::shared_ptr<Sequence> getOrCreateManagedSequence(
|
||||
std::string sequenceName);
|
||||
|
||||
/**
|
||||
|
|
@ -77,8 +77,9 @@ class Manager
|
|||
* @param queueIndex The queue to use from the available queues
|
||||
* @return Weak pointer to the manager owned sequence resource
|
||||
*/
|
||||
std::weak_ptr<Sequence> createManagedSequence(std::string sequenceName = "",
|
||||
uint32_t queueIndex = 0);
|
||||
std::shared_ptr<Sequence> createManagedSequence(
|
||||
std::string sequenceName = "",
|
||||
uint32_t queueIndex = 0);
|
||||
|
||||
/**
|
||||
* Function that evaluates operation against named sequence.
|
||||
|
|
@ -94,22 +95,21 @@ class Manager
|
|||
TArgs&&... params)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp triggered");
|
||||
std::weak_ptr<Sequence> sqWeakPtr =
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
this->getOrCreateManagedSequence(sequenceName);
|
||||
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
|
||||
sq->begin();
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN");
|
||||
sq->begin();
|
||||
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
|
||||
sq->record<T>(tensors, std::forward<TArgs>(params)...);
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD");
|
||||
sq->record<T>(tensors, std::forward<TArgs>(params)...);
|
||||
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
|
||||
sq->end();
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence END");
|
||||
sq->end();
|
||||
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
|
||||
sq->eval();
|
||||
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL");
|
||||
sq->eval();
|
||||
}
|
||||
SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS");
|
||||
}
|
||||
|
||||
|
|
@ -147,26 +147,21 @@ class Manager
|
|||
{
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered");
|
||||
|
||||
std::weak_ptr<Sequence> sqWeakPtr =
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
this->getOrCreateManagedSequence(sequenceName);
|
||||
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
|
||||
sq->begin();
|
||||
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN");
|
||||
sq->begin();
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
|
||||
sq->record<T>(tensors, std::forward<TArgs>(params)...);
|
||||
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD");
|
||||
sq->record<T>(tensors, std::forward<TArgs>(params)...);
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
|
||||
sq->end();
|
||||
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END");
|
||||
sq->end();
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
|
||||
sq->evalAsync();
|
||||
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL");
|
||||
sq->evalAsync();
|
||||
} else {
|
||||
SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found",
|
||||
sequenceName);
|
||||
}
|
||||
SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -106,6 +106,12 @@ class Sequence
|
|||
*/
|
||||
bool isInit();
|
||||
|
||||
/**
|
||||
* Destroys and frees the GPU resources which include the buffer and memory
|
||||
* and sets the sequence as init=False.
|
||||
*/
|
||||
void freeMemoryDestroyGPUResources();
|
||||
|
||||
/**
|
||||
* Record function for operation to be added to the GPU queue in batch. This
|
||||
* template requires classes to be derived from the OpBase class. This
|
||||
|
|
|
|||
|
|
@ -17,20 +17,17 @@ namespace kp {
|
|||
* Operation that provides a general abstraction that simplifies the use of
|
||||
* algorithm and parameter components which can be used with shaders.
|
||||
* By default it enables the user to provide a dynamic number of tensors
|
||||
* which are then passed as inputs.
|
||||
*
|
||||
* All of these tensors are expected to be initialised; the init function checks this and throws a std exception otherwise.
|
||||
*
|
||||
* See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
|
||||
*
|
||||
* The template parameters specify the processing GPU layout number of
|
||||
* iterations for each x, y, z parameter. More specifically, this will be the
|
||||
* input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
|
||||
* which are then passed as inputs.
|
||||
*/
|
||||
template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
|
||||
class OpAlgoBase : public OpBase
|
||||
{
|
||||
public:
|
||||
/**
 * Plain aggregate holding the three workgroup values (x, y, z) that an
 * operation passes to the algorithm dispatch.
 */
struct KomputeWorkgroup
{
    uint32_t x; ///< Value for the x dimension
    uint32_t y; ///< Value for the y dimension
    uint32_t z; ///< Value for the z dimension
};
|
||||
|
||||
/**
|
||||
* Base constructor, should not be used unless explicitly intended.
|
||||
*/
|
||||
|
|
@ -46,11 +43,13 @@ class OpAlgoBase : public OpBase
|
|||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
|
||||
* @param komputeWorkgroup Optional parameter to specify the layout for processing
|
||||
*/
|
||||
OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors);
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
|
||||
|
||||
/**
|
||||
* Constructor that enables a file to be passed to the operation with
|
||||
|
|
@ -61,13 +60,15 @@ class OpAlgoBase : public OpBase
|
|||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
|
||||
* @param shaderFilePath Parameter to specify the shader to load (either in spirv or raw format)
|
||||
* @param komputeWorkgroup Optional parameter to specify the layout for processing
|
||||
*/
|
||||
OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
std::string shaderFilePath);
|
||||
std::string shaderFilePath,
|
||||
KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
|
||||
|
||||
/**
|
||||
* Constructor that enables raw shader data to be passed to the main operation
|
||||
|
|
@ -78,12 +79,14 @@ class OpAlgoBase : public OpBase
|
|||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
|
||||
* @param komputeWorkgroup Optional parameter to specify the layout for processing
|
||||
*/
|
||||
OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
const std::vector<char>& shaderDataRaw);
|
||||
const std::vector<char>& shaderDataRaw,
|
||||
KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
|
||||
|
||||
/**
|
||||
* Default destructor, which is in charge of destroying the algorithm
|
||||
|
|
@ -131,9 +134,7 @@ class OpAlgoBase : public OpBase
|
|||
|
||||
// -------------- ALWAYS OWNED RESOURCES
|
||||
|
||||
uint32_t mX;
|
||||
uint32_t mY;
|
||||
uint32_t mZ;
|
||||
KomputeWorkgroup mKomputeWorkgroup;
|
||||
|
||||
std::string mShaderFilePath; ///< Optional member variable which can be provided for the OpAlgoBase to find the data automatically and load for processing
|
||||
std::vector<char> mShaderDataRaw; ///< Optional member variable which can be provided to contain either the raw shader content or the spirv binary content
|
||||
|
|
@ -143,174 +144,3 @@ class OpAlgoBase : public OpBase
|
|||
|
||||
} // End namespace kp
|
||||
|
||||
// Including implementation for template class
|
||||
#ifndef OPALGOBASE_IMPL
|
||||
#define OPALGOBASE_IMPL
|
||||
|
||||
namespace kp {
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoBase<tX, tY, tZ>::OpAlgoBase()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase constructor base");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors)
|
||||
: OpBase(physicalDevice, device, commandBuffer, tensors, false)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size());
|
||||
|
||||
// The dispatch size is set up based on either explicitly provided template
|
||||
// parameters or by default it would take the shape and size of the tensors
|
||||
if (tX > 0) {
|
||||
// If at least the x value is provided we use mainly the parameters
|
||||
// provided
|
||||
this->mX = tX;
|
||||
this->mY = tY > 0 ? tY : 1;
|
||||
this->mZ = tZ > 0 ? tZ : 1;
|
||||
} else {
|
||||
this->mX = tensors[0]->size();
|
||||
this->mY = 1;
|
||||
this->mZ = 1;
|
||||
}
|
||||
SPDLOG_INFO("Kompute OpAlgoBase dispatch size X: {}, Y: {}, Z: {}",
|
||||
this->mX,
|
||||
this->mY,
|
||||
this->mZ);
|
||||
|
||||
this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
std::string shaderFilePath)
|
||||
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
|
||||
|
||||
this->mShaderFilePath = shaderFilePath;
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
const std::vector<char>& shaderDataRaw)
|
||||
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
|
||||
|
||||
this->mShaderDataRaw = shaderDataRaw;
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoBase<tX, tY, tZ>::init()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase init called");
|
||||
|
||||
if (this->mTensors.size() < 1) {
|
||||
throw std::runtime_error(
|
||||
"Kompute OpAlgoBase called with less than 1 tensor");
|
||||
}
|
||||
|
||||
for (std::shared_ptr<Tensor> tensor : this->mTensors) {
|
||||
if(!tensor->isInit()) {
|
||||
throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised.");
|
||||
}
|
||||
}
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
|
||||
|
||||
std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component");
|
||||
|
||||
this->mAlgorithm->init(shaderFileData, this->mTensors);
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoBase<tX, tY, tZ>::record()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase record called");
|
||||
|
||||
// Barrier to ensure the data is finished writing to buffer memory
|
||||
for (std::shared_ptr<Tensor> tensor : this->mTensors) {
|
||||
tensor->recordBufferMemoryBarrier(
|
||||
this->mCommandBuffer,
|
||||
vk::AccessFlagBits::eHostWrite,
|
||||
vk::AccessFlagBits::eShaderRead,
|
||||
vk::PipelineStageFlagBits::eHost,
|
||||
vk::PipelineStageFlagBits::eComputeShader);
|
||||
}
|
||||
|
||||
this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoBase<tX, tY, tZ>::preEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoBase<tX, tY, tZ>::postEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
std::vector<char> OpAlgoBase<tX, tY, tZ>::fetchSpirvBinaryData()
|
||||
{
|
||||
SPDLOG_WARN(
|
||||
"Kompute OpAlgoBase Running shaders directly from spirv file");
|
||||
|
||||
if (this->mShaderFilePath.size()) {
|
||||
std::ifstream fileStream(this->mShaderFilePath,
|
||||
std::ios::binary | std::ios::in | std::ios::ate);
|
||||
|
||||
if (!fileStream.good()) {
|
||||
throw std::runtime_error("Error reading file: " + this->mShaderFilePath);
|
||||
}
|
||||
|
||||
size_t shaderFileSize = fileStream.tellg();
|
||||
fileStream.seekg(0, std::ios::beg);
|
||||
char* shaderDataRaw = new char[shaderFileSize];
|
||||
fileStream.read(shaderDataRaw, shaderFileSize);
|
||||
fileStream.close();
|
||||
|
||||
SPDLOG_WARN(
|
||||
"Kompute OpAlgoBase fetched {} bytes", shaderFileSize);
|
||||
|
||||
return std::vector<char>(shaderDataRaw,
|
||||
shaderDataRaw + shaderFileSize);
|
||||
}
|
||||
else if (this->mShaderDataRaw.size()) {
|
||||
return this->mShaderDataRaw;
|
||||
}
|
||||
else {
|
||||
throw std::runtime_error("Kompute OpAlgoBase Error reached fetchSpirvBinaryData but neither filepath nor data provided");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif // #ifndef OPALGOBASE_IMPL
|
||||
|
||||
|
|
|
|||
|
|
@ -15,12 +15,8 @@ namespace kp {
|
|||
* Operation base class to simplify the creation of operations that require
|
||||
* right hand and left hand side datapoints together with a single output.
|
||||
* The expected data passed is two input tensors and one output tensor.
|
||||
* The template parameters specify the processing GPU layout number of
|
||||
* iterations for each x, y, z parameter. More specifically, this will be the
|
||||
* input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
|
||||
*/
|
||||
template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
|
||||
class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
|
||||
class OpAlgoLhsRhsOut : public OpAlgoBase
|
||||
{
|
||||
public:
|
||||
/**
|
||||
|
|
@ -38,11 +34,13 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
|
|||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param freeTensors Whether operation manages the memory of the Tensors
|
||||
* @param komputeWorkgroup Optional parameter to specify the layout for processing
|
||||
*/
|
||||
OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>> tensors);
|
||||
std::vector<std::shared_ptr<Tensor>> tensors,
|
||||
KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup());
|
||||
|
||||
/**
|
||||
* Default destructor, which is in charge of destroying the algorithm
|
||||
|
|
@ -73,7 +71,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
|
|||
* of the GPU Device memory into the staging buffer so the output data can
|
||||
* be retrieved.
|
||||
*/
|
||||
virtual void postSubmit() override;
|
||||
virtual void postEval() override;
|
||||
|
||||
protected:
|
||||
// -------------- NEVER OWNED RESOURCES
|
||||
|
|
@ -87,136 +85,3 @@ class OpAlgoLhsRhsOut : public OpAlgoBase<tX, tY, tZ>
|
|||
|
||||
} // End namespace kp
|
||||
|
||||
// Including implementation for template class
|
||||
#ifndef OPALGOLHSRHSOUT_CPP
|
||||
#define OPALGOLHSRHSOUT_CPP
|
||||
|
||||
namespace kp {
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor base");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>> tensors)
|
||||
// The inheritance is initialised with the copyOutputData to false given that
|
||||
// this depencendant class handles the transfer of data via staging buffers in
|
||||
// a granular way.
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoLhsRhsOut<tX, tY, tZ>::~OpAlgoLhsRhsOut()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut destructor started");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoLhsRhsOut<tX, tY, tZ>::init()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut init called");
|
||||
|
||||
if (this->mTensors.size() < 3) {
|
||||
throw std::runtime_error(
|
||||
"Kompute OpAlgoLhsRhsOut called with less than 1 tensor");
|
||||
} else if (this->mTensors.size() > 3) {
|
||||
SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors");
|
||||
}
|
||||
|
||||
this->mTensorLHS = this->mTensors[0];
|
||||
this->mTensorRHS = this->mTensors[1];
|
||||
this->mTensorOutput = this->mTensors[2];
|
||||
|
||||
|
||||
if (!(this->mTensorLHS->isInit() && this->mTensorRHS->isInit() &&
|
||||
this->mTensorOutput->isInit())) {
|
||||
throw std::runtime_error(
|
||||
"Kompute OpAlgoLhsRhsOut all tensor parameters must be initialised. LHS: " +
|
||||
std::to_string(this->mTensorLHS->isInit()) +
|
||||
" RHS: " + std::to_string(this->mTensorRHS->isInit()) +
|
||||
" Output: " + std::to_string(this->mTensorOutput->isInit()));
|
||||
}
|
||||
|
||||
if (!(this->mTensorLHS->size() == this->mTensorRHS->size() &&
|
||||
this->mTensorRHS->size() == this->mTensorOutput->size())) {
|
||||
throw std::runtime_error(
|
||||
"Kompute OpAlgoLhsRhsOut all tensor parameters must be the same size LHS: " +
|
||||
std::to_string(this->mTensorLHS->size()) +
|
||||
" RHS: " + std::to_string(this->mTensorRHS->size()) +
|
||||
" Output: " + std::to_string(this->mTensorOutput->size()));
|
||||
}
|
||||
|
||||
this->mTensorOutputStaging = std::make_shared<Tensor>(
|
||||
this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
|
||||
|
||||
this->mTensorOutputStaging->init(
|
||||
this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
|
||||
|
||||
std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut Initialising algorithm component");
|
||||
|
||||
this->mAlgorithm->init(shaderFileData, this->mTensors);
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoLhsRhsOut<tX, tY, tZ>::record()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut record called");
|
||||
|
||||
// Barrier to ensure the data is finished writing to buffer memory
|
||||
this->mTensorLHS->recordBufferMemoryBarrier(
|
||||
this->mCommandBuffer,
|
||||
vk::AccessFlagBits::eHostWrite,
|
||||
vk::AccessFlagBits::eShaderRead,
|
||||
vk::PipelineStageFlagBits::eHost,
|
||||
vk::PipelineStageFlagBits::eComputeShader);
|
||||
this->mTensorRHS->recordBufferMemoryBarrier(
|
||||
this->mCommandBuffer,
|
||||
vk::AccessFlagBits::eHostWrite,
|
||||
vk::AccessFlagBits::eShaderRead,
|
||||
vk::PipelineStageFlagBits::eHost,
|
||||
vk::PipelineStageFlagBits::eComputeShader);
|
||||
|
||||
this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
|
||||
|
||||
// Barrier to ensure the shader code is executed before buffer read
|
||||
this->mTensorOutput->recordBufferMemoryBarrier(
|
||||
this->mCommandBuffer,
|
||||
vk::AccessFlagBits::eShaderWrite,
|
||||
vk::AccessFlagBits::eTransferRead,
|
||||
vk::PipelineStageFlagBits::eComputeShader,
|
||||
vk::PipelineStageFlagBits::eTransfer);
|
||||
|
||||
this->mTensorOutputStaging->recordCopyFrom(
|
||||
this->mCommandBuffer,
|
||||
this->mTensorOutput,
|
||||
true);
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoLhsRhsOut<tX, tY, tZ>::postSubmit()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");
|
||||
|
||||
this->mTensorOutputStaging->mapDataFromHostMemory();
|
||||
|
||||
this->mTensorOutput->setData(this->mTensorOutputStaging->data());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif // #ifndef OPALGOLHSRHSOUT_CPP
|
||||
|
||||
|
|
|
|||
|
|
@ -17,12 +17,9 @@ namespace kp {
|
|||
|
||||
/**
|
||||
* Operation that performs multiplication on two tensors and outputs on third
|
||||
* tensor. The template parameters specify the processing GPU layout number of
|
||||
* iterations for each x, y, z parameter. More specifically, this will be the
|
||||
* input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"
|
||||
* tensor.
|
||||
*/
|
||||
template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
|
||||
class OpMult : public OpAlgoBase<tX, tY, tZ>
|
||||
class OpMult : public OpAlgoBase
|
||||
{
|
||||
public:
|
||||
/**
|
||||
|
|
@ -41,13 +38,14 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
|
|||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param freeTensors Whether operation manages the memory of the Tensors
|
||||
* @param komputeWorkgroup Optional parameter to specify the layout for processing
|
||||
*/
|
||||
OpMult(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>> tensors)
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
|
||||
std::vector<std::shared_ptr<Tensor>> tensors,
|
||||
KomputeWorkgroup komputeWorkgroup = KomputeWorkgroup())
|
||||
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "", komputeWorkgroup)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpMult constructor with params");
|
||||
|
||||
|
|
@ -58,14 +56,8 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
|
|||
|
||||
#if RELEASE
|
||||
/**
|
||||
* If release it will be using the static version of the shader which is
|
||||
* loaded using this file directly.
|
||||
*
|
||||
* @param physicalDevice Vulkan physical device used to find device queues
|
||||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param freeTensors Whether operation manages the memory of the Tensors
|
||||
* If RELEASE=1 it will be using the static version of the shader which is
|
||||
* loaded using this file directly. Otherwise it should not override the function.
|
||||
*/
|
||||
std::vector<char> fetchSpirvBinaryData() override
|
||||
{
|
||||
|
|
|
|||
|
|
@ -54,7 +54,7 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
|
|||
auto startSync = std::chrono::high_resolution_clock::now();
|
||||
|
||||
for (uint32_t i = 0; i < numParallel; i++) {
|
||||
mgr.evalOpDefault<kp::OpAlgoBase<>>(
|
||||
mgr.evalOpDefault<kp::OpAlgoBase>(
|
||||
{ inputsSyncB[i] }, std::vector<char>(shader.begin(), shader.end()));
|
||||
}
|
||||
|
||||
|
|
@ -86,7 +86,7 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
|
|||
auto startAsync = std::chrono::high_resolution_clock::now();
|
||||
|
||||
for (uint32_t i = 0; i < numParallel; i++) {
|
||||
mgrAsync.evalOpAsync<kp::OpAlgoBase<>>(
|
||||
mgrAsync.evalOpAsync<kp::OpAlgoBase>(
|
||||
{ inputsAsyncB[i] },
|
||||
"async" + std::to_string(i),
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
|
|
@ -151,10 +151,10 @@ TEST(TestAsyncOperations, TestManagerAsyncExecution)
|
|||
|
||||
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
|
||||
|
||||
mgr.evalOpAsync<kp::OpAlgoBase<>>(
|
||||
mgr.evalOpAsync<kp::OpAlgoBase>(
|
||||
{ tensorA }, "asyncOne", std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
mgr.evalOpAsync<kp::OpAlgoBase<>>(
|
||||
mgr.evalOpAsync<kp::OpAlgoBase>(
|
||||
{ tensorB }, "asyncTwo", std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
mgr.evalOpAwait("asyncOne");
|
||||
|
|
|
|||
|
|
@ -31,22 +31,21 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression)
|
|||
{
|
||||
kp::Manager mgr;
|
||||
|
||||
std::shared_ptr<kp::Sequence> sqTensor =
|
||||
mgr.createManagedSequence().lock();
|
||||
std::shared_ptr<kp::Sequence> sqTensor = mgr.createManagedSequence();
|
||||
|
||||
sqTensor->begin();
|
||||
sqTensor->record<kp::OpTensorCreate>(params);
|
||||
sqTensor->end();
|
||||
sqTensor->eval();
|
||||
|
||||
std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
|
||||
std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
|
||||
|
||||
// Record op algo base
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorSyncDevice>({ wIn, bIn });
|
||||
|
||||
sq->record<kp::OpAlgoBase<>>(
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
params, "test/shaders/glsl/test_logistic_regression.comp");
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
|
||||
|
|
@ -76,7 +75,7 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression)
|
|||
EXPECT_LT(bIn->data()[0], 0.0);
|
||||
EXPECT_LT(bIn->data()[0], 0.0);
|
||||
|
||||
//SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
|
||||
// SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
|
||||
// wIn->data(),
|
||||
// bIn->data(),
|
||||
// lOut->data());
|
||||
|
|
@ -114,20 +113,19 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy)
|
|||
{
|
||||
kp::Manager mgr;
|
||||
|
||||
std::shared_ptr<kp::Sequence> sqTensor =
|
||||
mgr.createManagedSequence().lock();
|
||||
std::shared_ptr<kp::Sequence> sqTensor = mgr.createManagedSequence();
|
||||
|
||||
sqTensor->begin();
|
||||
sqTensor->record<kp::OpTensorCreate>(params);
|
||||
sqTensor->end();
|
||||
sqTensor->eval();
|
||||
|
||||
std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
|
||||
std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
|
||||
|
||||
// Record op algo base
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpAlgoBase<>>(
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
params, "test/shaders/glsl/test_logistic_regression.comp");
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
|
||||
|
|
@ -158,7 +156,7 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy)
|
|||
EXPECT_GT(wIn->data()[1], 1.0);
|
||||
EXPECT_LT(bIn->data()[0], 0.0);
|
||||
|
||||
//SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
|
||||
// SPDLOG_WARN("Result wIn: {}, bIn: {}, loss: {}",
|
||||
// wIn->data(),
|
||||
// bIn->data(),
|
||||
// lOut->data());
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ TEST(TestManager, EndToEndOpMultFlow)
|
|||
|
||||
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorOutput });
|
||||
|
||||
mgr.evalOpDefault<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
mgr.evalOpDefault<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
|
||||
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorOutput });
|
||||
|
||||
|
|
@ -35,23 +35,23 @@ TEST(TestManager, OpMultSequenceFlow)
|
|||
|
||||
kp::Manager mgr;
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr =
|
||||
mgr.getOrCreateManagedSequence("newSequence");
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
|
||||
{
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.getOrCreateManagedSequence("newSequence");
|
||||
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorCreate>({ tensorLHS });
|
||||
sq->record<kp::OpTensorCreate>({ tensorRHS });
|
||||
sq->record<kp::OpTensorCreate>({ tensorOutput });
|
||||
|
||||
sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
sq->record<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({ tensorOutput });
|
||||
|
||||
sq->end();
|
||||
sq->eval();
|
||||
}
|
||||
sqWeakPtr.reset();
|
||||
|
||||
EXPECT_EQ(tensorOutput->data(), std::vector<float>({ 0, 4, 12 }));
|
||||
}
|
||||
|
|
@ -60,22 +60,22 @@ TEST(TestManager, TestMultipleSequences)
|
|||
{
|
||||
kp::Manager mgr;
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtrOne =
|
||||
std::shared_ptr<kp::Sequence> sqOne =
|
||||
mgr.getOrCreateManagedSequence("sqOne");
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtrTwo =
|
||||
std::shared_ptr<kp::Sequence> sqTwo =
|
||||
mgr.getOrCreateManagedSequence("sqTwo");
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtrOneRef =
|
||||
std::shared_ptr<kp::Sequence> sqOneRef =
|
||||
mgr.getOrCreateManagedSequence("sqOne");
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtrTwoRef =
|
||||
std::shared_ptr<kp::Sequence> sqTwoRef =
|
||||
mgr.getOrCreateManagedSequence("sqTwo");
|
||||
|
||||
EXPECT_EQ(sqWeakPtrOne.lock(), sqWeakPtrOneRef.lock());
|
||||
EXPECT_NE(sqWeakPtrTwo.lock(), sqWeakPtrOneRef.lock());
|
||||
EXPECT_EQ(sqWeakPtrTwo.lock(), sqWeakPtrTwoRef.lock());
|
||||
EXPECT_NE(sqWeakPtrOneRef.lock(), sqWeakPtrTwoRef.lock());
|
||||
EXPECT_EQ(sqOne, sqOneRef);
|
||||
EXPECT_NE(sqTwo, sqOneRef);
|
||||
EXPECT_EQ(sqTwo, sqTwoRef);
|
||||
EXPECT_NE(sqOneRef, sqTwoRef);
|
||||
}
|
||||
|
||||
TEST(TestManager, TestMultipleTensorsAtOnce)
|
||||
|
|
@ -89,9 +89,10 @@ TEST(TestManager, TestMultipleTensorsAtOnce)
|
|||
|
||||
kp::Manager mgr;
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr =
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.getOrCreateManagedSequence("newSequence");
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
|
||||
|
||||
{
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorCreate>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
|
|
@ -100,14 +101,13 @@ TEST(TestManager, TestMultipleTensorsAtOnce)
|
|||
EXPECT_TRUE(tensorRHS->isInit());
|
||||
EXPECT_TRUE(tensorOutput->isInit());
|
||||
|
||||
sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
sq->record<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({ tensorOutput });
|
||||
|
||||
sq->end();
|
||||
sq->eval();
|
||||
}
|
||||
sqWeakPtr.reset();
|
||||
|
||||
EXPECT_EQ(tensorOutput->data(), std::vector<float>({ 0, 4, 12 }));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -19,18 +19,19 @@ TEST(TestMultipleAlgoExecutions, SingleSequenceRecord)
|
|||
pa[index] = pa[index] + 1;
|
||||
})");
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr =
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.getOrCreateManagedSequence("newSequence");
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
|
||||
|
||||
{
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorCreate>({ tensorA });
|
||||
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
{ tensorA }, std::vector<char>(shader.begin(), shader.end()));
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
{ tensorA }, std::vector<char>(shader.begin(), shader.end()));
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
{ tensorA }, std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({ tensorA });
|
||||
|
|
@ -38,7 +39,6 @@ TEST(TestMultipleAlgoExecutions, SingleSequenceRecord)
|
|||
sq->end();
|
||||
sq->eval();
|
||||
}
|
||||
sqWeakPtr.reset();
|
||||
|
||||
EXPECT_EQ(tensorA->data(), std::vector<float>({ 3, 3, 3 }));
|
||||
}
|
||||
|
|
@ -58,9 +58,9 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords)
|
|||
pa[index] = pa[index] + 1;
|
||||
})");
|
||||
|
||||
std::shared_ptr<kp::Sequence> sqTensor = mgr.createManagedSequence().lock();
|
||||
std::shared_ptr<kp::Sequence> sqTensor = mgr.createManagedSequence();
|
||||
|
||||
std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence().lock();
|
||||
std::shared_ptr<kp::Sequence> sq = mgr.createManagedSequence();
|
||||
|
||||
// First create the tensor in a separate sequence
|
||||
sqTensor->begin();
|
||||
|
|
@ -70,20 +70,20 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords)
|
|||
|
||||
// Then perform the computations
|
||||
sq->begin();
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
{ tensorA }, std::vector<char>(shader.begin(), shader.end()));
|
||||
sq->record<kp::OpAlgoBase>({ tensorA },
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
sq->end();
|
||||
sq->eval();
|
||||
|
||||
sq->begin();
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
{ tensorA }, std::vector<char>(shader.begin(), shader.end()));
|
||||
sq->record<kp::OpAlgoBase>({ tensorA },
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
sq->end();
|
||||
sq->eval();
|
||||
|
||||
sq->begin();
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
{ tensorA }, std::vector<char>(shader.begin(), shader.end()));
|
||||
sq->record<kp::OpAlgoBase>({ tensorA },
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
sq->end();
|
||||
sq->eval();
|
||||
|
||||
|
|
@ -111,47 +111,51 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences)
|
|||
pa[index] = pa[index] + 1;
|
||||
})");
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr =
|
||||
mgr.getOrCreateManagedSequence("newSequence");
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
|
||||
{
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.getOrCreateManagedSequence("newSequence");
|
||||
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorCreate>({ tensorA });
|
||||
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
{ tensorA }, std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
sq->end();
|
||||
sq->eval();
|
||||
}
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr2 =
|
||||
mgr.getOrCreateManagedSequence("newSequence2");
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
|
||||
{
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.getOrCreateManagedSequence("newSequence2");
|
||||
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
{ tensorA }, std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
sq->end();
|
||||
sq->eval();
|
||||
}
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr3 =
|
||||
mgr.getOrCreateManagedSequence("newSequence3");
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr3.lock()) {
|
||||
{
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.getOrCreateManagedSequence("newSequence3");
|
||||
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
{ tensorA }, std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
sq->end();
|
||||
sq->eval();
|
||||
}
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr4 =
|
||||
mgr.getOrCreateManagedSequence("newSequence5");
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr4.lock()) {
|
||||
{
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.getOrCreateManagedSequence("newSequence5");
|
||||
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({ tensorA });
|
||||
|
|
@ -179,9 +183,10 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)
|
|||
pa[index] = pa[index] + 1;
|
||||
})");
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr =
|
||||
mgr.getOrCreateManagedSequence("newSequence");
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
|
||||
{
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.getOrCreateManagedSequence("newSequence");
|
||||
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorCreate>({ tensorA });
|
||||
|
|
@ -190,12 +195,13 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)
|
|||
sq->eval();
|
||||
}
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr2 =
|
||||
mgr.getOrCreateManagedSequence("newSequence2");
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
|
||||
{
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.getOrCreateManagedSequence("newSequence2");
|
||||
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
{ tensorA }, std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
sq->end();
|
||||
|
|
@ -205,9 +211,10 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval)
|
|||
sq->eval();
|
||||
}
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr3 =
|
||||
mgr.getOrCreateManagedSequence("newSequence3");
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
|
||||
{
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.getOrCreateManagedSequence("newSequence3");
|
||||
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({ tensorA });
|
||||
|
|
@ -252,7 +259,7 @@ TEST(TestMultipleAlgoExecutions, ManagerEvalMultSourceStrOpCreate)
|
|||
}
|
||||
)");
|
||||
|
||||
mgr.evalOpDefault<kp::OpAlgoBase<>>(
|
||||
mgr.evalOpDefault<kp::OpAlgoBase>(
|
||||
{ tensorInA, tensorInB, tensorOut },
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
|
|
@ -289,7 +296,7 @@ TEST(TestMultipleAlgoExecutions, ManagerEvalMultSourceStrMgrCreate)
|
|||
}
|
||||
)");
|
||||
|
||||
mgr.evalOpDefault<kp::OpAlgoBase<>>(
|
||||
mgr.evalOpDefault<kp::OpAlgoBase>(
|
||||
{ tensorInA, tensorInB, tensorOut },
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
|
|
|
|||
|
|
@ -30,10 +30,10 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies)
|
|||
}
|
||||
)");
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr =
|
||||
mgr.getOrCreateManagedSequence("default");
|
||||
{
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.getOrCreateManagedSequence("default");
|
||||
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorCreate>({ tensorA, tensorB });
|
||||
|
|
@ -43,13 +43,13 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies)
|
|||
sq->eval();
|
||||
}
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr2 =
|
||||
mgr.getOrCreateManagedSequence("run");
|
||||
{
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.getOrCreateManagedSequence("run");
|
||||
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpAlgoBase<>>(
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
{ tensorA, tensorB },
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
|
|
@ -61,10 +61,10 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies)
|
|||
}
|
||||
}
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr3 =
|
||||
mgr.getOrCreateManagedSequence("export");
|
||||
{
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.getOrCreateManagedSequence("export");
|
||||
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr3.lock()) {
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({ tensorA, tensorB });
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ TEST(TestOpAlgoBase, ShaderRawDataFromConstructor)
|
|||
}
|
||||
)");
|
||||
|
||||
mgr.evalOpDefault<kp::OpAlgoBase<>>(
|
||||
mgr.evalOpDefault<kp::OpAlgoBase>(
|
||||
{ tensorA, tensorB }, std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
|
||||
|
|
@ -45,7 +45,7 @@ TEST(TestOpAlgoBase, ShaderCompiledDataFromConstructor)
|
|||
std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor({ 0, 0, 0 }) };
|
||||
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
|
||||
|
||||
mgr.evalOpDefault<kp::OpAlgoBase<>>(
|
||||
mgr.evalOpDefault<kp::OpAlgoBase>(
|
||||
{ tensorA, tensorB },
|
||||
std::vector<char>(
|
||||
kp::shader_data::test_shaders_glsl_test_op_custom_shader_comp_spv,
|
||||
|
|
@ -67,7 +67,7 @@ TEST(TestOpAlgoBase, ShaderRawDataFromFile)
|
|||
std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor({ 0, 0, 0 }) };
|
||||
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
|
||||
|
||||
mgr.evalOpDefault<kp::OpAlgoBase<>>(
|
||||
mgr.evalOpDefault<kp::OpAlgoBase>(
|
||||
{ tensorA, tensorB }, "test/shaders/glsl/test_op_custom_shader.comp");
|
||||
|
||||
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
|
||||
|
|
@ -84,7 +84,7 @@ TEST(TestOpAlgoBase, ShaderCompiledDataFromFile)
|
|||
std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor({ 0, 0, 0 }) };
|
||||
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
|
||||
|
||||
mgr.evalOpDefault<kp::OpAlgoBase<>>(
|
||||
mgr.evalOpDefault<kp::OpAlgoBase>(
|
||||
{ tensorA, tensorB }, "test/shaders/glsl/test_op_custom_shader.comp.spv");
|
||||
|
||||
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
|
||||
|
|
|
|||
|
|
@ -7,10 +7,10 @@ TEST(TestSequence, CmdBufSequenceBeginEnd)
|
|||
{
|
||||
kp::Manager mgr;
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr =
|
||||
mgr.getOrCreateManagedSequence("newSequence");
|
||||
{
|
||||
std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.getOrCreateManagedSequence("newSequence");
|
||||
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
|
||||
EXPECT_TRUE(sq->eval());
|
||||
EXPECT_TRUE(!sq->isRecording());
|
||||
EXPECT_TRUE(sq->begin());
|
||||
|
|
@ -24,3 +24,18 @@ TEST(TestSequence, CmdBufSequenceBeginEnd)
|
|||
EXPECT_TRUE(sq->eval());
|
||||
}
|
||||
}
|
||||
|
||||
TEST(TestSequence, SequenceDestructorViaManager)
|
||||
{
|
||||
std::shared_ptr<kp::Sequence> sq = nullptr;
|
||||
|
||||
{
|
||||
kp::Manager mgr;
|
||||
|
||||
sq = mgr.getOrCreateManagedSequence("newSequence");
|
||||
|
||||
EXPECT_TRUE(sq->isInit());
|
||||
}
|
||||
|
||||
EXPECT_FALSE(sq->isInit());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ TEST(TestTensor, CopyFromHostData)
|
|||
kp::Manager mgr;
|
||||
|
||||
if (std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.getOrCreateManagedSequence("new").lock()) {
|
||||
mgr.getOrCreateManagedSequence("new")) {
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorCreate>({ tensorA, tensorB });
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue