diff --git a/Makefile b/Makefile index 74a6822b0..872209015 100644 --- a/Makefile +++ b/Makefile @@ -163,6 +163,9 @@ generate_python_docstrings: python -m pybind11_mkdoc \ -o python/src/docstrings.hpp \ single_include/kompute/Kompute.hpp \ + -Iexternal/fmt/include/ \ + -Iexternal/spdlog/include/ \ + -Iexternal/glslang/ \ -I/usr/include/c++/7.5.0/ install_python_reqs: diff --git a/README.md b/README.md index 95acd3de3..41596cb00 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ The C++ interface provides low level access to the native components of Kompute void kompute(const std::string& shader) { - // 1. Create Kompute Manager with default settings (device 0 and first compute compatible queue) + // 1. Create Kompute Manager with default settings (device 0, first queue and no extensions) kp::Manager mgr; // 2. Create and initialise Kompute Tensors through manager @@ -71,14 +71,16 @@ void kompute(const std::string& shader) { auto algorithm = mgr.algorithm(params, kp::Shader::compile_source(shader), workgroup, - specConsts); + specConsts, + pushConstsA); // 4. Run operation synchronously using sequence mgr.sequence() ->record(params) - ->record(algorithm, pushConstsA) - ->record(algorithm, pushConstsB) - ->eval(); + ->record(algorithm) // Binds default push consts + ->eval() // Evaluates the two recorded operations + ->record(algorithm, pushConstsB) // Overrides push consts + ->eval(); // Evaluates only last recorded operation // 5. Sync results from the GPU asynchronously sq = mgr.sequence() @@ -138,7 +140,7 @@ The [Python package](https://kompute.cc/overview/python-package.html) provides a ```python def kompute(shader): - # 1. Create Kompute Manager with default settings (device 0 and first compute compatible queue) + # 1. Create Kompute Manager with default settings (device 0, first queue and no extensions) mgr = kp.Manager() # 2. 
Create and initialise Kompute Tensors through manager @@ -155,14 +157,17 @@ def kompute(shader): push_consts_a = [2] push_consts_b = [3] - algo = mgr.algorithm(params, kp.Shader.compile_source(shader), workgroup, spec_consts) + spirv = kp.Shader.compile_source(shader) + + algo = mgr.algorithm(params, spirv, workgroup, spec_consts, push_consts_a) # 4. Run operation synchronously using sequence (mgr.sequence() .record(kp.OpTensorSyncDevice(params)) - .record(kp.OpAlgoDispatch(algo, push_consts_a)) - .record(kp.OpAlgoDispatch(algo, push_consts_b)) - .eval()) + .record(kp.OpAlgoDispatch(algo)) # Binds default push consts provided + .eval() # evaluates the two recorded ops + .record(kp.OpAlgoDispatch(algo, push_consts_b)) # Overrides push consts + .eval()) # evaluates only the last recorded op # 5. Sync results from the GPU asynchronously sq = mgr.sequence() @@ -429,6 +434,12 @@ We appreciate PRs and Issues. If you want to contribute try checking the "Good f * Uses doxygen and sphinx for documentation and autodocs * Uses vcpkg for finding the dependencies, it's the recommended set up to retrieve the libraries +If you want to run with debug layers you can add them with the `KOMPUTE_ENV_DEBUG_LAYERS` parameter as: + +``` +export KOMPUTE_ENV_DEBUG_LAYERS="VK_LAYER_LUNARG_api_dump" +``` + ##### Updating documentation To update the documentation you will need to: diff --git a/docs/overview/advanced-examples.rst b/docs/overview/advanced-examples.rst index 80df20e42..90066e8cc 100644 --- a/docs/overview/advanced-examples.rst +++ b/docs/overview/advanced-examples.rst @@ -23,6 +23,63 @@ End-to-end examples * `Android NDK Mobile Kompute ML Application `_ * `Game Development Kompute ML in Godot Engine `_ +Add Vulkan Extensions +^^^^^^^^^^^^^^^^^^^^ + +Kompute provides a simple way to add Vulkan extensions through kp::Manager initialisation. 
When debug is enabled you will be able to see logs that show which extensions were requested and which ones were added based on the extensions available on the current driver. + +The example below shows how you can enable the "VK_EXT_shader_atomic_float" extension so we can use the atomicAdd for floats in the shaders. + +.. code-block:: cpp + :linenos: + + int main() { + std::string shader(R"( + #version 450 + + #extension GL_EXT_shader_atomic_float: enable + + layout(push_constant) uniform PushConstants { + float x; + float y; + float z; + } pcs; + + layout (local_size_x = 1) in; + + layout(set = 0, binding = 0) buffer a { float pa[]; }; + + void main() { + atomicAdd(pa[0], pcs.x); + atomicAdd(pa[1], pcs.y); + atomicAdd(pa[2], pcs.z); + })"); + + std::vector spirv = kp::Shader::compile_source(shader); + + std::shared_ptr sq = nullptr; + + { + kp::Manager mgr(0, {}, { "VK_EXT_shader_atomic_float" }); + + std::shared_ptr tensor = mgr.tensor({ 0, 0, 0 }); + + std::shared_ptr algo = + mgr.algorithm({ tensor }, spirv, kp::Workgroup({ 1 }), {}, { 0.0, 0.0, 0.0 }); + + sq = mgr.sequence() + ->record({ tensor }) + ->record(algo, + kp::Constants{ 0.1, 0.2, 0.3 }) + ->record(algo, + kp::Constants{ 0.3, 0.2, 0.1 }) + ->record({ tensor }) + ->eval(); + + EXPECT_EQ(tensor->data(), kp::Constants({ 0.4, 0.4, 0.4 })); + } + } + Your Custom Kompute Operation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/python/src/docstrings.hpp b/python/src/docstrings.hpp index 2000421c3..bf98e6581 100644 --- a/python/src/docstrings.hpp +++ b/python/src/docstrings.hpp @@ -28,17 +28,20 @@ R"doc(Abstraction for compute shaders that are run on top of tensors grouped via ParameterGroups (which group descriptorsets))doc"; static const char *__doc_kp_Algorithm_Algorithm = -R"doc(Base constructor for Algorithm.
Should not be used unless explicit -intended.)doc"; - -static const char *__doc_kp_Algorithm_Algorithm_2 = -R"doc(Default constructor for Algorithm +R"doc(Main constructor for algorithm with configuration parameters to create +the underlying resources. @param device The Vulkan device to use for creating resources @param -commandBuffer The vulkan command buffer to bind the pipeline and -shaders)doc"; - -static const char *__doc_kp_Algorithm_createDescriptorPool = R"doc()doc"; +tensors (optional) The tensors to use to create the descriptor +resources @param spirv (optional) The spirv code to use to create the +algorithm @param workgroup (optional) The kp::Workgroup to use for the +dispatch which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if +not set. @param specializationConstants (optional) The kp::Constants +to use to initialize the specialization constants which cannot be +changed once set. @param pushConstants (optional) The kp::Constants to +use when initializing the pipeline, which set the size of the push +constants - these can be modified but all new values must have the +same vector size as this initial value.)doc"; static const char *__doc_kp_Algorithm_createParameters = R"doc()doc"; @@ -46,15 +49,35 @@ static const char *__doc_kp_Algorithm_createPipeline = R"doc()doc"; static const char *__doc_kp_Algorithm_createShaderModule = R"doc()doc"; -static const char *__doc_kp_Algorithm_init = -R"doc(Initialiser for the shader data provided to the algorithm as well as -tensor parameters that will be used in shader. +static const char *__doc_kp_Algorithm_destroy = R"doc()doc"; -@param shaderFileData The bytes in spir-v format of the shader -@tensorParams The Tensors to be used in the Algorithm / shader for -processing)doc"; +static const char *__doc_kp_Algorithm_getPush = +R"doc(Gets the push constants of the current algorithm.
-static const char *__doc_kp_Algorithm_mCommandBuffer = R"doc()doc"; +@returns The kp::Constants currently set for push constants)doc"; + +static const char *__doc_kp_Algorithm_getSpecializationConstants = +R"doc(Gets the specialization constants of the current algorithm. + +@returns The kp::Constants currently set for specialization constants)doc"; + +static const char *__doc_kp_Algorithm_getTensors = +R"doc(Gets the current tensors that are used in the algorithm. + +@returns The list of tensors used in the algorithm.)doc"; + +static const char *__doc_kp_Algorithm_getWorkgroup = +R"doc(Gets the current workgroup from the algorithm. + +@returns The kp::Workgroup currently set in the algorithm, which is the +one used by recordDispatch(...) and can be updated through +setWorkgroup(...).)doc"; + +static const char *__doc_kp_Algorithm_isInit = +R"doc(Function that checks all the gpu resource components to verify if +these have been created and returns true if all are valid. + +@returns true if the algorithm is currently initialized.)doc"; static const char *__doc_kp_Algorithm_mDescriptorPool = R"doc()doc"; @@ -84,14 +107,70 @@ static const char *__doc_kp_Algorithm_mPipelineCache = R"doc()doc"; static const char *__doc_kp_Algorithm_mPipelineLayout = R"doc()doc"; +static const char *__doc_kp_Algorithm_mPushConstants = R"doc()doc"; + static const char *__doc_kp_Algorithm_mShaderModule = R"doc()doc"; +static const char *__doc_kp_Algorithm_mSpecializationConstants = R"doc()doc"; + +static const char *__doc_kp_Algorithm_mSpirv = R"doc()doc"; + +static const char *__doc_kp_Algorithm_mTensors = R"doc()doc"; + +static const char *__doc_kp_Algorithm_mWorkgroup = R"doc()doc"; + +static const char *__doc_kp_Algorithm_rebuild = +R"doc(Rebuild function to reconstruct algorithm with configuration +parameters to create the underlying resources.
+ +@param tensors The tensors to use to create the descriptor resources +@param spirv The spirv code to use to create the algorithm @param +workgroup (optional) The kp::Workgroup to use for the dispatch which +defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set. @param +specializationConstants (optional) The kp::Constants to use to +initialize the specialization constants which cannot be changed once +set. @param pushConstants (optional) The kp::Constants to use when +initializing the pipeline, which set the size of the push constants - +these can be modified but all new values must have the same vector +size as this initial value.)doc"; + +static const char *__doc_kp_Algorithm_recordBindCore = +R"doc(Records command that binds the "core" algorithm components which +consist of binding the pipeline and binding the descriptorsets. + +@param commandBuffer Command buffer to record the algorithm resources +to)doc"; + +static const char *__doc_kp_Algorithm_recordBindPush = +R"doc(Records command that binds the push constants to the command buffer +provided - it is required that the pushConstants provided are of the +same size as the ones provided during initialization. + +@param commandBuffer Command buffer to record the algorithm resources +to)doc"; + static const char *__doc_kp_Algorithm_recordDispatch = R"doc(Records the dispatch function with the provided template parameters or alternatively using the size of the tensor by default. -@param x Layout X dispatch value @param y Layout Y dispatch value -@param z Layout Z dispatch value)doc"; +@param commandBuffer Command buffer to record the algorithm resources +to)doc"; + +static const char *__doc_kp_Algorithm_setPush = +R"doc(Sets the push constants to the new value provided to use in the next +bindPush() + +@param The kp::Constant to use to set the push constants to use in the +next bindPush(...) calls. 
The constants provided must be of the same +size as the ones created during initialization.)doc"; + +static const char *__doc_kp_Algorithm_setWorkgroup = +R"doc(Sets the work group to use in the recordDispatch + +@param workgroup The kp::Workgroup value to use to update the +algorithm. It must have a value greater than 1 on the x value (index +1) otherwise it will be initialized on the size of the first tensor +(ie. this->mTensor[0]->size()))doc"; static const char *__doc_kp_Manager = R"doc(Base orchestrator which creates and manages device and child @@ -102,13 +181,13 @@ R"doc(Base constructor and default used which creates the base resources including choosing the device 0 by default.)doc"; static const char *__doc_kp_Manager_Manager_2 = -R"doc(Similar to base constructor but allows the user to provide the device -they would like to create the resources on. +R"doc(Similar to base constructor but allows for further configuration to +use when creating the Vulkan resources. @param physicalDeviceIndex The index of the physical device to use @param familyQueueIndices (Optional) List of queue indices to add for -explicit allocation @param totalQueues The total number of compute -queues to create.)doc"; +explicit allocation @param desiredExtensions The desired extensions to +load from physicalDevice)doc"; static const char *__doc_kp_Manager_Manager_3 = R"doc(Manager constructor which allows your own vulkan application to @@ -119,99 +198,33 @@ integrate with the vulkan kompute use. @param device Vulkan logical device to use for all base resources @param physicalDeviceIndex Index for vulkan physical device used)doc"; +static const char *__doc_kp_Manager_algorithm = +R"doc(Create a managed algorithm that will be destroyed by this manager if +it hasn't been destroyed by its reference count going to zero. 
+ +@param tensors (optional) The tensors to initialise the algorithm with +@param spirv (optional) The SPIRV bytes for the algorithm to dispatch +@param workgroup (optional) kp::Workgroup for algorithm to use, and +defaults to (tensor[0].size(), 1, 1) @param specializationConstants +(optional) kp::Constant to use for specialization constants, and +defaults to an empty constant @param pushConstants (optional) +kp::Constant to use for push constants, and defaults to an empty +constant @returns Shared pointer with initialised algorithm)doc"; + +static const char *__doc_kp_Manager_clear = +R"doc(Run a pseudo-garbage collection to release all the managed resources +that have been already freed due to these reaching to zero ref count.)doc"; + static const char *__doc_kp_Manager_createDevice = R"doc()doc"; static const char *__doc_kp_Manager_createInstance = R"doc()doc"; -static const char *__doc_kp_Manager_destroy = -R"doc(Destroy owned Vulkan GPU resources and free GPU memory for single -tensor. - -@param tensors Single tensor to rebuild)doc"; - -static const char *__doc_kp_Manager_destroy_2 = -R"doc(Destroy owned Vulkan GPU resources and free GPU memory for vector of -tensors. - -@param tensors Single tensor to rebuild)doc"; - -static const char *__doc_kp_Manager_destroy_3 = -R"doc(Destroy owned Vulkan GPU resources and free GPU memory for vector of -sequences. Destroying by sequence name is more efficent and hence -recommended instead of by object. - -@param sequences Vector for shared ptrs with sequences to destroy)doc"; - -static const char *__doc_kp_Manager_destroy_4 = -R"doc(Destroy owned Vulkan GPU resources and free GPU memory for single -sequence. Destroying by sequence name is more efficent and hence -recommended instead of by object. - -@param sequences Single sequence to rebuild)doc"; - -static const char *__doc_kp_Manager_destroy_5 = -R"doc(Destroy owned Vulkan GPU resources and free GPU memory for sequence by -name. 
- -@param sequenceName Single name of named sequence to destroy)doc"; - -static const char *__doc_kp_Manager_destroy_6 = -R"doc(Destroy owned Vulkan GPU resources and free GPU memory for sequences -using vector of named sequence names. - -@param sequenceName Vector of sequence names to destroy)doc"; - -static const char *__doc_kp_Manager_evalOp = -R"doc(Function that evaluates operation against named sequence. - -@param tensors The tensors to be used in the operation recorded @param -sequenceName The name of the sequence to be retrieved or created -@param TArgs Template parameters that will be used to initialise -Operation to allow for extensible configurations on initialisation)doc"; - -static const char *__doc_kp_Manager_evalOpAsync = -R"doc(Function that evaluates operation against named sequence -asynchronously. - -@param tensors The tensors to be used in the operation recorded @param -sequenceName The name of the sequence to be retrieved or created -@param params Template parameters that will be used to initialise -Operation to allow for extensible configurations on initialisation)doc"; - -static const char *__doc_kp_Manager_evalOpAsyncDefault = -R"doc(Operation that evaluates operation against default sequence -asynchronously. - -@param tensors The tensors to be used in the operation recorded @param -params Template parameters that will be used to initialise Operation -to allow for extensible configurations on initialisation)doc"; - -static const char *__doc_kp_Manager_evalOpAwait = -R"doc(Operation that awaits for named sequence to finish. - -@param sequenceName The name of the sequence to wait for termination -@param waitFor The amount of time to wait before timing out)doc"; - -static const char *__doc_kp_Manager_evalOpAwaitDefault = -R"doc(Operation that awaits for default sequence to finish. 
- -@param tensors The tensors to be used in the operation recorded @param -params Template parameters that will be used to initialise Operation -to allow for extensible configurations on initialisation)doc"; - -static const char *__doc_kp_Manager_evalOpDefault = -R"doc(Function that evaluates operation against a newly created sequence. - -@param tensors The tensors to be used in the operation recorded @param -TArgs Template parameters that will be used to initialise Operation to -allow for extensible configurations on initialisation)doc"; +static const char *__doc_kp_Manager_destroy = R"doc(Destroy the GPU resources and all managed resources by manager.)doc"; static const char *__doc_kp_Manager_mComputeQueueFamilyIndices = R"doc()doc"; static const char *__doc_kp_Manager_mComputeQueues = R"doc()doc"; -static const char *__doc_kp_Manager_mCurrentSequenceIndex = R"doc()doc"; - static const char *__doc_kp_Manager_mDevice = R"doc()doc"; static const char *__doc_kp_Manager_mFreeDevice = R"doc()doc"; @@ -220,190 +233,51 @@ static const char *__doc_kp_Manager_mFreeInstance = R"doc()doc"; static const char *__doc_kp_Manager_mInstance = R"doc()doc"; +static const char *__doc_kp_Manager_mManageResources = R"doc()doc"; + +static const char *__doc_kp_Manager_mManagedAlgorithms = R"doc()doc"; + static const char *__doc_kp_Manager_mManagedSequences = R"doc()doc"; static const char *__doc_kp_Manager_mManagedTensors = R"doc()doc"; static const char *__doc_kp_Manager_mPhysicalDevice = R"doc()doc"; -static const char *__doc_kp_Manager_mPhysicalDeviceIndex = R"doc()doc"; - -static const char *__doc_kp_Manager_rebuild = -R"doc(Function that simplifies the common workflow of tensor initialisation. -It will take the constructor parameters for a Tensor and will will us -it to create a new Tensor. The tensor memory will then be managed and -owned by the manager. 
- -@param tensors Array of tensors to rebuild @param syncDataToGPU -Whether to sync the data to GPU memory)doc"; - -static const char *__doc_kp_Manager_rebuild_2 = -R"doc(Function that simplifies the common workflow of tensor initialisation. -It will take the constructor parameters for a Tensor and will will us -it to create a new Tensor. The tensor memory will then be managed and -owned by the manager. - -@param tensors Single tensor to rebuild @param syncDataToGPU Whether -to sync the data to GPU memory)doc"; - static const char *__doc_kp_Manager_sequence = -R"doc(Get or create a managed Sequence that will be contained by this -manager. If the named sequence does not currently exist, it would be -created and initialised. +R"doc(Create a managed sequence that will be destroyed by this manager if it +hasn't been destroyed by its reference count going to zero. -@param sequenceName The name for the named sequence to be retrieved or -created @param queueIndex The queue to use from the available queues -@return Shared pointer to the manager owned sequence resource)doc"; +@param queueIndex The queue to use from the available queues @returns +Shared pointer with initialised sequence)doc"; static const char *__doc_kp_Manager_tensor = -R"doc(Function that simplifies the common workflow of tensor creation and -initialization. It will take the constructor parameters for a Tensor -and will will us it to create a new Tensor and then create it. The -tensor memory will then be managed and owned by the manager. +R"doc(Create a managed tensor that will be destroyed by this manager if it +hasn't been destroyed by its reference count going to zero. 
@param data The data to initialize the tensor with @param tensorType -The type of tensor to initialize @param syncDataToGPU Whether to sync -the data to GPU memory @returns Initialized Tensor with memory Syncd -to GPU device)doc"; +The type of tensor to initialize @returns Shared pointer with +initialised tensor)doc"; -static const char *__doc_kp_OpAlgoCreate = +static const char *__doc_kp_OpAlgoDispatch = R"doc(Operation that provides a general abstraction that simplifies the use of algorithm and parameter components which can be used with shaders. By default it enables the user to provide a dynamic number of tensors which are then passed as inputs.)doc"; -static const char *__doc_kp_OpAlgoCreate_KomputeWorkgroup = R"doc()doc"; +static const char *__doc_kp_OpAlgoDispatch_OpAlgoDispatch = R"doc()doc"; -static const char *__doc_kp_OpAlgoCreate_KomputeWorkgroup_x = R"doc()doc"; +static const char *__doc_kp_OpAlgoDispatch_mAlgorithm = R"doc()doc"; -static const char *__doc_kp_OpAlgoCreate_KomputeWorkgroup_y = R"doc()doc"; +static const char *__doc_kp_OpAlgoDispatch_mPushConstants = R"doc()doc"; -static const char *__doc_kp_OpAlgoCreate_KomputeWorkgroup_z = R"doc()doc"; - -static const char *__doc_kp_OpAlgoCreate_OpAlgoCreate = R"doc(Base constructor, should not be used unless explicitly intended.)doc"; - -static const char *__doc_kp_OpAlgoCreate_OpAlgoCreate_2 = -R"doc(Default constructor with parameters that provides the bare minimum -requirements for the operations to be able to create and manage their -sub-components. 
- -@param physicalDevice Vulkan physical device used to find device -queues @param device Vulkan logical device for passing to Algorithm -@param commandBuffer Vulkan Command Buffer to record commands into -@param tensors Tensors that are to be used in this operation @param -shaderFilePath Optional parameter to specify the shader to load -(either in spirv or raw format) @param komputeWorkgroup Optional -parameter to specify the layout for processing)doc"; - -static const char *__doc_kp_OpAlgoCreate_OpAlgoCreate_3 = -R"doc(Constructor that enables a file to be passed to the operation with the -contents of the shader. This can be either in raw format or in -compiled SPIR-V binary format. - -@param physicalDevice Vulkan physical device used to find device -queues @param device Vulkan logical device for passing to Algorithm -@param commandBuffer Vulkan Command Buffer to record commands into -@param tensors Tensors that are to be used in this operation @param -shaderFilePath Parameter to specify the shader to load (either in -spirv or raw format) @param komputeWorkgroup Optional parameter to -specify the layout for processing)doc"; - -static const char *__doc_kp_OpAlgoCreate_OpAlgoCreate_4 = -R"doc(Constructor that enables raw shader data to be passed to the main -operation which can be either in raw shader glsl code or in compiled -SPIR-V binary. 
- -@param physicalDevice Vulkan physical device used to find device -queues @param device Vulkan logical device for passing to Algorithm -@param commandBuffer Vulkan Command Buffer to record commands into -@param tensors Tensors that are to be used in this operation @param -shaderDataRaw Optional parameter to specify the shader data either in -binary or raw form @param komputeWorkgroup Optional parameter to -specify the layout for processing)doc"; - -static const char *__doc_kp_OpAlgoCreate_fetchSpirvBinaryData = R"doc()doc"; - -static const char *__doc_kp_OpAlgoCreate_init = -R"doc(The init function is responsible for the initialisation of the -algorithm component based on the parameters specified, and allows for -extensibility on the options provided. Further dependent classes can -perform more specific checks such as ensuring tensors provided are -initialised, etc.)doc"; - -static const char *__doc_kp_OpAlgoCreate_mAlgorithm = R"doc()doc"; - -static const char *__doc_kp_OpAlgoCreate_mFreeAlgorithm = R"doc()doc"; - -static const char *__doc_kp_OpAlgoCreate_mKomputeWorkgroup = R"doc()doc"; - -static const char *__doc_kp_OpAlgoCreate_mShaderDataRaw = -R"doc(< Optional member variable which can be provided to contain either the -raw shader content or the spirv binary content)doc"; - -static const char *__doc_kp_OpAlgoCreate_mShaderFilePath = -R"doc(< Optional member variable which can be provided for the OpAlgoCreate to -find the data automatically and load for processing)doc"; - -static const char *__doc_kp_OpAlgoCreate_postEval = +static const char *__doc_kp_OpAlgoDispatch_postEval = R"doc(Executes after the recorded commands are submitted, and performs a copy of the GPU Device memory into the staging buffer so the output data can be retrieved.)doc"; -static const char *__doc_kp_OpAlgoCreate_preEval = R"doc(Does not perform any preEval commands.)doc"; +static const char *__doc_kp_OpAlgoDispatch_preEval = R"doc(Does not perform any preEval commands.)doc"; -static 
const char *__doc_kp_OpAlgoCreate_record = -R"doc(This records the commands that are to be sent to the GPU. This -includes the barriers that ensure the memory has been copied before -going in and out of the shader, as well as the dispatch operation that -sends the shader processing to the gpu. This function also records the -GPU memory copy of the output data for the staging buffer so it can be -read by the host.)doc"; - -static const char *__doc_kp_OpAlgoLhsRhsOut = -R"doc(Operation base class to simplify the creation of operations that -require right hand and left hand side datapoints together with a -single output. The expected data passed is two input tensors and one -output tensor.)doc"; - -static const char *__doc_kp_OpAlgoLhsRhsOut_OpAlgoLhsRhsOut = R"doc(Base constructor, should not be used unless explicitly intended.)doc"; - -static const char *__doc_kp_OpAlgoLhsRhsOut_OpAlgoLhsRhsOut_2 = -R"doc(Default constructor with parameters that provides the bare minimum -requirements for the operations to be able to create and manage their -sub-components. 
- -@param physicalDevice Vulkan physical device used to find device -queues @param device Vulkan logical device for passing to Algorithm -@param commandBuffer Vulkan Command Buffer to record commands into -@param tensors Tensors that are to be used in this operation @param -freeTensors Whether operation manages the memory of the Tensors @param -komputeWorkgroup Optional parameter to specify the layout for -processing)doc"; - -static const char *__doc_kp_OpAlgoLhsRhsOut_init = -R"doc(The init function is responsible for ensuring that all of the tensors -provided are aligned with requirements such as LHS, RHS and Output -tensors, and creates the algorithm component which processes the -computation.)doc"; - -static const char *__doc_kp_OpAlgoLhsRhsOut_mTensorLHS = -R"doc(< Reference to the parameter used in the left hand side equation of -the shader)doc"; - -static const char *__doc_kp_OpAlgoLhsRhsOut_mTensorOutput = -R"doc(< Reference to the parameter used in the output of the shader and will -be copied with a staging vector)doc"; - -static const char *__doc_kp_OpAlgoLhsRhsOut_mTensorRHS = -R"doc(< Reference to the parameter used in the right hand side equation of -the shader)doc"; - -static const char *__doc_kp_OpAlgoLhsRhsOut_postEval = -R"doc(Executes after the recorded commands are submitted, and performs a -copy of the GPU Device memory into the staging buffer so the output -data can be retrieved.)doc"; - -static const char *__doc_kp_OpAlgoLhsRhsOut_record = +static const char *__doc_kp_OpAlgoDispatch_record = R"doc(This records the commands that are to be sent to the GPU. This includes the barriers that ensure the memory has been copied before going in and out of the shader, as well as the dispatch operation that @@ -419,36 +293,6 @@ Operations can perform actions on tensors, and optionally can also own an Algorithm with respective parameters. 
kp::Operations with kp::Algorithms would inherit from kp::OpBaseAlgo.)doc"; -static const char *__doc_kp_OpBase_OpBase = R"doc(Base constructor, should not be used unless explicitly intended.)doc"; - -static const char *__doc_kp_OpBase_OpBase_2 = -R"doc(Default constructor with parameters that provides the bare minimum -requirements for the operations to be able to create and manage their -sub-components. - -@param physicalDevice Vulkan physical device used to find device -queues @param device Vulkan logical device for passing to Algorithm -@param commandBuffer Vulkan Command Buffer to record commands into -@param tensors Tensors that are to be used in this operation)doc"; - -static const char *__doc_kp_OpBase_init = -R"doc(The init function is responsible for setting up all the resources and -should be called after the Operation has been created.)doc"; - -static const char *__doc_kp_OpBase_mCommandBuffer = R"doc(< Vulkan Command Buffer)doc"; - -static const char *__doc_kp_OpBase_mDevice = R"doc(< Vulkan Logical Device)doc"; - -static const char *__doc_kp_OpBase_mFreeTensors = -R"doc(< Explicit boolean that specifies whether the < tensors are freed (if -they are managed))doc"; - -static const char *__doc_kp_OpBase_mPhysicalDevice = R"doc(< Vulkan Physical Device)doc"; - -static const char *__doc_kp_OpBase_mTensors = -R"doc(< Tensors referenced by operation that can be managed < optionally by -operation)doc"; - static const char *__doc_kp_OpBase_postEval = R"doc(Post eval is called after the Sequence has called eval and submitted the commands to the GPU for processing, and can be used to perform any @@ -474,9 +318,7 @@ static const char *__doc_kp_OpMult = R"doc(Operation that performs multiplication on two tensors and outpus on third tensor.)doc"; -static const char *__doc_kp_OpMult_OpMult = R"doc(Base constructor, should not be used unless explicitly intended.)doc"; - -static const char *__doc_kp_OpMult_OpMult_2 = +static const char *__doc_kp_OpMult_OpMult = 
R"doc(Default constructor with parameters that provides the bare minimum requirements for the operations to be able to create and manage their sub-components. @@ -494,9 +336,7 @@ the tensors provided, using a record command for all the vectors. This operation does not own/manage the memory of the tensors passed to it. The operation must only receive tensors of type)doc"; -static const char *__doc_kp_OpTensorCopy_OpTensorCopy = R"doc()doc"; - -static const char *__doc_kp_OpTensorCopy_OpTensorCopy_2 = +static const char *__doc_kp_OpTensorCopy_OpTensorCopy = R"doc(Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. @@ -505,10 +345,7 @@ queues @param device Vulkan logical device for passing to Algorithm @param commandBuffer Vulkan Command Buffer to record commands into @param tensors Tensors that will be used to create in operation.)doc"; -static const char *__doc_kp_OpTensorCopy_init = -R"doc(Performs basic checks such as ensuring there are at least two tensors -provided, that they are initialised and that they are not of type -TensorTypes::eStorage.)doc"; +static const char *__doc_kp_OpTensorCopy_mTensors = R"doc()doc"; static const char *__doc_kp_OpTensorCopy_postEval = R"doc(Copies the local vectors for all the tensors to sync the data with the @@ -530,9 +367,7 @@ will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging.)doc"; -static const char *__doc_kp_OpTensorSyncDevice_OpTensorSyncDevice = R"doc()doc"; - -static const char *__doc_kp_OpTensorSyncDevice_OpTensorSyncDevice_2 = +static const char *__doc_kp_OpTensorSyncDevice_OpTensorSyncDevice = R"doc(Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensos provided cannot be of type TensorTypes::eStorage. 
@@ -542,9 +377,7 @@ queues @param device Vulkan logical device for passing to Algorithm @param commandBuffer Vulkan Command Buffer to record commands into @param tensors Tensors that will be used to create in operation.)doc"; -static const char *__doc_kp_OpTensorSyncDevice_init = -R"doc(Performs basic checks such as ensuring that there is at least one -tensor provided with min memory of 1 element.)doc"; +static const char *__doc_kp_OpTensorSyncDevice_mTensors = R"doc()doc"; static const char *__doc_kp_OpTensorSyncDevice_postEval = R"doc(Does not perform any postEval commands.)doc"; @@ -564,9 +397,7 @@ will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging.)doc"; -static const char *__doc_kp_OpTensorSyncLocal_OpTensorSyncLocal = R"doc()doc"; - -static const char *__doc_kp_OpTensorSyncLocal_OpTensorSyncLocal_2 = +static const char *__doc_kp_OpTensorSyncLocal_OpTensorSyncLocal = R"doc(Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensors provided cannot be of type TensorTypes::eStorage. 
@@ -576,9 +407,7 @@ queues @param device Vulkan logical device for passing to Algorithm @param commandBuffer Vulkan Command Buffer to record commands into @param tensors Tensors that will be used to create in operation.)doc"; -static const char *__doc_kp_OpTensorSyncLocal_init = -R"doc(Performs basic checks such as ensuring that there is at least one -tensor provided with min memory of 1 element.)doc"; +static const char *__doc_kp_OpTensorSyncLocal_mTensors = R"doc()doc"; static const char *__doc_kp_OpTensorSyncLocal_postEval = R"doc(For host tensors it performs the map command from the host memory into @@ -593,10 +422,6 @@ the data from its device to staging memory.)doc"; static const char *__doc_kp_Sequence = R"doc(Container of operations that can be sent to GPU as batch)doc"; static const char *__doc_kp_Sequence_Sequence = -R"doc(Base constructor for Sequence. Should not be used unless explicit -intended.)doc"; - -static const char *__doc_kp_Sequence_Sequence_2 = R"doc(Main constructor for sequence which requires core vulkan components to generate all dependent resources. @@ -610,10 +435,18 @@ command buffer. @return Boolean stating whether execution was successful.)doc"; +static const char *__doc_kp_Sequence_clear = +R"doc(Clear function clears all operations currently recorded and starts +recording again.)doc"; + static const char *__doc_kp_Sequence_createCommandBuffer = R"doc()doc"; static const char *__doc_kp_Sequence_createCommandPool = R"doc()doc"; +static const char *__doc_kp_Sequence_destroy = +R"doc(Destroys and frees the GPU resources which include the buffer and +memory and sets the sequence as init=False.)doc"; + static const char *__doc_kp_Sequence_end = R"doc(Ends the recording and stops recording commands when the record command is sent. @@ -622,36 +455,84 @@ command is sent. 
static const char *__doc_kp_Sequence_eval = R"doc(Eval sends all the recorded and stored operations in the vector of +operations into the gpu as a submit job synchronously (with a +barrier). + +@return shared_ptr of the Sequence class itself)doc"; + +static const char *__doc_kp_Sequence_eval_2 = +R"doc(Resets all the recorded and stored operations, records the operation +provided and submits into the gpu as a submit job synchronously (with +a barrier). + +@return shared_ptr of the Sequence class itself)doc"; + +static const char *__doc_kp_Sequence_eval_3 = +R"doc(Eval sends all the recorded and stored operations in the vector of operations into the gpu as a submit job with a barrier. -@return Boolean stating whether execution was successful.)doc"; +@param tensors Vector of tensors to use for the operation @param TArgs +Template parameters that are used to initialise operation which allows +for extensible configurations on initialisation. @return +shared_ptr of the Sequence class itself)doc"; + +static const char *__doc_kp_Sequence_eval_4 = +R"doc(Eval sends all the recorded and stored operations in the vector of +operations into the gpu as a submit job with a barrier. + +@param algorithm Algorithm to use for the record often used for OpAlgo +operations @param TArgs Template parameters that are used to +initialise operation which allows for extensible configurations on +initialisation. @return shared_ptr of the Sequence class +itself)doc"; static const char *__doc_kp_Sequence_evalAsync = R"doc(Eval Async sends all the recorded and stored operations in the vector -of operations into the gpu as a submit job with a barrier. EvalAwait() -must be called after to ensure the sequence is terminated correctly. +of operations into the gpu as a submit job without a barrier. +EvalAwait() must ALWAYS be called after to ensure the sequence is +terminated correctly. 
@return Boolean stating whether execution was successful.)doc"; +static const char *__doc_kp_Sequence_evalAsync_2 = +R"doc(Clears current operations to record provided one in the vector of +operations into the gpu as a submit job without a barrier. EvalAwait() +must ALWAYS be called after to ensure the sequence is terminated +correctly. + +@return Boolean stating whether execution was successful.)doc"; + +static const char *__doc_kp_Sequence_evalAsync_3 = +R"doc(Eval sends all the recorded and stored operations in the vector of +operations into the gpu as a submit job with a barrier. + +@param tensors Vector of tensors to use for the operation @param TArgs +Template parameters that are used to initialise operation which allows +for extensible configurations on initialisation. @return +shared_ptr of the Sequence class itself)doc"; + +static const char *__doc_kp_Sequence_evalAsync_4 = +R"doc(Eval sends all the recorded and stored operations in the vector of +operations into the gpu as a submit job with a barrier. + +@param algorithm Algorithm to use for the record often used for OpAlgo +operations @param TArgs Template parameters that are used to +initialise operation which allows for extensible configurations on +initialisation. @return shared_ptr of the Sequence class +itself)doc"; + static const char *__doc_kp_Sequence_evalAwait = R"doc(Eval Await waits for the fence to finish processing and then once it finishes, it runs the postEval of all operations. @param waitFor Number of milliseconds to wait before timing out.
-@return Boolean stating whether execution was successful.)doc"; - -static const char *__doc_kp_Sequence_freeMemoryDestroyGPUResources = -R"doc(Destroys and frees the GPU resources which include the buffer and -memory and sets the sequence as init=False.)doc"; - -static const char *__doc_kp_Sequence_init = -R"doc(Initialises sequence including the creation of the command pool and -the command buffer.)doc"; +@return shared_ptr of the Sequence class itself)doc"; static const char *__doc_kp_Sequence_isInit = -R"doc(Returns true if the sequence has been successfully initialised. +R"doc(Returns true if the sequence has been initialised, and it's based on +the GPU resources being referenced. -@return Boolean stating if sequence has been initialised.)doc"; +@return Boolean stating if is initialized)doc"; static const char *__doc_kp_Sequence_isRecording = R"doc(Returns true if the sequence is currently in recording activated. @@ -678,8 +559,6 @@ static const char *__doc_kp_Sequence_mFreeCommandBuffer = R"doc()doc"; static const char *__doc_kp_Sequence_mFreeCommandPool = R"doc()doc"; -static const char *__doc_kp_Sequence_mIsInit = R"doc()doc"; - static const char *__doc_kp_Sequence_mIsRunning = R"doc()doc"; static const char *__doc_kp_Sequence_mOperations = R"doc()doc"; @@ -696,9 +575,66 @@ This template requires classes to be derived from the OpBase class. This function also requires the Sequence to be recording, otherwise it will not be able to add the operation. +@param op Object derived from kp::OpBase that will be recorded by the +sequence which will be used when the operation is evaluated. @return +shared_ptr of the Sequence class itself)doc"; + +static const char *__doc_kp_Sequence_record_2 = +R"doc(Record function for operation to be added to the GPU queue in batch. +This template requires classes to be derived from the OpBase class. +This function also requires the Sequence to be recording, otherwise it +will not be able to add the operation.
+ @param tensors Vector of tensors to use for the operation @param TArgs Template parameters that are used to initialise operation which allows -for extensible configurations on initialisation.)doc"; +for extensible configurations on initialisation. @return +shared_ptr of the Sequence class itself)doc"; + +static const char *__doc_kp_Sequence_record_3 = +R"doc(Record function for operation to be added to the GPU queue in batch. +This template requires classes to be derived from the OpBase class. +This function also requires the Sequence to be recording, otherwise it +will not be able to add the operation. + +@param algorithm Algorithm to use for the record often used for OpAlgo +operations @param TArgs Template parameters that are used to +initialise operation which allows for extensible configurations on +initialisation. @return shared_ptr of the Sequence class +itself)doc"; + +static const char *__doc_kp_Sequence_rerecord = +R"doc(Clears command buffer and triggers re-record of all the current +operations saved, which is useful if the underlying kp::Tensors or +kp::Algorithms are modified and need to be re-recorded.)doc"; + +static const char *__doc_kp_Shader = R"doc(Shader utily class with functions to compile and process glsl files.)doc"; + +static const char *__doc_kp_Shader_compile_source = +R"doc(Compile a single glslang source from string value. Currently this +function uses the glslang C++ interface which is not thread safe so +this funciton should not be called from multiple threads concurrently. +If you have a online shader processing multithreading use-case that +can't use offline compilation please open an issue. 
+ +@param source An individual raw glsl shader in string format @param +entryPoint The function name to use as entry point @param definitions +List of pairs containing key value definitions @param resourcesLimit A +list that contains the resource limits for the GLSL compiler @return +The compiled SPIR-V binary in unsigned int32 format)doc"; + +static const char *__doc_kp_Shader_compile_sources = +R"doc(Compile multiple sources with optional filenames. Currently this +function uses the glslang C++ interface which is not thread safe so +this funciton should not be called from multiple threads concurrently. +If you have a online shader processing multithreading use-case that +can't use offline compilation please open an issue. + +@param sources A list of raw glsl shaders in string format @param +files A list of file names respective to each of the sources @param +entryPoint The function name to use as entry point @param definitions +List of pairs containing key value definitions @param resourcesLimit A +list that contains the resource limits for the GLSL compiler @return +The compiled SPIR-V binary in unsigned int32 format)doc"; static const char *__doc_kp_Tensor = R"doc(Structured data used in GPU operations. @@ -708,9 +644,7 @@ across GPUs. Each tensor would have a respective Vulkan memory and buffer, which would be used to store their respective data. The tensors can be used for GPU data storage or transfer.)doc"; -static const char *__doc_kp_Tensor_Tensor = R"doc(Base constructor, should not be used unless explicitly intended.)doc"; - -static const char *__doc_kp_Tensor_Tensor_2 = +static const char *__doc_kp_Tensor_Tensor = R"doc(Default constructor with data provided which would be used to create the respective vulkan buffer and memory. @@ -741,8 +675,6 @@ without exposing it. 
@return Descriptor buffer info with own buffer)doc"; -static const char *__doc_kp_Tensor_copyBuffer = R"doc()doc"; - static const char *__doc_kp_Tensor_createBuffer = R"doc()doc"; static const char *__doc_kp_Tensor_data = @@ -753,7 +685,7 @@ memory. @return Reference to vector of elements representing the data in the tensor.)doc"; -static const char *__doc_kp_Tensor_freeMemoryDestroyGPUResources = +static const char *__doc_kp_Tensor_destroy = R"doc(Destroys and frees the GPU resources which include the buffer and memory.)doc"; @@ -765,17 +697,7 @@ static const char *__doc_kp_Tensor_getStagingBufferUsageFlags = R"doc()doc"; static const char *__doc_kp_Tensor_getStagingMemoryPropertyFlags = R"doc()doc"; -static const char *__doc_kp_Tensor_init = -R"doc(Initialiser which calls the initialisation for all the respective -tensors as well as creates the respective staging tensors. The staging -tensors would only be created for the tensors of type -TensorType::eDevice as otherwise there is no need to copy from host -memory.)doc"; - -static const char *__doc_kp_Tensor_isInit = -R"doc(Returns true if the tensor initialisation function has been carried -out successful, which would mean that the buffer and memory will have -been provisioned.)doc"; +static const char *__doc_kp_Tensor_isInit = R"doc()doc"; static const char *__doc_kp_Tensor_mData = R"doc()doc"; @@ -789,16 +711,12 @@ static const char *__doc_kp_Tensor_mFreeStagingBuffer = R"doc()doc"; static const char *__doc_kp_Tensor_mFreeStagingMemory = R"doc()doc"; -static const char *__doc_kp_Tensor_mIsInit = R"doc()doc"; - static const char *__doc_kp_Tensor_mPhysicalDevice = R"doc()doc"; static const char *__doc_kp_Tensor_mPrimaryBuffer = R"doc()doc"; static const char *__doc_kp_Tensor_mPrimaryMemory = R"doc()doc"; -static const char *__doc_kp_Tensor_mShape = R"doc()doc"; - static const char *__doc_kp_Tensor_mStagingBuffer = R"doc()doc"; static const char *__doc_kp_Tensor_mStagingMemory = R"doc()doc"; @@ -823,6 +741,13 @@ 
vector's. @param i The index where the element will be returned from. @return Returns the element in the position requested.)doc"; +static const char *__doc_kp_Tensor_rebuild = +R"doc(Initialiser which calls the initialisation for all the respective +tensors as well as creates the respective staging tensors. The staging +tensors would only be created for the tensors of type +TensorType::eDevice as otherwise there is no need to copy from host +memory.)doc"; + static const char *__doc_kp_Tensor_recordBufferMemoryBarrier = R"doc(Records the buffer memory barrier into the command buffer which ensures that relevant data transfers are carried out correctly. @@ -833,6 +758,8 @@ dstAccessMask Access flags for destination access mask @param scrStageMask Pipeline stage flags for source stage mask @param dstStageMask Pipeline stage flags for destination stage mask)doc"; +static const char *__doc_kp_Tensor_recordCopyBuffer = R"doc()doc"; + static const char *__doc_kp_Tensor_recordCopyFrom = R"doc(Records a copy from the memory of the tensor provided to the current thensor. This is intended to pass memory into a processing, to perform @@ -865,13 +792,6 @@ static const char *__doc_kp_Tensor_setData = R"doc(Sets / resets the vector data of the tensor. This function does not perform any copies into GPU memory and is only performed on the host.)doc"; -static const char *__doc_kp_Tensor_shape = -R"doc(Returns the shape of the tensor, which includes the number of -dimensions and the size per dimension. - -@return Array containing the sizes for each dimension. 
Zero means -respective dimension is not active.)doc"; - static const char *__doc_kp_Tensor_size = R"doc(Returns the size/magnitude of the Tensor, which will be the total number of elements across all dimensions diff --git a/python/src/main.cpp b/python/src/main.cpp index 8aac68c98..f13347aa8 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -135,7 +135,10 @@ PYBIND11_MODULE(kp, m) { py::class_>(m, "Manager") .def(py::init()) .def(py::init()) - .def(py::init&>()) + .def(py::init&,const std::vector&>(), + py::arg("device") = 0, + py::arg("family_queue_indices") = std::vector(), + py::arg("desired_extensions") = std::vector()) .def("sequence", &kp::Manager::sequence, py::arg("queueIndex") = 0) .def("tensor", [np](kp::Manager& self, const py::array_t data, @@ -151,15 +154,16 @@ PYBIND11_MODULE(kp, m) { const std::vector>& tensors, const py::bytes& spirv, const kp::Workgroup& workgroup, - const kp::Constants& spec_consts) { + const kp::Constants& spec_consts, + const kp::Constants& push_consts) { py::buffer_info info(py::buffer(spirv).request()); const char *data = reinterpret_cast(info.ptr); size_t length = static_cast(info.size); std::vector spirvVec((uint32_t*)data, (uint32_t*)(data + length)); - return self.algorithm(tensors, spirvVec, workgroup, spec_consts); + return self.algorithm(tensors, spirvVec, workgroup, spec_consts, push_consts); }, "Algorithm initialisation function", - py::arg("tensors"), py::arg("spirv"), py::arg("workgroup") = kp::Workgroup(), py::arg("spec_consts") = kp::Constants()); + py::arg("tensors"), py::arg("spirv"), py::arg("workgroup") = kp::Workgroup(), py::arg("spec_consts") = kp::Constants(), py::arg("push_consts") = kp::Constants()); #ifdef VERSION_INFO m.attr("__version__") = VERSION_INFO; diff --git a/python/test/test_kompute.py b/python/test/test_kompute.py index 865f72d92..47887930a 100644 --- a/python/test/test_kompute.py +++ b/python/test/test_kompute.py @@ -72,11 +72,11 @@ def test_end_to_end(): push_consts_a = [2] 
push_consts_b = [3] - algo = mgr.algorithm(params, kp.Shader.compile_source(shader), workgroup, spec_consts) + algo = mgr.algorithm(params, kp.Shader.compile_source(shader), workgroup, spec_consts, push_consts_a) (mgr.sequence() .record(kp.OpTensorSyncDevice(params)) - .record(kp.OpAlgoDispatch(algo, push_consts_a)) + .record(kp.OpAlgoDispatch(algo)) .record(kp.OpAlgoDispatch(algo, push_consts_b)) .eval()) @@ -206,11 +206,11 @@ def test_pushconsts(): tensor = mgr.tensor([0, 0, 0]) - algo = mgr.algorithm([tensor], spirv, (1, 1, 1)) + algo = mgr.algorithm([tensor], spirv, (1, 1, 1), [], [0.1, 0.2, 0.3]) (mgr.sequence() .record(kp.OpTensorSyncDevice([tensor])) - .record(kp.OpAlgoDispatch(algo, [0.1, 0.2, 0.3])) + .record(kp.OpAlgoDispatch(algo)) .record(kp.OpAlgoDispatch(algo, [0.3, 0.2, 0.1])) .record(kp.OpTensorSyncLocal([tensor])) .eval()) diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index 7b67e2024..607928f0c 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -735,124 +735,18 @@ extern py::object kp_debug, kp_info, kp_warning, kp_error; namespace kp { -// The default resource limit for the GLSL compiler, can be overwritten -// Has been adobted by: -// https://github.com/KhronosGroup/glslang/blob/master/StandAlone/ResourceLimits.cpp -const TBuiltInResource defaultResource = { - /* .MaxLights = */ 0, - /* .MaxClipPlanes = */ 0, - /* .MaxTextureUnits = */ 0, - /* .MaxTextureCoords = */ 0, - /* .MaxVertexAttribs = */ 64, - /* .MaxVertexUniformComponents = */ 4096, - /* .MaxVaryingFloats = */ 64, - /* .MaxVertexTextureImageUnits = */ 0, - /* .MaxCombinedTextureImageUnits = */ 0, - /* .MaxTextureImageUnits = */ 0, - /* .MaxFragmentUniformComponents = */ 0, - /* .MaxDrawBuffers = */ 0, - /* .MaxVertexUniformVectors = */ 128, - /* .MaxVaryingVectors = */ 8, - /* .MaxFragmentUniformVectors = */ 0, - /* .MaxVertexOutputVectors = */ 16, - /* .MaxFragmentInputVectors = */ 0, - /* 
.MinProgramTexelOffset = */ -8, - /* .MaxProgramTexelOffset = */ 7, - /* .MaxClipDistances = */ 8, - /* .MaxComputeWorkGroupCountX = */ 65535, - /* .MaxComputeWorkGroupCountY = */ 65535, - /* .MaxComputeWorkGroupCountZ = */ 65535, - /* .MaxComputeWorkGroupSizeX = */ 1024, - /* .MaxComputeWorkGroupSizeY = */ 1024, - /* .MaxComputeWorkGroupSizeZ = */ 64, - /* .MaxComputeUniformComponents = */ 1024, - /* .MaxComputeTextureImageUnits = */ 16, - /* .MaxComputeImageUniforms = */ 8, - /* .MaxComputeAtomicCounters = */ 8, - /* .MaxComputeAtomicCounterBuffers = */ 1, - /* .MaxVaryingComponents = */ 60, - /* .MaxVertexOutputComponents = */ 64, - /* .MaxGeometryInputComponents = */ 64, - /* .MaxGeometryOutputComponents = */ 128, - /* .MaxFragmentInputComponents = */ 0, - /* .MaxImageUnits = */ 0, - /* .MaxCombinedImageUnitsAndFragmentOutputs = */ 0, - /* .MaxCombinedShaderOutputResources = */ 8, - /* .MaxImageSamples = */ 0, - /* .MaxVertexImageUniforms = */ 0, - /* .MaxTessControlImageUniforms = */ 0, - /* .MaxTessEvaluationImageUniforms = */ 0, - /* .MaxGeometryImageUniforms = */ 0, - /* .MaxFragmentImageUniforms = */ 0, - /* .MaxCombinedImageUniforms = */ 0, - /* .MaxGeometryTextureImageUnits = */ 0, - /* .MaxGeometryOutputVertices = */ 256, - /* .MaxGeometryTotalOutputComponents = */ 1024, - /* .MaxGeometryUniformComponents = */ 1024, - /* .MaxGeometryVaryingComponents = */ 64, - /* .MaxTessControlInputComponents = */ 128, - /* .MaxTessControlOutputComponents = */ 128, - /* .MaxTessControlTextureImageUnits = */ 0, - /* .MaxTessControlUniformComponents = */ 1024, - /* .MaxTessControlTotalOutputComponents = */ 4096, - /* .MaxTessEvaluationInputComponents = */ 128, - /* .MaxTessEvaluationOutputComponents = */ 128, - /* .MaxTessEvaluationTextureImageUnits = */ 16, - /* .MaxTessEvaluationUniformComponents = */ 1024, - /* .MaxTessPatchComponents = */ 120, - /* .MaxPatchVertices = */ 32, - /* .MaxTessGenLevel = */ 64, - /* .MaxViewports = */ 16, - /* .MaxVertexAtomicCounters = 
*/ 0, - /* .MaxTessControlAtomicCounters = */ 0, - /* .MaxTessEvaluationAtomicCounters = */ 0, - /* .MaxGeometryAtomicCounters = */ 0, - /* .MaxFragmentAtomicCounters = */ 0, - /* .MaxCombinedAtomicCounters = */ 8, - /* .MaxAtomicCounterBindings = */ 1, - /* .MaxVertexAtomicCounterBuffers = */ 0, - /* .MaxTessControlAtomicCounterBuffers = */ 0, - /* .MaxTessEvaluationAtomicCounterBuffers = */ 0, - /* .MaxGeometryAtomicCounterBuffers = */ 0, - /* .MaxFragmentAtomicCounterBuffers = */ 0, - /* .MaxCombinedAtomicCounterBuffers = */ 1, - /* .MaxAtomicCounterBufferSize = */ 16384, - /* .MaxTransformFeedbackBuffers = */ 4, - /* .MaxTransformFeedbackInterleavedComponents = */ 64, - /* .MaxCullDistances = */ 8, - /* .MaxCombinedClipAndCullDistances = */ 8, - /* .MaxSamples = */ 4, - /* .maxMeshOutputVerticesNV = */ 256, - /* .maxMeshOutputPrimitivesNV = */ 512, - /* .maxMeshWorkGroupSizeX_NV = */ 32, - /* .maxMeshWorkGroupSizeY_NV = */ 1, - /* .maxMeshWorkGroupSizeZ_NV = */ 1, - /* .maxTaskWorkGroupSizeX_NV = */ 32, - /* .maxTaskWorkGroupSizeY_NV = */ 1, - /* .maxTaskWorkGroupSizeZ_NV = */ 1, - /* .maxMeshViewCountNV = */ 4, - /* .maxDualSourceDrawBuffersEXT = */ 1, - - /* .limits = */ - { - /* .nonInductiveForLoops = */ 1, - /* .whileLoops = */ 1, - /* .doWhileLoops = */ 1, - /* .generalUniformIndexing = */ 1, - /* .generalAttributeMatrixVectorIndexing = */ 1, - /* .generalVaryingIndexing = */ 1, - /* .generalSamplerIndexing = */ 1, - /* .generalVariableIndexing = */ 1, - /* .generalConstantMatrixVectorIndexing = */ 1, - } -}; - /** Shader utily class with functions to compile and process glsl files. */ class Shader { public: + + // The default resource limit for the GLSL compiler, can be overwritten + // Has been adopted by: + // https://github.com/KhronosGroup/glslang/blob/master/StandAlone/ResourceLimits.cpp + const static TBuiltInResource defaultResource; + /** * Compile multiple sources with optional filenames. 
Currently this function * uses the glslang C++ interface which is not thread safe so this funciton @@ -873,7 +767,7 @@ class Shader const std::vector& files = {}, const std::string& entryPoint = "main", std::vector> definitions = {}, - const TBuiltInResource& resources = defaultResource); + const TBuiltInResource& resources = Shader::defaultResource); /** * Compile a single glslang source from string value. Currently this @@ -893,7 +787,7 @@ class Shader const std::string& source, const std::string& entryPoint = "main", std::vector> definitions = {}, - const TBuiltInResource& resources = defaultResource); + const TBuiltInResource& resources = Shader::defaultResource); }; } @@ -1125,31 +1019,46 @@ class Algorithm { public: /** - * Default constructor for Algorithm + * Main constructor for algorithm with configuration parameters to create + * the underlying resources. * * @param device The Vulkan device to use for creating resources - * @param commandBuffer The vulkan command buffer to bind the pipeline and - * shaders + * @param tensors (optional) The tensors to use to create the descriptor resources + * @param spirv (optional) The spirv code to use to create the algorithm + * @param workgroup (optional) The kp::Workgroup to use for the dispatch which defaults to + * kp::Workgroup(tensor[0].size(), 1, 1) if not set. + * @param specializationConstants (optional) The kp::Constants to use to initialize + * the specialization constants which cannot be changed once set. + * @param pushConstants (optional) The kp::Constants to use when initializing the + * pipeline, which set the size of the push constants - these can be modified but + * all new values must have the same vector size as this initial value. 
*/ Algorithm(std::shared_ptr device, const std::vector>& tensors = {}, const std::vector& spirv = {}, const Workgroup& workgroup = {}, - const Constants& specializationConstants = {}); + const Constants& specializationConstants = {}, + const Constants& pushConstants = {}); /** - * Initialiser for the shader data provided to the algorithm as well as - * tensor parameters that will be used in shader. + * Rebuild function to reconstruct algorithm with configuration parameters to create + * the underlying resources. * - * @param shaderFileData The bytes in spir-v format of the shader - * @tensorParams The Tensors to be used in the Algorithm / shader for - * @specalizationInstalces The specialization parameters to pass to the - * function processing + * @param tensors The tensors to use to create the descriptor resources + * @param spirv The spirv code to use to create the algorithm + * @param workgroup (optional) The kp::Workgroup to use for the dispatch which defaults to + * kp::Workgroup(tensor[0].size(), 1, 1) if not set. + * @param specializationConstants (optional) The kp::Constants to use to initialize + * the specialization constants which cannot be changed once set. + * @param pushConstants (optional) The kp::Constants to use when initializing the + * pipeline, which set the size of the push constants - these can be modified but + * all new values must have the same vector size as this initial value. */ void rebuild(const std::vector>& tensors, const std::vector& spirv, const Workgroup& workgroup = {}, - const Constants& specializationConstants = {}); + const Constants& specializationConstants = {}, + const Constants& pushConstants = {}); /** * Destructor for Algorithm which is responsible for freeing and desroying @@ -1161,23 +1070,77 @@ class Algorithm * Records the dispatch function with the provided template parameters or * alternatively using the size of the tensor by default. 
* - * @param x Layout X dispatch value - * @param y Layout Y dispatch value - * @param z Layout Z dispatch value + * @param commandBuffer Command buffer to record the algorithm resources to */ void recordDispatch(const vk::CommandBuffer& commandBuffer); - void bindCore(const vk::CommandBuffer& commandBuffer); + /** + * Records command that binds the "core" algorithm components which consist of + * binding the pipeline and binding the descriptorsets. + * + * @param commandBuffer Command buffer to record the algorithm resources to + */ + void recordBindCore(const vk::CommandBuffer& commandBuffer); - void bindPush(const vk::CommandBuffer& commandBuffer, - const Constants& pushConstants); + /** + * Records command that binds the push constants to the command buffer provided + * - it is required that the pushConstants provided are of the same size as the + * ones provided during initialization. + * + * @param commandBuffer Command buffer to record the algorithm resources to + */ + void recordBindPush(const vk::CommandBuffer& commandBuffer); + /** + * function that checks all the gpu resource components to verify if these have + * been created and returns true if all are valid. + * + * @returns returns true if the algorithm is currently initialized. + */ bool isInit(); + /** + * Sets the work group to use in the recordDispatch + * + * @param workgroup The kp::Workgroup value to use to update the algorithm. It + * must have a value greater than 1 on the x value (index 1) otherwise it will + * be initialized on the size of the first tensor (ie. this->mTensor[0]->size()) + */ void setWorkgroup(const Workgroup& workgroup, uint32_t minSize = 1); + /** + * Sets the push constants to the new value provided to use in the next bindPush() + * + * @param The kp::Constant to use to set the push constants to use in the next + * bindPush(...) calls. The constants provided must be of the same size as the + * ones created during initialization. 
+ */ void setPush(const Constants& pushConstants); + /** + * Gets the current workgroup from the algorithm. + * + * @returns The kp::Workgroup currently set in the algorithm, which is the + * value that recordDispatch(...) uses; see setWorkgroup(...) for how it + * can be updated. + */ const Workgroup& getWorkgroup(); + /** + * Gets the specialization constants of the current algorithm. + * + * @returns The kp::Constants currently set for specialization constants + */ const Constants& getSpecializationConstants(); + /** + * Gets the push constants of the current algorithm. + * + * @returns The kp::Constants currently set for push constants + */ const Constants& getPush(); + /** + * Gets the current tensors that are used in the algorithm. + * + * @returns The list of tensors used in the algorithm. + */ const std::vector>& getTensors(); void destroy(); @@ -1206,10 +1169,9 @@ class Algorithm // -------------- ALWAYS OWNED RESOURCES std::vector mSpirv; Constants mSpecializationConstants; + Constants mPushConstants; Workgroup mWorkgroup; - bool mIsInit; - // Create util functions void createShaderModule(); void createPipeline(); @@ -1539,6 +1501,14 @@ class Sequence : public std::enable_shared_from_this ~Sequence(); /** + * Record function for operation to be added to the GPU queue in batch. This + * template requires classes to be derived from the OpBase class. This + * function also requires the Sequence to be recording, otherwise it will + * not be able to add the operation. + * + * @param op Object derived from kp::OpBase that will be recorded by the sequence + * which will be used when the operation is evaluated.
+ * @return shared_ptr of the Sequence class itself */ std::shared_ptr record(std::shared_ptr op); @@ -1551,6 +1521,7 @@ class Sequence : public std::enable_shared_from_this * @param tensors Vector of tensors to use for the operation * @param TArgs Template parameters that are used to initialise operation * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself */ template std::shared_ptr record( @@ -1559,6 +1530,18 @@ class Sequence : public std::enable_shared_from_this std::shared_ptr op{ new T(tensors, std::forward(params)...) }; return this->record(op); } + /** + * Record function for operation to be added to the GPU queue in batch. This + * template requires classes to be derived from the OpBase class. This + * function also requires the Sequence to be recording, otherwise it will + * not be able to add the operation. + * + * @param algorithm Algorithm to use for the record often used for OpAlgo + * operations + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ template std::shared_ptr record(std::shared_ptr algorithm, TArgs&&... params) @@ -1570,21 +1553,29 @@ class Sequence : public std::enable_shared_from_this /** * Eval sends all the recorded and stored operations in the vector of - * operations into the gpu as a submit job with a barrier. + * operations into the gpu as a submit job synchronously (with a barrier). * * @return shared_ptr of the Sequence class itself */ std::shared_ptr eval(); + /** + * Resets all the recorded and stored operations, records the operation + * provided and submits into the gpu as a submit job synchronously (with a barrier). 
+ * + * @return shared_ptr of the Sequence class itself + */ std::shared_ptr eval(std::shared_ptr op); /** * Eval sends all the recorded and stored operations in the vector of * operations into the gpu as a submit job with a barrier. * + * @param tensors Vector of tensors to use for the operation + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. * @return shared_ptr of the Sequence class itself */ - // TODO: Aim to have only a single function with tensors/algorithm template std::shared_ptr eval(std::vector> tensors, TArgs&&... params) @@ -1592,6 +1583,16 @@ class Sequence : public std::enable_shared_from_this std::shared_ptr op{ new T(tensors, std::forward(params)...) }; return this->eval(op); } + /** + * Eval sends all the recorded and stored operations in the vector of + * operations into the gpu as a submit job with a barrier. + * + * @param algorithm Algorithm to use for the record often used for OpAlgo + * operations + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ template std::shared_ptr eval(std::shared_ptr algorithm, TArgs&&... params) @@ -1603,18 +1604,27 @@ class Sequence : public std::enable_shared_from_this /** * Eval Async sends all the recorded and stored operations in the vector of - * operations into the gpu as a submit job with a barrier. EvalAwait() must - * be called after to ensure the sequence is terminated correctly. + * operations into the gpu as a submit job without a barrier. EvalAwait() must + * ALWAYS be called after to ensure the sequence is terminated correctly. * * @return Boolean stating whether execution was successful. */ std::shared_ptr evalAsync(); + /** + * Clears current operations to record provided one in the vector of + * operations into the gpu as a submit job without a barrier. 
EvalAwait() must + * ALWAYS be called after to ensure the sequence is terminated correctly. + * + * @return shared_ptr of the Sequence class itself + */ std::shared_ptr evalAsync(std::shared_ptr op); - /** * Eval sends all the recorded and stored operations in the vector of * operations into the gpu as a submit job with a barrier. * + * @param tensors Vector of tensors to use for the operation + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. * @return shared_ptr of the Sequence class itself */ template @@ -1625,6 +1635,16 @@ class Sequence : public std::enable_shared_from_this std::shared_ptr op{ new T(tensors, std::forward(params)...) }; return this->evalAsync(op); } + /** + * Eval sends all the recorded and stored operations in the vector of + * operations into the gpu as a submit job with a barrier. + * + * @param algorithm Algorithm to use for the record often used for OpAlgo + * operations + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ template std::shared_ptr evalAsync(std::shared_ptr algorithm, TArgs&&... params) @@ -1639,7 +1659,7 @@ class Sequence : public std::enable_shared_from_this * finishes, it runs the postEval of all operations. * * @param waitFor Number of milliseconds to wait before timing out. - * @return Boolean stating whether execution was successful. + * @return shared_ptr of the Sequence class itself */ std::shared_ptr evalAwait(uint64_t waitFor = UINT64_MAX); @@ -1672,8 +1692,19 @@ class Sequence : public std::enable_shared_from_this */ bool isRecording(); + /** + * Returns true if the sequence has been initialised, and it's based on the + * GPU resources being referenced. 
+ * + * @return Boolean stating if is initialized + */ bool isInit(); + /** + * Clears command buffer and triggers re-record of all the current operations + * saved, which is useful if the underlying kp::Tensors or kp::Algorithms + * are modified and need to be re-recorded. + */ void rerecord(); /** @@ -1738,18 +1769,17 @@ class Manager Manager(); /** - * Similar to base constructor but allows the user to provide the device - * they would like to create the resources on. + * Similar to base constructor but allows for further configuration to use when + * creating the Vulkan resources. * * @param physicalDeviceIndex The index of the physical device to use - * @param manageResources (Optional) Whether to manage the memory of the - * resources created and destroy when the manager is destroyed. * @param familyQueueIndices (Optional) List of queue indices to add for * explicit allocation - * @param totalQueues The total number of compute queues to create. + * @param desiredExtensions The desired extensions to load from physicalDevice */ Manager(uint32_t physicalDeviceIndex, - const std::vector& familyQueueIndices = {}); + const std::vector& familyQueueIndices = {}, + const std::vector& desiredExtensions = {}); /** * Manager constructor which allows your own vulkan application to integrate @@ -1771,39 +1801,55 @@ class Manager ~Manager(); /** - * Get or create a managed Sequence that will be contained by this manager. - * If the named sequence does not currently exist, it would be created and - * initialised. + * Create a managed sequence that will be destroyed by this manager + * if it hasn't been destroyed by its reference count going to zero. 
* - * @param sequenceName The name for the named sequence to be retrieved or - * created * @param queueIndex The queue to use from the available queues - * @return Shared pointer to the manager owned sequence resource + * @returns Shared pointer with initialised sequence */ std::shared_ptr sequence(uint32_t queueIndex = 0); /** - * Function that simplifies the common workflow of tensor creation and - * initialization. It will take the constructor parameters for a Tensor - * and will will us it to create a new Tensor and then create it. The - * tensor memory will then be managed and owned by the manager. + * Create a managed tensor that will be destroyed by this manager + * if it hasn't been destroyed by its reference count going to zero. * * @param data The data to initialize the tensor with * @param tensorType The type of tensor to initialize - * @param syncDataToGPU Whether to sync the data to GPU memory - * @returns Initialized Tensor with memory Syncd to GPU device + * @returns Shared pointer with initialised tensor */ std::shared_ptr tensor( const std::vector& data, Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice); + /** + * Create a managed algorithm that will be destroyed by this manager + * if it hasn't been destroyed by its reference count going to zero. 
+ * + * @param tensors (optional) The tensors to initialise the algorithm with + * @param spirv (optional) The SPIRV bytes for the algorithm to dispatch + * @param workgroup (optional) kp::Workgroup for algorithm to use, and + * defaults to (tensor[0].size(), 1, 1) + * @param specializationConstants (optional) kp::Constant to use for + * specialization constants, and defaults to an empty constant + * @param pushConstants (optional) kp::Constant to use for push constants, + * and defaults to an empty constant + * @returns Shared pointer with initialised algorithm + */ std::shared_ptr algorithm( const std::vector>& tensors = {}, const std::vector& spirv = {}, const Workgroup& workgroup = {}, - const Constants& specializationConstants = {}); + const Constants& specializationConstants = {}, + const Constants& pushConstants = {}); + /** + * Destroy the GPU resources and all managed resources by manager. + **/ void destroy(); + /** + * Run a pseudo-garbage collection to release all the managed resources + * that have been already freed due to these reaching to zero ref count. 
+ **/ void clear(); private: @@ -1834,7 +1880,8 @@ class Manager // Create functions void createInstance(); void createDevice(const std::vector& familyQueueIndices = {}, - uint32_t hysicalDeviceIndex = 0); + uint32_t hysicalDeviceIndex = 0, + const std::vector& desiredExtensions = {}); }; } // End namespace kp diff --git a/src/Algorithm.cpp b/src/Algorithm.cpp index c58c5a228..d5263628b 100644 --- a/src/Algorithm.cpp +++ b/src/Algorithm.cpp @@ -8,7 +8,8 @@ Algorithm::Algorithm(std::shared_ptr device, const std::vector>& tensors, const std::vector& spirv, const Workgroup& workgroup, - const Constants& specializationConstants) + const Constants& specializationConstants, + const Constants& pushConstants) { KP_LOG_DEBUG("Kompute Algorithm Constructor with device"); @@ -19,7 +20,7 @@ Algorithm::Algorithm(std::shared_ptr device, "spirv size: {}", tensors.size(), spirv.size()); - this->rebuild(tensors, spirv, workgroup, specializationConstants); + this->rebuild(tensors, spirv, workgroup, specializationConstants, pushConstants); } else { KP_LOG_INFO("Kompute Algorithm constructor with empty tensors and or " "spirv so not rebuilding vulkan components"); @@ -37,13 +38,15 @@ void Algorithm::rebuild(const std::vector>& tensors, const std::vector& spirv, const Workgroup& workgroup, - const Constants& specializationConstants) + const Constants& specializationConstants, + const Constants& pushConstants) { KP_LOG_DEBUG("Kompute Algorithm rebuild started"); this->mTensors = tensors; this->mSpirv = spirv; this->mSpecializationConstants = specializationConstants; + this->mPushConstants = pushConstants; this->setWorkgroup(workgroup, this->mTensors.size() ? 
this->mTensors[0]->size() : 1); @@ -273,6 +276,16 @@ Algorithm::createPipeline() 1, // Set layout count this->mDescriptorSetLayout.get()); + vk::PushConstantRange pushConstantRange; + if (this->mPushConstants.size()) { + pushConstantRange.setStageFlags(vk::ShaderStageFlagBits::eCompute); + pushConstantRange.setOffset(0); + pushConstantRange.setSize(sizeof(float) * this->mPushConstants.size()); + + pipelineLayoutInfo.setPushConstantRangeCount(1); + pipelineLayoutInfo.setPPushConstantRanges(&pushConstantRange); + } + this->mPipelineLayout = std::make_shared(); this->mDevice->createPipelineLayout( &pipelineLayoutInfo, nullptr, this->mPipelineLayout.get()); @@ -346,7 +359,7 @@ Algorithm::createPipeline() } void -Algorithm::bindCore(const vk::CommandBuffer& commandBuffer) +Algorithm::recordBindCore(const vk::CommandBuffer& commandBuffer) { KP_LOG_DEBUG("Kompute Algorithm binding pipeline"); @@ -364,18 +377,17 @@ Algorithm::bindCore(const vk::CommandBuffer& commandBuffer) } void -Algorithm::bindPush(const vk::CommandBuffer& commandBuffer, - const Constants& pushConstants) +Algorithm::recordBindPush(const vk::CommandBuffer& commandBuffer) { - if (pushConstants.size()) { + if (this->mPushConstants.size()) { KP_LOG_DEBUG("Kompute Algorithm binding push constants size: {}", - pushConstants.size()); + this->mPushConstants.size()); commandBuffer.pushConstants(*this->mPipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, - pushConstants.size() * sizeof(float), - pushConstants.data()); + this->mPushConstants.size() * sizeof(float), + this->mPushConstants.data()); } } @@ -412,6 +424,18 @@ Algorithm::setWorkgroup(const Workgroup& workgroup, uint32_t minSize) this->mWorkgroup[2]); } +void +Algorithm::setPush(const Constants& pushConstants) { + + if (pushConstants.size() != this->mPushConstants.size()) { + throw std::runtime_error(fmt::format("Kompute Algorithm push " + "constant provided is size {} but expected size {}", + pushConstants.size(), this->mPushConstants.size())); + } + 
+ this->mPushConstants = pushConstants; +} + const Workgroup& Algorithm::getWorkgroup() { @@ -424,6 +448,11 @@ Algorithm::getSpecializationConstants() return this->mSpecializationConstants; } +const Constants& +Algorithm::getPush() { + return this->mPushConstants; +} + const std::vector>& Algorithm::getTensors() { diff --git a/src/Manager.cpp b/src/Manager.cpp index 38f67de0d..83676f9ec 100644 --- a/src/Manager.cpp +++ b/src/Manager.cpp @@ -1,9 +1,13 @@ #include #include +#include +#include #include "kompute/Manager.hpp" +#include "fmt/ranges.h" + namespace kp { #if DEBUG @@ -29,12 +33,13 @@ Manager::Manager() {} Manager::Manager(uint32_t physicalDeviceIndex, - const std::vector& familyQueueIndices) + const std::vector& familyQueueIndices, + const std::vector& desiredExtensions) { this->mManageResources = true; this->createInstance(); - this->createDevice(familyQueueIndices, physicalDeviceIndex); + this->createDevice(familyQueueIndices, physicalDeviceIndex, desiredExtensions); } Manager::Manager(std::shared_ptr instance, @@ -146,7 +151,10 @@ Manager::createInstance() applicationInfo.applicationVersion = KOMPUTE_VK_API_VERSION; std::vector applicationExtensions; + +#if DEBUG applicationExtensions.push_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME); +#endif vk::InstanceCreateInfo computeInstanceCreateInfo; computeInstanceCreateInfo.pApplicationInfo = &applicationInfo; @@ -163,8 +171,23 @@ Manager::createInstance() // We'll identify the layers that are supported std::vector validLayerNames; std::vector desiredLayerNames = { - "VK_LAYER_LUNARG_assistant_layer", "VK_LAYER_LUNARG_standard_validation" + "VK_LAYER_LUNARG_assistant_layer", + "VK_LAYER_LUNARG_standard_validation", + "VK_LAYER_KHRONOS_validation", }; + std::vector envLayerNames; + const char* envLayerNamesVal = std::getenv("KOMPUTE_ENV_DEBUG_LAYERS"); + KP_LOG_DEBUG("Kompute Manager adding environment layers: {}", envLayerNamesVal); + if (envLayerNamesVal != NULL && *envLayerNamesVal != '\0') { + std::istringstream 
iss(envLayerNamesVal); + std::istream_iterator beg(iss), end; + envLayerNames = std::vector(beg, end); + for (const std::string& layerName : envLayerNames) { + desiredLayerNames.push_back(layerName.c_str()); + } + KP_LOG_DEBUG("Desired layers: {}", desiredLayerNames); + } + // Identify the valid layer names based on the desiredLayerNames { std::set uniqueLayerNames; @@ -174,6 +197,7 @@ Manager::createInstance() std::string layerName(layerProperties.layerName.data()); uniqueLayerNames.insert(layerName); } + KP_LOG_DEBUG("Available layers: {}", uniqueLayerNames); for (const char* desiredLayerName : desiredLayerNames) { if (uniqueLayerNames.count(desiredLayerName) != 0) { validLayerNames.push_back(desiredLayerName); @@ -182,10 +206,14 @@ Manager::createInstance() } if (validLayerNames.size() > 0) { + KP_LOG_DEBUG("Kompute Manager Initializing instance with valid layers: {}", validLayerNames); computeInstanceCreateInfo.enabledLayerCount = (uint32_t)validLayerNames.size(); computeInstanceCreateInfo.ppEnabledLayerNames = validLayerNames.data(); } + else { + KP_LOG_WARN("Kompute Manager no valid layer names found from desired layer names"); + } #endif #endif @@ -240,7 +268,8 @@ Manager::clear() void Manager::createDevice(const std::vector& familyQueueIndices, - uint32_t physicalDeviceIndex) + uint32_t physicalDeviceIndex, + const std::vector& desiredExtensions) { KP_LOG_DEBUG("Kompute Manager creating Device"); @@ -268,7 +297,7 @@ Manager::createDevice(const std::vector& familyQueueIndices, KP_LOG_INFO("Using physical device index {} found {}", physicalDeviceIndex, - physicalDeviceProperties.deviceName); + physicalDeviceProperties.deviceName.data()); if (!familyQueueIndices.size()) { // Find compute queue @@ -318,9 +347,33 @@ Manager::createDevice(const std::vector& familyQueueIndices, deviceQueueCreateInfos.push_back(deviceQueueCreateInfo); } + KP_LOG_DEBUG("Kompute Manager desired extension layers {}", desiredExtensions); + + std::vector deviceExtensions = 
this->mPhysicalDevice->enumerateDeviceExtensionProperties(); + + std::set uniqueExtensionNames; + for (const vk::ExtensionProperties& ext : deviceExtensions) { + std::string extName(ext.extensionName.data()); + uniqueExtensionNames.insert(extName); + } + KP_LOG_DEBUG("Kompute Manager available extensions {}", uniqueExtensionNames); + std::vector validExtensions; + for (std::string ext : desiredExtensions) { + if (uniqueExtensionNames.count(ext) != 0) { + validExtensions.push_back(ext.c_str()); + } + } + if (desiredExtensions.size() != validExtensions.size()) { + KP_LOG_ERROR("Kompute Manager not all extensions were added: {}", validExtensions); + } + vk::DeviceCreateInfo deviceCreateInfo(vk::DeviceCreateFlags(), deviceQueueCreateInfos.size(), - deviceQueueCreateInfos.data()); + deviceQueueCreateInfos.data(), + {}, + {}, + validExtensions.size(), + validExtensions.data()); this->mDevice = std::make_shared(); physicalDevice.createDevice( @@ -361,13 +414,14 @@ std::shared_ptr Manager::algorithm(const std::vector>& tensors, const std::vector& spirv, const Workgroup& workgroup, - const Constants& specializationConstants) + const Constants& specializationConstants, + const Constants& pushConstants) { KP_LOG_DEBUG("Kompute Manager algorithm creation triggered"); std::shared_ptr algorithm{ new kp::Algorithm( - this->mDevice, tensors, spirv, workgroup, specializationConstants) }; + this->mDevice, tensors, spirv, workgroup, specializationConstants, pushConstants) }; if (this->mManageResources) { this->mManagedAlgorithms.push_back(algorithm); diff --git a/src/OpAlgoDispatch.cpp b/src/OpAlgoDispatch.cpp index 4a30751fb..44908adb3 100644 --- a/src/OpAlgoDispatch.cpp +++ b/src/OpAlgoDispatch.cpp @@ -34,8 +34,12 @@ OpAlgoDispatch::record(const vk::CommandBuffer& commandBuffer) vk::PipelineStageFlagBits::eComputeShader); } - this->mAlgorithm->bindCore(commandBuffer); - this->mAlgorithm->bindPush(commandBuffer, this->mPushConstants); + if (this->mPushConstants.size()) { + 
this->mAlgorithm->setPush(this->mPushConstants); + } + + this->mAlgorithm->recordBindCore(commandBuffer); + this->mAlgorithm->recordBindPush(commandBuffer); this->mAlgorithm->recordDispatch(commandBuffer); } diff --git a/src/Shader.cpp b/src/Shader.cpp index 428b5a667..968e53234 100644 --- a/src/Shader.cpp +++ b/src/Shader.cpp @@ -105,5 +105,114 @@ Shader::compile_source( resource); } +const TBuiltInResource Shader::defaultResource = { + /* .MaxLights = */ 0, + /* .MaxClipPlanes = */ 0, + /* .MaxTextureUnits = */ 0, + /* .MaxTextureCoords = */ 0, + /* .MaxVertexAttribs = */ 64, + /* .MaxVertexUniformComponents = */ 4096, + /* .MaxVaryingFloats = */ 64, + /* .MaxVertexTextureImageUnits = */ 0, + /* .MaxCombinedTextureImageUnits = */ 0, + /* .MaxTextureImageUnits = */ 0, + /* .MaxFragmentUniformComponents = */ 0, + /* .MaxDrawBuffers = */ 0, + /* .MaxVertexUniformVectors = */ 128, + /* .MaxVaryingVectors = */ 8, + /* .MaxFragmentUniformVectors = */ 0, + /* .MaxVertexOutputVectors = */ 16, + /* .MaxFragmentInputVectors = */ 0, + /* .MinProgramTexelOffset = */ -8, + /* .MaxProgramTexelOffset = */ 7, + /* .MaxClipDistances = */ 8, + /* .MaxComputeWorkGroupCountX = */ 65535, + /* .MaxComputeWorkGroupCountY = */ 65535, + /* .MaxComputeWorkGroupCountZ = */ 65535, + /* .MaxComputeWorkGroupSizeX = */ 1024, + /* .MaxComputeWorkGroupSizeY = */ 1024, + /* .MaxComputeWorkGroupSizeZ = */ 64, + /* .MaxComputeUniformComponents = */ 1024, + /* .MaxComputeTextureImageUnits = */ 16, + /* .MaxComputeImageUniforms = */ 8, + /* .MaxComputeAtomicCounters = */ 8, + /* .MaxComputeAtomicCounterBuffers = */ 1, + /* .MaxVaryingComponents = */ 60, + /* .MaxVertexOutputComponents = */ 64, + /* .MaxGeometryInputComponents = */ 64, + /* .MaxGeometryOutputComponents = */ 128, + /* .MaxFragmentInputComponents = */ 0, + /* .MaxImageUnits = */ 0, + /* .MaxCombinedImageUnitsAndFragmentOutputs = */ 0, + /* .MaxCombinedShaderOutputResources = */ 8, + /* .MaxImageSamples = */ 0, + /* 
.MaxVertexImageUniforms = */ 0, + /* .MaxTessControlImageUniforms = */ 0, + /* .MaxTessEvaluationImageUniforms = */ 0, + /* .MaxGeometryImageUniforms = */ 0, + /* .MaxFragmentImageUniforms = */ 0, + /* .MaxCombinedImageUniforms = */ 0, + /* .MaxGeometryTextureImageUnits = */ 0, + /* .MaxGeometryOutputVertices = */ 256, + /* .MaxGeometryTotalOutputComponents = */ 1024, + /* .MaxGeometryUniformComponents = */ 1024, + /* .MaxGeometryVaryingComponents = */ 64, + /* .MaxTessControlInputComponents = */ 128, + /* .MaxTessControlOutputComponents = */ 128, + /* .MaxTessControlTextureImageUnits = */ 0, + /* .MaxTessControlUniformComponents = */ 1024, + /* .MaxTessControlTotalOutputComponents = */ 4096, + /* .MaxTessEvaluationInputComponents = */ 128, + /* .MaxTessEvaluationOutputComponents = */ 128, + /* .MaxTessEvaluationTextureImageUnits = */ 16, + /* .MaxTessEvaluationUniformComponents = */ 1024, + /* .MaxTessPatchComponents = */ 120, + /* .MaxPatchVertices = */ 32, + /* .MaxTessGenLevel = */ 64, + /* .MaxViewports = */ 16, + /* .MaxVertexAtomicCounters = */ 0, + /* .MaxTessControlAtomicCounters = */ 0, + /* .MaxTessEvaluationAtomicCounters = */ 0, + /* .MaxGeometryAtomicCounters = */ 0, + /* .MaxFragmentAtomicCounters = */ 0, + /* .MaxCombinedAtomicCounters = */ 8, + /* .MaxAtomicCounterBindings = */ 1, + /* .MaxVertexAtomicCounterBuffers = */ 0, + /* .MaxTessControlAtomicCounterBuffers = */ 0, + /* .MaxTessEvaluationAtomicCounterBuffers = */ 0, + /* .MaxGeometryAtomicCounterBuffers = */ 0, + /* .MaxFragmentAtomicCounterBuffers = */ 0, + /* .MaxCombinedAtomicCounterBuffers = */ 1, + /* .MaxAtomicCounterBufferSize = */ 16384, + /* .MaxTransformFeedbackBuffers = */ 4, + /* .MaxTransformFeedbackInterleavedComponents = */ 64, + /* .MaxCullDistances = */ 8, + /* .MaxCombinedClipAndCullDistances = */ 8, + /* .MaxSamples = */ 4, + /* .maxMeshOutputVerticesNV = */ 256, + /* .maxMeshOutputPrimitivesNV = */ 512, + /* .maxMeshWorkGroupSizeX_NV = */ 32, + /* 
.maxMeshWorkGroupSizeY_NV = */ 1, + /* .maxMeshWorkGroupSizeZ_NV = */ 1, + /* .maxTaskWorkGroupSizeX_NV = */ 32, + /* .maxTaskWorkGroupSizeY_NV = */ 1, + /* .maxTaskWorkGroupSizeZ_NV = */ 1, + /* .maxMeshViewCountNV = */ 4, + /* .maxDualSourceDrawBuffersEXT = */ 1, + + /* .limits = */ + { + /* .nonInductiveForLoops = */ 1, + /* .whileLoops = */ 1, + /* .doWhileLoops = */ 1, + /* .generalUniformIndexing = */ 1, + /* .generalAttributeMatrixVectorIndexing = */ 1, + /* .generalVaryingIndexing = */ 1, + /* .generalSamplerIndexing = */ 1, + /* .generalVariableIndexing = */ 1, + /* .generalConstantMatrixVectorIndexing = */ 1, + } +}; + } #endif // DKOMPUTE_DISABLE_SHADER_UTILS diff --git a/src/include/kompute/Algorithm.hpp b/src/include/kompute/Algorithm.hpp index 32e5d9bdf..fae9cfd4b 100644 --- a/src/include/kompute/Algorithm.hpp +++ b/src/include/kompute/Algorithm.hpp @@ -14,31 +14,46 @@ class Algorithm { public: /** - * Default constructor for Algorithm + * Main constructor for algorithm with configuration parameters to create + * the underlying resources. * * @param device The Vulkan device to use for creating resources - * @param commandBuffer The vulkan command buffer to bind the pipeline and - * shaders + * @param tensors (optional) The tensors to use to create the descriptor resources + * @param spirv (optional) The spirv code to use to create the algorithm + * @param workgroup (optional) The kp::Workgroup to use for the dispatch which defaults to + * kp::Workgroup(tensor[0].size(), 1, 1) if not set. + * @param specializationConstants (optional) The kp::Constants to use to initialize + * the specialization constants which cannot be changed once set. + * @param pushConstants (optional) The kp::Constants to use when initializing the + * pipeline, which set the size of the push constants - these can be modified but + * all new values must have the same vector size as this initial value. 
*/ Algorithm(std::shared_ptr device, const std::vector>& tensors = {}, const std::vector& spirv = {}, const Workgroup& workgroup = {}, - const Constants& specializationConstants = {}); + const Constants& specializationConstants = {}, + const Constants& pushConstants = {}); /** - * Initialiser for the shader data provided to the algorithm as well as - * tensor parameters that will be used in shader. + * Rebuild function to reconstruct algorithm with configuration parameters to create + * the underlying resources. * - * @param shaderFileData The bytes in spir-v format of the shader - * @tensorParams The Tensors to be used in the Algorithm / shader for - * @specalizationInstalces The specialization parameters to pass to the - * function processing + * @param tensors The tensors to use to create the descriptor resources + * @param spirv The spirv code to use to create the algorithm + * @param workgroup (optional) The kp::Workgroup to use for the dispatch which defaults to + * kp::Workgroup(tensor[0].size(), 1, 1) if not set. + * @param specializationConstants (optional) The kp::Constants to use to initialize + * the specialization constants which cannot be changed once set. + * @param pushConstants (optional) The kp::Constants to use when initializing the + * pipeline, which set the size of the push constants - these can be modified but + * all new values must have the same vector size as this initial value. */ void rebuild(const std::vector>& tensors, const std::vector& spirv, const Workgroup& workgroup = {}, - const Constants& specializationConstants = {}); + const Constants& specializationConstants = {}, + const Constants& pushConstants = {}); /** * Destructor for Algorithm which is responsible for freeing and desroying @@ -50,23 +65,77 @@ class Algorithm * Records the dispatch function with the provided template parameters or * alternatively using the size of the tensor by default. 
* - * @param x Layout X dispatch value - * @param y Layout Y dispatch value - * @param z Layout Z dispatch value + * @param commandBuffer Command buffer to record the algorithm resources to */ void recordDispatch(const vk::CommandBuffer& commandBuffer); - void bindCore(const vk::CommandBuffer& commandBuffer); + /** + * Records command that binds the "core" algorithm components which consist of + * binding the pipeline and binding the descriptorsets. + * + * @param commandBuffer Command buffer to record the algorithm resources to + */ + void recordBindCore(const vk::CommandBuffer& commandBuffer); - void bindPush(const vk::CommandBuffer& commandBuffer, - const Constants& pushConstants); + /** + * Records command that binds the push constants to the command buffer provided + * - it is required that the pushConstants provided are of the same size as the + * ones provided during initialization. + * + * @param commandBuffer Command buffer to record the algorithm resources to + */ + void recordBindPush(const vk::CommandBuffer& commandBuffer); + /** + * Function that checks all the gpu resource components to verify if these have + * been created and returns true if all are valid. + * + * @returns true if the algorithm is currently initialized. + */ bool isInit(); + /** + * Sets the work group to use in the recordDispatch + * + * @param workgroup The kp::Workgroup value to use to update the algorithm. It + * must have a value greater than 1 on the x value (index 1) otherwise it will + * be initialized on the size of the first tensor (ie. this->mTensor[0]->size()) + */ void setWorkgroup(const Workgroup& workgroup, uint32_t minSize = 1); + /** + * Sets the push constants to the new value provided to use in the next recordBindPush() + * + * @param pushConstants The kp::Constants to use to set the push constants to use in the next + * recordBindPush(...) calls. The constants provided must be of the same size as the + * ones created during initialization. 
+ */ + void setPush(const Constants& pushConstants); + /** + * Gets the current workgroup from the algorithm. + * + * @returns The kp::Workgroup currently set for the algorithm, which can be + * updated through setWorkgroup(...) and is the one used when recording the + * dispatch. + */ const Workgroup& getWorkgroup(); + /** + * Gets the specialization constants of the current algorithm. + * + * @returns The kp::Constants currently set for specialization constants + */ const Constants& getSpecializationConstants(); + /** + * Gets the push constants of the current algorithm. + * + * @returns The kp::Constants currently set for push constants + */ + const Constants& getPush(); + /** + * Gets the current tensors that are used in the algorithm. + * + * @returns The list of tensors used in the algorithm. + */ const std::vector>& getTensors(); void destroy(); @@ -95,10 +164,9 @@ class Algorithm // -------------- ALWAYS OWNED RESOURCES std::vector mSpirv; Constants mSpecializationConstants; + Constants mPushConstants; Workgroup mWorkgroup; - bool mIsInit; - // Create util functions void createShaderModule(); void createPipeline(); diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp index 61212abf2..957e45d2e 100644 --- a/src/include/kompute/Manager.hpp +++ b/src/include/kompute/Manager.hpp @@ -24,18 +24,17 @@ class Manager Manager(); /** - * Similar to base constructor but allows the user to provide the device - * they would like to create the resources on. + * Similar to base constructor but allows for further configuration to use when + * creating the Vulkan resources. * * @param physicalDeviceIndex The index of the physical device to use - * @param manageResources (Optional) Whether to manage the memory of the - * resources created and destroy when the manager is destroyed. 
* @param familyQueueIndices (Optional) List of queue indices to add for * explicit allocation - * @param totalQueues The total number of compute queues to create. + * @param desiredExtensions The desired extensions to load from physicalDevice */ Manager(uint32_t physicalDeviceIndex, - const std::vector& familyQueueIndices = {}); + const std::vector& familyQueueIndices = {}, + const std::vector& desiredExtensions = {}); /** * Manager constructor which allows your own vulkan application to integrate @@ -57,39 +56,55 @@ class Manager ~Manager(); /** - * Get or create a managed Sequence that will be contained by this manager. - * If the named sequence does not currently exist, it would be created and - * initialised. + * Create a managed sequence that will be destroyed by this manager + * if it hasn't been destroyed by its reference count going to zero. * - * @param sequenceName The name for the named sequence to be retrieved or - * created * @param queueIndex The queue to use from the available queues - * @return Shared pointer to the manager owned sequence resource + * @returns Shared pointer with initialised sequence */ std::shared_ptr sequence(uint32_t queueIndex = 0); /** - * Function that simplifies the common workflow of tensor creation and - * initialization. It will take the constructor parameters for a Tensor - * and will will us it to create a new Tensor and then create it. The - * tensor memory will then be managed and owned by the manager. + * Create a managed tensor that will be destroyed by this manager + * if it hasn't been destroyed by its reference count going to zero. 
* * @param data The data to initialize the tensor with * @param tensorType The type of tensor to initialize - * @param syncDataToGPU Whether to sync the data to GPU memory - * @returns Initialized Tensor with memory Syncd to GPU device + * @returns Shared pointer with initialised tensor */ std::shared_ptr tensor( const std::vector& data, Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice); + /** + * Create a managed algorithm that will be destroyed by this manager + * if it hasn't been destroyed by its reference count going to zero. + * + * @param tensors (optional) The tensors to initialise the algorithm with + * @param spirv (optional) The SPIRV bytes for the algorithm to dispatch + * @param workgroup (optional) kp::Workgroup for algorithm to use, and + * defaults to (tensor[0].size(), 1, 1) + * @param specializationConstants (optional) kp::Constant to use for + * specialization constants, and defaults to an empty constant + * @param pushConstants (optional) kp::Constant to use for push constants, + * and defaults to an empty constant + * @returns Shared pointer with initialised algorithm + */ std::shared_ptr algorithm( const std::vector>& tensors = {}, const std::vector& spirv = {}, const Workgroup& workgroup = {}, - const Constants& specializationConstants = {}); + const Constants& specializationConstants = {}, + const Constants& pushConstants = {}); + /** + * Destroy the GPU resources and all managed resources by manager. + **/ void destroy(); + /** + * Run a pseudo-garbage collection to release all the managed resources + * that have been already freed due to these reaching to zero ref count. 
+ **/ void clear(); private: @@ -120,7 +135,8 @@ class Manager // Create functions void createInstance(); void createDevice(const std::vector& familyQueueIndices = {}, - uint32_t hysicalDeviceIndex = 0); + uint32_t hysicalDeviceIndex = 0, + const std::vector& desiredExtensions = {}); }; } // End namespace kp diff --git a/src/include/kompute/Parameter.hpp b/src/include/kompute/Parameter.hpp deleted file mode 100644 index a37eb31f8..000000000 --- a/src/include/kompute/Parameter.hpp +++ /dev/null @@ -1,47 +0,0 @@ -#pragma once - -#include "kompute/Core.hpp" - -#include "kompute/Tensor.hpp" - -namespace kp { - -class Algorithm -{ - public: - Algorithm(); - - Algorithm(std::shared_ptr device); - - void init(std::string shaderFilePath, - std::vector> tensorParams); - - ~Algorithm(); - - private: - // -------------- NEVER OWNED RESOURCES - std::shared_ptr mDevice; - - // -------------- OPTIONALLY OWNED RESOURCES - std::shared_ptr mDescriptorSetLayout; - bool mFreeDescriptorSetLayout = false; - std::shared_ptr mDescriptorPool; - bool mFreeDescriptorPool = false; - std::shared_ptr mDescriptorSet; - bool mFreeDescriptorSet = false; - std::shared_ptr mShaderModule; - bool mFreeShaderModule = false; - std::shared_ptr mPipelineLayout; - bool mFreePipelineLayout = false; - std::shared_ptr mPipelineCache; - bool mFreePipelineCache = false; - std::shared_ptr mPipeline; - bool mFreePipeline = false; - - // Create util functions - void createParameters(); - void createShaderModule(std::string shaderFilePath); - void createPipeline(); -}; - -} // End namespace kp diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp index 5741fb4e6..10aa80148 100644 --- a/src/include/kompute/Sequence.hpp +++ b/src/include/kompute/Sequence.hpp @@ -32,6 +32,14 @@ class Sequence : public std::enable_shared_from_this ~Sequence(); /** + * Record function for operation to be added to the GPU queue in batch. This + * template requires classes to be derived from the OpBase class. 
This + * function also requires the Sequence to be recording, otherwise it will + * not be able to add the operation. + * + * @param op Object derived from kp::OpBase that will be recorded by the sequence + * which will be used when the operation is evaluated. + * @return shared_ptr of the Sequence class itself */ std::shared_ptr record(std::shared_ptr op); @@ -44,6 +52,7 @@ class Sequence : public std::enable_shared_from_this * @param tensors Vector of tensors to use for the operation * @param TArgs Template parameters that are used to initialise operation * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself */ template std::shared_ptr record( @@ -52,6 +61,18 @@ class Sequence : public std::enable_shared_from_this std::shared_ptr op{ new T(tensors, std::forward(params)...) }; return this->record(op); } + /** + * Record function for operation to be added to the GPU queue in batch. This + * template requires classes to be derived from the OpBase class. This + * function also requires the Sequence to be recording, otherwise it will + * not be able to add the operation. + * + * @param algorithm Algorithm to use for the record often used for OpAlgo + * operations + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ template std::shared_ptr record(std::shared_ptr algorithm, TArgs&&... params) @@ -63,21 +84,29 @@ class Sequence : public std::enable_shared_from_this /** * Eval sends all the recorded and stored operations in the vector of - * operations into the gpu as a submit job with a barrier. + * operations into the gpu as a submit job synchronously (with a barrier).
* * @return shared_ptr of the Sequence class itself */ std::shared_ptr eval(); + /** + * Resets all the recorded and stored operations, records the operation + * provided and submits into the gpu as a submit job synchronously (with a barrier). + * + * @return shared_ptr of the Sequence class itself + */ std::shared_ptr eval(std::shared_ptr op); /** * Eval sends all the recorded and stored operations in the vector of * operations into the gpu as a submit job with a barrier. * + * @param tensors Vector of tensors to use for the operation + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. * @return shared_ptr of the Sequence class itself */ - // TODO: Aim to have only a single function with tensors/algorithm template std::shared_ptr eval(std::vector> tensors, TArgs&&... params) @@ -85,6 +114,16 @@ class Sequence : public std::enable_shared_from_this std::shared_ptr op{ new T(tensors, std::forward(params)...) }; return this->eval(op); } + /** + * Eval sends all the recorded and stored operations in the vector of + * operations into the gpu as a submit job with a barrier. + * + * @param algorithm Algorithm to use for the record often used for OpAlgo + * operations + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ template std::shared_ptr eval(std::shared_ptr algorithm, TArgs&&... params) @@ -96,18 +135,27 @@ class Sequence : public std::enable_shared_from_this /** * Eval Async sends all the recorded and stored operations in the vector of - * operations into the gpu as a submit job with a barrier. EvalAwait() must - * be called after to ensure the sequence is terminated correctly. + * operations into the gpu as a submit job without a barrier. EvalAwait() must + * ALWAYS be called after to ensure the sequence is terminated correctly. 
* * @return Boolean stating whether execution was successful. */ std::shared_ptr evalAsync(); + /** + * Clears current operations to record provided one in the vector of + * operations into the gpu as a submit job without a barrier. EvalAwait() must + * ALWAYS be called after to ensure the sequence is terminated correctly. + * + * @return shared_ptr of the Sequence class itself + */ std::shared_ptr evalAsync(std::shared_ptr op); - /** * Eval sends all the recorded and stored operations in the vector of * operations into the gpu as a submit job with a barrier. * + * @param tensors Vector of tensors to use for the operation + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. * @return shared_ptr of the Sequence class itself */ template @@ -118,6 +166,16 @@ class Sequence : public std::enable_shared_from_this std::shared_ptr op{ new T(tensors, std::forward(params)...) }; return this->evalAsync(op); } + /** + * Eval sends all the recorded and stored operations in the vector of + * operations into the gpu as a submit job with a barrier. + * + * @param algorithm Algorithm to use for the record often used for OpAlgo + * operations + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ template std::shared_ptr evalAsync(std::shared_ptr algorithm, TArgs&&... params) @@ -132,7 +190,7 @@ class Sequence : public std::enable_shared_from_this * finishes, it runs the postEval of all operations. * * @param waitFor Number of milliseconds to wait before timing out. - * @return Boolean stating whether execution was successful.
+ * @return shared_ptr of the Sequence class itself */ std::shared_ptr evalAwait(uint64_t waitFor = UINT64_MAX); @@ -165,8 +223,19 @@ class Sequence : public std::enable_shared_from_this */ bool isRecording(); + /** + * Returns true if the sequence has been initialised, and it's based on the + * GPU resources being referenced. + * + * @return Boolean stating if is initialized + */ bool isInit(); + /** + * Clears command buffer and triggers re-record of all the current operations + * saved, which is useful if the underlying kp::Tensors or kp::Algorithms + * are modified and need to be re-recorded. + */ void rerecord(); /** diff --git a/src/include/kompute/Shader.hpp b/src/include/kompute/Shader.hpp index 2d0e43741..9fd1709be 100644 --- a/src/include/kompute/Shader.hpp +++ b/src/include/kompute/Shader.hpp @@ -12,124 +12,18 @@ namespace kp { -// The default resource limit for the GLSL compiler, can be overwritten -// Has been adobted by: -// https://github.com/KhronosGroup/glslang/blob/master/StandAlone/ResourceLimits.cpp -const TBuiltInResource defaultResource = { - /* .MaxLights = */ 0, - /* .MaxClipPlanes = */ 0, - /* .MaxTextureUnits = */ 0, - /* .MaxTextureCoords = */ 0, - /* .MaxVertexAttribs = */ 64, - /* .MaxVertexUniformComponents = */ 4096, - /* .MaxVaryingFloats = */ 64, - /* .MaxVertexTextureImageUnits = */ 0, - /* .MaxCombinedTextureImageUnits = */ 0, - /* .MaxTextureImageUnits = */ 0, - /* .MaxFragmentUniformComponents = */ 0, - /* .MaxDrawBuffers = */ 0, - /* .MaxVertexUniformVectors = */ 128, - /* .MaxVaryingVectors = */ 8, - /* .MaxFragmentUniformVectors = */ 0, - /* .MaxVertexOutputVectors = */ 16, - /* .MaxFragmentInputVectors = */ 0, - /* .MinProgramTexelOffset = */ -8, - /* .MaxProgramTexelOffset = */ 7, - /* .MaxClipDistances = */ 8, - /* .MaxComputeWorkGroupCountX = */ 65535, - /* .MaxComputeWorkGroupCountY = */ 65535, - /* .MaxComputeWorkGroupCountZ = */ 65535, - /* .MaxComputeWorkGroupSizeX = */ 1024, - /* .MaxComputeWorkGroupSizeY = */ 1024, -
/* .MaxComputeWorkGroupSizeZ = */ 64, - /* .MaxComputeUniformComponents = */ 1024, - /* .MaxComputeTextureImageUnits = */ 16, - /* .MaxComputeImageUniforms = */ 8, - /* .MaxComputeAtomicCounters = */ 8, - /* .MaxComputeAtomicCounterBuffers = */ 1, - /* .MaxVaryingComponents = */ 60, - /* .MaxVertexOutputComponents = */ 64, - /* .MaxGeometryInputComponents = */ 64, - /* .MaxGeometryOutputComponents = */ 128, - /* .MaxFragmentInputComponents = */ 0, - /* .MaxImageUnits = */ 0, - /* .MaxCombinedImageUnitsAndFragmentOutputs = */ 0, - /* .MaxCombinedShaderOutputResources = */ 8, - /* .MaxImageSamples = */ 0, - /* .MaxVertexImageUniforms = */ 0, - /* .MaxTessControlImageUniforms = */ 0, - /* .MaxTessEvaluationImageUniforms = */ 0, - /* .MaxGeometryImageUniforms = */ 0, - /* .MaxFragmentImageUniforms = */ 0, - /* .MaxCombinedImageUniforms = */ 0, - /* .MaxGeometryTextureImageUnits = */ 0, - /* .MaxGeometryOutputVertices = */ 256, - /* .MaxGeometryTotalOutputComponents = */ 1024, - /* .MaxGeometryUniformComponents = */ 1024, - /* .MaxGeometryVaryingComponents = */ 64, - /* .MaxTessControlInputComponents = */ 128, - /* .MaxTessControlOutputComponents = */ 128, - /* .MaxTessControlTextureImageUnits = */ 0, - /* .MaxTessControlUniformComponents = */ 1024, - /* .MaxTessControlTotalOutputComponents = */ 4096, - /* .MaxTessEvaluationInputComponents = */ 128, - /* .MaxTessEvaluationOutputComponents = */ 128, - /* .MaxTessEvaluationTextureImageUnits = */ 16, - /* .MaxTessEvaluationUniformComponents = */ 1024, - /* .MaxTessPatchComponents = */ 120, - /* .MaxPatchVertices = */ 32, - /* .MaxTessGenLevel = */ 64, - /* .MaxViewports = */ 16, - /* .MaxVertexAtomicCounters = */ 0, - /* .MaxTessControlAtomicCounters = */ 0, - /* .MaxTessEvaluationAtomicCounters = */ 0, - /* .MaxGeometryAtomicCounters = */ 0, - /* .MaxFragmentAtomicCounters = */ 0, - /* .MaxCombinedAtomicCounters = */ 8, - /* .MaxAtomicCounterBindings = */ 1, - /* .MaxVertexAtomicCounterBuffers = */ 0, - /* 
.MaxTessControlAtomicCounterBuffers = */ 0, - /* .MaxTessEvaluationAtomicCounterBuffers = */ 0, - /* .MaxGeometryAtomicCounterBuffers = */ 0, - /* .MaxFragmentAtomicCounterBuffers = */ 0, - /* .MaxCombinedAtomicCounterBuffers = */ 1, - /* .MaxAtomicCounterBufferSize = */ 16384, - /* .MaxTransformFeedbackBuffers = */ 4, - /* .MaxTransformFeedbackInterleavedComponents = */ 64, - /* .MaxCullDistances = */ 8, - /* .MaxCombinedClipAndCullDistances = */ 8, - /* .MaxSamples = */ 4, - /* .maxMeshOutputVerticesNV = */ 256, - /* .maxMeshOutputPrimitivesNV = */ 512, - /* .maxMeshWorkGroupSizeX_NV = */ 32, - /* .maxMeshWorkGroupSizeY_NV = */ 1, - /* .maxMeshWorkGroupSizeZ_NV = */ 1, - /* .maxTaskWorkGroupSizeX_NV = */ 32, - /* .maxTaskWorkGroupSizeY_NV = */ 1, - /* .maxTaskWorkGroupSizeZ_NV = */ 1, - /* .maxMeshViewCountNV = */ 4, - /* .maxDualSourceDrawBuffersEXT = */ 1, - - /* .limits = */ - { - /* .nonInductiveForLoops = */ 1, - /* .whileLoops = */ 1, - /* .doWhileLoops = */ 1, - /* .generalUniformIndexing = */ 1, - /* .generalAttributeMatrixVectorIndexing = */ 1, - /* .generalVaryingIndexing = */ 1, - /* .generalSamplerIndexing = */ 1, - /* .generalVariableIndexing = */ 1, - /* .generalConstantMatrixVectorIndexing = */ 1, - } -}; - /** Shader utily class with functions to compile and process glsl files. */ class Shader { public: + + // The default resource limit for the GLSL compiler, can be overwritten + // Has been adopted by: + // https://github.com/KhronosGroup/glslang/blob/master/StandAlone/ResourceLimits.cpp + const static TBuiltInResource defaultResource; + /** * Compile multiple sources with optional filenames. 
Currently this function * uses the glslang C++ interface which is not thread safe so this funciton @@ -150,7 +44,7 @@ class Shader const std::vector& files = {}, const std::string& entryPoint = "main", std::vector> definitions = {}, - const TBuiltInResource& resources = defaultResource); + const TBuiltInResource& resources = Shader::defaultResource); /** * Compile a single glslang source from string value. Currently this @@ -170,7 +64,7 @@ class Shader const std::string& source, const std::string& entryPoint = "main", std::vector> definitions = {}, - const TBuiltInResource& resources = defaultResource); + const TBuiltInResource& resources = Shader::defaultResource); }; } diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp index 7b24f3de7..195af44f4 100644 --- a/src/include/kompute/Tensor.hpp +++ b/src/include/kompute/Tensor.hpp @@ -29,12 +29,14 @@ class Tensor }; /** - * Default constructor with data provided which would be used to create the + * Constructor with data provided which would be used to create the * respective vulkan buffer and memory. * + * @param physicalDevice The physical device to use to fetch properties + * @param device The device to use to create the buffer and memory from * @param data Non-zero-sized vector of data that will be used by the * tensor - * @param tensorType Type for the tensor which is of type TensorTypes + * @param tensorTypes Type for the tensor which is of type TensorTypes */ Tensor(std::shared_ptr physicalDevice, std::shared_ptr device, @@ -48,10 +50,11 @@ class Tensor ~Tensor(); /** - * Initialiser which calls the initialisation for all the respective tensors - * as well as creates the respective staging tensors. The staging tensors - * would only be created for the tensors of type TensorType::eDevice as - * otherwise there is no need to copy from host memory. + * Function to trigger reinitialisation of the tensor buffer and memory with + * new data as well as new potential device type. 
+ * + * @param data Vector of data to use to initialise vector from + * @param tensorType The type to use for the tensor */ void rebuild(const std::vector& data, TensorTypes tensorType = TensorTypes::eDevice); @@ -61,6 +64,11 @@ class Tensor */ void destroy(); + /** + * Check whether tensor is initialized based on the created gpu resources. + * + * @returns Boolean stating whether tensor is initialized + */ bool isInit(); /** diff --git a/src/include/kompute/operations/OpAlgoDispatch.hpp b/src/include/kompute/operations/OpAlgoDispatch.hpp index 6975f2793..018fbced5 100644 --- a/src/include/kompute/operations/OpAlgoDispatch.hpp +++ b/src/include/kompute/operations/OpAlgoDispatch.hpp @@ -17,6 +17,13 @@ class OpAlgoDispatch : public OpBase { public: + /** + * Constructor that stores the algorithm to use as well as the relevant + * push constants to override when recording. + * + * @param algorithm The algorithm object to use for dispatch + * @param pushConstants The push constants to use for override + */ OpAlgoDispatch(const std::shared_ptr& algorithm, const kp::Constants& pushConstants = {}); @@ -33,18 +40,22 @@ class OpAlgoDispatch : public OpBase * shader processing to the gpu. This function also records the GPU memory * copy of the output data for the staging buffer so it can be read by the * host. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void record(const vk::CommandBuffer& commandBuffer) override; /** * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void preEval(const vk::CommandBuffer& commandBuffer) override; /** - * Executes after the recorded commands are submitted, and performs a copy - * of the GPU Device memory into the staging buffer so the output data can - * be retrieved. + * Does not perform any postEval commands. + * + * @param commandBuffer The command buffer to record the command into. 
*/ virtual void postEval(const vk::CommandBuffer& commandBuffer) override; diff --git a/src/include/kompute/operations/OpBase.hpp b/src/include/kompute/operations/OpBase.hpp index 34818fcf0..f4efb2e9b 100644 --- a/src/include/kompute/operations/OpBase.hpp +++ b/src/include/kompute/operations/OpBase.hpp @@ -32,6 +32,8 @@ class OpBase * The record function is intended to only send a record command or run * commands that are expected to record operations that are to be submitted * as a batch into the GPU. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void record(const vk::CommandBuffer& commandBuffer) = 0; @@ -42,6 +44,8 @@ class OpBase * there are situations where eval can be called multiple times, so the * resources that are created should be idempotent in case it's called multiple * times in a row. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void preEval(const vk::CommandBuffer& commandBuffer) = 0; @@ -52,6 +56,8 @@ class OpBase * there are situations where eval can be called multiple times, so the * resources that are destroyed should not require a re-init unless explicitly * provided by the user. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void postEval(const vk::CommandBuffer& commandBuffer) = 0; }; diff --git a/src/include/kompute/operations/OpMult.hpp b/src/include/kompute/operations/OpMult.hpp index 992b0e8a0..5c6dec9f0 100644 --- a/src/include/kompute/operations/OpMult.hpp +++ b/src/include/kompute/operations/OpMult.hpp @@ -26,11 +26,9 @@ class OpMult : public OpAlgoDispatch * requirements for the operations to be able to create and manage their * sub-components. 
* - * @param physicalDevice Vulkan physical device used to find device queues - * @param device Vulkan logical device for passing to Algorithm - * @param commandBuffer Vulkan Command Buffer to record commands into * @param tensors Tensors that are to be used in this operation - * @param komputeWorkgroup Optional parameter to specify the layout for processing + * @param algorithm An algorithm that will be overridden with the OpMult + * shader data and the tensors provided which are expected to be 3 */ OpMult(std::vector> tensors, std::shared_ptr algorithm) : OpAlgoDispatch(algorithm) diff --git a/src/include/kompute/operations/OpTensorCopy.hpp b/src/include/kompute/operations/OpTensorCopy.hpp index 3d202031f..892528996 100644 --- a/src/include/kompute/operations/OpTensorCopy.hpp +++ b/src/include/kompute/operations/OpTensorCopy.hpp @@ -9,38 +9,47 @@ namespace kp { /** - Operation that copies the data from the first tensor to the rest of the tensors provided, using a record command for all the vectors. This operation does not own/manage the memory of the tensors passed to it. The operation must only receive tensors of type + * Operation that copies the data from the first tensor to the rest of the tensors + * provided, using a record command for all the vectors. This operation does not + * own/manage the memory of the tensors passed to it. The operation must only + * receive tensors of type */ class OpTensorCopy : public OpBase { public: /** - * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. + * Default constructor with parameters that provides the core vulkan resources + * and the tensors that will be used in the operation. 
* - * @param physicalDevice Vulkan physical device used to find device queues - * @param device Vulkan logical device for passing to Algorithm - * @param commandBuffer Vulkan Command Buffer to record commands into * @param tensors Tensors that will be used to create in operation. */ OpTensorCopy(const std::vector>& tensors); /** - * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release. + * Default destructor. This class does not manage memory so it won't be + * expecting the parent to perform a release. */ ~OpTensorCopy() override; /** - * Records the copy commands from the first tensor into all the other tensors provided. Also optionally records a barrier. + * Records the copy commands from the first tensor into all the other + * tensors provided. Also optionally records a barrier. + * + * @param commandBuffer The command buffer to record the command into. */ void record(const vk::CommandBuffer& commandBuffer) override; /** * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void preEval(const vk::CommandBuffer& commandBuffer) override; /** * Copies the local vectors for all the tensors to sync the data with the gpu. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void postEval(const vk::CommandBuffer& commandBuffer) override; diff --git a/src/include/kompute/operations/OpTensorSyncDevice.hpp b/src/include/kompute/operations/OpTensorSyncDevice.hpp index cbb8ec40e..216ac74c9 100644 --- a/src/include/kompute/operations/OpTensorSyncDevice.hpp +++ b/src/include/kompute/operations/OpTensorSyncDevice.hpp @@ -8,17 +8,20 @@ namespace kp { /** - Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. 
For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging. + * Operation that syncs tensor's device by mapping local data into the device memory. + * For TensorTypes::eDevice it will use a record operation for the memory to be syncd + * into GPU memory which means that the operation will be done in sync with GPU commands. + * For TensorTypes::eHost it will only map the data into host memory which will + * happen during preEval before the recorded commands are dispatched. */ class OpTensorSyncDevice : public OpBase { public: /** - * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensos provided cannot be of type TensorTypes::eStorage. + * Default constructor with parameters that provides the core vulkan resources + * and the tensors that will be used in the operation. The tensors provided cannot + * be of type TensorTypes::eStorage. * - * @param physicalDevice Vulkan physical device used to find device queues - * @param device Vulkan logical device for passing to Algorithm - * @param commandBuffer Vulkan Command Buffer to record commands into * @param tensors Tensors that will be used to create in operation. */ OpTensorSyncDevice(const std::vector>& tensors); @@ -29,17 +32,24 @@ class OpTensorSyncDevice : public OpBase ~OpTensorSyncDevice() override; /** - * For device tensors, it records the copy command for the tensor to copy the data from its staging to device memory. + * For device tensors, it records the copy command for the tensor to copy the + * data from its staging to device memory. + * + * @param commandBuffer The command buffer to record the command into. */ void record(const vk::CommandBuffer& commandBuffer) override; /** * Does not perform any preEval commands.
+ * + * @param commandBuffer The command buffer to record the command into. */ virtual void preEval(const vk::CommandBuffer& commandBuffer) override; /** * Does not perform any postEval commands. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void postEval(const vk::CommandBuffer& commandBuffer) override; diff --git a/src/include/kompute/operations/OpTensorSyncLocal.hpp b/src/include/kompute/operations/OpTensorSyncLocal.hpp index 276f38137..fc52acc35 100644 --- a/src/include/kompute/operations/OpTensorSyncLocal.hpp +++ b/src/include/kompute/operations/OpTensorSyncLocal.hpp @@ -9,38 +9,50 @@ namespace kp { /** - Operation that syncs tensor's local memory by mapping device data into the local CPU memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging. + * Operation that syncs tensor's local memory by mapping device data into the + * local CPU memory. For TensorTypes::eDevice it will use a record operation + * for the memory to be syncd into GPU memory which means that the operation + * will be done in sync with GPU commands. For TensorTypes::eHost it will + * only map the data into host memory which will happen during preEval before + * the recorded commands are dispatched. */ class OpTensorSyncLocal : public OpBase { public: /** - * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensors provided cannot be of type TensorTypes::eStorage. + * Default constructor with parameters that provides the core vulkan resources + * and the tensors that will be used in the operation. 
The tensors provided + * cannot be of type TensorTypes::eStorage. * - * @param physicalDevice Vulkan physical device used to find device queues - * @param device Vulkan logical device for passing to Algorithm - * @param commandBuffer Vulkan Command Buffer to record commands into * @param tensors Tensors that will be used to create in operation. */ OpTensorSyncLocal(const std::vector>& tensors); /** - * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release. + * Default destructor. This class does not manage memory so it won't be expecting + * the parent to perform a release. */ ~OpTensorSyncLocal() override; /** - * For device tensors, it records the copy command for the tensor to copy the data from its device to staging memory. + * For device tensors, it records the copy command for the tensor to copy the + * data from its device to staging memory. + * + * @param commandBuffer The command buffer to record the command into. */ void record(const vk::CommandBuffer& commandBuffer) override; /** * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void preEval(const vk::CommandBuffer& commandBuffer) override; /** * For host tensors it performs the map command from the host memory into local memory. + * + * @param commandBuffer The command buffer to record the command into. 
*/ virtual void postEval(const vk::CommandBuffer& commandBuffer) override; diff --git a/test/TestMultipleAlgoExecutions.cpp b/test/TestMultipleAlgoExecutions.cpp index e050e02ea..b94591308 100644 --- a/test/TestMultipleAlgoExecutions.cpp +++ b/test/TestMultipleAlgoExecutions.cpp @@ -49,12 +49,13 @@ TEST(TestMultipleAlgoExecutions, TestEndToEndFunctionality) kp::Constants pushConstsB({ 3.0 }); auto algorithm = mgr.algorithm( - params, kp::Shader::compile_source(shader), workgroup, specConsts); + params, kp::Shader::compile_source(shader), workgroup, specConsts, pushConstsA); // 3. Run operation with string shader synchronously mgr.sequence() ->record(params) - ->record(algorithm, pushConstsA) + ->record(algorithm) + ->eval() ->record(algorithm, pushConstsB) ->eval(); diff --git a/test/TestPushConstant.cpp b/test/TestPushConstant.cpp index ae8cf4a32..f51f8cc42 100644 --- a/test/TestPushConstant.cpp +++ b/test/TestPushConstant.cpp @@ -4,7 +4,7 @@ #include "fmt/ranges.h" -TEST(TestPushConstants, TestTwoConstants) +TEST(TestPushConstants, TestConstantsAlgoDispatchOverride) { { std::string shader(R"( @@ -32,18 +32,98 @@ TEST(TestPushConstants, TestTwoConstants) std::shared_ptr tensor = mgr.tensor({ 0, 0, 0 }); std::shared_ptr algo = - mgr.algorithm({ tensor }, spirv, kp::Workgroup({ 1 })); + mgr.algorithm({ tensor }, spirv, kp::Workgroup({ 1 }), {}, { 0.0, 0.0, 0.0 }); - sq = mgr.sequence() - ->record({ tensor }) - ->record(algo, - kp::Constants{ 0.1, 0.2, 0.3 }) - ->record(algo, - kp::Constants{ 0.3, 0.2, 0.1 }) - ->record({ tensor }) - ->eval(); + sq = mgr.sequence()->eval({ tensor }); + + // We need to run this in sequence to avoid race condition + // We can't use atomicAdd as swiftshader doesn't support it for float + sq->eval(algo, kp::Constants{ 0.1, 0.2, 0.3 }); + sq->eval(algo, kp::Constants{ 0.3, 0.2, 0.1 }); + sq->eval({ tensor }); EXPECT_EQ(tensor->data(), kp::Constants({ 0.4, 0.4, 0.4 })); } } } + +TEST(TestPushConstants, TestConstantsAlgoDispatchNoOverride) 
+{ + { + std::string shader(R"( + #version 450 + layout(push_constant) uniform PushConstants { + float x; + float y; + float z; + } pcs; + layout (local_size_x = 1) in; + layout(set = 0, binding = 0) buffer a { float pa[]; }; + void main() { + pa[0] += pcs.x; + pa[1] += pcs.y; + pa[2] += pcs.z; + })"); + + std::vector spirv = kp::Shader::compile_source(shader); + + std::shared_ptr sq = nullptr; + + { + kp::Manager mgr; + + std::shared_ptr tensor = mgr.tensor({ 0, 0, 0 }); + + std::shared_ptr algo = + mgr.algorithm({ tensor }, spirv, kp::Workgroup({ 1 }), {}, { 0.1, 0.2, 0.3 }); + + sq = mgr.sequence()->eval({ tensor }); + + // We need to run this in sequence to avoid race condition + // We can't use atomicAdd as swiftshader doesn't support it for float + sq->eval(algo); + sq->eval(algo, kp::Constants{ 0.3, 0.2, 0.1 }); + sq->eval({ tensor }); + + EXPECT_EQ(tensor->data(), kp::Constants({ 0.4, 0.4, 0.4 })); + } + } +} + +TEST(TestPushConstants, TestConstantsWrongSize) +{ + { + std::string shader(R"( + #version 450 + layout(push_constant) uniform PushConstants { + float x; + float y; + float z; + } pcs; + layout (local_size_x = 1) in; + layout(set = 0, binding = 0) buffer a { float pa[]; }; + void main() { + pa[0] += pcs.x; + pa[1] += pcs.y; + pa[2] += pcs.z; + })"); + + std::vector spirv = kp::Shader::compile_source(shader); + + std::shared_ptr sq = nullptr; + + { + kp::Manager mgr; + + std::shared_ptr tensor = mgr.tensor({ 0, 0, 0 }); + + std::shared_ptr algo = + mgr.algorithm({ tensor }, spirv, kp::Workgroup({ 1 }), {}, { 0.0 }); + + sq = mgr.sequence() + ->record({ tensor }); + + EXPECT_THROW(sq->record(algo, kp::Constants{ 0.1, 0.2, 0.3 }), std::runtime_error); + } + } +} diff --git a/test/TestShaderResources.cpp b/test/TestShaderResources.cpp index f3436f228..b0013ef80 100644 --- a/test/TestShaderResources.cpp +++ b/test/TestShaderResources.cpp @@ -32,7 +32,7 @@ void compileShaderWithGivenResources(const std::string shaderString, const TBuil 
TEST(TestShaderResources, TestNoMaxLight) { - TBuiltInResource noMaxLightResources = kp::defaultResource; + TBuiltInResource noMaxLightResources = kp::Shader::defaultResource; noMaxLightResources.maxLights=0; EXPECT_NO_THROW(compileShaderWithGivenResources(shaderString, noMaxLightResources)); @@ -41,7 +41,7 @@ TEST(TestShaderResources, TestNoMaxLight) TEST(TestShaderResources, TestSmallComputeWorkGroupSizeX) { - TBuiltInResource smallComputeWorkGroupSizeXResources = kp::defaultResource; + TBuiltInResource smallComputeWorkGroupSizeXResources = kp::Shader::defaultResource; smallComputeWorkGroupSizeXResources.maxComputeWorkGroupSizeX=0; ASSERT_THROW(compileShaderWithGivenResources(shaderString, smallComputeWorkGroupSizeXResources), std::runtime_error); @@ -50,7 +50,7 @@ TEST(TestShaderResources, TestSmallComputeWorkGroupSizeX) TEST(TestShaderResources, TestNoWhileLoopLimit) { - TBuiltInResource noWhileLoopLimitResources = kp::defaultResource; + TBuiltInResource noWhileLoopLimitResources = kp::Shader::defaultResource; noWhileLoopLimitResources.limits.whileLoops=0; ASSERT_THROW(compileShaderWithGivenResources(shaderString, noWhileLoopLimitResources), std::runtime_error);