diff --git a/README.md b/README.md index 1d191ecc3..e1c19f185 100644 --- a/README.md +++ b/README.md @@ -52,17 +52,50 @@ int main() { kp::Manager mgr; // Automatically selects Device 0 - std::shared_ptr tensorLHS{ new kp::Tensor({ 0.0, 1.0, 2.0 }) }; - mgr.evalOp({ tensorLHS }); + auto tensorLhs = std::make_shared(kp::Tensor({ 0, 1, 2 })); + auto tensorRhs = std::make_shared(kp::Tensor({ 2, 4, 6 })); + auto tensorOut = std::make_shared(kp::Tensor({ 0, 0, 0 })); - std::shared_ptr tensorRHS{ new kp::Tensor( { 2.0, 4.0, 6.0 }) }; - mgr.evalOp({ tensorRHS }); + auto params = std::vector({ tensorLhs, tensorRhs, tensorOut }) - // TODO: Add capabilities for just output tensor types - std::shared_ptr tensorOutput{ new kp::Tensor({ 0.0, 0.0, 0.0 }) }; - mgr.evalOp({ tensorOutput }); + // Create tensor data in GPU + mgr.evalOp(params); - mgr.evalOp({ tensorLHS, tensorRHS, tensorOutput }); + // Run Kompute operation on the parameters provided with dispatch layout + mgr.evalOp>(params, "path/to/shader.comp.spv"); + + // Print the output + std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl; +} +``` + +Create your own operations with full control on each of the steps. + +```c++ +template +class OpCustom : public OpAlgoBase { + // ... + OpCustom(std::shared_ptr physicalDevice, + std::shared_ptr device, + std::shared_ptr commandBuffer, + std::vector>& tensors) + : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, true) + { + // ... 
extra steps to perform custom setup + this->mOptSpirvBinPath = "shaders/glsl/opmult.comp.spv"; + } +} + +int main() { + kp::Manager mgr; // Automatically selects Device 0 + + // Create parameters but don't initialise if customOp performs multiple + auto tensorLhs = std::make_shared(kp::Tensor({ 0, 1, 2 })); + auto tensorRhs = std::make_shared(kp::Tensor({ 2, 4, 6 })); + auto tensorOut = std::make_shared(kp::Tensor({ 0, 0, 0 })); + + // Pass parameters to custom operation which performs relevant steps + mgr.evalOp({ tensorLHS, tensorRHS, tensorOutput }); std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl; } @@ -72,6 +105,7 @@ Record commands in a single submit by using a Sequence to send in batch to GPU. ```c++ int main() { + kp::Manager mgr; std::shared_ptr tensorLHS{ new kp::Tensor({ 0.0, 1.0, 2.0 }) }; @@ -90,8 +124,10 @@ int main() { sq.record>({ tensorLHS, tensorRHS, tensorOutput }); } + // Stop recording sq.end(); + // Submit operations to GPU sq.eval(); @@ -99,29 +135,6 @@ int main() { } ``` -Create your own custom operations to leverage Vulkan Compute for your specialised use-cases. - -```c++ -class OpCustom : kp::OpBase { - // ... - void init(std::shared_ptr tensors) { - // ... extra steps to initialise tensors - this->mAlgorithm->init("path/to/your/shader.compute.spv", tensors); - } -} - -int main() { - kp::Manager mgr; // Automatically selects Device 0 - - std::shared_ptr tensor{ new kp::Tensor({ 0.0, 1.0, 2.0 }) }; - mgr.evalOp({ tensorLHS }); - - mgr.evalOp({ tensorLHS, tensorRHS, tensorOutput }); - - std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl; -} -``` - ## Motivations Vulkan Kompute was created after identifying the challenge most GPU processing projects with Vulkan undergo - namely having to build extensive boilerplate for Vulkan and create abstractions and interfaces that expose the core compute capabilities. 
It is only after a few thousand lines of code that it's possible to start building the application-specific logic. diff --git a/single_include/AggregateHeaders.cpp b/single_include/AggregateHeaders.cpp index 46465f4ee..dd756c65b 100644 --- a/single_include/AggregateHeaders.cpp +++ b/single_include/AggregateHeaders.cpp @@ -5,7 +5,6 @@ #include "kompute/operations/OpBase.hpp" #include "kompute/operations/OpAlgoBase.hpp" #include "kompute/operations/OpAlgoLhsRhsOut.hpp" -#include "kompute/operations/OpAlgoAllInOut.hpp" #include "kompute/operations/OpMult.hpp" #include "kompute/operations/OpCreateTensor.hpp" #include "kompute/Algorithm.hpp" diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index 6d806245e..204fd3823 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -526,7 +526,8 @@ class Sequence * not be able to add the operation. * * @param tensors Vector of tensors to use for the operation - * @param TArgs Template parameters that are used to initialise operation which allows for extensible configurations on initialisation. + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. */ template bool record(std::vector> tensors, TArgs&&... 
params) @@ -655,7 +656,8 @@ class Manager * * @param tensors The tensors to be used in the operation recorded * @param sequenceName The name of the sequence to be retrieved or created - * @param TArgs Template parameters that will be used to initialise Operation to allow for extensible configurations on initialisation + * @param TArgs Template parameters that will be used to initialise + * Operation to allow for extensible configurations on initialisation */ template void evalOp(std::vector> tensors, @@ -801,6 +803,18 @@ namespace kp { /** * Operation that provides a general abstraction that simplifies the use of * algorithm and parameter components which can be used with shaders. + * By default it enables the user to provide a dynamic number of tensors + * which are then passed as inputs. + * + * All of these tensors are expected to be initialised; this is checked in the init function, which throws a std exception otherwise. + * + * It is possible to also choose if the user requires all of the tensors to be + * copied from device memory to their host data. This can be disabled by either + * passing the copyOutputData constructor parameter and/or by overriding the + * functions to carry out copy commands accordingly. + * + * See OpAlgoLhsRhsOut for an example implementation on a more specific granularity on tensor parameters. + * * The template parameters specify the processing GPU layout number of * iterations for each x, y, z parameter. More specifically, this will be the * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)" @@ -1267,203 +1281,6 @@ OpAlgoLhsRhsOut::postSubmit() #include -namespace kp { - -/** - * Operation base class to simplify the creation of operations that require - * multiple unknown number of tensors, all which will be expected to be - * Device storage tensors with the data already stored. All the tensors - * will also be used as outputs so the data will be copied from the device - * into the respective tensors. 
- * The template parameters specify the processing GPU layout number of - * iterations for each x, y, z parameter. More specifically, this will be the - * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)" - */ -template -class OpAlgoAllInOut : public OpAlgoBase -{ - public: - /** - * Base constructor, should not be used unless explicitly intended. - */ - OpAlgoAllInOut(); - - /** - * Default constructor with parameters that provides the bare minimum - * requirements for the operations to be able to create and manage their - * sub-components. - * - * @param physicalDevice Vulkan physical device used to find device queues - * @param device Vulkan logical device for passing to Algorithm - * @param commandBuffer Vulkan Command Buffer to record commands into - * @param tensors Tensors that are to be used in this operation - * @param freeTensors Whether operation manages the memory of the Tensors - */ - OpAlgoAllInOut(std::shared_ptr physicalDevice, - std::shared_ptr device, - std::shared_ptr commandBuffer, - std::vector>& tensors); - - /** - * Default destructor, which is in charge of destroying the algorithm - * components but does not destroy the underlying tensors - */ - ~OpAlgoAllInOut(); - - /** - * The init function is responsible for ensuring that all of the tensors - * passed into the function have been initialised and are of type Device. - * This is required as the parameters provided are expected to be - * used as storage buffers, as well as output buffers, so the data will - * be transferred out from the Device into the Tensors replacing existing - * data. - */ - void init() override; - - /** - * This records the commands that are to be sent to the GPU. This includes - * the barriers that ensure the memory has been copied before going in and - * out of the shader, as well as the dispatch operation that sends the - * shader processing to the gpu. 
This function also records the GPU memory - * copy of the output data for the staging bufffer so it can be read by the - * host. - */ - void record() override; - - /** - * Executes after the recorded commands are submitted, and performs a copy - * of the GPU Device memory into the staging buffer so the output data can - * be retrieved. - */ - void postSubmit() override; - - protected: - // -------------- ALWAYS OWNED RESOURCES - std::vector> mOutputStagingTensors; ///< Array of output staging tensors which will be expected to be the same size as the number of inputs. -}; - -} // End namespace kp - -// Including implemenation for template class -#ifndef OPALGOALLINOUT_CPP -#define OPALGOALLINOUT_CPP - -namespace kp { - -template -OpAlgoAllInOut::OpAlgoAllInOut() -{ - SPDLOG_DEBUG("Kompute OpAlgoAllInOut constructor base"); -} - -template -OpAlgoAllInOut::OpAlgoAllInOut(std::shared_ptr physicalDevice, - std::shared_ptr device, - std::shared_ptr commandBuffer, - std::vector>& tensors) - : OpAlgoBase(physicalDevice, device, commandBuffer, tensors) -{ - SPDLOG_DEBUG("Kompute OpAlgoAllInOut constructor with params"); -} - -template -OpAlgoAllInOut::~OpAlgoAllInOut() -{ - SPDLOG_DEBUG("Kompute OpAlgoAllInOut destructor started"); - - SPDLOG_DEBUG("Kompute OpAlgoAllInOut destroying staging tensors"); - for (std::shared_ptr stagingTensor : this->mOutputStagingTensors) { - stagingTensor->freeMemoryDestroyGPUResources(); - } -} - -template -void -OpAlgoAllInOut::init() -{ - SPDLOG_DEBUG("Kompute OpAlgoAllInOut init called"); - - if (this->mTensors.size() < 1) { - throw std::runtime_error( - "Kompute OpAlgoAllInOut called with less than 1 tensor"); - } - - for (std::shared_ptr tensor : this->mTensors) { - if(!tensor->isInit()) { - throw std::runtime_error("Kompute OpAlgoAllInOut validation failed; all tensor parameters must be initialised."); - } - } - - SPDLOG_DEBUG("Kompute OpAlgoAllInOut creating staging output tensors"); - - for (std::shared_ptr tensor : this->mTensors) { 
- std::shared_ptr stagingTensor = std::make_shared( - tensor->data(), Tensor::TensorTypes::eStaging); - stagingTensor->init( - this->mPhysicalDevice, this->mDevice, this->mCommandBuffer); - this->mOutputStagingTensors.push_back(stagingTensor); - } - - SPDLOG_DEBUG("Kompute OpAlgoAllInOut fetching spirv data"); - - std::vector& shaderFileData = this->fetchSpirvBinaryData(); - - SPDLOG_DEBUG("Kompute OpAlgoAllInOut Initialising algorithm component"); - - this->mAlgorithm->init(shaderFileData, this->mTensors); -} - -template -void -OpAlgoAllInOut::record() -{ - SPDLOG_DEBUG("Kompute OpAlgoAllInOut record called"); - - // Barrier to ensure the data is finished writing to buffer memory - for (std::shared_ptr tensor : this->mTensors) { - tensor->recordBufferMemoryBarrier( - vk::AccessFlagBits::eHostWrite, - vk::AccessFlagBits::eShaderRead, - vk::PipelineStageFlagBits::eHost, - vk::PipelineStageFlagBits::eComputeShader); - } - - this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ); - - // Barrier to ensure the shader code is executed before buffer read - for (std::shared_ptr tensor : this->mTensors) { - tensor->recordBufferMemoryBarrier( - vk::AccessFlagBits::eShaderWrite, - vk::AccessFlagBits::eTransferRead, - vk::PipelineStageFlagBits::eComputeShader, - vk::PipelineStageFlagBits::eTransfer); - } - - // Record copy from and create barrier for STAGING tensors - for (std::shared_ptr stagingTensor : this->mOutputStagingTensors) { - stagingTensor->recordCopyFrom(this->mTensorOutput, true); - } -} - -template -void -OpAlgoAllInOut::postSubmit() -{ - SPDLOG_DEBUG("Kompute OpAlgoAllInOut postSubmit called"); - - for (size_t i = 0; i < this->mTensors.size(); i++) { - this->mOutputStagingTensors[i]->mapDataFromHostMemory(); - - this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data()); - } -} - -} - -#endif // #ifndef OPALGOALLINOUT_CPP - -#include - #if RELEASE #endif @@ -1477,7 +1294,7 @@ namespace kp { * input to ".dispatch(uint32_t tX, uint32_t tY, 
uint32_t, tZ)" */ template -class OpMult : public OpAlgoLhsRhsOut +class OpMult : public OpAlgoBase { public: /** @@ -1502,7 +1319,7 @@ class OpMult : public OpAlgoLhsRhsOut std::shared_ptr device, std::shared_ptr commandBuffer, std::vector>& tensors) - : OpAlgoLhsRhsOut(physicalDevice, device, commandBuffer, tensors) + : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, true) { SPDLOG_DEBUG("Kompute OpMult constructor with params"); diff --git a/src/include/kompute/operations/OpMult.hpp b/src/include/kompute/operations/OpMult.hpp index 3c56d6143..f2b62da91 100644 --- a/src/include/kompute/operations/OpMult.hpp +++ b/src/include/kompute/operations/OpMult.hpp @@ -11,7 +11,7 @@ #include "kompute/Algorithm.hpp" #include "kompute/Tensor.hpp" -#include "kompute/operations/OpAlgoLhsRhsOut.hpp" +#include "kompute/operations/OpAlgoBase.hpp" namespace kp { @@ -22,7 +22,7 @@ namespace kp { * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)" */ template -class OpMult : public OpAlgoLhsRhsOut +class OpMult : public OpAlgoBase { public: /** @@ -47,7 +47,7 @@ class OpMult : public OpAlgoLhsRhsOut std::shared_ptr device, std::shared_ptr commandBuffer, std::vector>& tensors) - : OpAlgoLhsRhsOut(physicalDevice, device, commandBuffer, tensors) + : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, true) { SPDLOG_DEBUG("Kompute OpMult constructor with params");