diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index a6ebc1254..6d806245e 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -827,7 +827,8 @@ class OpAlgoBase : public OpBase OpAlgoBase(std::shared_ptr physicalDevice, std::shared_ptr device, std::shared_ptr commandBuffer, - std::vector>& tensors); + std::vector>& tensors, + bool copyOutputData); /** * Default destructor, which is in charge of destroying the algorithm @@ -868,6 +869,9 @@ class OpAlgoBase : public OpBase bool mFreeAlgorithm = false; // -------------- ALWAYS OWNED RESOURCES + std::vector> mOutputStagingTensors; ///< Array of output staging tensors which will be expected to be the same size as the number of inputs. + bool mCopyOutputData = false; ///< Configuration parameter which states whether data will be copied back to all provided tensors for convenience. This can be deactivated by setting this flag to false and/or overriding the functions provided. Initialised in-class so the destructor never reads an indeterminate value when a constructor does not assign it. + uint32_t mX; uint32_t mY; uint32_t mZ; @@ -895,11 +899,14 @@ template OpAlgoBase::OpAlgoBase(std::shared_ptr physicalDevice, std::shared_ptr device, std::shared_ptr commandBuffer, - std::vector>& tensors) + std::vector>& tensors, + bool copyOutputData) : OpBase(physicalDevice, device, commandBuffer, tensors, false) { SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params"); + SPDLOG_DEBUG("Kompute OpAlgoBase configured for copy output data: {}", copyOutputData); + // The dispatch size is set up based on either explicitly provided template // parameters or by default it would take the shape and size of the tensors if (tX > 0) { @@ -920,6 +927,8 @@ OpAlgoBase::OpAlgoBase(std::shared_ptr physicalD this->mY, this->mZ); + this->mCopyOutputData = copyOutputData; + this->mAlgorithm = std::make_shared(device, commandBuffer); } @@ -927,6 +936,101 @@ template OpAlgoBase::~OpAlgoBase() { SPDLOG_DEBUG("Kompute OpAlgoBase destructor started"); + + if (this->mCopyOutputData) { + SPDLOG_DEBUG("Kompute 
OpAlgoBase destroying staging tensors"); + for (std::shared_ptr stagingTensor : this->mOutputStagingTensors) { + stagingTensor->freeMemoryDestroyGPUResources(); + } + } +} + +template +void +OpAlgoBase::init() +{ + SPDLOG_DEBUG("Kompute OpAlgoBase init called"); + + if (this->mTensors.size() < 1) { + throw std::runtime_error( + "Kompute OpAlgoBase called with less than 1 tensor"); + } + + for (std::shared_ptr tensor : this->mTensors) { + if(!tensor->isInit()) { + throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised."); + } + } + + if (this->mCopyOutputData) { + SPDLOG_DEBUG("Kompute OpAlgoBase creating staging output tensors"); + + for (std::shared_ptr tensor : this->mTensors) { + std::shared_ptr stagingTensor = std::make_shared( + tensor->data(), Tensor::TensorTypes::eStaging); + stagingTensor->init( + this->mPhysicalDevice, this->mDevice, this->mCommandBuffer); + this->mOutputStagingTensors.push_back(stagingTensor); + } + } + + SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data"); + // Held by value: fetchSpirvBinaryData() returns the vector by value, so a non-const reference cannot bind to it + std::vector shaderFileData = this->fetchSpirvBinaryData(); + + SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component"); + + this->mAlgorithm->init(shaderFileData, this->mTensors); +} + +template +void +OpAlgoBase::record() +{ + SPDLOG_DEBUG("Kompute OpAlgoBase record called"); + + // Barrier to ensure the data is finished writing to buffer memory + for (std::shared_ptr tensor : this->mTensors) { + tensor->recordBufferMemoryBarrier( + vk::AccessFlagBits::eHostWrite, + vk::AccessFlagBits::eShaderRead, + vk::PipelineStageFlagBits::eHost, + vk::PipelineStageFlagBits::eComputeShader); + } + + this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ); + + if (this->mCopyOutputData) { + // Barrier to ensure the shader code is executed before buffer read + for (std::shared_ptr tensor : this->mTensors) { + tensor->recordBufferMemoryBarrier( + vk::AccessFlagBits::eShaderWrite, + vk::AccessFlagBits::eTransferRead, + 
vk::PipelineStageFlagBits::eComputeShader, + vk::PipelineStageFlagBits::eTransfer); + } + + // Record copy from and create barrier for STAGING tensors + for (size_t i = 0; i < this->mTensors.size(); i++) { + this->mOutputStagingTensors[i]->recordCopyFrom( + this->mTensors[i], true); + } + } +} + +template +void +OpAlgoBase::postSubmit() +{ + SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called"); + + if (this->mCopyOutputData) { + for (size_t i = 0; i < this->mTensors.size(); i++) { + this->mOutputStagingTensors[i]->mapDataFromHostMemory(); + + this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data()); + } + } } template @@ -948,45 +1052,12 @@ std::vector OpAlgoBase::fetchSpirvBinaryData() shaderDataRaw + shaderFileSize); } -template -void -OpAlgoBase::init() -{ - SPDLOG_DEBUG("Kompute OpAlgoBase init called"); - - std::vector shaderFileData = this->fetchSpirvBinaryData(); - - this->mAlgorithm->init(shaderFileData, this->mTensors); -} - -template -void -OpAlgoBase::record() -{ - SPDLOG_DEBUG("Kompute OpAlgoBase record called"); - - this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ); -} - -template -void -OpAlgoBase::postSubmit() -{ - SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called"); -} - } #endif // #ifndef OPALGOBASE_IMPL #include -#if RELEASE - -#endif - -#include - namespace kp { /** @@ -1034,7 +1105,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase * tensors, and creates the algorithm component which processes the * computation. */ - void init() override; + virtual void init() override; /** * This records the commands that are to be sent to the GPU. This includes @@ -1044,14 +1115,14 @@ class OpAlgoLhsRhsOut : public OpAlgoBase * copy of the output data for the staging bufffer so it can be read by the * host. */ - void record() override; + virtual void record() override; /** * Executes after the recorded commands are submitted, and performs a copy * of the GPU Device memory into the staging buffer so the output data can * be retrieved. 
*/ - void postSubmit() override; + virtual void postSubmit() override; protected: // -------------- NEVER OWNED RESOURCES @@ -1082,7 +1153,10 @@ OpAlgoLhsRhsOut::OpAlgoLhsRhsOut(std::shared_ptr std::shared_ptr device, std::shared_ptr commandBuffer, std::vector>& tensors) - : OpAlgoBase(physicalDevice, device, commandBuffer, tensors) + // The base class is initialised with copyOutputData set to false given that + // this dependent class handles the transfer of data via staging buffers in + // a granular way. + : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, false) { SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params"); } @@ -1191,6 +1265,209 @@ OpAlgoLhsRhsOut::postSubmit() #endif // #ifndef OPALGOLHSRHSOUT_CPP +#include + +namespace kp { + +/** + * Operation base class to simplify the creation of operations that require + * an arbitrary number of tensors, all of which will be expected to be + * Device storage tensors with the data already stored. All the tensors + * will also be used as outputs so the data will be copied from the device + * into the respective tensors. + * The template parameters specify the processing GPU layout number of + * iterations for each x, y, z parameter. More specifically, this will be the + * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t tZ)" + */ +template +class OpAlgoAllInOut : public OpAlgoBase +{ + public: + /** + * Base constructor, should not be used unless explicitly intended. + */ + OpAlgoAllInOut(); + + /** + * Default constructor with parameters that provides the bare minimum + * requirements for the operations to be able to create and manage their + * sub-components. 
+ * + * @param physicalDevice Vulkan physical device used to find device queues + * @param device Vulkan logical device for passing to Algorithm + * @param commandBuffer Vulkan Command Buffer to record commands into + * @param tensors Tensors that are to be used in this operation; they are + * used both as the inputs and as the outputs of the operation + */ + OpAlgoAllInOut(std::shared_ptr physicalDevice, + std::shared_ptr device, + std::shared_ptr commandBuffer, + std::vector>& tensors); + + /** + * Default destructor, which is in charge of destroying the algorithm + * components but does not destroy the underlying tensors + */ + ~OpAlgoAllInOut(); + + /** + * The init function is responsible for ensuring that all of the tensors + * passed into the operation have been initialised (they are expected to + * be of type Device, although the type itself is not validated here). + * The tensors provided are used as storage buffers as well as output + * buffers, so the data will be transferred out from the Device into the + * Tensors replacing existing data. + */ + void init() override; + + /** + * This records the commands that are to be sent to the GPU. This includes + * the barriers that ensure the memory has been copied before going in and + * out of the shader, as well as the dispatch operation that sends the + * shader processing to the gpu. This function also records the GPU memory + * copy of the output data for the staging buffer so it can be read by the + * host. + */ + void record() override; + + /** + * Executes after the recorded commands are submitted; maps the data of + * each staging tensor from host memory and sets it on the matching + * tensor so the output data can be retrieved. + */ + void postSubmit() override; + + protected: + // -------------- ALWAYS OWNED RESOURCES + std::vector> mOutputStagingTensors; ///< Array of output staging tensors which will be expected to be the same size as the number of inputs. 
+}; + +} // End namespace kp + +// Including implementation for template class +#ifndef OPALGOALLINOUT_CPP +#define OPALGOALLINOUT_CPP + +namespace kp { + +template +OpAlgoAllInOut::OpAlgoAllInOut() +{ + SPDLOG_DEBUG("Kompute OpAlgoAllInOut constructor base"); +} + +template +OpAlgoAllInOut::OpAlgoAllInOut(std::shared_ptr physicalDevice, + std::shared_ptr device, + std::shared_ptr commandBuffer, + std::vector>& tensors) + : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, false) +{ + SPDLOG_DEBUG("Kompute OpAlgoAllInOut constructor with params"); +} + +template +OpAlgoAllInOut::~OpAlgoAllInOut() +{ + SPDLOG_DEBUG("Kompute OpAlgoAllInOut destructor started"); + + SPDLOG_DEBUG("Kompute OpAlgoAllInOut destroying staging tensors"); + for (std::shared_ptr stagingTensor : this->mOutputStagingTensors) { + stagingTensor->freeMemoryDestroyGPUResources(); + } +} + +template +void +OpAlgoAllInOut::init() +{ + SPDLOG_DEBUG("Kompute OpAlgoAllInOut init called"); + + if (this->mTensors.size() < 1) { + throw std::runtime_error( + "Kompute OpAlgoAllInOut called with less than 1 tensor"); + } + + for (std::shared_ptr tensor : this->mTensors) { + if(!tensor->isInit()) { + throw std::runtime_error("Kompute OpAlgoAllInOut validation failed; all tensor parameters must be initialised."); + } + } + + SPDLOG_DEBUG("Kompute OpAlgoAllInOut creating staging output tensors"); + + for (std::shared_ptr tensor : this->mTensors) { + std::shared_ptr stagingTensor = std::make_shared( + tensor->data(), Tensor::TensorTypes::eStaging); + stagingTensor->init( + this->mPhysicalDevice, this->mDevice, this->mCommandBuffer); + this->mOutputStagingTensors.push_back(stagingTensor); + } + + SPDLOG_DEBUG("Kompute OpAlgoAllInOut fetching spirv data"); + // Held by value: fetchSpirvBinaryData() returns the vector by value, so a non-const reference cannot bind to it + std::vector shaderFileData = this->fetchSpirvBinaryData(); + + SPDLOG_DEBUG("Kompute OpAlgoAllInOut Initialising algorithm component"); + + this->mAlgorithm->init(shaderFileData, this->mTensors); +} + +template +void +OpAlgoAllInOut::record() +{ + 
SPDLOG_DEBUG("Kompute OpAlgoAllInOut record called"); + + // Barrier to ensure the data is finished writing to buffer memory + for (std::shared_ptr tensor : this->mTensors) { + tensor->recordBufferMemoryBarrier( + vk::AccessFlagBits::eHostWrite, + vk::AccessFlagBits::eShaderRead, + vk::PipelineStageFlagBits::eHost, + vk::PipelineStageFlagBits::eComputeShader); + } + + this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ); + + // Barrier to ensure the shader code is executed before buffer read + for (std::shared_ptr tensor : this->mTensors) { + tensor->recordBufferMemoryBarrier( + vk::AccessFlagBits::eShaderWrite, + vk::AccessFlagBits::eTransferRead, + vk::PipelineStageFlagBits::eComputeShader, + vk::PipelineStageFlagBits::eTransfer); + } + + // Record copy of each tensor into its matching STAGING tensor (index i pairs with postSubmit) and create barrier + for (size_t i = 0; i < this->mTensors.size(); i++) { + this->mOutputStagingTensors[i]->recordCopyFrom(this->mTensors[i], true); + } +} + +template +void +OpAlgoAllInOut::postSubmit() +{ + SPDLOG_DEBUG("Kompute OpAlgoAllInOut postSubmit called"); + + for (size_t i = 0; i < this->mTensors.size(); i++) { + this->mOutputStagingTensors[i]->mapDataFromHostMemory(); + + this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data()); + } +} + +} + +#endif // #ifndef OPALGOALLINOUT_CPP + +#include + +#if RELEASE + +#endif + namespace kp { /** diff --git a/src/include/kompute/operations/OpAlgoBase.hpp b/src/include/kompute/operations/OpAlgoBase.hpp index b224dea14..1ee42b481 100644 --- a/src/include/kompute/operations/OpAlgoBase.hpp +++ b/src/include/kompute/operations/OpAlgoBase.hpp @@ -42,7 +42,8 @@ class OpAlgoBase : public OpBase OpAlgoBase(std::shared_ptr physicalDevice, std::shared_ptr device, std::shared_ptr commandBuffer, - std::vector>& tensors); + std::vector>& tensors, + bool copyOutputData); /** * Default destructor, which is in charge of destroying the algorithm @@ -83,6 +84,9 @@ class OpAlgoBase : public OpBase bool mFreeAlgorithm = false; // 
-------------- ALWAYS OWNED RESOURCES + std::vector> mOutputStagingTensors; ///< Array of output staging tensors which will be expected to be the same size as the number of inputs. + bool mCopyOutputData = false; ///< Configuration parameter which states whether data will be copied back to all provided tensors for convenience. This can be deactivated by setting this flag to false and/or overriding the functions provided. Initialised in-class so the destructor never reads an indeterminate value when a constructor does not assign it. + uint32_t mX; uint32_t mY; uint32_t mZ; @@ -110,11 +114,14 @@ template OpAlgoBase::OpAlgoBase(std::shared_ptr physicalDevice, std::shared_ptr device, std::shared_ptr commandBuffer, - std::vector>& tensors) + std::vector>& tensors, + bool copyOutputData) : OpBase(physicalDevice, device, commandBuffer, tensors, false) { SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params"); + SPDLOG_DEBUG("Kompute OpAlgoBase configured for copy output data: {}", copyOutputData); + // The dispatch size is set up based on either explicitly provided template // parameters or by default it would take the shape and size of the tensors if (tX > 0) { @@ -135,6 +142,8 @@ OpAlgoBase::OpAlgoBase(std::shared_ptr physicalD this->mY, this->mZ); + this->mCopyOutputData = copyOutputData; + this->mAlgorithm = std::make_shared(device, commandBuffer); } @@ -142,6 +151,101 @@ template OpAlgoBase::~OpAlgoBase() { SPDLOG_DEBUG("Kompute OpAlgoBase destructor started"); + + if (this->mCopyOutputData) { + SPDLOG_DEBUG("Kompute OpAlgoBase destroying staging tensors"); + for (std::shared_ptr stagingTensor : this->mOutputStagingTensors) { + stagingTensor->freeMemoryDestroyGPUResources(); + } + } +} + +template +void +OpAlgoBase::init() +{ + SPDLOG_DEBUG("Kompute OpAlgoBase init called"); + + if (this->mTensors.size() < 1) { + throw std::runtime_error( + "Kompute OpAlgoBase called with less than 1 tensor"); + } + + for (std::shared_ptr tensor : this->mTensors) { + if(!tensor->isInit()) { + throw std::runtime_error("Kompute OpAlgoBase validation failed; all tensor parameters must be initialised."); + } + } 
+ + if (this->mCopyOutputData) { + SPDLOG_DEBUG("Kompute OpAlgoBase creating staging output tensors"); + + for (std::shared_ptr tensor : this->mTensors) { + std::shared_ptr stagingTensor = std::make_shared( + tensor->data(), Tensor::TensorTypes::eStaging); + stagingTensor->init( + this->mPhysicalDevice, this->mDevice, this->mCommandBuffer); + this->mOutputStagingTensors.push_back(stagingTensor); + } + } + + SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data"); + // Held by value: fetchSpirvBinaryData() returns the vector by value, so a non-const reference cannot bind to it + std::vector shaderFileData = this->fetchSpirvBinaryData(); + + SPDLOG_DEBUG("Kompute OpAlgoBase Initialising algorithm component"); + + this->mAlgorithm->init(shaderFileData, this->mTensors); +} + +template +void +OpAlgoBase::record() +{ + SPDLOG_DEBUG("Kompute OpAlgoBase record called"); + + // Barrier to ensure the data is finished writing to buffer memory + for (std::shared_ptr tensor : this->mTensors) { + tensor->recordBufferMemoryBarrier( + vk::AccessFlagBits::eHostWrite, + vk::AccessFlagBits::eShaderRead, + vk::PipelineStageFlagBits::eHost, + vk::PipelineStageFlagBits::eComputeShader); + } + + this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ); + + if (this->mCopyOutputData) { + // Barrier to ensure the shader code is executed before buffer read + for (std::shared_ptr tensor : this->mTensors) { + tensor->recordBufferMemoryBarrier( + vk::AccessFlagBits::eShaderWrite, + vk::AccessFlagBits::eTransferRead, + vk::PipelineStageFlagBits::eComputeShader, + vk::PipelineStageFlagBits::eTransfer); + } + + // Record copy from and create barrier for STAGING tensors + for (size_t i = 0; i < this->mTensors.size(); i++) { + this->mOutputStagingTensors[i]->recordCopyFrom( + this->mTensors[i], true); + } + } +} + +template +void +OpAlgoBase::postSubmit() +{ + SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called"); + + if (this->mCopyOutputData) { + for (size_t i = 0; i < this->mTensors.size(); i++) { + this->mOutputStagingTensors[i]->mapDataFromHostMemory(); + + 
this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data()); + } + } } template @@ -163,33 +267,6 @@ std::vector OpAlgoBase::fetchSpirvBinaryData() shaderDataRaw + shaderFileSize); } -template -void -OpAlgoBase::init() -{ - SPDLOG_DEBUG("Kompute OpAlgoBase init called"); - - std::vector shaderFileData = this->fetchSpirvBinaryData(); - - this->mAlgorithm->init(shaderFileData, this->mTensors); -} - -template -void -OpAlgoBase::record() -{ - SPDLOG_DEBUG("Kompute OpAlgoBase record called"); - - this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ); -} - -template -void -OpAlgoBase::postSubmit() -{ - SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called"); -} - } #endif // #ifndef OPALGOBASE_IMPL diff --git a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp index ec901acc1..dca11eb0c 100644 --- a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp +++ b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp @@ -56,7 +56,7 @@ class OpAlgoLhsRhsOut : public OpAlgoBase * tensors, and creates the algorithm component which processes the * computation. */ - void init() override; + virtual void init() override; /** * This records the commands that are to be sent to the GPU. This includes @@ -66,14 +66,14 @@ class OpAlgoLhsRhsOut : public OpAlgoBase * copy of the output data for the staging bufffer so it can be read by the * host. */ - void record() override; + virtual void record() override; /** * Executes after the recorded commands are submitted, and performs a copy * of the GPU Device memory into the staging buffer so the output data can * be retrieved. 
*/ - void postSubmit() override; + virtual void postSubmit() override; protected: // -------------- NEVER OWNED RESOURCES @@ -104,7 +104,10 @@ OpAlgoLhsRhsOut::OpAlgoLhsRhsOut(std::shared_ptr std::shared_ptr device, std::shared_ptr commandBuffer, std::vector>& tensors) - : OpAlgoBase(physicalDevice, device, commandBuffer, tensors) + // The base class is initialised with copyOutputData set to false given that + // this dependent class handles the transfer of data via staging buffers in + // a granular way. + : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, false) { SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params"); }