diff --git a/single_include/AggregateHeaders.cpp b/single_include/AggregateHeaders.cpp index 779bcd29b..46465f4ee 100644 --- a/single_include/AggregateHeaders.cpp +++ b/single_include/AggregateHeaders.cpp @@ -4,6 +4,8 @@ #include "kompute/Sequence.hpp" #include "kompute/operations/OpBase.hpp" #include "kompute/operations/OpAlgoBase.hpp" +#include "kompute/operations/OpAlgoLhsRhsOut.hpp" +#include "kompute/operations/OpAlgoAllInOut.hpp" #include "kompute/operations/OpMult.hpp" #include "kompute/operations/OpCreateTensor.hpp" #include "kompute/Algorithm.hpp" diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index 67ab95763..a6ebc1254 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -277,7 +277,7 @@ class Tensor * copied before further operations. Default is true. */ void recordCopyFrom(std::shared_ptr copyFromTensor, - bool createBarrier = true); + bool createBarrier); /** * Records the buffer memory barrier into the command buffer which @@ -990,8 +990,10 @@ OpAlgoBase::postSubmit() namespace kp { /** - * Operation that performs multiplication on two tensors and outpus on third - * tensor. The template parameters specify the processing GPU layout number of + * Operation base class to simplify the creation of operations that require + * right hand and left hand side datapoints together with a single output. + * The expected data passed is two input tensors and one output tensor. + * The template parameters specify the processing GPU layout number of * iterations for each x, y, z parameter. 
More specifically, this will be the * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)" */ @@ -1171,7 +1173,7 @@ OpAlgoLhsRhsOut::record() vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eTransfer); - this->mTensorOutputStaging->recordCopyFrom(this->mTensorOutput); + this->mTensorOutputStaging->recordCopyFrom(this->mTensorOutput, true); } template diff --git a/src/OpCreateTensor.cpp b/src/OpCreateTensor.cpp index ee0c2504f..266f57479 100644 --- a/src/OpCreateTensor.cpp +++ b/src/OpCreateTensor.cpp @@ -66,7 +66,7 @@ OpCreateTensor::record() SPDLOG_DEBUG("Kompute OpCreateTensor record called"); if (this->mPrimaryTensor->tensorType() == Tensor::TensorTypes::eDevice) { - this->mPrimaryTensor->recordCopyFrom(this->mStagingTensor); + this->mPrimaryTensor->recordCopyFrom(this->mStagingTensor, true); } } diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp index ba73f67c5..e5cdd6932 100644 --- a/src/include/kompute/Tensor.hpp +++ b/src/include/kompute/Tensor.hpp @@ -115,7 +115,7 @@ class Tensor * copied before further operations. Default is true. */ void recordCopyFrom(std::shared_ptr copyFromTensor, - bool createBarrier = true); + bool createBarrier); /** * Records the buffer memory barrier into the command buffer which diff --git a/src/include/kompute/operations/OpAlgoAllInOut.hpp b/src/include/kompute/operations/OpAlgoAllInOut.hpp new file mode 100644 index 000000000..b4fe53ada --- /dev/null +++ b/src/include/kompute/operations/OpAlgoAllInOut.hpp @@ -0,0 +1,207 @@ +#pragma once + +#include + +#include "kompute/Core.hpp" + +#include "kompute/Algorithm.hpp" +#include "kompute/Tensor.hpp" + +#include "kompute/operations/OpAlgoBase.hpp" + +namespace kp { + +/** + * Operation base class to simplify the creation of operations that require + * multiple unknown number of tensors, all which will be expected to be + * Device storage tensors with the data already stored. 
All the tensors + * will also be used as outputs so the data will be copied from the device + * into the respective tensors. + * The template parameters specify the processing GPU layout number of + * iterations for each x, y, z parameter. More specifically, this will be the + * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t tZ)" + */ +template +class OpAlgoAllInOut : public OpAlgoBase +{ + public: + /** + * Base constructor, should not be used unless explicitly intended. + */ + OpAlgoAllInOut(); + + /** + * Default constructor with parameters that provides the bare minimum + * requirements for the operations to be able to create and manage their + * sub-components. + * + * @param physicalDevice Vulkan physical device used to find device queues + * @param device Vulkan logical device for passing to Algorithm + * @param commandBuffer Vulkan Command Buffer to record commands into + * @param tensors Tensors that are to be used in this operation, each one + * serving as both storage buffer and output buffer + */ + OpAlgoAllInOut(std::shared_ptr physicalDevice, + std::shared_ptr device, + std::shared_ptr commandBuffer, + std::vector>& tensors); + + /** + * Default destructor, which is in charge of destroying the algorithm + * components but does not destroy the underlying tensors + */ + ~OpAlgoAllInOut(); + + /** + * The init function is responsible for ensuring that all of the tensors + * passed into the function have been initialised and are of type Device. + * This is required as the parameters provided are expected to be + * used as storage buffers, as well as output buffers, so the data will + * be transferred out from the Device into the Tensors replacing existing + * data. + */ + void init() override; + + /** + * This records the commands that are to be sent to the GPU.
This includes + * the barriers that ensure the memory has been copied before going in and + * out of the shader, as well as the dispatch operation that sends the + * shader processing to the gpu. This function also records the GPU memory + * copy of the output data for the staging buffer so it can be read by the + * host. + */ + void record() override; + + /** + * Executes after the recorded commands are submitted; calls + * mapDataFromHostMemory() on each staging tensor and sets its data on + * the tensor with the same index so the output data can be retrieved. + */ + void postSubmit() override; + + protected: + // -------------- ALWAYS OWNED RESOURCES + std::vector> mOutputStagingTensors; ///< Array of output staging tensors which will be expected to be the same size as the number of inputs. +}; + +} // End namespace kp + +// Including implementation for template class +#ifndef OPALGOALLINOUT_CPP +#define OPALGOALLINOUT_CPP + +namespace kp { + +template +OpAlgoAllInOut::OpAlgoAllInOut() +{ + SPDLOG_DEBUG("Kompute OpAlgoAllInOut constructor base"); +} + +template +OpAlgoAllInOut::OpAlgoAllInOut(std::shared_ptr physicalDevice, + std::shared_ptr device, + std::shared_ptr commandBuffer, + std::vector>& tensors) + : OpAlgoBase(physicalDevice, device, commandBuffer, tensors) +{ + SPDLOG_DEBUG("Kompute OpAlgoAllInOut constructor with params"); +} + +template +OpAlgoAllInOut::~OpAlgoAllInOut() +{ + SPDLOG_DEBUG("Kompute OpAlgoAllInOut destructor started"); + + SPDLOG_DEBUG("Kompute OpAlgoAllInOut destroying staging tensors"); + for (std::shared_ptr stagingTensor : this->mOutputStagingTensors) { + stagingTensor->freeMemoryDestroyGPUResources(); + } +} + +template +void +OpAlgoAllInOut::init() +{ + SPDLOG_DEBUG("Kompute OpAlgoAllInOut init called"); + + if (this->mTensors.size() < 1) { + throw std::runtime_error( + "Kompute OpAlgoAllInOut called with less than 1 tensor"); + } + + for (std::shared_ptr tensor : this->mTensors) { + if(!tensor->isInit()) { + throw std::runtime_error("Kompute OpAlgoAllInOut
validation failed; all tensor parameters must be initialised."); + } + } + + SPDLOG_DEBUG("Kompute OpAlgoAllInOut creating staging output tensors"); + + for (std::shared_ptr tensor : this->mTensors) { + std::shared_ptr stagingTensor = std::make_shared( + tensor->data(), Tensor::TensorTypes::eStaging); + stagingTensor->init( + this->mPhysicalDevice, this->mDevice, this->mCommandBuffer); + this->mOutputStagingTensors.push_back(stagingTensor); + } + + SPDLOG_DEBUG("Kompute OpAlgoAllInOut fetching spirv data"); + + std::vector& shaderFileData = this->fetchSpirvBinaryData(); + + SPDLOG_DEBUG("Kompute OpAlgoAllInOut Initialising algorithm component"); + + this->mAlgorithm->init(shaderFileData, this->mTensors); +} + +template +void +OpAlgoAllInOut::record() +{ + SPDLOG_DEBUG("Kompute OpAlgoAllInOut record called"); + + // Barrier to ensure the data is finished writing to buffer memory + for (std::shared_ptr tensor : this->mTensors) { + tensor->recordBufferMemoryBarrier( + vk::AccessFlagBits::eHostWrite, + vk::AccessFlagBits::eShaderRead, + vk::PipelineStageFlagBits::eHost, + vk::PipelineStageFlagBits::eComputeShader); + } + + this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ); + + // Barrier to ensure the shader code is executed before buffer read + for (std::shared_ptr tensor : this->mTensors) { + tensor->recordBufferMemoryBarrier( + vk::AccessFlagBits::eShaderWrite, + vk::AccessFlagBits::eTransferRead, + vk::PipelineStageFlagBits::eComputeShader, + vk::PipelineStageFlagBits::eTransfer); + } + + // Copy each tensor into the STAGING tensor at the same index (postSubmit pairs them by index), with barrier + for (size_t i = 0; i < this->mTensors.size(); i++) { + this->mOutputStagingTensors[i]->recordCopyFrom(this->mTensors[i], true); + } +} + +template +void +OpAlgoAllInOut::postSubmit() +{ + SPDLOG_DEBUG("Kompute OpAlgoAllInOut postSubmit called"); + + for (size_t i = 0; i < this->mTensors.size(); i++) { + this->mOutputStagingTensors[i]->mapDataFromHostMemory(); + +
this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data()); + } +} + +} + +#endif // #ifndef OPALGOALLINOUT_CPP + + diff --git a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp index bd21aa351..ec901acc1 100644 --- a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp +++ b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp @@ -12,8 +12,10 @@ namespace kp { /** - * Operation that performs multiplication on two tensors and outpus on third - * tensor. The template parameters specify the processing GPU layout number of + * Operation base class to simplify the creation of operations that require + * right hand and left hand side datapoints together with a single output. + * The expected data passed is two input tensors and one output tensor. + * The template parameters specify the processing GPU layout number of * iterations for each x, y, z parameter. More specifically, this will be the * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)" */ @@ -194,7 +196,7 @@ OpAlgoLhsRhsOut::record() vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eTransfer); - this->mTensorOutputStaging->recordCopyFrom(this->mTensorOutput); + this->mTensorOutputStaging->recordCopyFrom(this->mTensorOutput, true); } template