diff --git a/Makefile b/Makefile index 849ecfd01..63f76c5db 100755 --- a/Makefile +++ b/Makefile @@ -73,7 +73,8 @@ mk_run_tests: mk_build_tests ####### Visual studio build shortcut commands ####### VS_BUILD_TYPE ?= "Debug" -VS_CMAKE_EXTRA_FLAGS ?= "" +# Run with multiprocessing / parallel build by default +VS_CMAKE_EXTRA_FLAGS ?= "/MP" vs_cmake: $(CMAKE_BIN) \ diff --git a/README.md b/README.md index 271a3f6b4..85963bef9 100644 --- a/README.md +++ b/README.md @@ -15,14 +15,14 @@

Vulkan Kompute

-

The General Purpose Vulkan Compute Framework.

+

The General Purpose Vulkan Compute Framework.

-

Blazing fast, lightweight, easy to set up and optimized for advanced GPU processing usecases.

+

Blazing fast, lightweight, mobile-enabled, and optimized for advanced GPU processing use cases.

🔋 [Documentation](https://axsaucedo.github.io/vulkan-kompute/) 💻 [Import to your project](https://axsaucedo.github.io/vulkan-kompute/) ⌨ [Tutorials](https://axsaucedo.github.io/vulkan-kompute/) 💾 @@ -261,8 +261,6 @@ Simplified Kompute Components - - ## Kompute Development We appreciate PRs and Issues. If you want to contribute try checking the "Good first issue" tag, but even using Vulkan Kompute and reporting issues is a great contribution! diff --git a/single_include/AggregateHeaders.cpp b/single_include/AggregateHeaders.cpp index dd756c65b..2f3607e2d 100644 --- a/single_include/AggregateHeaders.cpp +++ b/single_include/AggregateHeaders.cpp @@ -7,5 +7,6 @@ #include "kompute/operations/OpAlgoLhsRhsOut.hpp" #include "kompute/operations/OpMult.hpp" #include "kompute/operations/OpCreateTensor.hpp" +#include "kompute/operations/OpTensorCopy.hpp" #include "kompute/Algorithm.hpp" #include "kompute/Tensor.hpp" diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index 6a5ee7698..fcaed2666 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -218,9 +218,9 @@ class Tensor */ enum class TensorTypes { - eDevice = 0, - eStaging = 1, - eStorage = 2, + eDevice = 0, ///< Type is device memory, source and destination + eStaging = 1, ///< Type is host memory, source and destination + eStorage = 2, ///< Type is Device memory (only) }; /** @@ -248,8 +248,7 @@ class Tensor * Initialiser which calls the initialisation for all the respective tensors as well as creates the respective staging tensors. The staging tensors woudl only be created for the tensors of type TensorType::eDevice as otherwise there is no need to copy from host memory. */ void init(std::shared_ptr physicalDevice, - std::shared_ptr device, - std::shared_ptr commandBuffer); + std::shared_ptr device); /** * Destroys and frees the GPU resources which include the buffer and memory. @@ -312,23 +311,27 @@ class Tensor * thensor. 
This is intended to pass memory into a processing, to perform * a staging buffer transfer, or to gather output (between others). * + * @param commandBuffer Vulkan Command Buffer to record the commands into * @param copyFromTensor Tensor to copy the data from * @param createBarrier Whether to create a barrier that ensures the data is * copied before further operations. Default is true. */ - void recordCopyFrom(std::shared_ptr copyFromTensor, + void recordCopyFrom(std::shared_ptr commandBuffer, + std::shared_ptr copyFromTensor, bool createBarrier); /** * Records the buffer memory barrier into the command buffer which * ensures that relevant data transfers are carried out correctly. * + * @param commandBuffer Vulkan Command Buffer to record the commands into * @param srcAccessMask Access flags for source access mask * @param dstAccessMask Access flags for destination access mask * @param scrStageMask Pipeline stage flags for source stage mask * @param dstStageMask Pipeline stage flags for destination stage mask */ - void recordBufferMemoryBarrier(vk::AccessFlagBits srcAccessMask, + void recordBufferMemoryBarrier(std::shared_ptr commandBuffer, + vk::AccessFlagBits srcAccessMask, vk::AccessFlagBits dstAccessMask, vk::PipelineStageFlagBits srcStageMask, vk::PipelineStageFlagBits dstStageMask); @@ -356,7 +359,6 @@ class Tensor // -------------- NEVER OWNED RESOURCES std::shared_ptr mPhysicalDevice; std::shared_ptr mDevice; - std::shared_ptr mCommandBuffer; // -------------- OPTIONALLY OWNED RESOURCES std::shared_ptr mBuffer; @@ -1104,7 +1106,7 @@ OpAlgoBase::init() std::shared_ptr stagingTensor = std::make_shared( tensor->data(), Tensor::TensorTypes::eStaging); stagingTensor->init( - this->mPhysicalDevice, this->mDevice, this->mCommandBuffer); + this->mPhysicalDevice, this->mDevice); this->mOutputStagingTensors.push_back(stagingTensor); } } @@ -1127,6 +1129,7 @@ OpAlgoBase::record() // Barrier to ensure the data is finished writing to buffer memory for (std::shared_ptr 
tensor : this->mTensors) { tensor->recordBufferMemoryBarrier( + this->mCommandBuffer, vk::AccessFlagBits::eHostWrite, vk::AccessFlagBits::eShaderRead, vk::PipelineStageFlagBits::eHost, @@ -1139,6 +1142,7 @@ OpAlgoBase::record() // Barrier to ensure the shader code is executed before buffer read for (const std::shared_ptr& tensor : this->mTensors) { tensor->recordBufferMemoryBarrier( + this->mCommandBuffer, vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eTransferRead, vk::PipelineStageFlagBits::eComputeShader, @@ -1148,7 +1152,9 @@ OpAlgoBase::record() // Record copy from and create barrier for STAGING tensors for (size_t i = 0; i < this->mTensors.size(); i++) { this->mOutputStagingTensors[i]->recordCopyFrom( - this->mTensors[i], true); + this->mCommandBuffer, + this->mTensors[i], + true); } } } @@ -1327,7 +1333,7 @@ OpAlgoLhsRhsOut::init() throw std::runtime_error( "Kompute OpAlgoLhsRhsOut called with less than 1 tensor"); } else if (this->mTensors.size() > 3) { - spdlog::warn("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors"); + SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors"); } this->mTensorLHS = this->mTensors[0]; @@ -1555,3 +1561,52 @@ class OpCreateTensor : public OpBase }; } // End namespace kp + +namespace kp { + +/** + Operation that copies the data from the first tensor to the rest of the tensors provided, using a record command for all the vectors. This operation does not own/manage the memory of the tensors passed to it. +*/ +class OpTensorCopy : public OpBase +{ + public: + OpTensorCopy(); + + /** + * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. 
+ * + * @param physicalDevice Vulkan physical device used to find device queues + * @param device Vulkan logical device for passing to Algorithm + * @param commandBuffer Vulkan Command Buffer to record commands into + * @param tensors Tensors to operate on; the first is the copy source and the remaining ones are the destinations. + */ + OpTensorCopy(std::shared_ptr physicalDevice, + std::shared_ptr device, + std::shared_ptr commandBuffer, + std::vector> tensors); + + /** + * Default destructor which in this case expects the parent class to free + * the tensors + */ + ~OpTensorCopy() override; + + /** + * Validates the tensors: throws std::runtime_error if fewer than two tensors were provided, if any tensor has not been initialised, or if any tensor is of type eStorage. + */ + void init() override; + + /** + * Records the copy commands from the first tensor into all the other tensors provided. No memory barrier is recorded after the copies. + */ + void record() override; + + /** + * Copies the local vectors for all the tensors to sync the data with the gpu. + */ + void postSubmit() override; + + private: +}; + +} // End namespace kp diff --git a/src/OpCreateTensor.cpp b/src/OpCreateTensor.cpp index f99a81ba5..ae551e259 100644 --- a/src/OpCreateTensor.cpp +++ b/src/OpCreateTensor.cpp @@ -48,13 +48,13 @@ OpCreateTensor::init() } if (tensor->tensorType() == Tensor::TensorTypes::eDevice) { tensor->init( - this->mPhysicalDevice, this->mDevice, this->mCommandBuffer); + this->mPhysicalDevice, this->mDevice); std::shared_ptr stagingTensor = std::make_shared( tensor->data(), Tensor::TensorTypes::eStaging); stagingTensor->init( - this->mPhysicalDevice, this->mDevice, this->mCommandBuffer); + this->mPhysicalDevice, this->mDevice); stagingTensor->mapDataIntoHostMemory(); @@ -63,7 +63,7 @@ OpCreateTensor::init() } else { tensor->init( - this->mPhysicalDevice, this->mDevice, this->mCommandBuffer); + this->mPhysicalDevice, this->mDevice); // We push a nullptr when no staging tensor is needed to match // index number in array to have one to one mapping with tensors @@ -79,7 +79,9 @@ OpCreateTensor::record() for (size_t i = 0; i < this->mTensors.size(); i++) { if (this->mTensors[i]->tensorType() == 
Tensor::TensorTypes::eDevice) { - this->mTensors[i]->recordCopyFrom(this->mStagingTensors[i], false); + this->mTensors[i]->recordCopyFrom(this->mCommandBuffer, this->mStagingTensors[i], false); + } else if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eStaging) { + this->mTensors[i]->mapDataIntoHostMemory(); } } } diff --git a/src/OpTensorCopy.cpp b/src/OpTensorCopy.cpp new file mode 100644 index 000000000..c0e1f5046 --- /dev/null +++ b/src/OpTensorCopy.cpp @@ -0,0 +1,71 @@ + +#include "kompute/operations/OpTensorCopy.hpp" + +namespace kp { + +OpTensorCopy::OpTensorCopy() +{ + SPDLOG_DEBUG("Kompute OpTensorCopy constructor base"); +} + +OpTensorCopy::OpTensorCopy( + std::shared_ptr physicalDevice, + std::shared_ptr device, + std::shared_ptr commandBuffer, + std::vector> tensors) + : OpBase(physicalDevice, device, commandBuffer, tensors, false) +{ + SPDLOG_DEBUG("Kompute OpTensorCopy constructor with params"); +} + +OpTensorCopy::~OpTensorCopy() +{ + SPDLOG_DEBUG("Kompute OpTensorCopy destructor started"); + + SPDLOG_DEBUG("Kompute OpTensorCopy destroying staging tensors"); +} + +void +OpTensorCopy::init() +{ + SPDLOG_DEBUG("Kompute OpTensorCopy init called"); + + if (this->mTensors.size() < 2) { + throw std::runtime_error( + "Kompute OpTensorCopy called with less than 2 tensor"); + } + + for (std::shared_ptr tensor: this->mTensors) { + if (!tensor->isInit()) { + throw std::runtime_error("Kompute OpTensorCopy tensor parameter has not been initialized"); + } + if (tensor->tensorType() == Tensor::TensorTypes::eStorage) { + throw std::runtime_error("Kompute OpTensorCopy tensor parameter is of type storage and hence cannot be used to receive or pass data."); + } + } +} + +void +OpTensorCopy::record() +{ + SPDLOG_DEBUG("Kompute OpTensorCopy record called"); + + // We iterate from the second tensor onwards and record a copy to all + for (size_t i = 1; i < this->mTensors.size(); i++) { + this->mTensors[i]->recordCopyFrom(this->mCommandBuffer, this->mTensors[0], 
false); + } +} + +void +OpTensorCopy::postSubmit() +{ + SPDLOG_DEBUG("Kompute OpTensorCopy postSubmit called"); + + // Copy the data from the first tensor into all the tensors + for (size_t i = 1; i < this->mTensors.size(); i++) { + this->mTensors[i]->setData(this->mTensors[0]->data()); + } +} + +} + diff --git a/src/Tensor.cpp b/src/Tensor.cpp index bca6c3500..a229a5eec 100644 --- a/src/Tensor.cpp +++ b/src/Tensor.cpp @@ -41,8 +41,7 @@ Tensor::~Tensor() void Tensor::init(std::shared_ptr physicalDevice, - std::shared_ptr device, - std::shared_ptr commandBuffer) + std::shared_ptr device) { SPDLOG_DEBUG("Kompute Tensor running init with Vulkan params and num data " "elementS: {}", @@ -50,7 +49,6 @@ Tensor::init(std::shared_ptr physicalDevice, this->mPhysicalDevice = physicalDevice; this->mDevice = device; - this->mCommandBuffer = commandBuffer; this->mIsInit = true; @@ -106,8 +104,10 @@ Tensor::setData(const std::vector& data) } void -Tensor::recordCopyFrom(std::shared_ptr copyFromTensor, - bool createBarrier) +Tensor::recordCopyFrom( + std::shared_ptr commandBuffer, + std::shared_ptr copyFromTensor, + bool createBarrier) { SPDLOG_DEBUG("Kompute Tensor recordCopyFrom called"); @@ -121,12 +121,13 @@ Tensor::recordCopyFrom(std::shared_ptr copyFromTensor, SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize); - this->mCommandBuffer->copyBuffer( + commandBuffer->copyBuffer( *copyFromTensor->mBuffer, *this->mBuffer, copyRegion); if (createBarrier) { // Buffer to ensure wait until data is copied to staging buffer - this->recordBufferMemoryBarrier(vk::AccessFlagBits::eTransferWrite, + this->recordBufferMemoryBarrier(commandBuffer, + vk::AccessFlagBits::eTransferWrite, vk::AccessFlagBits::eHostRead, vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eHost); @@ -134,7 +135,8 @@ Tensor::recordCopyFrom(std::shared_ptr copyFromTensor, } void -Tensor::recordBufferMemoryBarrier(vk::AccessFlagBits srcAccessMask, 
+Tensor::recordBufferMemoryBarrier(std::shared_ptr commandBuffer, + vk::AccessFlagBits srcAccessMask, vk::AccessFlagBits dstAccessMask, vk::PipelineStageFlagBits srcStageMask, vk::PipelineStageFlagBits dstStageMask) @@ -151,7 +153,7 @@ Tensor::recordBufferMemoryBarrier(vk::AccessFlagBits srcAccessMask, bufferMemoryBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; bufferMemoryBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - this->mCommandBuffer->pipelineBarrier(srcStageMask, + commandBuffer->pipelineBarrier(srcStageMask, dstStageMask, vk::DependencyFlags(), nullptr, diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp index eea3d6c44..ee34949e0 100644 --- a/src/include/kompute/Tensor.hpp +++ b/src/include/kompute/Tensor.hpp @@ -25,9 +25,9 @@ class Tensor */ enum class TensorTypes { - eDevice = 0, - eStaging = 1, - eStorage = 2, + eDevice = 0, ///< Type is device memory, source and destination + eStaging = 1, ///< Type is host memory, source and destination + eStorage = 2, ///< Type is Device memory (only) }; /** @@ -55,8 +55,7 @@ class Tensor * Initialiser which calls the initialisation for all the respective tensors as well as creates the respective staging tensors. The staging tensors woudl only be created for the tensors of type TensorType::eDevice as otherwise there is no need to copy from host memory. */ void init(std::shared_ptr physicalDevice, - std::shared_ptr device, - std::shared_ptr commandBuffer); + std::shared_ptr device); /** * Destroys and frees the GPU resources which include the buffer and memory. @@ -119,23 +118,27 @@ class Tensor * thensor. This is intended to pass memory into a processing, to perform * a staging buffer transfer, or to gather output (between others). * + * @param commandBuffer Vulkan Command Buffer to record the commands into * @param copyFromTensor Tensor to copy the data from * @param createBarrier Whether to create a barrier that ensures the data is * copied before further operations. 
Default is true. */ - void recordCopyFrom(std::shared_ptr copyFromTensor, + void recordCopyFrom(std::shared_ptr commandBuffer, + std::shared_ptr copyFromTensor, bool createBarrier); /** * Records the buffer memory barrier into the command buffer which * ensures that relevant data transfers are carried out correctly. * + * @param commandBuffer Vulkan Command Buffer to record the commands into * @param srcAccessMask Access flags for source access mask * @param dstAccessMask Access flags for destination access mask * @param scrStageMask Pipeline stage flags for source stage mask * @param dstStageMask Pipeline stage flags for destination stage mask */ - void recordBufferMemoryBarrier(vk::AccessFlagBits srcAccessMask, + void recordBufferMemoryBarrier(std::shared_ptr commandBuffer, + vk::AccessFlagBits srcAccessMask, vk::AccessFlagBits dstAccessMask, vk::PipelineStageFlagBits srcStageMask, vk::PipelineStageFlagBits dstStageMask); @@ -163,7 +166,6 @@ class Tensor // -------------- NEVER OWNED RESOURCES std::shared_ptr mPhysicalDevice; std::shared_ptr mDevice; - std::shared_ptr mCommandBuffer; // -------------- OPTIONALLY OWNED RESOURCES std::shared_ptr mBuffer; diff --git a/src/include/kompute/operations/OpAlgoBase.hpp b/src/include/kompute/operations/OpAlgoBase.hpp index 9709e146b..86e03c442 100644 --- a/src/include/kompute/operations/OpAlgoBase.hpp +++ b/src/include/kompute/operations/OpAlgoBase.hpp @@ -260,7 +260,7 @@ OpAlgoBase::init() std::shared_ptr stagingTensor = std::make_shared( tensor->data(), Tensor::TensorTypes::eStaging); stagingTensor->init( - this->mPhysicalDevice, this->mDevice, this->mCommandBuffer); + this->mPhysicalDevice, this->mDevice); this->mOutputStagingTensors.push_back(stagingTensor); } } @@ -283,6 +283,7 @@ OpAlgoBase::record() // Barrier to ensure the data is finished writing to buffer memory for (std::shared_ptr tensor : this->mTensors) { tensor->recordBufferMemoryBarrier( + this->mCommandBuffer, vk::AccessFlagBits::eHostWrite, 
vk::AccessFlagBits::eShaderRead, vk::PipelineStageFlagBits::eHost, @@ -295,6 +296,7 @@ OpAlgoBase::record() // Barrier to ensure the shader code is executed before buffer read for (const std::shared_ptr& tensor : this->mTensors) { tensor->recordBufferMemoryBarrier( + this->mCommandBuffer, vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eTransferRead, vk::PipelineStageFlagBits::eComputeShader, @@ -304,7 +306,9 @@ OpAlgoBase::record() // Record copy from and create barrier for STAGING tensors for (size_t i = 0; i < this->mTensors.size(); i++) { this->mOutputStagingTensors[i]->recordCopyFrom( - this->mTensors[i], true); + this->mCommandBuffer, + this->mTensors[i], + true); } } } diff --git a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp index ecb7e33d2..1f9605878 100644 --- a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp +++ b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp @@ -128,7 +128,7 @@ OpAlgoLhsRhsOut::init() throw std::runtime_error( "Kompute OpAlgoLhsRhsOut called with less than 1 tensor"); } else if (this->mTensors.size() > 3) { - spdlog::warn("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors"); + SPDLOG_WARN("Kompute OpAlgoLhsRhsOut called with more than 3 this->mTensors"); } this->mTensorLHS = this->mTensors[0]; diff --git a/src/include/kompute/operations/OpCreateTensor.hpp b/src/include/kompute/operations/OpCreateTensor.hpp index 025bf2862..e04e65062 100644 --- a/src/include/kompute/operations/OpCreateTensor.hpp +++ b/src/include/kompute/operations/OpCreateTensor.hpp @@ -47,8 +47,10 @@ class OpCreateTensor : public OpBase void init() override; /** - * Records the copy command into the GPU memory from the staging or host - * memory depending on the type of tensor. + * Record runs the core actions to create the tensors. For device tensors + * it records a copyCommand to move the data from the staging tensor to the + * device tensor. 
For staging tensors it performs a mapDataIntoHostMemory + * which would perform immediately as opposed to on sequence eval/submission. */ void record() override; diff --git a/src/include/kompute/operations/OpTensorCopy.hpp b/src/include/kompute/operations/OpTensorCopy.hpp new file mode 100644 index 000000000..646d1b158 --- /dev/null +++ b/src/include/kompute/operations/OpTensorCopy.hpp @@ -0,0 +1,57 @@ +#pragma once + +#include "kompute/Core.hpp" + +#include "kompute/Tensor.hpp" + +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +/** + Operation that copies the data from the first tensor to the rest of the tensors provided, using a record command for all the vectors. This operation does not own/manage the memory of the tensors passed to it. +*/ +class OpTensorCopy : public OpBase +{ + public: + OpTensorCopy(); + + /** + * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. + * + * @param physicalDevice Vulkan physical device used to find device queues + * @param device Vulkan logical device for passing to Algorithm + * @param commandBuffer Vulkan Command Buffer to record commands into + * @param tensors Tensors to operate on; the first is the copy source and the remaining ones are the destinations. + */ + OpTensorCopy(std::shared_ptr physicalDevice, + std::shared_ptr device, + std::shared_ptr commandBuffer, + std::vector> tensors); + + /** + * Default destructor which in this case expects the parent class to free + * the tensors + */ + ~OpTensorCopy() override; + + /** + * Validates the tensors: throws std::runtime_error if fewer than two tensors were provided, if any tensor has not been initialised, or if any tensor is of type eStorage. + */ + void init() override; + + /** + * Records the copy commands from the first tensor into all the other tensors provided. No memory barrier is recorded after the copies. + */ + void record() override; + + /** + * Copies the local vectors for all the tensors to sync the data with the gpu. 
+ */ + void postSubmit() override; + + private: +}; + +} // End namespace kp + diff --git a/test/TestTensor.cpp b/test/TestTensor.cpp index d6195ed25..94f8aa92d 100644 --- a/test/TestTensor.cpp +++ b/test/TestTensor.cpp @@ -30,9 +30,7 @@ TEST(TestTensor, CopyFromHostData) { sq->record({tensorA, tensorB}); - tensorA->mapDataIntoHostMemory(); - - tensorB->recordCopyFrom(tensorA, true); + sq->record({tensorA, tensorB}); sq->end();