From 8ec486b4abb8d02229e5e2469919c8c6c2c419c4 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Wed, 14 Oct 2020 21:50:33 +0100 Subject: [PATCH 01/19] Added initial async capability --- src/Sequence.cpp | 58 ++++++++++++++++++++++++++------ src/include/kompute/Manager.hpp | 57 +++++++++++++++++++++++++++++++ src/include/kompute/Sequence.hpp | 27 ++++++++++++++- 3 files changed, 130 insertions(+), 12 deletions(-) diff --git a/src/Sequence.cpp b/src/Sequence.cpp index 7f2c63a44..6fd9773e2 100644 --- a/src/Sequence.cpp +++ b/src/Sequence.cpp @@ -118,38 +118,74 @@ Sequence::end() bool Sequence::eval() { - SPDLOG_DEBUG("Kompute sequence compute recording EVAL"); + SPDLOG_DEBUG("Kompute sequence EVAL BEGIN"); - if (this->isRecording()) { - SPDLOG_WARN("Kompute Sequence eval called when still recording"); + bool evalResult = this->evalAsync(); + if (!evalResult) { + SPDLOG_DEBUG("Kompute sequence EVAL FAILURE"); return false; } + evalResult = this->evalAwait(); + + SPDLOG_DEBUG("Kompute sequence EVAL SUCCESS"); + + return evalResult; +} + +bool +Sequence::evalAsync() +{ + if (this->isRecording()) { + SPDLOG_WARN("Kompute Sequence evalAsync called when still recording"); + return false; + } + if (this->mEvalBusy) { + SPDLOG_WARN("Kompute Sequence evalAsync called when an eval async was called without successful wait"); + return false; + } + + this->mEvalBusy = true; + for (size_t i = 0; i < this->mOperations.size(); i++) { this->mOperations[i]->preEval(); } const vk::PipelineStageFlags waitStageMask = - vk::PipelineStageFlagBits::eTransfer; + vk::PipelineStageFlagBits::eTransfer & vk::PipelineStageFlagBits::eComputeShader; vk::SubmitInfo submitInfo( 0, nullptr, &waitStageMask, 1, this->mCommandBuffer.get()); - vk::Fence fence = this->mDevice->createFence(vk::FenceCreateInfo()); + this->mFence = this->mDevice->createFence(vk::FenceCreateInfo()); SPDLOG_DEBUG( "Kompute sequence submitting command buffer into compute queue"); - this->mComputeQueue->submit(1, &submitInfo, fence); - this->mDevice->waitForFences(1, &fence, VK_TRUE, UINT64_MAX); - this->mDevice->destroy(fence); + this->mComputeQueue->submit(1, &submitInfo, this->mFence); +} + +bool +Sequence::evalAwait(uint64_t waitFor) +{ + if (!this->mEvalBusy) { + SPDLOG_WARN("Kompute Sequence evalAwait called without existing eval"); + return false; + } + + vk::Result result = this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor); + this->mDevice->destroy(this->mFence); + + if (result == vk::Result::eTimeout) { + SPDLOG_WARN("Kompute Sequence evalAwait timed out"); + this->mEvalBusy = false; + return false; + } for (size_t i = 0; i < this->mOperations.size(); i++) { this->mOperations[i]->postEval(); } - SPDLOG_DEBUG("Kompute sequence EVAL success"); - - return true; + this->mEvalBusy = false; } bool diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp index e2c7832fd..70eea2e73 100644 --- a/src/include/kompute/Manager.hpp +++ b/src/include/kompute/Manager.hpp @@ -96,6 +96,63 @@ class Manager SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS"); } + /** + * Operation that adds extra operations to existing or new created + * sequences. + * + * @param tensors The tensors to be used in the operation recorded + * @param sequenceName The name of the sequence to be retrieved or created + * @param TArgs Template parameters that will be used to initialise + * Operation to allow for extensible configurations on initialisation + */ + template + void evalOpAsync(std::vector> tensors, + std::string sequenceName, + TArgs&&... params) + { + SPDLOG_DEBUG("Kompute Manager evalOp triggered"); + std::weak_ptr sqWeakPtr = + this->getOrCreateManagedSequence(sequenceName); + + if (std::shared_ptr sq = sqWeakPtr.lock()) { + SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN"); + sq->begin(); + + SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD"); + sq->record(tensors, std::forward(params)...); + + SPDLOG_DEBUG("Kompute Manager evalOp running sequence END"); + sq->end(); + + SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL"); + sq->evalAsync(); + } + SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS"); + } + + /** + * Operation that adds extra operations to existing or new created + * sequences. + * + * @param tensors The tensors to be used in the operation recorded + * @param sequenceName The name of the sequence to be retrieved or created + * @param TArgs Template parameters that will be used to initialise + * Operation to allow for extensible configurations on initialisation + */ + template + void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX) + { + SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered"); + std::weak_ptr sqWeakPtr = + this->getOrCreateManagedSequence(sequenceName); + + if (std::shared_ptr sq = sqWeakPtr.lock()) { + SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence EVAL AWAIT"); + sq->evalAwait(waitFor); + } + SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS"); + } + /** * Operation that adds extra operations to existing or new created * sequences. diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp index 6e4a4ab4f..749fdbeeb 100644 --- a/src/include/kompute/Sequence.hpp +++ b/src/include/kompute/Sequence.hpp @@ -45,19 +45,42 @@ class Sequence /** * Begins recording commands for commands to be submitted into the command * buffer. + * + * @return Boolean stating whether execution was successful. */ bool begin(); + /** * Ends the recording and stops recording commands when the record command * is sent. + * + * @return Boolean stating whether execution was successful. */ bool end(); + /** * Eval sends all the recorded and stored operations in the vector of * operations into the gpu as a submit job with a barrier. + * + * @return Boolean stating whether execution was successful. */ bool eval(); + /** + * Eval Async sends all the recorded and stored operations in the vector of operations into the gpu as a submit job with a barrier. EvalAwait() must be called after to ensure the sequence is terminated correctly. + * + * @return Boolean stating whether execution was successful. + */ + bool evalAsync(); + + /** + * Eval Await waits for the fence to finish processing and then once it finishes, it runs the postEval of all operations. + * + * @param waitFor Number of milliseconds to wait before timing out. + * @return Boolean stating whether execution was successful. + */ + bool evalAwait(uint64_t waitFor = UINT64_MAX); + /** * Returns true if the sequence is currently in recording activated. * @@ -134,12 +157,14 @@ class Sequence std::shared_ptr mCommandBuffer = nullptr; bool mFreeCommandBuffer = false; - // Base op objects + // -------------- ALWAYS OWNED RESOURCES + vk::Fence mFence; std::vector> mOperations; // State bool mIsInit = false; bool mRecording = false; + bool mEvalBusy = false; // Create functions void createCommandPool(); From 33df1dec4e8e157ce8d4860a78f85230af3f61e3 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Thu, 15 Oct 2020 07:16:01 +0100 Subject: [PATCH 02/19] Added evalOpAwait and evalOpAsync into the manager --- src/include/kompute/Manager.hpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp index 70eea2e73..7f95aa9ab 100644 --- a/src/include/kompute/Manager.hpp +++ b/src/include/kompute/Manager.hpp @@ -143,14 +143,16 @@ class Manager void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX) { SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered"); - std::weak_ptr sqWeakPtr = - this->getOrCreateManagedSequence(sequenceName); + std::unordered_map>::iterator found = + this->mManagedSequences.find(sequenceName); - if (std::shared_ptr sq = sqWeakPtr.lock()) { - SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence EVAL AWAIT"); - sq->evalAwait(waitFor); + if (found == this->mManagedSequences.end()) { + if (std::shared_ptr sq = found->second) { + SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence Sequence EVAL AWAIT"); + sq->evalAwait(waitFor); + } + SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS"); } - SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS"); } /** From 48805e16399751f3df54ca1dcfd8ad404a82a84f Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Thu, 15 Oct 2020 09:25:16 +0100 Subject: [PATCH 03/19] Updated to function for async --- single_include/kompute/Kompute.hpp | 98 ++++++++++++++++++- src/Sequence.cpp | 21 ++-- src/include/kompute/Manager.hpp | 33 ++++--- src/include/kompute/Sequence.hpp | 9 +- src/include/kompute/operations/OpAlgoBase.hpp | 2 +- 5 files changed, 138 insertions(+), 25 deletions(-) diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index 2718b6599..ee8705df5 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -1033,19 +1033,42 @@ class Sequence /** * Begins recording commands for commands to be submitted into the command * buffer. + * + * @return Boolean stating whether execution was successful. */ bool begin(); + /** * Ends the recording and stops recording commands when the record command * is sent. + * + * @return Boolean stating whether execution was successful. */ bool end(); + /** * Eval sends all the recorded and stored operations in the vector of * operations into the gpu as a submit job with a barrier. + * + * @return Boolean stating whether execution was successful. */ bool eval(); + /** + * Eval Async sends all the recorded and stored operations in the vector of operations into the gpu as a submit job with a barrier. EvalAwait() must be called after to ensure the sequence is terminated correctly. + * + * @return Boolean stating whether execution was successful. + */ + bool evalAsync(); + + /** + * Eval Await waits for the fence to finish processing and then once it finishes, it runs the postEval of all operations. + * + * @param waitFor Number of milliseconds to wait before timing out. + * @return Boolean stating whether execution was successful. + */ + bool evalAwait(uint64_t waitFor = UINT64_MAX); + /** * Returns true if the sequence is currently in recording activated. * @@ -1053,6 +1076,13 @@ class Sequence */ bool isRecording(); + /** + * Returns true if the sequence is currently running - mostly used for async workloads. + * + * @return Boolean stating if currently running. + */ + bool isRunning(); + /** * Returns true if the sequence has been successfully initialised. * @@ -1122,12 +1152,14 @@ class Sequence std::shared_ptr mCommandBuffer = nullptr; bool mFreeCommandBuffer = false; - // Base op objects + // -------------- ALWAYS OWNED RESOURCES + vk::Fence mFence; std::vector> mOperations; // State bool mIsInit = false; bool mRecording = false; + bool mIsRunning = false; // Create functions void createCommandPool(); @@ -1292,6 +1324,68 @@ class Manager SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS"); } + /** + * Operation that adds extra operations to existing or new created + * sequences. + * + * @param tensors The tensors to be used in the operation recorded + * @param sequenceName The name of the sequence to be retrieved or created + * @param params Template parameters that will be used to initialise + * Operation to allow for extensible configurations on initialisation + */ + template + void evalOpAsync(std::vector> tensors, + std::string sequenceName, + TArgs&&... params) + { + SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered"); + std::weak_ptr sqWeakPtr = + this->getOrCreateManagedSequence(sequenceName); + + if (std::shared_ptr sq = sqWeakPtr.lock()) { + SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN"); + sq->begin(); + + SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD"); + sq->record(tensors, std::forward(params)...); + + SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END"); + sq->end(); + + SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL"); + sq->evalAsync(); + } + SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS"); + } + + /** + * Operation that adds extra operations to existing or new created + * sequences. + * + * @param sequenceName The name of the sequence to wait for termination + * @param waitFor The amount of time to wait before timing out + */ + template + void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX) + { + SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered"); + std::unordered_map>::iterator found = + this->mManagedSequences.find(sequenceName); + + if (found != this->mManagedSequences.end()) { + if (std::shared_ptr sq = found->second) { + SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence Sequence EVAL AWAIT"); + if (sq->isRunning()) { + sq->evalAwait(waitFor); + } + } + SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS"); + } + else { + SPDLOG_ERROR("Sequence not found"); + } + } + /** * Operation that adds extra operations to existing or new created * sequences. @@ -1599,7 +1693,7 @@ OpAlgoBase::OpAlgoBase(std::shared_ptr physicalD std::vector>& tensors) : OpBase(physicalDevice, device, commandBuffer, tensors, false) { - SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {} , shaderFilePath: {}", tensors.size()); + SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size()); // The dispatch size is set up based on either explicitly provided template // parameters or by default it would take the shape and size of the tensors diff --git a/src/Sequence.cpp b/src/Sequence.cpp index 6fd9773e2..9038db2a7 100644 --- a/src/Sequence.cpp +++ b/src/Sequence.cpp @@ -140,12 +140,12 @@ Sequence::evalAsync() SPDLOG_WARN("Kompute Sequence evalAsync called when still recording"); return false; } - if (this->mEvalBusy) { + if (this->mIsRunning) { SPDLOG_WARN("Kompute Sequence evalAsync called when an eval async was called without successful wait"); return false; } - this->mEvalBusy = true; + this->mIsRunning = true; for (size_t i = 0; i < this->mOperations.size(); i++) { this->mOperations[i]->preEval(); @@ -154,7 +154,7 @@ Sequence::evalAsync() const vk::PipelineStageFlags waitStageMask = vk::PipelineStageFlagBits::eTransfer & vk::PipelineStageFlagBits::eComputeShader; vk::SubmitInfo submitInfo( - 0, nullptr, &waitStageMask, 1, this->mCommandBuffer.get()); + 0, nullptr, nullptr, 1, this->mCommandBuffer.get()); this->mFence = this->mDevice->createFence(vk::FenceCreateInfo()); @@ -162,12 +162,14 @@ Sequence::evalAsync() "Kompute sequence submitting command buffer into compute queue"); this->mComputeQueue->submit(1, &submitInfo, this->mFence); + + return true; } bool Sequence::evalAwait(uint64_t waitFor) { - if (!this->mEvalBusy) { + if (!this->mIsRunning) { SPDLOG_WARN("Kompute Sequence evalAwait called without existing eval"); return false; } @@ -177,7 +179,7 @@ Sequence::evalAwait(uint64_t waitFor) if (result == vk::Result::eTimeout) { SPDLOG_WARN("Kompute Sequence evalAwait timed out"); - this->mEvalBusy = false; + this->mIsRunning = false; return false; } @@ -185,7 +187,14 @@ Sequence::evalAwait(uint64_t waitFor) this->mOperations[i]->postEval(); } - this->mEvalBusy = false; + this->mIsRunning = false; + + return true; +} + +bool +Sequence::isRunning() { + return this->mIsRunning; } bool diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp index 7f95aa9ab..e795bbf2e 100644 --- a/src/include/kompute/Manager.hpp +++ b/src/include/kompute/Manager.hpp @@ -102,7 +102,7 @@ class Manager * * @param tensors The tensors to be used in the operation recorded * @param sequenceName The name of the sequence to be retrieved or created - * @param TArgs Template parameters that will be used to initialise + * @param params Template parameters that will be used to initialise * Operation to allow for extensible configurations on initialisation */ template @@ -110,49 +110,52 @@ class Manager std::string sequenceName, TArgs&&... params) { - SPDLOG_DEBUG("Kompute Manager evalOp triggered"); + SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered"); std::weak_ptr sqWeakPtr = this->getOrCreateManagedSequence(sequenceName); if (std::shared_ptr sq = sqWeakPtr.lock()) { - SPDLOG_DEBUG("Kompute Manager evalOp running sequence BEGIN"); + SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN"); sq->begin(); - SPDLOG_DEBUG("Kompute Manager evalOp running sequence RECORD"); + SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence RECORD"); sq->record(tensors, std::forward(params)...); - SPDLOG_DEBUG("Kompute Manager evalOp running sequence END"); + SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence END"); sq->end(); - SPDLOG_DEBUG("Kompute Manager evalOp running sequence EVAL"); + SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL"); sq->evalAsync(); } - SPDLOG_DEBUG("Kompute Manager evalOp running sequence SUCCESS"); + SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS"); } /** * Operation that adds extra operations to existing or new created * sequences. * - * @param tensors The tensors to be used in the operation recorded - * @param sequenceName The name of the sequence to be retrieved or created - * @param TArgs Template parameters that will be used to initialise - * Operation to allow for extensible configurations on initialisation + * @param sequenceName The name of the sequence to wait for termination + * @param waitFor The amount of time to wait before timing out */ - template + template void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX) { SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered"); std::unordered_map>::iterator found = - this->mManagedSequences.find(sequenceName); + this->mManagedSequences.find(sequenceName); - if (found == this->mManagedSequences.end()) { + if (found != this->mManagedSequences.end()) { if (std::shared_ptr sq = found->second) { SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence Sequence EVAL AWAIT"); - sq->evalAwait(waitFor); + if (sq->isRunning()) { + sq->evalAwait(waitFor); + } } SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS"); } + else { + SPDLOG_ERROR("Sequence not found"); + } } /** diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp index 749fdbeeb..2d9ef8e51 100644 --- a/src/include/kompute/Sequence.hpp +++ b/src/include/kompute/Sequence.hpp @@ -88,6 +88,13 @@ class Sequence */ bool isRecording(); + /** + * Returns true if the sequence is currently running - mostly used for async workloads. + * + * @return Boolean stating if currently running. + */ + bool isRunning(); + /** * Returns true if the sequence has been successfully initialised. * @@ -164,7 +171,7 @@ class Sequence // State bool mIsInit = false; bool mRecording = false; - bool mEvalBusy = false; + bool mIsRunning = false; // Create functions void createCommandPool(); diff --git a/src/include/kompute/operations/OpAlgoBase.hpp b/src/include/kompute/operations/OpAlgoBase.hpp index 3d1de2992..a32e20011 100644 --- a/src/include/kompute/operations/OpAlgoBase.hpp +++ b/src/include/kompute/operations/OpAlgoBase.hpp @@ -162,7 +162,7 @@ OpAlgoBase::OpAlgoBase(std::shared_ptr physicalD std::vector>& tensors) : OpBase(physicalDevice, device, commandBuffer, tensors, false) { - SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {} , shaderFilePath: {}", tensors.size()); + SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {}", tensors.size()); // The dispatch size is set up based on either explicitly provided template // parameters or by default it would take the shape and size of the tensors From 3e5364fc4448e079f753c980f76d74754c18b4e5 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Thu, 15 Oct 2020 09:25:23 +0100 Subject: [PATCH 04/19] Added async test --- test/TestAsyncOperations.cpp | 95 ++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 test/TestAsyncOperations.cpp diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp new file mode 100644 index 000000000..ba8b203de --- /dev/null +++ b/test/TestAsyncOperations.cpp @@ -0,0 +1,95 @@ + +#include "gtest/gtest.h" + +#include + +#include "kompute/Kompute.hpp" + +TEST(TestAsyncOperations, TestManagerAsync) +{ + uint32_t size = 100000; + + std::vector data(size, 0.0); + std::vector resultSync(size, 100000); + std::vector resultAsync(size, 200000); + + std::shared_ptr tensorA{ new kp::Tensor(data) }; + std::shared_ptr tensorB{ new kp::Tensor(data) }; + std::shared_ptr tensorC{ new kp::Tensor(data) }; + std::shared_ptr tensorD{ new kp::Tensor(data) }; + std::shared_ptr tensorE{ new kp::Tensor(data) }; + std::shared_ptr tensorF{ new kp::Tensor(data) }; + + kp::Manager mgr; + + mgr.evalOpDefault({ tensorA, tensorB, tensorC, tensorD, tensorE, tensorF }); + + std::string shader(R"( + #version 450 + + layout (local_size_x = 1) in; + + layout(set = 0, binding = 0) buffer a { float pa[]; }; + layout(set = 0, binding = 1) buffer b { float pb[]; }; + + void main() { + uint index = gl_GlobalInvocationID.x; + + for (int i = 0; i < 100000; i++) + { + pa[index] += 1.0; + } + + pb[index] = pa[index]; + } + )"); + + auto startSync = std::chrono::high_resolution_clock::now(); + + mgr.evalOpDefault>( + { tensorA, tensorB }, std::vector(shader.begin(), shader.end())); + + mgr.evalOpDefault>( + { tensorC, tensorD }, std::vector(shader.begin(), shader.end())); + + mgr.evalOpDefault>( + { tensorE, tensorF }, std::vector(shader.begin(), shader.end())); + + auto endSync = std::chrono::high_resolution_clock::now(); + + mgr.evalOpDefault({ tensorB, tensorD, tensorF }); + + EXPECT_EQ(tensorB->data(), resultSync); + EXPECT_EQ(tensorD->data(), resultSync); + EXPECT_EQ(tensorF->data(), resultSync); + + auto durationSync = std::chrono::duration_cast(endSync - startSync).count(); + + auto startAsync = std::chrono::high_resolution_clock::now(); + + mgr.evalOpAsync>( + { tensorA, tensorB }, "asyncOne", std::vector(shader.begin(), shader.end())); + + mgr.evalOpAsync>( + { tensorC, tensorD }, "asyncTwo", std::vector(shader.begin(), shader.end())); + + mgr.evalOpAsync>( + { tensorE, tensorF }, "asyncThree", std::vector(shader.begin(), shader.end())); + + mgr.evalOpAwait("asyncOne"); + mgr.evalOpAwait("asyncTwo"); + mgr.evalOpAwait("asyncThree"); + + auto endAsync = std::chrono::high_resolution_clock::now(); + + auto durationAsync = std::chrono::duration_cast(endAsync - startAsync).count(); + + mgr.evalOpDefault({ tensorB, tensorD, tensorF }); + + EXPECT_EQ(tensorB->data(), resultAsync); + EXPECT_EQ(tensorD->data(), resultAsync); + EXPECT_EQ(tensorF->data(), resultAsync); + + SPDLOG_DEBUG("Total Sync: {}", durationSync); + SPDLOG_DEBUG("Total Async: {}", durationAsync); +} From 4e697bb7873c3edd07bd57fdf711688d0194919d Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Thu, 15 Oct 2020 21:40:31 +0100 Subject: [PATCH 05/19] Updated to current multiple queue implementation --- single_include/kompute/Kompute.hpp | 41 +++++++++--- src/Manager.cpp | 47 +++++++++----- src/include/kompute/Manager.hpp | 41 +++++++++--- test/TestAsyncOperations.cpp | 101 +++++++++++++++++------------ 4 files changed, 151 insertions(+), 79 deletions(-) diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index ee8705df5..a9860425c 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -1253,19 +1253,22 @@ class Manager Manager(); /** - Similar to base constructor but allows the user to provide the device - they would like to create the resources on. + * Similar to base constructor but allows the user to provide the device + * they would like to create the resources on. + * + * @param physicalDeviceIndex The index of the physical device to use + * @param totalQueues The total number of compute queues to create. */ - Manager(uint32_t physicalDeviceIndex); + Manager(uint32_t physicalDeviceIndex, uint32_t totalComputeQueues = 1); /** * Manager constructor which allows your own vulkan application to integrate * with the vulkan kompute use. * * @param instance Vulkan compute instance to base this application - * @physicalDevice Vulkan physical device to use for application - * @device Vulkan logical device to use for all base resources - * @physicalDeviceIndex Index for vulkan physical device used + * @param physicalDevice Vulkan physical device to use for application + * @param device Vulkan logical device to use for all base resources + * @param physicalDeviceIndex Index for vulkan physical device used */ Manager(std::shared_ptr instance, std::shared_ptr physicalDevice, @@ -1290,6 +1293,16 @@ class Manager std::weak_ptr getOrCreateManagedSequence( std::string sequenceName); + /** + * Create a new managed Kompute sequence so it's available within the manager. + * + * @param sequenceName The name for the named sequence to be created + * @param queueIndex The queue to use from the available queues + * @return Weak pointer to the manager owned sequence resource + */ + std::weak_ptr createManagedSequence( + std::string sequenceName, uint32_t queueIndex = 0); + /** * Operation that adds extra operations to existing or new created * sequences. @@ -1342,7 +1355,12 @@ class Manager std::weak_ptr sqWeakPtr = this->getOrCreateManagedSequence(sequenceName); - if (std::shared_ptr sq = sqWeakPtr.lock()) { + std::unordered_map>::iterator found = + this->mManagedSequences.find(sequenceName); + + if (found == this->mManagedSequences.end()) { + std::shared_ptr sq = found->second; + SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN"); sq->begin(); @@ -1355,6 +1373,9 @@ class Manager SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL"); sq->evalAsync(); } + else { + SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found", sequenceName); + } SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS"); } @@ -1382,7 +1403,7 @@ class Manager SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS"); } else { - SPDLOG_ERROR("Sequence not found"); + SPDLOG_ERROR("Kompute Manager evalOpAwait Sequence not found"); } } @@ -1437,7 +1458,7 @@ class Manager std::shared_ptr mDevice = nullptr; bool mFreeDevice = false; uint32_t mComputeQueueFamilyIndex = -1; - std::shared_ptr mComputeQueue = nullptr; + std::vector> mComputeQueues; // -------------- ALWAYS OWNED RESOURCES std::unordered_map> @@ -1452,7 +1473,7 @@ class Manager // Create functions void createInstance(); - void createDevice(); + void createDevice(uint32_t totalComputeQueues); }; } // End namespace kp diff --git a/src/Manager.cpp b/src/Manager.cpp index 1debb4c9a..58f067843 100644 --- a/src/Manager.cpp +++ b/src/Manager.cpp @@ -25,15 +25,15 @@ debugMessageCallback(VkDebugReportFlagsEXT flags, #endif Manager::Manager() - : Manager(0) + : Manager(0, 1) {} -Manager::Manager(uint32_t physicalDeviceIndex) +Manager::Manager(uint32_t physicalDeviceIndex, uint32_t totalComputeQueues) { this->mPhysicalDeviceIndex = physicalDeviceIndex; this->createInstance(); - this->createDevice(); + this->createDevice(totalComputeQueues); } Manager::Manager(std::shared_ptr instance, @@ -98,19 +98,27 @@ Manager::getOrCreateManagedSequence(std::string sequenceName) this->mManagedSequences.find(sequenceName); if (found == this->mManagedSequences.end()) { - std::shared_ptr sq = - std::make_shared(this->mPhysicalDevice, - this->mDevice, - this->mComputeQueue, - this->mComputeQueueFamilyIndex); - sq->init(); - this->mManagedSequences.insert({ sequenceName, sq }); - return sq; + return this->createManagedSequence(sequenceName); } else { return found->second; } } +std::weak_ptr +Manager::createManagedSequence(std::string sequenceName, uint32_t queueIndex) { + + SPDLOG_DEBUG("Kompute Manager createManagedSequence with sequenceName: {} and queueIndex: {}", sequenceName, queueIndex); + + std::shared_ptr sq = + std::make_shared(this->mPhysicalDevice, + this->mDevice, + this->mComputeQueues[queueIndex], + this->mComputeQueueFamilyIndex); + sq->init(); + this->mManagedSequences.insert({ sequenceName, sq }); + return sq; +} + void Manager::createInstance() { @@ -197,7 +205,7 @@ Manager::createInstance() } void -Manager::createDevice() +Manager::createDevice(uint32_t totalComputeQueues) { SPDLOG_DEBUG("Kompute Manager creating Device"); @@ -248,7 +256,7 @@ Manager::createDevice() } const float defaultQueuePriority(0.0f); - const uint32_t defaultQueueCount(1); + const uint32_t defaultQueueCount(totalComputeQueues); vk::DeviceQueueCreateInfo deviceQueueCreateInfo( vk::DeviceQueueCreateFlags(), this->mComputeQueueFamilyIndex, @@ -264,9 +272,16 @@ Manager::createDevice() &deviceCreateInfo, nullptr, this->mDevice.get()); SPDLOG_DEBUG("Kompute Manager device created"); - this->mComputeQueue = std::make_shared(); - this->mDevice->getQueue( - this->mComputeQueueFamilyIndex, 0, this->mComputeQueue.get()); + for (uint32_t i = 0; i < totalComputeQueues; i++) + { + std::shared_ptr currQueue = std::make_shared(); + + this->mDevice->getQueue( + this->mComputeQueueFamilyIndex, i, currQueue.get()); + + this->mComputeQueues.push_back(currQueue); + } + SPDLOG_DEBUG("Kompute Manager compute queue obtained"); } diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp index e795bbf2e..e2709836c 100644 --- a/src/include/kompute/Manager.hpp +++ b/src/include/kompute/Manager.hpp @@ -25,19 +25,22 @@ class Manager Manager(); /** - Similar to base constructor but allows the user to provide the device - they would like to create the resources on. + * Similar to base constructor but allows the user to provide the device + * they would like to create the resources on. + * + * @param physicalDeviceIndex The index of the physical device to use + * @param totalQueues The total number of compute queues to create. */ - Manager(uint32_t physicalDeviceIndex); + Manager(uint32_t physicalDeviceIndex, uint32_t totalComputeQueues = 1); /** * Manager constructor which allows your own vulkan application to integrate * with the vulkan kompute use. * * @param instance Vulkan compute instance to base this application - * @physicalDevice Vulkan physical device to use for application - * @device Vulkan logical device to use for all base resources - * @physicalDeviceIndex Index for vulkan physical device used + * @param physicalDevice Vulkan physical device to use for application + * @param device Vulkan logical device to use for all base resources + * @param physicalDeviceIndex Index for vulkan physical device used */ Manager(std::shared_ptr instance, std::shared_ptr physicalDevice, @@ -62,6 +65,16 @@ class Manager std::weak_ptr getOrCreateManagedSequence( std::string sequenceName); + /** + * Create a new managed Kompute sequence so it's available within the manager. + * + * @param sequenceName The name for the named sequence to be created + * @param queueIndex The queue to use from the available queues + * @return Weak pointer to the manager owned sequence resource + */ + std::weak_ptr createManagedSequence( + std::string sequenceName, uint32_t queueIndex = 0); + /** * Operation that adds extra operations to existing or new created * sequences. @@ -114,7 +127,12 @@ class Manager std::weak_ptr sqWeakPtr = this->getOrCreateManagedSequence(sequenceName); - if (std::shared_ptr sq = sqWeakPtr.lock()) { + std::unordered_map>::iterator found = + this->mManagedSequences.find(sequenceName); + + if (found == this->mManagedSequences.end()) { + std::shared_ptr sq = found->second; + SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN"); sq->begin(); @@ -127,6 +145,9 @@ class Manager SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL"); sq->evalAsync(); } + else { + SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found", sequenceName); + } SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS"); } @@ -154,7 +175,7 @@ class Manager SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS"); } else { - SPDLOG_ERROR("Sequence not found"); + SPDLOG_ERROR("Kompute Manager evalOpAwait Sequence not found"); } } @@ -209,7 +230,7 @@ class Manager std::shared_ptr mDevice = nullptr; bool mFreeDevice = false; uint32_t mComputeQueueFamilyIndex = -1; - std::shared_ptr mComputeQueue = nullptr; + std::vector> mComputeQueues; // -------------- ALWAYS OWNED RESOURCES std::unordered_map> @@ -224,7 +245,7 @@ class Manager // Create functions void createInstance(); - void createDevice(); + void createDevice(uint32_t totalComputeQueues); }; } // End namespace kp diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp index ba8b203de..15fd9caef 100644 --- a/test/TestAsyncOperations.cpp +++ b/test/TestAsyncOperations.cpp @@ -9,21 +9,6 @@ TEST(TestAsyncOperations, TestManagerAsync) { uint32_t size = 100000; - std::vector data(size, 0.0); - std::vector resultSync(size, 100000); - std::vector resultAsync(size, 200000); - - std::shared_ptr tensorA{ new kp::Tensor(data) }; - std::shared_ptr tensorB{ new kp::Tensor(data) }; - std::shared_ptr tensorC{ new kp::Tensor(data) }; - std::shared_ptr tensorD{ new kp::Tensor(data) }; - std::shared_ptr tensorE{ new kp::Tensor(data) }; - std::shared_ptr tensorF{ new kp::Tensor(data) }; - - kp::Manager mgr; - - mgr.evalOpDefault({ tensorA, tensorB, tensorC, tensorD, tensorE, tensorF }); - std::string shader(R"( #version 450 @@ -44,52 +29,82 @@ TEST(TestAsyncOperations, TestManagerAsync) } )"); + std::vector data(size, 0.0); + std::vector resultSync(size, 100000); + std::vector resultAsync(size, 100000); + + std::shared_ptr tensorSyncA{ new kp::Tensor(data) }; + std::shared_ptr tensorSyncB{ new kp::Tensor(data) }; + std::shared_ptr tensorSyncC{ new kp::Tensor(data) }; + std::shared_ptr tensorSyncD{ new kp::Tensor(data) }; + std::shared_ptr tensorSyncE{ new kp::Tensor(data) }; + std::shared_ptr tensorSyncF{ new kp::Tensor(data) }; + + kp::Manager mgr; + + mgr.evalOpDefault({ tensorSyncA, tensorSyncB, tensorSyncC, tensorSyncD, tensorSyncE, tensorSyncF }); + auto startSync = std::chrono::high_resolution_clock::now(); mgr.evalOpDefault>( - { tensorA, tensorB }, std::vector(shader.begin(), shader.end())); + { tensorSyncA, tensorSyncB }, std::vector(shader.begin(), shader.end())); mgr.evalOpDefault>( - { tensorC, tensorD }, std::vector(shader.begin(), shader.end())); + { tensorSyncC, tensorSyncD }, std::vector(shader.begin(), shader.end())); mgr.evalOpDefault>( - { tensorE, tensorF }, std::vector(shader.begin(), shader.end())); + { tensorSyncE, tensorSyncF }, std::vector(shader.begin(), shader.end())); + + mgr.evalOpDefault({ tensorSyncB, tensorSyncD, tensorSyncF }); auto endSync = std::chrono::high_resolution_clock::now(); - - mgr.evalOpDefault({ tensorB, tensorD, tensorF }); - - EXPECT_EQ(tensorB->data(), resultSync); - EXPECT_EQ(tensorD->data(), resultSync); - EXPECT_EQ(tensorF->data(), resultSync); - auto durationSync = std::chrono::duration_cast(endSync - startSync).count(); - auto startAsync = std::chrono::high_resolution_clock::now(); + EXPECT_EQ(tensorSyncB->data(), resultSync); + EXPECT_EQ(tensorSyncD->data(), resultSync); + EXPECT_EQ(tensorSyncF->data(), resultSync); - mgr.evalOpAsync>( - { tensorA, tensorB }, "asyncOne", std::vector(shader.begin(), shader.end())); + //std::shared_ptr tensorAsyncA{ new kp::Tensor(data) }; + //std::shared_ptr tensorAsyncB{ new kp::Tensor(data) }; + //std::shared_ptr tensorAsyncC{ new kp::Tensor(data) }; + //std::shared_ptr tensorAsyncD{ new kp::Tensor(data) }; + //std::shared_ptr tensorAsyncE{ new kp::Tensor(data) }; + //std::shared_ptr tensorAsyncF{ new kp::Tensor(data) }; - mgr.evalOpAsync>( - { tensorC, tensorD }, "asyncTwo", std::vector(shader.begin(), shader.end())); + //kp::Manager mgrAsync(0, 1); - mgr.evalOpAsync>( - { tensorE, tensorF }, "asyncThree", std::vector(shader.begin(), shader.end())); + //mgrAsync.evalOpDefault({ tensorAsyncA, tensorAsyncB, tensorAsyncC, tensorAsyncD, tensorAsyncE, tensorAsyncF }); - mgr.evalOpAwait("asyncOne"); - mgr.evalOpAwait("asyncTwo"); - mgr.evalOpAwait("asyncThree"); + //mgrAsync.createManagedSequence("async0", 0); + ////mgrAsync.createManagedSequence("async1", 1); + ////mgrAsync.createManagedSequence("async2", 2); - auto endAsync = std::chrono::high_resolution_clock::now(); + //auto startAsync = std::chrono::high_resolution_clock::now(); - auto durationAsync = std::chrono::duration_cast(endAsync - startAsync).count(); + //mgrAsync.evalOpAsync>( + // { tensorAsyncA, tensorAsyncB }, "async0", std::vector(shader.begin(), shader.end())); - mgr.evalOpDefault({ tensorB, tensorD, tensorF }); + ////mgrAsync.evalOpAsync>( + //// { tensorAsyncC, tensorAsyncD }, "async1", std::vector(shader.begin(), shader.end())); - EXPECT_EQ(tensorB->data(), resultAsync); - EXPECT_EQ(tensorD->data(), resultAsync); - EXPECT_EQ(tensorF->data(), resultAsync); + ////mgrAsync.evalOpAsync>( + //// { tensorAsyncE, tensorAsyncF }, "async2", std::vector(shader.begin(), shader.end())); - SPDLOG_DEBUG("Total Sync: {}", durationSync); - SPDLOG_DEBUG("Total Async: {}", durationAsync); + //mgrAsync.evalOpAwait("async0"); + ////mgrAsync.evalOpAwait("async1"); + ////mgrAsync.evalOpAwait("async2"); + + //mgrAsync.evalOpDefault({ tensorAsyncB }); + ////mgrAsync.evalOpDefault({ tensorAsyncD }); + ////mgrAsync.evalOpDefault({ tensorAsyncF }); + + //auto endAsync = std::chrono::high_resolution_clock::now(); + //auto durationAsync = std::chrono::duration_cast(endAsync - startAsync).count(); + + //EXPECT_EQ(tensorAsyncB->data(), resultAsync); + ////EXPECT_EQ(tensorAsyncD->data(), resultAsync); + ////EXPECT_EQ(tensorAsyncF->data(), resultAsync); + + ////SPDLOG_DEBUG("Total Sync: {}", durationSync); + //SPDLOG_DEBUG("Total Async: {}", durationAsync); } From 9e79b9f352c7f52918fb56e0a432cd1f18d075c0 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Thu, 15 Oct 2020 22:13:12 +0100 Subject: [PATCH 06/19] Updated to implementation of queues but no speedups yet --- single_include/kompute/Kompute.hpp | 2 +- src/include/kompute/Manager.hpp | 2 +- test/TestAsyncOperations.cpp | 109 +++++++++++++++-------------- 3 files changed, 57 insertions(+), 56 deletions(-) diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index a9860425c..9b032bb4f 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -1358,7 +1358,7 @@ class Manager std::unordered_map>::iterator found = this->mManagedSequences.find(sequenceName); - if (found == this->mManagedSequences.end()) { + if (found != this->mManagedSequences.end()) { std::shared_ptr sq = found->second; SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN"); diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp index e2709836c..340fd782d 100644 --- a/src/include/kompute/Manager.hpp +++ b/src/include/kompute/Manager.hpp @@ -130,7 +130,7 @@ class Manager std::unordered_map>::iterator found = this->mManagedSequences.find(sequenceName); - if (found == this->mManagedSequences.end()) { + if (found != this->mManagedSequences.end()) { std::shared_ptr sq = found->second; SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence BEGIN"); diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp index 15fd9caef..1c0f74e59 100644 --- a/test/TestAsyncOperations.cpp +++ b/test/TestAsyncOperations.cpp @@ -9,6 +9,8 @@ TEST(TestAsyncOperations, TestManagerAsync) { uint32_t size = 100000; + uint32_t numParallel = 6; + std::string shader(R"( #version 450 @@ -20,91 +22,90 @@ TEST(TestAsyncOperations, TestManagerAsync) void main() { uint index = gl_GlobalInvocationID.x; - for (int i = 0; i < 100000; i++) + for (int i = 0; i < 10000; i++) { pa[index] += 1.0; } pb[index] = pa[index]; + pa[index] = 0; } )"); std::vector data(size, 0.0); - std::vector resultSync(size, 100000); - std::vector resultAsync(size, 100000); - - std::shared_ptr tensorSyncA{ new kp::Tensor(data) }; - std::shared_ptr tensorSyncB{ new kp::Tensor(data) }; - std::shared_ptr tensorSyncC{ new kp::Tensor(data) }; - std::shared_ptr tensorSyncD{ new kp::Tensor(data) }; - std::shared_ptr tensorSyncE{ new kp::Tensor(data) }; - std::shared_ptr tensorSyncF{ new kp::Tensor(data) }; + std::vector resultSync(size, 10000); + std::vector resultAsync(size, 10000); kp::Manager mgr; - mgr.evalOpDefault({ tensorSyncA, tensorSyncB, tensorSyncC, tensorSyncD, tensorSyncE, tensorSyncF }); + std::vector> inputsSyncA; + std::vector> inputsSyncB; + + for (uint32_t i = 0; i < numParallel; i++) { + inputsSyncA.push_back(std::make_shared(kp::Tensor(data))); + inputsSyncB.push_back(std::make_shared(kp::Tensor(data))); + } + + mgr.evalOpDefault(inputsSyncA); + mgr.evalOpDefault(inputsSyncB); auto startSync = std::chrono::high_resolution_clock::now(); - mgr.evalOpDefault>( - { tensorSyncA, tensorSyncB }, std::vector(shader.begin(), shader.end())); + for (uint32_t i = 0; i < numParallel; i++) { + mgr.evalOpDefault>( + { inputsSyncA[i], inputsSyncB[i] }, + std::vector(shader.begin(), shader.end())); - mgr.evalOpDefault>( - { tensorSyncC, tensorSyncD }, std::vector(shader.begin(), shader.end())); + } - mgr.evalOpDefault>( - { tensorSyncE, tensorSyncF }, std::vector(shader.begin(), shader.end())); - - mgr.evalOpDefault({ tensorSyncB, tensorSyncD, tensorSyncF }); + mgr.evalOpDefault(inputsSyncB); auto endSync = std::chrono::high_resolution_clock::now(); auto durationSync = std::chrono::duration_cast(endSync - startSync).count(); - EXPECT_EQ(tensorSyncB->data(), resultSync); - EXPECT_EQ(tensorSyncD->data(), resultSync); - EXPECT_EQ(tensorSyncF->data(), resultSync); + for (uint32_t i = 0; i < numParallel; i++) { + EXPECT_EQ(inputsSyncB[i]->data(), resultSync); + } - //std::shared_ptr tensorAsyncA{ new kp::Tensor(data) }; - //std::shared_ptr tensorAsyncB{ new kp::Tensor(data) }; - //std::shared_ptr tensorAsyncC{ new kp::Tensor(data) }; - //std::shared_ptr tensorAsyncD{ new kp::Tensor(data) }; - //std::shared_ptr tensorAsyncE{ new kp::Tensor(data) }; - //std::shared_ptr tensorAsyncF{ new kp::Tensor(data) }; + kp::Manager mgrAsync(0, numParallel); - //kp::Manager mgrAsync(0, 1); + std::vector> inputsAsyncA; + std::vector> inputsAsyncB; - //mgrAsync.evalOpDefault({ tensorAsyncA, tensorAsyncB, tensorAsyncC, tensorAsyncD, tensorAsyncE, tensorAsyncF }); + for (uint32_t i = 0; i < numParallel; i++) { + inputsAsyncA.push_back(std::make_shared(kp::Tensor(data))); + inputsAsyncB.push_back(std::make_shared(kp::Tensor(data))); + } - //mgrAsync.createManagedSequence("async0", 0); - ////mgrAsync.createManagedSequence("async1", 1); - ////mgrAsync.createManagedSequence("async2", 2); + mgrAsync.evalOpDefault(inputsAsyncA); + mgrAsync.evalOpDefault(inputsAsyncB); - //auto startAsync = std::chrono::high_resolution_clock::now(); + for (uint32_t i = 0; i < numParallel; i++) { + mgrAsync.createManagedSequence("async" + std::to_string(i), i); + } - //mgrAsync.evalOpAsync>( - // { tensorAsyncA, tensorAsyncB }, "async0", std::vector(shader.begin(), shader.end())); + auto startAsync = std::chrono::high_resolution_clock::now(); - ////mgrAsync.evalOpAsync>( - //// { tensorAsyncC, tensorAsyncD }, "async1", std::vector(shader.begin(), shader.end())); + for (uint32_t i = 0; i < numParallel; i++) { + mgrAsync.evalOpAsync>( + { inputsAsyncA[i], inputsAsyncB[i] }, + "async" + std::to_string(i), + std::vector(shader.begin(), shader.end())); + } - ////mgrAsync.evalOpAsync>( - //// { tensorAsyncE, tensorAsyncF }, "async2", std::vector(shader.begin(), shader.end())); + for (uint32_t i = 0; i < numParallel; i++) { + mgrAsync.evalOpAwait("async" + std::to_string(i)); + } - //mgrAsync.evalOpAwait("async0"); - ////mgrAsync.evalOpAwait("async1"); - ////mgrAsync.evalOpAwait("async2"); + mgrAsync.evalOpDefault({ inputsAsyncB }); - //mgrAsync.evalOpDefault({ tensorAsyncB }); - ////mgrAsync.evalOpDefault({ tensorAsyncD }); - ////mgrAsync.evalOpDefault({ tensorAsyncF }); + auto endAsync = std::chrono::high_resolution_clock::now(); + auto durationAsync = std::chrono::duration_cast(endAsync - startAsync).count(); - //auto endAsync = std::chrono::high_resolution_clock::now(); - //auto durationAsync = std::chrono::duration_cast(endAsync - startAsync).count(); + for (uint32_t i = 0; i < numParallel; i++) { + EXPECT_EQ(inputsAsyncB[i]->data(), resultAsync); + } - //EXPECT_EQ(tensorAsyncB->data(), resultAsync); - ////EXPECT_EQ(tensorAsyncD->data(), resultAsync); - ////EXPECT_EQ(tensorAsyncF->data(), resultAsync); - - ////SPDLOG_DEBUG("Total Sync: {}", durationSync); - //SPDLOG_DEBUG("Total Async: {}", durationAsync); + SPDLOG_ERROR("Total Sync: {}", durationSync); + SPDLOG_ERROR("Total Async: {}", durationAsync); } From c69fcb7e606e4f7e340cd268b71463003cdfc56c Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Fri, 16 Oct 2020 07:39:37 +0100 Subject: [PATCH 07/19] UPdated to have shorter dispatch size and larger loop size --- test/TestAsyncOperations.cpp | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp index 1c0f74e59..6da9533a9 100644 --- a/test/TestAsyncOperations.cpp +++ b/test/TestAsyncOperations.cpp @@ -7,7 +7,7 @@ TEST(TestAsyncOperations, TestManagerAsync) { - uint32_t size = 100000; + uint32_t size = 10; uint32_t numParallel = 6; @@ -19,22 +19,38 @@ TEST(TestAsyncOperations, TestManagerAsync) layout(set = 0, binding = 0) buffer a { float pa[]; }; layout(set = 0, binding = 1) buffer b { float pb[]; }; + shared uint sharedTotal[1]; + void main() { uint index = gl_GlobalInvocationID.x; - for (int i = 0; i < 10000; i++) + sharedTotal[0] = 0; + + barrier(); + memoryBarrierShared(); + + for (int i = 0; i < 100000000; i++) { - pa[index] += 1.0; + atomicAdd(sharedTotal[0], 1); + atomicAdd(sharedTotal[0], -1); + atomicAdd(sharedTotal[0], 1); + atomicAdd(sharedTotal[0], -1); + atomicAdd(sharedTotal[0], 1); + atomicAdd(sharedTotal[0], -1); + atomicAdd(sharedTotal[0], 1); } - pb[index] = pa[index]; + barrier(); + memoryBarrierShared(); + + pb[index] = sharedTotal[0]; pa[index] = 0; } )"); std::vector data(size, 0.0); - std::vector resultSync(size, 10000); - std::vector resultAsync(size, 10000); + std::vector resultSync(size, 100000000); + std::vector resultAsync(size, 100000000); kp::Manager mgr; @@ -58,11 +74,11 @@ TEST(TestAsyncOperations, TestManagerAsync) } - mgr.evalOpDefault(inputsSyncB); - auto endSync = std::chrono::high_resolution_clock::now(); auto durationSync = std::chrono::duration_cast(endSync - startSync).count(); + mgr.evalOpDefault(inputsSyncB); + for (uint32_t i = 0; i < numParallel; i++) { EXPECT_EQ(inputsSyncB[i]->data(), resultSync); } @@ -97,11 +113,11 @@ TEST(TestAsyncOperations, TestManagerAsync) mgrAsync.evalOpAwait("async" + std::to_string(i)); } - mgrAsync.evalOpDefault({ inputsAsyncB }); - auto endAsync = std::chrono::high_resolution_clock::now(); auto durationAsync = std::chrono::duration_cast(endAsync - startAsync).count(); + mgrAsync.evalOpDefault({ inputsAsyncB }); + for (uint32_t i = 0; i < numParallel; i++) { EXPECT_EQ(inputsAsyncB[i]->data(), resultAsync); } From 79ca746ebc4b368edba1a3a4c6fa7fe3eaab3fcf Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Fri, 16 Oct 2020 07:54:42 +0100 Subject: [PATCH 08/19] Removed wait stage mask from submit info --- src/Sequence.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Sequence.cpp b/src/Sequence.cpp index 9038db2a7..1496e67d9 100644 --- a/src/Sequence.cpp +++ b/src/Sequence.cpp @@ -151,8 +151,6 @@ Sequence::evalAsync() this->mOperations[i]->preEval(); } - const vk::PipelineStageFlags waitStageMask = - vk::PipelineStageFlagBits::eTransfer & vk::PipelineStageFlagBits::eComputeShader; vk::SubmitInfo submitInfo( 0, nullptr, nullptr, 1, this->mCommandBuffer.get()); From 6fc0b0b3e3f42c5d68d4d924cfa9079415266b15 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Sat, 17 Oct 2020 10:40:43 +0100 Subject: [PATCH 09/19] Added capabilities for multiple queues --- single_include/kompute/Kompute.hpp | 12 +++-- src/Manager.cpp | 79 +++++++++++++++++++----------- src/include/kompute/Manager.hpp | 12 +++-- 3 files changed, 65 insertions(+), 38 deletions(-) mode change 100644 => 100755 src/Manager.cpp diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index 9b032bb4f..15e03ccc4 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -1245,7 +1245,7 @@ namespace kp { */ class Manager { - public: +public: /** Base constructor and default used which creates the base resources including choosing the device 0 by default. @@ -1257,9 +1257,10 @@ class Manager * they would like to create the resources on. * * @param physicalDeviceIndex The index of the physical device to use + * @param familyQueueIndeces (Optional) List of queue indeces to add for explicit allocation * @param totalQueues The total number of compute queues to create. */ - Manager(uint32_t physicalDeviceIndex, uint32_t totalComputeQueues = 1); + Manager(uint32_t physicalDeviceIndex, const std::vector & familyQueueIndeces = {}); /** * Manager constructor which allows your own vulkan application to integrate @@ -1457,13 +1458,14 @@ class Manager uint32_t mPhysicalDeviceIndex = -1; std::shared_ptr mDevice = nullptr; bool mFreeDevice = false; - uint32_t mComputeQueueFamilyIndex = -1; - std::vector> mComputeQueues; // -------------- ALWAYS OWNED RESOURCES std::unordered_map> mManagedSequences; + std::vector mComputeQueueFamilyIndeces; + std::vector> mComputeQueues; + #if DEBUG #ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS vk::DebugReportCallbackEXT mDebugReportCallback; @@ -1473,7 +1475,7 @@ class Manager // Create functions void createInstance(); - void createDevice(uint32_t totalComputeQueues); + void createDevice(const std::vector & familyQueueIndeces = {}); }; } // End namespace kp diff --git a/src/Manager.cpp b/src/Manager.cpp old mode 100644 new mode 100755 index 58f067843..7e5cc831a --- a/src/Manager.cpp +++ b/src/Manager.cpp @@ -25,15 +25,15 @@ debugMessageCallback(VkDebugReportFlagsEXT flags, #endif Manager::Manager() - : Manager(0, 1) + : Manager(0) {} -Manager::Manager(uint32_t physicalDeviceIndex, uint32_t totalComputeQueues) +Manager::Manager(uint32_t physicalDeviceIndex, const std::vector & familyQueueIndeces) { this->mPhysicalDeviceIndex = physicalDeviceIndex; this->createInstance(); - this->createDevice(totalComputeQueues); + this->createDevice(familyQueueIndeces); } Manager::Manager(std::shared_ptr instance, @@ -113,7 +113,7 @@ Manager::createManagedSequence(std::string sequenceName, uint32_t queueIndex) { std::make_shared(this->mPhysicalDevice, this->mDevice, this->mComputeQueues[queueIndex], - this->mComputeQueueFamilyIndex); + this->mComputeQueueFamilyIndeces[queueIndex]); sq->init(); this->mManagedSequences.insert({ sequenceName, sq }); return sq; @@ -205,7 +205,7 @@ Manager::createInstance() } void -Manager::createDevice(uint32_t totalComputeQueues) +Manager::createDevice(const std::vector & familyQueueIndeces) { SPDLOG_DEBUG("Kompute Manager creating Device"); @@ -236,48 +236,71 @@ Manager::createDevice(uint32_t totalComputeQueues) this->mPhysicalDeviceIndex, physicalDeviceProperties.deviceName); - // Find compute queue - std::vector allQueueFamilyProperties = - physicalDevice.getQueueFamilyProperties(); + if (!familyQueueIndeces.size()) { + // Find compute queue + std::vector allQueueFamilyProperties = + physicalDevice.getQueueFamilyProperties(); - this->mComputeQueueFamilyIndex = -1; - for (uint32_t i = 0; i < allQueueFamilyProperties.size(); i++) { - vk::QueueFamilyProperties queueFamilyProperties = - allQueueFamilyProperties[i]; + uint32_t computeQueueFamilyIndex = -1; + for (uint32_t i = 0; i < allQueueFamilyProperties.size(); i++) { + vk::QueueFamilyProperties queueFamilyProperties = + allQueueFamilyProperties[i]; - if (queueFamilyProperties.queueFlags & vk::QueueFlagBits::eCompute) { - this->mComputeQueueFamilyIndex = i; - break; + if (queueFamilyProperties.queueFlags & vk::QueueFlagBits::eCompute) { + computeQueueFamilyIndex = i; + break; + } } + + if (computeQueueFamilyIndex < 0) { + throw std::runtime_error("Compute queue is not supported"); + } + + this->mComputeQueueFamilyIndeces.push_back(computeQueueFamilyIndex); + } + else { + this->mComputeQueueFamilyIndeces = familyQueueIndeces; } - if (this->mComputeQueueFamilyIndex < 0) { - throw std::runtime_error("Compute queue is not supported"); + std::unordered_map familyQueueCounts; + std::unordered_map> familyQueuePriorities; + for (const auto& value : this->mComputeQueueFamilyIndeces) { + familyQueueCounts[value]++; + familyQueuePriorities[value].push_back(1.0f); } - const float defaultQueuePriority(0.0f); - const uint32_t defaultQueueCount(totalComputeQueues); - vk::DeviceQueueCreateInfo deviceQueueCreateInfo( - vk::DeviceQueueCreateFlags(), - this->mComputeQueueFamilyIndex, - defaultQueueCount, - &defaultQueuePriority); + std::unordered_map familyQueueIndexCount; + std::vector deviceQueueCreateInfos; + for (const auto& familyQueueInfo : familyQueueCounts) { + // Setting the device count to 0 + familyQueueIndexCount[familyQueueInfo.first] = 0; + + // Creating the respective device queue + vk::DeviceQueueCreateInfo deviceQueueCreateInfo( + vk::DeviceQueueCreateFlags(), + familyQueueInfo.first, + familyQueueInfo.second, + familyQueuePriorities[familyQueueInfo.first].data()); + deviceQueueCreateInfos.push_back(deviceQueueCreateInfo); + } vk::DeviceCreateInfo deviceCreateInfo(vk::DeviceCreateFlags(), - 1, // Number of deviceQueueCreateInfo - &deviceQueueCreateInfo); + deviceQueueCreateInfos.size(), + deviceQueueCreateInfos.data()); this->mDevice = std::make_shared(); physicalDevice.createDevice( &deviceCreateInfo, nullptr, this->mDevice.get()); SPDLOG_DEBUG("Kompute Manager device created"); - for (uint32_t i = 0; i < totalComputeQueues; i++) + for (const uint32_t & familyQueueIndex : this->mComputeQueueFamilyIndeces) { std::shared_ptr currQueue = std::make_shared(); this->mDevice->getQueue( - this->mComputeQueueFamilyIndex, i, currQueue.get()); + familyQueueIndex, familyQueueIndexCount[familyQueueIndex], currQueue.get()); + + familyQueueIndexCount[familyQueueIndex]++; this->mComputeQueues.push_back(currQueue); } diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp index 340fd782d..2581139d9 100644 --- a/src/include/kompute/Manager.hpp +++ b/src/include/kompute/Manager.hpp @@ -17,7 +17,7 @@ namespace kp { */ class Manager { - public: +public: /** Base constructor and default used which creates the base resources including choosing the device 0 by default. @@ -29,9 +29,10 @@ class Manager * they would like to create the resources on. * * @param physicalDeviceIndex The index of the physical device to use + * @param familyQueueIndeces (Optional) List of queue indeces to add for explicit allocation * @param totalQueues The total number of compute queues to create. */ - Manager(uint32_t physicalDeviceIndex, uint32_t totalComputeQueues = 1); + Manager(uint32_t physicalDeviceIndex, const std::vector & familyQueueIndeces = {}); /** * Manager constructor which allows your own vulkan application to integrate @@ -229,13 +230,14 @@ class Manager uint32_t mPhysicalDeviceIndex = -1; std::shared_ptr mDevice = nullptr; bool mFreeDevice = false; - uint32_t mComputeQueueFamilyIndex = -1; - std::vector> mComputeQueues; // -------------- ALWAYS OWNED RESOURCES std::unordered_map> mManagedSequences; + std::vector mComputeQueueFamilyIndeces; + std::vector> mComputeQueues; + #if DEBUG #ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS vk::DebugReportCallbackEXT mDebugReportCallback; @@ -245,7 +247,7 @@ class Manager // Create functions void createInstance(); - void createDevice(uint32_t totalComputeQueues); + void createDevice(const std::vector & familyQueueIndeces = {}); }; } // End namespace kp From 356a56347119ee492bc922ac2676ad47a9b82859 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Sat, 17 Oct 2020 10:42:07 +0100 Subject: [PATCH 10/19] Added test that ensures multiprocessing is working --- test/TestAsyncOperations.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) mode change 100644 => 100755 test/TestAsyncOperations.cpp diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp old mode 100644 new mode 100755 index 6da9533a9..6c7de15e8 --- a/test/TestAsyncOperations.cpp +++ b/test/TestAsyncOperations.cpp @@ -9,7 +9,7 @@ TEST(TestAsyncOperations, TestManagerAsync) { uint32_t size = 10; - uint32_t numParallel = 6; + uint32_t numParallel = 2; std::string shader(R"( #version 450 @@ -83,7 +83,7 @@ TEST(TestAsyncOperations, TestManagerAsync) EXPECT_EQ(inputsSyncB[i]->data(), resultSync); } - kp::Manager mgrAsync(0, numParallel); + kp::Manager mgrAsync(0, {0, 2}); std::vector> inputsAsyncA; std::vector> inputsAsyncB; @@ -109,6 +109,8 @@ TEST(TestAsyncOperations, TestManagerAsync) std::vector(shader.begin(), shader.end())); } + // TODO: Add function to print device details (or link) + // TODO: Seems to fail if await called twice for (uint32_t i = 0; i < numParallel; i++) { mgrAsync.evalOpAwait("async" + std::to_string(i)); } @@ -122,6 +124,6 @@ TEST(TestAsyncOperations, TestManagerAsync) EXPECT_EQ(inputsAsyncB[i]->data(), resultAsync); } - SPDLOG_ERROR("Total Sync: {}", durationSync); - SPDLOG_ERROR("Total Async: {}", durationAsync); + // The speedup should be at least 40% + EXPECT_LT(durationAsync, durationSync * 0.6); } From 40fc293eb1fb3c445dd04c6e134ca3370f4031f2 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Sat, 17 Oct 2020 11:17:23 +0100 Subject: [PATCH 11/19] Updated tests to work with async tests --- test/TestAsyncOperations.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp index 6c7de15e8..45027942f 100755 --- a/test/TestAsyncOperations.cpp +++ b/test/TestAsyncOperations.cpp @@ -109,8 +109,6 @@ TEST(TestAsyncOperations, TestManagerAsync) std::vector(shader.begin(), shader.end())); } - // TODO: Add function to print device details (or link) - // TODO: Seems to fail if await called twice for (uint32_t i = 0; i < numParallel; i++) { mgrAsync.evalOpAwait("async" + std::to_string(i)); } @@ -124,6 +122,9 @@ TEST(TestAsyncOperations, TestManagerAsync) EXPECT_EQ(inputsAsyncB[i]->data(), resultAsync); } + SPDLOG_ERROR("sync {}", durationSync); + SPDLOG_ERROR("async {}", durationAsync); + // The speedup should be at least 40% EXPECT_LT(durationAsync, durationSync * 0.6); } From 872a0bc7170e24206866d8920d406fb7e204ba80 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Sat, 17 Oct 2020 11:18:50 +0100 Subject: [PATCH 12/19] Added details on test --- test/TestAsyncOperations.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp index 45027942f..b9e84e1b5 100755 --- a/test/TestAsyncOperations.cpp +++ b/test/TestAsyncOperations.cpp @@ -7,6 +7,9 @@ TEST(TestAsyncOperations, TestManagerAsync) { + // This test is built for NVIDIA 1650. It assumes: + // * Queue family 0 and 2 have compute capabilities + // * GPU is able to process parallel shader code across different families uint32_t size = 10; uint32_t numParallel = 2; From 9a64339e953c8eb23c7a2a1d39fb7f0f5a4a4e67 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Sat, 17 Oct 2020 11:19:25 +0100 Subject: [PATCH 13/19] Reformatted --- src/Algorithm.cpp | 3 +- src/Manager.cpp | 39 ++++++++++--------- src/Sequence.cpp | 9 +++-- src/include/kompute/Core.hpp | 17 ++++++--- src/include/kompute/Manager.hpp | 58 +++++++++++++++-------------- src/include/kompute/Sequence.hpp | 10 +++-- test/TestAsyncOperations.cpp | 19 ++++++---- test/TestMultipleAlgoExecutions.cpp | 12 +++--- 8 files changed, 95 insertions(+), 72 deletions(-) mode change 100755 => 100644 src/Manager.cpp mode change 100755 => 100644 test/TestAsyncOperations.cpp diff --git a/src/Algorithm.cpp b/src/Algorithm.cpp index 0138201c7..845036475 100644 --- a/src/Algorithm.cpp +++ b/src/Algorithm.cpp @@ -266,8 +266,7 @@ Algorithm::createPipeline(std::vector specializationData) #ifdef KOMPUTE_CREATE_PIPELINE_RESULT_VALUE vk::ResultValue pipelineResult = - this->mDevice->createComputePipeline(*this->mPipelineCache, -pipelineInfo); + this->mDevice->createComputePipeline(*this->mPipelineCache, pipelineInfo); if (pipelineResult.result != vk::Result::eSuccess) { throw std::runtime_error("Failed to create pipeline result: " + diff --git a/src/Manager.cpp b/src/Manager.cpp old mode 100755 new mode 100644 index 7e5cc831a..b2739d6be --- a/src/Manager.cpp +++ b/src/Manager.cpp @@ -28,7 +28,8 @@ Manager::Manager() : Manager(0) {} -Manager::Manager(uint32_t physicalDeviceIndex, const std::vector & familyQueueIndeces) +Manager::Manager(uint32_t physicalDeviceIndex, + const std::vector& familyQueueIndeces) { this->mPhysicalDeviceIndex = physicalDeviceIndex; @@ -105,9 +106,13 @@ Manager::getOrCreateManagedSequence(std::string sequenceName) } std::weak_ptr -Manager::createManagedSequence(std::string sequenceName, uint32_t queueIndex) { +Manager::createManagedSequence(std::string sequenceName, uint32_t queueIndex) +{ - SPDLOG_DEBUG("Kompute Manager createManagedSequence with sequenceName: {} and queueIndex: {}", sequenceName, queueIndex); + SPDLOG_DEBUG("Kompute Manager createManagedSequence with sequenceName: {} " + "and queueIndex: {}", + sequenceName, + queueIndex); std::shared_ptr sq = std::make_shared(this->mPhysicalDevice, @@ -205,7 +210,7 @@ Manager::createInstance() } void -Manager::createDevice(const std::vector & familyQueueIndeces) +Manager::createDevice(const std::vector& familyQueueIndeces) { SPDLOG_DEBUG("Kompute Manager creating Device"); @@ -246,7 +251,8 @@ Manager::createDevice(const std::vector & familyQueueIndeces) vk::QueueFamilyProperties queueFamilyProperties = allQueueFamilyProperties[i]; - if (queueFamilyProperties.queueFlags & vk::QueueFlagBits::eCompute) { + if (queueFamilyProperties.queueFlags & + vk::QueueFlagBits::eCompute) { computeQueueFamilyIndex = i; break; } @@ -257,8 +263,7 @@ Manager::createDevice(const std::vector & familyQueueIndeces) } this->mComputeQueueFamilyIndeces.push_back(computeQueueFamilyIndex); - } - else { + } else { this->mComputeQueueFamilyIndeces = familyQueueIndeces; } @@ -277,28 +282,28 @@ Manager::createDevice(const std::vector & familyQueueIndeces) // Creating the respective device queue vk::DeviceQueueCreateInfo deviceQueueCreateInfo( - vk::DeviceQueueCreateFlags(), - familyQueueInfo.first, - familyQueueInfo.second, - familyQueuePriorities[familyQueueInfo.first].data()); + vk::DeviceQueueCreateFlags(), + familyQueueInfo.first, + familyQueueInfo.second, + familyQueuePriorities[familyQueueInfo.first].data()); deviceQueueCreateInfos.push_back(deviceQueueCreateInfo); } vk::DeviceCreateInfo deviceCreateInfo(vk::DeviceCreateFlags(), - deviceQueueCreateInfos.size(), - deviceQueueCreateInfos.data()); + deviceQueueCreateInfos.size(), + deviceQueueCreateInfos.data()); this->mDevice = std::make_shared(); physicalDevice.createDevice( &deviceCreateInfo, nullptr, this->mDevice.get()); SPDLOG_DEBUG("Kompute Manager device created"); - for (const uint32_t & familyQueueIndex : this->mComputeQueueFamilyIndeces) - { + for (const uint32_t& familyQueueIndex : this->mComputeQueueFamilyIndeces) { std::shared_ptr currQueue = std::make_shared(); - this->mDevice->getQueue( - familyQueueIndex, familyQueueIndexCount[familyQueueIndex], currQueue.get()); + this->mDevice->getQueue(familyQueueIndex, + familyQueueIndexCount[familyQueueIndex], + currQueue.get()); familyQueueIndexCount[familyQueueIndex]++; diff --git a/src/Sequence.cpp b/src/Sequence.cpp index 1496e67d9..a03a34afe 100644 --- a/src/Sequence.cpp +++ b/src/Sequence.cpp @@ -141,7 +141,8 @@ Sequence::evalAsync() return false; } if (this->mIsRunning) { - SPDLOG_WARN("Kompute Sequence evalAsync called when an eval async was called without successful wait"); + SPDLOG_WARN("Kompute Sequence evalAsync called when an eval async was " + "called without successful wait"); return false; } @@ -172,7 +173,8 @@ Sequence::evalAwait(uint64_t waitFor) return false; } - vk::Result result = this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor); + vk::Result result = + this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor); this->mDevice->destroy(this->mFence); if (result == vk::Result::eTimeout) { @@ -191,7 +193,8 @@ Sequence::evalAwait(uint64_t waitFor) } bool -Sequence::isRunning() { +Sequence::isRunning() +{ return this->mIsRunning; } diff --git a/src/include/kompute/Core.hpp b/src/include/kompute/Core.hpp index dfa83d7a5..72ffa5346 100644 --- a/src/include/kompute/Core.hpp +++ b/src/include/kompute/Core.hpp @@ -18,10 +18,11 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog"; #ifndef KOMPUTE_VK_API_MINOR_VERSION #define KOMPUTE_VK_API_MINOR_VERSION 1 #endif // KOMPUTE_VK_API_MINOR_VERSION -#define KOMPUTE_VK_API_VERSION VK_MAKE_VERSION(KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0) +#define KOMPUTE_VK_API_VERSION \ + VK_MAKE_VERSION( \ + KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0) #endif // KOMPUTE_VK_API_VERSION - // SPDLOG_ACTIVE_LEVEL must be defined before spdlog.h import #if !defined(SPDLOG_ACTIVE_LEVEL) #if DEBUG @@ -40,7 +41,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog"; #define SPDLOG_DEBUG(message, ...) #else #if defined(VK_USE_PLATFORM_ANDROID_KHR) -#define SPDLOG_DEBUG(message, ...) ((void)__android_log_print(ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, message)) +#define SPDLOG_DEBUG(message, ...) \ + ((void)__android_log_print(ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, message)) #else #define SPDLOG_DEBUG(message, ...) \ std::cout << "DEBUG: " << message << std::endl @@ -50,7 +52,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog"; #define SPDLOG_INFO(message, ...) #else #if defined(VK_USE_PLATFORM_ANDROID_KHR) -#define SPDLOG_INFO(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message)) +#define SPDLOG_INFO(message, ...) \ + ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message)) #else #define SPDLOG_INFO(message, ...) std::cout << "INFO: " << message << std::endl #endif // VK_USE_PLATFORM_ANDROID_KHR @@ -59,7 +62,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog"; #define SPDLOG_WARN(message, ...) #else #if defined(VK_USE_PLATFORM_ANDROID_KHR) -#define SPDLOG_WARN(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message)) +#define SPDLOG_WARN(message, ...) \ + ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message)) #else #define SPDLOG_WARN(message, ...) \ std::cout << "WARNING: " << message << std::endl @@ -69,7 +73,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog"; #define SPDLOG_ERROR(message, ...) #else #if defined(VK_USE_PLATFORM_ANDROID_KHR) -#define SPDLOG_ERROR(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message)) +#define SPDLOG_ERROR(message, ...) \ + ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message)) #else #define SPDLOG_ERROR(message, ...) \ std::cout << "ERROR: " << message << std::endl diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp index 2581139d9..650da5ca0 100644 --- a/src/include/kompute/Manager.hpp +++ b/src/include/kompute/Manager.hpp @@ -17,7 +17,7 @@ namespace kp { */ class Manager { -public: + public: /** Base constructor and default used which creates the base resources including choosing the device 0 by default. @@ -25,14 +25,16 @@ public: Manager(); /** - * Similar to base constructor but allows the user to provide the device - * they would like to create the resources on. - * - * @param physicalDeviceIndex The index of the physical device to use - * @param familyQueueIndeces (Optional) List of queue indeces to add for explicit allocation - * @param totalQueues The total number of compute queues to create. - */ - Manager(uint32_t physicalDeviceIndex, const std::vector & familyQueueIndeces = {}); + * Similar to base constructor but allows the user to provide the device + * they would like to create the resources on. + * + * @param physicalDeviceIndex The index of the physical device to use + * @param familyQueueIndeces (Optional) List of queue indeces to add for + * explicit allocation + * @param totalQueues The total number of compute queues to create. + */ + Manager(uint32_t physicalDeviceIndex, + const std::vector& familyQueueIndeces = {}); /** * Manager constructor which allows your own vulkan application to integrate @@ -67,14 +69,15 @@ public: std::string sequenceName); /** - * Create a new managed Kompute sequence so it's available within the manager. + * Create a new managed Kompute sequence so it's available within the + * manager. * * @param sequenceName The name for the named sequence to be created * @param queueIndex The queue to use from the available queues * @return Weak pointer to the manager owned sequence resource */ - std::weak_ptr createManagedSequence( - std::string sequenceName, uint32_t queueIndex = 0); + std::weak_ptr createManagedSequence(std::string sequenceName, + uint32_t queueIndex = 0); /** * Operation that adds extra operations to existing or new created @@ -121,15 +124,15 @@ public: */ template void evalOpAsync(std::vector> tensors, - std::string sequenceName, - TArgs&&... params) + std::string sequenceName, + TArgs&&... params) { SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered"); std::weak_ptr sqWeakPtr = this->getOrCreateManagedSequence(sequenceName); - std::unordered_map>::iterator found = - this->mManagedSequences.find(sequenceName); + std::unordered_map>::iterator + found = this->mManagedSequences.find(sequenceName); if (found != this->mManagedSequences.end()) { std::shared_ptr sq = found->second; @@ -145,9 +148,9 @@ public: SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL"); sq->evalAsync(); - } - else { - SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found", sequenceName); + } else { + SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found", + sequenceName); } SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS"); } @@ -163,21 +166,22 @@ public: void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX) { SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered"); - std::unordered_map>::iterator found = - this->mManagedSequences.find(sequenceName); + std::unordered_map>::iterator + found = this->mManagedSequences.find(sequenceName); if (found != this->mManagedSequences.end()) { if (std::shared_ptr sq = found->second) { - SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence Sequence EVAL AWAIT"); + SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence " + "Sequence EVAL AWAIT"); if (sq->isRunning()) { sq->evalAwait(waitFor); } } - SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS"); - } - else { + SPDLOG_DEBUG( + "Kompute Manager evalOpAwait running sequence SUCCESS"); + } else { SPDLOG_ERROR("Kompute Manager evalOpAwait Sequence not found"); - } + } } /** @@ -247,7 +251,7 @@ public: // Create functions void createInstance(); - void createDevice(const std::vector & familyQueueIndeces = {}); + void createDevice(const std::vector& familyQueueIndeces = {}); }; } // End namespace kp diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp index 2d9ef8e51..314de6657 100644 --- a/src/include/kompute/Sequence.hpp +++ b/src/include/kompute/Sequence.hpp @@ -67,14 +67,17 @@ class Sequence bool eval(); /** - * Eval Async sends all the recorded and stored operations in the vector of operations into the gpu as a submit job with a barrier. EvalAwait() must be called after to ensure the sequence is terminated correctly. + * Eval Async sends all the recorded and stored operations in the vector of + * operations into the gpu as a submit job with a barrier. EvalAwait() must + * be called after to ensure the sequence is terminated correctly. * * @return Boolean stating whether execution was successful. */ bool evalAsync(); /** - * Eval Await waits for the fence to finish processing and then once it finishes, it runs the postEval of all operations. + * Eval Await waits for the fence to finish processing and then once it + * finishes, it runs the postEval of all operations. * * @param waitFor Number of milliseconds to wait before timing out. * @return Boolean stating whether execution was successful. @@ -89,7 +92,8 @@ class Sequence bool isRecording(); /** - * Returns true if the sequence is currently running - mostly used for async workloads. + * Returns true if the sequence is currently running - mostly used for async + * workloads. * * @return Boolean stating if currently running. */ diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp old mode 100755 new mode 100644 index b9e84e1b5..28dd8c144 --- a/test/TestAsyncOperations.cpp +++ b/test/TestAsyncOperations.cpp @@ -72,13 +72,14 @@ TEST(TestAsyncOperations, TestManagerAsync) for (uint32_t i = 0; i < numParallel; i++) { mgr.evalOpDefault>( - { inputsSyncA[i], inputsSyncB[i] }, + { inputsSyncA[i], inputsSyncB[i] }, std::vector(shader.begin(), shader.end())); - } auto endSync = std::chrono::high_resolution_clock::now(); - auto durationSync = std::chrono::duration_cast(endSync - startSync).count(); + auto durationSync = + std::chrono::duration_cast(endSync - startSync) + .count(); mgr.evalOpDefault(inputsSyncB); @@ -86,7 +87,7 @@ TEST(TestAsyncOperations, TestManagerAsync) EXPECT_EQ(inputsSyncB[i]->data(), resultSync); } - kp::Manager mgrAsync(0, {0, 2}); + kp::Manager mgrAsync(0, { 0, 2 }); std::vector> inputsAsyncA; std::vector> inputsAsyncB; @@ -107,8 +108,8 @@ TEST(TestAsyncOperations, TestManagerAsync) for (uint32_t i = 0; i < numParallel; i++) { mgrAsync.evalOpAsync>( - { inputsAsyncA[i], inputsAsyncB[i] }, - "async" + std::to_string(i), + { inputsAsyncA[i], inputsAsyncB[i] }, + "async" + std::to_string(i), std::vector(shader.begin(), shader.end())); } @@ -117,7 +118,9 @@ TEST(TestAsyncOperations, TestManagerAsync) } auto endAsync = std::chrono::high_resolution_clock::now(); - auto durationAsync = std::chrono::duration_cast(endAsync - startAsync).count(); + auto durationAsync = std::chrono::duration_cast( + endAsync - startAsync) + .count(); mgrAsync.evalOpDefault({ inputsAsyncB }); @@ -128,6 +131,6 @@ TEST(TestAsyncOperations, TestManagerAsync) SPDLOG_ERROR("sync {}", durationSync); SPDLOG_ERROR("async {}", durationAsync); - // The speedup should be at least 40% + // The speedup should be at least 40% EXPECT_LT(durationAsync, durationSync * 0.6); } diff --git a/test/TestMultipleAlgoExecutions.cpp b/test/TestMultipleAlgoExecutions.cpp index fb0803690..1b59d3516 100644 --- a/test/TestMultipleAlgoExecutions.cpp +++ b/test/TestMultipleAlgoExecutions.cpp @@ -258,10 +258,10 @@ TEST(TestMultipleAlgoExecutions, ManagerEvalMultSourceStrOpCreate) )"); mgr.evalOpDefault>( - { tensorInA, tensorInB, tensorOut }, - std::vector(shader.begin(), shader.end())); + { tensorInA, tensorInB, tensorOut }, + std::vector(shader.begin(), shader.end())); - mgr.evalOpDefault({tensorOut}); + mgr.evalOpDefault({ tensorOut }); EXPECT_EQ(tensorOut->data(), std::vector({ 0.0, 4.0, 12.0 })); } @@ -295,10 +295,10 @@ TEST(TestMultipleAlgoExecutions, ManagerEvalMultSourceStrMgrCreate) )"); mgr.evalOpDefault>( - { tensorInA, tensorInB, tensorOut }, - std::vector(shader.begin(), shader.end())); + { tensorInA, tensorInB, tensorOut }, + std::vector(shader.begin(), shader.end())); - mgr.evalOpDefault({tensorOut}); + mgr.evalOpDefault({ tensorOut }); EXPECT_EQ(tensorOut->data(), std::vector({ 0.0, 4.0, 12.0 })); } From 5cfc2ab8ce8d1389815c7e5871214053b8151f05 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Sat, 17 Oct 2020 11:20:33 +0100 Subject: [PATCH 14/19] Added single include with async --- single_include/kompute/Kompute.hpp | 84 +++++++++++++++++------------- 1 file changed, 49 insertions(+), 35 deletions(-) diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index 15e03ccc4..382b8332d 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -18,7 +18,9 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog"; #ifndef KOMPUTE_VK_API_MINOR_VERSION #define KOMPUTE_VK_API_MINOR_VERSION 1 #endif // KOMPUTE_VK_API_MINOR_VERSION -#define KOMPUTE_VK_API_VERSION VK_MAKE_VERSION(KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0) +#define KOMPUTE_VK_API_VERSION \ + VK_MAKE_VERSION( \ + KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0) #endif // KOMPUTE_VK_API_VERSION // SPDLOG_ACTIVE_LEVEL must be defined before spdlog.h import @@ -39,7 +41,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog"; #define SPDLOG_DEBUG(message, ...) #else #if defined(VK_USE_PLATFORM_ANDROID_KHR) -#define SPDLOG_DEBUG(message, ...) ((void)__android_log_print(ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, message)) +#define SPDLOG_DEBUG(message, ...) \ + ((void)__android_log_print(ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, message)) #else #define SPDLOG_DEBUG(message, ...) \ std::cout << "DEBUG: " << message << std::endl @@ -49,7 +52,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog"; #define SPDLOG_INFO(message, ...) #else #if defined(VK_USE_PLATFORM_ANDROID_KHR) -#define SPDLOG_INFO(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message)) +#define SPDLOG_INFO(message, ...) \ + ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message)) #else #define SPDLOG_INFO(message, ...) std::cout << "INFO: " << message << std::endl #endif // VK_USE_PLATFORM_ANDROID_KHR @@ -58,7 +62,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog"; #define SPDLOG_WARN(message, ...) #else #if defined(VK_USE_PLATFORM_ANDROID_KHR) -#define SPDLOG_WARN(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message)) +#define SPDLOG_WARN(message, ...) \ + ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message)) #else #define SPDLOG_WARN(message, ...) \ std::cout << "WARNING: " << message << std::endl @@ -68,7 +73,8 @@ static const char* KOMPUTE_LOG_TAG = "KomputeLog"; #define SPDLOG_ERROR(message, ...) #else #if defined(VK_USE_PLATFORM_ANDROID_KHR) -#define SPDLOG_ERROR(message, ...) ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message)) +#define SPDLOG_ERROR(message, ...) \ + ((void)__android_log_print(ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, message)) #else #define SPDLOG_ERROR(message, ...) \ std::cout << "ERROR: " << message << std::endl @@ -1055,14 +1061,17 @@ class Sequence bool eval(); /** - * Eval Async sends all the recorded and stored operations in the vector of operations into the gpu as a submit job with a barrier. EvalAwait() must be called after to ensure the sequence is terminated correctly. + * Eval Async sends all the recorded and stored operations in the vector of + * operations into the gpu as a submit job with a barrier. EvalAwait() must + * be called after to ensure the sequence is terminated correctly. * * @return Boolean stating whether execution was successful. */ bool evalAsync(); /** - * Eval Await waits for the fence to finish processing and then once it finishes, it runs the postEval of all operations. + * Eval Await waits for the fence to finish processing and then once it + * finishes, it runs the postEval of all operations. * * @param waitFor Number of milliseconds to wait before timing out. * @return Boolean stating whether execution was successful. @@ -1077,7 +1086,8 @@ class Sequence bool isRecording(); /** - * Returns true if the sequence is currently running - mostly used for async workloads. + * Returns true if the sequence is currently running - mostly used for async + * workloads. * * @return Boolean stating if currently running. */ @@ -1245,7 +1255,7 @@ namespace kp { */ class Manager { -public: + public: /** Base constructor and default used which creates the base resources including choosing the device 0 by default. @@ -1253,14 +1263,16 @@ public: Manager(); /** - * Similar to base constructor but allows the user to provide the device - * they would like to create the resources on. - * - * @param physicalDeviceIndex The index of the physical device to use - * @param familyQueueIndeces (Optional) List of queue indeces to add for explicit allocation - * @param totalQueues The total number of compute queues to create. - */ - Manager(uint32_t physicalDeviceIndex, const std::vector & familyQueueIndeces = {}); + * Similar to base constructor but allows the user to provide the device + * they would like to create the resources on. + * + * @param physicalDeviceIndex The index of the physical device to use + * @param familyQueueIndeces (Optional) List of queue indeces to add for + * explicit allocation + * @param totalQueues The total number of compute queues to create. + */ + Manager(uint32_t physicalDeviceIndex, + const std::vector& familyQueueIndeces = {}); /** * Manager constructor which allows your own vulkan application to integrate @@ -1295,14 +1307,15 @@ public: std::string sequenceName); /** - * Create a new managed Kompute sequence so it's available within the manager. + * Create a new managed Kompute sequence so it's available within the + * manager. * * @param sequenceName The name for the named sequence to be created * @param queueIndex The queue to use from the available queues * @return Weak pointer to the manager owned sequence resource */ - std::weak_ptr createManagedSequence( - std::string sequenceName, uint32_t queueIndex = 0); + std::weak_ptr createManagedSequence(std::string sequenceName, + uint32_t queueIndex = 0); /** * Operation that adds extra operations to existing or new created @@ -1349,15 +1362,15 @@ public: */ template void evalOpAsync(std::vector> tensors, - std::string sequenceName, - TArgs&&... params) + std::string sequenceName, + TArgs&&... params) { SPDLOG_DEBUG("Kompute Manager evalOpAsync triggered"); std::weak_ptr sqWeakPtr = this->getOrCreateManagedSequence(sequenceName); - std::unordered_map>::iterator found = - this->mManagedSequences.find(sequenceName); + std::unordered_map>::iterator + found = this->mManagedSequences.find(sequenceName); if (found != this->mManagedSequences.end()) { std::shared_ptr sq = found->second; @@ -1373,9 +1386,9 @@ public: SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence EVAL"); sq->evalAsync(); - } - else { - SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found", sequenceName); + } else { + SPDLOG_ERROR("Kompute Manager evalOpAsync sequence [{}] not found", + sequenceName); } SPDLOG_DEBUG("Kompute Manager evalOpAsync running sequence SUCCESS"); } @@ -1391,21 +1404,22 @@ public: void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX) { SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered"); - std::unordered_map>::iterator found = - this->mManagedSequences.find(sequenceName); + std::unordered_map>::iterator + found = this->mManagedSequences.find(sequenceName); if (found != this->mManagedSequences.end()) { if (std::shared_ptr sq = found->second) { - SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence Sequence EVAL AWAIT"); + SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence " + "Sequence EVAL AWAIT"); if (sq->isRunning()) { sq->evalAwait(waitFor); } } - SPDLOG_DEBUG("Kompute Manager evalOpAwait running sequence SUCCESS"); - } - else { + SPDLOG_DEBUG( + "Kompute Manager evalOpAwait running sequence SUCCESS"); + } else { SPDLOG_ERROR("Kompute Manager evalOpAwait Sequence not found"); - } + } } /** @@ -1475,7 +1489,7 @@ public: // Create functions void createInstance(); - void createDevice(const std::vector & familyQueueIndeces = {}); + void createDevice(const std::vector& familyQueueIndeces = {}); }; } // End namespace kp From abe2c4104118fd7e6e7be31d89aeae7eaa9c2d7b Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Sat, 17 Oct 2020 11:57:20 +0100 Subject: [PATCH 15/19] Simplified test and shortened run time --- test/TestAsyncOperations.cpp | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp index 28dd8c144..ac7422785 100644 --- a/test/TestAsyncOperations.cpp +++ b/test/TestAsyncOperations.cpp @@ -19,8 +19,7 @@ TEST(TestAsyncOperations, TestManagerAsync) layout (local_size_x = 1) in; - layout(set = 0, binding = 0) buffer a { float pa[]; }; - layout(set = 0, binding = 1) buffer b { float pb[]; }; + layout(set = 0, binding = 0) buffer b { float pb[]; }; shared uint sharedTotal[1]; @@ -29,25 +28,12 @@ TEST(TestAsyncOperations, TestManagerAsync) sharedTotal[0] = 0; - barrier(); - memoryBarrierShared(); - for (int i = 0; i < 100000000; i++) { atomicAdd(sharedTotal[0], 1); - atomicAdd(sharedTotal[0], -1); - atomicAdd(sharedTotal[0], 1); - atomicAdd(sharedTotal[0], -1); - atomicAdd(sharedTotal[0], 1); - atomicAdd(sharedTotal[0], -1); - atomicAdd(sharedTotal[0], 1); } - barrier(); - memoryBarrierShared(); - pb[index] = sharedTotal[0]; - pa[index] = 0; } )"); @@ -57,22 +43,19 @@ TEST(TestAsyncOperations, TestManagerAsync) kp::Manager mgr; - std::vector> inputsSyncA; std::vector> inputsSyncB; for (uint32_t i = 0; i < numParallel; i++) { - inputsSyncA.push_back(std::make_shared(kp::Tensor(data))); inputsSyncB.push_back(std::make_shared(kp::Tensor(data))); } - mgr.evalOpDefault(inputsSyncA); mgr.evalOpDefault(inputsSyncB); auto startSync = std::chrono::high_resolution_clock::now(); for (uint32_t i = 0; i < numParallel; i++) { mgr.evalOpDefault>( - { inputsSyncA[i], inputsSyncB[i] }, + { inputsSyncB[i] }, std::vector(shader.begin(), shader.end())); } @@ -89,15 +72,12 @@ TEST(TestAsyncOperations, TestManagerAsync) kp::Manager mgrAsync(0, { 0, 2 }); - std::vector> inputsAsyncA; std::vector> inputsAsyncB; for (uint32_t i = 0; i < numParallel; i++) { - inputsAsyncA.push_back(std::make_shared(kp::Tensor(data))); inputsAsyncB.push_back(std::make_shared(kp::Tensor(data))); } - mgrAsync.evalOpDefault(inputsAsyncA); mgrAsync.evalOpDefault(inputsAsyncB); for (uint32_t i = 0; i < numParallel; i++) { @@ -108,7 +88,7 @@ TEST(TestAsyncOperations, TestManagerAsync) for (uint32_t i = 0; i < numParallel; i++) { mgrAsync.evalOpAsync>( - { inputsAsyncA[i], inputsAsyncB[i] }, + { inputsAsyncB[i] }, "async" + std::to_string(i), std::vector(shader.begin(), shader.end())); } From 10faaab81e51a755953ce6e02bb7769389601da3 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Sat, 17 Oct 2020 11:57:33 +0100 Subject: [PATCH 16/19] Added basic documentation and examples in readme --- README.md | 256 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 209 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index ff15caf44..13426bca9 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ -

Blazing fast, lightweight, mobile-enabled, and optimized for advanced GPU processing usecases.

+

Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU processing usecases.

🔋 [Documentation](https://kompute.cc) 💻 [Blog Post](https://medium.com/@AxSaucedo/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) ⌨ [Examples](#more-examples) 💾 @@ -34,6 +34,7 @@ * [Documentation](https://kompute.cc) leveraging doxygen and sphinx * BYOV: Bring-your-own-Vulkan design to play nice with existing Vulkan applications * Non-Vulkan core naming conventions to disambiguate Vulkan vs Kompute components +* Asynchronous processing capabilities with granular mult-queue workload processing * Fast development cycles with shader tooling, but robust static shader binary bundles for prod * Explicit relationships for GPU and host memory ownership and memory management * End-to-end examples for [machine learning 🤖](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a), [mobile development 📱](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617), [game development 🎮](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0). @@ -110,8 +111,10 @@ We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface ### Simple examples * [Pass shader as raw string](#your-first-kompute) -* [Create your custom Kompute Operations](#your-custom-kompute-operation) * [Record batch commands with a Kompute Sequence](#record-batch-commands) +* [Run Asynchronous Operations](#asynchronous-operations) +* [Run Parallel Operations Across Multiple GPU Queues](#parallel-operations) +* [Create your custom Kompute Operations](#your-custom-kompute-operation) ### End-to-end examples @@ -119,51 +122,6 @@ We are currently developing Vulkan Kompute not to hide the Vulkan SDK interface * [Android NDK Mobile Kompute ML Application](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617) * [Game Development Kompute ML in Godot Engine](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0) -### Your Custom Kompute Operation - -Build your own pre-compiled operations for domain specific workflows. Back to [more examples](#simple-examples) - -We also provide tools that allow you to [convert shaders into C++ headers](https://github.com/EthicalML/vulkan-kompute/blob/master/scripts/convert_shaders.py#L40). - -```c++ - -template -class OpMyCustom : public OpAlgoBase -{ - public: - OpMyCustom(std::shared_ptr physicalDevice, - std::shared_ptr device, - std::shared_ptr commandBuffer, - std::vector> tensors) - : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "") - { - // Perform your custom steps such as reading from a shader file - this->mShaderFilePath = "shaders/glsl/opmult.comp"; - } -} - - -int main() { - - kp::Manager mgr; // Automatically selects Device 0 - - // Create 3 tensors of default type float - auto tensorLhs = std::make_shared(kp::Tensor({ 0., 1., 2. })); - auto tensorRhs = std::make_shared(kp::Tensor({ 2., 4., 6. })); - auto tensorOut = std::make_shared(kp::Tensor({ 0., 0., 0. })); - - // Create tensors data explicitly in GPU with an operation - mgr.evalOpDefault({ tensorLhs, tensorRhs, tensorOut }); - - // Run Kompute operation on the parameters provided with dispatch layout - mgr.evalOpDefault>( - { tensorLhs, tensorRhs, tensorOut }); - - // Prints the output which is { 0, 4, 12 } - std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl; -} -``` - #### Record batch commands Record commands in a single submit by using a Sequence to send in batch to GPU. Back to [more examples](#simple-examples) @@ -214,6 +172,210 @@ int main() { } ``` +#### Asynchronous Operations + +You can submit operations asynchronously with the async/await commands in the kp::Manager and kp::Sequence, which provides granularity on waiting on the vk::Fence. Back to [more examples](#simple-examples) + +```c++ +int main() { + + // You can allow Kompute to create the Vulkan components, or pass your existing ones + kp::Manager mgr; // Selects device 0 unless explicitly requested + + // For synchronous steps we must already have a sequence created + mgr.createManagedSequence("async"); + + // Creates tensor an initializes GPU memory (below we show more granularity) + auto tensor = std::make_shared(kp::Tensor(std::vector(10, 0.0))); + + // Create tensors data explicitly in GPU with an operation + mgr.evalOpAsync({ tensor }, "async"); + + // Define your shader as a string (using string literals for simplicity) + // (You can also pass the raw compiled bytes, or even path to file) + std::string shader(R"( + #version 450 + + layout (local_size_x = 1) in; + + layout(set = 0, binding = 0) buffer b { float pb[]; }; + + shared uint sharedTotal[1]; + + void main() { + uint index = gl_GlobalInvocationID.x; + + sharedTotal[0] = 0; + + // Iterating to simulate longer process + for (int i = 0; i < 100000000; i++) + { + atomicAdd(sharedTotal[0], 1); + } + + pb[index] = sharedTotal[0]; + } + )"); + + // We can now await for the previous submitted command + // The second parameter can be the amount of time to wait + // The time provided is in nanoseconds + mgr.evalOpAwait("async", 10000); + + // Run Async Kompute operation on the parameters provided + mgr.evalOpAsync>( + { tensor }, + "async", + std::vector(shader.begin(), shader.end())); + + // Here we can do other work + + // When we're ready we can wait + // The default wait time is UINT64_MAX + mgr.evalOpAwait("async") + + // Sync the GPU memory back to the local tensor + // We can still run synchronous jobs in our created sequence + mgr.evalOp({ tensor }, "async"); + + // Prints the output: B: { 100000000, ... } + std::cout << fmt::format("B: {}", + tensor.data()) << std::endl; +} +``` + +#### Parallel Operations + +Besides being able to submit asynchronous operations, you can also leverage the underlying GPU compute queues to process operations in parallel. + +This will depend on your underlying graphics card, but for example in NVIDIA graphics cards the operations submitted across queues in one family are not parallelizable, but operations submitted across queueFamilies can be parallelizable. + +Below we show how you can parallelize operations in an [NVIDIA 1650](http://vulkan.gpuinfo.org/displayreport.php?id=9700#queuefamilies), which has a `GRAPHICS+COMPUTE` family on `index 0`, and `COMPUTE` family on `index 2`. + +Back to [more examples](#simple-examples) + +```c++ +int main() { + + // In this case we select device 0, and for queues, one queue from familyIndex 0 + // and one queue from familyIndex 2 + uint32_t deviceIndex(0); + std::vector familyIndeces = {0, 2}; + + // We create a manager with device index, and queues by queue family index + kp::Manager mgr(deviceIndex, familyIndeces); + + // We need to create explicit sequences with their respective queues + // The second parameter is the index in the familyIndex array which is relative + // to the vector we created the manager with. + mgr.createManagedSequence("queueOne", 0); + mgr.createManagedSequence("queueTwo", 1); + + // Creates tensor an initializes GPU memory (below we show more granularity) + auto tensorA = std::make_shared(kp::Tensor(std::vector(10, 0.0))); + auto tensorB = std::make_shared(kp::Tensor(std::vector(10, 0.0))); + + // We run the first step synchronously on the default sequence + mgr.evalOpDefault({ tensorA, tensorB }); + + // Define your shader as a string (using string literals for simplicity) + // (You can also pass the raw compiled bytes, or even path to file) + std::string shader(R"( + #version 450 + + layout (local_size_x = 1) in; + + layout(set = 0, binding = 0) buffer b { float pb[]; }; + + shared uint sharedTotal[1]; + + void main() { + uint index = gl_GlobalInvocationID.x; + + sharedTotal[0] = 0; + + // Iterating to simulate longer process + for (int i = 0; i < 100000000; i++) + { + atomicAdd(sharedTotal[0], 1); + } + + pb[index] = sharedTotal[0]; + } + )"); + + // Run the first parallel operation in the `queueOne` sequence + mgr.evalOpAsync>( + { tensorA }, + "queueOne", + std::vector(shader.begin(), shader.end())); + + // Run the second parallel operation in the `queueTwo` sequence + mgr.evalOpAsync>( + { tensorB }, + "queueTwo", + std::vector(shader.begin(), shader.end())); + + // Here we can do other work + + // We can now wait for thw two parallel tasks to finish + mgr.evalOpAwait("queueOne") + mgr.evalOpAwait("queueTwo") + + // Sync the GPU memory back to the local tensor + mgr.evalOp({ tensorA, tensorB }); + + // Prints the output: A: 100000000 B: 100000000 + std::cout << fmt::format("A: {}, B: {}", + tensorA.data()[0], tensorB.data()[0]) << std::endl; +} +``` + +### Your Custom Kompute Operation + +Build your own pre-compiled operations for domain specific workflows. Back to [more examples](#simple-examples) + +We also provide tools that allow you to [convert shaders into C++ headers](https://github.com/EthicalML/vulkan-kompute/blob/master/scripts/convert_shaders.py#L40). + +```c++ + +template +class OpMyCustom : public OpAlgoBase +{ + public: + OpMyCustom(std::shared_ptr physicalDevice, + std::shared_ptr device, + std::shared_ptr commandBuffer, + std::vector> tensors) + : OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "") + { + // Perform your custom steps such as reading from a shader file + this->mShaderFilePath = "shaders/glsl/opmult.comp"; + } +} + + +int main() { + + kp::Manager mgr; // Automatically selects Device 0 + + // Create 3 tensors of default type float + auto tensorLhs = std::make_shared(kp::Tensor({ 0., 1., 2. })); + auto tensorRhs = std::make_shared(kp::Tensor({ 2., 4., 6. })); + auto tensorOut = std::make_shared(kp::Tensor({ 0., 0., 0. })); + + // Create tensors data explicitly in GPU with an operation + mgr.evalOpDefault({ tensorLhs, tensorRhs, tensorOut }); + + // Run Kompute operation on the parameters provided with dispatch layout + mgr.evalOpDefault>( + { tensorLhs, tensorRhs, tensorOut }); + + // Prints the output which is { 0, 4, 12 } + std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl; +} +``` + ## Components & Architecture The core architecture of Kompute include the following: From 8c66b4b66bf2d5c0a4115f2684003de85d12cedb Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Sat, 17 Oct 2020 12:31:20 +0100 Subject: [PATCH 17/19] Added evalOpAwaitDefault and evalOpAsyncDefault --- single_include/kompute/Kompute.hpp | 58 +++++++++++++++++++++--------- src/include/kompute/Manager.hpp | 58 +++++++++++++++++++++--------- 2 files changed, 82 insertions(+), 34 deletions(-) diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index 382b8332d..bd7b3a125 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -1318,8 +1318,7 @@ class Manager uint32_t queueIndex = 0); /** - * Operation that adds extra operations to existing or new created - * sequences. + * Function that evaluates operation against named sequence. * * @param tensors The tensors to be used in the operation recorded * @param sequenceName The name of the sequence to be retrieved or created @@ -1352,8 +1351,23 @@ class Manager } /** - * Operation that adds extra operations to existing or new created - * sequences. + * Function that evaluates operation against default sequence. + * + * @param tensors The tensors to be used in the operation recorded + * @param TArgs Template parameters that will be used to initialise + * Operation to allow for extensible configurations on initialisation + */ + template + void evalOpDefault(std::vector> tensors, + TArgs&&... params) + { + SPDLOG_DEBUG("Kompute Manager evalOp Default triggered"); + this->evalOp( + tensors, KP_DEFAULT_SESSION, std::forward(params)...); + } + + /** + * Function that evaluates operation against named sequence asynchronously. * * @param tensors The tensors to be used in the operation recorded * @param sequenceName The name of the sequence to be retrieved or created @@ -1394,16 +1408,30 @@ class Manager } /** - * Operation that adds extra operations to existing or new created - * sequences. + * Operation that evaluates operation against default sequence asynchronously. + * + * @param tensors The tensors to be used in the operation recorded + * @param params Template parameters that will be used to initialise + * Operation to allow for extensible configurations on initialisation + */ + template + void evalOpAsyncDefault(std::vector> tensors, + TArgs&&... params) + { + SPDLOG_DEBUG("Kompute Manager evalOpAsyncDefault triggered"); + this->evalOpAsync( + tensors, KP_DEFAULT_SESSION, std::forward(params)...); + } + + /** + * Operation that awaits for named sequence to finish. * * @param sequenceName The name of the sequence to wait for termination * @param waitFor The amount of time to wait before timing out */ - template void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX) { - SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered"); + SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered with sequence {}", sequenceName); std::unordered_map>::iterator found = this->mManagedSequences.find(sequenceName); @@ -1423,20 +1451,16 @@ class Manager } /** - * Operation that adds extra operations to existing or new created - * sequences. + * Operation that awaits for default sequence to finish. * * @param tensors The tensors to be used in the operation recorded - * @param TArgs Template parameters that will be used to initialise + * @param params Template parameters that will be used to initialise * Operation to allow for extensible configurations on initialisation */ - template - void evalOpDefault(std::vector> tensors, - TArgs&&... params) + void evalOpAwaitDefault(uint64_t waitFor = UINT64_MAX) { - SPDLOG_DEBUG("Kompute Manager evalOp Default triggered"); - this->evalOp( - tensors, KP_DEFAULT_SESSION, std::forward(params)...); + SPDLOG_DEBUG("Kompute Manager evalOpAwaitDefault triggered"); + this->evalOpAwait(KP_DEFAULT_SESSION, waitFor); } /** diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp index 650da5ca0..969af9ac5 100644 --- a/src/include/kompute/Manager.hpp +++ b/src/include/kompute/Manager.hpp @@ -80,8 +80,7 @@ class Manager uint32_t queueIndex = 0); /** - * Operation that adds extra operations to existing or new created - * sequences. + * Function that evaluates operation against named sequence. * * @param tensors The tensors to be used in the operation recorded * @param sequenceName The name of the sequence to be retrieved or created @@ -114,8 +113,23 @@ class Manager } /** - * Operation that adds extra operations to existing or new created - * sequences. + * Function that evaluates operation against default sequence. + * + * @param tensors The tensors to be used in the operation recorded + * @param TArgs Template parameters that will be used to initialise + * Operation to allow for extensible configurations on initialisation + */ + template + void evalOpDefault(std::vector> tensors, + TArgs&&... params) + { + SPDLOG_DEBUG("Kompute Manager evalOp Default triggered"); + this->evalOp( + tensors, KP_DEFAULT_SESSION, std::forward(params)...); + } + + /** + * Function that evaluates operation against named sequence asynchronously. * * @param tensors The tensors to be used in the operation recorded * @param sequenceName The name of the sequence to be retrieved or created @@ -156,16 +170,30 @@ class Manager } /** - * Operation that adds extra operations to existing or new created - * sequences. + * Operation that evaluates operation against default sequence asynchronously. + * + * @param tensors The tensors to be used in the operation recorded + * @param params Template parameters that will be used to initialise + * Operation to allow for extensible configurations on initialisation + */ + template + void evalOpAsyncDefault(std::vector> tensors, + TArgs&&... params) + { + SPDLOG_DEBUG("Kompute Manager evalOpAsyncDefault triggered"); + this->evalOpAsync( + tensors, KP_DEFAULT_SESSION, std::forward(params)...); + } + + /** + * Operation that awaits for named sequence to finish. * * @param sequenceName The name of the sequence to wait for termination * @param waitFor The amount of time to wait before timing out */ - template void evalOpAwait(std::string sequenceName, uint64_t waitFor = UINT64_MAX) { - SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered"); + SPDLOG_DEBUG("Kompute Manager evalOpAwait triggered with sequence {}", sequenceName); std::unordered_map>::iterator found = this->mManagedSequences.find(sequenceName); @@ -185,20 +213,16 @@ class Manager } /** - * Operation that adds extra operations to existing or new created - * sequences. + * Operation that awaits for default sequence to finish. * * @param tensors The tensors to be used in the operation recorded - * @param TArgs Template parameters that will be used to initialise + * @param params Template parameters that will be used to initialise * Operation to allow for extensible configurations on initialisation */ - template - void evalOpDefault(std::vector> tensors, - TArgs&&... params) + void evalOpAwaitDefault(uint64_t waitFor = UINT64_MAX) { - SPDLOG_DEBUG("Kompute Manager evalOp Default triggered"); - this->evalOp( - tensors, KP_DEFAULT_SESSION, std::forward(params)...); + SPDLOG_DEBUG("Kompute Manager evalOpAwaitDefault triggered"); + this->evalOpAwait(KP_DEFAULT_SESSION, waitFor); } /** From f4275e2c31cf0fd8c368cee708b0de27ff5bd7bb Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Sat, 17 Oct 2020 12:31:28 +0100 Subject: [PATCH 18/19] Added test just for the await component --- test/TestAsyncOperations.cpp | 56 +++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/test/TestAsyncOperations.cpp b/test/TestAsyncOperations.cpp index ac7422785..5c8612fc3 100644 --- a/test/TestAsyncOperations.cpp +++ b/test/TestAsyncOperations.cpp @@ -5,7 +5,7 @@ #include "kompute/Kompute.hpp" -TEST(TestAsyncOperations, TestManagerAsync) +TEST(TestAsyncOperations, TestManagerParallelExecution) { // This test is built for NVIDIA 1650. It assumes: // * Queue family 0 and 2 have compute capabilities @@ -108,9 +108,57 @@ TEST(TestAsyncOperations, TestManagerAsync) EXPECT_EQ(inputsAsyncB[i]->data(), resultAsync); } - SPDLOG_ERROR("sync {}", durationSync); - SPDLOG_ERROR("async {}", durationAsync); - // The speedup should be at least 40% EXPECT_LT(durationAsync, durationSync * 0.6); } + +TEST(TestAsyncOperations, TestManagerAsyncExecution) +{ + uint32_t size = 10; + + std::string shader(R"( + #version 450 + + layout (local_size_x = 1) in; + + layout(set = 0, binding = 0) buffer b { float pb[]; }; + + shared uint sharedTotal[1]; + + void main() { + uint index = gl_GlobalInvocationID.x; + + sharedTotal[0] = 0; + + for (int i = 0; i < 100000000; i++) + { + atomicAdd(sharedTotal[0], 1); + } + + pb[index] = sharedTotal[0]; + } + )"); + + std::vector data(size, 0.0); + std::vector resultAsync(size, 100000000); + + kp::Manager mgrAsync(0); + + std::vector> inputsAsyncB; + + inputsAsyncB.push_back(std::make_shared(kp::Tensor(data))); + + mgrAsync.evalOpAsyncDefault(inputsAsyncB); + mgrAsync.evalOpAwaitDefault(); + + mgrAsync.evalOpAsyncDefault>( + { inputsAsyncB[0] }, + std::vector(shader.begin(), shader.end())); + + mgrAsync.evalOpAwaitDefault(); + + mgrAsync.evalOpAsyncDefault({ inputsAsyncB }); + mgrAsync.evalOpAwaitDefault(); + + EXPECT_EQ(inputsAsyncB[0]->data(), resultAsync); +} From c6695f130cd18b7fcac3e3d93fde322a9f7652e7 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Sat, 17 Oct 2020 12:37:34 +0100 Subject: [PATCH 19/19] Added basic documentation on async / parallel --- README.md | 16 ++--- docs/index.rst | 5 +- docs/overview/async-parallel.rst | 108 +++++++++++++++++++++++++++++++ docs/overview/reference.rst | 2 +- 4 files changed, 118 insertions(+), 13 deletions(-) create mode 100644 docs/overview/async-parallel.rst diff --git a/README.md b/README.md index 13426bca9..fdd721af8 100644 --- a/README.md +++ b/README.md @@ -182,14 +182,11 @@ int main() { // You can allow Kompute to create the Vulkan components, or pass your existing ones kp::Manager mgr; // Selects device 0 unless explicitly requested - // For synchronous steps we must already have a sequence created - mgr.createManagedSequence("async"); - // Creates tensor an initializes GPU memory (below we show more granularity) auto tensor = std::make_shared(kp::Tensor(std::vector(10, 0.0))); // Create tensors data explicitly in GPU with an operation - mgr.evalOpAsync({ tensor }, "async"); + mgr.evalOpAsyncDefault({ tensor }); // Define your shader as a string (using string literals for simplicity) // (You can also pass the raw compiled bytes, or even path to file) @@ -218,25 +215,24 @@ int main() { )"); // We can now await for the previous submitted command - // The second parameter can be the amount of time to wait + // The first parameter can be the amount of time to wait // The time provided is in nanoseconds - mgr.evalOpAwait("async", 10000); + mgr.evalOpAwaitDefault(10000); // Run Async Kompute operation on the parameters provided - mgr.evalOpAsync>( + mgr.evalOpAsyncDefault>( { tensor }, - "async", std::vector(shader.begin(), shader.end())); // Here we can do other work // When we're ready we can wait // The default wait time is UINT64_MAX - mgr.evalOpAwait("async") + mgr.evalOpAwaitDefault() // Sync the GPU memory back to the local tensor // We can still run synchronous jobs in our created sequence - mgr.evalOp({ tensor }, "async"); + mgr.evalOpDefault({ tensor }); // Prints the output: B: { 100000000, ... } std::cout << fmt::format("B: {}", diff --git a/docs/index.rst b/docs/index.rst index d29f1fbf6..ca8d59398 100755 --- a/docs/index.rst +++ b/docs/index.rst @@ -15,7 +15,8 @@ Index Mobile App Intergration (Android) Game Engine Integration (Godot Engine) Converting GLSL/HLSL Shaders to C++ Headers - Class Reference - Memory management principles + Asynchronous & Parallel Operations + Class Documentation and C++ Reference + Memory Management Principles Code Index diff --git a/docs/overview/async-parallel.rst b/docs/overview/async-parallel.rst new file mode 100644 index 000000000..0505424d6 --- /dev/null +++ b/docs/overview/async-parallel.rst @@ -0,0 +1,108 @@ + +Asynchronous and Parallel Operations +============= + +In GPU computing it is possible to have multiple levels of asynchronous and parallel processing of GPU tasks. + +It is important to understand the conceptual distinctions of the diffent terminology when using each of these components. + +In this section we will cover the following points: + +* Asynchronous operation submission +* Parallel processing of operations + +Asynchronous operation submission +--------------------------------- + +As the name implies, this refers to the asynchronous submission of operations. This means that operations can be submitted to the GPU, and the C++ / host CPU can continue performing tasks, until when the user desires to run `await` to wait until the operation finishes. + +This basically provides further granularity on Vulkan Fences, which is its means to enable the CPU host to know when GPU commands have finished executing. + +It is important that submitting tasks asynchronously, does not mean that these will be executed in parallel. Parallel execution of operations will be covered in the following section. + +Asynchronous operation submission can be achieved through the kp::Manager, or directly through the kp::Sequence. Below is an example using the Kompute manager. + +Async/Await Example +^^^^^^^^^^^^^^^^^^^^^ + +Asynchronous job submission is done using `evalOpAsync` and `evalOpAwait` functions. + +For simplicity the `evalOpAsyncDefault` and `evalOpAwaitDefault` functions are provided, which can be used similar to the synchronous counterparts (which basically use the default named sequence). + +A simple example of asynchronous submission can be found below. + +One important thing to bare in mind when using asynchronous submissions, is that you should make sure that any overlapping asynchronous functions are run in separate sequences. + +The reason why this is important is that the Await function not only waits for the fence, but also runs the `postEval` functions across all operations, which is required for several operations. + +.. code-block:: cpp + :linenos: + + // You can allow Kompute to create the Vulkan components, or pass your existing ones + kp::Manager mgr; // Selects device 0 unless explicitly requested + + // Creates tensor an initializes GPU memory (below we show more granularity) + auto tensor = std::make_shared(kp::Tensor(std::vector(10, 0.0))); + + // Create tensors data explicitly in GPU with an operation + mgr.evalOpAsyncDefault({ tensor }); + + // Define your shader as a string (using string literals for simplicity) + // (You can also pass the raw compiled bytes, or even path to file) + std::string shader(R"( + #version 450 + + layout (local_size_x = 1) in; + + layout(set = 0, binding = 0) buffer b { float pb[]; }; + + shared uint sharedTotal[1]; + + void main() { + uint index = gl_GlobalInvocationID.x; + + sharedTotal[0] = 0; + + // Iterating to simulate longer process + for (int i = 0; i < 100000000; i++) + { + atomicAdd(sharedTotal[0], 1); + } + + pb[index] = sharedTotal[0]; + } + )"); + + // We can now await for the previous submitted command + // The first parameter can be the amount of time to wait + // The time provided is in nanoseconds + mgr.evalOpAwaitDefault(10000); + + // Run Async Kompute operation on the parameters provided + mgr.evalOpAsyncDefault>( + { tensor }, + std::vector(shader.begin(), shader.end())); + + // Here we can do other work + + // When we're ready we can wait + // The default wait time is UINT64_MAX + mgr.evalOpAwaitDefault() + + // Sync the GPU memory back to the local tensor + // We can still run synchronous jobs in our created sequence + mgr.evalOpDefault({ tensor }); + + // Prints the output: B: { 100000000, ... } + std::cout << fmt::format("B: {}", + tensor.data()) << std::endl; + + +Parallel Operation Submission +----------- + +In order to work with parallel execution of tasks, it is important that you understand some of the core GPU processing limitations, as these can be quite broad and hardware dependent, which means they will vary across NVIDIA / AMD / ETC video cards. + +GPUs by default will optimize towards GPU + + diff --git a/docs/overview/reference.rst b/docs/overview/reference.rst index f2b8eda18..5bffdcddd 100644 --- a/docs/overview/reference.rst +++ b/docs/overview/reference.rst @@ -1,5 +1,5 @@ -Reference +Class Documentation and C++ Reference ======== This section provides a breakdown of the cpp classes and what each of their functions provide. It is partially generated and augomented from the Doxygen autodoc content. You can also go directly to the `raw doxygen docs <../doxygen/annotated.html>`_.