diff --git a/Makefile b/Makefile index 872209015..9fdcbdcbe 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ VCPKG_WIN_PATH ?= "C:\\Users\\axsau\\Programming\\lib\\vcpkg\\scripts\\buildsyst VCPKG_UNIX_PATH ?= "/c/Users/axsau/Programming/lib/vcpkg/scripts/buildsystems/vcpkg.cmake" # Regext to pass to catch2 to filter tests -FILTER_TESTS ?= "-TestAsyncOperations.TestManagerParallelExecution" +FILTER_TESTS ?= "-TestAsyncOperations.TestManagerParallelExecution:TestSequence.SequenceTimestamps" ifeq ($(OS),Windows_NT) # is Windows_NT on XP, 2000, 7, Vista, 10... CMAKE_BIN ?= "C:\Program Files\CMake\bin\cmake.exe" diff --git a/python/src/main.cpp b/python/src/main.cpp index f13347aa8..7165d41e7 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -129,6 +129,7 @@ PYBIND11_MODULE(kp, m) { .def("is_recording", &kp::Sequence::isRecording) .def("is_running", &kp::Sequence::isRunning) .def("is_init", &kp::Sequence::isInit) + .def("get_timestamps", &kp::Sequence::getTimestamps) .def("clear", &kp::Sequence::clear) .def("destroy", &kp::Sequence::destroy); @@ -139,7 +140,7 @@ PYBIND11_MODULE(kp, m) { py::arg("device") = 0, py::arg("family_queue_indices") = std::vector(), py::arg("desired_extensions") = std::vector()) - .def("sequence", &kp::Manager::sequence, py::arg("queueIndex") = 0) + .def("sequence", &kp::Manager::sequence, py::arg("queue_index") = 0, py::arg("total_timestamps") = 0) .def("tensor", [np](kp::Manager& self, const py::array_t data, kp::Tensor::TensorTypes tensor_type) { diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index 607928f0c..38213bb6e 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -820,12 +820,14 @@ class Tensor }; /** - * Default constructor with data provided which would be used to create the + * Constructor with data provided which would be used to create the * respective vulkan buffer and memory. 
* + * @param physicalDevice The physical device to use to fetch properties + * @param device The device to use to create the buffer and memory from * @param data Non-zero-sized vector of data that will be used by the * tensor - * @param tensorType Type for the tensor which is of type TensorTypes + * @param tensorTypes Type for the tensor which is of type TensorTypes */ Tensor(std::shared_ptr physicalDevice, std::shared_ptr device, @@ -839,10 +841,11 @@ class Tensor ~Tensor(); /** - * Initialiser which calls the initialisation for all the respective tensors - * as well as creates the respective staging tensors. The staging tensors - * would only be created for the tensors of type TensorType::eDevice as - * otherwise there is no need to copy from host memory. + * Function to trigger reinitialisation of the tensor buffer and memory with + * new data as well as new potential device type. + * + * @param data Vector of data to use to initialise vector from + * @param tensorType The type to use for the tensor */ void rebuild(const std::vector& data, TensorTypes tensorType = TensorTypes::eDevice); @@ -852,6 +855,11 @@ class Tensor */ void destroy(); + /** + * Check whether tensor is initialized based on the created gpu resources. + * + * @returns Boolean stating whether tensor is initialized + */ bool isInit(); /** @@ -1210,6 +1218,8 @@ class OpBase * The record function is intended to only send a record command or run * commands that are expected to record operations that are to be submitted * as a batch into the GPU. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void record(const vk::CommandBuffer& commandBuffer) = 0; @@ -1220,6 +1230,8 @@ class OpBase * there are situations where eval can be called multiple times, so the * resources that are created should be idempotent in case it's called multiple * times in a row. + * + * @param commandBuffer The command buffer to record the command into. 
*/ virtual void preEval(const vk::CommandBuffer& commandBuffer) = 0; @@ -1230,6 +1242,8 @@ class OpBase * there are situations where eval can be called multiple times, so the * resources that are destroyed should not require a re-init unless explicitly * provided by the user. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void postEval(const vk::CommandBuffer& commandBuffer) = 0; }; @@ -1239,38 +1253,47 @@ class OpBase namespace kp { /** - Operation that copies the data from the first tensor to the rest of the tensors provided, using a record command for all the vectors. This operation does not own/manage the memory of the tensors passed to it. The operation must only receive tensors of type + * Operation that copies the data from the first tensor to the rest of the tensors + * provided, using a record command for all the vectors. This operation does not + * own/manage the memory of the tensors passed to it. The operation must only + * receive tensors of type */ class OpTensorCopy : public OpBase { public: /** - * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. + * Default constructor with parameters that provides the core vulkan resources + * and the tensors that will be used in the operation. * - * @param physicalDevice Vulkan physical device used to find device queues - * @param device Vulkan logical device for passing to Algorithm - * @param commandBuffer Vulkan Command Buffer to record commands into * @param tensors Tensors that will be used to create in operation. */ OpTensorCopy(const std::vector>& tensors); /** - * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release. + * Default destructor. This class does not manage memory so it won't be + * expecting the parent to perform a release. 
*/ ~OpTensorCopy() override; /** - * Records the copy commands from the first tensor into all the other tensors provided. Also optionally records a barrier. + * Records the copy commands from the first tensor into all the other + * tensors provided. Also optionally records a barrier. + * + * @param commandBuffer The command buffer to record the command into. */ void record(const vk::CommandBuffer& commandBuffer) override; /** * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void preEval(const vk::CommandBuffer& commandBuffer) override; /** * Copies the local vectors for all the tensors to sync the data with the gpu. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void postEval(const vk::CommandBuffer& commandBuffer) override; @@ -1284,17 +1307,20 @@ class OpTensorCopy : public OpBase namespace kp { /** - Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging. + * Operation that syncs tensor's device by mapping local data into the device memory. + * For TensorTypes::eDevice it will use a record operation for the memory to be syncd + * into GPU memory which means that the operation will be done in sync with GPU commands. + * For TensorTypes::eHost it will only map the data into host memory which will + * happen during preEval before the recorded commands are dispatched. 
*/ class OpTensorSyncDevice : public OpBase { public: /** - * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensos provided cannot be of type TensorTypes::eStorage. + * Default constructor with parameters that provides the core vulkan resources + * and the tensors that will be used in the operation. The tensors provided cannot + * be of type TensorTypes::eStorage. * - * @param physicalDevice Vulkan physical device used to find device queues - * @param device Vulkan logical device for passing to Algorithm - * @param commandBuffer Vulkan Command Buffer to record commands into * @param tensors Tensors that will be used to create in operation. */ OpTensorSyncDevice(const std::vector>& tensors); @@ -1305,17 +1331,24 @@ class OpTensorSyncDevice : public OpBase ~OpTensorSyncDevice() override; /** - * For device tensors, it records the copy command for the tensor to copy the data from its staging to device memory. + * For device tensors, it records the copy command for the tensor to copy the + * data from its staging to device memory. + * + * @param commandBuffer The command buffer to record the command into. */ void record(const vk::CommandBuffer& commandBuffer) override; /** * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void preEval(const vk::CommandBuffer& commandBuffer) override; /** * Does not perform any postEval commands. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void postEval(const vk::CommandBuffer& commandBuffer) override; @@ -1329,38 +1362,50 @@ class OpTensorSyncDevice : public OpBase namespace kp { /** - Operation that syncs tensor's local memory by mapping device data into the local CPU memory. 
For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging. + * Operation that syncs tensor's local memory by mapping device data into the + * local CPU memory. For TensorTypes::eDevice it will use a record operation + * for the memory to be syncd into GPU memory which means that the operation + * will be done in sync with GPU commands. For TensorTypes::eHost it will + * only map the data into host memory which will happen during preEval before + * the recorded commands are dispatched. */ class OpTensorSyncLocal : public OpBase { public: /** - * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensors provided cannot be of type TensorTypes::eStorage. + * Default constructor with parameters that provides the core vulkan resources + * and the tensors that will be used in the operation. The tensors provided + * cannot be of type TensorTypes::eStorage. * - * @param physicalDevice Vulkan physical device used to find device queues - * @param device Vulkan logical device for passing to Algorithm - * @param commandBuffer Vulkan Command Buffer to record commands into * @param tensors Tensors that will be used to create in operation. */ OpTensorSyncLocal(const std::vector>& tensors); /** - * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release. + * Default destructor. This class does not manage memory so it won't be expecting + * the parent to perform a release. */ ~OpTensorSyncLocal() override; /** - * For device tensors, it records the copy command for the tensor to copy the data from its device to staging memory. 
+ * For device tensors, it records the copy command for the tensor to copy the + * data from its device to staging memory. + * + * @param commandBuffer The command buffer to record the command into. */ void record(const vk::CommandBuffer& commandBuffer) override; /** * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void preEval(const vk::CommandBuffer& commandBuffer) override; /** * For host tensors it performs the map command from the host memory into local memory. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void postEval(const vk::CommandBuffer& commandBuffer) override; @@ -1383,6 +1428,13 @@ class OpAlgoDispatch : public OpBase { public: + /** + * Constructor that stores the algorithm to use as well as the relevant + * push constants to override when recording. + * + * @param algorithm The algorithm object to use for dispatch + * @param pushConstants The push constants to use for override + */ OpAlgoDispatch(const std::shared_ptr& algorithm, const kp::Constants& pushConstants = {}); @@ -1399,18 +1451,22 @@ class OpAlgoDispatch : public OpBase * shader processing to the gpu. This function also records the GPU memory * copy of the output data for the staging buffer so it can be read by the * host. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void record(const vk::CommandBuffer& commandBuffer) override; /** * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void preEval(const vk::CommandBuffer& commandBuffer) override; /** - * Executes after the recorded commands are submitted, and performs a copy - * of the GPU Device memory into the staging buffer so the output data can - * be retrieved. + * Does not perform any postEval commands. + * + * @param commandBuffer The command buffer to record the command into. 
*/ virtual void postEval(const vk::CommandBuffer& commandBuffer) override; @@ -1439,11 +1495,9 @@ class OpMult : public OpAlgoDispatch * requirements for the operations to be able to create and manage their * sub-components. * - * @param physicalDevice Vulkan physical device used to find device queues - * @param device Vulkan logical device for passing to Algorithm - * @param commandBuffer Vulkan Command Buffer to record commands into * @param tensors Tensors that are to be used in this operation - * @param komputeWorkgroup Optional parameter to specify the layout for processing + * @param algorithm An algorithm that will be overridden with the OpMult + * shader data and the tensors provided which are expected to be 3 */ OpMult(std::vector> tensors, std::shared_ptr algorithm) : OpAlgoDispatch(algorithm) @@ -1489,11 +1543,13 @@ class Sequence : public std::enable_shared_from_this * @param device Vulkan logical device * @param computeQueue Vulkan compute queue * @param queueIndex Vulkan compute queue index in device + * @param totalTimestamps Maximum number of timestamps to allocate */ Sequence(std::shared_ptr physicalDevice, std::shared_ptr device, std::shared_ptr computeQueue, - uint32_t queueIndex); + uint32_t queueIndex, + uint32_t totalTimestamps = 0); /** * Destructor for sequence which is responsible for cleaning all subsequent * owned operations. @@ -1669,6 +1725,12 @@ class Sequence : public std::enable_shared_from_this */ void clear(); + /** + * Return the timestamps that were latched at the beginning and + * after each operation during the last eval() call. + */ + std::vector getTimestamps(); + /** * Begins recording commands for commands to be submitted into the command * buffer. 
@@ -1737,6 +1799,7 @@ class Sequence : public std::enable_shared_from_this // -------------- ALWAYS OWNED RESOURCES vk::Fence mFence; std::vector> mOperations; + std::shared_ptr timestampQueryPool = nullptr; // State bool mRecording = false; @@ -1745,6 +1808,7 @@ class Sequence : public std::enable_shared_from_this // Create functions void createCommandPool(); void createCommandBuffer(); + void createTimestampQueryPool(uint32_t totalTimestamps); }; } // End namespace kp @@ -1805,9 +1869,11 @@ class Manager * if it hasn't been destroyed by its reference count going to zero. * * @param queueIndex The queue to use from the available queues + * @param nrOfTimestamps The maximum number of timestamps to allocate. + * If zero (default), disables latching of timestamps. * @returns Shared pointer with initialised sequence */ - std::shared_ptr sequence(uint32_t queueIndex = 0); + std::shared_ptr sequence(uint32_t queueIndex = 0, uint32_t nrOfTimestamps = 0); /** * Create a managed tensor that will be destroyed by this manager diff --git a/src/Manager.cpp b/src/Manager.cpp index 83676f9ec..e3bdbb2d9 100644 --- a/src/Manager.cpp +++ b/src/Manager.cpp @@ -431,7 +431,7 @@ Manager::algorithm(const std::vector>& tensors, } std::shared_ptr -Manager::sequence(uint32_t queueIndex) +Manager::sequence(uint32_t queueIndex, uint32_t totalTimestamps) { KP_LOG_DEBUG("Kompute Manager sequence() with queueIndex: {}", queueIndex); @@ -439,7 +439,8 @@ Manager::sequence(uint32_t queueIndex) this->mPhysicalDevice, this->mDevice, this->mComputeQueues[queueIndex], - this->mComputeQueueFamilyIndices[queueIndex]) }; + this->mComputeQueueFamilyIndices[queueIndex], + totalTimestamps) }; if (this->mManageResources) { this->mManagedSequences.push_back(sq); diff --git a/src/Sequence.cpp b/src/Sequence.cpp index fa715cefc..6e379eb92 100644 --- a/src/Sequence.cpp +++ b/src/Sequence.cpp @@ -6,7 +6,8 @@ namespace kp { Sequence::Sequence(std::shared_ptr physicalDevice, std::shared_ptr device, std::shared_ptr 
computeQueue, - uint32_t queueIndex) + uint32_t queueIndex, + uint32_t totalTimestamps) { KP_LOG_DEBUG("Kompute Sequence Constructor with existing device & queue"); @@ -17,6 +18,8 @@ Sequence::Sequence(std::shared_ptr physicalDevice, this->createCommandPool(); this->createCommandBuffer(); + if(totalTimestamps>0) + this->createTimestampQueryPool(totalTimestamps+1); //+1 for the first one } Sequence::~Sequence() @@ -44,6 +47,13 @@ Sequence::begin() KP_LOG_INFO("Kompute Sequence command now started recording"); this->mCommandBuffer->begin(vk::CommandBufferBeginInfo()); this->mRecording = true; + + //latch the first timestamp before any commands are submitted + if(this->timestampQueryPool) + this->mCommandBuffer->writeTimestamp( + vk::PipelineStageFlagBits::eAllCommands, + *this->timestampQueryPool, 0 + ); } void @@ -236,6 +246,16 @@ Sequence::destroy() this->mOperations.clear(); } + if(this->timestampQueryPool){ + KP_LOG_INFO("Destroying QueryPool"); + this->mDevice->destroy( + *this->timestampQueryPool, + (vk::Optional)nullptr); + + this->timestampQueryPool = nullptr; + KP_LOG_DEBUG("Kompute Sequence Destroyed QueryPool"); + } + if (this->mDevice) { this->mDevice = nullptr; } @@ -261,6 +281,12 @@ Sequence::record(std::shared_ptr op) this->mOperations.push_back(op); + if(this->timestampQueryPool) + this->mCommandBuffer->writeTimestamp( + vk::PipelineStageFlagBits::eAllCommands, + *this->timestampQueryPool, this->mOperations.size() + ); + return shared_from_this(); } @@ -308,4 +334,46 @@ Sequence::createCommandBuffer() KP_LOG_DEBUG("Kompute Sequence Command Buffer Created"); } +void +Sequence::createTimestampQueryPool(uint32_t totalTimestamps) +{ + KP_LOG_DEBUG("Kompute Sequence creating query pool"); + if (!this->isInit()) { + throw std::runtime_error("createTimestampQueryPool() called on uninitialized Sequence"); + } + if (!this->mPhysicalDevice) { + throw std::runtime_error("Kompute Sequence physical device is null"); + } + + vk::PhysicalDeviceProperties 
physicalDeviceProperties = + this->mPhysicalDevice->getProperties(); + + if(physicalDeviceProperties.limits.timestampComputeAndGraphics){ + vk::QueryPoolCreateInfo queryPoolInfo; + queryPoolInfo.setQueryCount(totalTimestamps); + queryPoolInfo.setQueryType(vk::QueryType::eTimestamp); + this->timestampQueryPool = std::make_shared(this->mDevice->createQueryPool(queryPoolInfo)); + + KP_LOG_DEBUG("Query pool for timestamps created"); + } + else{ + throw std::runtime_error("Device does not support timestamps"); + } +} + +std::vector +Sequence::getTimestamps() +{ + if(!this->timestampQueryPool) + throw std::runtime_error("Timestamp latching not enabled"); + + const auto n = this->mOperations.size()+1; + std::vector timestamps(n, 0); + this->mDevice->getQueryPoolResults(*this->timestampQueryPool, + 0, n, timestamps.size()*sizeof(std::uint64_t), timestamps.data(), + sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait); + + return timestamps; +} + } diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp index 957e45d2e..d9c6ddf3e 100644 --- a/src/include/kompute/Manager.hpp +++ b/src/include/kompute/Manager.hpp @@ -60,9 +60,11 @@ class Manager * if it hasn't been destroyed by its reference count going to zero. * * @param queueIndex The queue to use from the available queues + * @param totalTimestamps The maximum number of timestamps to allocate. + * If zero (default), disables latching of timestamps. 
* @returns Shared pointer with initialised sequence */ - std::shared_ptr sequence(uint32_t queueIndex = 0); + std::shared_ptr sequence(uint32_t queueIndex = 0, uint32_t totalTimestamps = 0); /** * Create a managed tensor that will be destroyed by this manager diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp index 10aa80148..d29f6aaf0 100644 --- a/src/include/kompute/Sequence.hpp +++ b/src/include/kompute/Sequence.hpp @@ -3,6 +3,7 @@ #include "kompute/Core.hpp" #include "kompute/operations/OpBase.hpp" +#include "kompute/operations/OpAlgoDispatch.hpp" namespace kp { @@ -20,11 +21,13 @@ class Sequence : public std::enable_shared_from_this * @param device Vulkan logical device * @param computeQueue Vulkan compute queue * @param queueIndex Vulkan compute queue index in device + * @param totalTimestamps Maximum number of timestamps to allocate */ Sequence(std::shared_ptr physicalDevice, std::shared_ptr device, std::shared_ptr computeQueue, - uint32_t queueIndex); + uint32_t queueIndex, + uint32_t totalTimestamps = 0); /** * Destructor for sequence which is responsible for cleaning all subsequent * owned operations. @@ -200,6 +203,12 @@ class Sequence : public std::enable_shared_from_this */ void clear(); + /** + * Return the timestamps that were latched at the beginning and + * after each operation during the last eval() call. + */ + std::vector getTimestamps(); + /** * Begins recording commands for commands to be submitted into the command * buffer. 
@@ -268,6 +277,7 @@ class Sequence : public std::enable_shared_from_this // -------------- ALWAYS OWNED RESOURCES vk::Fence mFence; std::vector> mOperations; + std::shared_ptr timestampQueryPool = nullptr; // State bool mRecording = false; @@ -276,6 +286,7 @@ class Sequence : public std::enable_shared_from_this // Create functions void createCommandPool(); void createCommandBuffer(); + void createTimestampQueryPool(uint32_t totalTimestamps); }; } // End namespace kp diff --git a/test/TestSequence.cpp b/test/TestSequence.cpp index 482868a88..b8afd1ad6 100644 --- a/test/TestSequence.cpp +++ b/test/TestSequence.cpp @@ -100,3 +100,33 @@ TEST(TestSequence, RerecordSequence) EXPECT_EQ(tensorB->data(), std::vector({2, 8, 18})); } + + +TEST(TestSequence, SequenceTimestamps) +{ + kp::Manager mgr; + + std::shared_ptr tensorA = mgr.tensor({ 0, 0, 0 }); + + std::string shader(R"( + #version 450 + layout (local_size_x = 1) in; + layout(set = 0, binding = 0) buffer a { float pa[]; }; + void main() { + uint index = gl_GlobalInvocationID.x; + pa[index] = pa[index] + 1; + })"); + + std::vector spirv = kp::Shader::compile_source(shader); + + auto seq = mgr.sequence(0, 100); //100 timestamps + seq->record({ tensorA }) + ->record(mgr.algorithm({ tensorA }, spirv)) + ->record(mgr.algorithm({ tensorA }, spirv)) + ->record(mgr.algorithm({ tensorA }, spirv)) + ->record({ tensorA }) + ->eval(); + const std::vector timestamps = seq->getTimestamps(); + + EXPECT_EQ(timestamps.size(), 6); //1 timestamp at start + 1 after each operation +}