diff --git a/python/src/main.cpp b/python/src/main.cpp index a6a03562b..7165d41e7 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -140,7 +140,7 @@ PYBIND11_MODULE(kp, m) { py::arg("device") = 0, py::arg("family_queue_indices") = std::vector(), py::arg("desired_extensions") = std::vector()) - .def("sequence", &kp::Manager::sequence, py::arg("queueIndex") = 0, py::arg("nrOfTimestamps") = 0) + .def("sequence", &kp::Manager::sequence, py::arg("queue_index") = 0, py::arg("total_timestamps") = 0) .def("tensor", [np](kp::Manager& self, const py::array_t data, kp::Tensor::TensorTypes tensor_type) { diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index 0ae6cce5a..38213bb6e 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -820,12 +820,14 @@ class Tensor }; /** - * Default constructor with data provided which would be used to create the + * Constructor with data provided which would be used to create the * respective vulkan buffer and memory. * + * @param physicalDevice The physical device to use to fetch properties + * @param device The device to use to create the buffer and memory from * @param data Non-zero-sized vector of data that will be used by the * tensor - * @param tensorType Type for the tensor which is of type TensorTypes + * @param tensorTypes Type for the tensor which is of type TensorTypes */ Tensor(std::shared_ptr physicalDevice, std::shared_ptr device, @@ -839,10 +841,11 @@ class Tensor ~Tensor(); /** - * Initialiser which calls the initialisation for all the respective tensors - * as well as creates the respective staging tensors. The staging tensors - * would only be created for the tensors of type TensorType::eDevice as - * otherwise there is no need to copy from host memory. + * Function to trigger reinitialisation of the tensor buffer and memory with + * new data as well as new potential device type. 
+ * + * @param data Vector of data to use to initialise vector from + * @param tensorType The type to use for the tensor */ void rebuild(const std::vector& data, TensorTypes tensorType = TensorTypes::eDevice); @@ -852,6 +855,11 @@ class Tensor */ void destroy(); + /** + * Check whether tensor is initialized based on the created gpu resources. + * + * @returns Boolean stating whether tensor is initialized + */ bool isInit(); /** @@ -1210,6 +1218,8 @@ class OpBase * The record function is intended to only send a record command or run * commands that are expected to record operations that are to be submitted * as a batch into the GPU. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void record(const vk::CommandBuffer& commandBuffer) = 0; @@ -1220,6 +1230,8 @@ class OpBase * there are situations where eval can be called multiple times, so the * resources that are created should be idempotent in case it's called multiple * times in a row. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void preEval(const vk::CommandBuffer& commandBuffer) = 0; @@ -1230,6 +1242,8 @@ class OpBase * there are situations where eval can be called multiple times, so the * resources that are destroyed should not require a re-init unless explicitly * provided by the user. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void postEval(const vk::CommandBuffer& commandBuffer) = 0; }; @@ -1239,38 +1253,47 @@ class OpBase namespace kp { /** - Operation that copies the data from the first tensor to the rest of the tensors provided, using a record command for all the vectors. This operation does not own/manage the memory of the tensors passed to it. The operation must only receive tensors of type + * Operation that copies the data from the first tensor to the rest of the tensors + * provided, using a record command for all the vectors. 
This operation does not + * own/manage the memory of the tensors passed to it. The operation must only + * receive tensors of type */ class OpTensorCopy : public OpBase { public: /** - * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. + * Default constructor with parameters that provides the core vulkan resources + * and the tensors that will be used in the operation. * - * @param physicalDevice Vulkan physical device used to find device queues - * @param device Vulkan logical device for passing to Algorithm - * @param commandBuffer Vulkan Command Buffer to record commands into * @param tensors Tensors that will be used to create in operation. */ OpTensorCopy(const std::vector>& tensors); /** - * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release. + * Default destructor. This class does not manage memory so it won't be + * expecting the parent to perform a release. */ ~OpTensorCopy() override; /** - * Records the copy commands from the first tensor into all the other tensors provided. Also optionally records a barrier. + * Records the copy commands from the first tensor into all the other + * tensors provided. Also optionally records a barrier. + * + * @param commandBuffer The command buffer to record the command into. */ void record(const vk::CommandBuffer& commandBuffer) override; /** * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void preEval(const vk::CommandBuffer& commandBuffer) override; /** * Copies the local vectors for all the tensors to sync the data with the gpu. + * + * @param commandBuffer The command buffer to record the command into. 
*/ virtual void postEval(const vk::CommandBuffer& commandBuffer) override; @@ -1284,17 +1307,20 @@ class OpTensorCopy : public OpBase namespace kp { /** - Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging. + * Operation that syncs tensor's device by mapping local data into the device memory. + * For TensorTypes::eDevice it will use a record operation for the memory to be synced + * into GPU memory which means that the operation will be done in sync with GPU commands. + * For TensorTypes::eHost it will only map the data into host memory which will + * happen during preEval before the recorded commands are dispatched. */ class OpTensorSyncDevice : public OpBase { public: /** - * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensos provided cannot be of type TensorTypes::eStorage. + * Default constructor with parameters that provides the core vulkan resources + * and the tensors that will be used in the operation. The tensors provided cannot + * be of type TensorTypes::eStorage. * - * @param physicalDevice Vulkan physical device used to find device queues - * @param device Vulkan logical device for passing to Algorithm - * @param commandBuffer Vulkan Command Buffer to record commands into * @param tensors Tensors that will be used to create in operation. 
*/ OpTensorSyncDevice(const std::vector>& tensors); @@ -1305,17 +1331,24 @@ class OpTensorSyncDevice : public OpBase ~OpTensorSyncDevice() override; /** - * For device tensors, it records the copy command for the tensor to copy the data from its staging to device memory. + * For device tensors, it records the copy command for the tensor to copy the + * data from its staging to device memory. + * + * @param commandBuffer The command buffer to record the command into. */ void record(const vk::CommandBuffer& commandBuffer) override; /** * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void preEval(const vk::CommandBuffer& commandBuffer) override; /** * Does not perform any postEval commands. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void postEval(const vk::CommandBuffer& commandBuffer) override; @@ -1329,38 +1362,50 @@ class OpTensorSyncDevice : public OpBase namespace kp { /** - Operation that syncs tensor's local memory by mapping device data into the local CPU memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging. + * Operation that syncs tensor's local memory by mapping device data into the + * local CPU memory. For TensorTypes::eDevice it will use a record operation + * for the memory to be synced from GPU memory which means that the operation + * will be done in sync with GPU commands. For TensorTypes::eHost it will + * only map the data into host memory which will happen during postEval after + * the recorded commands are dispatched. 
*/ class OpTensorSyncLocal : public OpBase { public: /** - * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensors provided cannot be of type TensorTypes::eStorage. + * Default constructor with parameters that provides the core vulkan resources + * and the tensors that will be used in the operation. The tensors provided + * cannot be of type TensorTypes::eStorage. * - * @param physicalDevice Vulkan physical device used to find device queues - * @param device Vulkan logical device for passing to Algorithm - * @param commandBuffer Vulkan Command Buffer to record commands into * @param tensors Tensors that will be used to create in operation. */ OpTensorSyncLocal(const std::vector>& tensors); /** - * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release. + * Default destructor. This class does not manage memory so it won't be expecting + * the parent to perform a release. */ ~OpTensorSyncLocal() override; /** - * For device tensors, it records the copy command for the tensor to copy the data from its device to staging memory. + * For device tensors, it records the copy command for the tensor to copy the + * data from its device to staging memory. + * + * @param commandBuffer The command buffer to record the command into. */ void record(const vk::CommandBuffer& commandBuffer) override; /** * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void preEval(const vk::CommandBuffer& commandBuffer) override; /** * For host tensors it performs the map command from the host memory into local memory. + * + * @param commandBuffer The command buffer to record the command into. 
*/ virtual void postEval(const vk::CommandBuffer& commandBuffer) override; @@ -1383,6 +1428,13 @@ class OpAlgoDispatch : public OpBase { public: + /** + * Constructor that stores the algorithm to use as well as the relevant + * push constants to override when recording. + * + * @param algorithm The algorithm object to use for dispatch + * @param pushConstants The push constants to use for override + */ OpAlgoDispatch(const std::shared_ptr& algorithm, const kp::Constants& pushConstants = {}); @@ -1399,18 +1451,22 @@ class OpAlgoDispatch : public OpBase * shader processing to the gpu. This function also records the GPU memory * copy of the output data for the staging buffer so it can be read by the * host. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void record(const vk::CommandBuffer& commandBuffer) override; /** * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void preEval(const vk::CommandBuffer& commandBuffer) override; /** - * Executes after the recorded commands are submitted, and performs a copy - * of the GPU Device memory into the staging buffer so the output data can - * be retrieved. + * Does not perform any postEval commands. + * + * @param commandBuffer The command buffer to record the command into. */ virtual void postEval(const vk::CommandBuffer& commandBuffer) override; @@ -1439,11 +1495,9 @@ class OpMult : public OpAlgoDispatch * requirements for the operations to be able to create and manage their * sub-components. 
* - * @param physicalDevice Vulkan physical device used to find device queues - * @param device Vulkan logical device for passing to Algorithm - * @param commandBuffer Vulkan Command Buffer to record commands into * @param tensors Tensors that are to be used in this operation - * @param komputeWorkgroup Optional parameter to specify the layout for processing + * @param algorithm An algorithm that will be overridden with the OpMult + * shader data and the tensors provided which are expected to be 3 */ OpMult(std::vector> tensors, std::shared_ptr algorithm) : OpAlgoDispatch(algorithm) @@ -1489,13 +1543,13 @@ class Sequence : public std::enable_shared_from_this * @param device Vulkan logical device * @param computeQueue Vulkan compute queue * @param queueIndex Vulkan compute queue index in device - * @param nrOfTimestamps Maximum number of timestamps to allocate + * @param totalTimestamps Maximum number of timestamps to allocate */ Sequence(std::shared_ptr physicalDevice, std::shared_ptr device, std::shared_ptr computeQueue, uint32_t queueIndex, - uint32_t nrOfTimestamps = 0); + uint32_t totalTimestamps = 0); /** * Destructor for sequence which is responsible for cleaning all subsequent * owned operations. 
@@ -1754,7 +1808,7 @@ class Sequence : public std::enable_shared_from_this // Create functions void createCommandPool(); void createCommandBuffer(); - void createTimestampQueryPool(uint32_t); + void createTimestampQueryPool(uint32_t totalTimestamps); }; } // End namespace kp diff --git a/src/Manager.cpp b/src/Manager.cpp index 563e102bf..d6743739c 100644 --- a/src/Manager.cpp +++ b/src/Manager.cpp @@ -431,7 +431,7 @@ Manager::algorithm(const std::vector>& tensors, } std::shared_ptr -Manager::sequence(uint32_t queueIndex, uint32_t nrOfTimestamps) +Manager::sequence(uint32_t queueIndex, uint32_t totalTimestamps) { KP_LOG_DEBUG("Kompute Manager sequence() with queueIndex: {}", queueIndex); @@ -440,7 +440,7 @@ Manager::sequence(uint32_t queueIndex, uint32_t nrOfTimestamps) this->mDevice, this->mComputeQueues[queueIndex], this->mComputeQueueFamilyIndices[queueIndex], - nrOfTimestamps) }; + totalTimestamps) }; if (this->mManageResources) { this->mManagedSequences.push_back(sq); diff --git a/src/Sequence.cpp b/src/Sequence.cpp index 21cbf5af2..6e379eb92 100644 --- a/src/Sequence.cpp +++ b/src/Sequence.cpp @@ -7,7 +7,7 @@ Sequence::Sequence(std::shared_ptr physicalDevice, std::shared_ptr device, std::shared_ptr computeQueue, uint32_t queueIndex, - uint32_t nrOfTimestamps) + uint32_t totalTimestamps) { KP_LOG_DEBUG("Kompute Sequence Constructor with existing device & queue"); @@ -18,8 +18,8 @@ Sequence::Sequence(std::shared_ptr physicalDevice, this->createCommandPool(); this->createCommandBuffer(); - if(nrOfTimestamps>0) - this->createTimestampQueryPool(nrOfTimestamps+1); //+1 for the first one + if(totalTimestamps>0) + this->createTimestampQueryPool(totalTimestamps+1); //+1 for the first one } Sequence::~Sequence() @@ -246,6 +246,16 @@ Sequence::destroy() this->mOperations.clear(); } + if(this->timestampQueryPool){ + KP_LOG_INFO("Destroying QueryPool"); + this->mDevice->destroy( + *this->timestampQueryPool, + (vk::Optional)nullptr); + + this->timestampQueryPool = 
nullptr; + KP_LOG_DEBUG("Kompute Sequence Destroyed QueryPool"); + } + if (this->mDevice) { this->mDevice = nullptr; } @@ -325,11 +335,11 @@ Sequence::createCommandBuffer() } void -Sequence::createTimestampQueryPool(uint32_t query_size) +Sequence::createTimestampQueryPool(uint32_t totalTimestamps) { KP_LOG_DEBUG("Kompute Sequence creating query pool"); - if (!this->mDevice) { - throw std::runtime_error("Kompute Sequence device is null"); + if (!this->isInit()) { + throw std::runtime_error("createTimestampQueryPool() called on uninitialized Sequence"); } if (!this->mPhysicalDevice) { throw std::runtime_error("Kompute Sequence physical device is null"); @@ -339,32 +349,33 @@ Sequence::createTimestampQueryPool(uint32_t query_size) this->mPhysicalDevice->getProperties(); if(physicalDeviceProperties.limits.timestampComputeAndGraphics){ - vk::QueryPoolCreateInfo queryPoolInfo; - queryPoolInfo.setQueryCount(query_size); - queryPoolInfo.setQueryType(vk::QueryType::eTimestamp); - this->timestampQueryPool = std::make_shared(this->mDevice->createQueryPool(queryPoolInfo)); + vk::QueryPoolCreateInfo queryPoolInfo; + queryPoolInfo.setQueryCount(totalTimestamps); + queryPoolInfo.setQueryType(vk::QueryType::eTimestamp); + this->timestampQueryPool = std::make_shared(this->mDevice->createQueryPool(queryPoolInfo)); - KP_LOG_DEBUG("Query pool for timestamps created"); + KP_LOG_DEBUG("Query pool for timestamps created"); } else{ - KP_LOG_DEBUG("Device does not support timestamps"); + throw std::runtime_error("Device does not support timestamps"); } } std::vector -Sequence::getTimestamps(){ +Sequence::getTimestamps() +{ if(!this->timestampQueryPool) throw std::runtime_error("Timestamp latching not enabled"); const auto n = this->mOperations.size()+1; std::vector timestamps(n, 0); - //XXX: the C++ method this->mDevice->getQueryPoolResults does not compile for me - const VkResult result = - vkGetQueryPoolResults(*this->mDevice, *this->timestampQueryPool, - 0, n, 
timestamps.size()*sizeof(std::uint64_t), timestamps.data(), - sizeof(uint64_t), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); - if(result!=VK_SUCCESS) - throw std::runtime_error("vkGetQueryPoolResults failed"); + // Propagate readback failures instead of returning the zero-filled vector + const vk::Result result = + this->mDevice->getQueryPoolResults(*this->timestampQueryPool, + 0, n, timestamps.size()*sizeof(std::uint64_t), timestamps.data(), + sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait); + if(result!=vk::Result::eSuccess) + throw std::runtime_error("vkGetQueryPoolResults failed"); return timestamps; } diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp index 502720f72..d29f6aaf0 100644 --- a/src/include/kompute/Sequence.hpp +++ b/src/include/kompute/Sequence.hpp @@ -21,13 +21,13 @@ class Sequence : public std::enable_shared_from_this * @param device Vulkan logical device * @param computeQueue Vulkan compute queue * @param queueIndex Vulkan compute queue index in device - * @param nrOfTimestamps Maximum number of timestamps to allocate + * @param totalTimestamps Maximum number of timestamps to allocate */ Sequence(std::shared_ptr physicalDevice, std::shared_ptr device, std::shared_ptr computeQueue, uint32_t queueIndex, - uint32_t nrOfTimestamps = 0); + uint32_t totalTimestamps = 0); /** * Destructor for sequence which is responsible for cleaning all subsequent * owned operations. @@ -286,7 +286,7 @@ class Sequence : public std::enable_shared_from_this // Create functions void createCommandPool(); void createCommandBuffer(); - void createTimestampQueryPool(uint32_t); + void createTimestampQueryPool(uint32_t totalTimestamps); }; } // End namespace kp