diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index b202ab580..67efbe708 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -1793,3 +1793,523 @@ class OpMult : public OpAlgoDispatch }; } // End namespace kp + +// SPDX-License-Identifier: Apache-2.0 + +namespace kp { + +/** + * Container of operations that can be sent to GPU as batch + */ +class Sequence : public std::enable_shared_from_this +{ + public: + /** + * Main constructor for sequence which requires core vulkan components to + * generate all dependent resources. + * + * @param physicalDevice Vulkan physical device + * @param device Vulkan logical device + * @param computeQueue Vulkan compute queue + * @param queueIndex Vulkan compute queue index in device + * @param totalTimestamps Maximum number of timestamps to allocate + */ + Sequence(std::shared_ptr physicalDevice, + std::shared_ptr device, + std::shared_ptr computeQueue, + uint32_t queueIndex, + uint32_t totalTimestamps = 0); + /** + * Destructor for sequence which is responsible for cleaning all subsequent + * owned operations. + */ + ~Sequence(); + + /** + * Record function for operation to be added to the GPU queue in batch. This + * template requires classes to be derived from the OpBase class. This + * function also requires the Sequence to be recording, otherwise it will + * not be able to add the operation. + * + * @param op Object derived from kp::OpBase that will be recorded by the + * sequence which will be used when the operation is evaluated. + * @return shared_ptr of the Sequence class itself + */ + std::shared_ptr record(std::shared_ptr op); + + /** + * Record function for operation to be added to the GPU queue in batch. This + * template requires classes to be derived from the OpBase class. This + * function also requires the Sequence to be recording, otherwise it will + * not be able to add the operation. 
+ * + * @param tensors Vector of tensors to use for the operation + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ + template + std::shared_ptr record( + std::vector> tensors, + TArgs&&... params) + { + std::shared_ptr op{ new T(tensors, std::forward(params)...) }; + return this->record(op); + } + /** + * Record function for operation to be added to the GPU queue in batch. This + * template requires classes to be derived from the OpBase class. This + * function also requires the Sequence to be recording, otherwise it will + * not be able to add the operation. + * + * @param algorithm Algorithm to use for the record often used for OpAlgo + * operations + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ + template + std::shared_ptr record(std::shared_ptr algorithm, + TArgs&&... params) + { + std::shared_ptr op{ new T(algorithm, + std::forward(params)...) }; + return this->record(op); + } + + /** + * Eval sends all the recorded and stored operations in the vector of + * operations into the gpu as a submit job synchronously (with a barrier). + * + * @return shared_ptr of the Sequence class itself + */ + std::shared_ptr eval(); + + /** + * Resets all the recorded and stored operations, records the operation + * provided and submits into the gpu as a submit job synchronously (with a + * barrier). + * + * @return shared_ptr of the Sequence class itself + */ + std::shared_ptr eval(std::shared_ptr op); + + /** + * Eval sends all the recorded and stored operations in the vector of + * operations into the gpu as a submit job with a barrier. 
+ * + * @param tensors Vector of tensors to use for the operation + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ + template + std::shared_ptr eval(std::vector> tensors, + TArgs&&... params) + { + std::shared_ptr op{ new T(tensors, std::forward(params)...) }; + return this->eval(op); + } + /** + * Eval sends all the recorded and stored operations in the vector of + * operations into the gpu as a submit job with a barrier. + * + * @param algorithm Algorithm to use for the record often used for OpAlgo + * operations + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ + template + std::shared_ptr eval(std::shared_ptr algorithm, + TArgs&&... params) + { + std::shared_ptr op{ new T(algorithm, + std::forward(params)...) }; + return this->eval(op); + } + + /** + * Eval Async sends all the recorded and stored operations in the vector of + * operations into the gpu as a submit job without a barrier. EvalAwait() + * must ALWAYS be called after to ensure the sequence is terminated + * correctly. + * + * @return shared_ptr of the Sequence class itself + */ + std::shared_ptr evalAsync(); + /** + * Clears current operations to record the provided one and submit it into + * the gpu as a submit job without a barrier. EvalAwait() + * must ALWAYS be called after to ensure the sequence is terminated + * correctly. + * + * @return shared_ptr of the Sequence class itself + */ + std::shared_ptr evalAsync(std::shared_ptr op); + /** + * Eval Async constructs an operation of type T from the given tensors and + * forwards it to evalAsync(op) (submission without a barrier). 
+ * + * @param tensors Vector of tensors to use for the operation + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ + template + std::shared_ptr evalAsync( + std::vector> tensors, + TArgs&&... params) + { + std::shared_ptr op{ new T(tensors, std::forward(params)...) }; + return this->evalAsync(op); + } + /** + * Eval Async constructs an operation of type T from the given algorithm and + * forwards it to evalAsync(op) (submission without a barrier). + * + * @param algorithm Algorithm to use for the record often used for OpAlgo + * operations + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ + template + std::shared_ptr evalAsync(std::shared_ptr algorithm, + TArgs&&... params) + { + std::shared_ptr op{ new T(algorithm, + std::forward(params)...) }; + return this->evalAsync(op); + } + + /** + * Eval Await waits for the fence to finish processing and then once it + * finishes, it runs the postEval of all operations. + * + * @param waitFor Number of milliseconds to wait before timing out (NOTE(review): confirm the unit against the implementation; Vulkan fence waits take nanoseconds). + * @return shared_ptr of the Sequence class itself + */ + std::shared_ptr evalAwait(uint64_t waitFor = UINT64_MAX); + + /** + * Clear function clears all operations currently recorded and starts + * recording again. + */ + void clear(); + + /** + * Return the timestamps that were latched at the beginning and + * after each operation during the last eval() call. + */ + std::vector getTimestamps(); + + /** + * Begins recording commands to be submitted into the command + * buffer. + * + * This function does not return a value. 
+ */ + void begin(); + + /** + * Ends the recording and stops recording commands when the record command + * is sent. 
+ * + * This function does not return a value. + */ + void end(); + + /** + * Returns true if the sequence is currently recording. + * + * @return Boolean stating if recording ongoing. + */ + bool isRecording(); + + /** + * Returns true if the sequence has been initialised, and it's based on the + * GPU resources being referenced. + * + * @return Boolean stating if is initialized + */ + bool isInit(); + + /** + * Clears command buffer and triggers re-record of all the current + * operations saved, which is useful if the underlying kp::Tensors or + * kp::Algorithms are modified and need to be re-recorded. + */ + void rerecord(); + + /** + * Returns true if the sequence is currently running - mostly used for async + * workloads. + * + * @return Boolean stating if currently running. + */ + bool isRunning(); + + /** + * Destroys and frees the GPU resources which include the buffer and memory + * and sets the sequence as init=False. + */ + void destroy(); + + private: + // -------------- NEVER OWNED RESOURCES + std::shared_ptr mPhysicalDevice = nullptr; + std::shared_ptr mDevice = nullptr; + std::shared_ptr mComputeQueue = nullptr; + uint32_t mQueueIndex = -1; + + // -------------- OPTIONALLY OWNED RESOURCES + std::shared_ptr mCommandPool = nullptr; + bool mFreeCommandPool = false; + std::shared_ptr mCommandBuffer = nullptr; + bool mFreeCommandBuffer = false; + + // -------------- ALWAYS OWNED RESOURCES + vk::Fence mFence; + std::vector> mOperations; + std::shared_ptr timestampQueryPool = nullptr; + + // State + bool mRecording = false; + bool mIsRunning = false; + + // Create functions + void createCommandPool(); + void createCommandBuffer(); + void createTimestampQueryPool(uint32_t totalTimestamps); +}; + +} // End namespace kp + +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#define KP_DEFAULT_SESSION "DEFAULT" + +namespace kp { + +/** + Base orchestrator which creates and manages device and child components +*/ +class 
Manager +{ + public: + /** + Base constructor and default used which creates the base resources + including choosing the device 0 by default. + */ + Manager(); + + /** + * Similar to base constructor but allows for further configuration to use + * when creating the Vulkan resources. + * + * @param physicalDeviceIndex The index of the physical device to use + * @param familyQueueIndices (Optional) List of queue indices to add for + * explicit allocation + * @param desiredExtensions The desired extensions to load from + * physicalDevice + */ + Manager(uint32_t physicalDeviceIndex, + const std::vector& familyQueueIndices = {}, + const std::vector& desiredExtensions = {}); + + /** + * Manager constructor which allows your own vulkan application to integrate + * with the kompute use. + * + * @param instance Vulkan compute instance to base this application + * @param physicalDevice Vulkan physical device to use for application + * @param device Vulkan logical device to use for all base + * resources + */ + Manager(std::shared_ptr instance, + std::shared_ptr physicalDevice, + std::shared_ptr device); + + /** + * Manager destructor which would ensure all owned resources are destroyed + * unless explicitly stated that resources should not be destroyed or freed. + */ + ~Manager(); + + /** + * Create a managed sequence that will be destroyed by this manager + * if it hasn't been destroyed by its reference count going to zero. + * + * @param queueIndex The queue to use from the available queues + * @param totalTimestamps The maximum number of timestamps to allocate. + * If zero (default), disables latching of timestamps. + * @returns Shared pointer with initialised sequence + */ + std::shared_ptr sequence(uint32_t queueIndex = 0, + uint32_t totalTimestamps = 0); + + /** + * Create a managed tensor that will be destroyed by this manager + * if it hasn't been destroyed by its reference count going to zero. 
+ * + * @param data The data to initialize the tensor with + * @param tensorType The type of tensor to initialize + * @returns Shared pointer with initialised tensor + */ + template + std::shared_ptr> tensorT( + const std::vector& data, + Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice) + { + KP_LOG_DEBUG("Kompute Manager tensor creation triggered"); + + std::shared_ptr> tensor{ new kp::TensorT( + this->mPhysicalDevice, this->mDevice, data, tensorType) }; + + if (this->mManageResources) { + this->mManagedTensors.push_back(tensor); + } + + return tensor; + } + + std::shared_ptr> tensor( + const std::vector& data, + Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice) + { + return this->tensorT(data, tensorType); + } + + std::shared_ptr tensor( + void* data, + uint32_t elementTotalCount, + uint32_t elementMemorySize, + const Tensor::TensorDataTypes& dataType, + Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice) + { + std::shared_ptr tensor{ new kp::Tensor(this->mPhysicalDevice, + this->mDevice, + data, + elementTotalCount, + elementMemorySize, + dataType, + tensorType) }; + + if (this->mManageResources) { + this->mManagedTensors.push_back(tensor); + } + + return tensor; + } + + std::shared_ptr algorithm( + const std::vector>& tensors = {}, + const std::vector& spirv = {}, + const Workgroup& workgroup = {}, + const std::vector& specializationConstants = {}, + const std::vector& pushConstants = {}) + { + return this->algorithm<>(tensors, spirv, workgroup, specializationConstants, pushConstants); + } + + /** + * Create a managed algorithm that will be destroyed by this manager + * if it hasn't been destroyed by its reference count going to zero. 
+ * + * @param tensors (optional) The tensors to initialise the algorithm with + * @param spirv (optional) The SPIRV bytes for the algorithm to dispatch + * @param workgroup (optional) kp::Workgroup for algorithm to use, and + * defaults to (tensor[0].size(), 1, 1) + * @param specializationConstants (optional) kp::Constant to use for + * specialization constants, and defaults to an empty constant + * @param pushConstants (optional) kp::Constant to use for push constants, + * and defaults to an empty constant + * @returns Shared pointer with initialised algorithm + */ + template + std::shared_ptr algorithm( + const std::vector>& tensors, + const std::vector& spirv, + const Workgroup& workgroup, + const std::vector& specializationConstants, + const std::vector

& pushConstants) + { + + KP_LOG_DEBUG("Kompute Manager algorithm creation triggered"); + + std::shared_ptr algorithm{ new kp::Algorithm( + this->mDevice, + tensors, + spirv, + workgroup, + specializationConstants, + pushConstants) }; + + if (this->mManageResources) { + this->mManagedAlgorithms.push_back(algorithm); + } + + return algorithm; + } + + /** + * Destroy the GPU resources and all managed resources by manager. + **/ + void destroy(); + /** + * Run a pseudo-garbage collection to release all the managed resources + * that have been already freed due to these reaching to zero ref count. + **/ + void clear(); + + /** + * Information about the current device. + * + * @return vk::PhysicalDeviceProperties containing information about the device + **/ + vk::PhysicalDeviceProperties getDeviceProperties() const; + + /** + * List the devices available in the current vulkan instance. + * + * @return vector of physical devices containing their respective properties + **/ + std::vector listDevices() const; + + private: + // -------------- OPTIONALLY OWNED RESOURCES + std::shared_ptr mInstance = nullptr; + bool mFreeInstance = false; + std::shared_ptr mPhysicalDevice = nullptr; + std::shared_ptr mDevice = nullptr; + bool mFreeDevice = false; + + // -------------- ALWAYS OWNED RESOURCES + std::vector> mManagedTensors; + std::vector> mManagedSequences; + std::vector> mManagedAlgorithms; + + std::vector mComputeQueueFamilyIndices; + std::vector> mComputeQueues; + + bool mManageResources = false; + +#if DEBUG +#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS + vk::DebugReportCallbackEXT mDebugReportCallback; + vk::DispatchLoaderDynamic mDebugDispatcher; +#endif +#endif + + // Create functions (NOTE(review): 'hysicalDeviceIndex' below looks like a typo for physicalDeviceIndex) + void createInstance(); + void createDevice(const std::vector& familyQueueIndices = {}, + uint32_t hysicalDeviceIndex = 0, + const std::vector& desiredExtensions = {}); +}; + +} // End namespace kp