Updating python
Signed-off-by: Alejandro Saucedo <axsauze@gmail.com>
This commit is contained in:
parent
1972f2c8f8
commit
3d320ff687
2 changed files with 1 additions and 520 deletions
|
|
@ -1793,523 +1793,3 @@ class OpMult : public OpAlgoDispatch
|
|||
};
|
||||
|
||||
} // End namespace kp
|
||||
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
namespace kp {
|
||||
|
||||
/**
|
||||
* Container of operations that can be sent to GPU as batch
|
||||
*/
|
||||
class Sequence : public std::enable_shared_from_this<Sequence>
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Main constructor for sequence which requires core vulkan components to
|
||||
* generate all dependent resources.
|
||||
*
|
||||
* @param physicalDevice Vulkan physical device
|
||||
* @param device Vulkan logical device
|
||||
* @param computeQueue Vulkan compute queue
|
||||
* @param queueIndex Vulkan compute queue index in device
|
||||
* @param totalTimestamps Maximum number of timestamps to allocate
|
||||
*/
|
||||
Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::Queue> computeQueue,
|
||||
uint32_t queueIndex,
|
||||
uint32_t totalTimestamps = 0);
|
||||
/**
|
||||
* Destructor for sequence which is responsible for cleaning all subsequent
|
||||
* owned operations.
|
||||
*/
|
||||
~Sequence();
|
||||
|
||||
/**
|
||||
* Record function for operation to be added to the GPU queue in batch. This
|
||||
* template requires classes to be derived from the OpBase class. This
|
||||
* function also requires the Sequence to be recording, otherwise it will
|
||||
* not be able to add the operation.
|
||||
*
|
||||
* @param op Object derived from kp::BaseOp that will be recoreded by the
|
||||
* sequence which will be used when the operation is evaluated.
|
||||
* @return shared_ptr<Sequence> of the Sequence class itself
|
||||
*/
|
||||
std::shared_ptr<Sequence> record(std::shared_ptr<OpBase> op);
|
||||
|
||||
/**
|
||||
* Record function for operation to be added to the GPU queue in batch. This
|
||||
* template requires classes to be derived from the OpBase class. This
|
||||
* function also requires the Sequence to be recording, otherwise it will
|
||||
* not be able to add the operation.
|
||||
*
|
||||
* @param tensors Vector of tensors to use for the operation
|
||||
* @param TArgs Template parameters that are used to initialise operation
|
||||
* which allows for extensible configurations on initialisation.
|
||||
* @return shared_ptr<Sequence> of the Sequence class itself
|
||||
*/
|
||||
template<typename T, typename... TArgs>
|
||||
std::shared_ptr<Sequence> record(
|
||||
std::vector<std::shared_ptr<Tensor>> tensors,
|
||||
TArgs&&... params)
|
||||
{
|
||||
std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
|
||||
return this->record(op);
|
||||
}
|
||||
/**
|
||||
* Record function for operation to be added to the GPU queue in batch. This
|
||||
* template requires classes to be derived from the OpBase class. This
|
||||
* function also requires the Sequence to be recording, otherwise it will
|
||||
* not be able to add the operation.
|
||||
*
|
||||
* @param algorithm Algorithm to use for the record often used for OpAlgo
|
||||
* operations
|
||||
* @param TArgs Template parameters that are used to initialise operation
|
||||
* which allows for extensible configurations on initialisation.
|
||||
* @return shared_ptr<Sequence> of the Sequence class itself
|
||||
*/
|
||||
template<typename T, typename... TArgs>
|
||||
std::shared_ptr<Sequence> record(std::shared_ptr<Algorithm> algorithm,
|
||||
TArgs&&... params)
|
||||
{
|
||||
std::shared_ptr<T> op{ new T(algorithm,
|
||||
std::forward<TArgs>(params)...) };
|
||||
return this->record(op);
|
||||
}
|
||||
|
||||
/**
|
||||
* Eval sends all the recorded and stored operations in the vector of
|
||||
* operations into the gpu as a submit job synchronously (with a barrier).
|
||||
*
|
||||
* @return shared_ptr<Sequence> of the Sequence class itself
|
||||
*/
|
||||
std::shared_ptr<Sequence> eval();
|
||||
|
||||
/**
|
||||
* Resets all the recorded and stored operations, records the operation
|
||||
* provided and submits into the gpu as a submit job synchronously (with a
|
||||
* barrier).
|
||||
*
|
||||
* @return shared_ptr<Sequence> of the Sequence class itself
|
||||
*/
|
||||
std::shared_ptr<Sequence> eval(std::shared_ptr<OpBase> op);
|
||||
|
||||
/**
|
||||
* Eval sends all the recorded and stored operations in the vector of
|
||||
* operations into the gpu as a submit job with a barrier.
|
||||
*
|
||||
* @param tensors Vector of tensors to use for the operation
|
||||
* @param TArgs Template parameters that are used to initialise operation
|
||||
* which allows for extensible configurations on initialisation.
|
||||
* @return shared_ptr<Sequence> of the Sequence class itself
|
||||
*/
|
||||
template<typename T, typename... TArgs>
|
||||
std::shared_ptr<Sequence> eval(std::vector<std::shared_ptr<Tensor>> tensors,
|
||||
TArgs&&... params)
|
||||
{
|
||||
std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
|
||||
return this->eval(op);
|
||||
}
|
||||
/**
|
||||
* Eval sends all the recorded and stored operations in the vector of
|
||||
* operations into the gpu as a submit job with a barrier.
|
||||
*
|
||||
* @param algorithm Algorithm to use for the record often used for OpAlgo
|
||||
* operations
|
||||
* @param TArgs Template parameters that are used to initialise operation
|
||||
* which allows for extensible configurations on initialisation.
|
||||
* @return shared_ptr<Sequence> of the Sequence class itself
|
||||
*/
|
||||
template<typename T, typename... TArgs>
|
||||
std::shared_ptr<Sequence> eval(std::shared_ptr<Algorithm> algorithm,
|
||||
TArgs&&... params)
|
||||
{
|
||||
std::shared_ptr<T> op{ new T(algorithm,
|
||||
std::forward<TArgs>(params)...) };
|
||||
return this->eval(op);
|
||||
}
|
||||
|
||||
/**
|
||||
* Eval Async sends all the recorded and stored operations in the vector of
|
||||
* operations into the gpu as a submit job without a barrier. EvalAwait()
|
||||
* must ALWAYS be called after to ensure the sequence is terminated
|
||||
* correctly.
|
||||
*
|
||||
* @return Boolean stating whether execution was successful.
|
||||
*/
|
||||
std::shared_ptr<Sequence> evalAsync();
|
||||
/**
|
||||
* Clears currnet operations to record provided one in the vector of
|
||||
* operations into the gpu as a submit job without a barrier. EvalAwait()
|
||||
* must ALWAYS be called after to ensure the sequence is terminated
|
||||
* correctly.
|
||||
*
|
||||
* @return Boolean stating whether execution was successful.
|
||||
*/
|
||||
std::shared_ptr<Sequence> evalAsync(std::shared_ptr<OpBase> op);
|
||||
/**
|
||||
* Eval sends all the recorded and stored operations in the vector of
|
||||
* operations into the gpu as a submit job with a barrier.
|
||||
*
|
||||
* @param tensors Vector of tensors to use for the operation
|
||||
* @param TArgs Template parameters that are used to initialise operation
|
||||
* which allows for extensible configurations on initialisation.
|
||||
* @return shared_ptr<Sequence> of the Sequence class itself
|
||||
*/
|
||||
template<typename T, typename... TArgs>
|
||||
std::shared_ptr<Sequence> evalAsync(
|
||||
std::vector<std::shared_ptr<Tensor>> tensors,
|
||||
TArgs&&... params)
|
||||
{
|
||||
std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
|
||||
return this->evalAsync(op);
|
||||
}
|
||||
/**
|
||||
* Eval sends all the recorded and stored operations in the vector of
|
||||
* operations into the gpu as a submit job with a barrier.
|
||||
*
|
||||
* @param algorithm Algorithm to use for the record often used for OpAlgo
|
||||
* operations
|
||||
* @param TArgs Template parameters that are used to initialise operation
|
||||
* which allows for extensible configurations on initialisation.
|
||||
* @return shared_ptr<Sequence> of the Sequence class itself
|
||||
*/
|
||||
template<typename T, typename... TArgs>
|
||||
std::shared_ptr<Sequence> evalAsync(std::shared_ptr<Algorithm> algorithm,
|
||||
TArgs&&... params)
|
||||
{
|
||||
std::shared_ptr<T> op{ new T(algorithm,
|
||||
std::forward<TArgs>(params)...) };
|
||||
return this->evalAsync(op);
|
||||
}
|
||||
|
||||
/**
|
||||
* Eval Await waits for the fence to finish processing and then once it
|
||||
* finishes, it runs the postEval of all operations.
|
||||
*
|
||||
* @param waitFor Number of milliseconds to wait before timing out.
|
||||
* @return shared_ptr<Sequence> of the Sequence class itself
|
||||
*/
|
||||
std::shared_ptr<Sequence> evalAwait(uint64_t waitFor = UINT64_MAX);
|
||||
|
||||
/**
|
||||
* Clear function clears all operations currently recorded and starts
|
||||
* recording again.
|
||||
*/
|
||||
void clear();
|
||||
|
||||
/**
|
||||
* Return the timestamps that were latched at the beginning and
|
||||
* after each operation during the last eval() call.
|
||||
*/
|
||||
std::vector<std::uint64_t> getTimestamps();
|
||||
|
||||
/**
|
||||
* Begins recording commands for commands to be submitted into the command
|
||||
* buffer.
|
||||
*
|
||||
* @return Boolean stating whether execution was successful.
|
||||
*/
|
||||
void begin();
|
||||
|
||||
/**
|
||||
* Ends the recording and stops recording commands when the record command
|
||||
* is sent.
|
||||
*
|
||||
* @return Boolean stating whether execution was successful.
|
||||
*/
|
||||
void end();
|
||||
|
||||
/**
|
||||
* Returns true if the sequence is currently in recording activated.
|
||||
*
|
||||
* @return Boolean stating if recording ongoing.
|
||||
*/
|
||||
bool isRecording();
|
||||
|
||||
/**
|
||||
* Returns true if the sequence has been initialised, and it's based on the
|
||||
* GPU resources being refrenced.
|
||||
*
|
||||
* @return Boolean stating if is initialized
|
||||
*/
|
||||
bool isInit();
|
||||
|
||||
/**
|
||||
* Clears command buffer and triggers re-record of all the current
|
||||
* operations saved, which is useful if the underlying kp::Tensors or
|
||||
* kp::Algorithms are modified and need to be re-recorded.
|
||||
*/
|
||||
void rerecord();
|
||||
|
||||
/**
|
||||
* Returns true if the sequence is currently running - mostly used for async
|
||||
* workloads.
|
||||
*
|
||||
* @return Boolean stating if currently running.
|
||||
*/
|
||||
bool isRunning();
|
||||
|
||||
/**
|
||||
* Destroys and frees the GPU resources which include the buffer and memory
|
||||
* and sets the sequence as init=False.
|
||||
*/
|
||||
void destroy();
|
||||
|
||||
private:
|
||||
// -------------- NEVER OWNED RESOURCES
|
||||
std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
|
||||
std::shared_ptr<vk::Device> mDevice = nullptr;
|
||||
std::shared_ptr<vk::Queue> mComputeQueue = nullptr;
|
||||
uint32_t mQueueIndex = -1;
|
||||
|
||||
// -------------- OPTIONALLY OWNED RESOURCES
|
||||
std::shared_ptr<vk::CommandPool> mCommandPool = nullptr;
|
||||
bool mFreeCommandPool = false;
|
||||
std::shared_ptr<vk::CommandBuffer> mCommandBuffer = nullptr;
|
||||
bool mFreeCommandBuffer = false;
|
||||
|
||||
// -------------- ALWAYS OWNED RESOURCES
|
||||
vk::Fence mFence;
|
||||
std::vector<std::shared_ptr<OpBase>> mOperations;
|
||||
std::shared_ptr<vk::QueryPool> timestampQueryPool = nullptr;
|
||||
|
||||
// State
|
||||
bool mRecording = false;
|
||||
bool mIsRunning = false;
|
||||
|
||||
// Create functions
|
||||
void createCommandPool();
|
||||
void createCommandBuffer();
|
||||
void createTimestampQueryPool(uint32_t totalTimestamps);
|
||||
};
|
||||
|
||||
} // End namespace kp
|
||||
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
|
||||
#define KP_DEFAULT_SESSION "DEFAULT"
|
||||
|
||||
namespace kp {
|
||||
|
||||
/**
|
||||
Base orchestrator which creates and manages device and child components
|
||||
*/
|
||||
class Manager
|
||||
{
|
||||
public:
|
||||
/**
|
||||
Base constructor and default used which creates the base resources
|
||||
including choosing the device 0 by default.
|
||||
*/
|
||||
Manager();
|
||||
|
||||
/**
|
||||
* Similar to base constructor but allows for further configuration to use
|
||||
* when creating the Vulkan resources.
|
||||
*
|
||||
* @param physicalDeviceIndex The index of the physical device to use
|
||||
* @param familyQueueIndices (Optional) List of queue indices to add for
|
||||
* explicit allocation
|
||||
* @param desiredExtensions The desired extensions to load from
|
||||
* physicalDevice
|
||||
*/
|
||||
Manager(uint32_t physicalDeviceIndex,
|
||||
const std::vector<uint32_t>& familyQueueIndices = {},
|
||||
const std::vector<std::string>& desiredExtensions = {});
|
||||
|
||||
/**
|
||||
* Manager constructor which allows your own vulkan application to integrate
|
||||
* with the kompute use.
|
||||
*
|
||||
* @param instance Vulkan compute instance to base this application
|
||||
* @param physicalDevice Vulkan physical device to use for application
|
||||
* @param device Vulkan logical device to use for all base resources
|
||||
* @param physicalDeviceIndex Index for vulkan physical device used
|
||||
*/
|
||||
Manager(std::shared_ptr<vk::Instance> instance,
|
||||
std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device);
|
||||
|
||||
/**
|
||||
* Manager destructor which would ensure all owned resources are destroyed
|
||||
* unless explicitly stated that resources should not be destroyed or freed.
|
||||
*/
|
||||
~Manager();
|
||||
|
||||
/**
|
||||
* Create a managed sequence that will be destroyed by this manager
|
||||
* if it hasn't been destroyed by its reference count going to zero.
|
||||
*
|
||||
* @param queueIndex The queue to use from the available queues
|
||||
* @param nrOfTimestamps The maximum number of timestamps to allocate.
|
||||
* If zero (default), disables latching of timestamps.
|
||||
* @returns Shared pointer with initialised sequence
|
||||
*/
|
||||
std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0,
|
||||
uint32_t totalTimestamps = 0);
|
||||
|
||||
/**
|
||||
* Create a managed tensor that will be destroyed by this manager
|
||||
* if it hasn't been destroyed by its reference count going to zero.
|
||||
*
|
||||
* @param data The data to initialize the tensor with
|
||||
* @param tensorType The type of tensor to initialize
|
||||
* @returns Shared pointer with initialised tensor
|
||||
*/
|
||||
template<typename T>
|
||||
std::shared_ptr<TensorT<T>> tensorT(
|
||||
const std::vector<T>& data,
|
||||
Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
|
||||
{
|
||||
KP_LOG_DEBUG("Kompute Manager tensor creation triggered");
|
||||
|
||||
std::shared_ptr<TensorT<T>> tensor{ new kp::TensorT<T>(
|
||||
this->mPhysicalDevice, this->mDevice, data, tensorType) };
|
||||
|
||||
if (this->mManageResources) {
|
||||
this->mManagedTensors.push_back(tensor);
|
||||
}
|
||||
|
||||
return tensor;
|
||||
}
|
||||
|
||||
std::shared_ptr<TensorT<float>> tensor(
|
||||
const std::vector<float>& data,
|
||||
Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
|
||||
{
|
||||
return this->tensorT<float>(data, tensorType);
|
||||
}
|
||||
|
||||
std::shared_ptr<Tensor> tensor(
|
||||
void* data,
|
||||
uint32_t elementTotalCount,
|
||||
uint32_t elementMemorySize,
|
||||
const Tensor::TensorDataTypes& dataType,
|
||||
Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice)
|
||||
{
|
||||
std::shared_ptr<Tensor> tensor{ new kp::Tensor(this->mPhysicalDevice,
|
||||
this->mDevice,
|
||||
data,
|
||||
elementTotalCount,
|
||||
elementMemorySize,
|
||||
dataType,
|
||||
tensorType) };
|
||||
|
||||
if (this->mManageResources) {
|
||||
this->mManagedTensors.push_back(tensor);
|
||||
}
|
||||
|
||||
return tensor;
|
||||
}
|
||||
|
||||
std::shared_ptr<Algorithm> algorithm(
|
||||
const std::vector<std::shared_ptr<Tensor>>& tensors = {},
|
||||
const std::vector<uint32_t>& spirv = {},
|
||||
const Workgroup& workgroup = {},
|
||||
const std::vector<float>& specializationConstants = {},
|
||||
const std::vector<float>& pushConstants = {})
|
||||
{
|
||||
return this->algorithm<>(tensors, spirv, workgroup, specializationConstants, pushConstants);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a managed algorithm that will be destroyed by this manager
|
||||
* if it hasn't been destroyed by its reference count going to zero.
|
||||
*
|
||||
* @param tensors (optional) The tensors to initialise the algorithm with
|
||||
* @param spirv (optional) The SPIRV bytes for the algorithm to dispatch
|
||||
* @param workgroup (optional) kp::Workgroup for algorithm to use, and
|
||||
* defaults to (tensor[0].size(), 1, 1)
|
||||
* @param specializationConstants (optional) kp::Constant to use for
|
||||
* specialization constants, and defaults to an empty constant
|
||||
* @param pushConstants (optional) kp::Constant to use for push constants,
|
||||
* and defaults to an empty constant
|
||||
* @returns Shared pointer with initialised algorithm
|
||||
*/
|
||||
template<typename S = float, typename P = float>
|
||||
std::shared_ptr<Algorithm> algorithm(
|
||||
const std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
const std::vector<uint32_t>& spirv,
|
||||
const Workgroup& workgroup,
|
||||
const std::vector<S>& specializationConstants,
|
||||
const std::vector<P>& pushConstants)
|
||||
{
|
||||
|
||||
KP_LOG_DEBUG("Kompute Manager algorithm creation triggered");
|
||||
|
||||
std::shared_ptr<Algorithm> algorithm{ new kp::Algorithm(
|
||||
this->mDevice,
|
||||
tensors,
|
||||
spirv,
|
||||
workgroup,
|
||||
specializationConstants,
|
||||
pushConstants) };
|
||||
|
||||
if (this->mManageResources) {
|
||||
this->mManagedAlgorithms.push_back(algorithm);
|
||||
}
|
||||
|
||||
return algorithm;
|
||||
}
|
||||
|
||||
/**
|
||||
* Destroy the GPU resources and all managed resources by manager.
|
||||
**/
|
||||
void destroy();
|
||||
/**
|
||||
* Run a pseudo-garbage collection to release all the managed resources
|
||||
* that have been already freed due to these reaching to zero ref count.
|
||||
**/
|
||||
void clear();
|
||||
|
||||
/**
|
||||
* Information about the current device.
|
||||
*
|
||||
* @return vk::PhysicalDeviceProperties containing information about the device
|
||||
**/
|
||||
vk::PhysicalDeviceProperties getDeviceProperties() const;
|
||||
|
||||
/**
|
||||
* List the devices available in the current vulkan instance.
|
||||
*
|
||||
* @return vector of physical devices containing their respective properties
|
||||
**/
|
||||
std::vector<vk::PhysicalDevice> listDevices() const;
|
||||
|
||||
private:
|
||||
// -------------- OPTIONALLY OWNED RESOURCES
|
||||
std::shared_ptr<vk::Instance> mInstance = nullptr;
|
||||
bool mFreeInstance = false;
|
||||
std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
|
||||
std::shared_ptr<vk::Device> mDevice = nullptr;
|
||||
bool mFreeDevice = false;
|
||||
|
||||
// -------------- ALWAYS OWNED RESOURCES
|
||||
std::vector<std::weak_ptr<Tensor>> mManagedTensors;
|
||||
std::vector<std::weak_ptr<Sequence>> mManagedSequences;
|
||||
std::vector<std::weak_ptr<Algorithm>> mManagedAlgorithms;
|
||||
|
||||
std::vector<uint32_t> mComputeQueueFamilyIndices;
|
||||
std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;
|
||||
|
||||
bool mManageResources = false;
|
||||
|
||||
#if DEBUG
|
||||
#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
|
||||
vk::DebugReportCallbackEXT mDebugReportCallback;
|
||||
vk::DispatchLoaderDynamic mDebugDispatcher;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Create functions
|
||||
void createInstance();
|
||||
void createDevice(const std::vector<uint32_t>& familyQueueIndices = {},
|
||||
uint32_t hysicalDeviceIndex = 0,
|
||||
const std::vector<std::string>& desiredExtensions = {});
|
||||
};
|
||||
|
||||
} // End namespace kp
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ OpAlgoDispatch::~OpAlgoDispatch()
|
|||
KP_LOG_DEBUG("Kompute OpAlgoDispatch destructor started");
|
||||
|
||||
if (this->mPushConstantsData) {
|
||||
KP_LOG_DEBUG("Kompute freeing push constants data");
|
||||
free(this->mPushConstantsData);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue