Further tests added to new structure

This commit is contained in:
Alejandro Saucedo 2021-02-25 22:33:08 +00:00
parent 3f1288271d
commit 6378583a23
17 changed files with 636 additions and 514 deletions

View file

@ -10,5 +10,6 @@
#include "kompute/operations/OpTensorCopy.hpp"
#include "kompute/operations/OpTensorSyncDevice.hpp"
#include "kompute/operations/OpTensorSyncLocal.hpp"
#include "kompute/operations/OpAlgoDispatch.hpp"
#include "kompute/Algorithm.hpp"
#include "kompute/Tensor.hpp"

View file

@ -928,7 +928,9 @@ class Tensor
/**
* Destroys and frees the GPU resources which include the buffer and memory.
*/
void freeMemoryDestroyGPUResources();
void destroy();
bool isInit();
/**
* Returns the vector of data currently contained by the Tensor. It is
@ -1129,10 +1131,6 @@ public:
const Constants& specializationConstants = {},
const Constants& pushConstants = {});
bool isInit();
void freeMemoryDestroyGPUResources();
/**
* Destructor for Algorithm which is responsible for freeing and destroying
* respective pipelines and owned parameter groups.
@ -1149,11 +1147,21 @@ public:
*/
void recordDispatch(std::shared_ptr<vk::CommandBuffer> commandBuffer);
bool isInit();
void setWorkgroup(const Workgroup& workgroup, uint32_t minSize = 1);
const Workgroup& getWorkgroup();
const Constants& getSpecializationConstants();
const Constants& getPushConstants();
const std::vector<std::shared_ptr<Tensor>>& getTensors();
void destroy();
private:
// -------------- NEVER OWNED RESOURCES
std::shared_ptr<vk::Device> mDevice;
std::vector<std::shared_ptr<Tensor>> mTensors;
// -------------- OPTIONALLY OWNED RESOURCES
std::shared_ptr<vk::DescriptorSetLayout> mDescriptorSetLayout;
@ -1184,7 +1192,7 @@ private:
void createPipeline();
// Parameters
void createParameters(const std::vector<std::shared_ptr<Tensor>>& tensorParams);
void createParameters();
};
} // End namespace kp
@ -1270,6 +1278,10 @@ class Sequence: public std::enable_shared_from_this<Sequence>
*/
~Sequence();
/**
*/
std::shared_ptr<Sequence> record(std::shared_ptr<OpBase> op);
/**
* Record function for operation to be added to the GPU queue in batch. This
* template requires classes to be derived from the OpBase class. This
@ -1280,7 +1292,146 @@ class Sequence: public std::enable_shared_from_this<Sequence>
* @param TArgs Template parameters that are used to initialise operation
* which allows for extensible configurations on initialisation.
*/
std::shared_ptr<Sequence> record(std::shared_ptr<OpBase> op);
template<typename T, typename... TArgs>
std::shared_ptr<Sequence>
record(std::vector<std::shared_ptr<Tensor>> tensors, TArgs&&... params)
{
    // Compile-time guard: T must derive from OpBase.
    static_assert(std::is_base_of<OpBase, T>::value,
                  "Kompute Sequence record(...) template only valid with "
                  "OpBase derived classes");

    KP_LOG_DEBUG("Kompute Sequence record function started");
    KP_LOG_DEBUG("Kompute Sequence creating OpBase derived class instance");

    // Build the operation from the tensors plus any forwarded extra
    // arguments, then hand it to the non-template record(op) overload.
    std::shared_ptr<T> operation(
      new T(tensors, std::forward<TArgs>(params)...));

    return this->record(operation);
}
/**
 * Record overload that builds an operation of type T from an Algorithm
 * (plus any extra constructor arguments) and records it.
 *
 * @param algorithm Algorithm passed as first constructor argument of T
 * @param params Additional arguments forwarded to the constructor of T
 * @return The value returned by record(std::shared_ptr<OpBase>)
 */
template<typename T, typename... TArgs>
std::shared_ptr<Sequence>
record(std::shared_ptr<Algorithm> algorithm, TArgs&&... params)
{
    // Compile-time guard: T must derive from OpBase.
    static_assert(std::is_base_of<OpBase, T>::value,
                  "Kompute Sequence record(...) template only valid with "
                  "OpBase derived classes");

    KP_LOG_DEBUG("Kompute Sequence record function started");
    KP_LOG_DEBUG("Kompute Sequence creating OpBase derived class instance");

    std::shared_ptr<T> operation(
      new T(algorithm, std::forward<TArgs>(params)...));

    return this->record(operation);
}
/**
* Eval sends all the recorded and stored operations in the vector of
* operations into the gpu as a submit job with a barrier.
*
* @return shared_ptr<Sequence> of the Sequence class itself
*/
std::shared_ptr<Sequence> eval();
std::shared_ptr<Sequence> eval(std::shared_ptr<OpBase> op);
/**
* Eval sends all the recorded and stored operations in the vector of
* operations into the gpu as a submit job with a barrier.
*
* @return shared_ptr<Sequence> of the Sequence class itself
*/
template<typename T, typename... TArgs>
std::shared_ptr<Sequence>
eval(std::vector<std::shared_ptr<Tensor>> tensors, TArgs&&... params)
{
    // The messages below previously said "record"; they now name eval so
    // that logs and compile errors point at the right entry point.
    KP_LOG_DEBUG("Kompute Sequence eval function started");

    static_assert(std::is_base_of<OpBase, T>::value,
                  "Kompute Sequence eval(...) template only valid with "
                  "OpBase derived classes");

    KP_LOG_DEBUG("Kompute Sequence creating OpBase derived class instance");

    // Build the operation from the tensors plus any forwarded extra
    // arguments, then delegate to eval(std::shared_ptr<OpBase>).
    std::shared_ptr<T> op =
      std::make_shared<T>(tensors, std::forward<TArgs>(params)...);

    return this->eval(op);
}
// Needed as otherwise it's not possible to use initializer lists
template<typename T, typename... TArgs>
std::shared_ptr<Sequence>
eval(std::shared_ptr<Algorithm> algorithm, TArgs&&... params)
{
    // The messages below previously said "record"; they now name eval so
    // that logs and compile errors point at the right entry point.
    KP_LOG_DEBUG("Kompute Sequence eval function started");

    static_assert(std::is_base_of<OpBase, T>::value,
                  "Kompute Sequence eval(...) template only valid with "
                  "OpBase derived classes");

    KP_LOG_DEBUG("Kompute Sequence creating OpBase derived class instance");

    // Build the operation from the algorithm plus any forwarded extra
    // arguments, then delegate to eval(std::shared_ptr<OpBase>).
    std::shared_ptr<T> op =
      std::make_shared<T>(algorithm, std::forward<TArgs>(params)...);

    return this->eval(op);
}
/**
* Eval Async sends all the recorded and stored operations in the vector of
* operations into the gpu as a submit job with a barrier. EvalAwait() must
* be called after to ensure the sequence is terminated correctly.
*
* @return shared_ptr<Sequence> of the Sequence class itself
*/
std::shared_ptr<Sequence> evalAsync();
/**
* Eval sends all the recorded and stored operations in the vector of
* operations into the gpu as a submit job with a barrier.
*
* @return shared_ptr<Sequence> of the Sequence class itself
*/
template<typename T, typename... TArgs>
std::shared_ptr<Sequence>
evalAsync(std::vector<std::shared_ptr<Tensor>> tensors, TArgs&&... params)
{
    // The messages below previously said "record"; they now name evalAsync
    // so that logs and compile errors point at the right entry point.
    KP_LOG_DEBUG("Kompute Sequence evalAsync function started");

    static_assert(std::is_base_of<OpBase, T>::value,
                  "Kompute Sequence evalAsync(...) template only valid with "
                  "OpBase derived classes");

    KP_LOG_DEBUG("Kompute Sequence creating OpBase derived class instance");

    std::shared_ptr<T> op =
      std::make_shared<T>(tensors, std::forward<TArgs>(params)...);

    // NOTE(review): relies on an evalAsync(std::shared_ptr<OpBase>) overload
    // that is not visible in this excerpt - confirm it is declared.
    return this->evalAsync(op);
}
// Needed as otherwise it's not possible to use initializer lists
template<typename T, typename... TArgs>
std::shared_ptr<Sequence>
evalAsync(std::shared_ptr<Algorithm> algorithm, TArgs&&... params)
{
    // The messages below previously said "record"; they now name evalAsync
    // so that logs and compile errors point at the right entry point.
    KP_LOG_DEBUG("Kompute Sequence evalAsync function started");

    static_assert(std::is_base_of<OpBase, T>::value,
                  "Kompute Sequence evalAsync(...) template only valid with "
                  "OpBase derived classes");

    KP_LOG_DEBUG("Kompute Sequence creating OpBase derived class instance");

    std::shared_ptr<T> op =
      std::make_shared<T>(algorithm, std::forward<TArgs>(params)...);

    // NOTE(review): relies on an evalAsync(std::shared_ptr<OpBase>) overload
    // that is not visible in this excerpt - confirm it is declared.
    return this->evalAsync(op);
}
/**
* Eval Await waits for the fence to finish processing and then once it
* finishes, it runs the postEval of all operations.
*
* @param waitFor Number of milliseconds to wait before timing out.
* @return shared_ptr<Sequence> of the Sequence class itself
*/
std::shared_ptr<Sequence> evalAwait(uint64_t waitFor = UINT64_MAX);
/**
* Clear function clears all operations currently recorded and starts recording again.
@ -1303,32 +1454,6 @@ class Sequence: public std::enable_shared_from_this<Sequence>
*/
void end();
/**
* Eval sends all the recorded and stored operations in the vector of
* operations into the gpu as a submit job with a barrier.
*
* @return Boolean stating whether execution was successful.
*/
std::shared_ptr<Sequence> eval();
/**
* Eval Async sends all the recorded and stored operations in the vector of
* operations into the gpu as a submit job with a barrier. EvalAwait() must
* be called after to ensure the sequence is terminated correctly.
*
* @return Boolean stating whether execution was successful.
*/
std::shared_ptr<Sequence> evalAsync();
/**
* Eval Await waits for the fence to finish processing and then once it
* finishes, it runs the postEval of all operations.
*
* @param waitFor Number of milliseconds to wait before timing out.
* @return Boolean stating whether execution was successful.
*/
std::shared_ptr<Sequence> evalAwait(uint64_t waitFor = UINT64_MAX);
/**
* Returns true if the sequence is currently in recording activated.
*
@ -1336,6 +1461,8 @@ class Sequence: public std::enable_shared_from_this<Sequence>
*/
bool isRecording();
bool isInit();
/**
* Returns true if the sequence is currently running - mostly used for async
* workloads.
@ -1348,7 +1475,7 @@ class Sequence: public std::enable_shared_from_this<Sequence>
* Destroys and frees the GPU resources which include the buffer and memory
* and sets the sequence as init=False.
*/
void freeMemoryDestroyGPUResources();
void destroy();
private:
// -------------- NEVER OWNED RESOURCES
@ -1444,6 +1571,8 @@ class Manager
* they would like to create the resources on.
*
* @param physicalDeviceIndex The index of the physical device to use
* @param manageResources (Optional) Whether to manage the memory of the
* resources created and destroy when the manager is destroyed.
* @param familyQueueIndices (Optional) List of queue indices to add for
* explicit allocation
* @param totalQueues The total number of compute queues to create.
@ -1462,8 +1591,7 @@ class Manager
*/
Manager(std::shared_ptr<vk::Instance> instance,
std::shared_ptr<vk::PhysicalDevice> physicalDevice,
std::shared_ptr<vk::Device> device,
uint32_t physicalDeviceIndex);
std::shared_ptr<vk::Device> device);
/**
* Manager destructor which would ensure all owned resources are destroyed
@ -1506,12 +1634,14 @@ class Manager
const Constants& specializationConstants = {},
const Constants& pushConstants = {});
void destroy();
void clear();
private:
// -------------- OPTIONALLY OWNED RESOURCES
std::shared_ptr<vk::Instance> mInstance = nullptr;
bool mFreeInstance = false;
std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
uint32_t mPhysicalDeviceIndex = -1;
std::shared_ptr<vk::Device> mDevice = nullptr;
bool mFreeDevice = false;
@ -1523,7 +1653,7 @@ class Manager
std::vector<uint32_t> mComputeQueueFamilyIndices;
std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;
uint32_t mCurrentSequenceIndex = -1;
bool mManageResources = false;
#if DEBUG
#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
@ -1534,7 +1664,7 @@ class Manager
// Create functions
void createInstance();
void createDevice(const std::vector<uint32_t>& familyQueueIndices = {});
// familyQueueIndices: optional queue family indices (empty by default).
// physicalDeviceIndex: index of the physical device to use (0 by default).
void createDevice(const std::vector<uint32_t>& familyQueueIndices = {}, uint32_t physicalDeviceIndex = 0);
};
} // End namespace kp
@ -1553,8 +1683,7 @@ class OpAlgoDispatch : public OpBase
{
public:
OpAlgoDispatch(const std::vector<std::shared_ptr<Tensor>>& tensors,
const std::shared_ptr<kp::Algorithm>& algorithm);
OpAlgoDispatch(const std::shared_ptr<kp::Algorithm>& algorithm);
/**
* Default destructor, which is in charge of destroying the algorithm
@ -1586,7 +1715,6 @@ class OpAlgoDispatch : public OpBase
private:
// -------------- ALWAYS OWNED RESOURCES
std::vector<std::shared_ptr<Tensor>> mTensors;
std::shared_ptr<Algorithm> mAlgorithm;
};

View file

@ -22,7 +22,7 @@ Algorithm::~Algorithm()
{
KP_LOG_DEBUG("Kompute Algorithm Destructor started");
this->freeMemoryDestroyGPUResources();
this->destroy();
}
void
@ -35,23 +35,35 @@ Algorithm::rebuild(
{
KP_LOG_DEBUG("Kompute Algorithm rebuild started");
this->setWorkgroup(workgroup);
this->mTensors = tensors;
this->mSpirv = spirv;
this->mSpecializationConstants = specializationConstants;
this->mPushConstants = pushConstants;
this->setWorkgroup(workgroup);
// Descriptor pool is created first so if available then destroy all before rebuild
if (this->mFreeDescriptorPool) {
this->freeMemoryDestroyGPUResources();
this->destroy();
}
this->createParameters(tensors);
this->createParameters();
this->createShaderModule();
this->createPipeline();
}
/**
 * Reports whether all of the algorithm's pipeline, descriptor and shader
 * handles are currently set.
 *
 * @return true only when none of the checked members is null
 */
bool
Algorithm::isInit()
{
    // Pipeline objects
    if (!this->mPipeline || !this->mPipelineCache || !this->mPipelineLayout) {
        return false;
    }

    // Descriptor objects
    if (!this->mDescriptorPool || !this->mDescriptorSet ||
        !this->mDescriptorSetLayout) {
        return false;
    }

    return this->mShaderModule != nullptr;
}
void
Algorithm::freeMemoryDestroyGPUResources() {
Algorithm::destroy() {
if (!this->mDevice) {
KP_LOG_WARN(
@ -68,6 +80,7 @@ Algorithm::freeMemoryDestroyGPUResources() {
this->mDevice->destroy(
*this->mPipeline,
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
this->mPipeline = nullptr;
}
if (this->mFreePipelineCache) {
@ -79,6 +92,7 @@ Algorithm::freeMemoryDestroyGPUResources() {
this->mDevice->destroy(
*this->mPipelineCache,
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
this->mPipelineCache = nullptr;
}
if (this->mFreePipelineLayout) {
@ -90,6 +104,7 @@ Algorithm::freeMemoryDestroyGPUResources() {
this->mDevice->destroy(
*this->mPipelineLayout,
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
this->mPipelineLayout = nullptr;
}
if (this->mFreeShaderModule) {
@ -101,6 +116,7 @@ Algorithm::freeMemoryDestroyGPUResources() {
this->mDevice->destroy(
*this->mShaderModule,
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
this->mShaderModule = nullptr;
}
if (this->mFreeDescriptorSet) {
@ -111,6 +127,7 @@ Algorithm::freeMemoryDestroyGPUResources() {
}
this->mDevice->freeDescriptorSets(
*this->mDescriptorPool, 1, this->mDescriptorSet.get());
this->mDescriptorSet = nullptr;
}
if (this->mFreeDescriptorSetLayout) {
@ -122,6 +139,7 @@ Algorithm::freeMemoryDestroyGPUResources() {
this->mDevice->destroy(
*this->mDescriptorSetLayout,
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
this->mDescriptorSetLayout = nullptr;
}
if (this->mFreeDescriptorPool) {
@ -133,18 +151,19 @@ Algorithm::freeMemoryDestroyGPUResources() {
this->mDevice->destroy(
*this->mDescriptorPool,
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
this->mDescriptorPool = nullptr;
}
}
void
Algorithm::createParameters(const std::vector<std::shared_ptr<Tensor>>& tensorParams)
Algorithm::createParameters()
{
KP_LOG_DEBUG("Kompute Algorithm createParameters started");
std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
vk::DescriptorPoolSize(
vk::DescriptorType::eStorageBuffer,
static_cast<uint32_t>(tensorParams.size()) // Descriptor count
static_cast<uint32_t>(this->mTensors.size()) // Descriptor count
)
};
@ -161,7 +180,7 @@ Algorithm::createParameters(const std::vector<std::shared_ptr<Tensor>>& tensorPa
this->mFreeDescriptorPool = true;
std::vector<vk::DescriptorSetLayoutBinding> descriptorSetBindings;
for (size_t i = 0; i < tensorParams.size(); i++) {
for (size_t i = 0; i < this->mTensors.size(); i++) {
descriptorSetBindings.push_back(
vk::DescriptorSetLayoutBinding(i, // Binding index
vk::DescriptorType::eStorageBuffer,
@ -193,11 +212,11 @@ Algorithm::createParameters(const std::vector<std::shared_ptr<Tensor>>& tensorPa
this->mFreeDescriptorSet = true;
KP_LOG_DEBUG("Kompute Algorithm updating descriptor sets");
for (size_t i = 0; i < tensorParams.size(); i++) {
for (size_t i = 0; i < this->mTensors.size(); i++) {
std::vector<vk::WriteDescriptorSet> computeWriteDescriptorSets;
vk::DescriptorBufferInfo descriptorBufferInfo =
tensorParams[i]->constructDescriptorBufferInfo();
this->mTensors[i]->constructDescriptorBufferInfo();
computeWriteDescriptorSets.push_back(
vk::WriteDescriptorSet(*this->mDescriptorSet,
@ -377,4 +396,24 @@ Algorithm::setWorkgroup(const Workgroup& workgroup, uint32_t minSize) {
}
}
/**
 * @return Reference to the workgroup stored in this algorithm
 */
const Workgroup&
Algorithm::getWorkgroup()
{
    return mWorkgroup;
}
/**
 * @return Reference to the specialization constants stored in this algorithm
 */
const Constants&
Algorithm::getSpecializationConstants()
{
    return mSpecializationConstants;
}
/**
 * @return Reference to the push constants stored in this algorithm
 */
const Constants&
Algorithm::getPushConstants()
{
    return mPushConstants;
}
const std::vector<std::shared_ptr<Tensor>>&
Algorithm::getTensors() {
return this->mTensors;
}
}

View file

@ -33,26 +33,33 @@ Manager::Manager()
Manager::Manager(uint32_t physicalDeviceIndex,
const std::vector<uint32_t>& familyQueueIndices)
{
this->mPhysicalDeviceIndex = physicalDeviceIndex;
this->mManageResources = false;
this->createInstance();
this->createDevice(familyQueueIndices);
this->createDevice(familyQueueIndices, physicalDeviceIndex);
}
Manager::Manager(std::shared_ptr<vk::Instance> instance,
std::shared_ptr<vk::PhysicalDevice> physicalDevice,
std::shared_ptr<vk::Device> device,
uint32_t physicalDeviceIndex)
std::shared_ptr<vk::Device> device)
{
this->mManageResources = true;
this->mInstance = instance;
this->mPhysicalDevice = physicalDevice;
this->mDevice = device;
this->mPhysicalDeviceIndex = physicalDeviceIndex;
}
// Destructor: logs and delegates all teardown to destroy().
Manager::~Manager()
{
    KP_LOG_DEBUG("Kompute Manager Destructor started");

    destroy();
}
void
Manager::destroy() {
KP_LOG_DEBUG("Kompute Manager destroy() started");
if (this->mDevice == nullptr) {
KP_LOG_ERROR(
@ -60,32 +67,32 @@ Manager::~Manager()
return;
}
if (this->mManagedSequences.size()) {
if (this->mManageResources && this->mManagedSequences.size()) {
KP_LOG_DEBUG("Kompute Manager explicitly running destructor for "
"managed sequences");
for (const std::weak_ptr<Sequence>& weakSq : this->mManagedSequences) {
if (std::shared_ptr<Sequence> sq = weakSq.lock()) {
sq->freeMemoryDestroyGPUResources();
sq->destroy();
}
}
this->mManagedSequences.clear();
}
if (this->mManagedAlgorithms.size()) {
if (this->mManageResources && this->mManagedAlgorithms.size()) {
KP_LOG_DEBUG("Kompute Manager explicitly freeing algorithms");
for (const std::weak_ptr<Algorithm>& weakAlgorithm : this->mManagedAlgorithms) {
if (std::shared_ptr<Algorithm> algorithm = weakAlgorithm.lock()) {
algorithm->freeMemoryDestroyGPUResources();
algorithm->destroy();
}
}
this->mManagedAlgorithms.clear();
}
if (this->mManagedTensors.size()) {
if (this->mManageResources && this->mManagedTensors.size()) {
KP_LOG_DEBUG("Kompute Manager explicitly freeing tensors");
for (const std::weak_ptr<Tensor>& weakTensor : this->mManagedTensors) {
if (std::shared_ptr<Tensor> tensor = weakTensor.lock()) {
tensor->freeMemoryDestroyGPUResources();
tensor->destroy();
}
}
this->mManagedTensors.clear();
@ -95,6 +102,7 @@ Manager::~Manager()
KP_LOG_INFO("Destroying device");
this->mDevice->destroy(
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
this->mDevice = nullptr;
KP_LOG_DEBUG("Kompute Manager Destroyed Device");
}
@ -109,6 +117,7 @@ Manager::~Manager()
if (this->mDebugReportCallback) {
this->mInstance->destroyDebugReportCallbackEXT(
this->mDebugReportCallback, nullptr, this->mDebugDispatcher);
this->mInstance = nullptr;
KP_LOG_DEBUG("Kompute Manager Destroyed Debug Report Callback");
}
#endif
@ -117,6 +126,7 @@ Manager::~Manager()
if (this->mFreeInstance) {
this->mInstance->destroy(
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
this->mInstance = nullptr;
KP_LOG_DEBUG("Kompute Manager Destroyed Instance");
}
}
@ -207,7 +217,31 @@ Manager::createInstance()
}
void
Manager::createDevice(const std::vector<uint32_t>& familyQueueIndices)
Manager::clear() {
    // Nothing to prune unless mManageResources is set.
    if (!this->mManageResources) {
        return;
    }

    // Erase-remove every weak reference whose target has expired.
    // Predicates take the weak_ptr by const reference so that no
    // per-element copy is made while scanning.
    this->mManagedTensors.erase(
      std::remove_if(
        begin(this->mManagedTensors),
        end(this->mManagedTensors),
        [](const std::weak_ptr<Tensor>& t) { return t.expired(); }),
      end(this->mManagedTensors));

    this->mManagedAlgorithms.erase(
      std::remove_if(
        begin(this->mManagedAlgorithms),
        end(this->mManagedAlgorithms),
        [](const std::weak_ptr<Algorithm>& t) { return t.expired(); }),
      end(this->mManagedAlgorithms));

    this->mManagedSequences.erase(
      std::remove_if(
        begin(this->mManagedSequences),
        end(this->mManagedSequences),
        [](const std::weak_ptr<Sequence>& t) { return t.expired(); }),
      end(this->mManagedSequences));
}
void
Manager::createDevice(const std::vector<uint32_t>& familyQueueIndices, uint32_t physicalDeviceIndex)
{
KP_LOG_DEBUG("Kompute Manager creating Device");
@ -215,7 +249,7 @@ Manager::createDevice(const std::vector<uint32_t>& familyQueueIndices)
if (this->mInstance == nullptr) {
throw std::runtime_error("Kompute Manager instance is null");
}
if (this->mPhysicalDeviceIndex < 0) {
if (physicalDeviceIndex < 0) {
throw std::runtime_error(
"Kompute Manager physical device index not provided");
}
@ -226,7 +260,7 @@ Manager::createDevice(const std::vector<uint32_t>& familyQueueIndices)
this->mInstance->enumeratePhysicalDevices();
vk::PhysicalDevice physicalDevice =
physicalDevices[this->mPhysicalDeviceIndex];
physicalDevices[physicalDeviceIndex];
this->mPhysicalDevice =
std::make_shared<vk::PhysicalDevice>(physicalDevice);
@ -235,7 +269,7 @@ Manager::createDevice(const std::vector<uint32_t>& familyQueueIndices)
physicalDevice.getProperties();
KP_LOG_INFO("Using physical device index {} found {}",
this->mPhysicalDeviceIndex,
physicalDeviceIndex,
physicalDeviceProperties.deviceName);
if (!familyQueueIndices.size()) {
@ -321,7 +355,9 @@ Manager::tensor(
std::shared_ptr<Tensor> tensor{
new kp::Tensor(this->mPhysicalDevice, this->mDevice, data, tensorType) };
this->mManagedTensors.push_back(tensor);
if (this->mManageResources) {
this->mManagedTensors.push_back(tensor);
}
return tensor;
}
@ -345,7 +381,9 @@ Manager::algorithm(
specializationConstants,
pushConstants)};
this->mManagedAlgorithms.push_back(algorithm);
if (this->mManageResources) {
this->mManagedAlgorithms.push_back(algorithm);
}
return algorithm;
}
@ -362,7 +400,9 @@ Manager::sequence(uint32_t queueIndex)
this->mComputeQueues[queueIndex],
this->mComputeQueueFamilyIndices[queueIndex]) };
this->mManagedSequences.push_back(sq);
if (this->mManageResources) {
this->mManagedSequences.push_back(sq);
}
return sq;
}

View file

@ -4,12 +4,10 @@
namespace kp {
OpAlgoDispatch::OpAlgoDispatch(const std::vector<std::shared_ptr<Tensor>>& tensors,
const std::shared_ptr<kp::Algorithm>& algorithm)
OpAlgoDispatch::OpAlgoDispatch(const std::shared_ptr<kp::Algorithm>& algorithm)
{
KP_LOG_DEBUG("Kompute OpAlgoDispatch constructor");
this->mTensors = tensors;
this->mAlgorithm = algorithm;
}
@ -24,7 +22,7 @@ OpAlgoDispatch::record(std::shared_ptr<vk::CommandBuffer> commandBuffer)
KP_LOG_DEBUG("Kompute OpAlgoDispatch record called");
// Barrier to ensure the data is finished writing to buffer memory
for (std::shared_ptr<Tensor> tensor : this->mTensors) {
for (const std::shared_ptr<Tensor>& tensor : this->mAlgorithm->getTensors()) {
tensor->recordBufferMemoryBarrier(
commandBuffer,
vk::AccessFlagBits::eHostWrite,

View file

@ -61,6 +61,12 @@ Sequence::end()
}
}
void
Sequence::clear() {
KP_LOG_DEBUG("Kompute Sequence calling clear");
this->end();
}
std::shared_ptr<Sequence>
Sequence::eval()
{
@ -69,6 +75,13 @@ Sequence::eval()
return this->evalAsync()->evalAwait();
}
/**
 * Clears the sequence, records the given operation and evaluates it.
 *
 * @param op Operation to record before evaluating
 * @return The value returned by eval()
 */
std::shared_ptr<Sequence>
Sequence::eval(std::shared_ptr<OpBase> op)
{
    this->clear();
    this->record(op);

    // Return eval()'s result: flowing off the end of a non-void function
    // is undefined behaviour, and callers chain on the returned pointer.
    return this->eval();
}
std::shared_ptr<Sequence>
Sequence::evalAsync()
{
@ -138,8 +151,16 @@ Sequence::isRecording()
return this->mRecording;
}
/**
 * Reports whether the device, command pool, command buffer and compute
 * queue handles of this sequence are all set.
 *
 * @return true only when none of the checked members is null
 */
bool
Sequence::isInit()
{
    if (!this->mDevice || !this->mCommandPool) {
        return false;
    }

    if (!this->mCommandBuffer) {
        return false;
    }

    return this->mComputeQueue != nullptr;
}
void
Sequence::freeMemoryDestroyGPUResources()
Sequence::destroy()
{
KP_LOG_DEBUG("Kompute Sequence freeMemoryDestroyGPUResources called");
@ -189,6 +210,16 @@ Sequence::freeMemoryDestroyGPUResources()
this->mOperations.clear();
}
if (this->mDevice) {
this->mDevice = nullptr;
}
if (this->mPhysicalDevice) {
this->mPhysicalDevice = nullptr;
}
if (this->mComputeQueue) {
this->mComputeQueue = nullptr;
}
}
std::shared_ptr<Sequence>

View file

@ -76,6 +76,15 @@ Tensor::tensorType()
return this->mTensorType;
}
/**
 * Reports whether the device plus the primary and staging buffer/memory
 * handles of this tensor are all set.
 *
 * @return true only when none of the checked members is null
 */
bool
Tensor::isInit()
{
    if (!this->mDevice) {
        return false;
    }

    // Primary buffer and its memory
    if (!this->mPrimaryBuffer || !this->mPrimaryMemory) {
        return false;
    }

    // Staging buffer and its memory
    if (!this->mStagingBuffer) {
        return false;
    }

    return this->mStagingMemory != nullptr;
}
void
Tensor::setData(const std::vector<float>& data)
{
@ -429,7 +438,7 @@ Tensor::allocateBindMemory(std::shared_ptr<vk::Buffer> buffer,
}
void
Tensor::freeMemoryDestroyGPUResources()
Tensor::destroy()
{
KP_LOG_DEBUG("Kompute Tensor started freeMemoryDestroyGPUResources()");
@ -495,6 +504,10 @@ Tensor::freeMemoryDestroyGPUResources()
}
}
if (this->mDevice) {
this->mDevice = nullptr;
}
KP_LOG_DEBUG("Kompute Tensor successful freeMemoryDestroyGPUResources()");
}

View file

@ -45,10 +45,6 @@ public:
const Constants& specializationConstants = {},
const Constants& pushConstants = {});
bool isInit();
void freeMemoryDestroyGPUResources();
/**
* Destructor for Algorithm which is responsible for freeing and destroying
* respective pipelines and owned parameter groups.
@ -65,11 +61,21 @@ public:
*/
void recordDispatch(std::shared_ptr<vk::CommandBuffer> commandBuffer);
bool isInit();
void setWorkgroup(const Workgroup& workgroup, uint32_t minSize = 1);
const Workgroup& getWorkgroup();
const Constants& getSpecializationConstants();
const Constants& getPushConstants();
const std::vector<std::shared_ptr<Tensor>>& getTensors();
void destroy();
private:
// -------------- NEVER OWNED RESOURCES
std::shared_ptr<vk::Device> mDevice;
std::vector<std::shared_ptr<Tensor>> mTensors;
// -------------- OPTIONALLY OWNED RESOURCES
std::shared_ptr<vk::DescriptorSetLayout> mDescriptorSetLayout;
@ -100,7 +106,7 @@ private:
void createPipeline();
// Parameters
void createParameters(const std::vector<std::shared_ptr<Tensor>>& tensorParams);
void createParameters();
};
} // End namespace kp

View file

@ -30,6 +30,8 @@ class Manager
* they would like to create the resources on.
*
* @param physicalDeviceIndex The index of the physical device to use
* @param manageResources (Optional) Whether to manage the memory of the
* resources created and destroy when the manager is destroyed.
* @param familyQueueIndices (Optional) List of queue indices to add for
* explicit allocation
* @param totalQueues The total number of compute queues to create.
@ -48,8 +50,7 @@ class Manager
*/
Manager(std::shared_ptr<vk::Instance> instance,
std::shared_ptr<vk::PhysicalDevice> physicalDevice,
std::shared_ptr<vk::Device> device,
uint32_t physicalDeviceIndex);
std::shared_ptr<vk::Device> device);
/**
* Manager destructor which would ensure all owned resources are destroyed
@ -92,12 +93,14 @@ class Manager
const Constants& specializationConstants = {},
const Constants& pushConstants = {});
void destroy();
void clear();
private:
// -------------- OPTIONALLY OWNED RESOURCES
std::shared_ptr<vk::Instance> mInstance = nullptr;
bool mFreeInstance = false;
std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
uint32_t mPhysicalDeviceIndex = -1;
std::shared_ptr<vk::Device> mDevice = nullptr;
bool mFreeDevice = false;
@ -109,7 +112,7 @@ class Manager
std::vector<uint32_t> mComputeQueueFamilyIndices;
std::vector<std::shared_ptr<vk::Queue>> mComputeQueues;
uint32_t mCurrentSequenceIndex = -1;
bool mManageResources = false;
#if DEBUG
#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS
@ -120,7 +123,7 @@ class Manager
// Create functions
void createInstance();
void createDevice(const std::vector<uint32_t>& familyQueueIndices = {});
// familyQueueIndices: optional queue family indices (empty by default).
// physicalDeviceIndex: index of the physical device to use (0 by default).
void createDevice(const std::vector<uint32_t>& familyQueueIndices = {}, uint32_t physicalDeviceIndex = 0);
};
} // End namespace kp

View file

@ -31,6 +31,10 @@ class Sequence: public std::enable_shared_from_this<Sequence>
*/
~Sequence();
/**
*/
std::shared_ptr<Sequence> record(std::shared_ptr<OpBase> op);
/**
* Record function for operation to be added to the GPU queue in batch. This
* template requires classes to be derived from the OpBase class. This
@ -41,7 +45,148 @@ class Sequence: public std::enable_shared_from_this<Sequence>
* @param TArgs Template parameters that are used to initialise operation
* which allows for extensible configurations on initialisation.
*/
std::shared_ptr<Sequence> record(std::shared_ptr<OpBase> op);
template<typename T, typename... TArgs>
std::shared_ptr<Sequence>
record(std::vector<std::shared_ptr<Tensor>> tensors, TArgs&&... params)
{
    // Compile-time guard: T must derive from OpBase.
    static_assert(std::is_base_of<OpBase, T>::value,
                  "Kompute Sequence record(...) template only valid with "
                  "OpBase derived classes");

    KP_LOG_DEBUG("Kompute Sequence record function started");
    KP_LOG_DEBUG("Kompute Sequence creating OpBase derived class instance");

    // Build the operation from the tensors plus any forwarded extra
    // arguments, then hand it to the non-template record(op) overload.
    std::shared_ptr<T> operation(
      new T(tensors, std::forward<TArgs>(params)...));

    return this->record(operation);
}
/**
 * Record overload that builds an operation of type T from an Algorithm
 * (plus any extra constructor arguments) and records it.
 *
 * @param algorithm Algorithm passed as first constructor argument of T
 * @param params Additional arguments forwarded to the constructor of T
 * @return The value returned by record(std::shared_ptr<OpBase>)
 */
template<typename T, typename... TArgs>
std::shared_ptr<Sequence>
record(std::shared_ptr<Algorithm> algorithm, TArgs&&... params)
{
    // Compile-time guard: T must derive from OpBase.
    static_assert(std::is_base_of<OpBase, T>::value,
                  "Kompute Sequence record(...) template only valid with "
                  "OpBase derived classes");

    KP_LOG_DEBUG("Kompute Sequence record function started");
    KP_LOG_DEBUG("Kompute Sequence creating OpBase derived class instance");

    std::shared_ptr<T> operation(
      new T(algorithm, std::forward<TArgs>(params)...));

    return this->record(operation);
}
/**
* Eval sends all the recorded and stored operations in the vector of
* operations into the gpu as a submit job with a barrier.
*
* @return shared_ptr<Sequence> of the Sequence class itself
*/
std::shared_ptr<Sequence> eval();
std::shared_ptr<Sequence> eval(std::shared_ptr<OpBase> op);
/**
* Eval sends all the recorded and stored operations in the vector of
* operations into the gpu as a submit job with a barrier.
*
* @return shared_ptr<Sequence> of the Sequence class itself
*/
// TODO: Aim to have only a single function with tensors/algorithm
template<typename T, typename... TArgs>
std::shared_ptr<Sequence>
eval(std::vector<std::shared_ptr<Tensor>> tensors, TArgs&&... params)
{
    // The messages below previously said "record"; they now name eval so
    // that logs and compile errors point at the right entry point.
    KP_LOG_DEBUG("Kompute Sequence eval function started");

    static_assert(std::is_base_of<OpBase, T>::value,
                  "Kompute Sequence eval(...) template only valid with "
                  "OpBase derived classes");

    KP_LOG_DEBUG("Kompute Sequence creating OpBase derived class instance");

    // Build the operation from the tensors plus any forwarded extra
    // arguments, then delegate to eval(std::shared_ptr<OpBase>).
    std::shared_ptr<T> op =
      std::make_shared<T>(tensors, std::forward<TArgs>(params)...);

    // TODO: Aim to be able to handle errors when returning without throw except
    return this->eval(op);
}
// Needed as otherwise it's not possible to use initializer lists
template<typename T, typename... TArgs>
std::shared_ptr<Sequence>
eval(std::shared_ptr<Algorithm> algorithm, TArgs&&... params)
{
    // The messages below previously said "record"; they now name eval so
    // that logs and compile errors point at the right entry point.
    KP_LOG_DEBUG("Kompute Sequence eval function started");

    static_assert(std::is_base_of<OpBase, T>::value,
                  "Kompute Sequence eval(...) template only valid with "
                  "OpBase derived classes");

    KP_LOG_DEBUG("Kompute Sequence creating OpBase derived class instance");

    // Build the operation from the algorithm plus any forwarded extra
    // arguments, then delegate to eval(std::shared_ptr<OpBase>).
    std::shared_ptr<T> op =
      std::make_shared<T>(algorithm, std::forward<TArgs>(params)...);

    return this->eval(op);
}
/**
* Eval Async sends all the recorded and stored operations in the vector of
* operations into the gpu as a submit job with a barrier. EvalAwait() must
* be called after to ensure the sequence is terminated correctly.
*
* @return shared_ptr<Sequence> of the Sequence class itself
*/
std::shared_ptr<Sequence> evalAsync();
/**
* Eval sends all the recorded and stored operations in the vector of
* operations into the gpu as a submit job with a barrier.
*
* @return shared_ptr<Sequence> of the Sequence class itself
*/
template<typename T, typename... TArgs>
std::shared_ptr<Sequence>
evalAsync(std::vector<std::shared_ptr<Tensor>> tensors, TArgs&&... params)
{
    // The messages below previously said "record"; they now name evalAsync
    // so that logs and compile errors point at the right entry point.
    KP_LOG_DEBUG("Kompute Sequence evalAsync function started");

    static_assert(std::is_base_of<OpBase, T>::value,
                  "Kompute Sequence evalAsync(...) template only valid with "
                  "OpBase derived classes");

    KP_LOG_DEBUG("Kompute Sequence creating OpBase derived class instance");

    std::shared_ptr<T> op =
      std::make_shared<T>(tensors, std::forward<TArgs>(params)...);

    // NOTE(review): relies on an evalAsync(std::shared_ptr<OpBase>) overload
    // that is not visible in this excerpt - confirm it is declared.
    return this->evalAsync(op);
}
// Needed as otherwise it's not possible to use initializer lists
template<typename T, typename... TArgs>
std::shared_ptr<Sequence>
evalAsync(std::shared_ptr<Algorithm> algorithm, TArgs&&... params)
{
    // The messages below previously said "record"; they now name evalAsync
    // so that logs and compile errors point at the right entry point.
    KP_LOG_DEBUG("Kompute Sequence evalAsync function started");

    static_assert(std::is_base_of<OpBase, T>::value,
                  "Kompute Sequence evalAsync(...) template only valid with "
                  "OpBase derived classes");

    KP_LOG_DEBUG("Kompute Sequence creating OpBase derived class instance");

    std::shared_ptr<T> op =
      std::make_shared<T>(algorithm, std::forward<TArgs>(params)...);

    // NOTE(review): relies on an evalAsync(std::shared_ptr<OpBase>) overload
    // that is not visible in this excerpt - confirm it is declared.
    return this->evalAsync(op);
}
/**
* Eval Await waits for the fence to finish processing and then once it
* finishes, it runs the postEval of all operations.
*
* @param waitFor Number of milliseconds to wait before timing out.
* @return shared_ptr<Sequence> of the Sequence class itself
*/
std::shared_ptr<Sequence> evalAwait(uint64_t waitFor = UINT64_MAX);
/**
* Clear function clears all operations currently recorded and starts recording again.
@ -64,32 +209,6 @@ class Sequence: public std::enable_shared_from_this<Sequence>
*/
void end();
/**
* Eval sends all the recorded and stored operations in the vector of
* operations into the gpu as a submit job with a barrier.
*
* @return shared_ptr<Sequence> of the Sequence class itself
*/
std::shared_ptr<Sequence> eval();
/**
* Eval Async sends all the recorded and stored operations in the vector of
* operations into the gpu as a submit job with a barrier. EvalAwait() must
* be called after to ensure the sequence is terminated correctly.
*
* @return shared_ptr<Sequence> of the Sequence class itself
*/
std::shared_ptr<Sequence> evalAsync();
/**
* Eval Await waits for the fence to finish processing and then once it
* finishes, it runs the postEval of all operations.
*
* @param waitFor Number of milliseconds to wait before timing out.
* @return shared_ptr<Sequence> of the Sequence class itself
*/
std::shared_ptr<Sequence> evalAwait(uint64_t waitFor = UINT64_MAX);
/**
* Returns true if the sequence is currently recording.
*
@ -97,6 +216,9 @@ class Sequence: public std::enable_shared_from_this<Sequence>
*/
bool isRecording();
bool isInit();
/**
* Returns true if the sequence is currently running - mostly used for async
* workloads.
@ -109,7 +231,7 @@ class Sequence: public std::enable_shared_from_this<Sequence>
* Destroys and frees the GPU resources which include the buffer and memory
* and sets the sequence as init=False.
*/
void freeMemoryDestroyGPUResources();
void destroy();
private:
// -------------- NEVER OWNED RESOURCES

View file

@ -59,7 +59,9 @@ class Tensor
/**
* Destroys and frees the GPU resources which include the buffer and memory.
*/
void freeMemoryDestroyGPUResources();
void destroy();
bool isInit();
/**
* Returns the vector of data currently contained by the Tensor. It is

View file

@ -17,8 +17,7 @@ class OpAlgoDispatch : public OpBase
{
public:
OpAlgoDispatch(const std::vector<std::shared_ptr<Tensor>>& tensors,
const std::shared_ptr<kp::Algorithm>& algorithm);
OpAlgoDispatch(const std::shared_ptr<kp::Algorithm>& algorithm);
/**
* Default destructor, which is in charge of destroying the algorithm
@ -50,7 +49,6 @@ class OpAlgoDispatch : public OpBase
private:
// -------------- ALWAYS OWNED RESOURCES
std::vector<std::shared_ptr<Tensor>> mTensors;
std::shared_ptr<Algorithm> mAlgorithm;
};

View file

@ -11,8 +11,7 @@ else()
endif()
file(GLOB test_kompute_CPP
"${CMAKE_CURRENT_SOURCE_DIR}/TestMain.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/TestWorkgroup.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/*.cpp"
)
add_executable(test_kompute ${test_kompute_CPP})

View file

@ -37,25 +37,32 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
}
)");
std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
std::vector<float> data(size, 0.0);
std::vector<float> resultSync(size, 100000000);
std::vector<float> resultAsync(size, 100000000);
kp::Manager mgr;
std::shared_ptr<kp::Sequence> sq = mgr.sequence();
std::vector<std::shared_ptr<kp::Tensor>> inputsSyncB;
std::vector<std::shared_ptr<kp::Algorithm>> algorithms;
for (uint32_t i = 0; i < numParallel; i++) {
inputsSyncB.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
inputsSyncB.push_back(mgr.tensor(data));
algorithms.push_back(mgr.algorithm({ inputsSyncB[i] }, spirv));
}
mgr.rebuild(inputsSyncB);
sq->eval<kp::OpTensorSyncDevice>(inputsSyncB);
mgr.sequence()->eval<kp::OpTensorSyncDevice>(inputsSyncB);
auto startSync = std::chrono::high_resolution_clock::now();
for (uint32_t i = 0; i < numParallel; i++) {
mgr.evalOpDefault<kp::OpAlgoCreate>(
{ inputsSyncB[i] }, kp::Shader::compile_source(shader));
sq->eval<kp::OpAlgoDispatch>(algorithms[i]);
}
auto endSync = std::chrono::high_resolution_clock::now();
@ -63,7 +70,7 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
std::chrono::duration_cast<std::chrono::microseconds>(endSync - startSync)
.count();
mgr.evalOpDefault<kp::OpTensorSyncLocal>(inputsSyncB);
sq->eval<kp::OpTensorSyncLocal>(inputsSyncB);
for (uint32_t i = 0; i < numParallel; i++) {
EXPECT_EQ(inputsSyncB[i]->data(), resultSync);
@ -74,26 +81,23 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
std::vector<std::shared_ptr<kp::Tensor>> inputsAsyncB;
for (uint32_t i = 0; i < numParallel; i++) {
inputsAsyncB.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
inputsAsyncB.push_back(mgr.tensor(data));
}
mgrAsync.rebuild(inputsAsyncB);
std::vector<std::shared_ptr<kp::Sequence>> sqs;
for (uint32_t i = 0; i < numParallel; i++) {
mgrAsync.sequence("async" + std::to_string(i), i);
sqs.push_back(mgrAsync.sequence(i));
}
auto startAsync = std::chrono::high_resolution_clock::now();
for (uint32_t i = 0; i < numParallel; i++) {
mgrAsync.evalOpAsync<kp::OpAlgoCreate>(
{ inputsAsyncB[i] },
"async" + std::to_string(i),
kp::Shader::compile_source(shader));
sqs[i]->evalAsync<kp::OpAlgoDispatch>(algorithms[i]);
}
for (uint32_t i = 0; i < numParallel; i++) {
mgrAsync.evalOpAwait("async" + std::to_string(i));
sqs[i]->evalAwait();
}
auto endAsync = std::chrono::high_resolution_clock::now();
@ -101,7 +105,7 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
endAsync - startAsync)
.count();
mgrAsync.evalOpDefault<kp::OpTensorSyncLocal>({ inputsAsyncB });
sq->eval<kp::OpTensorSyncLocal>({ inputsAsyncB });
for (uint32_t i = 0; i < numParallel; i++) {
EXPECT_EQ(inputsAsyncB[i]->data(), resultAsync);
@ -138,32 +142,32 @@ TEST(TestAsyncOperations, TestManagerAsyncExecution)
}
)");
std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
std::vector<float> data(size, 0.0);
std::vector<float> resultAsync(size, 100000000);
kp::Manager mgr;
std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor(data) };
std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor(data) };
std::shared_ptr<kp::Tensor> tensorA = mgr.tensor(data);
std::shared_ptr<kp::Tensor> tensorB = mgr.tensor(data);
mgr.sequence("asyncOne");
mgr.sequence("asyncTwo");
std::shared_ptr<kp::Sequence> sq1 = mgr.sequence();
std::shared_ptr<kp::Sequence> sq2 = mgr.sequence();
mgr.rebuild({ tensorA, tensorB });
sq1->eval<kp::OpTensorSyncLocal>({ tensorA, tensorB });
std::vector<uint32_t> result = kp::Shader::compile_source(shader);
std::shared_ptr<kp::Algorithm> algo1 = mgr.algorithm({tensorA});
std::shared_ptr<kp::Algorithm> algo2 = mgr.algorithm({tensorB});
mgr.evalOpAsync<kp::OpAlgoCreate>(
{ tensorA }, "asyncOne", kp::Shader::compile_source(shader));
sq1->evalAsync<kp::OpAlgoDispatch>(algo1);
sq2->evalAsync<kp::OpAlgoDispatch>(algo2);
mgr.evalOpAsync<kp::OpAlgoCreate>(
{ tensorB }, "asyncTwo", kp::Shader::compile_source(shader));
sq1->evalAwait();
sq2->evalAwait();
mgr.evalOpAwait("asyncOne");
mgr.evalOpAwait("asyncTwo");
mgr.evalOpAsyncDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
mgr.evalOpAwaitDefault();
sq1->evalAsync<kp::OpTensorSyncLocal>({ tensorA, tensorB });
sq1->evalAwait();
EXPECT_EQ(tensorA->data(), resultAsync);
EXPECT_EQ(tensorB->data(), resultAsync);

View file

@ -5,7 +5,7 @@
TEST(TestDestroy, TestDestroyTensorSingle)
{
std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor({ 0, 0, 0 }) };
std::shared_ptr<kp::Tensor> tensorA = nullptr;
std::string shader(R"(
#version 450
@ -16,37 +16,36 @@ TEST(TestDestroy, TestDestroyTensorSingle)
pa[index] = pa[index] + 1;
})");
std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
{
std::shared_ptr<kp::Sequence> sq = nullptr;
{
kp::Manager mgr;
mgr.rebuild({ tensorA });
tensorA = mgr.tensor({ 0, 0, 0 });
sq = mgr.sequence();
std::shared_ptr<kp::Algorithm> algo =
mgr.algorithm({ tensorA }, spirv);
sq->begin();
sq->record<kp::OpAlgoCreate>(
{ tensorA }, kp::Shader::compile_source(shader));
sq->end();
sq->eval();
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA });
mgr.destroy(tensorA);
mgr.sequence()
->record<kp::OpAlgoDispatch>(algo)
->eval()
->eval<kp::OpTensorSyncLocal>(algo->getTensors());
tensorA->destroy();
EXPECT_FALSE(tensorA->isInit());
}
EXPECT_FALSE(tensorA->isInit());
}
EXPECT_EQ(tensorA->data(), std::vector<float>({ 1, 1, 1 }));
}
TEST(TestDestroy, TestDestroyTensorVector)
{
std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor({ 1, 1, 1 }) };
std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor({ 1, 1, 1 }) };
std::shared_ptr<kp::Tensor> tensorA = nullptr;
std::shared_ptr<kp::Tensor> tensorB = nullptr;
std::string shader(R"(
#version 450
@ -58,6 +57,7 @@ TEST(TestDestroy, TestDestroyTensorVector)
pa[index] = pa[index] + 1;
pb[index] = pb[index] + 2;
})");
std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
{
std::shared_ptr<kp::Sequence> sq = nullptr;
@ -65,20 +65,20 @@ TEST(TestDestroy, TestDestroyTensorVector)
{
kp::Manager mgr;
mgr.rebuild({ tensorA, tensorB });
tensorA = mgr.tensor({ 1, 1, 1 });
tensorB = mgr.tensor({ 1, 1, 1 });
sq = mgr.sequence();
std::shared_ptr<kp::Algorithm> algo =
mgr.algorithm({tensorA, tensorB}, spirv);
sq->begin();
sq->record<kp::OpAlgoCreate>(
{ tensorA, tensorB }, kp::Shader::compile_source(shader));
sq->end();
mgr.sequence()
->record<kp::OpTensorSyncDevice>(algo->getTensors())
->record<kp::OpAlgoDispatch>(algo)
->record<kp::OpTensorSyncDevice>(algo->getTensors())
->eval();
sq->eval();
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
mgr.destroy({ tensorA, tensorB });
tensorA->destroy();
tensorB->destroy();
EXPECT_FALSE(tensorA->isInit());
EXPECT_FALSE(tensorB->isInit());
@ -88,32 +88,9 @@ TEST(TestDestroy, TestDestroyTensorVector)
EXPECT_EQ(tensorB->data(), std::vector<float>({ 3, 3, 3 }));
}
// Rebuilds two tensors through the manager, destroys them again without
// running any operation, and checks that both report isInit() == false
// while their host-side data is still the initial { 1, 1, 1 }.
TEST(TestDestroy, TestDestroyTensorVectorUninitialised)
{
    std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor({ 1, 1, 1 }) };
    std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor({ 1, 1, 1 }) };

    {
        kp::Manager mgr;

        mgr.rebuild({ tensorA, tensorB });

        mgr.destroy({ tensorA, tensorB });

        EXPECT_FALSE(tensorA->isInit());
        EXPECT_FALSE(tensorB->isInit());
    }

    // Checked after the manager has gone out of scope. The second assertion
    // previously re-checked tensorA, leaving tensorB unverified.
    EXPECT_EQ(tensorA->data(), std::vector<float>({ 1, 1, 1 }));
    EXPECT_EQ(tensorB->data(), std::vector<float>({ 1, 1, 1 }));
}
TEST(TestDestroy, TestDestroySequenceSingle)
{
std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor({ 0, 0, 0 }) };
std::shared_ptr<kp::Tensor> tensorA = nullptr;
std::string shader(R"(
#version 450
@ -124,26 +101,21 @@ TEST(TestDestroy, TestDestroySequenceSingle)
pa[index] = pa[index] + 1;
})");
std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
{
std::shared_ptr<kp::Sequence> sq = nullptr;
{
kp::Manager mgr;
mgr.rebuild({ tensorA });
tensorA = mgr.tensor({0, 0, 0});
sq = mgr.sequence();
sq->begin();
sq->record<kp::OpAlgoCreate>(
{ tensorA }, kp::Shader::compile_source(shader));
sq->end();
sq->eval();
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA });
mgr.destroy(sq);
mgr.sequence()
->record<kp::OpTensorSyncDevice>({tensorA})
->record<kp::OpAlgoDispatch>(mgr.algorithm({tensorA}, spirv))
->record<kp::OpTensorSyncLocal>({tensorA})
->eval();
EXPECT_FALSE(sq->isInit());
}
@ -151,220 +123,3 @@ TEST(TestDestroy, TestDestroySequenceSingle)
EXPECT_EQ(tensorA->data(), std::vector<float>({ 1, 1, 1 }));
}
// Runs the same "+1" shader on one tensor through two named sequences
// ("One" and "Two"), destroys both sequences with a single
// mgr.destroy({ ... }) call, and checks the tensor's host data once the
// manager has gone out of scope.
TEST(TestDestroy, TestDestroySequenceVector)
{
std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor({ 0, 0, 0 }) };
// GLSL compute shader that adds 1 to every element of the bound buffer.
std::string shader(R"(
#version 450
layout (local_size_x = 1) in;
layout(set = 0, binding = 0) buffer a { float pa[]; };
void main() {
uint index = gl_GlobalInvocationID.x;
pa[index] = pa[index] + 1;
})");
{
// Sequence handles are declared outside the manager's scope, so they
// are still alive when the manager is destroyed.
std::shared_ptr<kp::Sequence> sq1 = nullptr;
std::shared_ptr<kp::Sequence> sq2 = nullptr;
{
kp::Manager mgr;
mgr.rebuild({ tensorA });
// First sequence: record and evaluate the shader once.
sq1 = mgr.sequence("One");
sq1->begin();
sq1->record<kp::OpAlgoCreate>(
{ tensorA }, kp::Shader::compile_source(shader));
sq1->end();
sq1->eval();
// Second sequence: same shader on the same tensor.
sq2 = mgr.sequence("Two");
sq2->begin();
sq2->record<kp::OpAlgoCreate>(
{ tensorA }, kp::Shader::compile_source(shader));
sq2->end();
sq2->eval();
// presumably copies the device result back into tensorA's host data - confirm
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA });
mgr.destroy({ sq1, sq2 });
EXPECT_FALSE(sq1->isInit());
EXPECT_FALSE(sq2->isInit());
}
}
// Two evaluations of the "+1" shader on { 0, 0, 0 }.
EXPECT_EQ(tensorA->data(), std::vector<float>({ 2, 2, 2 }));
}
// Evaluates the "+1" shader on one tensor under the sequence names "one"
// and "two", then destroys each sequence by name with separate
// mgr.destroy(name) calls and checks the tensor's host data after the
// manager has gone out of scope.
TEST(TestDestroy, TestDestroySequenceNameSingleInsideManager)
{
std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor({ 0, 0, 0 }) };
// GLSL compute shader that adds 1 to every element of the bound buffer.
std::string shader(R"(
#version 450
layout (local_size_x = 1) in;
layout(set = 0, binding = 0) buffer a { float pa[]; };
void main() {
uint index = gl_GlobalInvocationID.x;
pa[index] = pa[index] + 1;
})");
{
kp::Manager mgr;
{
mgr.rebuild({ tensorA });
// Same shader evaluated once per named sequence.
mgr.evalOp<kp::OpAlgoCreate>(
{ tensorA }, "one",
kp::Shader::compile_source(shader));
mgr.evalOp<kp::OpAlgoCreate>(
{ tensorA }, "two",
kp::Shader::compile_source(shader));
// presumably copies the device result back into tensorA's host data - confirm
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA });
// Destroy the sequences one at a time, addressed by name.
mgr.destroy("one");
mgr.destroy("two");
}
}
// Two evaluations of the "+1" shader on { 0, 0, 0 }.
EXPECT_EQ(tensorA->data(), std::vector<float>({ 2, 2, 2 }));
}
// Records and evaluates the "+1" shader through a sequence named "One"
// whose handle is held outside the manager's scope, destroys the sequence
// by name, and checks that the held handle reports isInit() == false.
TEST(TestDestroy, TestDestroySequenceNameSingleOutsideManager)
{
std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor({ 0, 0, 0 }) };
// GLSL compute shader that adds 1 to every element of the bound buffer.
std::string shader(R"(
#version 450
layout (local_size_x = 1) in;
layout(set = 0, binding = 0) buffer a { float pa[]; };
void main() {
uint index = gl_GlobalInvocationID.x;
pa[index] = pa[index] + 1;
})");
{
// Declared before the manager so the handle is still alive when the
// manager is destroyed.
std::shared_ptr<kp::Sequence> sq1 = nullptr;
{
kp::Manager mgr;
mgr.rebuild({ tensorA });
sq1 = mgr.sequence("One");
sq1->begin();
sq1->record<kp::OpAlgoCreate>(
{ tensorA }, kp::Shader::compile_source(shader));
sq1->end();
sq1->eval();
// presumably copies the device result back into tensorA's host data - confirm
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA });
// Destroy via the manager using the name; the local handle is then checked.
mgr.destroy("One");
EXPECT_FALSE(sq1->isInit());
}
}
// One evaluation of the "+1" shader on { 0, 0, 0 }.
EXPECT_EQ(tensorA->data(), std::vector<float>({ 1, 1, 1 }));
}
// Evaluates the "+1" shader on one tensor under the sequence names "one"
// and "two", then destroys both sequences in one call by passing a vector
// of names to mgr.destroy, and checks the tensor's host data after the
// manager has gone out of scope.
TEST(TestDestroy, TestDestroySequenceNameVectorInsideManager)
{
std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor({ 0, 0, 0 }) };
// GLSL compute shader that adds 1 to every element of the bound buffer.
std::string shader(R"(
#version 450
layout (local_size_x = 1) in;
layout(set = 0, binding = 0) buffer a { float pa[]; };
void main() {
uint index = gl_GlobalInvocationID.x;
pa[index] = pa[index] + 1;
})");
{
kp::Manager mgr;
{
mgr.rebuild({ tensorA });
// Same shader evaluated once per named sequence.
mgr.evalOp<kp::OpAlgoCreate>(
{ tensorA }, "one",
kp::Shader::compile_source(shader));
mgr.evalOp<kp::OpAlgoCreate>(
{ tensorA }, "two",
kp::Shader::compile_source(shader));
// presumably copies the device result back into tensorA's host data - confirm
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA });
// Explicit std::vector<std::string> selects the vector-of-names destroy call.
mgr.destroy(std::vector<std::string>({"one", "two"}));
}
}
// Two evaluations of the "+1" shader on { 0, 0, 0 }.
EXPECT_EQ(tensorA->data(), std::vector<float>({ 2, 2, 2 }));
}
// Evaluates the "+1" shader on one tensor under the sequence names "one"
// and "two", then destroys both sequences in one call by passing a vector
// of names to mgr.destroy, and checks the tensor's host data after the
// manager has gone out of scope.
// NOTE(review): the body is identical to
// TestDestroySequenceNameVectorInsideManager; nothing here holds a sequence
// handle outside the manager as the test name suggests - confirm intent.
TEST(TestDestroy, TestDestroySequenceNameVectorOutsideManager)
{
std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor({ 0, 0, 0 }) };
// GLSL compute shader that adds 1 to every element of the bound buffer.
std::string shader(R"(
#version 450
layout (local_size_x = 1) in;
layout(set = 0, binding = 0) buffer a { float pa[]; };
void main() {
uint index = gl_GlobalInvocationID.x;
pa[index] = pa[index] + 1;
})");
{
kp::Manager mgr;
{
mgr.rebuild({ tensorA });
// Same shader evaluated once per named sequence.
mgr.evalOp<kp::OpAlgoCreate>(
{ tensorA }, "one",
kp::Shader::compile_source(shader));
mgr.evalOp<kp::OpAlgoCreate>(
{ tensorA }, "two",
kp::Shader::compile_source(shader));
// presumably copies the device result back into tensorA's host data - confirm
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA });
// Explicit std::vector<std::string> selects the vector-of-names destroy call.
mgr.destroy(std::vector<std::string>({"one", "two"}));
}
}
// Two evaluations of the "+1" shader on { 0, 0, 0 }.
EXPECT_EQ(tensorA->data(), std::vector<float>({ 2, 2, 2 }));
}
// Evaluates the "+1" shader once through evalOpDefault, destroys the
// sequence named KP_DEFAULT_SESSION, and checks the tensor's host data
// after the manager has gone out of scope.
TEST(TestDestroy, TestDestroySequenceNameDefaultOutsideManager)
{
std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor({ 0, 0, 0 }) };
// GLSL compute shader that adds 1 to every element of the bound buffer.
std::string shader(R"(
#version 450
layout (local_size_x = 1) in;
layout(set = 0, binding = 0) buffer a { float pa[]; };
void main() {
uint index = gl_GlobalInvocationID.x;
pa[index] = pa[index] + 1;
})");
{
kp::Manager mgr;
{
mgr.rebuild({ tensorA });
mgr.evalOpDefault<kp::OpAlgoCreate>(
{ tensorA },
kp::Shader::compile_source(shader));
// presumably copies the device result back into tensorA's host data - confirm
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA });
// presumably KP_DEFAULT_SESSION names the sequence evalOpDefault uses - confirm
mgr.destroy(KP_DEFAULT_SESSION);
}
}
// One evaluation of the "+1" shader on { 0, 0, 0 }.
EXPECT_EQ(tensorA->data(), std::vector<float>({ 1, 1, 1 }));
}

View file

@ -11,47 +11,40 @@ TEST(TestLogisticRegression, TestMainLogisticRegression)
uint32_t ITERATIONS = 100;
float learningRate = 0.1;
std::shared_ptr<kp::Tensor> xI{ new kp::Tensor({ 0, 1, 1, 1, 1 }) };
std::shared_ptr<kp::Tensor> xJ{ new kp::Tensor({ 0, 0, 0, 1, 1 }) };
std::shared_ptr<kp::Tensor> y{ new kp::Tensor({ 0, 0, 0, 1, 1 }) };
std::shared_ptr<kp::Tensor> wIn{ new kp::Tensor({ 0.001, 0.001 }) };
std::shared_ptr<kp::Tensor> wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
std::shared_ptr<kp::Tensor> wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
std::shared_ptr<kp::Tensor> bIn{ new kp::Tensor({ 0 }) };
std::shared_ptr<kp::Tensor> bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
std::shared_ptr<kp::Tensor> lOut{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
std::vector<std::shared_ptr<kp::Tensor>> params = { xI, xJ, y,
wIn, wOutI, wOutJ,
bIn, bOut, lOut };
{
kp::Manager mgr;
mgr.rebuild(params);
std::shared_ptr<kp::Tensor> xI = mgr.tensor({ 0, 1, 1, 1, 1 });
std::shared_ptr<kp::Tensor> xJ = mgr.tensor({ 0, 0, 0, 1, 1 });
std::shared_ptr<kp::Sequence> sq = mgr.sequence();
std::shared_ptr<kp::Tensor> y = mgr.tensor({ 0, 0, 0, 1, 1 });
// Record op algo base
sq->begin();
std::shared_ptr<kp::Tensor> wIn = mgr.tensor({ 0.001, 0.001 });
std::shared_ptr<kp::Tensor> wOutI = mgr.tensor({ 0, 0, 0, 0, 0 });
std::shared_ptr<kp::Tensor> wOutJ = mgr.tensor({ 0, 0, 0, 0, 0 });
sq->record<kp::OpTensorSyncDevice>({ wIn, bIn });
std::shared_ptr<kp::Tensor> bIn = mgr.tensor({ 0 });
std::shared_ptr<kp::Tensor> bOut = mgr.tensor({ 0, 0, 0, 0, 0 });
sq->record<kp::OpAlgoCreate>(
params,
std::vector<uint32_t>(
std::shared_ptr<kp::Tensor> lOut = mgr.tensor({ 0, 0, 0, 0, 0 });
std::vector<std::shared_ptr<kp::Tensor>> params = { xI, xJ, y,
wIn, wOutI, wOutJ,
bIn, bOut, lOut };
std::vector<uint32_t> spirv = std::vector<uint32_t>(
(uint32_t*)kp::shader_data::shaders_glsl_logisticregression_comp_spv,
(uint32_t*)(kp::shader_data::shaders_glsl_logisticregression_comp_spv +
kp::shader_data::shaders_glsl_logisticregression_comp_spv_len)),
kp::Workgroup(), kp::Constants({5.0}));
kp::shader_data::shaders_glsl_logisticregression_comp_spv_len));
sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
std::shared_ptr<kp::Algorithm> algorithm =
mgr.algorithm(params, spirv, kp::Workgroup(), kp::Constants({5.0}));
sq->end();
std::shared_ptr<kp::Sequence> sq =
mgr.sequence()
->record<kp::OpTensorSyncDevice>({ wIn, bIn })
->record<kp::OpAlgoDispatch>(algorithm)
->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
// Iterate across all expected iterations
for (size_t i = 0; i < ITERATIONS; i++) {
@ -64,21 +57,21 @@ TEST(TestLogisticRegression, TestMainLogisticRegression)
bIn->data()[0] -= learningRate * bOut->data()[j];
}
}
// Based on the inputs the outputs should be at least:
// * wi < 0.01
// * wj > 1.0
// * b < 0
// TODO: Add EXPECT_DOUBLE_EQ instead
EXPECT_LT(wIn->data()[0], 0.01);
EXPECT_GT(wIn->data()[1], 1.0);
EXPECT_LT(bIn->data()[0], 0.0);
KP_LOG_WARN("Result wIn i: {}, wIn j: {}, bIn: {}",
wIn->data()[0],
wIn->data()[1],
bIn->data()[0]);
}
// Based on the inputs the outputs should be at least:
// * wi < 0.01
// * wj > 1.0
// * b < 0
// TODO: Add EXPECT_DOUBLE_EQ instead
EXPECT_LT(wIn->data()[0], 0.01);
EXPECT_GT(wIn->data()[1], 1.0);
EXPECT_LT(bIn->data()[0], 0.0);
KP_LOG_WARN("Result wIn i: {}, wIn j: {}, bIn: {}",
wIn->data()[0],
wIn->data()[1],
bIn->data()[0]);
}
TEST(TestLogisticRegression, TestMainLogisticRegressionManualCopy)
@ -87,50 +80,43 @@ TEST(TestLogisticRegression, TestMainLogisticRegressionManualCopy)
uint32_t ITERATIONS = 100;
float learningRate = 0.1;
kp::Constants wInVec = { 0.001, 0.001 };
std::vector<float> bInVec = { 0 };
std::shared_ptr<kp::Tensor> xI{ new kp::Tensor({ 0, 1, 1, 1, 1 }) };
std::shared_ptr<kp::Tensor> xJ{ new kp::Tensor({ 0, 0, 0, 1, 1 }) };
std::shared_ptr<kp::Tensor> y{ new kp::Tensor({ 0, 0, 0, 1, 1 }) };
std::shared_ptr<kp::Tensor> wIn{ new kp::Tensor(
wInVec, kp::Tensor::TensorTypes::eHost) };
std::shared_ptr<kp::Tensor> wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
std::shared_ptr<kp::Tensor> wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
std::shared_ptr<kp::Tensor> bIn{ new kp::Tensor(
bInVec, kp::Tensor::TensorTypes::eHost) };
std::shared_ptr<kp::Tensor> bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
std::shared_ptr<kp::Tensor> lOut{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
std::vector<std::shared_ptr<kp::Tensor>> params = { xI, xJ, y,
wIn, wOutI, wOutJ,
bIn, bOut, lOut };
{
kp::Manager mgr;
mgr.rebuild(params);
std::shared_ptr<kp::Tensor> xI = mgr.tensor({ 0, 1, 1, 1, 1 });
std::shared_ptr<kp::Tensor> xJ = mgr.tensor({ 0, 0, 0, 1, 1 });
std::shared_ptr<kp::Sequence> sq = mgr.sequence();
std::shared_ptr<kp::Tensor> y = mgr.tensor({ 0, 0, 0, 1, 1 });
// Record op algo base
sq->begin();
std::shared_ptr<kp::Tensor> wIn = mgr.tensor(
{ 0.001, 0.001 }, kp::Tensor::TensorTypes::eHost);
std::shared_ptr<kp::Tensor> wOutI = mgr.tensor({ 0, 0, 0, 0, 0 });
std::shared_ptr<kp::Tensor> wOutJ = mgr.tensor({ 0, 0, 0, 0, 0 });
sq->record<kp::OpAlgoCreate>(
params,
std::vector<uint32_t>(
std::shared_ptr<kp::Tensor> bIn = mgr.tensor(
{ 0 },
kp::Tensor::TensorTypes::eHost);
std::shared_ptr<kp::Tensor> bOut = mgr.tensor({ 0, 0, 0, 0, 0 });
std::shared_ptr<kp::Tensor> lOut = mgr.tensor({ 0, 0, 0, 0, 0 });
std::vector<std::shared_ptr<kp::Tensor>> params = { xI, xJ, y,
wIn, wOutI, wOutJ,
bIn, bOut, lOut };
std::vector<uint32_t> spirv = std::vector<uint32_t>(
(uint32_t*)kp::shader_data::shaders_glsl_logisticregression_comp_spv,
(uint32_t*)(kp::shader_data::shaders_glsl_logisticregression_comp_spv +
kp::shader_data::shaders_glsl_logisticregression_comp_spv_len)),
kp::Workgroup(), kp::Constants({5.0}));
kp::shader_data::shaders_glsl_logisticregression_comp_spv_len));
sq->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
std::shared_ptr<kp::Algorithm> algorithm =
mgr.algorithm(params, spirv, kp::Workgroup(), kp::Constants({5.0}));
sq->end();
std::shared_ptr<kp::Sequence> sq =
mgr.sequence()
->record<kp::OpTensorSyncDevice>({ wIn, bIn })
->record<kp::OpAlgoDispatch>(algorithm)
->record<kp::OpTensorSyncLocal>({ wOutI, wOutJ, bOut, lOut });
// Iterate across all expected iterations
for (size_t i = 0; i < ITERATIONS; i++) {
@ -145,7 +131,6 @@ TEST(TestLogisticRegression, TestMainLogisticRegressionManualCopy)
wIn->mapDataIntoHostMemory();
bIn->mapDataIntoHostMemory();
}
}
// Based on the inputs the outputs should be at least:
// * wi < 0.01
@ -160,4 +145,5 @@ TEST(TestLogisticRegression, TestMainLogisticRegressionManualCopy)
wIn->data()[0],
wIn->data()[1],
bIn->data()[0]);
}
}

View file

@ -3,9 +3,6 @@
#include "kompute/Kompute.hpp"
#include "kompute_test/shaders/shadertest_workgroup.hpp"
TEST(TestWorkgroup, TestSimpleWorkgroup)
{
std::shared_ptr<kp::Tensor> tensorA = nullptr;
@ -31,9 +28,9 @@ TEST(TestWorkgroup, TestSimpleWorkgroup)
std::shared_ptr<kp::Algorithm> algorithm = mgr.algorithm(params, spirv, workgroup);
sq = mgr.sequence();
sq->record(std::make_shared<kp::OpTensorSyncDevice>(params));
sq->record(std::make_shared<kp::OpAlgoDispatch>(params, algorithm));
sq->record(std::make_shared<kp::OpTensorSyncLocal>(params));
sq->record<kp::OpTensorSyncDevice>(params);
sq->record<kp::OpAlgoDispatch>(params, algorithm);
sq->record<kp::OpTensorSyncLocal>(params);
sq->eval();
}
}