Updated documentation to reflect updated interface
This commit is contained in:
parent
6cbbb48827
commit
6c69d832d3
4 changed files with 65 additions and 236 deletions
75
README.md
75
README.md
|
|
@ -52,17 +52,50 @@ int main() {
|
|||
|
||||
kp::Manager mgr; // Automatically selects Device 0
|
||||
|
||||
std::shared_ptr<kp::Tensor> tensorLHS{ new kp::Tensor({ 0.0, 1.0, 2.0 }) };
|
||||
mgr.evalOp<kp::OpCreateTensor>({ tensorLHS });
|
||||
auto tensorLhs = std::make_shared<kp::Tensor>(kp::Tensor({ 0, 1, 2 }));
|
||||
auto tensorRhs = std::make_shared<kp::Tensor>(kp::Tensor({ 2, 4, 6 }));
|
||||
auto tensorOut = std::make_shared<kp::Tensor>(kp::Tensor({ 0, 0, 0 }));
|
||||
|
||||
std::shared_ptr<kp::Tensor> tensorRHS{ new kp::Tensor( { 2.0, 4.0, 6.0 }) };
|
||||
mgr.evalOp<kp::OpCreateTensor>({ tensorRHS });
|
||||
auto params = std::vector<kp::Tensor>({ tensorLhs, tensorRhs, tensorOut })
|
||||
|
||||
// TODO: Add capabilities for just output tensor types
|
||||
std::shared_ptr<kp::Tensor> tensorOutput{ new kp::Tensor({ 0.0, 0.0, 0.0 }) };
|
||||
mgr.evalOp<kp::OpCreateTensor>({ tensorOutput });
|
||||
// Create tensor data in GPU
|
||||
mgr.evalOp<kp::OpCreateTensor>(params);
|
||||
|
||||
mgr.evalOp<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
// Run Kompute operation on the parameters provided with dispatch layout
|
||||
mgr.evalOp<kp::OpAlgoShader<10, 1, 1>>(params, "path/to/shader.comp.spv");
|
||||
|
||||
// Print the output
|
||||
std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl;
|
||||
}
|
||||
```
|
||||
|
||||
Create your own operations with full control over each of the steps.
|
||||
|
||||
```c++
|
||||
template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
|
||||
class OpCustom : public OpAlgoBase<tX, tY, tZ> {
|
||||
// ...
|
||||
OpCustom(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors)
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true)
|
||||
{
|
||||
// ... extra steps to perform custom setup
|
||||
this->mOptSpirvBinPath = "shaders/glsl/opmult.comp.spv";
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
kp::Manager mgr; // Automatically selects Device 0
|
||||
|
||||
// Create parameters but don't initialise them if the custom operation performs the relevant steps itself
|
||||
auto tensorLhs = std::make_shared<kp::Tensor>(kp::Tensor({ 0, 1, 2 }));
|
||||
auto tensorRhs = std::make_shared<kp::Tensor>(kp::Tensor({ 2, 4, 6 }));
|
||||
auto tensorOut = std::make_shared<kp::Tensor>(kp::Tensor({ 0, 0, 0 }));
|
||||
|
||||
// Pass parameters to custom operation which performs relevant steps
|
||||
mgr.evalOp<kp::OpCustom>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
|
||||
std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl;
|
||||
}
|
||||
|
|
@ -72,6 +105,7 @@ Record commands in a single submit by using a Sequence to send in batch to GPU.
|
|||
|
||||
```c++
|
||||
int main() {
|
||||
|
||||
kp::Manager mgr;
|
||||
|
||||
std::shared_ptr<kp::Tensor> tensorLHS{ new kp::Tensor({ 0.0, 1.0, 2.0 }) };
|
||||
|
|
@ -90,8 +124,10 @@ int main() {
|
|||
|
||||
sq.record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
}
|
||||
|
||||
// Stop recording
|
||||
sq.end();
|
||||
|
||||
// Submit operations to GPU
|
||||
sq.eval();
|
||||
|
||||
|
|
@ -99,29 +135,6 @@ int main() {
|
|||
}
|
||||
```
|
||||
|
||||
Create your own custom operations to leverage Vulkan Compute for your specialised use-cases.
|
||||
|
||||
```c++
|
||||
class OpCustom : kp::OpBase {
|
||||
// ...
|
||||
void init(std::shared_ptr<Tensor> tensors) {
|
||||
// ... extra steps to initialise tensors
|
||||
this->mAlgorithm->init("path/to/your/shader.compute.spv", tensors);
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
kp::Manager mgr; // Automatically selects Device 0
|
||||
|
||||
std::shared_ptr<kp::Tensor> tensor{ new kp::Tensor({ 0.0, 1.0, 2.0 }) };
|
||||
mgr.evalOp<kp::OpCreateTensor>({ tensorLHS });
|
||||
|
||||
mgr.evalOp<kp::OpCustom>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
|
||||
std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl;
|
||||
}
|
||||
```
|
||||
|
||||
## Motivations
|
||||
|
||||
Vulkan Kompute was created after identifying the challenge most GPU processing projects with Vulkan undergo - namely having to build extensive boilerplate for Vulkan and create abstractions and interfaces that expose the core compute capabilities. It is only after a few thousand lines of code that it's possible to start building the application-specific logic.
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@
|
|||
#include "kompute/operations/OpBase.hpp"
|
||||
#include "kompute/operations/OpAlgoBase.hpp"
|
||||
#include "kompute/operations/OpAlgoLhsRhsOut.hpp"
|
||||
#include "kompute/operations/OpAlgoAllInOut.hpp"
|
||||
#include "kompute/operations/OpMult.hpp"
|
||||
#include "kompute/operations/OpCreateTensor.hpp"
|
||||
#include "kompute/Algorithm.hpp"
|
||||
|
|
|
|||
|
|
@ -526,7 +526,8 @@ class Sequence
|
|||
* not be able to add the operation.
|
||||
*
|
||||
* @param tensors Vector of tensors to use for the operation
|
||||
* @param TArgs Template parameters that are used to initialise operation which allows for extensible configurations on initialisation.
|
||||
* @param TArgs Template parameters that are used to initialise operation
|
||||
* which allows for extensible configurations on initialisation.
|
||||
*/
|
||||
template<typename T, typename... TArgs>
|
||||
bool record(std::vector<std::shared_ptr<Tensor>> tensors, TArgs&&... params)
|
||||
|
|
@ -655,7 +656,8 @@ class Manager
|
|||
*
|
||||
* @param tensors The tensors to be used in the operation recorded
|
||||
* @param sequenceName The name of the sequence to be retrieved or created
|
||||
* @param TArgs Template parameters that will be used to initialise Operation to allow for extensible configurations on initialisation
|
||||
* @param TArgs Template parameters that will be used to initialise
|
||||
* Operation to allow for extensible configurations on initialisation
|
||||
*/
|
||||
template<typename T, typename... TArgs>
|
||||
void evalOp(std::vector<std::shared_ptr<Tensor>> tensors,
|
||||
|
|
@ -801,6 +803,18 @@ namespace kp {
|
|||
/**
|
||||
* Operation that provides a general abstraction that simplifies the use of
|
||||
* algorithm and parameter components which can be used with shaders.
|
||||
* By default it enables the user to provide a dynamic number of tensors
|
||||
* which are then passed as inputs.
|
||||
*
|
||||
* All of these tensors are expected to be initialised; this is checked in the init function, which throws a std exception otherwise.
|
||||
*
|
||||
* It is possible to also choose if the user requires all of the tensors to be
|
||||
* copied from device memory to their host data. This can be disabled by either
|
||||
* passing the copyOutputData constructor parameter and/or by overriding the
|
||||
* functions to carry out copy commands accordingly.
|
||||
*
|
||||
* See OpAlgoLhsRhsOut for an example implementation with a more specific granularity of tensor parameters.
|
||||
*
|
||||
* The template parameters specify the processing GPU layout number of
|
||||
* iterations for each x, y, z parameter. More specifically, this will be the
|
||||
* input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t tZ)"
|
||||
|
|
@ -1267,203 +1281,6 @@ OpAlgoLhsRhsOut<tX, tY, tZ>::postSubmit()
|
|||
|
||||
#include <fstream>
|
||||
|
||||
namespace kp {
|
||||
|
||||
/**
|
||||
* Operation base class to simplify the creation of operations that require
|
||||
* an unknown number of tensors, all of which are expected to be
|
||||
* Device storage tensors with the data already stored. All the tensors
|
||||
* will also be used as outputs so the data will be copied from the device
|
||||
* into the respective tensors.
|
||||
* The template parameters specify the processing GPU layout number of
|
||||
* iterations for each x, y, z parameter. More specifically, this will be the
|
||||
* input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t tZ)"
|
||||
*/
|
||||
template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
|
||||
class OpAlgoAllInOut : public OpAlgoBase<tX, tY, tZ>
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Base constructor, should not be used unless explicitly intended.
|
||||
*/
|
||||
OpAlgoAllInOut();
|
||||
|
||||
/**
|
||||
* Default constructor with parameters that provides the bare minimum
|
||||
* requirements for the operations to be able to create and manage their
|
||||
* sub-components.
|
||||
*
|
||||
* @param physicalDevice Vulkan physical device used to find device queues
|
||||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param freeTensors Whether operation manages the memory of the Tensors
|
||||
*/
|
||||
OpAlgoAllInOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors);
|
||||
|
||||
/**
|
||||
* Default destructor, which is in charge of destroying the algorithm
|
||||
* components but does not destroy the underlying tensors
|
||||
*/
|
||||
~OpAlgoAllInOut();
|
||||
|
||||
/**
|
||||
* The init function is responsible for ensuring that all of the tensors
|
||||
* passed into the function have been initialised and are of type Device.
|
||||
* This is required as the parameters provided are expected to be
|
||||
* used as storage buffers, as well as output buffers, so the data will
|
||||
* be transferred out from the Device into the Tensors replacing existing
|
||||
* data.
|
||||
*/
|
||||
void init() override;
|
||||
|
||||
/**
|
||||
* This records the commands that are to be sent to the GPU. This includes
|
||||
* the barriers that ensure the memory has been copied before going in and
|
||||
* out of the shader, as well as the dispatch operation that sends the
|
||||
* shader processing to the gpu. This function also records the GPU memory
|
||||
* copy of the output data for the staging buffer so it can be read by the
|
||||
* host.
|
||||
*/
|
||||
void record() override;
|
||||
|
||||
/**
|
||||
* Executes after the recorded commands are submitted, and performs a copy
|
||||
* of the GPU Device memory into the staging buffer so the output data can
|
||||
* be retrieved.
|
||||
*/
|
||||
void postSubmit() override;
|
||||
|
||||
protected:
|
||||
// -------------- ALWAYS OWNED RESOURCES
|
||||
std::vector<std::shared_ptr<Tensor>> mOutputStagingTensors; ///< Array of output staging tensors which will be expected to be the same size as the number of inputs.
|
||||
};
|
||||
|
||||
} // End namespace kp
|
||||
|
||||
// Including implementation for template class
|
||||
#ifndef OPALGOALLINOUT_CPP
|
||||
#define OPALGOALLINOUT_CPP
|
||||
|
||||
namespace kp {
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoAllInOut<tX, tY, tZ>::OpAlgoAllInOut()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoAllInOut constructor base");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoAllInOut<tX, tY, tZ>::OpAlgoAllInOut(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors)
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoAllInOut constructor with params");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
OpAlgoAllInOut<tX, tY, tZ>::~OpAlgoAllInOut()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoAllInOut destructor started");
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoAllInOut destroying staging tensors");
|
||||
for (std::shared_ptr<Tensor> stagingTensor : this->mOutputStagingTensors) {
|
||||
stagingTensor->freeMemoryDestroyGPUResources();
|
||||
}
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoAllInOut<tX, tY, tZ>::init()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoAllInOut init called");
|
||||
|
||||
if (this->mTensors.size() < 1) {
|
||||
throw std::runtime_error(
|
||||
"Kompute OpAlgoAllInOut called with less than 1 tensor");
|
||||
}
|
||||
|
||||
for (std::shared_ptr<Tensor> tensor : this->mTensors) {
|
||||
if(!tensor->isInit()) {
|
||||
throw std::runtime_error("Kompute OpAlgoAllInOut validation failed; all tensor parameters must be initialised.");
|
||||
}
|
||||
}
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoAllInOut creating staging output tensors");
|
||||
|
||||
for (std::shared_ptr<Tensor> tensor : this->mTensors) {
|
||||
std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
|
||||
tensor->data(), Tensor::TensorTypes::eStaging);
|
||||
stagingTensor->init(
|
||||
this->mPhysicalDevice, this->mDevice, this->mCommandBuffer);
|
||||
this->mOutputStagingTensors.push_back(stagingTensor);
|
||||
}
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoAllInOut fetching spirv data");
|
||||
|
||||
std::vector<char>& shaderFileData = this->fetchSpirvBinaryData();
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoAllInOut Initialising algorithm component");
|
||||
|
||||
this->mAlgorithm->init(shaderFileData, this->mTensors);
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoAllInOut<tX, tY, tZ>::record()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoAllInOut record called");
|
||||
|
||||
// Barrier to ensure the data is finished writing to buffer memory
|
||||
for (std::shared_ptr<Tensor> tensor : this->mTensors) {
|
||||
tensor->recordBufferMemoryBarrier(
|
||||
vk::AccessFlagBits::eHostWrite,
|
||||
vk::AccessFlagBits::eShaderRead,
|
||||
vk::PipelineStageFlagBits::eHost,
|
||||
vk::PipelineStageFlagBits::eComputeShader);
|
||||
}
|
||||
|
||||
this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
|
||||
|
||||
// Barrier to ensure the shader code is executed before buffer read
|
||||
for (std::shared_ptr<Tensor> tensor : this->mTensors) {
|
||||
tensor->recordBufferMemoryBarrier(
|
||||
vk::AccessFlagBits::eShaderWrite,
|
||||
vk::AccessFlagBits::eTransferRead,
|
||||
vk::PipelineStageFlagBits::eComputeShader,
|
||||
vk::PipelineStageFlagBits::eTransfer);
|
||||
}
|
||||
|
||||
// Record copy from and create barrier for STAGING tensors
|
||||
for (std::shared_ptr<Tensor> stagingTensor : this->mOutputStagingTensors) {
|
||||
stagingTensor->recordCopyFrom(this->mTensorOutput, true);
|
||||
}
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoAllInOut<tX, tY, tZ>::postSubmit()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoAllInOut postSubmit called");
|
||||
|
||||
for (size_t i = 0; i < this->mTensors.size(); i++) {
|
||||
this->mOutputStagingTensors[i]->mapDataFromHostMemory();
|
||||
|
||||
this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif // #ifndef OPALGOALLINOUT_CPP
|
||||
|
||||
#include <fstream>
|
||||
|
||||
#if RELEASE
|
||||
|
||||
#endif
|
||||
|
|
@ -1477,7 +1294,7 @@ namespace kp {
|
|||
* input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t tZ)"
|
||||
*/
|
||||
template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
|
||||
class OpMult : public OpAlgoLhsRhsOut<tX, tY, tZ>
|
||||
class OpMult : public OpAlgoBase<tX, tY, tZ>
|
||||
{
|
||||
public:
|
||||
/**
|
||||
|
|
@ -1502,7 +1319,7 @@ class OpMult : public OpAlgoLhsRhsOut<tX, tY, tZ>
|
|||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors)
|
||||
: OpAlgoLhsRhsOut<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpMult constructor with params");
|
||||
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@
|
|||
#include "kompute/Algorithm.hpp"
|
||||
#include "kompute/Tensor.hpp"
|
||||
|
||||
#include "kompute/operations/OpAlgoLhsRhsOut.hpp"
|
||||
#include "kompute/operations/OpAlgoBase.hpp"
|
||||
|
||||
namespace kp {
|
||||
|
||||
|
|
@ -22,7 +22,7 @@ namespace kp {
|
|||
* input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t tZ)"
|
||||
*/
|
||||
template<uint32_t tX = 0, uint32_t tY = 0, uint32_t tZ = 0>
|
||||
class OpMult : public OpAlgoLhsRhsOut<tX, tY, tZ>
|
||||
class OpMult : public OpAlgoBase<tX, tY, tZ>
|
||||
{
|
||||
public:
|
||||
/**
|
||||
|
|
@ -47,7 +47,7 @@ class OpMult : public OpAlgoLhsRhsOut<tX, tY, tZ>
|
|||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors)
|
||||
: OpAlgoLhsRhsOut<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpMult constructor with params");
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue