Updated OpAlgoBase to not copy data as optensorsync operations are introduced
This commit is contained in:
parent
4171786b6f
commit
9f8508075a
10 changed files with 92 additions and 180 deletions
31
README.md
31
README.md
|
|
@ -61,8 +61,11 @@ int main() {
|
|||
kp::Manager mgr; // Selects device 0 unless explicitly requested
|
||||
|
||||
// Creates tensors and initializes GPU memory (below we show more granularity)
|
||||
auto tensorA = mgr.buildTensor({ 3, 4, 5 });
|
||||
auto tensorB = mgr.buildTensor({ 0, 0, 0 });
|
||||
auto tensorA = std::make_shared<kp::Tensor>(kp::Tensor({ 3., 4., 5. }));
|
||||
auto tensorB = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 0., 0. }));
|
||||
|
||||
// Create tensors data explicitly in GPU with an operation
|
||||
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
|
||||
|
||||
// Define your shader as a string (using string literals for simplicity)
|
||||
// (You can also pass the raw compiled bytes, or even path to file)
|
||||
|
|
@ -82,11 +85,13 @@ int main() {
|
|||
)");
|
||||
|
||||
// Run Kompute operation on the parameters provided with dispatch layout
|
||||
mgr.evalOpDefault<kp::OpMult<3, 1, 1>>(
|
||||
mgr.evalOpDefault<kp::OpAlgoBase<3, 1, 1>>(
|
||||
{ tensorA, tensorB },
|
||||
true, // Whether to retrieve the output from GPU memory
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
// Sync the GPU memory back to the local tensor
|
||||
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
|
||||
|
||||
// Prints the output which is A: { 0, 1, 2 } B: { 3, 4, 5 }
|
||||
std::cout << fmt::format("A: {}, B: {}",
|
||||
tensorA.data(), tensorB.data()) << std::endl;
|
||||
|
|
@ -107,7 +112,7 @@ class OpMyCustom : public OpAlgoBase<tX, tY, tZ>
|
|||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>> tensors)
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true, "")
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
|
||||
{
|
||||
// Perform your custom steps such as reading from a shader file
|
||||
this->mShaderFilePath = "shaders/glsl/opmult.comp";
|
||||
|
|
@ -144,7 +149,7 @@ int main() {
|
|||
kp::Manager mgr;
|
||||
|
||||
std::shared_ptr<kp::Tensor> tensorLHS{ new kp::Tensor({ 1., 1., 1. }) };
|
||||
std::shared_ptr<kp::Tensor> tensorRHS{ new kp::Tensor( { 2., 2., 2. }) };
|
||||
std::shared_ptr<kp::Tensor> tensorRHS{ new kp::Tensor({ 2., 2., 2. }) };
|
||||
std::shared_ptr<kp::Tensor> tensorOutput{ new kp::Tensor({ 0., 0., 0. }) };
|
||||
|
||||
// Create all the tensors in memory
|
||||
|
|
@ -159,17 +164,23 @@ int main() {
|
|||
sq.begin();
|
||||
|
||||
// Record batch commands to send to GPU
|
||||
sq.record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
sq.record<kp::OpTensorCopy>({tensorOutput, tensorLHS, tensorRHS});
|
||||
sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
sq->record<kp::OpTensorCopy>({tensorOutput, tensorLHS, tensorRHS});
|
||||
|
||||
// Stop recording
|
||||
sq.end();
|
||||
sq->end();
|
||||
|
||||
// Submit multiple batch operations to GPU
|
||||
size_t ITERATIONS = 5;
|
||||
for (size_t i = 0; i < ITERATIONS; i++) {
|
||||
sq.eval();
|
||||
sq->eval();
|
||||
}
|
||||
|
||||
// Sync GPU memory back to local tensor
|
||||
sq->begin();
|
||||
sq->record<kp::OpTensorSyncLocal>({tensorOutput});
|
||||
sq->end();
|
||||
sq->eval();
|
||||
}
|
||||
|
||||
// Print the output which iterates through OpMult 5 times
|
||||
|
|
|
|||
|
|
@ -971,11 +971,6 @@ namespace kp {
|
|||
*
|
||||
* All of these tensors are expected to be initialised, and this is checked (by throwing a std exception) in the init function.
|
||||
*
|
||||
* It is possible to also choose if the user requires all of the tensors to be
|
||||
* copied from device memory to their host data. This can be disabled by either
|
||||
* passing the copyOutputData constructor parameter and/or by overriding the
|
||||
* functions to carry out copy commands accordingly.
|
||||
*
|
||||
* See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
|
||||
*
|
||||
* The template parameters specify the processing GPU layout number of
|
||||
|
|
@ -1000,14 +995,12 @@ class OpAlgoBase : public OpBase
|
|||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
|
||||
* @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
|
||||
*/
|
||||
OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
bool copyOutputData);
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors);
|
||||
|
||||
/**
|
||||
* Constructor that enables a file to be passed to the operation with
|
||||
|
|
@ -1018,14 +1011,12 @@ class OpAlgoBase : public OpBase
|
|||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
|
||||
* @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
|
||||
*/
|
||||
OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
bool copyOutputData,
|
||||
std::string shaderFilePath);
|
||||
|
||||
/**
|
||||
|
|
@ -1036,14 +1027,12 @@ class OpAlgoBase : public OpBase
|
|||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
|
||||
* @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
|
||||
*/
|
||||
OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
bool copyOutputData,
|
||||
const std::vector<char>& shaderDataRaw);
|
||||
|
||||
/**
|
||||
|
|
@ -1090,8 +1079,6 @@ class OpAlgoBase : public OpBase
|
|||
bool mFreeAlgorithm = false;
|
||||
|
||||
// -------------- ALWAYS OWNED RESOURCES
|
||||
std::vector<std::shared_ptr<Tensor>> mOutputStagingTensors; ///< Array of output staging tensors which will be expected to be the same size as the number of inputs.
|
||||
bool mCopyOutputData; ///< Configuration parameter which states whether data will be copied back to all provided tensors for convenience. This can be deactivated by setting this flag and or overriding the functions provided.
|
||||
|
||||
uint32_t mX;
|
||||
uint32_t mY;
|
||||
|
|
@ -1121,11 +1108,10 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
|||
OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
bool copyOutputData)
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors)
|
||||
: OpBase(physicalDevice, device, commandBuffer, tensors, false)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {} copyOutputData: {}, shaderFilePath: {}", tensors.size(), copyOutputData);
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {} , shaderFilePath: {}", tensors.size());
|
||||
|
||||
// The dispatch size is set up based on either explicitly provided template
|
||||
// parameters or by default it would take the shape and size of the tensors
|
||||
|
|
@ -1145,8 +1131,6 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
|
|||
this->mY,
|
||||
this->mZ);
|
||||
|
||||
this->mCopyOutputData = copyOutputData;
|
||||
|
||||
this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
|
||||
}
|
||||
|
||||
|
|
@ -1155,9 +1139,8 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
|
|||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
bool copyOutputData,
|
||||
std::string shaderFilePath)
|
||||
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors, copyOutputData)
|
||||
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
|
||||
|
||||
|
|
@ -1169,9 +1152,8 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
|
|||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
bool copyOutputData,
|
||||
const std::vector<char>& shaderDataRaw)
|
||||
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors, copyOutputData)
|
||||
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
|
||||
|
||||
|
|
@ -1182,13 +1164,6 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
|||
OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
|
||||
|
||||
if (this->mCopyOutputData) {
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase destroying staging tensors");
|
||||
for (std::shared_ptr<Tensor> stagingTensor : this->mOutputStagingTensors) {
|
||||
stagingTensor->freeMemoryDestroyGPUResources();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
|
|
@ -1208,18 +1183,6 @@ OpAlgoBase<tX, tY, tZ>::init()
|
|||
}
|
||||
}
|
||||
|
||||
if (this->mCopyOutputData) {
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase creating staging output tensors");
|
||||
|
||||
for (std::shared_ptr<Tensor> tensor : this->mTensors) {
|
||||
std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
|
||||
tensor->data(), Tensor::TensorTypes::eStaging);
|
||||
stagingTensor->init(
|
||||
this->mPhysicalDevice, this->mDevice);
|
||||
this->mOutputStagingTensors.push_back(stagingTensor);
|
||||
}
|
||||
}
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
|
||||
|
||||
std::vector<char>& shaderFileData = this->fetchSpirvBinaryData();
|
||||
|
|
@ -1246,27 +1209,6 @@ OpAlgoBase<tX, tY, tZ>::record()
|
|||
}
|
||||
|
||||
this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
|
||||
|
||||
if (this->mCopyOutputData) {
|
||||
// Barrier to ensure the shader code is executed before buffer read
|
||||
for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
|
||||
tensor->recordBufferMemoryBarrier(
|
||||
this->mCommandBuffer,
|
||||
vk::AccessFlagBits::eShaderWrite,
|
||||
vk::AccessFlagBits::eTransferRead,
|
||||
vk::PipelineStageFlagBits::eComputeShader,
|
||||
vk::PipelineStageFlagBits::eTransfer);
|
||||
}
|
||||
|
||||
// Record copy from and create barrier for STAGING tensors
|
||||
// TODO: This only accounts for device tensors need to account for staging and storage
|
||||
for (size_t i = 0; i < this->mTensors.size(); i++) {
|
||||
this->mOutputStagingTensors[i]->recordCopyFrom(
|
||||
this->mCommandBuffer,
|
||||
this->mTensors[i],
|
||||
true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
|
|
@ -1281,14 +1223,6 @@ void
|
|||
OpAlgoBase<tX, tY, tZ>::postEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
|
||||
|
||||
if (this->mCopyOutputData) {
|
||||
for (size_t i = 0; i < this->mTensors.size(); i++) {
|
||||
this->mOutputStagingTensors[i]->mapDataFromHostMemory();
|
||||
|
||||
this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
|
|
@ -1429,7 +1363,7 @@ OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice>
|
|||
// The inheritance is initialised with the copyOutputData to false given that
|
||||
// this dependent class handles the transfer of data via staging buffers in
|
||||
// a granular way.
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, false)
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
|
||||
}
|
||||
|
|
@ -1575,7 +1509,7 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
|
|||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>> tensors)
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true, "")
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpMult constructor with params");
|
||||
|
||||
|
|
|
|||
|
|
@ -21,11 +21,6 @@ namespace kp {
|
|||
*
|
||||
* All of these tensors are expected to be initialised, and this is checked (by throwing a std exception) in the init function.
|
||||
*
|
||||
* It is possible to also choose if the user requires all of the tensors to be
|
||||
* copied from device memory to their host data. This can be disabled by either
|
||||
* passing the copyOutputData constructor parameter and/or by overriding the
|
||||
* functions to carry out copy commands accordingly.
|
||||
*
|
||||
* See OpLhsRhsOut for an example implementation on a more specific granularity on tensor parameters.
|
||||
*
|
||||
* The template parameters specify the processing GPU layout number of
|
||||
|
|
@ -50,14 +45,12 @@ class OpAlgoBase : public OpBase
|
|||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
|
||||
* @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
|
||||
*/
|
||||
OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
bool copyOutputData);
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors);
|
||||
|
||||
/**
|
||||
* Constructor that enables a file to be passed to the operation with
|
||||
|
|
@ -68,14 +61,12 @@ class OpAlgoBase : public OpBase
|
|||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
|
||||
* @param shaderFilePath Optional parameter to specify the shader to load (either in spirv or raw format)
|
||||
*/
|
||||
OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
bool copyOutputData,
|
||||
std::string shaderFilePath);
|
||||
|
||||
/**
|
||||
|
|
@ -86,14 +77,12 @@ class OpAlgoBase : public OpBase
|
|||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param copyOutputData Whether to map device data for all output tensors back to their host data vectors
|
||||
* @param shaderDataRaw Optional parameter to specify the shader data either in binary or raw form
|
||||
*/
|
||||
OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
bool copyOutputData,
|
||||
const std::vector<char>& shaderDataRaw);
|
||||
|
||||
/**
|
||||
|
|
@ -141,8 +130,6 @@ class OpAlgoBase : public OpBase
|
|||
bool mFreeAlgorithm = false;
|
||||
|
||||
// -------------- ALWAYS OWNED RESOURCES
|
||||
std::vector<std::shared_ptr<Tensor>> mOutputStagingTensors; ///< Array of output staging tensors which will be expected to be the same size as the number of inputs.
|
||||
bool mCopyOutputData; ///< Configuration parameter which states whether data will be copied back to all provided tensors for convenience. This can be deactivated by setting this flag and or overriding the functions provided.
|
||||
|
||||
uint32_t mX;
|
||||
uint32_t mY;
|
||||
|
|
@ -172,11 +159,10 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
|||
OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
bool copyOutputData)
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors)
|
||||
: OpBase(physicalDevice, device, commandBuffer, tensors, false)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {} copyOutputData: {}, shaderFilePath: {}", tensors.size(), copyOutputData);
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase constructor with params numTensors: {} , shaderFilePath: {}", tensors.size());
|
||||
|
||||
// The dispatch size is set up based on either explicitly provided template
|
||||
// parameters or by default it would take the shape and size of the tensors
|
||||
|
|
@ -196,8 +182,6 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
|
|||
this->mY,
|
||||
this->mZ);
|
||||
|
||||
this->mCopyOutputData = copyOutputData;
|
||||
|
||||
this->mAlgorithm = std::make_shared<Algorithm>(device, commandBuffer);
|
||||
}
|
||||
|
||||
|
|
@ -206,9 +190,8 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
|
|||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
bool copyOutputData,
|
||||
std::string shaderFilePath)
|
||||
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors, copyOutputData)
|
||||
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shaderfile path: {}", shaderFilePath);
|
||||
|
||||
|
|
@ -220,9 +203,8 @@ OpAlgoBase<tX, tY, tZ>::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalD
|
|||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>>& tensors,
|
||||
bool copyOutputData,
|
||||
const std::vector<char>& shaderDataRaw)
|
||||
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors, copyOutputData)
|
||||
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase shaderFilePath constructo with shader raw data length: {}", shaderDataRaw.size());
|
||||
|
||||
|
|
@ -233,13 +215,6 @@ template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
|||
OpAlgoBase<tX, tY, tZ>::~OpAlgoBase()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase destructor started");
|
||||
|
||||
if (this->mCopyOutputData) {
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase destroying staging tensors");
|
||||
for (std::shared_ptr<Tensor> stagingTensor : this->mOutputStagingTensors) {
|
||||
stagingTensor->freeMemoryDestroyGPUResources();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
|
|
@ -259,18 +234,6 @@ OpAlgoBase<tX, tY, tZ>::init()
|
|||
}
|
||||
}
|
||||
|
||||
if (this->mCopyOutputData) {
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase creating staging output tensors");
|
||||
|
||||
for (std::shared_ptr<Tensor> tensor : this->mTensors) {
|
||||
std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
|
||||
tensor->data(), Tensor::TensorTypes::eStaging);
|
||||
stagingTensor->init(
|
||||
this->mPhysicalDevice, this->mDevice);
|
||||
this->mOutputStagingTensors.push_back(stagingTensor);
|
||||
}
|
||||
}
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase fetching spirv data");
|
||||
|
||||
std::vector<char>& shaderFileData = this->fetchSpirvBinaryData();
|
||||
|
|
@ -297,27 +260,6 @@ OpAlgoBase<tX, tY, tZ>::record()
|
|||
}
|
||||
|
||||
this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ);
|
||||
|
||||
if (this->mCopyOutputData) {
|
||||
// Barrier to ensure the shader code is executed before buffer read
|
||||
for (const std::shared_ptr<Tensor>& tensor : this->mTensors) {
|
||||
tensor->recordBufferMemoryBarrier(
|
||||
this->mCommandBuffer,
|
||||
vk::AccessFlagBits::eShaderWrite,
|
||||
vk::AccessFlagBits::eTransferRead,
|
||||
vk::PipelineStageFlagBits::eComputeShader,
|
||||
vk::PipelineStageFlagBits::eTransfer);
|
||||
}
|
||||
|
||||
// Record copy from and create barrier for STAGING tensors
|
||||
// TODO: This only accounts for device tensors need to account for staging and storage
|
||||
for (size_t i = 0; i < this->mTensors.size(); i++) {
|
||||
this->mOutputStagingTensors[i]->recordCopyFrom(
|
||||
this->mCommandBuffer,
|
||||
this->mTensors[i],
|
||||
true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
|
|
@ -332,14 +274,6 @@ void
|
|||
OpAlgoBase<tX, tY, tZ>::postEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
|
||||
|
||||
if (this->mCopyOutputData) {
|
||||
for (size_t i = 0; i < this->mTensors.size(); i++) {
|
||||
this->mOutputStagingTensors[i]->mapDataFromHostMemory();
|
||||
|
||||
this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
|
|
|
|||
|
|
@ -107,7 +107,7 @@ OpAlgoLhsRhsOut<tX, tY, tZ>::OpAlgoLhsRhsOut(std::shared_ptr<vk::PhysicalDevice>
|
|||
// The inheritance is initialised with the copyOutputData to false given that
|
||||
// this dependent class handles the transfer of data via staging buffers in
|
||||
// a granular way.
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, false)
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut constructor with params");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,7 +47,7 @@ class OpMult : public OpAlgoBase<tX, tY, tZ>
|
|||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::vector<std::shared_ptr<Tensor>> tensors)
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, true, "")
|
||||
: OpAlgoBase<tX, tY, tZ>(physicalDevice, device, commandBuffer, tensors, "")
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpMult constructor with params");
|
||||
|
||||
|
|
|
|||
|
|
@ -48,7 +48,6 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression) {
|
|||
|
||||
sq->record<kp::OpAlgoBase<>>(
|
||||
params,
|
||||
false, // Whether to copy output from device
|
||||
"test/shaders/glsl/test_logistic_regression.comp");
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({wOutI, wOutJ, bOut});
|
||||
|
|
@ -125,9 +124,10 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy) {
|
|||
|
||||
sq->record<kp::OpAlgoBase<>>(
|
||||
params,
|
||||
true, // Whether to copy output from device
|
||||
"test/shaders/glsl/test_logistic_regression.comp");
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({wOutI, wOutJ, bOut});
|
||||
|
||||
sq->end();
|
||||
|
||||
// Iterate across all expected iterations
|
||||
|
|
|
|||
|
|
@ -10,16 +10,17 @@ TEST(TestManager, EndToEndOpMultFlow)
|
|||
std::shared_ptr<kp::Tensor> tensorLHS{ new kp::Tensor({ 0, 1, 2 }) };
|
||||
mgr.evalOp<kp::OpTensorCreate>({ tensorLHS });
|
||||
|
||||
std::shared_ptr<kp::Tensor> tensorRHS{ new kp::Tensor(
|
||||
{ 2, 4, 6 }) };
|
||||
std::shared_ptr<kp::Tensor> tensorRHS{ new kp::Tensor( { 2, 4, 6 }) };
|
||||
mgr.evalOp<kp::OpTensorCreate>({ tensorRHS });
|
||||
|
||||
std::shared_ptr<kp::Tensor> tensorOutput{ new kp::Tensor(
|
||||
{ 0, 0, 0 }) };
|
||||
std::shared_ptr<kp::Tensor> tensorOutput{ new kp::Tensor( { 0, 0, 0 }) };
|
||||
|
||||
mgr.evalOp<kp::OpTensorCreate>({ tensorOutput });
|
||||
|
||||
mgr.evalOp<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
|
||||
mgr.evalOp<kp::OpTensorSyncLocal>({ tensorOutput });
|
||||
|
||||
EXPECT_EQ(tensorOutput->data(), std::vector<float>({0, 4, 12}));
|
||||
}
|
||||
|
||||
|
|
@ -46,6 +47,8 @@ TEST(TestManager, OpMultSequenceFlow) {
|
|||
|
||||
sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({ tensorOutput });
|
||||
|
||||
sq->end();
|
||||
sq->eval();
|
||||
}
|
||||
|
|
@ -100,6 +103,8 @@ TEST(TestManager, TestMultipleTensorsAtOnce) {
|
|||
|
||||
sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({ tensorOutput });
|
||||
|
||||
sq->end();
|
||||
sq->eval();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,17 +27,16 @@ TEST(TestMultipleAlgoExecutions, SingleSequenceRecord) {
|
|||
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
{ tensorA },
|
||||
false, // Whether to copy output from device
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
{ tensorA },
|
||||
false, // Whether to copy output from device
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
{ tensorA },
|
||||
true, // Whether to copy output from device
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({ tensorA });
|
||||
|
||||
sq->end();
|
||||
sq->eval();
|
||||
}
|
||||
|
|
@ -70,7 +69,6 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords) {
|
|||
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
{ tensorA },
|
||||
false, // Whether to copy output from device
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
sq->end();
|
||||
|
|
@ -80,7 +78,6 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords) {
|
|||
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
{ tensorA },
|
||||
false, // Whether to copy output from device
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
sq->end();
|
||||
|
|
@ -90,11 +87,18 @@ TEST(TestMultipleAlgoExecutions, MultipleCmdBufRecords) {
|
|||
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
{ tensorA },
|
||||
true, // Whether to copy output from device
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
sq->end();
|
||||
sq->eval();
|
||||
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>(
|
||||
{ tensorA });
|
||||
|
||||
sq->end();
|
||||
sq->eval();
|
||||
}
|
||||
sqWeakPtr.reset();
|
||||
|
||||
|
|
@ -126,7 +130,6 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences) {
|
|||
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
{ tensorA },
|
||||
true, // Whether to copy output from device
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
sq->end();
|
||||
|
|
@ -134,12 +137,11 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences) {
|
|||
}
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr2 = mgr.getOrCreateManagedSequence("newSequence2");
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
{ tensorA },
|
||||
true, // Whether to copy output from device
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
sq->end();
|
||||
|
|
@ -148,18 +150,28 @@ TEST(TestMultipleAlgoExecutions, MultipleSequences) {
|
|||
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr3 = mgr.getOrCreateManagedSequence("newSequence3");
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr3.lock()) {
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
{ tensorA },
|
||||
true, // Whether to copy output from device
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
sq->end();
|
||||
sq->eval();
|
||||
}
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr4 = mgr.getOrCreateManagedSequence("newSequence5");
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr4.lock()) {
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>(
|
||||
{ tensorA });
|
||||
|
||||
sq->end();
|
||||
sq->eval();
|
||||
}
|
||||
|
||||
EXPECT_EQ(tensorA->data(), std::vector<float>({3, 3, 3}));
|
||||
}
|
||||
|
||||
|
|
@ -190,12 +202,11 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval) {
|
|||
}
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr2 = mgr.getOrCreateManagedSequence("newSequence2");
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr.lock()) {
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpAlgoBase<3, 1, 1>>(
|
||||
{ tensorA },
|
||||
true, // Whether to copy output from device
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
sq->end();
|
||||
|
|
@ -205,6 +216,20 @@ TEST(TestMultipleAlgoExecutions, SingleRecordMultipleEval) {
|
|||
sq->eval();
|
||||
}
|
||||
|
||||
std::weak_ptr<kp::Sequence> sqWeakPtr3 = mgr.getOrCreateManagedSequence("newSequence3");
|
||||
if (std::shared_ptr<kp::Sequence> sq = sqWeakPtr2.lock()) {
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>(
|
||||
{ tensorA });
|
||||
|
||||
sq->end();
|
||||
|
||||
sq->eval();
|
||||
sq->eval();
|
||||
sq->eval();
|
||||
}
|
||||
|
||||
EXPECT_EQ(tensorA->data(), std::vector<float>({3, 3, 3}));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -50,7 +50,6 @@ TEST(TestProcessingIterations, IterateThroughMultipleSumAndCopies) {
|
|||
|
||||
sq->record<kp::OpAlgoBase<>>(
|
||||
{ tensorA, tensorB },
|
||||
true, // Whether to copy output from device
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
sq->record<kp::OpTensorCopy>({tensorB, tensorA});
|
||||
|
|
|
|||
|
|
@ -29,9 +29,10 @@ TEST(TestOpAlgoBase, ShaderRawDataFromConstructor) {
|
|||
|
||||
mgr.evalOpDefault<kp::OpAlgoBase<>>(
|
||||
{ tensorA, tensorB },
|
||||
true, // Whether to copy output from device
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
|
||||
mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorA, tensorB});
|
||||
|
||||
EXPECT_EQ(tensorA->data(), std::vector<float>({0, 1, 2}));
|
||||
EXPECT_EQ(tensorB->data(), std::vector<float>({3, 4, 5}));
|
||||
}
|
||||
|
|
@ -45,12 +46,13 @@ TEST(TestOpAlgoBase, ShaderCompiledDataFromConstructor) {
|
|||
|
||||
mgr.evalOpDefault<kp::OpAlgoBase<>>(
|
||||
{ tensorA, tensorB },
|
||||
true, // Whether to copy output from device
|
||||
std::vector<char>(
|
||||
kp::shader_data::test_shaders_glsl_test_op_custom_shader_comp_spv,
|
||||
kp::shader_data::test_shaders_glsl_test_op_custom_shader_comp_spv +
|
||||
kp::shader_data::test_shaders_glsl_test_op_custom_shader_comp_spv_len));
|
||||
|
||||
mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorA, tensorB});
|
||||
|
||||
EXPECT_EQ(tensorA->data(), std::vector<float>({0, 1, 2}));
|
||||
EXPECT_EQ(tensorB->data(), std::vector<float>({3, 4, 5}));
|
||||
}
|
||||
|
|
@ -64,9 +66,10 @@ TEST(TestOpAlgoBase, ShaderRawDataFromFile) {
|
|||
|
||||
mgr.evalOpDefault<kp::OpAlgoBase<>>(
|
||||
{ tensorA, tensorB },
|
||||
true, // Whether to copy output from device
|
||||
"test/shaders/glsl/test_op_custom_shader.comp");
|
||||
|
||||
mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorA, tensorB});
|
||||
|
||||
EXPECT_EQ(tensorA->data(), std::vector<float>({0, 1, 2}));
|
||||
EXPECT_EQ(tensorB->data(), std::vector<float>({3, 4, 5}));
|
||||
}
|
||||
|
|
@ -80,9 +83,10 @@ TEST(TestOpAlgoBase, ShaderCompiledDataFromFile) {
|
|||
|
||||
mgr.evalOpDefault<kp::OpAlgoBase<>>(
|
||||
{ tensorA, tensorB },
|
||||
true, // Whether to copy output from device
|
||||
"test/shaders/glsl/test_op_custom_shader.comp.spv");
|
||||
|
||||
mgr.evalOpDefault<kp::OpTensorSyncLocal>({tensorA, tensorB});
|
||||
|
||||
EXPECT_EQ(tensorA->data(), std::vector<float>({0, 1, 2}));
|
||||
EXPECT_EQ(tensorB->data(), std::vector<float>({3, 4, 5}));
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue