diff --git a/src/OpAlgoLhsRhsOut.cpp b/src/OpAlgoLhsRhsOut.cpp index ab759fed8..622a4f431 100644 --- a/src/OpAlgoLhsRhsOut.cpp +++ b/src/OpAlgoLhsRhsOut.cpp @@ -65,11 +65,6 @@ OpAlgoLhsRhsOut::init() " Output: " + std::to_string(this->mTensorOutput->size())); } - this->mTensorOutputStaging = std::make_shared( - this->mTensorOutput->data(), Tensor::TensorTypes::eStaging); - - this->mTensorOutputStaging->init(this->mPhysicalDevice, this->mDevice); - SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data"); std::vector shaderFileData = this->fetchSpirvBinaryData(); @@ -110,8 +105,10 @@ OpAlgoLhsRhsOut::record() vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eTransfer); - this->mTensorOutputStaging->recordCopyFrom( - this->mCommandBuffer, this->mTensorOutput, true); + if (this->mTensorOutput->tensorType() == Tensor::TensorTypes::eDevice) { + this->mTensorOutput->recordCopyFromDeviceToStaging( + this->mCommandBuffer, true); + } } void @@ -119,9 +116,7 @@ OpAlgoLhsRhsOut::postEval() { SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called"); - this->mTensorOutputStaging->mapDataFromHostMemory(); - - this->mTensorOutput->setData(this->mTensorOutputStaging->data()); + this->mTensorOutput->mapDataFromHostMemory(); } } diff --git a/src/OpTensorCreate.cpp b/src/OpTensorCreate.cpp index ac9485baf..7918415e9 100644 --- a/src/OpTensorCreate.cpp +++ b/src/OpTensorCreate.cpp @@ -23,16 +23,6 @@ OpTensorCreate::OpTensorCreate( OpTensorCreate::~OpTensorCreate() { SPDLOG_DEBUG("Kompute OpTensorCreate destructor started"); - - SPDLOG_DEBUG("Kompute OpTensorCreate freeing staging tensors"); - for (std::shared_ptr tensor : this->mStagingTensors) { - if (tensor && tensor->isInit()) { - tensor->freeMemoryDestroyGPUResources(); - } else { - SPDLOG_ERROR("Kompute OpTensorCreate expected to free " - "tensor but has already been freed."); - } - } } void @@ -50,27 +40,10 @@ OpTensorCreate::init() throw std::runtime_error( "Kompute OpTensorCreate: Tensor has already been initialized"); } - if (tensor->tensorType() == Tensor::TensorTypes::eDevice) { - tensor->init(this->mPhysicalDevice, this->mDevice); - - std::shared_ptr stagingTensor = std::make_shared( - tensor->data(), Tensor::TensorTypes::eStaging); - - stagingTensor->init(this->mPhysicalDevice, this->mDevice); - - stagingTensor->mapDataIntoHostMemory(); - - this->mStagingTensors.push_back(stagingTensor); - - } else { - + if (tensor->tensorType() != Tensor::TensorTypes::eStorage) { tensor->init(this->mPhysicalDevice, this->mDevice); tensor->mapDataIntoHostMemory(); - - // We push a nullptr when no staging tensor is needed to match - // index number in array to have one to one mapping with tensors - this->mStagingTensors.push_back(nullptr); } } } @@ -82,8 +55,8 @@ OpTensorCreate::record() for (size_t i = 0; i < this->mTensors.size(); i++) { if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { - this->mTensors[i]->recordCopyFrom( - this->mCommandBuffer, this->mStagingTensors[i], false); + this->mTensors[i]->recordCopyFromStagingToDevice( + this->mCommandBuffer, false); } } } diff --git a/src/OpTensorSyncDevice.cpp b/src/OpTensorSyncDevice.cpp index b975d2a9b..340786eb5 100644 --- a/src/OpTensorSyncDevice.cpp +++ b/src/OpTensorSyncDevice.cpp @@ -41,25 +41,11 @@ OpTensorSyncDevice::init() "has not been initialized"); } if (tensor->tensorType() == Tensor::TensorTypes::eStorage) { - throw std::runtime_error( + SPDLOG_WARN( "Kompute OpTensorSyncLocal tensor parameter is of type " "TensorTypes::eStorage and hence cannot be used to receive or " "pass data."); } - if (tensor->tensorType() == Tensor::TensorTypes::eDevice) { - - std::shared_ptr stagingTensor = std::make_shared( - tensor->data(), Tensor::TensorTypes::eStaging); - - stagingTensor->init(this->mPhysicalDevice, this->mDevice); - - this->mStagingTensors.push_back(stagingTensor); - - } else { - // We push a nullptr when no staging tensor is needed to match - // index number in array to have one to one mapping with tensors - this->mStagingTensors.push_back(nullptr); - } } } @@ -70,8 +56,8 @@ OpTensorSyncDevice::record() for (size_t i = 0; i < this->mTensors.size(); i++) { if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { - this->mTensors[i]->recordCopyFrom( - this->mCommandBuffer, this->mStagingTensors[i], false); + this->mTensors[i]->recordCopyFromStagingToDevice( + this->mCommandBuffer, false); } } } @@ -83,11 +69,8 @@ OpTensorSyncDevice::preEval() // Performing sync of data as eval can be called multiple times with same op for (size_t i = 0; i < this->mTensors.size(); i++) { - if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { - this->mStagingTensors[i]->setData(this->mTensors[i]->data()); - this->mStagingTensors[i]->mapDataIntoHostMemory(); - } else { - this->mTensors[i]->mapDataFromHostMemory(); + if (this->mTensors[i]->tensorType() != Tensor::TensorTypes::eStorage) { + this->mTensors[i]->mapDataIntoHostMemory(); } } } diff --git a/src/OpTensorSyncLocal.cpp b/src/OpTensorSyncLocal.cpp index 24a737bdd..09d966e12 100644 --- a/src/OpTensorSyncLocal.cpp +++ b/src/OpTensorSyncLocal.cpp @@ -41,26 +41,11 @@ OpTensorSyncLocal::init() "Kompute OpTensorSyncLocal: Tensor has not been initialized"); } if (tensor->tensorType() == Tensor::TensorTypes::eStorage) { - throw std::runtime_error( + SPDLOG_WARN( "Kompute OpTensorSyncLocal tensor parameter is of type " "TensorTypes::eStorage and hence cannot be used to receive or " "pass data."); } - if (tensor->tensorType() == Tensor::TensorTypes::eDevice) { - - std::shared_ptr stagingTensor = std::make_shared( - tensor->data(), Tensor::TensorTypes::eStaging); - - stagingTensor->init(this->mPhysicalDevice, this->mDevice); - - this->mStagingTensors.push_back(stagingTensor); - - } else { - - // We push a nullptr when no staging tensor is needed to match - // index number in array to have one to one mapping with tensors - this->mStagingTensors.push_back(nullptr); - } } } @@ -71,8 +56,8 @@ OpTensorSyncLocal::record() for (size_t i = 0; i < this->mTensors.size(); i++) { if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { - this->mStagingTensors[i]->recordCopyFrom( - this->mCommandBuffer, this->mTensors[i], true); + this->mTensors[i]->recordCopyFromDeviceToStaging( + this->mCommandBuffer, true); } } } @@ -90,10 +75,7 @@ OpTensorSyncLocal::postEval() SPDLOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local"); for (size_t i = 0; i < this->mTensors.size(); i++) { - if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { - this->mStagingTensors[i]->mapDataFromHostMemory(); - this->mTensors[i]->setData(this->mStagingTensors[i]->data()); - } else { + if (this->mTensors[i]->tensorType() != Tensor::TensorTypes::eStorage) { this->mTensors[i]->mapDataFromHostMemory(); } } diff --git a/src/Tensor.cpp b/src/Tensor.cpp index b26132e9d..726723def 100644 --- a/src/Tensor.cpp +++ b/src/Tensor.cpp @@ -107,20 +107,51 @@ Tensor::recordCopyFrom(std::shared_ptr commandBuffer, std::shared_ptr copyFromTensor, bool createBarrier) { - SPDLOG_DEBUG("Kompute Tensor recordCopyFrom called"); - if (!this->mIsInit || !copyFromTensor->mIsInit) { - throw std::runtime_error( - "Kompute Tensor attempted to run createBuffer without init"); - } + vk::DeviceSize bufferSize(this->memorySize()); + vk::BufferCopy copyRegion(0, 0, bufferSize); + SPDLOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", bufferSize); + + this->copyBuffer(commandBuffer, copyFromTensor->mPrimaryBuffer, this->mPrimaryBuffer, bufferSize, copyRegion, createBarrier); + +} + +void +Tensor::recordCopyFromStagingToDevice(std::shared_ptr commandBuffer, + bool createBarrier) +{ vk::DeviceSize bufferSize(this->memorySize()); vk::BufferCopy copyRegion(0, 0, bufferSize); SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize); + this->copyBuffer(commandBuffer, this->mStagingBuffer, this->mPrimaryBuffer, bufferSize, copyRegion, createBarrier); +} + +void +Tensor::recordCopyFromDeviceToStaging(std::shared_ptr commandBuffer, + bool createBarrier) +{ + vk::DeviceSize bufferSize(this->memorySize()); + vk::BufferCopy copyRegion(0, 0, bufferSize); + + SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize); + + this->copyBuffer(commandBuffer, this->mPrimaryBuffer, this->mStagingBuffer, bufferSize, copyRegion, createBarrier); + +} + +void +Tensor::copyBuffer(std::shared_ptr commandBuffer, std::shared_ptr bufferFrom, std::shared_ptr bufferTo, vk::DeviceSize bufferSize, vk::BufferCopy copyRegion, bool createBarrier) { + + if (!this->mIsInit) { + throw std::runtime_error( + "Kompute Tensor attempted to run copyBuffer without init"); + } + commandBuffer->copyBuffer( - *copyFromTensor->mPrimaryBuffer, *this->mPrimaryBuffer, copyRegion); + *bufferFrom, *bufferTo, copyRegion); if (createBarrier) { // Buffer to ensure wait until data is copied to staging buffer @@ -275,7 +306,7 @@ Tensor::getStagingMemoryPropertyFlags() { switch (this->mTensorType) { case TensorTypes::eDevice: - return vk::MemoryPropertyFlagBits::eDeviceLocal; + return vk::MemoryPropertyFlagBits::eHostVisible; break; default: throw std::runtime_error("Kompute Tensor invalid tensor type"); @@ -299,16 +330,22 @@ Tensor::allocateMemoryCreateGPUResources() throw std::runtime_error("Kompute Tensor device is null"); } + SPDLOG_DEBUG("Kompute Tensor creating primary buffer and memory"); + this->mPrimaryBuffer = std::make_shared(); this->createBuffer(this->mPrimaryBuffer, this->getPrimaryBufferUsageFlags()); this->mFreePrimaryBuffer = true; + this->mPrimaryMemory = std::make_shared(); this->allocateBindMemory(this->mPrimaryBuffer, this->mPrimaryMemory, this->getPrimaryMemoryPropertyFlags()); this->mFreePrimaryMemory = true; if (this->mTensorType == TensorTypes::eDevice) { + SPDLOG_DEBUG("Kompute Tensor creating staging buffer and memory"); + this->mStagingBuffer = std::make_shared(); this->createBuffer(this->mStagingBuffer, this->getStagingBufferUsageFlags()); this->mFreeStagingBuffer = true; + this->mStagingMemory = std::make_shared(); this->allocateBindMemory(this->mStagingBuffer, this->mStagingMemory, this->getStagingMemoryPropertyFlags()); this->mFreeStagingMemory = true; } @@ -330,7 +367,7 @@ Tensor::createBuffer(std::shared_ptr buffer, vk::BufferUsageFlags bu SPDLOG_DEBUG("Kompute Tensor creating buffer with memory size: {}, and " "usage flags: {}", bufferSize, - vk::to_string(usageFlags)); + vk::to_string(bufferUsageFlags)); // TODO: Explore having concurrent sharing mode (with option) vk::BufferCreateInfo bufferInfo(vk::BufferCreateFlags(), @@ -430,7 +467,7 @@ Tensor::freeMemoryDestroyGPUResources() this->mDevice->freeMemory( *this->mPrimaryMemory, (vk::Optional)nullptr); - this->mDevice = nullptr; + this->mPrimaryMemory = nullptr; } } @@ -443,7 +480,7 @@ Tensor::freeMemoryDestroyGPUResources() this->mDevice->freeMemory( *this->mStagingMemory, (vk::Optional)nullptr); - this->mDevice = nullptr; + this->mStagingMemory = nullptr; } } diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp index 7ab6f4e02..09ae89fd3 100644 --- a/src/include/kompute/Tensor.hpp +++ b/src/include/kompute/Tensor.hpp @@ -131,6 +131,26 @@ class Tensor std::shared_ptr copyFromTensor, bool createBarrier); + /** + * Records a copy from the internal staging memory to the device memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice. + * + * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param createBarrier Whether to create a barrier that ensures the data is + * copied before further operations. Default is true. + */ + void recordCopyFromStagingToDevice(std::shared_ptr commandBuffer, + bool createBarrier); + + /** + * Records a copy from the internal device memory to the staging memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice. + * + * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param createBarrier Whether to create a barrier that ensures the data is + * copied before further operations. Default is true. + */ + void recordCopyFromDeviceToStaging(std::shared_ptr commandBuffer, + bool createBarrier); + /** * Records the buffer memory barrier into the command buffer which * ensures that relevant data transfers are carried out correctly. @@ -174,13 +194,13 @@ class Tensor // -------------- OPTIONALLY OWNED RESOURCES std::shared_ptr mPrimaryBuffer; - bool mFreePrimaryBuffer; + bool mFreePrimaryBuffer = false; std::shared_ptr mStagingBuffer; - bool mFreeStagingBuffer; + bool mFreeStagingBuffer = false; std::shared_ptr mPrimaryMemory; - bool mFreePrimaryMemory; + bool mFreePrimaryMemory = false; std::shared_ptr mStagingMemory; - bool mFreeStagingMemory; + bool mFreeStagingMemory = false; // -------------- ALWAYS OWNED RESOURCES std::vector mData; @@ -193,6 +213,7 @@ class Tensor void allocateMemoryCreateGPUResources(); // Creates the vulkan buffer void createBuffer(std::shared_ptr buffer, vk::BufferUsageFlags bufferUsageFlags); void allocateBindMemory(std::shared_ptr buffer, std::shared_ptr memory, vk::MemoryPropertyFlags memoryPropertyFlags); + void copyBuffer(std::shared_ptr commandBuffer, std::shared_ptr bufferFrom, std::shared_ptr bufferTo, vk::DeviceSize bufferSize, vk::BufferCopy copyRegion, bool createBarrier); // Private util functions vk::BufferUsageFlags getPrimaryBufferUsageFlags(); diff --git a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp index c826bd324..db79fa6eb 100644 --- a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp +++ b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp @@ -78,9 +78,6 @@ class OpAlgoLhsRhsOut : public OpAlgoBase std::shared_ptr mTensorLHS; ///< Reference to the parameter used in the left hand side equation of the shader std::shared_ptr mTensorRHS; ///< Reference to the parameter used in the right hand side equation of the shader std::shared_ptr mTensorOutput; ///< Reference to the parameter used in the output of the shader and will be copied with a staging vector - - // -------------- ALWAYS OWNED RESOURCES - std::shared_ptr mTensorOutputStaging; ///< Staging temporary tensor user do to copy the output of the tensor }; } // End namespace kp diff --git a/src/include/kompute/operations/OpBase.hpp b/src/include/kompute/operations/OpBase.hpp index dc0da487f..6e35df994 100644 --- a/src/include/kompute/operations/OpBase.hpp +++ b/src/include/kompute/operations/OpBase.hpp @@ -69,7 +69,7 @@ class OpBase if (tensor && tensor->isInit()) { tensor->freeMemoryDestroyGPUResources(); } else { - SPDLOG_ERROR("Kompute OpBase expected to free " + SPDLOG_WARN("Kompute OpBase expected to free " "tensor but has already been freed."); } } diff --git a/src/include/kompute/operations/OpTensorCreate.hpp b/src/include/kompute/operations/OpTensorCreate.hpp index ca143b334..4b8c784cc 100644 --- a/src/include/kompute/operations/OpTensorCreate.hpp +++ b/src/include/kompute/operations/OpTensorCreate.hpp @@ -69,8 +69,6 @@ class OpTensorCreate : public OpBase private: - // Never owned resources - std::vector> mStagingTensors; }; } // End namespace kp diff --git a/src/include/kompute/operations/OpTensorSyncDevice.hpp b/src/include/kompute/operations/OpTensorSyncDevice.hpp index a19e40dca..b80cc1db0 100644 --- a/src/include/kompute/operations/OpTensorSyncDevice.hpp +++ b/src/include/kompute/operations/OpTensorSyncDevice.hpp @@ -9,7 +9,7 @@ namespace kp { /** - Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the record (as opposed to during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging. + Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging. */ class OpTensorSyncDevice : public OpBase { @@ -35,12 +35,12 @@ class OpTensorSyncDevice : public OpBase ~OpTensorSyncDevice() override; /** - * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging. For staging tensors in host memory, the map is performed during the init function. + * Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element. */ void init() override; /** - * For device tensors, it records the copy command to the device tensor from the temporary staging tensor. + * For device tensors, it records the copy command for the tensor to copy the data from its staging to device memory. */ void record() override; @@ -55,8 +55,6 @@ class OpTensorSyncDevice : public OpBase virtual void postEval() override; private: - // Never owned resources - std::vector> mStagingTensors; }; } // End namespace kp diff --git a/src/include/kompute/operations/OpTensorSyncLocal.hpp b/src/include/kompute/operations/OpTensorSyncLocal.hpp index caf0ec9b1..dd4549b00 100644 --- a/src/include/kompute/operations/OpTensorSyncLocal.hpp +++ b/src/include/kompute/operations/OpTensorSyncLocal.hpp @@ -9,7 +9,7 @@ namespace kp { /** - Operation that syncs tensor's local data by mapping the data from device memory into the local vector. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the postSubmit (there will be no copy during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging. + Operation that syncs tensor's local memory by mapping device data into the local CPU memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging. */ class OpTensorSyncLocal : public OpBase { @@ -30,17 +30,17 @@ class OpTensorSyncLocal : public OpBase std::vector> tensors); /** - * Default destructor. This class manages the memory of the staging tensors it owns but these are released in the postSubmit, before it arrives to the destructor. + * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release. */ ~OpTensorSyncLocal() override; /** - * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging. + * Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element. */ void init() override; /** - * For device tensors, it records the copy command into the staging tensor from the device tensor. + * For device tensors, it records the copy command for the tensor to copy the data from its device to staging memory. */ void record() override; @@ -56,8 +56,6 @@ class OpTensorSyncLocal : public OpBase private: - // Never owned resources - std::vector> mStagingTensors; }; } // End namespace kp