From b61f3f22975621a1f0aca8845a4523ea428ad623 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Sun, 7 Feb 2021 22:00:58 +0000 Subject: [PATCH 1/6] Added initial iteration of tensor with two memory compoennts --- src/Tensor.cpp | 184 +++++++++++++++++++++++---------- src/include/kompute/Tensor.hpp | 24 +++-- 2 files changed, 144 insertions(+), 64 deletions(-) diff --git a/src/Tensor.cpp b/src/Tensor.cpp index 63e667053..b26132e9d 100644 --- a/src/Tensor.cpp +++ b/src/Tensor.cpp @@ -47,7 +47,7 @@ Tensor::init(std::shared_ptr physicalDevice, this->mIsInit = true; - this->createBuffer(); + this->allocateMemoryCreateGPUResources(); } std::vector& @@ -89,7 +89,7 @@ Tensor::tensorType() bool Tensor::isInit() { - return this->mIsInit && this->mBuffer && this->mMemory; + return this->mIsInit && this->mPrimaryBuffer && this->mPrimaryMemory; } void @@ -120,7 +120,7 @@ Tensor::recordCopyFrom(std::shared_ptr commandBuffer, SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize); commandBuffer->copyBuffer( - *copyFromTensor->mBuffer, *this->mBuffer, copyRegion); + *copyFromTensor->mPrimaryBuffer, *this->mPrimaryBuffer, copyRegion); if (createBarrier) { // Buffer to ensure wait until data is copied to staging buffer @@ -145,7 +145,7 @@ Tensor::recordBufferMemoryBarrier( vk::DeviceSize bufferSize = this->memorySize(); vk::BufferMemoryBarrier bufferMemoryBarrier; - bufferMemoryBarrier.buffer = *this->mBuffer; + bufferMemoryBarrier.buffer = *this->mPrimaryBuffer; bufferMemoryBarrier.size = bufferSize; bufferMemoryBarrier.srcAccessMask = srcAccessMask; bufferMemoryBarrier.dstAccessMask = dstAccessMask; @@ -164,7 +164,7 @@ vk::DescriptorBufferInfo Tensor::constructDescriptorBufferInfo() { vk::DeviceSize bufferSize = this->memorySize(); - return vk::DescriptorBufferInfo(*this->mBuffer, + return vk::DescriptorBufferInfo(*this->mPrimaryBuffer, 0, // offset bufferSize); } @@ -174,20 +174,22 @@ Tensor::mapDataFromHostMemory() { SPDLOG_DEBUG("Kompute Tensor mapping data from 
host buffer"); - if (this->mTensorType != TensorTypes::eStaging) { - SPDLOG_ERROR( - "Mapping tensor data manually from DEVICE buffer instead of " - "using record GPU command with staging buffer"); - return; + std::shared_ptr hostVisibleMemory = nullptr; + + if (this->mTensorType == TensorTypes::eHost) { + hostVisibleMemory = this->mPrimaryMemory; + } + else { + hostVisibleMemory = this->mStagingMemory; } vk::DeviceSize bufferSize = this->memorySize(); void* mapped = this->mDevice->mapMemory( - *this->mMemory, 0, bufferSize, vk::MemoryMapFlags()); - vk::MappedMemoryRange mappedMemoryRange(*this->mMemory, 0, bufferSize); + *hostVisibleMemory, 0, bufferSize, vk::MemoryMapFlags()); + vk::MappedMemoryRange mappedMemoryRange(*hostVisibleMemory, 0, bufferSize); this->mDevice->invalidateMappedMemoryRanges(mappedMemoryRange); memcpy(this->mData.data(), mapped, bufferSize); - this->mDevice->unmapMemory(*this->mMemory); + this->mDevice->unmapMemory(*hostVisibleMemory); } void @@ -196,24 +198,27 @@ Tensor::mapDataIntoHostMemory() SPDLOG_DEBUG("Kompute Tensor local mapping tensor data to host buffer"); - if (this->mTensorType != TensorTypes::eStaging) { - SPDLOG_ERROR("Mapping tensor data manually to DEVICE memory instead of " - "using record GPU command with staging buffer"); - return; + std::shared_ptr hostVisibleMemory = nullptr; + + if (this->mTensorType == TensorTypes::eHost) { + hostVisibleMemory = this->mPrimaryMemory; + } + else { + hostVisibleMemory = this->mStagingMemory; } vk::DeviceSize bufferSize = this->memorySize(); void* mapped = this->mDevice->mapMemory( - *this->mMemory, 0, bufferSize, vk::MemoryMapFlags()); + *hostVisibleMemory, 0, bufferSize, vk::MemoryMapFlags()); memcpy(mapped, this->mData.data(), bufferSize); - vk::MappedMemoryRange mappedRange(*this->mMemory, 0, bufferSize); + vk::MappedMemoryRange mappedRange(*hostVisibleMemory, 0, bufferSize); this->mDevice->flushMappedMemoryRanges(1, &mappedRange); - this->mDevice->unmapMemory(*this->mMemory); + 
this->mDevice->unmapMemory(*hostVisibleMemory); } vk::BufferUsageFlags -Tensor::getBufferUsageFlags() +Tensor::getPrimaryBufferUsageFlags() { switch (this->mTensorType) { case TensorTypes::eDevice: @@ -221,8 +226,9 @@ Tensor::getBufferUsageFlags() vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst; break; - case TensorTypes::eStaging: - return vk::BufferUsageFlagBits::eTransferSrc | + case TensorTypes::eHost: + return vk::BufferUsageFlagBits::eStorageBuffer | + vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst; break; case TensorTypes::eStorage: @@ -234,13 +240,13 @@ Tensor::getBufferUsageFlags() } vk::MemoryPropertyFlags -Tensor::getMemoryPropertyFlags() +Tensor::getPrimaryMemoryPropertyFlags() { switch (this->mTensorType) { case TensorTypes::eDevice: return vk::MemoryPropertyFlagBits::eDeviceLocal; break; - case TensorTypes::eStaging: + case TensorTypes::eHost: return vk::MemoryPropertyFlagBits::eHostVisible; break; case TensorTypes::eStorage: @@ -251,8 +257,33 @@ Tensor::getMemoryPropertyFlags() } } +vk::BufferUsageFlags +Tensor::getStagingBufferUsageFlags() +{ + switch (this->mTensorType) { + case TensorTypes::eDevice: + return vk::BufferUsageFlagBits::eTransferSrc | + vk::BufferUsageFlagBits::eTransferDst; + break; + default: + throw std::runtime_error("Kompute Tensor invalid tensor type"); + } +} + +vk::MemoryPropertyFlags +Tensor::getStagingMemoryPropertyFlags() +{ + switch (this->mTensorType) { + case TensorTypes::eDevice: + return vk::MemoryPropertyFlagBits::eDeviceLocal; + break; + default: + throw std::runtime_error("Kompute Tensor invalid tensor type"); + } +} + void -Tensor::createBuffer() +Tensor::allocateMemoryCreateGPUResources() { SPDLOG_DEBUG("Kompute Tensor creating buffer"); @@ -268,44 +299,64 @@ Tensor::createBuffer() throw std::runtime_error("Kompute Tensor device is null"); } + this->mPrimaryBuffer = std::make_shared(); + this->createBuffer(this->mPrimaryBuffer, 
this->getPrimaryBufferUsageFlags()); + this->mFreePrimaryBuffer = true; + this->allocateBindMemory(this->mPrimaryBuffer, this->mPrimaryMemory, this->getPrimaryMemoryPropertyFlags()); + this->mFreePrimaryMemory = true; + + if (this->mTensorType == TensorTypes::eDevice) { + this->mStagingBuffer = std::make_shared(); + this->createBuffer(this->mStagingBuffer, this->getStagingBufferUsageFlags()); + this->mFreeStagingBuffer = true; + this->allocateBindMemory(this->mStagingBuffer, this->mStagingMemory, this->getStagingMemoryPropertyFlags()); + this->mFreeStagingMemory = true; + } + + SPDLOG_DEBUG("Kompute Tensor buffer & memory creation successful"); +} + +void +Tensor::createBuffer(std::shared_ptr buffer, vk::BufferUsageFlags bufferUsageFlags) { + - vk::BufferUsageFlags usageFlags = this->getBufferUsageFlags(); vk::DeviceSize bufferSize = this->memorySize(); + if(bufferSize<1){ throw std::runtime_error("Kompute Tensor attempted to create a zero-sized buffer"); } - this->mFreeBuffer = true; SPDLOG_DEBUG("Kompute Tensor creating buffer with memory size: {}, and " "usage flags: {}", bufferSize, vk::to_string(usageFlags)); + // TODO: Explore having concurrent sharing mode (with option) vk::BufferCreateInfo bufferInfo(vk::BufferCreateFlags(), bufferSize, - usageFlags, + bufferUsageFlags, vk::SharingMode::eExclusive); - this->mBuffer = std::make_shared(); - this->mDevice->createBuffer(&bufferInfo, nullptr, this->mBuffer.get()); + this->mDevice->createBuffer(&bufferInfo, nullptr, buffer.get()); - SPDLOG_DEBUG("Kompute Tensor buffer created now creating memory"); +} + +void +Tensor::allocateBindMemory(std::shared_ptr buffer, std::shared_ptr memory, vk::MemoryPropertyFlags memoryPropertyFlags) { + + SPDLOG_DEBUG("Kompute Tensor allocating and binding memory"); vk::PhysicalDeviceMemoryProperties memoryProperties = this->mPhysicalDevice->getMemoryProperties(); vk::MemoryRequirements memoryRequirements = - this->mDevice->getBufferMemoryRequirements(*this->mBuffer); - - 
vk::MemoryPropertyFlags memoryPropertyFlags = - this->getMemoryPropertyFlags(); + this->mDevice->getBufferMemoryRequirements(*buffer); uint32_t memoryTypeIndex = -1; for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) { if (memoryRequirements.memoryTypeBits & (1 << i)) { - if ((memoryProperties.memoryTypes[i].propertyFlags & - memoryPropertyFlags) == memoryPropertyFlags) { + if (((memoryProperties.memoryTypes[i]).propertyFlags & memoryPropertyFlags) == memoryPropertyFlags) { memoryTypeIndex = i; break; } @@ -316,8 +367,6 @@ Tensor::createBuffer() "Memory type index for buffer creation not found"); } - this->mFreeMemory = true; - SPDLOG_DEBUG( "Kompute Tensor allocating memory index: {}, size {}, flags: {}", memoryTypeIndex, @@ -327,13 +376,10 @@ Tensor::createBuffer() vk::MemoryAllocateInfo memoryAllocateInfo(memoryRequirements.size, memoryTypeIndex); - this->mMemory = std::make_shared(); this->mDevice->allocateMemory( - &memoryAllocateInfo, nullptr, this->mMemory.get()); + &memoryAllocateInfo, nullptr, memory.get()); - this->mDevice->bindBufferMemory(*this->mBuffer, *this->mMemory, 0); - - SPDLOG_DEBUG("Kompute Tensor buffer & memory creation successful"); + this->mDevice->bindBufferMemory(*buffer, *memory, 0); } void @@ -349,27 +395,53 @@ Tensor::freeMemoryDestroyGPUResources() return; } - if (this->mFreeBuffer) { - if (!this->mBuffer) { + if (this->mFreePrimaryBuffer) { + if (!this->mPrimaryBuffer) { SPDLOG_ERROR( - "Kompose Tensor expected to free buffer but got null buffer"); + "Kompose Tensor expected to destroy primary buffer but got null buffer"); } else { - SPDLOG_DEBUG("Kompose Tensor destroying buffer"); + SPDLOG_DEBUG("Kompose Tensor destroying primary buffer"); this->mDevice->destroy( - *this->mBuffer, + *this->mPrimaryBuffer, (vk::Optional)nullptr); - this->mBuffer = nullptr; + this->mPrimaryBuffer = nullptr; } } - if (this->mFreeMemory) { - if (!this->mMemory) { + if (this->mFreeStagingBuffer) { + if (!this->mStagingBuffer) { 
SPDLOG_ERROR( - "Kompose Tensor expected to free buffer but got null memory"); + "Kompose Tensor expected to destroy staging buffer but got null buffer"); } else { - SPDLOG_DEBUG("Kompose Tensor freeing memory"); + SPDLOG_DEBUG("Kompose Tensor destroying staging buffer"); + this->mDevice->destroy( + *this->mStagingBuffer, + (vk::Optional)nullptr); + this->mStagingBuffer = nullptr; + } + } + + if (this->mFreePrimaryMemory) { + if (!this->mPrimaryMemory) { + SPDLOG_ERROR( + "Kompose Tensor expected to free primary memory but got null memory"); + } else { + SPDLOG_DEBUG("Kompose Tensor freeing primary memory"); this->mDevice->freeMemory( - *this->mMemory, + *this->mPrimaryMemory, + (vk::Optional)nullptr); + this->mDevice = nullptr; + } + } + + if (this->mFreeStagingMemory) { + if (!this->mStagingMemory) { + SPDLOG_ERROR( + "Kompose Tensor expected to free staging memory but got null memory"); + } else { + SPDLOG_DEBUG("Kompose Tensor freeing staging memory"); + this->mDevice->freeMemory( + *this->mStagingMemory, (vk::Optional)nullptr); this->mDevice = nullptr; } diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp index d7b7e8f8e..7ab6f4e02 100644 --- a/src/include/kompute/Tensor.hpp +++ b/src/include/kompute/Tensor.hpp @@ -26,7 +26,7 @@ class Tensor enum class TensorTypes { eDevice = 0, ///< Type is device memory, source and destination - eStaging = 1, ///< Type is host memory, source and destination + eHost = 1, ///< Type is host memory, source and destination eStorage = 2, ///< Type is Device memory (only) }; @@ -173,10 +173,14 @@ class Tensor std::shared_ptr mDevice; // -------------- OPTIONALLY OWNED RESOURCES - std::shared_ptr mBuffer; - bool mFreeBuffer; - std::shared_ptr mMemory; - bool mFreeMemory; + std::shared_ptr mPrimaryBuffer; + bool mFreePrimaryBuffer; + std::shared_ptr mStagingBuffer; + bool mFreeStagingBuffer; + std::shared_ptr mPrimaryMemory; + bool mFreePrimaryMemory; + std::shared_ptr mStagingMemory; + bool 
mFreeStagingMemory; // -------------- ALWAYS OWNED RESOURCES std::vector mData; @@ -186,11 +190,15 @@ class Tensor std::array mShape; bool mIsInit = false; - void createBuffer(); // Creates the vulkan buffer + void allocateMemoryCreateGPUResources(); // Creates the vulkan buffer + void createBuffer(std::shared_ptr buffer, vk::BufferUsageFlags bufferUsageFlags); + void allocateBindMemory(std::shared_ptr buffer, std::shared_ptr memory, vk::MemoryPropertyFlags memoryPropertyFlags); // Private util functions - vk::BufferUsageFlags getBufferUsageFlags(); - vk::MemoryPropertyFlags getMemoryPropertyFlags(); + vk::BufferUsageFlags getPrimaryBufferUsageFlags(); + vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags(); + vk::BufferUsageFlags getStagingBufferUsageFlags(); + vk::MemoryPropertyFlags getStagingMemoryPropertyFlags(); uint64_t memorySize(); }; From 04853df4697710f8a8334a1fc4238d32ca0de94a Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Mon, 8 Feb 2021 07:17:54 +0000 Subject: [PATCH 2/6] Updated Tensor Memory to hold staging within class --- src/OpAlgoLhsRhsOut.cpp | 15 ++--- src/OpTensorCreate.cpp | 33 +---------- src/OpTensorSyncDevice.cpp | 27 ++------- src/OpTensorSyncLocal.cpp | 26 ++------- src/Tensor.cpp | 57 +++++++++++++++---- src/include/kompute/Tensor.hpp | 29 ++++++++-- .../kompute/operations/OpAlgoLhsRhsOut.hpp | 3 - src/include/kompute/operations/OpBase.hpp | 2 +- .../kompute/operations/OpTensorCreate.hpp | 2 - .../kompute/operations/OpTensorSyncDevice.hpp | 8 +-- .../kompute/operations/OpTensorSyncLocal.hpp | 10 ++-- 11 files changed, 97 insertions(+), 115 deletions(-) diff --git a/src/OpAlgoLhsRhsOut.cpp b/src/OpAlgoLhsRhsOut.cpp index ab759fed8..622a4f431 100644 --- a/src/OpAlgoLhsRhsOut.cpp +++ b/src/OpAlgoLhsRhsOut.cpp @@ -65,11 +65,6 @@ OpAlgoLhsRhsOut::init() " Output: " + std::to_string(this->mTensorOutput->size())); } - this->mTensorOutputStaging = std::make_shared( - this->mTensorOutput->data(), Tensor::TensorTypes::eStaging); - - 
this->mTensorOutputStaging->init(this->mPhysicalDevice, this->mDevice); - SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data"); std::vector shaderFileData = this->fetchSpirvBinaryData(); @@ -110,8 +105,10 @@ OpAlgoLhsRhsOut::record() vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eTransfer); - this->mTensorOutputStaging->recordCopyFrom( - this->mCommandBuffer, this->mTensorOutput, true); + if (this->mTensorOutput->tensorType() == Tensor::TensorTypes::eDevice) { + this->mTensorOutput->recordCopyFromDeviceToStaging( + this->mCommandBuffer, true); + } } void @@ -119,9 +116,7 @@ OpAlgoLhsRhsOut::postEval() { SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called"); - this->mTensorOutputStaging->mapDataFromHostMemory(); - - this->mTensorOutput->setData(this->mTensorOutputStaging->data()); + this->mTensorOutput->mapDataFromHostMemory(); } } diff --git a/src/OpTensorCreate.cpp b/src/OpTensorCreate.cpp index ac9485baf..7918415e9 100644 --- a/src/OpTensorCreate.cpp +++ b/src/OpTensorCreate.cpp @@ -23,16 +23,6 @@ OpTensorCreate::OpTensorCreate( OpTensorCreate::~OpTensorCreate() { SPDLOG_DEBUG("Kompute OpTensorCreate destructor started"); - - SPDLOG_DEBUG("Kompute OpTensorCreate freeing staging tensors"); - for (std::shared_ptr tensor : this->mStagingTensors) { - if (tensor && tensor->isInit()) { - tensor->freeMemoryDestroyGPUResources(); - } else { - SPDLOG_ERROR("Kompute OpTensorCreate expected to free " - "tensor but has already been freed."); - } - } } void @@ -50,27 +40,10 @@ OpTensorCreate::init() throw std::runtime_error( "Kompute OpTensorCreate: Tensor has already been initialized"); } - if (tensor->tensorType() == Tensor::TensorTypes::eDevice) { - tensor->init(this->mPhysicalDevice, this->mDevice); - - std::shared_ptr stagingTensor = std::make_shared( - tensor->data(), Tensor::TensorTypes::eStaging); - - stagingTensor->init(this->mPhysicalDevice, this->mDevice); - - stagingTensor->mapDataIntoHostMemory(); - - 
this->mStagingTensors.push_back(stagingTensor); - - } else { - + if (tensor->tensorType() != Tensor::TensorTypes::eStorage) { tensor->init(this->mPhysicalDevice, this->mDevice); tensor->mapDataIntoHostMemory(); - - // We push a nullptr when no staging tensor is needed to match - // index number in array to have one to one mapping with tensors - this->mStagingTensors.push_back(nullptr); } } } @@ -82,8 +55,8 @@ OpTensorCreate::record() for (size_t i = 0; i < this->mTensors.size(); i++) { if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { - this->mTensors[i]->recordCopyFrom( - this->mCommandBuffer, this->mStagingTensors[i], false); + this->mTensors[i]->recordCopyFromStagingToDevice( + this->mCommandBuffer, false); } } } diff --git a/src/OpTensorSyncDevice.cpp b/src/OpTensorSyncDevice.cpp index b975d2a9b..340786eb5 100644 --- a/src/OpTensorSyncDevice.cpp +++ b/src/OpTensorSyncDevice.cpp @@ -41,25 +41,11 @@ OpTensorSyncDevice::init() "has not been initialized"); } if (tensor->tensorType() == Tensor::TensorTypes::eStorage) { - throw std::runtime_error( + SPDLOG_WARN( "Kompute OpTensorSyncLocal tensor parameter is of type " "TensorTypes::eStorage and hence cannot be used to receive or " "pass data."); } - if (tensor->tensorType() == Tensor::TensorTypes::eDevice) { - - std::shared_ptr stagingTensor = std::make_shared( - tensor->data(), Tensor::TensorTypes::eStaging); - - stagingTensor->init(this->mPhysicalDevice, this->mDevice); - - this->mStagingTensors.push_back(stagingTensor); - - } else { - // We push a nullptr when no staging tensor is needed to match - // index number in array to have one to one mapping with tensors - this->mStagingTensors.push_back(nullptr); - } } } @@ -70,8 +56,8 @@ OpTensorSyncDevice::record() for (size_t i = 0; i < this->mTensors.size(); i++) { if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { - this->mTensors[i]->recordCopyFrom( - this->mCommandBuffer, this->mStagingTensors[i], false); + 
this->mTensors[i]->recordCopyFromStagingToDevice( + this->mCommandBuffer, false); } } } @@ -83,11 +69,8 @@ OpTensorSyncDevice::preEval() // Performing sync of data as eval can be called multiple times with same op for (size_t i = 0; i < this->mTensors.size(); i++) { - if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { - this->mStagingTensors[i]->setData(this->mTensors[i]->data()); - this->mStagingTensors[i]->mapDataIntoHostMemory(); - } else { - this->mTensors[i]->mapDataFromHostMemory(); + if (this->mTensors[i]->tensorType() != Tensor::TensorTypes::eStorage) { + this->mTensors[i]->mapDataIntoHostMemory(); } } } diff --git a/src/OpTensorSyncLocal.cpp b/src/OpTensorSyncLocal.cpp index 24a737bdd..09d966e12 100644 --- a/src/OpTensorSyncLocal.cpp +++ b/src/OpTensorSyncLocal.cpp @@ -41,26 +41,11 @@ OpTensorSyncLocal::init() "Kompute OpTensorSyncLocal: Tensor has not been initialized"); } if (tensor->tensorType() == Tensor::TensorTypes::eStorage) { - throw std::runtime_error( + SPDLOG_WARN( "Kompute OpTensorSyncLocal tensor parameter is of type " "TensorTypes::eStorage and hence cannot be used to receive or " "pass data."); } - if (tensor->tensorType() == Tensor::TensorTypes::eDevice) { - - std::shared_ptr stagingTensor = std::make_shared( - tensor->data(), Tensor::TensorTypes::eStaging); - - stagingTensor->init(this->mPhysicalDevice, this->mDevice); - - this->mStagingTensors.push_back(stagingTensor); - - } else { - - // We push a nullptr when no staging tensor is needed to match - // index number in array to have one to one mapping with tensors - this->mStagingTensors.push_back(nullptr); - } } } @@ -71,8 +56,8 @@ OpTensorSyncLocal::record() for (size_t i = 0; i < this->mTensors.size(); i++) { if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { - this->mStagingTensors[i]->recordCopyFrom( - this->mCommandBuffer, this->mTensors[i], true); + this->mTensors[i]->recordCopyFromDeviceToStaging( + this->mCommandBuffer, true); } } } @@ 
-90,10 +75,7 @@ OpTensorSyncLocal::postEval() SPDLOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local"); for (size_t i = 0; i < this->mTensors.size(); i++) { - if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { - this->mStagingTensors[i]->mapDataFromHostMemory(); - this->mTensors[i]->setData(this->mStagingTensors[i]->data()); - } else { + if (this->mTensors[i]->tensorType() != Tensor::TensorTypes::eStorage) { this->mTensors[i]->mapDataFromHostMemory(); } } diff --git a/src/Tensor.cpp b/src/Tensor.cpp index b26132e9d..726723def 100644 --- a/src/Tensor.cpp +++ b/src/Tensor.cpp @@ -107,20 +107,51 @@ Tensor::recordCopyFrom(std::shared_ptr commandBuffer, std::shared_ptr copyFromTensor, bool createBarrier) { - SPDLOG_DEBUG("Kompute Tensor recordCopyFrom called"); - if (!this->mIsInit || !copyFromTensor->mIsInit) { - throw std::runtime_error( - "Kompute Tensor attempted to run createBuffer without init"); - } + vk::DeviceSize bufferSize(this->memorySize()); + vk::BufferCopy copyRegion(0, 0, bufferSize); + SPDLOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", bufferSize); + + this->copyBuffer(commandBuffer, copyFromTensor->mPrimaryBuffer, this->mPrimaryBuffer, bufferSize, copyRegion, createBarrier); + +} + +void +Tensor::recordCopyFromStagingToDevice(std::shared_ptr commandBuffer, + bool createBarrier) +{ vk::DeviceSize bufferSize(this->memorySize()); vk::BufferCopy copyRegion(0, 0, bufferSize); SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize); + this->copyBuffer(commandBuffer, this->mStagingBuffer, this->mPrimaryBuffer, bufferSize, copyRegion, createBarrier); +} + +void +Tensor::recordCopyFromDeviceToStaging(std::shared_ptr commandBuffer, + bool createBarrier) +{ + vk::DeviceSize bufferSize(this->memorySize()); + vk::BufferCopy copyRegion(0, 0, bufferSize); + + SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize); + + this->copyBuffer(commandBuffer, this->mPrimaryBuffer, this->mStagingBuffer, bufferSize, 
copyRegion, createBarrier); + +} + +void +Tensor::copyBuffer(std::shared_ptr commandBuffer, std::shared_ptr bufferFrom, std::shared_ptr bufferTo, vk::DeviceSize bufferSize, vk::BufferCopy copyRegion, bool createBarrier) { + + if (!this->mIsInit) { + throw std::runtime_error( + "Kompute Tensor attempted to run copyBuffer without init"); + } + commandBuffer->copyBuffer( - *copyFromTensor->mPrimaryBuffer, *this->mPrimaryBuffer, copyRegion); + *bufferFrom, *bufferTo, copyRegion); if (createBarrier) { // Buffer to ensure wait until data is copied to staging buffer @@ -275,7 +306,7 @@ Tensor::getStagingMemoryPropertyFlags() { switch (this->mTensorType) { case TensorTypes::eDevice: - return vk::MemoryPropertyFlagBits::eDeviceLocal; + return vk::MemoryPropertyFlagBits::eHostVisible; break; default: throw std::runtime_error("Kompute Tensor invalid tensor type"); @@ -299,16 +330,22 @@ Tensor::allocateMemoryCreateGPUResources() throw std::runtime_error("Kompute Tensor device is null"); } + SPDLOG_DEBUG("Kompute Tensor creating primary buffer and memory"); + this->mPrimaryBuffer = std::make_shared(); this->createBuffer(this->mPrimaryBuffer, this->getPrimaryBufferUsageFlags()); this->mFreePrimaryBuffer = true; + this->mPrimaryMemory = std::make_shared(); this->allocateBindMemory(this->mPrimaryBuffer, this->mPrimaryMemory, this->getPrimaryMemoryPropertyFlags()); this->mFreePrimaryMemory = true; if (this->mTensorType == TensorTypes::eDevice) { + SPDLOG_DEBUG("Kompute Tensor creating staging buffer and memory"); + this->mStagingBuffer = std::make_shared(); this->createBuffer(this->mStagingBuffer, this->getStagingBufferUsageFlags()); this->mFreeStagingBuffer = true; + this->mStagingMemory = std::make_shared(); this->allocateBindMemory(this->mStagingBuffer, this->mStagingMemory, this->getStagingMemoryPropertyFlags()); this->mFreeStagingMemory = true; } @@ -330,7 +367,7 @@ Tensor::createBuffer(std::shared_ptr buffer, vk::BufferUsageFlags bu SPDLOG_DEBUG("Kompute Tensor creating 
buffer with memory size: {}, and " "usage flags: {}", bufferSize, - vk::to_string(usageFlags)); + vk::to_string(bufferUsageFlags)); // TODO: Explore having concurrent sharing mode (with option) vk::BufferCreateInfo bufferInfo(vk::BufferCreateFlags(), @@ -430,7 +467,7 @@ Tensor::freeMemoryDestroyGPUResources() this->mDevice->freeMemory( *this->mPrimaryMemory, (vk::Optional)nullptr); - this->mDevice = nullptr; + this->mPrimaryMemory = nullptr; } } @@ -443,7 +480,7 @@ Tensor::freeMemoryDestroyGPUResources() this->mDevice->freeMemory( *this->mStagingMemory, (vk::Optional)nullptr); - this->mDevice = nullptr; + this->mStagingMemory = nullptr; } } diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp index 7ab6f4e02..09ae89fd3 100644 --- a/src/include/kompute/Tensor.hpp +++ b/src/include/kompute/Tensor.hpp @@ -131,6 +131,26 @@ class Tensor std::shared_ptr copyFromTensor, bool createBarrier); + /** + * Records a copy from the internal staging memory to the device memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice. + * + * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param createBarrier Whether to create a barrier that ensures the data is + * copied before further operations. Default is true. + */ + void recordCopyFromStagingToDevice(std::shared_ptr commandBuffer, + bool createBarrier); + + /** + * Records a copy from the internal device memory to the staging memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice. + * + * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param createBarrier Whether to create a barrier that ensures the data is + * copied before further operations. Default is true. 
+ */ + void recordCopyFromDeviceToStaging(std::shared_ptr commandBuffer, + bool createBarrier); + /** * Records the buffer memory barrier into the command buffer which * ensures that relevant data transfers are carried out correctly. @@ -174,13 +194,13 @@ class Tensor // -------------- OPTIONALLY OWNED RESOURCES std::shared_ptr mPrimaryBuffer; - bool mFreePrimaryBuffer; + bool mFreePrimaryBuffer = false; std::shared_ptr mStagingBuffer; - bool mFreeStagingBuffer; + bool mFreeStagingBuffer = false; std::shared_ptr mPrimaryMemory; - bool mFreePrimaryMemory; + bool mFreePrimaryMemory = false; std::shared_ptr mStagingMemory; - bool mFreeStagingMemory; + bool mFreeStagingMemory = false; // -------------- ALWAYS OWNED RESOURCES std::vector mData; @@ -193,6 +213,7 @@ class Tensor void allocateMemoryCreateGPUResources(); // Creates the vulkan buffer void createBuffer(std::shared_ptr buffer, vk::BufferUsageFlags bufferUsageFlags); void allocateBindMemory(std::shared_ptr buffer, std::shared_ptr memory, vk::MemoryPropertyFlags memoryPropertyFlags); + void copyBuffer(std::shared_ptr commandBuffer, std::shared_ptr bufferFrom, std::shared_ptr bufferTo, vk::DeviceSize bufferSize, vk::BufferCopy copyRegion, bool createBarrier); // Private util functions vk::BufferUsageFlags getPrimaryBufferUsageFlags(); diff --git a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp index c826bd324..db79fa6eb 100644 --- a/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp +++ b/src/include/kompute/operations/OpAlgoLhsRhsOut.hpp @@ -78,9 +78,6 @@ class OpAlgoLhsRhsOut : public OpAlgoBase std::shared_ptr mTensorLHS; ///< Reference to the parameter used in the left hand side equation of the shader std::shared_ptr mTensorRHS; ///< Reference to the parameter used in the right hand side equation of the shader std::shared_ptr mTensorOutput; ///< Reference to the parameter used in the output of the shader and will be copied with a staging vector - - // 
-------------- ALWAYS OWNED RESOURCES - std::shared_ptr mTensorOutputStaging; ///< Staging temporary tensor user do to copy the output of the tensor }; } // End namespace kp diff --git a/src/include/kompute/operations/OpBase.hpp b/src/include/kompute/operations/OpBase.hpp index dc0da487f..6e35df994 100644 --- a/src/include/kompute/operations/OpBase.hpp +++ b/src/include/kompute/operations/OpBase.hpp @@ -69,7 +69,7 @@ class OpBase if (tensor && tensor->isInit()) { tensor->freeMemoryDestroyGPUResources(); } else { - SPDLOG_ERROR("Kompute OpBase expected to free " + SPDLOG_WARN("Kompute OpBase expected to free " "tensor but has already been freed."); } } diff --git a/src/include/kompute/operations/OpTensorCreate.hpp b/src/include/kompute/operations/OpTensorCreate.hpp index ca143b334..4b8c784cc 100644 --- a/src/include/kompute/operations/OpTensorCreate.hpp +++ b/src/include/kompute/operations/OpTensorCreate.hpp @@ -69,8 +69,6 @@ class OpTensorCreate : public OpBase private: - // Never owned resources - std::vector> mStagingTensors; }; } // End namespace kp diff --git a/src/include/kompute/operations/OpTensorSyncDevice.hpp b/src/include/kompute/operations/OpTensorSyncDevice.hpp index a19e40dca..b80cc1db0 100644 --- a/src/include/kompute/operations/OpTensorSyncDevice.hpp +++ b/src/include/kompute/operations/OpTensorSyncDevice.hpp @@ -9,7 +9,7 @@ namespace kp { /** - Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the record (as opposed to during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging. + Operation that syncs tensor's device by mapping local data into the device memory. 
For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging. */ class OpTensorSyncDevice : public OpBase { @@ -35,12 +35,12 @@ class OpTensorSyncDevice : public OpBase ~OpTensorSyncDevice() override; /** - * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging. For staging tensors in host memory, the map is performed during the init function. + * Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element. */ void init() override; /** - * For device tensors, it records the copy command to the device tensor from the temporary staging tensor. + * For device tensors, it records the copy command for the tensor to copy the data from its staging to device memory. */ void record() override; @@ -55,8 +55,6 @@ class OpTensorSyncDevice : public OpBase virtual void postEval() override; private: - // Never owned resources - std::vector> mStagingTensors; }; } // End namespace kp diff --git a/src/include/kompute/operations/OpTensorSyncLocal.hpp b/src/include/kompute/operations/OpTensorSyncLocal.hpp index caf0ec9b1..dd4549b00 100644 --- a/src/include/kompute/operations/OpTensorSyncLocal.hpp +++ b/src/include/kompute/operations/OpTensorSyncLocal.hpp @@ -9,7 +9,7 @@ namespace kp { /** - Operation that syncs tensor's local data by mapping the data from device memory into the local vector. For TensorTypes::eDevice it will use a staging tensor to perform the copy. 
For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the postSubmit (there will be no copy during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging. + Operation that syncs tensor's local memory by mapping device data into the local CPU memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging. */ class OpTensorSyncLocal : public OpBase { @@ -30,17 +30,17 @@ class OpTensorSyncLocal : public OpBase std::vector> tensors); /** - * Default destructor. This class manages the memory of the staging tensors it owns but these are released in the postSubmit, before it arrives to the destructor. + * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release. */ ~OpTensorSyncLocal() override; /** - * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging. + * Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element. */ void init() override; /** - * For device tensors, it records the copy command into the staging tensor from the device tensor. + * For device tensors, it records the copy command for the tensor to copy the data from its device to staging memory. 
*/ void record() override; @@ -56,8 +56,6 @@ class OpTensorSyncLocal : public OpBase private: - // Never owned resources - std::vector> mStagingTensors; }; } // End namespace kp From 8a66c6b1e2a691d9387cadb18391d581c50494ac Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Mon, 8 Feb 2021 07:18:05 +0000 Subject: [PATCH 3/6] Updated tests to reflect staging --- test/TestLogisticRegression.cpp | 4 ++-- test/TestManager.cpp | 2 +- test/TestOpTensorCopy.cpp | 16 ++++++++-------- test/TestTensor.cpp | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/test/TestLogisticRegression.cpp b/test/TestLogisticRegression.cpp index 5fa3032f8..7c3f15387 100644 --- a/test/TestLogisticRegression.cpp +++ b/test/TestLogisticRegression.cpp @@ -105,12 +105,12 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy) std::shared_ptr y{ new kp::Tensor({ 0, 0, 0, 1, 1 }) }; std::shared_ptr wIn{ new kp::Tensor( - wInVec, kp::Tensor::TensorTypes::eStaging) }; + wInVec, kp::Tensor::TensorTypes::eHost) }; std::shared_ptr wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 }) }; std::shared_ptr wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 }) }; std::shared_ptr bIn{ new kp::Tensor( - bInVec, kp::Tensor::TensorTypes::eStaging) }; + bInVec, kp::Tensor::TensorTypes::eHost) }; std::shared_ptr bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 }) }; std::shared_ptr lOut{ new kp::Tensor({ 0, 0, 0, 0, 0 }) }; diff --git a/test/TestManager.cpp b/test/TestManager.cpp index 3076b2a62..198e617af 100644 --- a/test/TestManager.cpp +++ b/test/TestManager.cpp @@ -126,7 +126,7 @@ TEST(TestManager, TestCreateInitTensor) EXPECT_EQ(tensorB->data(), std::vector({ 0, 1, 2 })); std::shared_ptr tensorC = - mgr.buildTensor({ 0, 0, 0 }, kp::Tensor::TensorTypes::eStaging); + mgr.buildTensor({ 0, 0, 0 }, kp::Tensor::TensorTypes::eHost); mgr.evalOpDefault({ tensorA, tensorC }); diff --git a/test/TestOpTensorCopy.cpp b/test/TestOpTensorCopy.cpp index ab5b67402..0e840cad6 100644 --- a/test/TestOpTensorCopy.cpp +++ 
b/test/TestOpTensorCopy.cpp @@ -58,7 +58,7 @@ TEST(TestOpTensorCopy, CopyDeviceToDeviceTensorMulti) EXPECT_EQ(tensorA->data(), tensorC->data()); } -TEST(TestOpTensorCopy, CopyDeviceToStagingTensor) +TEST(TestOpTensorCopy, CopyDeviceToHostTensor) { kp::Manager mgr; @@ -68,7 +68,7 @@ TEST(TestOpTensorCopy, CopyDeviceToStagingTensor) std::shared_ptr tensorA{ new kp::Tensor(testVecA) }; std::shared_ptr tensorB{ new kp::Tensor( - testVecB, kp::Tensor::TensorTypes::eStaging) }; + testVecB, kp::Tensor::TensorTypes::eHost) }; mgr.evalOpDefault({ tensorA, tensorB }); @@ -84,7 +84,7 @@ TEST(TestOpTensorCopy, CopyDeviceToStagingTensor) EXPECT_EQ(tensorA->data(), tensorB->data()); } -TEST(TestOpTensorCopy, CopyStagingToDeviceTensor) +TEST(TestOpTensorCopy, CopyHostToDeviceTensor) { kp::Manager mgr; @@ -93,7 +93,7 @@ TEST(TestOpTensorCopy, CopyStagingToDeviceTensor) std::vector testVecB{ 0, 0, 0 }; std::shared_ptr tensorA{ new kp::Tensor( - testVecA, kp::Tensor::TensorTypes::eStaging) }; + testVecA, kp::Tensor::TensorTypes::eHost) }; std::shared_ptr tensorB{ new kp::Tensor(testVecB) }; mgr.evalOpDefault({ tensorA, tensorB }); @@ -110,7 +110,7 @@ TEST(TestOpTensorCopy, CopyStagingToDeviceTensor) EXPECT_EQ(tensorA->data(), tensorB->data()); } -TEST(TestOpTensorCopy, CopyStagingToStagingTensor) +TEST(TestOpTensorCopy, CopyHostToHostTensor) { kp::Manager mgr; @@ -119,9 +119,9 @@ TEST(TestOpTensorCopy, CopyStagingToStagingTensor) std::vector testVecB{ 0, 0, 0 }; std::shared_ptr tensorA{ new kp::Tensor( - testVecA, kp::Tensor::TensorTypes::eStaging) }; + testVecA, kp::Tensor::TensorTypes::eHost) }; std::shared_ptr tensorB{ new kp::Tensor( - testVecB, kp::Tensor::TensorTypes::eStaging) }; + testVecB, kp::Tensor::TensorTypes::eHost) }; mgr.evalOpDefault({ tensorA, tensorB }); @@ -145,7 +145,7 @@ TEST(TestOpTensorCopy, SingleTensorShouldFail) std::vector testVecA{ 9, 8, 7 }; std::shared_ptr tensorA{ new kp::Tensor( - testVecA, kp::Tensor::TensorTypes::eStaging) }; + testVecA, 
kp::Tensor::TensorTypes::eHost) }; mgr.evalOpDefault({ tensorA }); diff --git a/test/TestTensor.cpp b/test/TestTensor.cpp index 42731bcfe..5e54e8585 100644 --- a/test/TestTensor.cpp +++ b/test/TestTensor.cpp @@ -17,9 +17,9 @@ TEST(TestTensor, CopyFromHostData) std::vector vecB{ 0, 0, 0 }; std::shared_ptr tensorA = - std::make_shared(vecA, kp::Tensor::TensorTypes::eStaging); + std::make_shared(vecA, kp::Tensor::TensorTypes::eHost); std::shared_ptr tensorB = - std::make_shared(vecB, kp::Tensor::TensorTypes::eStaging); + std::make_shared(vecB, kp::Tensor::TensorTypes::eHost); kp::Manager mgr; From 815acfa1fe2cbc223162f90ec3a9486ee6331841 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Mon, 8 Feb 2021 07:18:14 +0000 Subject: [PATCH 4/6] Updatd single include --- single_include/kompute/Kompute.hpp | 70 +++++++++++++++++++----------- 1 file changed, 45 insertions(+), 25 deletions(-) diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index 244a742f7..d388fa24b 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -723,7 +723,7 @@ class Tensor enum class TensorTypes { eDevice = 0, ///< Type is device memory, source and destination - eStaging = 1, ///< Type is host memory, source and destination + eHost = 1, ///< Type is host memory, source and destination eStorage = 2, ///< Type is Device memory (only) }; @@ -828,6 +828,26 @@ class Tensor std::shared_ptr copyFromTensor, bool createBarrier); + /** + * Records a copy from the internal staging memory to the device memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice. + * + * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param createBarrier Whether to create a barrier that ensures the data is + * copied before further operations. Default is true. 
+ */ + void recordCopyFromStagingToDevice(std::shared_ptr commandBuffer, + bool createBarrier); + + /** + * Records a copy from the internal device memory to the staging memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice. + * + * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param createBarrier Whether to create a barrier that ensures the data is + * copied before further operations. Default is true. + */ + void recordCopyFromDeviceToStaging(std::shared_ptr commandBuffer, + bool createBarrier); + /** * Records the buffer memory barrier into the command buffer which * ensures that relevant data transfers are carried out correctly. @@ -870,10 +890,14 @@ class Tensor std::shared_ptr mDevice; // -------------- OPTIONALLY OWNED RESOURCES - std::shared_ptr mBuffer; - bool mFreeBuffer; - std::shared_ptr mMemory; - bool mFreeMemory; + std::shared_ptr mPrimaryBuffer; + bool mFreePrimaryBuffer = false; + std::shared_ptr mStagingBuffer; + bool mFreeStagingBuffer = false; + std::shared_ptr mPrimaryMemory; + bool mFreePrimaryMemory = false; + std::shared_ptr mStagingMemory; + bool mFreeStagingMemory = false; // -------------- ALWAYS OWNED RESOURCES std::vector mData; @@ -883,11 +907,16 @@ class Tensor std::array mShape; bool mIsInit = false; - void createBuffer(); // Creates the vulkan buffer + void allocateMemoryCreateGPUResources(); // Creates the vulkan buffer + void createBuffer(std::shared_ptr buffer, vk::BufferUsageFlags bufferUsageFlags); + void allocateBindMemory(std::shared_ptr buffer, std::shared_ptr memory, vk::MemoryPropertyFlags memoryPropertyFlags); + void copyBuffer(std::shared_ptr commandBuffer, std::shared_ptr bufferFrom, std::shared_ptr bufferTo, vk::DeviceSize bufferSize, vk::BufferCopy copyRegion, bool createBarrier); // Private util functions - vk::BufferUsageFlags getBufferUsageFlags(); - vk::MemoryPropertyFlags getMemoryPropertyFlags(); + 
vk::BufferUsageFlags getPrimaryBufferUsageFlags(); + vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags(); + vk::BufferUsageFlags getStagingBufferUsageFlags(); + vk::MemoryPropertyFlags getStagingMemoryPropertyFlags(); uint64_t memorySize(); }; @@ -958,7 +987,7 @@ class OpBase if (tensor && tensor->isInit()) { tensor->freeMemoryDestroyGPUResources(); } else { - SPDLOG_ERROR("Kompute OpBase expected to free " + SPDLOG_WARN("Kompute OpBase expected to free " "tensor but has already been freed."); } } @@ -1264,8 +1293,6 @@ class OpTensorCreate : public OpBase virtual void postEval() override; private: - // Never owned resources - std::vector> mStagingTensors; }; } // End namespace kp @@ -1836,9 +1863,6 @@ class OpAlgoLhsRhsOut : public OpAlgoBase std::shared_ptr mTensorLHS; ///< Reference to the parameter used in the left hand side equation of the shader std::shared_ptr mTensorRHS; ///< Reference to the parameter used in the right hand side equation of the shader std::shared_ptr mTensorOutput; ///< Reference to the parameter used in the output of the shader and will be copied with a staging vector - - // -------------- ALWAYS OWNED RESOURCES - std::shared_ptr mTensorOutputStaging; ///< Staging temporary tensor user do to copy the output of the tensor }; } // End namespace kp @@ -1976,7 +2000,7 @@ class OpTensorCopy : public OpBase namespace kp { /** - Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the record (as opposed to during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging. + Operation that syncs tensor's device by mapping local data into the device memory. 
For TensorTypes::eDevice it will use a record operation for the memory to be synced into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eHost it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStorage. */ class OpTensorSyncDevice : public OpBase { @@ -2002,12 +2026,12 @@ class OpTensorSyncDevice : public OpBase ~OpTensorSyncDevice() override; /** - * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging. For staging tensors in host memory, the map is performed during the init function. + * Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element. */ void init() override; /** - * For device tensors, it records the copy command to the device tensor from the temporary staging tensor. + * For device tensors, it records the copy command for the tensor to copy the data from its staging to device memory. */ void record() override; @@ -2022,8 +2046,6 @@ class OpTensorSyncDevice : public OpBase virtual void postEval() override; private: - // Never owned resources - std::vector> mStagingTensors; }; } // End namespace kp @@ -2031,7 +2053,7 @@ class OpTensorSyncDevice : public OpBase namespace kp { /** - Operation that syncs tensor's local data by mapping the data from device memory into the local vector. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the postSubmit (there will be no copy during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging. + Operation that syncs tensor's local memory by mapping device data into the local CPU memory.
For TensorTypes::eDevice it will use a record operation for the device memory to be synced into the tensor's staging memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eHost it will only map the data from host memory into the local vector which will happen during postEval after the recorded commands have been executed. This operation won't have any effect on TensorTypes::eStorage. */ class OpTensorSyncLocal : public OpBase { @@ -2052,17 +2074,17 @@ class OpTensorSyncLocal : public OpBase std::vector> tensors); /** - * Default destructor. This class manages the memory of the staging tensors it owns but these are released in the postSubmit, before it arrives to the destructor. + * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release. */ ~OpTensorSyncLocal() override; /** - * Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging. + * Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element. */ void init() override; /** - * For device tensors, it records the copy command into the staging tensor from the device tensor. + * For device tensors, it records the copy command for the tensor to copy the data from its device to staging memory.
*/ void record() override; @@ -2077,8 +2099,6 @@ class OpTensorSyncLocal : public OpBase virtual void postEval() override; private: - // Never owned resources - std::vector> mStagingTensors; }; } // End namespace kp From d24dfb759073650bdedce2500bc670bcf559f705 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Mon, 8 Feb 2021 07:18:32 +0000 Subject: [PATCH 5/6] Reformat --- src/OpAlgoLhsRhsOut.cpp | 4 +- src/Tensor.cpp | 103 +++++++++++++++++++++------------ src/include/kompute/Tensor.hpp | 37 ++++++++---- test/TestOpTensorCreate.cpp | 11 ++-- 4 files changed, 98 insertions(+), 57 deletions(-) diff --git a/src/OpAlgoLhsRhsOut.cpp b/src/OpAlgoLhsRhsOut.cpp index 622a4f431..3b78fa7d9 100644 --- a/src/OpAlgoLhsRhsOut.cpp +++ b/src/OpAlgoLhsRhsOut.cpp @@ -106,8 +106,8 @@ OpAlgoLhsRhsOut::record() vk::PipelineStageFlagBits::eTransfer); if (this->mTensorOutput->tensorType() == Tensor::TensorTypes::eDevice) { - this->mTensorOutput->recordCopyFromDeviceToStaging( - this->mCommandBuffer, true); + this->mTensorOutput->recordCopyFromDeviceToStaging(this->mCommandBuffer, + true); } } diff --git a/src/Tensor.cpp b/src/Tensor.cpp index 726723def..f04165cf9 100644 --- a/src/Tensor.cpp +++ b/src/Tensor.cpp @@ -113,45 +113,65 @@ Tensor::recordCopyFrom(std::shared_ptr commandBuffer, SPDLOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", bufferSize); - this->copyBuffer(commandBuffer, copyFromTensor->mPrimaryBuffer, this->mPrimaryBuffer, bufferSize, copyRegion, createBarrier); - + this->copyBuffer(commandBuffer, + copyFromTensor->mPrimaryBuffer, + this->mPrimaryBuffer, + bufferSize, + copyRegion, + createBarrier); } void -Tensor::recordCopyFromStagingToDevice(std::shared_ptr commandBuffer, - bool createBarrier) +Tensor::recordCopyFromStagingToDevice( + std::shared_ptr commandBuffer, + bool createBarrier) { vk::DeviceSize bufferSize(this->memorySize()); vk::BufferCopy copyRegion(0, 0, bufferSize); SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize); - 
this->copyBuffer(commandBuffer, this->mStagingBuffer, this->mPrimaryBuffer, bufferSize, copyRegion, createBarrier); + this->copyBuffer(commandBuffer, + this->mStagingBuffer, + this->mPrimaryBuffer, + bufferSize, + copyRegion, + createBarrier); } void -Tensor::recordCopyFromDeviceToStaging(std::shared_ptr commandBuffer, - bool createBarrier) +Tensor::recordCopyFromDeviceToStaging( + std::shared_ptr commandBuffer, + bool createBarrier) { vk::DeviceSize bufferSize(this->memorySize()); vk::BufferCopy copyRegion(0, 0, bufferSize); SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize); - this->copyBuffer(commandBuffer, this->mPrimaryBuffer, this->mStagingBuffer, bufferSize, copyRegion, createBarrier); - + this->copyBuffer(commandBuffer, + this->mPrimaryBuffer, + this->mStagingBuffer, + bufferSize, + copyRegion, + createBarrier); } void -Tensor::copyBuffer(std::shared_ptr commandBuffer, std::shared_ptr bufferFrom, std::shared_ptr bufferTo, vk::DeviceSize bufferSize, vk::BufferCopy copyRegion, bool createBarrier) { +Tensor::copyBuffer(std::shared_ptr commandBuffer, + std::shared_ptr bufferFrom, + std::shared_ptr bufferTo, + vk::DeviceSize bufferSize, + vk::BufferCopy copyRegion, + bool createBarrier) +{ if (!this->mIsInit) { throw std::runtime_error( "Kompute Tensor attempted to run copyBuffer without init"); } - commandBuffer->copyBuffer( - *bufferFrom, *bufferTo, copyRegion); + commandBuffer->copyBuffer(*bufferFrom, *bufferTo, copyRegion); if (createBarrier) { // Buffer to ensure wait until data is copied to staging buffer @@ -209,8 +229,7 @@ Tensor::mapDataFromHostMemory() if (this->mTensorType == TensorTypes::eHost) { hostVisibleMemory = this->mPrimaryMemory; - } - else { + } else { hostVisibleMemory = this->mStagingMemory; } @@ -233,8 +252,7 @@ Tensor::mapDataIntoHostMemory() if (this->mTensorType == TensorTypes::eHost) { hostVisibleMemory = this->mPrimaryMemory; - } - else { + } else { hostVisibleMemory = this->mStagingMemory; } @@ -333,20 +351,26 @@ 
Tensor::allocateMemoryCreateGPUResources() SPDLOG_DEBUG("Kompute Tensor creating primary buffer and memory"); this->mPrimaryBuffer = std::make_shared(); - this->createBuffer(this->mPrimaryBuffer, this->getPrimaryBufferUsageFlags()); + this->createBuffer(this->mPrimaryBuffer, + this->getPrimaryBufferUsageFlags()); this->mFreePrimaryBuffer = true; this->mPrimaryMemory = std::make_shared(); - this->allocateBindMemory(this->mPrimaryBuffer, this->mPrimaryMemory, this->getPrimaryMemoryPropertyFlags()); + this->allocateBindMemory(this->mPrimaryBuffer, + this->mPrimaryMemory, + this->getPrimaryMemoryPropertyFlags()); this->mFreePrimaryMemory = true; if (this->mTensorType == TensorTypes::eDevice) { SPDLOG_DEBUG("Kompute Tensor creating staging buffer and memory"); this->mStagingBuffer = std::make_shared(); - this->createBuffer(this->mStagingBuffer, this->getStagingBufferUsageFlags()); + this->createBuffer(this->mStagingBuffer, + this->getStagingBufferUsageFlags()); this->mFreeStagingBuffer = true; this->mStagingMemory = std::make_shared(); - this->allocateBindMemory(this->mStagingBuffer, this->mStagingMemory, this->getStagingMemoryPropertyFlags()); + this->allocateBindMemory(this->mStagingBuffer, + this->mStagingMemory, + this->getStagingMemoryPropertyFlags()); this->mFreeStagingMemory = true; } @@ -354,15 +378,16 @@ Tensor::allocateMemoryCreateGPUResources() } void -Tensor::createBuffer(std::shared_ptr buffer, vk::BufferUsageFlags bufferUsageFlags) { - +Tensor::createBuffer(std::shared_ptr buffer, + vk::BufferUsageFlags bufferUsageFlags) +{ vk::DeviceSize bufferSize = this->memorySize(); - if(bufferSize<1){ - throw std::runtime_error("Kompute Tensor attempted to create a zero-sized buffer"); + if (bufferSize < 1) { + throw std::runtime_error( + "Kompute Tensor attempted to create a zero-sized buffer"); } - SPDLOG_DEBUG("Kompute Tensor creating buffer with memory size: {}, and " "usage flags: {}", @@ -376,11 +401,13 @@ Tensor::createBuffer(std::shared_ptr buffer, 
vk::BufferUsageFlags bu vk::SharingMode::eExclusive); this->mDevice->createBuffer(&bufferInfo, nullptr, buffer.get()); - } void -Tensor::allocateBindMemory(std::shared_ptr buffer, std::shared_ptr memory, vk::MemoryPropertyFlags memoryPropertyFlags) { +Tensor::allocateBindMemory(std::shared_ptr buffer, + std::shared_ptr memory, + vk::MemoryPropertyFlags memoryPropertyFlags) +{ SPDLOG_DEBUG("Kompute Tensor allocating and binding memory"); @@ -393,7 +420,8 @@ Tensor::allocateBindMemory(std::shared_ptr buffer, std::shared_ptr buffer, std::shared_ptrmDevice->allocateMemory( - &memoryAllocateInfo, nullptr, memory.get()); + this->mDevice->allocateMemory(&memoryAllocateInfo, nullptr, memory.get()); this->mDevice->bindBufferMemory(*buffer, *memory, 0); } @@ -434,8 +461,8 @@ Tensor::freeMemoryDestroyGPUResources() if (this->mFreePrimaryBuffer) { if (!this->mPrimaryBuffer) { - SPDLOG_ERROR( - "Kompose Tensor expected to destroy primary buffer but got null buffer"); + SPDLOG_ERROR("Kompose Tensor expected to destroy primary buffer " + "but got null buffer"); } else { SPDLOG_DEBUG("Kompose Tensor destroying primary buffer"); this->mDevice->destroy( @@ -447,8 +474,8 @@ Tensor::freeMemoryDestroyGPUResources() if (this->mFreeStagingBuffer) { if (!this->mStagingBuffer) { - SPDLOG_ERROR( - "Kompose Tensor expected to destroy staging buffer but got null buffer"); + SPDLOG_ERROR("Kompose Tensor expected to destroy staging buffer " + "but got null buffer"); } else { SPDLOG_DEBUG("Kompose Tensor destroying staging buffer"); this->mDevice->destroy( @@ -460,8 +487,8 @@ Tensor::freeMemoryDestroyGPUResources() if (this->mFreePrimaryMemory) { if (!this->mPrimaryMemory) { - SPDLOG_ERROR( - "Kompose Tensor expected to free primary memory but got null memory"); + SPDLOG_ERROR("Kompose Tensor expected to free primary memory but " + "got null memory"); } else { SPDLOG_DEBUG("Kompose Tensor freeing primary memory"); this->mDevice->freeMemory( @@ -473,8 +500,8 @@ 
Tensor::freeMemoryDestroyGPUResources() if (this->mFreeStagingMemory) { if (!this->mStagingMemory) { - SPDLOG_ERROR( - "Kompose Tensor expected to free staging memory but got null memory"); + SPDLOG_ERROR("Kompose Tensor expected to free staging memory but " + "got null memory"); } else { SPDLOG_DEBUG("Kompose Tensor freeing staging memory"); this->mDevice->freeMemory( diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp index 09ae89fd3..5d9fb07df 100644 --- a/src/include/kompute/Tensor.hpp +++ b/src/include/kompute/Tensor.hpp @@ -26,7 +26,7 @@ class Tensor enum class TensorTypes { eDevice = 0, ///< Type is device memory, source and destination - eHost = 1, ///< Type is host memory, source and destination + eHost = 1, ///< Type is host memory, source and destination eStorage = 2, ///< Type is Device memory (only) }; @@ -39,7 +39,8 @@ class Tensor * Default constructor with data provided which would be used to create the * respective vulkan buffer and memory. * - * @param data Non-zero-sized vector of data that will be used by the tensor + * @param data Non-zero-sized vector of data that will be used by the + * tensor * @param tensorType Type for the tensor which is of type TensorTypes */ Tensor(const std::vector& data, @@ -132,24 +133,30 @@ class Tensor bool createBarrier); /** - * Records a copy from the internal staging memory to the device memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice. + * Records a copy from the internal staging memory to the device memory + * using an optional barrier to wait for the operation. This function would + * only be relevant for kp::Tensors of type eDevice. * * @param commandBuffer Vulkan Command Buffer to record the commands into * @param createBarrier Whether to create a barrier that ensures the data is * copied before further operations. Default is true. 
*/ - void recordCopyFromStagingToDevice(std::shared_ptr commandBuffer, - bool createBarrier); + void recordCopyFromStagingToDevice( + std::shared_ptr commandBuffer, + bool createBarrier); /** - * Records a copy from the internal device memory to the staging memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice. + * Records a copy from the internal device memory to the staging memory + * using an optional barrier to wait for the operation. This function would + * only be relevant for kp::Tensors of type eDevice. * * @param commandBuffer Vulkan Command Buffer to record the commands into * @param createBarrier Whether to create a barrier that ensures the data is * copied before further operations. Default is true. */ - void recordCopyFromDeviceToStaging(std::shared_ptr commandBuffer, - bool createBarrier); + void recordCopyFromDeviceToStaging( + std::shared_ptr commandBuffer, + bool createBarrier); /** * Records the buffer memory barrier into the command buffer which @@ -211,9 +218,17 @@ class Tensor bool mIsInit = false; void allocateMemoryCreateGPUResources(); // Creates the vulkan buffer - void createBuffer(std::shared_ptr buffer, vk::BufferUsageFlags bufferUsageFlags); - void allocateBindMemory(std::shared_ptr buffer, std::shared_ptr memory, vk::MemoryPropertyFlags memoryPropertyFlags); - void copyBuffer(std::shared_ptr commandBuffer, std::shared_ptr bufferFrom, std::shared_ptr bufferTo, vk::DeviceSize bufferSize, vk::BufferCopy copyRegion, bool createBarrier); + void createBuffer(std::shared_ptr buffer, + vk::BufferUsageFlags bufferUsageFlags); + void allocateBindMemory(std::shared_ptr buffer, + std::shared_ptr memory, + vk::MemoryPropertyFlags memoryPropertyFlags); + void copyBuffer(std::shared_ptr commandBuffer, + std::shared_ptr bufferFrom, + std::shared_ptr bufferTo, + vk::DeviceSize bufferSize, + vk::BufferCopy copyRegion, + bool createBarrier); // Private util functions 
vk::BufferUsageFlags getPrimaryBufferUsageFlags(); diff --git a/test/TestOpTensorCreate.cpp b/test/TestOpTensorCreate.cpp index 1281e1be2..f0ba87433 100644 --- a/test/TestOpTensorCreate.cpp +++ b/test/TestOpTensorCreate.cpp @@ -114,7 +114,6 @@ TEST(TestOpTensorCreate, NoErrorIfTensorFreedBefore) EXPECT_FALSE(tensorB->isInit()); } - TEST(TestOpTensorCreate, ExceptionOnZeroSizeTensor) { std::vector testVecA; @@ -123,11 +122,11 @@ TEST(TestOpTensorCreate, ExceptionOnZeroSizeTensor) kp::Manager mgr; - try{ + try { mgr.evalOpDefault({ tensorA }); - } catch( const std::runtime_error& err ) { - // check exception - ASSERT_TRUE( std::string(err.what()).find("zero-sized") != std::string::npos ); + } catch (const std::runtime_error& err) { + // check exception + ASSERT_TRUE(std::string(err.what()).find("zero-sized") != + std::string::npos); } - } From bf401019c9564b995ae2738dc3fe676f470bbcba Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Mon, 8 Feb 2021 07:29:34 +0000 Subject: [PATCH 6/6] Updated python --- python/src/docstrings.hpp | 14 +++++++------- python/src/main.cpp | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/python/src/docstrings.hpp b/python/src/docstrings.hpp index 37f3ff785..6b3a1dc78 100644 --- a/python/src/docstrings.hpp +++ b/python/src/docstrings.hpp @@ -513,10 +513,10 @@ function not in the record function.)doc"; static const char *__doc_kp_OpTensorSyncDevice = R"doc(Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a staging tensor -to perform the copy. For TensorTypes::eStaging it will only copy the +to perform the copy. For TensorTypes::eHost it will only copy the data and perform a map, which will be executed during the record (as opposed to during the sequence eval/submit). 
This function cannot be -carried out for TensorTypes::eStaging.)doc"; +carried out for TensorTypes::eHost.)doc"; static const char *__doc_kp_OpTensorSyncDevice_OpTensorSyncDevice = R"doc()doc"; @@ -533,7 +533,7 @@ queues @param device Vulkan logical device for passing to Algorithm static const char *__doc_kp_OpTensorSyncDevice_init = R"doc(Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of -type TensorTpes::eStaging. For staging tensors in host memory, the map +type TensorTpes::eHost. For staging tensors in host memory, the map is performed during the init function.)doc"; static const char *__doc_kp_OpTensorSyncDevice_mStagingTensors = R"doc()doc"; @@ -549,11 +549,11 @@ from the temporary staging tensor.)doc"; static const char *__doc_kp_OpTensorSyncLocal = R"doc(Operation that syncs tensor's local data by mapping the data from device memory into the local vector. For TensorTypes::eDevice it will -use a staging tensor to perform the copy. For TensorTypes::eStaging it +use a staging tensor to perform the copy. For TensorTypes::eHost it will only copy the data and perform a map, which will be executed during the postSubmit (there will be no copy during the sequence eval/submit). 
This function cannot be carried out for -TensorTypes::eStaging.)doc"; +TensorTypes::eHost.)doc"; static const char *__doc_kp_OpTensorSyncLocal_OpTensorSyncLocal = R"doc()doc"; @@ -570,7 +570,7 @@ queues @param device Vulkan logical device for passing to Algorithm static const char *__doc_kp_OpTensorSyncLocal_init = R"doc(Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of -type TensorTpes::eStaging.)doc"; +type TensorTpes::eHost.)doc"; static const char *__doc_kp_OpTensorSyncLocal_mStagingTensors = R"doc()doc"; @@ -719,7 +719,7 @@ shader storage).)doc"; static const char *__doc_kp_Tensor_TensorTypes_eDevice = R"doc(< Type is device memory, source and destination)doc"; -static const char *__doc_kp_Tensor_TensorTypes_eStaging = R"doc(< Type is host memory, source and destination)doc"; +static const char *__doc_kp_Tensor_TensorTypes_eHost = R"doc(< Type is host memory, source and destination)doc"; static const char *__doc_kp_Tensor_TensorTypes_eStorage = R"doc(< Type is Device memory (only))doc"; diff --git a/python/src/main.cpp b/python/src/main.cpp index 09368f2cd..6e795fad5 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -26,7 +26,7 @@ PYBIND11_MODULE(kp, m) { py::enum_(m, "TensorTypes", DOC(kp, Tensor, TensorTypes)) .value("device", kp::Tensor::TensorTypes::eDevice, "Tensor holding data in GPU memory.") - .value("staging", kp::Tensor::TensorTypes::eStaging, "Tensor used for transfer of data to device.") + .value("host", kp::Tensor::TensorTypes::eHost, "Tensor used for CPU visible GPU data.") .value("storage", kp::Tensor::TensorTypes::eStorage, "Tensor with host visible gpu memory.") .export_values(); @@ -112,7 +112,7 @@ PYBIND11_MODULE(kp, m) { .def("record_tensor_sync_device", &kp::Sequence::record, "Records operation to sync tensor from local memory to GPU memory") .def("record_tensor_sync_local", &kp::Sequence::record, - "Records operation to sync tensor(s) from GPU 
memory to local memory using staging tensors") + "Records operation to sync tensor(s) from GPU memory to local memory") .def("record_algo_mult", &kp::Sequence::record, "Records operation to run multiplication compute shader to two input tensors and an output tensor") .def("record_algo_file", [](kp::Sequence &self, @@ -179,7 +179,7 @@ PYBIND11_MODULE(kp, m) { .def("eval_tensor_sync_device_def", &kp::Manager::evalOpDefault, "Evaluates operation to sync tensor from local memory to GPU memory with new anonymous Sequence") .def("eval_tensor_sync_local_def", &kp::Manager::evalOpDefault, - "Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with new anonymous Sequence") + "Evaluates operation to sync tensor(s) from GPU memory to local memory with new anonymous Sequence") .def("eval_algo_mult_def", &kp::Manager::evalOpDefault, "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with new anonymous Sequence") .def("eval_algo_file_def", &kp::Manager::evalOpDefault, @@ -216,7 +216,7 @@ PYBIND11_MODULE(kp, m) { .def("eval_tensor_sync_device", &kp::Manager::evalOp, "Evaluates operation to sync tensor from local memory to GPU memory with explicitly named Sequence") .def("eval_tensor_sync_local", &kp::Manager::evalOp, - "Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence") + "Evaluates operation to sync tensor(s) from GPU memory to local memory with explicitly named Sequence") .def("eval_algo_mult", &kp::Manager::evalOp, "Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence") .def("eval_algo_file", &kp::Manager::evalOp, @@ -256,7 +256,7 @@ PYBIND11_MODULE(kp, m) { .def("eval_async_tensor_sync_device_def", &kp::Manager::evalOpAsyncDefault, "Evaluates asynchronously operation to sync tensor from local memory to GPU memory with anonymous Sequence") 
.def("eval_async_tensor_sync_local_def", &kp::Manager::evalOpAsyncDefault, - "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with anonymous Sequence") + "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory with anonymous Sequence") .def("eval_async_algo_mult_def", &kp::Manager::evalOpAsyncDefault, "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with anonymous Sequence") .def("eval_async_algo_file_def", &kp::Manager::evalOpAsyncDefault, @@ -293,7 +293,7 @@ PYBIND11_MODULE(kp, m) { .def("eval_async_tensor_sync_device", &kp::Manager::evalOpAsync, "Evaluates asynchronously operation to sync tensor from local memory to GPU memory with explicitly named Sequence") .def("eval_async_tensor_sync_local", &kp::Manager::evalOpAsync, - "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence") + "Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory with explicitly named Sequence") .def("eval_async_algo_mult", &kp::Manager::evalOpAsync, "Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence") .def("eval_async_algo_file", &kp::Manager::evalOpAsync,