From 263f392cbb3385421f989b8ff21ec7955a3c6c63 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Tue, 9 Mar 2021 08:06:27 +0000 Subject: [PATCH 1/5] Updated memory barriers to include staging buffers --- single_include/kompute/Kompute.hpp | 271 +++++++++++++++-------------- src/OpAlgoDispatch.cpp | 2 +- src/OpTensorCopy.cpp | 2 +- src/OpTensorSyncDevice.cpp | 3 +- src/OpTensorSyncLocal.cpp | 10 +- src/Tensor.cpp | 65 ++++--- src/include/kompute/Tensor.hpp | 52 +++--- 7 files changed, 230 insertions(+), 175 deletions(-) diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index 9b41e1ead..fc7d697fb 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -741,7 +741,6 @@ namespace kp { class Shader { public: - // The default resource limit for the GLSL compiler, can be overwritten // Has been adopted by: // https://github.com/KhronosGroup/glslang/blob/master/StandAlone/ResourceLimits.cpp @@ -888,12 +887,9 @@ class Tensor * * @param commandBuffer Vulkan Command Buffer to record the commands into * @param copyFromTensor Tensor to copy the data from - * @param createBarrier Whether to create a barrier that ensures the data is - * copied before further operations. Default is true. */ void recordCopyFrom(const vk::CommandBuffer& commandBuffer, - std::shared_ptr copyFromTensor, - bool createBarrier); + std::shared_ptr copyFromTensor); /** * Records a copy from the internal staging memory to the device memory @@ -901,11 +897,8 @@ class Tensor * only be relevant for kp::Tensors of type eDevice. * * @param commandBuffer Vulkan Command Buffer to record the commands into - * @param createBarrier Whether to create a barrier that ensures the data is - * copied before further operations. Default is true. */ - void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer, - bool createBarrier); + void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer); /** * Records a copy from the internal device memory to the staging memory @@ -913,14 +906,11 @@ class Tensor * only be relevant for kp::Tensors of type eDevice. * * @param commandBuffer Vulkan Command Buffer to record the commands into - * @param createBarrier Whether to create a barrier that ensures the data is - * copied before further operations. Default is true. */ - void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer, - bool createBarrier); + void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer); /** - * Records the buffer memory barrier into the command buffer which + * Records the buffer memory barrier into the primary buffer and command buffer which * ensures that relevant data transfers are carried out correctly. * * @param commandBuffer Vulkan Command Buffer to record the commands into @@ -929,11 +919,26 @@ class Tensor * @param scrStageMask Pipeline stage flags for source stage mask * @param dstStageMask Pipeline stage flags for destination stage mask */ - void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, - vk::AccessFlagBits srcAccessMask, - vk::AccessFlagBits dstAccessMask, - vk::PipelineStageFlagBits srcStageMask, - vk::PipelineStageFlagBits dstStageMask); + void recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, + vk::AccessFlagBits srcAccessMask, + vk::AccessFlagBits dstAccessMask, + vk::PipelineStageFlagBits srcStageMask, + vk::PipelineStageFlagBits dstStageMask); + /** + * Records the buffer memory barrier into the staging buffer and command buffer which + * ensures that relevant data transfers are carried out correctly. + * + * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param srcAccessMask Access flags for source access mask + * @param dstAccessMask Access flags for destination access mask + * @param scrStageMask Pipeline stage flags for source stage mask + * @param dstStageMask Pipeline stage flags for destination stage mask + */ + void recordStagingBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, + vk::AccessFlagBits srcAccessMask, + vk::AccessFlagBits dstAccessMask, + vk::PipelineStageFlagBits srcStageMask, + vk::PipelineStageFlagBits dstStageMask); /** * Constructs a vulkan descriptor buffer info which can be used to specify @@ -951,41 +956,33 @@ class Tensor * @return Unsigned integer representing the total number of elements */ // TODO: move to cpp - uint32_t size() { - return this->mSize; - } + uint32_t size() { return this->mSize; } // TODO: move to cpp - uint32_t dataTypeMemorySize() { - return this->mDataTypeMemorySize; - } + uint32_t dataTypeMemorySize() { return this->mDataTypeMemorySize; } // TODO: move to cpp - uint32_t memorySize() { - return this->mSize * this->mDataTypeMemorySize; - } + uint32_t memorySize() { return this->mSize * this->mDataTypeMemorySize; } /** * Retrieve the underlying data type of the Tensor * * @return Data type of tensor of type kp::Tensor::TensorDataTypes */ - TensorDataTypes dataType() { - return this->mDataType; - } + TensorDataTypes dataType() { return this->mDataType; } - void* rawData() { - return this->mRawData; - } + void* rawData() { return this->mRawData; } // TODO: move to cpp - template - T* data() { + template + T* data() + { return (T*)this->mRawData; } - template - std::vector vector() { + template + std::vector vector() + { return { (T*)this->mRawData, ((T*)this->mRawData) + this->size() }; } @@ -993,9 +990,9 @@ class Tensor * Sets / resets the vector data of the tensor. This function does not * perform any copies into GPU memory and is only performed on the host. */ - void setRawData(const void* data) + void setRawData(const void* data) { - // Copy data + // Copy data memcpy(this->mRawData, data, this->memorySize()); } @@ -1008,7 +1005,8 @@ class Tensor void* mRawData; private: - void mapRawData() { + void mapRawData() + { KP_LOG_DEBUG("Kompute Tensor mapping data from host buffer"); @@ -1026,14 +1024,17 @@ class Tensor vk::DeviceSize bufferSize = this->memorySize(); - // Given we request coherent host memory we don't need to invalidate / flush + // Given we request coherent host memory we don't need to invalidate / + // flush this->mRawData = this->mDevice->mapMemory( *hostVisibleMemory, 0, bufferSize, vk::MemoryMapFlags()); - vk::MappedMemoryRange mappedMemoryRange(*hostVisibleMemory, 0, bufferSize); + vk::MappedMemoryRange mappedMemoryRange( + *hostVisibleMemory, 0, bufferSize); } - void unmapRawData() { + void unmapRawData() + { KP_LOG_DEBUG("Kompute Tensor mapping data from host buffer"); @@ -1079,57 +1080,59 @@ class Tensor std::shared_ptr bufferFrom, std::shared_ptr bufferTo, vk::DeviceSize bufferSize, - vk::BufferCopy copyRegion, - bool createBarrier); + vk::BufferCopy copyRegion); + void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, + const vk::Buffer& buffer, + vk::AccessFlagBits srcAccessMask, + vk::AccessFlagBits dstAccessMask, + vk::PipelineStageFlagBits srcStageMask, + vk::PipelineStageFlagBits dstStageMask); // Private util functions vk::BufferUsageFlags getPrimaryBufferUsageFlags(); vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags(); vk::BufferUsageFlags getStagingBufferUsageFlags(); vk::MemoryPropertyFlags getStagingMemoryPropertyFlags(); - }; // TODO: Limit T to be only float, bool, double, etc -template -class TensorT: public Tensor +template +class TensorT : public Tensor { public: TensorT(std::shared_ptr physicalDevice, - std::shared_ptr device, - const std::vector& data, - const TensorTypes& tensorType = TensorTypes::eDevice) - : Tensor(physicalDevice, - device, - (void*)data.data(), - data.size(), - sizeof(T), - this->dataType(), - tensorType) + std::shared_ptr device, + const std::vector& data, + const TensorTypes& tensorType = TensorTypes::eDevice) + : Tensor(physicalDevice, + device, + (void*)data.data(), + data.size(), + sizeof(T), + this->dataType(), + tensorType) { - KP_LOG_DEBUG("Kompute TensorT constructor with data size {}", data.size()); + KP_LOG_DEBUG("Kompute TensorT constructor with data size {}", + data.size()); } - ~TensorT() { - KP_LOG_DEBUG("Kompute TensorT destructor"); - } + ~TensorT() { KP_LOG_DEBUG("Kompute TensorT destructor"); } - T* data() { - return (T*)this->mRawData; - } + T* data() { return (T*)this->mRawData; } - std::vector vector() { + std::vector vector() + { return { (T*)this->mRawData, ((T*)this->mRawData) + this->size() }; } - T& operator[](int index) { - return *(((T*)this->mRawData) + index); - } + T& operator[](int index) { return *(((T*)this->mRawData) + index); } - void setData(const std::vector& data) { + void setData(const std::vector& data) + { - KP_LOG_DEBUG("Kompute TensorT setting data with data size {}", data.size()); + KP_LOG_DEBUG("Kompute TensorT setting data with data size {}", + data.size()); if (data.size() != this->mSize) { throw std::runtime_error( @@ -1140,7 +1143,6 @@ class TensorT: public Tensor } TensorDataTypes dataType(); - }; } // End namespace kp @@ -1159,15 +1161,17 @@ class Algorithm * the underlying resources. * * @param device The Vulkan device to use for creating resources - * @param tensors (optional) The tensors to use to create the descriptor resources + * @param tensors (optional) The tensors to use to create the descriptor + * resources * @param spirv (optional) The spirv code to use to create the algorithm - * @param workgroup (optional) The kp::Workgroup to use for the dispatch which defaults to - * kp::Workgroup(tensor[0].size(), 1, 1) if not set. - * @param specializationConstants (optional) The kp::Constants to use to initialize - * the specialization constants which cannot be changed once set. - * @param pushConstants (optional) The kp::Constants to use when initializing the - * pipeline, which set the size of the push constants - these can be modified but - * all new values must have the same vector size as this initial value. + * @param workgroup (optional) The kp::Workgroup to use for the dispatch + * which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set. + * @param specializationConstants (optional) The kp::Constants to use to + * initialize the specialization constants which cannot be changed once set. + * @param pushConstants (optional) The kp::Constants to use when + * initializing the pipeline, which set the size of the push constants - + * these can be modified but all new values must have the same vector size + * as this initial value. */ Algorithm(std::shared_ptr device, const std::vector>& tensors = {}, @@ -1177,18 +1181,19 @@ class Algorithm const Constants& pushConstants = {}); /** - * Rebuild function to reconstruct algorithm with configuration parameters to create - * the underlying resources. + * Rebuild function to reconstruct algorithm with configuration parameters + * to create the underlying resources. * * @param tensors The tensors to use to create the descriptor resources * @param spirv The spirv code to use to create the algorithm - * @param workgroup (optional) The kp::Workgroup to use for the dispatch which defaults to - * kp::Workgroup(tensor[0].size(), 1, 1) if not set. - * @param specializationConstants (optional) The kp::Constants to use to initialize - * the specialization constants which cannot be changed once set. - * @param pushConstants (optional) The kp::Constants to use when initializing the - * pipeline, which set the size of the push constants - these can be modified but - * all new values must have the same vector size as this initial value. + * @param workgroup (optional) The kp::Workgroup to use for the dispatch + * which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set. + * @param specializationConstants (optional) The kp::Constants to use to + * initialize the specialization constants which cannot be changed once set. + * @param pushConstants (optional) The kp::Constants to use when + * initializing the pipeline, which set the size of the push constants - + * these can be modified but all new values must have the same vector size + * as this initial value. */ void rebuild(const std::vector>& tensors, const std::vector& spirv, @@ -1211,25 +1216,26 @@ class Algorithm void recordDispatch(const vk::CommandBuffer& commandBuffer); /** - * Records command that binds the "core" algorithm components which consist of - * binding the pipeline and binding the descriptorsets. + * Records command that binds the "core" algorithm components which consist + * of binding the pipeline and binding the descriptorsets. * * @param commandBuffer Command buffer to record the algorithm resources to */ void recordBindCore(const vk::CommandBuffer& commandBuffer); /** - * Records command that binds the push constants to the command buffer provided - * - it is required that the pushConstants provided are of the same size as the - * ones provided during initialization. + * Records command that binds the push constants to the command buffer + * provided + * - it is required that the pushConstants provided are of the same size as + * the ones provided during initialization. * * @param commandBuffer Command buffer to record the algorithm resources to */ void recordBindPush(const vk::CommandBuffer& commandBuffer); /** - * function that checks all the gpu resource components to verify if these have - * been created and returns true if all are valid. + * function that checks all the gpu resource components to verify if these + * have been created and returns true if all are valid. * * @returns returns true if the algorithm is currently initialized. */ @@ -1238,26 +1244,28 @@ class Algorithm /** * Sets the work group to use in the recordDispatch * - * @param workgroup The kp::Workgroup value to use to update the algorithm. It - * must have a value greater than 1 on the x value (index 1) otherwise it will - * be initialized on the size of the first tensor (ie. this->mTensor[0]->size()) + * @param workgroup The kp::Workgroup value to use to update the algorithm. + * It must have a value greater than 1 on the x value (index 1) otherwise it + * will be initialized on the size of the first tensor (ie. + * this->mTensor[0]->size()) */ void setWorkgroup(const Workgroup& workgroup, uint32_t minSize = 1); /** - * Sets the push constants to the new value provided to use in the next bindPush() + * Sets the push constants to the new value provided to use in the next + * bindPush() * - * @param The kp::Constant to use to set the push constants to use in the next - * bindPush(...) calls. The constants provided must be of the same size as the - * ones created during initialization. + * @param The kp::Constant to use to set the push constants to use in the + * next bindPush(...) calls. The constants provided must be of the same size + * as the ones created during initialization. */ void setPush(const Constants& pushConstants); /** * Gets the current workgroup from the algorithm. * - * @param The kp::Constant to use to set the push constants to use in the next - * bindPush(...) calls. The constants provided must be of the same size as the - * ones created during initialization. + * @param The kp::Constant to use to set the push constants to use in the + * next bindPush(...) calls. The constants provided must be of the same size + * as the ones created during initialization. */ const Workgroup& getWorkgroup(); /** @@ -1690,8 +1698,8 @@ class Sequence : public std::enable_shared_from_this * function also requires the Sequence to be recording, otherwise it will * not be able to add the operation. * - * @param op Object derived from kp::BaseOp that will be recoreded by the sequence - * which will be used when the operation is evaluated. + * @param op Object derived from kp::BaseOp that will be recoreded by the + * sequence which will be used when the operation is evaluated. * @return shared_ptr of the Sequence class itself */ std::shared_ptr record(std::shared_ptr op); @@ -1709,7 +1717,8 @@ class Sequence : public std::enable_shared_from_this */ template std::shared_ptr record( - std::vector> tensors, TArgs&&... params) + std::vector> tensors, + TArgs&&... params) { std::shared_ptr op{ new T(tensors, std::forward(params)...) }; return this->record(op); @@ -1744,8 +1753,9 @@ class Sequence : public std::enable_shared_from_this std::shared_ptr eval(); /** - * Resets all the recorded and stored operations, records the operation - * provided and submits into the gpu as a submit job synchronously (with a barrier). + * Resets all the recorded and stored operations, records the operation + * provided and submits into the gpu as a submit job synchronously (with a + * barrier). * * @return shared_ptr of the Sequence class itself */ @@ -1788,16 +1798,18 @@ class Sequence : public std::enable_shared_from_this /** * Eval Async sends all the recorded and stored operations in the vector of - * operations into the gpu as a submit job without a barrier. EvalAwait() must - * ALWAYS be called after to ensure the sequence is terminated correctly. + * operations into the gpu as a submit job without a barrier. EvalAwait() + * must ALWAYS be called after to ensure the sequence is terminated + * correctly. * * @return Boolean stating whether execution was successful. */ std::shared_ptr evalAsync(); /** * Clears currnet operations to record provided one in the vector of - * operations into the gpu as a submit job without a barrier. EvalAwait() must - * ALWAYS be called after to ensure the sequence is terminated correctly. + * operations into the gpu as a submit job without a barrier. EvalAwait() + * must ALWAYS be called after to ensure the sequence is terminated + * correctly. * * @return Boolean stating whether execution was successful. */ @@ -1891,9 +1903,9 @@ class Sequence : public std::enable_shared_from_this bool isInit(); /** - * Clears command buffer and triggers re-record of all the current operations - * saved, which is useful if the underlying kp::Tensors or kp::Algorithms - * are modified and need to be re-recorded. + * Clears command buffer and triggers re-record of all the current + * operations saved, which is useful if the underlying kp::Tensors or + * kp::Algorithms are modified and need to be re-recorded. */ void rerecord(); @@ -1961,13 +1973,14 @@ class Manager Manager(); /** - * Similar to base constructor but allows for further configuration to use when - * creating the Vulkan resources. + * Similar to base constructor but allows for further configuration to use + * when creating the Vulkan resources. * * @param physicalDeviceIndex The index of the physical device to use * @param familyQueueIndices (Optional) List of queue indices to add for * explicit allocation - * @param desiredExtensions The desired extensions to load from physicalDevice + * @param desiredExtensions The desired extensions to load from + * physicalDevice */ Manager(uint32_t physicalDeviceIndex, const std::vector& familyQueueIndices = {}, @@ -2001,7 +2014,8 @@ class Manager * If zero (default), disables latching of timestamps. * @returns Shared pointer with initialised sequence */ - std::shared_ptr sequence(uint32_t queueIndex = 0, uint32_t totalTimestamps = 0); + std::shared_ptr sequence(uint32_t queueIndex = 0, + uint32_t totalTimestamps = 0); /** * Create a managed tensor that will be destroyed by this manager @@ -2011,7 +2025,7 @@ class Manager * @param tensorType The type of tensor to initialize * @returns Shared pointer with initialised tensor */ - template + template std::shared_ptr> tensorT( const std::vector& data, Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice) @@ -2042,8 +2056,13 @@ class Manager const Tensor::TensorDataTypes& dataType, Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice) { - std::shared_ptr tensor{ new kp::Tensor( - this->mPhysicalDevice, this->mDevice, data, elementTotalCount, elementMemorySize, dataType, tensorType) }; + std::shared_ptr tensor{ new kp::Tensor(this->mPhysicalDevice, + this->mDevice, + data, + elementTotalCount, + elementMemorySize, + dataType, + tensorType) }; if (this->mManageResources) { this->mManagedTensors.push_back(tensor); diff --git a/src/OpAlgoDispatch.cpp b/src/OpAlgoDispatch.cpp index 44908adb3..94502cc76 100644 --- a/src/OpAlgoDispatch.cpp +++ b/src/OpAlgoDispatch.cpp @@ -26,7 +26,7 @@ OpAlgoDispatch::record(const vk::CommandBuffer& commandBuffer) // Barrier to ensure the data is finished writing to buffer memory for (const std::shared_ptr& tensor : this->mAlgorithm->getTensors()) { - tensor->recordBufferMemoryBarrier( + tensor->recordPrimaryBufferMemoryBarrier( commandBuffer, vk::AccessFlagBits::eHostWrite, vk::AccessFlagBits::eShaderRead, diff --git a/src/OpTensorCopy.cpp b/src/OpTensorCopy.cpp index 33b9eb838..13e189a58 100644 --- a/src/OpTensorCopy.cpp +++ b/src/OpTensorCopy.cpp @@ -45,7 +45,7 @@ OpTensorCopy::record(const vk::CommandBuffer& commandBuffer) // We iterate from the second tensor onwards and record a copy to all for (size_t i = 1; i < this->mTensors.size(); i++) { this->mTensors[i]->recordCopyFrom( - commandBuffer, this->mTensors[0], false); + commandBuffer, this->mTensors[0]); } } diff --git a/src/OpTensorSyncDevice.cpp b/src/OpTensorSyncDevice.cpp index cd887c148..ad071d8d3 100644 --- a/src/OpTensorSyncDevice.cpp +++ b/src/OpTensorSyncDevice.cpp @@ -30,8 +30,7 @@ OpTensorSyncDevice::record(const vk::CommandBuffer& commandBuffer) for (size_t i = 0; i < this->mTensors.size(); i++) { if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { - this->mTensors[i]->recordCopyFromStagingToDevice(commandBuffer, - false); + this->mTensors[i]->recordCopyFromStagingToDevice(commandBuffer); } } } diff --git a/src/OpTensorSyncLocal.cpp b/src/OpTensorSyncLocal.cpp index f7e15ffd5..70642b9de 100644 --- a/src/OpTensorSyncLocal.cpp +++ b/src/OpTensorSyncLocal.cpp @@ -30,8 +30,14 @@ OpTensorSyncLocal::record(const vk::CommandBuffer& commandBuffer) for (size_t i = 0; i < this->mTensors.size(); i++) { if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { - this->mTensors[i]->recordCopyFromDeviceToStaging(commandBuffer, - true); + + this->mTensors[i]->recordCopyFromDeviceToStaging(commandBuffer); + + this->mTensors[i]->recordStagingBufferMemoryBarrier(commandBuffer, + vk::AccessFlagBits::eTransferWrite, + vk::AccessFlagBits::eHostRead, + vk::PipelineStageFlagBits::eTransfer, + vk::PipelineStageFlagBits::eHost); } } } diff --git a/src/Tensor.cpp b/src/Tensor.cpp index 90c21fc8a..7e7af4ca4 100644 --- a/src/Tensor.cpp +++ b/src/Tensor.cpp @@ -70,8 +70,7 @@ Tensor::isInit() void Tensor::recordCopyFrom(const vk::CommandBuffer& commandBuffer, - std::shared_ptr copyFromTensor, - bool createBarrier) + std::shared_ptr copyFromTensor) { vk::DeviceSize bufferSize(this->memorySize()); @@ -83,13 +82,11 @@ Tensor::recordCopyFrom(const vk::CommandBuffer& commandBuffer, copyFromTensor->mPrimaryBuffer, this->mPrimaryBuffer, bufferSize, - copyRegion, - createBarrier); + copyRegion); } void -Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer, - bool createBarrier) +Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer) { vk::DeviceSize bufferSize(this->memorySize()); vk::BufferCopy copyRegion(0, 0, bufferSize); @@ -100,13 +97,11 @@ Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer, this->mStagingBuffer, this->mPrimaryBuffer, bufferSize, - copyRegion, - createBarrier); + copyRegion); } void -Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer, - bool createBarrier) +Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer) { vk::DeviceSize bufferSize(this->memorySize()); vk::BufferCopy copyRegion(0, 0, bufferSize); @@ -117,8 +112,7 @@ Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer, this->mPrimaryBuffer, this->mStagingBuffer, bufferSize, - copyRegion, - createBarrier); + copyRegion); } void @@ -126,24 +120,49 @@ Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer, std::shared_ptr bufferFrom, std::shared_ptr bufferTo, vk::DeviceSize bufferSize, - vk::BufferCopy copyRegion, - bool createBarrier) + vk::BufferCopy copyRegion) { commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion); +} - if (createBarrier) { - // Buffer to ensure wait until data is copied to staging buffer - this->recordBufferMemoryBarrier(commandBuffer, - vk::AccessFlagBits::eTransferWrite, - vk::AccessFlagBits::eHostRead, - vk::PipelineStageFlagBits::eTransfer, - vk::PipelineStageFlagBits::eHost); - } +void +Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, + vk::AccessFlagBits srcAccessMask, + vk::AccessFlagBits dstAccessMask, + vk::PipelineStageFlagBits srcStageMask, + vk::PipelineStageFlagBits dstStageMask) +{ + KP_LOG_DEBUG("Kompute Tensor recording PRIMARY buffer memory barrier"); + + this->recordBufferMemoryBarrier(commandBuffer, + *this->mPrimaryBuffer, + srcAccessMask, + dstAccessMask, + srcStageMask, + dstStageMask); +} + +void +Tensor::recordStagingBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, + vk::AccessFlagBits srcAccessMask, + vk::AccessFlagBits dstAccessMask, + vk::PipelineStageFlagBits srcStageMask, + vk::PipelineStageFlagBits dstStageMask) +{ + KP_LOG_DEBUG("Kompute Tensor recording PRIMARY buffer memory barrier"); + + this->recordBufferMemoryBarrier(commandBuffer, + *this->mStagingBuffer, + srcAccessMask, + dstAccessMask, + srcStageMask, + dstStageMask); } void Tensor::recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, + const vk::Buffer& buffer, vk::AccessFlagBits srcAccessMask, vk::AccessFlagBits dstAccessMask, vk::PipelineStageFlagBits srcStageMask, @@ -154,7 +173,7 @@ Tensor::recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, vk::DeviceSize bufferSize = this->memorySize(); vk::BufferMemoryBarrier bufferMemoryBarrier; - bufferMemoryBarrier.buffer = *this->mPrimaryBuffer; + bufferMemoryBarrier.buffer = buffer; bufferMemoryBarrier.size = bufferSize; bufferMemoryBarrier.srcAccessMask = srcAccessMask; bufferMemoryBarrier.dstAccessMask = dstAccessMask; diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp index dc4a4f51a..6cd75996a 100644 --- a/src/include/kompute/Tensor.hpp +++ b/src/include/kompute/Tensor.hpp @@ -97,12 +97,9 @@ class Tensor * * @param commandBuffer Vulkan Command Buffer to record the commands into * @param copyFromTensor Tensor to copy the data from - * @param createBarrier Whether to create a barrier that ensures the data is - * copied before further operations. Default is true. */ void recordCopyFrom(const vk::CommandBuffer& commandBuffer, - std::shared_ptr copyFromTensor, - bool createBarrier); + std::shared_ptr copyFromTensor); /** * Records a copy from the internal staging memory to the device memory @@ -110,11 +107,8 @@ class Tensor * only be relevant for kp::Tensors of type eDevice. * * @param commandBuffer Vulkan Command Buffer to record the commands into - * @param createBarrier Whether to create a barrier that ensures the data is - * copied before further operations. Default is true. */ - void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer, - bool createBarrier); + void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer); /** * Records a copy from the internal device memory to the staging memory @@ -122,14 +116,11 @@ class Tensor * only be relevant for kp::Tensors of type eDevice. * * @param commandBuffer Vulkan Command Buffer to record the commands into - * @param createBarrier Whether to create a barrier that ensures the data is - * copied before further operations. Default is true. */ - void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer, - bool createBarrier); + void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer); /** - * Records the buffer memory barrier into the command buffer which + * Records the buffer memory barrier into the primary buffer and command buffer which * ensures that relevant data transfers are carried out correctly. * * @param commandBuffer Vulkan Command Buffer to record the commands into @@ -138,11 +129,27 @@ class Tensor * @param scrStageMask Pipeline stage flags for source stage mask * @param dstStageMask Pipeline stage flags for destination stage mask */ - void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, - vk::AccessFlagBits srcAccessMask, - vk::AccessFlagBits dstAccessMask, - vk::PipelineStageFlagBits srcStageMask, - vk::PipelineStageFlagBits dstStageMask); + void recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, + vk::AccessFlagBits srcAccessMask, + vk::AccessFlagBits dstAccessMask, + vk::PipelineStageFlagBits srcStageMask, + vk::PipelineStageFlagBits dstStageMask); + /** + * Records the buffer memory barrier into the staging buffer and command buffer which + * ensures that relevant data transfers are carried out correctly. + * + * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param srcAccessMask Access flags for source access mask + * @param dstAccessMask Access flags for destination access mask + * @param scrStageMask Pipeline stage flags for source stage mask + * @param dstStageMask Pipeline stage flags for destination stage mask + */ + void recordStagingBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, + vk::AccessFlagBits srcAccessMask, + vk::AccessFlagBits dstAccessMask, + vk::PipelineStageFlagBits srcStageMask, + vk::PipelineStageFlagBits dstStageMask); + /** * Constructs a vulkan descriptor buffer info which can be used to specify @@ -284,8 +291,13 @@ class Tensor std::shared_ptr bufferFrom, std::shared_ptr bufferTo, vk::DeviceSize bufferSize, - vk::BufferCopy copyRegion, - bool createBarrier); + vk::BufferCopy copyRegion); + void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, + const vk::Buffer& buffer, + vk::AccessFlagBits srcAccessMask, + vk::AccessFlagBits dstAccessMask, + vk::PipelineStageFlagBits srcStageMask, + vk::PipelineStageFlagBits dstStageMask); // Private util functions vk::BufferUsageFlags getPrimaryBufferUsageFlags(); From 0ea3a7912375253eb2e51030c946ba462b4ac48a Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Tue, 9 Mar 2021 08:52:43 +0000 Subject: [PATCH 2/5] Updated access masks for the pipeline barriers --- src/OpAlgoDispatch.cpp | 4 ++-- src/OpTensorSyncLocal.cpp | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/OpAlgoDispatch.cpp b/src/OpAlgoDispatch.cpp index 94502cc76..517a70d52 100644 --- a/src/OpAlgoDispatch.cpp +++ b/src/OpAlgoDispatch.cpp @@ -28,9 +28,9 @@ OpAlgoDispatch::record(const vk::CommandBuffer& commandBuffer) this->mAlgorithm->getTensors()) { tensor->recordPrimaryBufferMemoryBarrier( commandBuffer, - vk::AccessFlagBits::eHostWrite, + vk::AccessFlagBits::eTransferWrite, vk::AccessFlagBits::eShaderRead, - vk::PipelineStageFlagBits::eHost, + vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eComputeShader); } diff --git a/src/OpTensorSyncLocal.cpp b/src/OpTensorSyncLocal.cpp index 70642b9de..8a99c780b 100644 --- a/src/OpTensorSyncLocal.cpp +++ b/src/OpTensorSyncLocal.cpp @@ -31,13 +31,13 @@ OpTensorSyncLocal::record(const vk::CommandBuffer& commandBuffer) for (size_t i = 0; i < this->mTensors.size(); i++) { if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { - this->mTensors[i]->recordCopyFromDeviceToStaging(commandBuffer); + this->mTensors[i]->recordPrimaryBufferMemoryBarrier(commandBuffer, + vk::AccessFlagBits::eShaderWrite, + vk::AccessFlagBits::eTransferRead, + vk::PipelineStageFlagBits::eComputeShader, + vk::PipelineStageFlagBits::eTransfer); - this->mTensors[i]->recordStagingBufferMemoryBarrier(commandBuffer, - vk::AccessFlagBits::eTransferWrite, - vk::AccessFlagBits::eHostRead, - vk::PipelineStageFlagBits::eTransfer, - vk::PipelineStageFlagBits::eHost); + this->mTensors[i]->recordCopyFromDeviceToStaging(commandBuffer); } } } From c9bd406c8b2fb268459da5e8716bb59c25b0f95b Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Tue, 9 Mar 2021 10:00:33 +0000 Subject: [PATCH 3/5] Added barrier --- src/OpTensorSyncLocal.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/OpTensorSyncLocal.cpp b/src/OpTensorSyncLocal.cpp index 8a99c780b..5e653154a 100644 --- a/src/OpTensorSyncLocal.cpp +++ b/src/OpTensorSyncLocal.cpp @@ -38,6 +38,12 @@ OpTensorSyncLocal::record(const vk::CommandBuffer& commandBuffer) vk::PipelineStageFlagBits::eTransfer); this->mTensors[i]->recordCopyFromDeviceToStaging(commandBuffer); + + this->mTensors[i]->recordPrimaryBufferMemoryBarrier(commandBuffer, + vk::AccessFlagBits::eTransferWrite, + vk::AccessFlagBits::eHostRead, + vk::PipelineStageFlagBits::eTransfer, + vk::PipelineStageFlagBits::eHost); } } } From d7b98f149b41f3877e59f5b8dc4863cfff43d3c9 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Tue, 9 Mar 2021 22:11:54 +0000 Subject: [PATCH 4/5] Added OpMemoryBarrier to the available operations --- single_include/AggregateHeaders.cpp | 1 + single_include/kompute/Kompute.hpp | 71 +++++++++++++++++ src/OpMemoryBarrier.cpp | 69 ++++++++++++++++ .../kompute/operations/OpMemoryBarrier.hpp | 78 +++++++++++++++++++ 4 files changed, 219 insertions(+) create mode 100644 src/OpMemoryBarrier.cpp create mode 100644 src/include/kompute/operations/OpMemoryBarrier.hpp diff --git a/single_include/AggregateHeaders.cpp b/single_include/AggregateHeaders.cpp index 25dc04edb..e4d1014e9 100644 --- a/single_include/AggregateHeaders.cpp +++ b/single_include/AggregateHeaders.cpp @@ -6,6 +6,7 @@ #include "kompute/Tensor.hpp" #include "kompute/Algorithm.hpp" #include "kompute/operations/OpBase.hpp" +#include "kompute/operations/OpMemoryBarrier.hpp" #include "kompute/operations/OpTensorCopy.hpp" #include "kompute/operations/OpTensorSyncDevice.hpp" #include "kompute/operations/OpTensorSyncLocal.hpp" diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index fc7d697fb..2353f62b8 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -1388,6 +1388,77 @@ class OpBase namespace kp { +/** + * Operation that provides a general abstraction that simplifies the use of + * algorithm and parameter components which can be used with shaders. + * It exposes the pipeline barrier functionality specifically for memory + * barriers that can be configured through the respective source and destination + * masks + */ +class OpMemoryBarrier : public OpBase +{ + public: + + /** + * Constructor that stores tensors as well as memory barrier parameters to be + * used to create a pipeline barrier on the respective primary or staging tensor. + * + * @param tensors The tensors to apply the memory barriers on + * @param srcAccessMask The kp::AccessFlagBits for the source access mask + * @param dstAccessMask The kp::AccessFlagBits for the destination access mask + * @param srcStageMask The kp::PipelineStageFlagBits for the source stage mask + * @param dstStageMask The kp::PipelineStageFlagBits for the destination stage mask + * @param barrierOnPrimary Boolean to select primary or secondary buffers on tensors + */ + OpMemoryBarrier( + const std::vector>& tensors, + const vk::AccessFlagBits& srcAccessMask, + const vk::AccessFlagBits& dstAccessMask, + const vk::PipelineStageFlagBits& srcStageMask, + const vk::PipelineStageFlagBits& dstStageMask, + bool barrierOnPrimary = true); + + /** + * Default destructor, which is in charge of destroying the reference to the tensors + * and all the relevant access / stage masks created + */ + virtual ~OpMemoryBarrier() override; + + /** + * This records the memory barrier with the access and stage masks provided + * across all relevant tensors. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void record(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any postEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) override; + +private: + const vk::AccessFlagBits mSrcAccessMask; + const vk::AccessFlagBits mDstAccessMask; + const vk::PipelineStageFlagBits mSrcStageMask; + const vk::PipelineStageFlagBits mDstStageMask; + const bool mBarrierOnPrimary; + const std::vector> mTensors; +}; + +} // End namespace kp + +namespace kp { + /** * Operation that copies the data from the first tensor to the rest of the tensors * provided, using a record command for all the vectors. This operation does not diff --git a/src/OpMemoryBarrier.cpp b/src/OpMemoryBarrier.cpp new file mode 100644 index 000000000..9e45d0c4e --- /dev/null +++ b/src/OpMemoryBarrier.cpp @@ -0,0 +1,69 @@ +#pragma once + +#include "kompute/operations/OpMemoryBarrier.hpp" + +namespace kp { + +OpMemoryBarrier::OpMemoryBarrier( + const std::vector>& tensors, + const vk::AccessFlagBits& srcAccessMask, + const vk::AccessFlagBits& dstAccessMask, + const vk::PipelineStageFlagBits& srcStageMask, + const vk::PipelineStageFlagBits& dstStageMask, + bool barrierOnPrimary) + : mTensors(tensors), + mSrcAccessMask(srcAccessMask), + mDstAccessMask(dstAccessMask), + mSrcStageMask(srcStageMask), + mDstStageMask(dstStageMask), + mBarrierOnPrimary(barrierOnPrimary) +{ + KP_LOG_DEBUG("Kompute OpMemoryBarrier constructor"); + +} + +OpMemoryBarrier::~OpMemoryBarrier() +{ + KP_LOG_DEBUG("Kompute OpMemoryBarrier destructor started"); +} + +void +OpMemoryBarrier::record(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpMemoryBarrier record called"); + + // Barrier to ensure the data is finished writing to buffer memory + if (this->mBarrierOnPrimary) { + for (const std::shared_ptr& tensor : this->mTensors) { + tensor->recordPrimaryBufferMemoryBarrier( + commandBuffer, + this->mSrcAccessMask, + this->mDstAccessMask, + this->mSrcStageMask, + this->mDstStageMask); + } + } else { + for (const std::shared_ptr& tensor : this->mTensors) { + tensor->recordStagingBufferMemoryBarrier( + commandBuffer, + this->mSrcAccessMask, + this->mDstAccessMask, + this->mSrcStageMask, + this->mDstStageMask); + } + } +} + +void +OpMemoryBarrier::preEval(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpMemoryBarrier preEval called"); +} + +void +OpMemoryBarrier::postEval(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpMemoryBarrier postSubmit called"); +} + +} diff --git a/src/include/kompute/operations/OpMemoryBarrier.hpp b/src/include/kompute/operations/OpMemoryBarrier.hpp new file mode 100644 index 000000000..6ae7100db --- /dev/null +++ b/src/include/kompute/operations/OpMemoryBarrier.hpp @@ -0,0 +1,78 @@ +#pragma once + +#include "kompute/Core.hpp" +#include "kompute/Algorithm.hpp" +#include "kompute/Tensor.hpp" +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +/** + * Operation that provides a general abstraction that simplifies the use of + * algorithm and parameter components which can be used with shaders. + * It exposes the pipeline barrier functionality specifically for memory + * barriers that can be configured through the respective source and destination + * masks + */ +class OpMemoryBarrier : public OpBase +{ + public: + + /** + * Constructor that stores tensors as well as memory barrier parameters to be + * used to create a pipeline barrier on the respective primary or staging tensor. + * + * @param tensors The tensors to apply the memory barriers on + * @param srcAccessMask The kp::AccessFlagBits for the source access mask + * @param dstAccessMask The kp::AccessFlagBits for the destination access mask + * @param srcStageMask The kp::PipelineStageFlagBits for the source stage mask + * @param dstStageMask The kp::PipelineStageFlagBits for the destination stage mask + * @param barrierOnPrimary Boolean to select primary or secondary buffers on tensors + */ + OpMemoryBarrier( + const std::vector>& tensors, + const vk::AccessFlagBits& srcAccessMask, + const vk::AccessFlagBits& dstAccessMask, + const vk::PipelineStageFlagBits& srcStageMask, + const vk::PipelineStageFlagBits& dstStageMask, + bool barrierOnPrimary = true); + + /** + * Default destructor, which is in charge of destroying the reference to the tensors + * and all the relevant access / stage masks created + */ + virtual ~OpMemoryBarrier() override; + + /** + * This records the memory barrier with the access and stage masks provided + * across all relevant tensors. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void record(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any postEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) override; + +private: + const vk::AccessFlagBits mSrcAccessMask; + const vk::AccessFlagBits mDstAccessMask; + const vk::PipelineStageFlagBits mSrcStageMask; + const vk::PipelineStageFlagBits mDstStageMask; + const bool mBarrierOnPrimary; + const std::vector> mTensors; +}; + +} // End namespace kp + From 4e86562cff4c3fc3737160c93a9f9aebe41f089f Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Sat, 13 Mar 2021 07:40:20 +0000 Subject: [PATCH 5/5] Updated staging memory barrier to ensure it's always coherent --- src/Tensor.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Tensor.cpp b/src/Tensor.cpp index 7e7af4ca4..6ea92ccd6 100644 --- a/src/Tensor.cpp +++ b/src/Tensor.cpp @@ -258,7 +258,8 @@ Tensor::getStagingMemoryPropertyFlags() { switch (this->mTensorType) { case TensorTypes::eDevice: - return vk::MemoryPropertyFlagBits::eHostVisible; + return vk::MemoryPropertyFlagBits::eHostVisible | + vk::MemoryPropertyFlagBits::eHostCoherent; break; default: throw std::runtime_error("Kompute Tensor invalid tensor type");