diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index cb13b744f..f85285e2d 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -246,8 +246,12 @@ class Tensor * Records a copy from the memory of the tensor provided to the current * thensor. This is intended to pass memory into a processing, to perform * a staging buffer transfer, or to gather output (between others). + * + * @param copyFromTensor Tensor to copy the data from + * @param createBarrier Whether to create a barrier that ensures the data is copied before further operations. Default is true. */ - void recordCopyFrom(std::shared_ptr copyFromTensor); + void recordCopyFrom(std::shared_ptr copyFromTensor, + bool createBarrier = true); /** * Records the buffer memory barrier into the command buffer which @@ -1077,7 +1081,6 @@ OpMult::init() #endif SPDLOG_DEBUG("Kompute OpMult Initialising algorithm component"); - SPDLOG_DEBUG("Kompute vector size {}", shaderFileData.size()); this->mAlgorithm->init(shaderFileData, this->mTensors); } @@ -1103,11 +1106,6 @@ OpMult::record() this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ); // Barrier to ensure the shader code is executed before buffer read - this->mTensorLHS->recordBufferMemoryBarrier( - vk::AccessFlagBits::eShaderWrite, - vk::AccessFlagBits::eTransferRead, - vk::PipelineStageFlagBits::eComputeShader, - vk::PipelineStageFlagBits::eTransfer); this->mTensorOutput->recordBufferMemoryBarrier( vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eTransferRead, @@ -1115,18 +1113,6 @@ OpMult::record() vk::PipelineStageFlagBits::eTransfer); this->mTensorOutputStaging->recordCopyFrom(this->mTensorOutput); - - // Buffer to ensure wait until data is copied to staging buffer - this->mTensorLHS->recordBufferMemoryBarrier( - vk::AccessFlagBits::eTransferWrite, - vk::AccessFlagBits::eHostRead, - vk::PipelineStageFlagBits::eTransfer, - vk::PipelineStageFlagBits::eHost); - 
this->mTensorOutput->recordBufferMemoryBarrier( - vk::AccessFlagBits::eTransferWrite, - vk::AccessFlagBits::eHostRead, - vk::PipelineStageFlagBits::eTransfer, - vk::PipelineStageFlagBits::eHost); } template diff --git a/src/Tensor.cpp b/src/Tensor.cpp index 04e8f740e..e1b85a42e 100644 --- a/src/Tensor.cpp +++ b/src/Tensor.cpp @@ -95,7 +95,7 @@ Tensor::setData(const std::vector& data) } void -Tensor::recordCopyFrom(std::shared_ptr copyFromTensor) +Tensor::recordCopyFrom(std::shared_ptr copyFromTensor, bool createBarrier) { SPDLOG_DEBUG("Kompute Tensor recordCopyFrom called"); @@ -114,6 +114,15 @@ Tensor::recordCopyFrom(std::shared_ptr copyFromTensor) // TODO: Ensure command buffer is in same device from buffer this->mCommandBuffer->copyBuffer( *copyFromTensor->mBuffer, *this->mBuffer, copyRegion); + + if (createBarrier) { + // Barrier on this tensor's buffer so the transfer write of the copy completes before any subsequent host read + this->recordBufferMemoryBarrier( + vk::AccessFlagBits::eTransferWrite, + vk::AccessFlagBits::eHostRead, + vk::PipelineStageFlagBits::eTransfer, + vk::PipelineStageFlagBits::eHost); + } } void diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp index 6e16af85e..0ff811300 100644 --- a/src/include/kompute/Tensor.hpp +++ b/src/include/kompute/Tensor.hpp @@ -96,8 +96,12 @@ class Tensor * Records a copy from the memory of the tensor provided to the current * thensor. This is intended to pass memory into a processing, to perform * a staging buffer transfer, or to gather output (between others). + * + * @param copyFromTensor Tensor to copy the data from + * @param createBarrier Whether to create a barrier that ensures the data is copied before further operations. Default is true. 
*/ - void recordCopyFrom(std::shared_ptr copyFromTensor); + void recordCopyFrom(std::shared_ptr copyFromTensor, + bool createBarrier = true); /** * Records the buffer memory barrier into the command buffer which diff --git a/src/include/kompute/operations/OpMult.hpp b/src/include/kompute/operations/OpMult.hpp index fdc7a3282..32128643d 100644 --- a/src/include/kompute/operations/OpMult.hpp +++ b/src/include/kompute/operations/OpMult.hpp @@ -198,11 +198,6 @@ OpMult::record() this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ); // Barrier to ensure the shader code is executed before buffer read - this->mTensorLHS->recordBufferMemoryBarrier( - vk::AccessFlagBits::eShaderWrite, - vk::AccessFlagBits::eTransferRead, - vk::PipelineStageFlagBits::eComputeShader, - vk::PipelineStageFlagBits::eTransfer); this->mTensorOutput->recordBufferMemoryBarrier( vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eTransferRead, @@ -210,18 +205,6 @@ OpMult::record() vk::PipelineStageFlagBits::eTransfer); this->mTensorOutputStaging->recordCopyFrom(this->mTensorOutput); - - // Buffer to ensure wait until data is copied to staging buffer - this->mTensorLHS->recordBufferMemoryBarrier( - vk::AccessFlagBits::eTransferWrite, - vk::AccessFlagBits::eHostRead, - vk::PipelineStageFlagBits::eTransfer, - vk::PipelineStageFlagBits::eHost); - this->mTensorOutput->recordBufferMemoryBarrier( - vk::AccessFlagBits::eTransferWrite, - vk::AccessFlagBits::eHostRead, - vk::PipelineStageFlagBits::eTransfer, - vk::PipelineStageFlagBits::eHost); } template