Merge pull request #137 from EthicalML/revamp_memory_tensor_mgmt
Removed Staging Tensors in favour of having two buffer & memory in a Tensor to minimise data transfer
This commit is contained in:
commit
698883992f
19 changed files with 361 additions and 238 deletions
|
|
@ -513,10 +513,10 @@ function not in the record function.)doc";
|
|||
static const char *__doc_kp_OpTensorSyncDevice =
|
||||
R"doc(Operation that syncs tensor's device by mapping local data into the
|
||||
device memory. For TensorTypes::eDevice it will use a staging tensor
|
||||
to perform the copy. For TensorTypes::eStaging it will only copy the
|
||||
to perform the copy. For TensorTypes::eHost it will only copy the
|
||||
data and perform a map, which will be executed during the record (as
|
||||
opposed to during the sequence eval/submit). This function cannot be
|
||||
carried out for TensorTypes::eStaging.)doc";
|
||||
carried out for TensorTypes::eHost.)doc";
|
||||
|
||||
static const char *__doc_kp_OpTensorSyncDevice_OpTensorSyncDevice = R"doc()doc";
|
||||
|
||||
|
|
@ -533,7 +533,7 @@ queues @param device Vulkan logical device for passing to Algorithm
|
|||
static const char *__doc_kp_OpTensorSyncDevice_init =
|
||||
R"doc(Performs basic checks such as ensuring that there is at least one
|
||||
tensor provided, that they are initialized and that they are not of
|
||||
type TensorTpes::eStaging. For staging tensors in host memory, the map
|
||||
type TensorTpes::eHost. For staging tensors in host memory, the map
|
||||
is performed during the init function.)doc";
|
||||
|
||||
static const char *__doc_kp_OpTensorSyncDevice_mStagingTensors = R"doc()doc";
|
||||
|
|
@ -549,11 +549,11 @@ from the temporary staging tensor.)doc";
|
|||
static const char *__doc_kp_OpTensorSyncLocal =
|
||||
R"doc(Operation that syncs tensor's local data by mapping the data from
|
||||
device memory into the local vector. For TensorTypes::eDevice it will
|
||||
use a staging tensor to perform the copy. For TensorTypes::eStaging it
|
||||
use a staging tensor to perform the copy. For TensorTypes::eHost it
|
||||
will only copy the data and perform a map, which will be executed
|
||||
during the postSubmit (there will be no copy during the sequence
|
||||
eval/submit). This function cannot be carried out for
|
||||
TensorTypes::eStaging.)doc";
|
||||
TensorTypes::eHost.)doc";
|
||||
|
||||
static const char *__doc_kp_OpTensorSyncLocal_OpTensorSyncLocal = R"doc()doc";
|
||||
|
||||
|
|
@ -570,7 +570,7 @@ queues @param device Vulkan logical device for passing to Algorithm
|
|||
static const char *__doc_kp_OpTensorSyncLocal_init =
|
||||
R"doc(Performs basic checks such as ensuring that there is at least one
|
||||
tensor provided, that they are initialized and that they are not of
|
||||
type TensorTpes::eStaging.)doc";
|
||||
type TensorTpes::eHost.)doc";
|
||||
|
||||
static const char *__doc_kp_OpTensorSyncLocal_mStagingTensors = R"doc()doc";
|
||||
|
||||
|
|
@ -719,7 +719,7 @@ shader storage).)doc";
|
|||
|
||||
static const char *__doc_kp_Tensor_TensorTypes_eDevice = R"doc(< Type is device memory, source and destination)doc";
|
||||
|
||||
static const char *__doc_kp_Tensor_TensorTypes_eStaging = R"doc(< Type is host memory, source and destination)doc";
|
||||
static const char *__doc_kp_Tensor_TensorTypes_eHost = R"doc(< Type is host memory, source and destination)doc";
|
||||
|
||||
static const char *__doc_kp_Tensor_TensorTypes_eStorage = R"doc(< Type is Device memory (only))doc";
|
||||
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ PYBIND11_MODULE(kp, m) {
|
|||
|
||||
py::enum_<kp::Tensor::TensorTypes>(m, "TensorTypes", DOC(kp, Tensor, TensorTypes))
|
||||
.value("device", kp::Tensor::TensorTypes::eDevice, "Tensor holding data in GPU memory.")
|
||||
.value("staging", kp::Tensor::TensorTypes::eStaging, "Tensor used for transfer of data to device.")
|
||||
.value("host", kp::Tensor::TensorTypes::eHost, "Tensor used for CPU visible GPU data.")
|
||||
.value("storage", kp::Tensor::TensorTypes::eStorage, "Tensor with host visible gpu memory.")
|
||||
.export_values();
|
||||
|
||||
|
|
@ -112,7 +112,7 @@ PYBIND11_MODULE(kp, m) {
|
|||
.def("record_tensor_sync_device", &kp::Sequence::record<kp::OpTensorSyncDevice>,
|
||||
"Records operation to sync tensor from local memory to GPU memory")
|
||||
.def("record_tensor_sync_local", &kp::Sequence::record<kp::OpTensorSyncLocal>,
|
||||
"Records operation to sync tensor(s) from GPU memory to local memory using staging tensors")
|
||||
"Records operation to sync tensor(s) from GPU memory to local memory")
|
||||
.def("record_algo_mult", &kp::Sequence::record<kp::OpMult>,
|
||||
"Records operation to run multiplication compute shader to two input tensors and an output tensor")
|
||||
.def("record_algo_file", [](kp::Sequence &self,
|
||||
|
|
@ -179,7 +179,7 @@ PYBIND11_MODULE(kp, m) {
|
|||
.def("eval_tensor_sync_device_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncDevice>,
|
||||
"Evaluates operation to sync tensor from local memory to GPU memory with new anonymous Sequence")
|
||||
.def("eval_tensor_sync_local_def", &kp::Manager::evalOpDefault<kp::OpTensorSyncLocal>,
|
||||
"Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with new anonymous Sequence")
|
||||
"Evaluates operation to sync tensor(s) from GPU memory to local memory with new anonymous Sequence")
|
||||
.def("eval_algo_mult_def", &kp::Manager::evalOpDefault<kp::OpMult>,
|
||||
"Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with new anonymous Sequence")
|
||||
.def("eval_algo_file_def", &kp::Manager::evalOpDefault<kp::OpAlgoBase, std::string>,
|
||||
|
|
@ -216,7 +216,7 @@ PYBIND11_MODULE(kp, m) {
|
|||
.def("eval_tensor_sync_device", &kp::Manager::evalOp<kp::OpTensorSyncDevice>,
|
||||
"Evaluates operation to sync tensor from local memory to GPU memory with explicitly named Sequence")
|
||||
.def("eval_tensor_sync_local", &kp::Manager::evalOp<kp::OpTensorSyncLocal>,
|
||||
"Evaluates operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence")
|
||||
"Evaluates operation to sync tensor(s) from GPU memory to local memory with explicitly named Sequence")
|
||||
.def("eval_algo_mult", &kp::Manager::evalOp<kp::OpMult>,
|
||||
"Evaluates operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
|
||||
.def("eval_algo_file", &kp::Manager::evalOp<kp::OpAlgoBase, std::string>,
|
||||
|
|
@ -256,7 +256,7 @@ PYBIND11_MODULE(kp, m) {
|
|||
.def("eval_async_tensor_sync_device_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncDevice>,
|
||||
"Evaluates asynchronously operation to sync tensor from local memory to GPU memory with anonymous Sequence")
|
||||
.def("eval_async_tensor_sync_local_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorSyncLocal>,
|
||||
"Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with anonymous Sequence")
|
||||
"Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory with anonymous Sequence")
|
||||
.def("eval_async_algo_mult_def", &kp::Manager::evalOpAsyncDefault<kp::OpMult>,
|
||||
"Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with anonymous Sequence")
|
||||
.def("eval_async_algo_file_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoBase, std::string>,
|
||||
|
|
@ -293,7 +293,7 @@ PYBIND11_MODULE(kp, m) {
|
|||
.def("eval_async_tensor_sync_device", &kp::Manager::evalOpAsync<kp::OpTensorSyncDevice>,
|
||||
"Evaluates asynchronously operation to sync tensor from local memory to GPU memory with explicitly named Sequence")
|
||||
.def("eval_async_tensor_sync_local", &kp::Manager::evalOpAsync<kp::OpTensorSyncLocal>,
|
||||
"Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory using staging tensors with explicitly named Sequence")
|
||||
"Evaluates asynchronously operation to sync tensor(s) from GPU memory to local memory with explicitly named Sequence")
|
||||
.def("eval_async_algo_mult", &kp::Manager::evalOpAsync<kp::OpMult>,
|
||||
"Evaluates asynchronously operation to run multiplication compute shader to two input tensors and an output tensor with explicitly named Sequence")
|
||||
.def("eval_async_algo_file", &kp::Manager::evalOpAsync<kp::OpAlgoBase, std::string>,
|
||||
|
|
|
|||
|
|
@ -723,7 +723,7 @@ class Tensor
|
|||
enum class TensorTypes
|
||||
{
|
||||
eDevice = 0, ///< Type is device memory, source and destination
|
||||
eStaging = 1, ///< Type is host memory, source and destination
|
||||
eHost = 1, ///< Type is host memory, source and destination
|
||||
eStorage = 2, ///< Type is Device memory (only)
|
||||
};
|
||||
|
||||
|
|
@ -828,6 +828,26 @@ class Tensor
|
|||
std::shared_ptr<Tensor> copyFromTensor,
|
||||
bool createBarrier);
|
||||
|
||||
/**
|
||||
* Records a copy from the internal staging memory to the device memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice.
|
||||
*
|
||||
* @param commandBuffer Vulkan Command Buffer to record the commands into
|
||||
* @param createBarrier Whether to create a barrier that ensures the data is
|
||||
* copied before further operations. Default is true.
|
||||
*/
|
||||
void recordCopyFromStagingToDevice(std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
bool createBarrier);
|
||||
|
||||
/**
|
||||
* Records a copy from the internal device memory to the staging memory using an optional barrier to wait for the operation. This function would only be relevant for kp::Tensors of type eDevice.
|
||||
*
|
||||
* @param commandBuffer Vulkan Command Buffer to record the commands into
|
||||
* @param createBarrier Whether to create a barrier that ensures the data is
|
||||
* copied before further operations. Default is true.
|
||||
*/
|
||||
void recordCopyFromDeviceToStaging(std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
bool createBarrier);
|
||||
|
||||
/**
|
||||
* Records the buffer memory barrier into the command buffer which
|
||||
* ensures that relevant data transfers are carried out correctly.
|
||||
|
|
@ -870,10 +890,14 @@ class Tensor
|
|||
std::shared_ptr<vk::Device> mDevice;
|
||||
|
||||
// -------------- OPTIONALLY OWNED RESOURCES
|
||||
std::shared_ptr<vk::Buffer> mBuffer;
|
||||
bool mFreeBuffer;
|
||||
std::shared_ptr<vk::DeviceMemory> mMemory;
|
||||
bool mFreeMemory;
|
||||
std::shared_ptr<vk::Buffer> mPrimaryBuffer;
|
||||
bool mFreePrimaryBuffer = false;
|
||||
std::shared_ptr<vk::Buffer> mStagingBuffer;
|
||||
bool mFreeStagingBuffer = false;
|
||||
std::shared_ptr<vk::DeviceMemory> mPrimaryMemory;
|
||||
bool mFreePrimaryMemory = false;
|
||||
std::shared_ptr<vk::DeviceMemory> mStagingMemory;
|
||||
bool mFreeStagingMemory = false;
|
||||
|
||||
// -------------- ALWAYS OWNED RESOURCES
|
||||
std::vector<float> mData;
|
||||
|
|
@ -883,11 +907,16 @@ class Tensor
|
|||
std::array<uint32_t, KP_MAX_DIM_SIZE> mShape;
|
||||
bool mIsInit = false;
|
||||
|
||||
void createBuffer(); // Creates the vulkan buffer
|
||||
void allocateMemoryCreateGPUResources(); // Creates the vulkan buffer
|
||||
void createBuffer(std::shared_ptr<vk::Buffer> buffer, vk::BufferUsageFlags bufferUsageFlags);
|
||||
void allocateBindMemory(std::shared_ptr<vk::Buffer> buffer, std::shared_ptr<vk::DeviceMemory> memory, vk::MemoryPropertyFlags memoryPropertyFlags);
|
||||
void copyBuffer(std::shared_ptr<vk::CommandBuffer> commandBuffer, std::shared_ptr<vk::Buffer> bufferFrom, std::shared_ptr<vk::Buffer> bufferTo, vk::DeviceSize bufferSize, vk::BufferCopy copyRegion, bool createBarrier);
|
||||
|
||||
// Private util functions
|
||||
vk::BufferUsageFlags getBufferUsageFlags();
|
||||
vk::MemoryPropertyFlags getMemoryPropertyFlags();
|
||||
vk::BufferUsageFlags getPrimaryBufferUsageFlags();
|
||||
vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags();
|
||||
vk::BufferUsageFlags getStagingBufferUsageFlags();
|
||||
vk::MemoryPropertyFlags getStagingMemoryPropertyFlags();
|
||||
uint64_t memorySize();
|
||||
};
|
||||
|
||||
|
|
@ -958,7 +987,7 @@ class OpBase
|
|||
if (tensor && tensor->isInit()) {
|
||||
tensor->freeMemoryDestroyGPUResources();
|
||||
} else {
|
||||
SPDLOG_ERROR("Kompute OpBase expected to free "
|
||||
SPDLOG_WARN("Kompute OpBase expected to free "
|
||||
"tensor but has already been freed.");
|
||||
}
|
||||
}
|
||||
|
|
@ -1264,8 +1293,6 @@ class OpTensorCreate : public OpBase
|
|||
virtual void postEval() override;
|
||||
|
||||
private:
|
||||
// Never owned resources
|
||||
std::vector<std::shared_ptr<Tensor>> mStagingTensors;
|
||||
};
|
||||
|
||||
} // End namespace kp
|
||||
|
|
@ -1836,9 +1863,6 @@ class OpAlgoLhsRhsOut : public OpAlgoBase
|
|||
std::shared_ptr<Tensor> mTensorLHS; ///< Reference to the parameter used in the left hand side equation of the shader
|
||||
std::shared_ptr<Tensor> mTensorRHS; ///< Reference to the parameter used in the right hand side equation of the shader
|
||||
std::shared_ptr<Tensor> mTensorOutput; ///< Reference to the parameter used in the output of the shader and will be copied with a staging vector
|
||||
|
||||
// -------------- ALWAYS OWNED RESOURCES
|
||||
std::shared_ptr<Tensor> mTensorOutputStaging; ///< Staging temporary tensor user do to copy the output of the tensor
|
||||
};
|
||||
|
||||
} // End namespace kp
|
||||
|
|
@ -1976,7 +2000,7 @@ class OpTensorCopy : public OpBase
|
|||
namespace kp {
|
||||
|
||||
/**
|
||||
Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the record (as opposed to during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging.
|
||||
Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging.
|
||||
*/
|
||||
class OpTensorSyncDevice : public OpBase
|
||||
{
|
||||
|
|
@ -2002,12 +2026,12 @@ class OpTensorSyncDevice : public OpBase
|
|||
~OpTensorSyncDevice() override;
|
||||
|
||||
/**
|
||||
* Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging. For staging tensors in host memory, the map is performed during the init function.
|
||||
* Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element.
|
||||
*/
|
||||
void init() override;
|
||||
|
||||
/**
|
||||
* For device tensors, it records the copy command to the device tensor from the temporary staging tensor.
|
||||
* For device tensors, it records the copy command for the tensor to copy the data from its staging to device memory.
|
||||
*/
|
||||
void record() override;
|
||||
|
||||
|
|
@ -2022,8 +2046,6 @@ class OpTensorSyncDevice : public OpBase
|
|||
virtual void postEval() override;
|
||||
|
||||
private:
|
||||
// Never owned resources
|
||||
std::vector<std::shared_ptr<Tensor>> mStagingTensors;
|
||||
};
|
||||
|
||||
} // End namespace kp
|
||||
|
|
@ -2031,7 +2053,7 @@ class OpTensorSyncDevice : public OpBase
|
|||
namespace kp {
|
||||
|
||||
/**
|
||||
Operation that syncs tensor's local data by mapping the data from device memory into the local vector. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the postSubmit (there will be no copy during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging.
|
||||
Operation that syncs tensor's local memory by mapping device data into the local CPU memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging.
|
||||
*/
|
||||
class OpTensorSyncLocal : public OpBase
|
||||
{
|
||||
|
|
@ -2052,17 +2074,17 @@ class OpTensorSyncLocal : public OpBase
|
|||
std::vector<std::shared_ptr<Tensor>> tensors);
|
||||
|
||||
/**
|
||||
* Default destructor. This class manages the memory of the staging tensors it owns but these are released in the postSubmit, before it arrives to the destructor.
|
||||
* Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release.
|
||||
*/
|
||||
~OpTensorSyncLocal() override;
|
||||
|
||||
/**
|
||||
* Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging.
|
||||
* Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element.
|
||||
*/
|
||||
void init() override;
|
||||
|
||||
/**
|
||||
* For device tensors, it records the copy command into the staging tensor from the device tensor.
|
||||
* For device tensors, it records the copy command for the tensor to copy the data from its device to staging memory.
|
||||
*/
|
||||
void record() override;
|
||||
|
||||
|
|
@ -2077,8 +2099,6 @@ class OpTensorSyncLocal : public OpBase
|
|||
virtual void postEval() override;
|
||||
|
||||
private:
|
||||
// Never owned resources
|
||||
std::vector<std::shared_ptr<Tensor>> mStagingTensors;
|
||||
};
|
||||
|
||||
} // End namespace kp
|
||||
|
|
|
|||
|
|
@ -65,11 +65,6 @@ OpAlgoLhsRhsOut::init()
|
|||
" Output: " + std::to_string(this->mTensorOutput->size()));
|
||||
}
|
||||
|
||||
this->mTensorOutputStaging = std::make_shared<Tensor>(
|
||||
this->mTensorOutput->data(), Tensor::TensorTypes::eStaging);
|
||||
|
||||
this->mTensorOutputStaging->init(this->mPhysicalDevice, this->mDevice);
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut fetching spirv data");
|
||||
|
||||
std::vector<char> shaderFileData = this->fetchSpirvBinaryData();
|
||||
|
|
@ -110,8 +105,10 @@ OpAlgoLhsRhsOut::record()
|
|||
vk::PipelineStageFlagBits::eComputeShader,
|
||||
vk::PipelineStageFlagBits::eTransfer);
|
||||
|
||||
this->mTensorOutputStaging->recordCopyFrom(
|
||||
this->mCommandBuffer, this->mTensorOutput, true);
|
||||
if (this->mTensorOutput->tensorType() == Tensor::TensorTypes::eDevice) {
|
||||
this->mTensorOutput->recordCopyFromDeviceToStaging(this->mCommandBuffer,
|
||||
true);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
|
@ -119,9 +116,7 @@ OpAlgoLhsRhsOut::postEval()
|
|||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoLhsRhsOut postSubmit called");
|
||||
|
||||
this->mTensorOutputStaging->mapDataFromHostMemory();
|
||||
|
||||
this->mTensorOutput->setData(this->mTensorOutputStaging->data());
|
||||
this->mTensorOutput->mapDataFromHostMemory();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,16 +23,6 @@ OpTensorCreate::OpTensorCreate(
|
|||
OpTensorCreate::~OpTensorCreate()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpTensorCreate destructor started");
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpTensorCreate freeing staging tensors");
|
||||
for (std::shared_ptr<Tensor> tensor : this->mStagingTensors) {
|
||||
if (tensor && tensor->isInit()) {
|
||||
tensor->freeMemoryDestroyGPUResources();
|
||||
} else {
|
||||
SPDLOG_ERROR("Kompute OpTensorCreate expected to free "
|
||||
"tensor but has already been freed.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
|
@ -50,27 +40,10 @@ OpTensorCreate::init()
|
|||
throw std::runtime_error(
|
||||
"Kompute OpTensorCreate: Tensor has already been initialized");
|
||||
}
|
||||
if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
|
||||
tensor->init(this->mPhysicalDevice, this->mDevice);
|
||||
|
||||
std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
|
||||
tensor->data(), Tensor::TensorTypes::eStaging);
|
||||
|
||||
stagingTensor->init(this->mPhysicalDevice, this->mDevice);
|
||||
|
||||
stagingTensor->mapDataIntoHostMemory();
|
||||
|
||||
this->mStagingTensors.push_back(stagingTensor);
|
||||
|
||||
} else {
|
||||
|
||||
if (tensor->tensorType() != Tensor::TensorTypes::eStorage) {
|
||||
tensor->init(this->mPhysicalDevice, this->mDevice);
|
||||
|
||||
tensor->mapDataIntoHostMemory();
|
||||
|
||||
// We push a nullptr when no staging tensor is needed to match
|
||||
// index number in array to have one to one mapping with tensors
|
||||
this->mStagingTensors.push_back(nullptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -82,8 +55,8 @@ OpTensorCreate::record()
|
|||
|
||||
for (size_t i = 0; i < this->mTensors.size(); i++) {
|
||||
if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
|
||||
this->mTensors[i]->recordCopyFrom(
|
||||
this->mCommandBuffer, this->mStagingTensors[i], false);
|
||||
this->mTensors[i]->recordCopyFromStagingToDevice(
|
||||
this->mCommandBuffer, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -41,25 +41,11 @@ OpTensorSyncDevice::init()
|
|||
"has not been initialized");
|
||||
}
|
||||
if (tensor->tensorType() == Tensor::TensorTypes::eStorage) {
|
||||
throw std::runtime_error(
|
||||
SPDLOG_WARN(
|
||||
"Kompute OpTensorSyncLocal tensor parameter is of type "
|
||||
"TensorTypes::eStorage and hence cannot be used to receive or "
|
||||
"pass data.");
|
||||
}
|
||||
if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
|
||||
|
||||
std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
|
||||
tensor->data(), Tensor::TensorTypes::eStaging);
|
||||
|
||||
stagingTensor->init(this->mPhysicalDevice, this->mDevice);
|
||||
|
||||
this->mStagingTensors.push_back(stagingTensor);
|
||||
|
||||
} else {
|
||||
// We push a nullptr when no staging tensor is needed to match
|
||||
// index number in array to have one to one mapping with tensors
|
||||
this->mStagingTensors.push_back(nullptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -70,8 +56,8 @@ OpTensorSyncDevice::record()
|
|||
|
||||
for (size_t i = 0; i < this->mTensors.size(); i++) {
|
||||
if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
|
||||
this->mTensors[i]->recordCopyFrom(
|
||||
this->mCommandBuffer, this->mStagingTensors[i], false);
|
||||
this->mTensors[i]->recordCopyFromStagingToDevice(
|
||||
this->mCommandBuffer, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -83,11 +69,8 @@ OpTensorSyncDevice::preEval()
|
|||
|
||||
// Performing sync of data as eval can be called multiple times with same op
|
||||
for (size_t i = 0; i < this->mTensors.size(); i++) {
|
||||
if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
|
||||
this->mStagingTensors[i]->setData(this->mTensors[i]->data());
|
||||
this->mStagingTensors[i]->mapDataIntoHostMemory();
|
||||
} else {
|
||||
this->mTensors[i]->mapDataFromHostMemory();
|
||||
if (this->mTensors[i]->tensorType() != Tensor::TensorTypes::eStorage) {
|
||||
this->mTensors[i]->mapDataIntoHostMemory();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -41,26 +41,11 @@ OpTensorSyncLocal::init()
|
|||
"Kompute OpTensorSyncLocal: Tensor has not been initialized");
|
||||
}
|
||||
if (tensor->tensorType() == Tensor::TensorTypes::eStorage) {
|
||||
throw std::runtime_error(
|
||||
SPDLOG_WARN(
|
||||
"Kompute OpTensorSyncLocal tensor parameter is of type "
|
||||
"TensorTypes::eStorage and hence cannot be used to receive or "
|
||||
"pass data.");
|
||||
}
|
||||
if (tensor->tensorType() == Tensor::TensorTypes::eDevice) {
|
||||
|
||||
std::shared_ptr<Tensor> stagingTensor = std::make_shared<Tensor>(
|
||||
tensor->data(), Tensor::TensorTypes::eStaging);
|
||||
|
||||
stagingTensor->init(this->mPhysicalDevice, this->mDevice);
|
||||
|
||||
this->mStagingTensors.push_back(stagingTensor);
|
||||
|
||||
} else {
|
||||
|
||||
// We push a nullptr when no staging tensor is needed to match
|
||||
// index number in array to have one to one mapping with tensors
|
||||
this->mStagingTensors.push_back(nullptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -71,8 +56,8 @@ OpTensorSyncLocal::record()
|
|||
|
||||
for (size_t i = 0; i < this->mTensors.size(); i++) {
|
||||
if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
|
||||
this->mStagingTensors[i]->recordCopyFrom(
|
||||
this->mCommandBuffer, this->mTensors[i], true);
|
||||
this->mTensors[i]->recordCopyFromDeviceToStaging(
|
||||
this->mCommandBuffer, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -90,10 +75,7 @@ OpTensorSyncLocal::postEval()
|
|||
|
||||
SPDLOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local");
|
||||
for (size_t i = 0; i < this->mTensors.size(); i++) {
|
||||
if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
|
||||
this->mStagingTensors[i]->mapDataFromHostMemory();
|
||||
this->mTensors[i]->setData(this->mStagingTensors[i]->data());
|
||||
} else {
|
||||
if (this->mTensors[i]->tensorType() != Tensor::TensorTypes::eStorage) {
|
||||
this->mTensors[i]->mapDataFromHostMemory();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
278
src/Tensor.cpp
278
src/Tensor.cpp
|
|
@ -47,7 +47,7 @@ Tensor::init(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
|||
|
||||
this->mIsInit = true;
|
||||
|
||||
this->createBuffer();
|
||||
this->allocateMemoryCreateGPUResources();
|
||||
}
|
||||
|
||||
std::vector<float>&
|
||||
|
|
@ -89,7 +89,7 @@ Tensor::tensorType()
|
|||
bool
|
||||
Tensor::isInit()
|
||||
{
|
||||
return this->mIsInit && this->mBuffer && this->mMemory;
|
||||
return this->mIsInit && this->mPrimaryBuffer && this->mPrimaryMemory;
|
||||
}
|
||||
|
||||
void
|
||||
|
|
@ -107,20 +107,71 @@ Tensor::recordCopyFrom(std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
|||
std::shared_ptr<Tensor> copyFromTensor,
|
||||
bool createBarrier)
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute Tensor recordCopyFrom called");
|
||||
|
||||
if (!this->mIsInit || !copyFromTensor->mIsInit) {
|
||||
throw std::runtime_error(
|
||||
"Kompute Tensor attempted to run createBuffer without init");
|
||||
}
|
||||
vk::DeviceSize bufferSize(this->memorySize());
|
||||
vk::BufferCopy copyRegion(0, 0, bufferSize);
|
||||
|
||||
SPDLOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", bufferSize);
|
||||
|
||||
this->copyBuffer(commandBuffer,
|
||||
copyFromTensor->mPrimaryBuffer,
|
||||
this->mPrimaryBuffer,
|
||||
bufferSize,
|
||||
copyRegion,
|
||||
createBarrier);
|
||||
}
|
||||
|
||||
void
|
||||
Tensor::recordCopyFromStagingToDevice(
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
bool createBarrier)
|
||||
{
|
||||
vk::DeviceSize bufferSize(this->memorySize());
|
||||
vk::BufferCopy copyRegion(0, 0, bufferSize);
|
||||
|
||||
SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);
|
||||
|
||||
commandBuffer->copyBuffer(
|
||||
*copyFromTensor->mBuffer, *this->mBuffer, copyRegion);
|
||||
this->copyBuffer(commandBuffer,
|
||||
this->mStagingBuffer,
|
||||
this->mPrimaryBuffer,
|
||||
bufferSize,
|
||||
copyRegion,
|
||||
createBarrier);
|
||||
}
|
||||
|
||||
void
|
||||
Tensor::recordCopyFromDeviceToStaging(
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
bool createBarrier)
|
||||
{
|
||||
vk::DeviceSize bufferSize(this->memorySize());
|
||||
vk::BufferCopy copyRegion(0, 0, bufferSize);
|
||||
|
||||
SPDLOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);
|
||||
|
||||
this->copyBuffer(commandBuffer,
|
||||
this->mPrimaryBuffer,
|
||||
this->mStagingBuffer,
|
||||
bufferSize,
|
||||
copyRegion,
|
||||
createBarrier);
|
||||
}
|
||||
|
||||
void
|
||||
Tensor::copyBuffer(std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::shared_ptr<vk::Buffer> bufferFrom,
|
||||
std::shared_ptr<vk::Buffer> bufferTo,
|
||||
vk::DeviceSize bufferSize,
|
||||
vk::BufferCopy copyRegion,
|
||||
bool createBarrier)
|
||||
{
|
||||
|
||||
if (!this->mIsInit) {
|
||||
throw std::runtime_error(
|
||||
"Kompute Tensor attempted to run copyBuffer without init");
|
||||
}
|
||||
|
||||
commandBuffer->copyBuffer(*bufferFrom, *bufferTo, copyRegion);
|
||||
|
||||
if (createBarrier) {
|
||||
// Buffer to ensure wait until data is copied to staging buffer
|
||||
|
|
@ -145,7 +196,7 @@ Tensor::recordBufferMemoryBarrier(
|
|||
vk::DeviceSize bufferSize = this->memorySize();
|
||||
|
||||
vk::BufferMemoryBarrier bufferMemoryBarrier;
|
||||
bufferMemoryBarrier.buffer = *this->mBuffer;
|
||||
bufferMemoryBarrier.buffer = *this->mPrimaryBuffer;
|
||||
bufferMemoryBarrier.size = bufferSize;
|
||||
bufferMemoryBarrier.srcAccessMask = srcAccessMask;
|
||||
bufferMemoryBarrier.dstAccessMask = dstAccessMask;
|
||||
|
|
@ -164,7 +215,7 @@ vk::DescriptorBufferInfo
|
|||
Tensor::constructDescriptorBufferInfo()
|
||||
{
|
||||
vk::DeviceSize bufferSize = this->memorySize();
|
||||
return vk::DescriptorBufferInfo(*this->mBuffer,
|
||||
return vk::DescriptorBufferInfo(*this->mPrimaryBuffer,
|
||||
0, // offset
|
||||
bufferSize);
|
||||
}
|
||||
|
|
@ -174,20 +225,21 @@ Tensor::mapDataFromHostMemory()
|
|||
{
|
||||
SPDLOG_DEBUG("Kompute Tensor mapping data from host buffer");
|
||||
|
||||
if (this->mTensorType != TensorTypes::eStaging) {
|
||||
SPDLOG_ERROR(
|
||||
"Mapping tensor data manually from DEVICE buffer instead of "
|
||||
"using record GPU command with staging buffer");
|
||||
return;
|
||||
std::shared_ptr<vk::DeviceMemory> hostVisibleMemory = nullptr;
|
||||
|
||||
if (this->mTensorType == TensorTypes::eHost) {
|
||||
hostVisibleMemory = this->mPrimaryMemory;
|
||||
} else {
|
||||
hostVisibleMemory = this->mStagingMemory;
|
||||
}
|
||||
|
||||
vk::DeviceSize bufferSize = this->memorySize();
|
||||
void* mapped = this->mDevice->mapMemory(
|
||||
*this->mMemory, 0, bufferSize, vk::MemoryMapFlags());
|
||||
vk::MappedMemoryRange mappedMemoryRange(*this->mMemory, 0, bufferSize);
|
||||
*hostVisibleMemory, 0, bufferSize, vk::MemoryMapFlags());
|
||||
vk::MappedMemoryRange mappedMemoryRange(*hostVisibleMemory, 0, bufferSize);
|
||||
this->mDevice->invalidateMappedMemoryRanges(mappedMemoryRange);
|
||||
memcpy(this->mData.data(), mapped, bufferSize);
|
||||
this->mDevice->unmapMemory(*this->mMemory);
|
||||
this->mDevice->unmapMemory(*hostVisibleMemory);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
@ -196,24 +248,26 @@ Tensor::mapDataIntoHostMemory()
|
|||
|
||||
SPDLOG_DEBUG("Kompute Tensor local mapping tensor data to host buffer");
|
||||
|
||||
if (this->mTensorType != TensorTypes::eStaging) {
|
||||
SPDLOG_ERROR("Mapping tensor data manually to DEVICE memory instead of "
|
||||
"using record GPU command with staging buffer");
|
||||
return;
|
||||
std::shared_ptr<vk::DeviceMemory> hostVisibleMemory = nullptr;
|
||||
|
||||
if (this->mTensorType == TensorTypes::eHost) {
|
||||
hostVisibleMemory = this->mPrimaryMemory;
|
||||
} else {
|
||||
hostVisibleMemory = this->mStagingMemory;
|
||||
}
|
||||
|
||||
vk::DeviceSize bufferSize = this->memorySize();
|
||||
|
||||
void* mapped = this->mDevice->mapMemory(
|
||||
*this->mMemory, 0, bufferSize, vk::MemoryMapFlags());
|
||||
*hostVisibleMemory, 0, bufferSize, vk::MemoryMapFlags());
|
||||
memcpy(mapped, this->mData.data(), bufferSize);
|
||||
vk::MappedMemoryRange mappedRange(*this->mMemory, 0, bufferSize);
|
||||
vk::MappedMemoryRange mappedRange(*hostVisibleMemory, 0, bufferSize);
|
||||
this->mDevice->flushMappedMemoryRanges(1, &mappedRange);
|
||||
this->mDevice->unmapMemory(*this->mMemory);
|
||||
this->mDevice->unmapMemory(*hostVisibleMemory);
|
||||
}
|
||||
|
||||
vk::BufferUsageFlags
|
||||
Tensor::getBufferUsageFlags()
|
||||
Tensor::getPrimaryBufferUsageFlags()
|
||||
{
|
||||
switch (this->mTensorType) {
|
||||
case TensorTypes::eDevice:
|
||||
|
|
@ -221,8 +275,9 @@ Tensor::getBufferUsageFlags()
|
|||
vk::BufferUsageFlagBits::eTransferSrc |
|
||||
vk::BufferUsageFlagBits::eTransferDst;
|
||||
break;
|
||||
case TensorTypes::eStaging:
|
||||
return vk::BufferUsageFlagBits::eTransferSrc |
|
||||
case TensorTypes::eHost:
|
||||
return vk::BufferUsageFlagBits::eStorageBuffer |
|
||||
vk::BufferUsageFlagBits::eTransferSrc |
|
||||
vk::BufferUsageFlagBits::eTransferDst;
|
||||
break;
|
||||
case TensorTypes::eStorage:
|
||||
|
|
@ -234,13 +289,13 @@ Tensor::getBufferUsageFlags()
|
|||
}
|
||||
|
||||
vk::MemoryPropertyFlags
|
||||
Tensor::getMemoryPropertyFlags()
|
||||
Tensor::getPrimaryMemoryPropertyFlags()
|
||||
{
|
||||
switch (this->mTensorType) {
|
||||
case TensorTypes::eDevice:
|
||||
return vk::MemoryPropertyFlagBits::eDeviceLocal;
|
||||
break;
|
||||
case TensorTypes::eStaging:
|
||||
case TensorTypes::eHost:
|
||||
return vk::MemoryPropertyFlagBits::eHostVisible;
|
||||
break;
|
||||
case TensorTypes::eStorage:
|
||||
|
|
@ -251,8 +306,33 @@ Tensor::getMemoryPropertyFlags()
|
|||
}
|
||||
}
|
||||
|
||||
vk::BufferUsageFlags
|
||||
Tensor::getStagingBufferUsageFlags()
|
||||
{
|
||||
switch (this->mTensorType) {
|
||||
case TensorTypes::eDevice:
|
||||
return vk::BufferUsageFlagBits::eTransferSrc |
|
||||
vk::BufferUsageFlagBits::eTransferDst;
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("Kompute Tensor invalid tensor type");
|
||||
}
|
||||
}
|
||||
|
||||
vk::MemoryPropertyFlags
|
||||
Tensor::getStagingMemoryPropertyFlags()
|
||||
{
|
||||
switch (this->mTensorType) {
|
||||
case TensorTypes::eDevice:
|
||||
return vk::MemoryPropertyFlagBits::eHostVisible;
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("Kompute Tensor invalid tensor type");
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Tensor::createBuffer()
|
||||
Tensor::allocateMemoryCreateGPUResources()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute Tensor creating buffer");
|
||||
|
||||
|
|
@ -268,43 +348,79 @@ Tensor::createBuffer()
|
|||
throw std::runtime_error("Kompute Tensor device is null");
|
||||
}
|
||||
|
||||
SPDLOG_DEBUG("Kompute Tensor creating primary buffer and memory");
|
||||
|
||||
vk::BufferUsageFlags usageFlags = this->getBufferUsageFlags();
|
||||
vk::DeviceSize bufferSize = this->memorySize();
|
||||
if(bufferSize<1){
|
||||
throw std::runtime_error("Kompute Tensor attempted to create a zero-sized buffer");
|
||||
this->mPrimaryBuffer = std::make_shared<vk::Buffer>();
|
||||
this->createBuffer(this->mPrimaryBuffer,
|
||||
this->getPrimaryBufferUsageFlags());
|
||||
this->mFreePrimaryBuffer = true;
|
||||
this->mPrimaryMemory = std::make_shared<vk::DeviceMemory>();
|
||||
this->allocateBindMemory(this->mPrimaryBuffer,
|
||||
this->mPrimaryMemory,
|
||||
this->getPrimaryMemoryPropertyFlags());
|
||||
this->mFreePrimaryMemory = true;
|
||||
|
||||
if (this->mTensorType == TensorTypes::eDevice) {
|
||||
SPDLOG_DEBUG("Kompute Tensor creating staging buffer and memory");
|
||||
|
||||
this->mStagingBuffer = std::make_shared<vk::Buffer>();
|
||||
this->createBuffer(this->mStagingBuffer,
|
||||
this->getStagingBufferUsageFlags());
|
||||
this->mFreeStagingBuffer = true;
|
||||
this->mStagingMemory = std::make_shared<vk::DeviceMemory>();
|
||||
this->allocateBindMemory(this->mStagingBuffer,
|
||||
this->mStagingMemory,
|
||||
this->getStagingMemoryPropertyFlags());
|
||||
this->mFreeStagingMemory = true;
|
||||
}
|
||||
|
||||
SPDLOG_DEBUG("Kompute Tensor buffer & memory creation successful");
|
||||
}
|
||||
|
||||
void
|
||||
Tensor::createBuffer(std::shared_ptr<vk::Buffer> buffer,
|
||||
vk::BufferUsageFlags bufferUsageFlags)
|
||||
{
|
||||
|
||||
vk::DeviceSize bufferSize = this->memorySize();
|
||||
|
||||
if (bufferSize < 1) {
|
||||
throw std::runtime_error(
|
||||
"Kompute Tensor attempted to create a zero-sized buffer");
|
||||
}
|
||||
|
||||
this->mFreeBuffer = true;
|
||||
|
||||
SPDLOG_DEBUG("Kompute Tensor creating buffer with memory size: {}, and "
|
||||
"usage flags: {}",
|
||||
bufferSize,
|
||||
vk::to_string(usageFlags));
|
||||
vk::to_string(bufferUsageFlags));
|
||||
|
||||
// TODO: Explore having concurrent sharing mode (with option)
|
||||
vk::BufferCreateInfo bufferInfo(vk::BufferCreateFlags(),
|
||||
bufferSize,
|
||||
usageFlags,
|
||||
bufferUsageFlags,
|
||||
vk::SharingMode::eExclusive);
|
||||
|
||||
this->mBuffer = std::make_shared<vk::Buffer>();
|
||||
this->mDevice->createBuffer(&bufferInfo, nullptr, this->mBuffer.get());
|
||||
this->mDevice->createBuffer(&bufferInfo, nullptr, buffer.get());
|
||||
}
|
||||
|
||||
SPDLOG_DEBUG("Kompute Tensor buffer created now creating memory");
|
||||
void
|
||||
Tensor::allocateBindMemory(std::shared_ptr<vk::Buffer> buffer,
|
||||
std::shared_ptr<vk::DeviceMemory> memory,
|
||||
vk::MemoryPropertyFlags memoryPropertyFlags)
|
||||
{
|
||||
|
||||
SPDLOG_DEBUG("Kompute Tensor allocating and binding memory");
|
||||
|
||||
vk::PhysicalDeviceMemoryProperties memoryProperties =
|
||||
this->mPhysicalDevice->getMemoryProperties();
|
||||
|
||||
vk::MemoryRequirements memoryRequirements =
|
||||
this->mDevice->getBufferMemoryRequirements(*this->mBuffer);
|
||||
|
||||
vk::MemoryPropertyFlags memoryPropertyFlags =
|
||||
this->getMemoryPropertyFlags();
|
||||
this->mDevice->getBufferMemoryRequirements(*buffer);
|
||||
|
||||
uint32_t memoryTypeIndex = -1;
|
||||
for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) {
|
||||
if (memoryRequirements.memoryTypeBits & (1 << i)) {
|
||||
if ((memoryProperties.memoryTypes[i].propertyFlags &
|
||||
if (((memoryProperties.memoryTypes[i]).propertyFlags &
|
||||
memoryPropertyFlags) == memoryPropertyFlags) {
|
||||
memoryTypeIndex = i;
|
||||
break;
|
||||
|
|
@ -316,8 +432,6 @@ Tensor::createBuffer()
|
|||
"Memory type index for buffer creation not found");
|
||||
}
|
||||
|
||||
this->mFreeMemory = true;
|
||||
|
||||
SPDLOG_DEBUG(
|
||||
"Kompute Tensor allocating memory index: {}, size {}, flags: {}",
|
||||
memoryTypeIndex,
|
||||
|
|
@ -327,13 +441,9 @@ Tensor::createBuffer()
|
|||
vk::MemoryAllocateInfo memoryAllocateInfo(memoryRequirements.size,
|
||||
memoryTypeIndex);
|
||||
|
||||
this->mMemory = std::make_shared<vk::DeviceMemory>();
|
||||
this->mDevice->allocateMemory(
|
||||
&memoryAllocateInfo, nullptr, this->mMemory.get());
|
||||
this->mDevice->allocateMemory(&memoryAllocateInfo, nullptr, memory.get());
|
||||
|
||||
this->mDevice->bindBufferMemory(*this->mBuffer, *this->mMemory, 0);
|
||||
|
||||
SPDLOG_DEBUG("Kompute Tensor buffer & memory creation successful");
|
||||
this->mDevice->bindBufferMemory(*buffer, *memory, 0);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
@ -349,29 +459,55 @@ Tensor::freeMemoryDestroyGPUResources()
|
|||
return;
|
||||
}
|
||||
|
||||
if (this->mFreeBuffer) {
|
||||
if (!this->mBuffer) {
|
||||
SPDLOG_ERROR(
|
||||
"Kompose Tensor expected to free buffer but got null buffer");
|
||||
if (this->mFreePrimaryBuffer) {
|
||||
if (!this->mPrimaryBuffer) {
|
||||
SPDLOG_ERROR("Kompose Tensor expected to destroy primary buffer "
|
||||
"but got null buffer");
|
||||
} else {
|
||||
SPDLOG_DEBUG("Kompose Tensor destroying buffer");
|
||||
SPDLOG_DEBUG("Kompose Tensor destroying primary buffer");
|
||||
this->mDevice->destroy(
|
||||
*this->mBuffer,
|
||||
*this->mPrimaryBuffer,
|
||||
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mBuffer = nullptr;
|
||||
this->mPrimaryBuffer = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
if (this->mFreeMemory) {
|
||||
if (!this->mMemory) {
|
||||
SPDLOG_ERROR(
|
||||
"Kompose Tensor expected to free buffer but got null memory");
|
||||
if (this->mFreeStagingBuffer) {
|
||||
if (!this->mStagingBuffer) {
|
||||
SPDLOG_ERROR("Kompose Tensor expected to destroy staging buffer "
|
||||
"but got null buffer");
|
||||
} else {
|
||||
SPDLOG_DEBUG("Kompose Tensor freeing memory");
|
||||
this->mDevice->freeMemory(
|
||||
*this->mMemory,
|
||||
SPDLOG_DEBUG("Kompose Tensor destroying staging buffer");
|
||||
this->mDevice->destroy(
|
||||
*this->mStagingBuffer,
|
||||
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mDevice = nullptr;
|
||||
this->mStagingBuffer = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
if (this->mFreePrimaryMemory) {
|
||||
if (!this->mPrimaryMemory) {
|
||||
SPDLOG_ERROR("Kompose Tensor expected to free primary memory but "
|
||||
"got null memory");
|
||||
} else {
|
||||
SPDLOG_DEBUG("Kompose Tensor freeing primary memory");
|
||||
this->mDevice->freeMemory(
|
||||
*this->mPrimaryMemory,
|
||||
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mPrimaryMemory = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
if (this->mFreeStagingMemory) {
|
||||
if (!this->mStagingMemory) {
|
||||
SPDLOG_ERROR("Kompose Tensor expected to free staging memory but "
|
||||
"got null memory");
|
||||
} else {
|
||||
SPDLOG_DEBUG("Kompose Tensor freeing staging memory");
|
||||
this->mDevice->freeMemory(
|
||||
*this->mStagingMemory,
|
||||
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
this->mStagingMemory = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ class Tensor
|
|||
enum class TensorTypes
|
||||
{
|
||||
eDevice = 0, ///< Type is device memory, source and destination
|
||||
eStaging = 1, ///< Type is host memory, source and destination
|
||||
eHost = 1, ///< Type is host memory, source and destination
|
||||
eStorage = 2, ///< Type is Device memory (only)
|
||||
};
|
||||
|
||||
|
|
@ -39,7 +39,8 @@ class Tensor
|
|||
* Default constructor with data provided which would be used to create the
|
||||
* respective vulkan buffer and memory.
|
||||
*
|
||||
* @param data Non-zero-sized vector of data that will be used by the tensor
|
||||
* @param data Non-zero-sized vector of data that will be used by the
|
||||
* tensor
|
||||
* @param tensorType Type for the tensor which is of type TensorTypes
|
||||
*/
|
||||
Tensor(const std::vector<float>& data,
|
||||
|
|
@ -131,6 +132,32 @@ class Tensor
|
|||
std::shared_ptr<Tensor> copyFromTensor,
|
||||
bool createBarrier);
|
||||
|
||||
/**
|
||||
* Records a copy from the internal staging memory to the device memory
|
||||
* using an optional barrier to wait for the operation. This function would
|
||||
* only be relevant for kp::Tensors of type eDevice.
|
||||
*
|
||||
* @param commandBuffer Vulkan Command Buffer to record the commands into
|
||||
* @param createBarrier Whether to create a barrier that ensures the data is
|
||||
* copied before further operations. Default is true.
|
||||
*/
|
||||
void recordCopyFromStagingToDevice(
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
bool createBarrier);
|
||||
|
||||
/**
|
||||
* Records a copy from the internal device memory to the staging memory
|
||||
* using an optional barrier to wait for the operation. This function would
|
||||
* only be relevant for kp::Tensors of type eDevice.
|
||||
*
|
||||
* @param commandBuffer Vulkan Command Buffer to record the commands into
|
||||
* @param createBarrier Whether to create a barrier that ensures the data is
|
||||
* copied before further operations. Default is true.
|
||||
*/
|
||||
void recordCopyFromDeviceToStaging(
|
||||
std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
bool createBarrier);
|
||||
|
||||
/**
|
||||
* Records the buffer memory barrier into the command buffer which
|
||||
* ensures that relevant data transfers are carried out correctly.
|
||||
|
|
@ -173,10 +200,14 @@ class Tensor
|
|||
std::shared_ptr<vk::Device> mDevice;
|
||||
|
||||
// -------------- OPTIONALLY OWNED RESOURCES
|
||||
std::shared_ptr<vk::Buffer> mBuffer;
|
||||
bool mFreeBuffer;
|
||||
std::shared_ptr<vk::DeviceMemory> mMemory;
|
||||
bool mFreeMemory;
|
||||
std::shared_ptr<vk::Buffer> mPrimaryBuffer;
|
||||
bool mFreePrimaryBuffer = false;
|
||||
std::shared_ptr<vk::Buffer> mStagingBuffer;
|
||||
bool mFreeStagingBuffer = false;
|
||||
std::shared_ptr<vk::DeviceMemory> mPrimaryMemory;
|
||||
bool mFreePrimaryMemory = false;
|
||||
std::shared_ptr<vk::DeviceMemory> mStagingMemory;
|
||||
bool mFreeStagingMemory = false;
|
||||
|
||||
// -------------- ALWAYS OWNED RESOURCES
|
||||
std::vector<float> mData;
|
||||
|
|
@ -186,11 +217,24 @@ class Tensor
|
|||
std::array<uint32_t, KP_MAX_DIM_SIZE> mShape;
|
||||
bool mIsInit = false;
|
||||
|
||||
void createBuffer(); // Creates the vulkan buffer
|
||||
void allocateMemoryCreateGPUResources(); // Creates the vulkan buffer
|
||||
void createBuffer(std::shared_ptr<vk::Buffer> buffer,
|
||||
vk::BufferUsageFlags bufferUsageFlags);
|
||||
void allocateBindMemory(std::shared_ptr<vk::Buffer> buffer,
|
||||
std::shared_ptr<vk::DeviceMemory> memory,
|
||||
vk::MemoryPropertyFlags memoryPropertyFlags);
|
||||
void copyBuffer(std::shared_ptr<vk::CommandBuffer> commandBuffer,
|
||||
std::shared_ptr<vk::Buffer> bufferFrom,
|
||||
std::shared_ptr<vk::Buffer> bufferTo,
|
||||
vk::DeviceSize bufferSize,
|
||||
vk::BufferCopy copyRegion,
|
||||
bool createBarrier);
|
||||
|
||||
// Private util functions
|
||||
vk::BufferUsageFlags getBufferUsageFlags();
|
||||
vk::MemoryPropertyFlags getMemoryPropertyFlags();
|
||||
vk::BufferUsageFlags getPrimaryBufferUsageFlags();
|
||||
vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags();
|
||||
vk::BufferUsageFlags getStagingBufferUsageFlags();
|
||||
vk::MemoryPropertyFlags getStagingMemoryPropertyFlags();
|
||||
uint64_t memorySize();
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -78,9 +78,6 @@ class OpAlgoLhsRhsOut : public OpAlgoBase
|
|||
std::shared_ptr<Tensor> mTensorLHS; ///< Reference to the parameter used in the left hand side equation of the shader
|
||||
std::shared_ptr<Tensor> mTensorRHS; ///< Reference to the parameter used in the right hand side equation of the shader
|
||||
std::shared_ptr<Tensor> mTensorOutput; ///< Reference to the parameter used in the output of the shader and will be copied with a staging vector
|
||||
|
||||
// -------------- ALWAYS OWNED RESOURCES
|
||||
std::shared_ptr<Tensor> mTensorOutputStaging; ///< Staging temporary tensor user do to copy the output of the tensor
|
||||
};
|
||||
|
||||
} // End namespace kp
|
||||
|
|
|
|||
|
|
@ -69,7 +69,7 @@ class OpBase
|
|||
if (tensor && tensor->isInit()) {
|
||||
tensor->freeMemoryDestroyGPUResources();
|
||||
} else {
|
||||
SPDLOG_ERROR("Kompute OpBase expected to free "
|
||||
SPDLOG_WARN("Kompute OpBase expected to free "
|
||||
"tensor but has already been freed.");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -69,8 +69,6 @@ class OpTensorCreate : public OpBase
|
|||
|
||||
|
||||
private:
|
||||
// Never owned resources
|
||||
std::vector<std::shared_ptr<Tensor>> mStagingTensors;
|
||||
};
|
||||
|
||||
} // End namespace kp
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@
|
|||
namespace kp {
|
||||
|
||||
/**
|
||||
Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the record (as opposed to during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging.
|
||||
Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging.
|
||||
*/
|
||||
class OpTensorSyncDevice : public OpBase
|
||||
{
|
||||
|
|
@ -35,12 +35,12 @@ class OpTensorSyncDevice : public OpBase
|
|||
~OpTensorSyncDevice() override;
|
||||
|
||||
/**
|
||||
* Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging. For staging tensors in host memory, the map is performed during the init function.
|
||||
* Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element.
|
||||
*/
|
||||
void init() override;
|
||||
|
||||
/**
|
||||
* For device tensors, it records the copy command to the device tensor from the temporary staging tensor.
|
||||
* For device tensors, it records the copy command for the tensor to copy the data from its staging to device memory.
|
||||
*/
|
||||
void record() override;
|
||||
|
||||
|
|
@ -55,8 +55,6 @@ class OpTensorSyncDevice : public OpBase
|
|||
virtual void postEval() override;
|
||||
|
||||
private:
|
||||
// Never owned resources
|
||||
std::vector<std::shared_ptr<Tensor>> mStagingTensors;
|
||||
};
|
||||
|
||||
} // End namespace kp
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@
|
|||
namespace kp {
|
||||
|
||||
/**
|
||||
Operation that syncs tensor's local data by mapping the data from device memory into the local vector. For TensorTypes::eDevice it will use a staging tensor to perform the copy. For TensorTypes::eStaging it will only copy the data and perform a map, which will be executed during the postSubmit (there will be no copy during the sequence eval/submit). This function cannot be carried out for TensorTypes::eStaging.
|
||||
Operation that syncs tensor's local memory by mapping device data into the local CPU memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging.
|
||||
*/
|
||||
class OpTensorSyncLocal : public OpBase
|
||||
{
|
||||
|
|
@ -30,17 +30,17 @@ class OpTensorSyncLocal : public OpBase
|
|||
std::vector<std::shared_ptr<Tensor>> tensors);
|
||||
|
||||
/**
|
||||
* Default destructor. This class manages the memory of the staging tensors it owns but these are released in the postSubmit, before it arrives to the destructor.
|
||||
* Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release.
|
||||
*/
|
||||
~OpTensorSyncLocal() override;
|
||||
|
||||
/**
|
||||
* Performs basic checks such as ensuring that there is at least one tensor provided, that they are initialized and that they are not of type TensorTpes::eStaging.
|
||||
* Performs basic checks such as ensuring that there is at least one tensor provided with min memory of 1 element.
|
||||
*/
|
||||
void init() override;
|
||||
|
||||
/**
|
||||
* For device tensors, it records the copy command into the staging tensor from the device tensor.
|
||||
* For device tensors, it records the copy command for the tensor to copy the data from its device to staging memory.
|
||||
*/
|
||||
void record() override;
|
||||
|
||||
|
|
@ -56,8 +56,6 @@ class OpTensorSyncLocal : public OpBase
|
|||
|
||||
|
||||
private:
|
||||
// Never owned resources
|
||||
std::vector<std::shared_ptr<Tensor>> mStagingTensors;
|
||||
};
|
||||
|
||||
} // End namespace kp
|
||||
|
|
|
|||
|
|
@ -105,12 +105,12 @@ TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy)
|
|||
std::shared_ptr<kp::Tensor> y{ new kp::Tensor({ 0, 0, 0, 1, 1 }) };
|
||||
|
||||
std::shared_ptr<kp::Tensor> wIn{ new kp::Tensor(
|
||||
wInVec, kp::Tensor::TensorTypes::eStaging) };
|
||||
wInVec, kp::Tensor::TensorTypes::eHost) };
|
||||
std::shared_ptr<kp::Tensor> wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
|
||||
std::shared_ptr<kp::Tensor> wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
|
||||
|
||||
std::shared_ptr<kp::Tensor> bIn{ new kp::Tensor(
|
||||
bInVec, kp::Tensor::TensorTypes::eStaging) };
|
||||
bInVec, kp::Tensor::TensorTypes::eHost) };
|
||||
std::shared_ptr<kp::Tensor> bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
|
||||
|
||||
std::shared_ptr<kp::Tensor> lOut{ new kp::Tensor({ 0, 0, 0, 0, 0 }) };
|
||||
|
|
|
|||
|
|
@ -126,7 +126,7 @@ TEST(TestManager, TestCreateInitTensor)
|
|||
EXPECT_EQ(tensorB->data(), std::vector<float>({ 0, 1, 2 }));
|
||||
|
||||
std::shared_ptr<kp::Tensor> tensorC =
|
||||
mgr.buildTensor({ 0, 0, 0 }, kp::Tensor::TensorTypes::eStaging);
|
||||
mgr.buildTensor({ 0, 0, 0 }, kp::Tensor::TensorTypes::eHost);
|
||||
|
||||
mgr.evalOpDefault<kp::OpTensorCopy>({ tensorA, tensorC });
|
||||
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ TEST(TestOpTensorCopy, CopyDeviceToDeviceTensorMulti)
|
|||
EXPECT_EQ(tensorA->data(), tensorC->data());
|
||||
}
|
||||
|
||||
TEST(TestOpTensorCopy, CopyDeviceToStagingTensor)
|
||||
TEST(TestOpTensorCopy, CopyDeviceToHostTensor)
|
||||
{
|
||||
|
||||
kp::Manager mgr;
|
||||
|
|
@ -68,7 +68,7 @@ TEST(TestOpTensorCopy, CopyDeviceToStagingTensor)
|
|||
|
||||
std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor(testVecA) };
|
||||
std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor(
|
||||
testVecB, kp::Tensor::TensorTypes::eStaging) };
|
||||
testVecB, kp::Tensor::TensorTypes::eHost) };
|
||||
|
||||
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
|
||||
|
||||
|
|
@ -84,7 +84,7 @@ TEST(TestOpTensorCopy, CopyDeviceToStagingTensor)
|
|||
EXPECT_EQ(tensorA->data(), tensorB->data());
|
||||
}
|
||||
|
||||
TEST(TestOpTensorCopy, CopyStagingToDeviceTensor)
|
||||
TEST(TestOpTensorCopy, CopyHostToDeviceTensor)
|
||||
{
|
||||
|
||||
kp::Manager mgr;
|
||||
|
|
@ -93,7 +93,7 @@ TEST(TestOpTensorCopy, CopyStagingToDeviceTensor)
|
|||
std::vector<float> testVecB{ 0, 0, 0 };
|
||||
|
||||
std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor(
|
||||
testVecA, kp::Tensor::TensorTypes::eStaging) };
|
||||
testVecA, kp::Tensor::TensorTypes::eHost) };
|
||||
std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor(testVecB) };
|
||||
|
||||
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
|
||||
|
|
@ -110,7 +110,7 @@ TEST(TestOpTensorCopy, CopyStagingToDeviceTensor)
|
|||
EXPECT_EQ(tensorA->data(), tensorB->data());
|
||||
}
|
||||
|
||||
TEST(TestOpTensorCopy, CopyStagingToStagingTensor)
|
||||
TEST(TestOpTensorCopy, CopyHostToHostTensor)
|
||||
{
|
||||
|
||||
kp::Manager mgr;
|
||||
|
|
@ -119,9 +119,9 @@ TEST(TestOpTensorCopy, CopyStagingToStagingTensor)
|
|||
std::vector<float> testVecB{ 0, 0, 0 };
|
||||
|
||||
std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor(
|
||||
testVecA, kp::Tensor::TensorTypes::eStaging) };
|
||||
testVecA, kp::Tensor::TensorTypes::eHost) };
|
||||
std::shared_ptr<kp::Tensor> tensorB{ new kp::Tensor(
|
||||
testVecB, kp::Tensor::TensorTypes::eStaging) };
|
||||
testVecB, kp::Tensor::TensorTypes::eHost) };
|
||||
|
||||
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
|
||||
|
||||
|
|
@ -145,7 +145,7 @@ TEST(TestOpTensorCopy, SingleTensorShouldFail)
|
|||
std::vector<float> testVecA{ 9, 8, 7 };
|
||||
|
||||
std::shared_ptr<kp::Tensor> tensorA{ new kp::Tensor(
|
||||
testVecA, kp::Tensor::TensorTypes::eStaging) };
|
||||
testVecA, kp::Tensor::TensorTypes::eHost) };
|
||||
|
||||
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA });
|
||||
|
||||
|
|
|
|||
|
|
@ -114,7 +114,6 @@ TEST(TestOpTensorCreate, NoErrorIfTensorFreedBefore)
|
|||
EXPECT_FALSE(tensorB->isInit());
|
||||
}
|
||||
|
||||
|
||||
TEST(TestOpTensorCreate, ExceptionOnZeroSizeTensor)
|
||||
{
|
||||
std::vector<float> testVecA;
|
||||
|
|
@ -123,11 +122,11 @@ TEST(TestOpTensorCreate, ExceptionOnZeroSizeTensor)
|
|||
|
||||
kp::Manager mgr;
|
||||
|
||||
try{
|
||||
try {
|
||||
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA });
|
||||
} catch( const std::runtime_error& err ) {
|
||||
// check exception
|
||||
ASSERT_TRUE( std::string(err.what()).find("zero-sized") != std::string::npos );
|
||||
} catch (const std::runtime_error& err) {
|
||||
// check exception
|
||||
ASSERT_TRUE(std::string(err.what()).find("zero-sized") !=
|
||||
std::string::npos);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -17,9 +17,9 @@ TEST(TestTensor, CopyFromHostData)
|
|||
std::vector<float> vecB{ 0, 0, 0 };
|
||||
|
||||
std::shared_ptr<kp::Tensor> tensorA =
|
||||
std::make_shared<kp::Tensor>(vecA, kp::Tensor::TensorTypes::eStaging);
|
||||
std::make_shared<kp::Tensor>(vecA, kp::Tensor::TensorTypes::eHost);
|
||||
std::shared_ptr<kp::Tensor> tensorB =
|
||||
std::make_shared<kp::Tensor>(vecB, kp::Tensor::TensorTypes::eStaging);
|
||||
std::make_shared<kp::Tensor>(vecB, kp::Tensor::TensorTypes::eHost);
|
||||
|
||||
kp::Manager mgr;
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue