Merge pull request #176 from alexander-g/timestamps
Support for Timestamping
This commit is contained in:
commit
cc1ec748a7
8 changed files with 221 additions and 42 deletions
2
Makefile
2
Makefile
|
|
@ -13,7 +13,7 @@ VCPKG_WIN_PATH ?= "C:\\Users\\axsau\\Programming\\lib\\vcpkg\\scripts\\buildsyst
|
|||
VCPKG_UNIX_PATH ?= "/c/Users/axsau/Programming/lib/vcpkg/scripts/buildsystems/vcpkg.cmake"
|
||||
|
||||
# Regex to pass to catch2 to filter tests
|
||||
FILTER_TESTS ?= "-TestAsyncOperations.TestManagerParallelExecution"
|
||||
FILTER_TESTS ?= "-TestAsyncOperations.TestManagerParallelExecution:TestSequence.SequenceTimestamps"
|
||||
|
||||
ifeq ($(OS),Windows_NT) # is Windows_NT on XP, 2000, 7, Vista, 10...
|
||||
CMAKE_BIN ?= "C:\Program Files\CMake\bin\cmake.exe"
|
||||
|
|
|
|||
|
|
@ -129,6 +129,7 @@ PYBIND11_MODULE(kp, m) {
|
|||
.def("is_recording", &kp::Sequence::isRecording)
|
||||
.def("is_running", &kp::Sequence::isRunning)
|
||||
.def("is_init", &kp::Sequence::isInit)
|
||||
.def("get_timestamps", &kp::Sequence::getTimestamps)
|
||||
.def("clear", &kp::Sequence::clear)
|
||||
.def("destroy", &kp::Sequence::destroy);
|
||||
|
||||
|
|
@ -139,7 +140,7 @@ PYBIND11_MODULE(kp, m) {
|
|||
py::arg("device") = 0,
|
||||
py::arg("family_queue_indices") = std::vector<uint32_t>(),
|
||||
py::arg("desired_extensions") = std::vector<std::string>())
|
||||
.def("sequence", &kp::Manager::sequence, py::arg("queueIndex") = 0)
|
||||
.def("sequence", &kp::Manager::sequence, py::arg("queue_index") = 0, py::arg("total_timestamps") = 0)
|
||||
.def("tensor", [np](kp::Manager& self,
|
||||
const py::array_t<float> data,
|
||||
kp::Tensor::TensorTypes tensor_type) {
|
||||
|
|
|
|||
|
|
@ -820,12 +820,14 @@ class Tensor
|
|||
};
|
||||
|
||||
/**
|
||||
* Default constructor with data provided which would be used to create the
|
||||
* Constructor with data provided which would be used to create the
|
||||
* respective vulkan buffer and memory.
|
||||
*
|
||||
* @param physicalDevice The physical device to use to fetch properties
|
||||
* @param device The device to use to create the buffer and memory from
|
||||
* @param data Non-zero-sized vector of data that will be used by the
|
||||
* tensor
|
||||
* @param tensorType Type for the tensor which is of type TensorTypes
|
||||
* @param tensorTypes Type for the tensor which is of type TensorTypes
|
||||
*/
|
||||
Tensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
|
|
@ -839,10 +841,11 @@ class Tensor
|
|||
~Tensor();
|
||||
|
||||
/**
|
||||
* Initialiser which calls the initialisation for all the respective tensors
|
||||
* as well as creates the respective staging tensors. The staging tensors
|
||||
* would only be created for the tensors of type TensorType::eDevice as
|
||||
* otherwise there is no need to copy from host memory.
|
||||
* Function to trigger reinitialisation of the tensor buffer and memory with
|
||||
* new data as well as new potential device type.
|
||||
*
|
||||
* @param data Vector of data to use to initialise vector from
|
||||
* @param tensorType The type to use for the tensor
|
||||
*/
|
||||
void rebuild(const std::vector<float>& data,
|
||||
TensorTypes tensorType = TensorTypes::eDevice);
|
||||
|
|
@ -852,6 +855,11 @@ class Tensor
|
|||
*/
|
||||
void destroy();
|
||||
|
||||
/**
|
||||
* Check whether tensor is initialized based on the created gpu resources.
|
||||
*
|
||||
* @returns Boolean stating whether tensor is initialized
|
||||
*/
|
||||
bool isInit();
|
||||
|
||||
/**
|
||||
|
|
@ -1210,6 +1218,8 @@ class OpBase
|
|||
* The record function is intended to only send a record command or run
|
||||
* commands that are expected to record operations that are to be submitted
|
||||
* as a batch into the GPU.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
virtual void record(const vk::CommandBuffer& commandBuffer) = 0;
|
||||
|
||||
|
|
@ -1220,6 +1230,8 @@ class OpBase
|
|||
* there are situations where eval can be called multiple times, so the
|
||||
* resources that are created should be idempotent in case it's called multiple
|
||||
* times in a row.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
virtual void preEval(const vk::CommandBuffer& commandBuffer) = 0;
|
||||
|
||||
|
|
@ -1230,6 +1242,8 @@ class OpBase
|
|||
* there are situations where eval can be called multiple times, so the
|
||||
* resources that are destroyed should not require a re-init unless explicitly
|
||||
* provided by the user.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
virtual void postEval(const vk::CommandBuffer& commandBuffer) = 0;
|
||||
};
|
||||
|
|
@ -1239,38 +1253,47 @@ class OpBase
|
|||
namespace kp {
|
||||
|
||||
/**
|
||||
Operation that copies the data from the first tensor to the rest of the tensors provided, using a record command for all the vectors. This operation does not own/manage the memory of the tensors passed to it. The operation must only receive tensors of type
|
||||
* Operation that copies the data from the first tensor to the rest of the tensors
|
||||
* provided, using a record command for all the vectors. This operation does not
|
||||
* own/manage the memory of the tensors passed to it. The operation must only
|
||||
* receive tensors of type
|
||||
*/
|
||||
class OpTensorCopy : public OpBase
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation.
|
||||
* Default constructor with parameters that provides the core vulkan resources
|
||||
* and the tensors that will be used in the operation.
|
||||
*
|
||||
* @param physicalDevice Vulkan physical device used to find device queues
|
||||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that will be used in the operation.
|
||||
*/
|
||||
OpTensorCopy(const std::vector<std::shared_ptr<Tensor>>& tensors);
|
||||
|
||||
/**
|
||||
* Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release.
|
||||
* Default destructor. This class does not manage memory so it won't be
|
||||
* expecting the parent to perform a release.
|
||||
*/
|
||||
~OpTensorCopy() override;
|
||||
|
||||
/**
|
||||
* Records the copy commands from the first tensor into all the other tensors provided. Also optionally records a barrier.
|
||||
* Records the copy commands from the first tensor into all the other
|
||||
* tensors provided. Also optionally records a barrier.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
void record(const vk::CommandBuffer& commandBuffer) override;
|
||||
|
||||
/**
|
||||
* Does not perform any preEval commands.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
|
||||
|
||||
/**
|
||||
* Copies the local vectors for all the tensors to sync the data with the gpu.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
|
||||
|
||||
|
|
@ -1284,17 +1307,20 @@ class OpTensorCopy : public OpBase
|
|||
namespace kp {
|
||||
|
||||
/**
|
||||
Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging.
|
||||
* Operation that syncs tensor's device by mapping local data into the device memory.
|
||||
* For TensorTypes::eDevice it will use a record operation for the memory to be synced
|
||||
* into GPU memory which means that the operation will be done in sync with GPU commands.
|
||||
* For TensorTypes::eHost it will only map the data into host memory which will
|
||||
* happen during preEval before the recorded commands are dispatched.
|
||||
*/
|
||||
class OpTensorSyncDevice : public OpBase
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensos provided cannot be of type TensorTypes::eStorage.
|
||||
* Default constructor with parameters that provides the core vulkan resources
|
||||
* and the tensors that will be used in the operation. The tensors provided cannot
|
||||
* be of type TensorTypes::eStorage.
|
||||
*
|
||||
* @param physicalDevice Vulkan physical device used to find device queues
|
||||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that will be used in the operation.
|
||||
*/
|
||||
OpTensorSyncDevice(const std::vector<std::shared_ptr<Tensor>>& tensors);
|
||||
|
|
@ -1305,17 +1331,24 @@ class OpTensorSyncDevice : public OpBase
|
|||
~OpTensorSyncDevice() override;
|
||||
|
||||
/**
|
||||
* For device tensors, it records the copy command for the tensor to copy the data from its staging to device memory.
|
||||
* For device tensors, it records the copy command for the tensor to copy the
|
||||
* data from its staging to device memory.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
void record(const vk::CommandBuffer& commandBuffer) override;
|
||||
|
||||
/**
|
||||
* Does not perform any preEval commands.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
|
||||
|
||||
/**
|
||||
* Does not perform any postEval commands.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
|
||||
|
||||
|
|
@ -1329,38 +1362,50 @@ class OpTensorSyncDevice : public OpBase
|
|||
namespace kp {
|
||||
|
||||
/**
|
||||
Operation that syncs tensor's local memory by mapping device data into the local CPU memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging.
|
||||
* Operation that syncs tensor's local memory by mapping device data into the
|
||||
* local CPU memory. For TensorTypes::eDevice it will use a record operation
|
||||
* for the memory to be synced into GPU memory which means that the operation
|
||||
* will be done in sync with GPU commands. For TensorTypes::eHost it will
|
||||
* only map the data into host memory which will happen during preEval before
|
||||
* the recorded commands are dispatched.
|
||||
*/
|
||||
class OpTensorSyncLocal : public OpBase
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensors provided cannot be of type TensorTypes::eStorage.
|
||||
* Default constructor with parameters that provides the core vulkan resources
|
||||
* and the tensors that will be used in the operation. The tensors provided
|
||||
* cannot be of type TensorTypes::eStorage.
|
||||
*
|
||||
* @param physicalDevice Vulkan physical device used to find device queues
|
||||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that will be used in the operation.
|
||||
*/
|
||||
OpTensorSyncLocal(const std::vector<std::shared_ptr<Tensor>>& tensors);
|
||||
|
||||
/**
|
||||
* Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release.
|
||||
* Default destructor. This class does not manage memory so it won't be expecting
|
||||
* the parent to perform a release.
|
||||
*/
|
||||
~OpTensorSyncLocal() override;
|
||||
|
||||
/**
|
||||
* For device tensors, it records the copy command for the tensor to copy the data from its device to staging memory.
|
||||
* For device tensors, it records the copy command for the tensor to copy the
|
||||
* data from its device to staging memory.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
void record(const vk::CommandBuffer& commandBuffer) override;
|
||||
|
||||
/**
|
||||
* Does not perform any preEval commands.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
|
||||
|
||||
/**
|
||||
* For host tensors it performs the map command from the host memory into local memory.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
|
||||
|
||||
|
|
@ -1383,6 +1428,13 @@ class OpAlgoDispatch : public OpBase
|
|||
{
|
||||
public:
|
||||
|
||||
/**
|
||||
* Constructor that stores the algorithm to use as well as the relevant
|
||||
* push constants to override when recording.
|
||||
*
|
||||
* @param algorithm The algorithm object to use for dispatch
|
||||
* @param pushConstants The push constants to use for override
|
||||
*/
|
||||
OpAlgoDispatch(const std::shared_ptr<kp::Algorithm>& algorithm,
|
||||
const kp::Constants& pushConstants = {});
|
||||
|
||||
|
|
@ -1399,18 +1451,22 @@ class OpAlgoDispatch : public OpBase
|
|||
* shader processing to the gpu. This function also records the GPU memory
|
||||
* copy of the output data for the staging buffer so it can be read by the
|
||||
* host.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
virtual void record(const vk::CommandBuffer& commandBuffer) override;
|
||||
|
||||
/**
|
||||
* Does not perform any preEval commands.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
|
||||
|
||||
/**
|
||||
* Executes after the recorded commands are submitted, and performs a copy
|
||||
* of the GPU Device memory into the staging buffer so the output data can
|
||||
* be retrieved.
|
||||
* Does not perform any postEval commands.
|
||||
*
|
||||
* @param commandBuffer The command buffer to record the command into.
|
||||
*/
|
||||
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
|
||||
|
||||
|
|
@ -1439,11 +1495,9 @@ class OpMult : public OpAlgoDispatch
|
|||
* requirements for the operations to be able to create and manage their
|
||||
* sub-components.
|
||||
*
|
||||
* @param physicalDevice Vulkan physical device used to find device queues
|
||||
* @param device Vulkan logical device for passing to Algorithm
|
||||
* @param commandBuffer Vulkan Command Buffer to record commands into
|
||||
* @param tensors Tensors that are to be used in this operation
|
||||
* @param komputeWorkgroup Optional parameter to specify the layout for processing
|
||||
* @param algorithm An algorithm that will be overridden with the OpMult
|
||||
* shader data and the tensors provided which are expected to be 3
|
||||
*/
|
||||
OpMult(std::vector<std::shared_ptr<Tensor>> tensors, std::shared_ptr<Algorithm> algorithm)
|
||||
: OpAlgoDispatch(algorithm)
|
||||
|
|
@ -1489,11 +1543,13 @@ class Sequence : public std::enable_shared_from_this<Sequence>
|
|||
* @param device Vulkan logical device
|
||||
* @param computeQueue Vulkan compute queue
|
||||
* @param queueIndex Vulkan compute queue index in device
|
||||
* @param totalTimestamps Maximum number of timestamps to allocate
|
||||
*/
|
||||
Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::Queue> computeQueue,
|
||||
uint32_t queueIndex);
|
||||
uint32_t queueIndex,
|
||||
uint32_t totalTimestamps = 0);
|
||||
/**
|
||||
* Destructor for sequence which is responsible for cleaning all subsequent
|
||||
* owned operations.
|
||||
|
|
@ -1669,6 +1725,12 @@ class Sequence : public std::enable_shared_from_this<Sequence>
|
|||
*/
|
||||
void clear();
|
||||
|
||||
/**
|
||||
* Return the timestamps that were latched at the beginning and
|
||||
* after each operation during the last eval() call.
|
||||
*/
|
||||
std::vector<std::uint64_t> getTimestamps();
|
||||
|
||||
/**
|
||||
* Begins recording commands for commands to be submitted into the command
|
||||
* buffer.
|
||||
|
|
@ -1737,6 +1799,7 @@ class Sequence : public std::enable_shared_from_this<Sequence>
|
|||
// -------------- ALWAYS OWNED RESOURCES
|
||||
vk::Fence mFence;
|
||||
std::vector<std::shared_ptr<OpBase>> mOperations;
|
||||
std::shared_ptr<vk::QueryPool> timestampQueryPool = nullptr;
|
||||
|
||||
// State
|
||||
bool mRecording = false;
|
||||
|
|
@ -1745,6 +1808,7 @@ class Sequence : public std::enable_shared_from_this<Sequence>
|
|||
// Create functions
|
||||
void createCommandPool();
|
||||
void createCommandBuffer();
|
||||
void createTimestampQueryPool(uint32_t totalTimestamps);
|
||||
};
|
||||
|
||||
} // End namespace kp
|
||||
|
|
@ -1805,9 +1869,11 @@ class Manager
|
|||
* if it hasn't been destroyed by its reference count going to zero.
|
||||
*
|
||||
* @param queueIndex The queue to use from the available queues
|
||||
* @param nrOfTimestamps The maximum number of timestamps to allocate.
|
||||
* If zero (default), disables latching of timestamps.
|
||||
* @returns Shared pointer with initialised sequence
|
||||
*/
|
||||
std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0);
|
||||
std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0, uint32_t nrOfTimestamps = 0);
|
||||
|
||||
/**
|
||||
* Create a managed tensor that will be destroyed by this manager
|
||||
|
|
|
|||
|
|
@ -431,7 +431,7 @@ Manager::algorithm(const std::vector<std::shared_ptr<Tensor>>& tensors,
|
|||
}
|
||||
|
||||
std::shared_ptr<Sequence>
|
||||
Manager::sequence(uint32_t queueIndex)
|
||||
Manager::sequence(uint32_t queueIndex, uint32_t totalTimestamps)
|
||||
{
|
||||
KP_LOG_DEBUG("Kompute Manager sequence() with queueIndex: {}", queueIndex);
|
||||
|
||||
|
|
@ -439,7 +439,8 @@ Manager::sequence(uint32_t queueIndex)
|
|||
this->mPhysicalDevice,
|
||||
this->mDevice,
|
||||
this->mComputeQueues[queueIndex],
|
||||
this->mComputeQueueFamilyIndices[queueIndex]) };
|
||||
this->mComputeQueueFamilyIndices[queueIndex],
|
||||
totalTimestamps) };
|
||||
|
||||
if (this->mManageResources) {
|
||||
this->mManagedSequences.push_back(sq);
|
||||
|
|
|
|||
|
|
@ -6,7 +6,8 @@ namespace kp {
|
|||
Sequence::Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::Queue> computeQueue,
|
||||
uint32_t queueIndex)
|
||||
uint32_t queueIndex,
|
||||
uint32_t totalTimestamps)
|
||||
{
|
||||
KP_LOG_DEBUG("Kompute Sequence Constructor with existing device & queue");
|
||||
|
||||
|
|
@ -17,6 +18,8 @@ Sequence::Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
|||
|
||||
this->createCommandPool();
|
||||
this->createCommandBuffer();
|
||||
if(totalTimestamps>0)
|
||||
this->createTimestampQueryPool(totalTimestamps+1); //+1 for the first one
|
||||
}
|
||||
|
||||
Sequence::~Sequence()
|
||||
|
|
@ -44,6 +47,13 @@ Sequence::begin()
|
|||
KP_LOG_INFO("Kompute Sequence command now started recording");
|
||||
this->mCommandBuffer->begin(vk::CommandBufferBeginInfo());
|
||||
this->mRecording = true;
|
||||
|
||||
//latch the first timestamp before any commands are submitted
|
||||
if(this->timestampQueryPool)
|
||||
this->mCommandBuffer->writeTimestamp(
|
||||
vk::PipelineStageFlagBits::eAllCommands,
|
||||
*this->timestampQueryPool, 0
|
||||
);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
@ -236,6 +246,16 @@ Sequence::destroy()
|
|||
this->mOperations.clear();
|
||||
}
|
||||
|
||||
if(this->timestampQueryPool){
|
||||
KP_LOG_INFO("Destroying QueryPool");
|
||||
this->mDevice->destroy(
|
||||
*this->timestampQueryPool,
|
||||
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
|
||||
|
||||
this->timestampQueryPool = nullptr;
|
||||
KP_LOG_DEBUG("Kompute Sequence Destroyed QueryPool");
|
||||
}
|
||||
|
||||
if (this->mDevice) {
|
||||
this->mDevice = nullptr;
|
||||
}
|
||||
|
|
@ -261,6 +281,12 @@ Sequence::record(std::shared_ptr<OpBase> op)
|
|||
|
||||
this->mOperations.push_back(op);
|
||||
|
||||
if(this->timestampQueryPool)
|
||||
this->mCommandBuffer->writeTimestamp(
|
||||
vk::PipelineStageFlagBits::eAllCommands,
|
||||
*this->timestampQueryPool, this->mOperations.size()
|
||||
);
|
||||
|
||||
return shared_from_this();
|
||||
}
|
||||
|
||||
|
|
@ -308,4 +334,46 @@ Sequence::createCommandBuffer()
|
|||
KP_LOG_DEBUG("Kompute Sequence Command Buffer Created");
|
||||
}
|
||||
|
||||
void
|
||||
Sequence::createTimestampQueryPool(uint32_t totalTimestamps)
|
||||
{
|
||||
KP_LOG_DEBUG("Kompute Sequence creating query pool");
|
||||
if (!this->isInit()) {
|
||||
throw std::runtime_error("createTimestampQueryPool() called on uninitialized Sequence");
|
||||
}
|
||||
if (!this->mPhysicalDevice) {
|
||||
throw std::runtime_error("Kompute Sequence physical device is null");
|
||||
}
|
||||
|
||||
vk::PhysicalDeviceProperties physicalDeviceProperties =
|
||||
this->mPhysicalDevice->getProperties();
|
||||
|
||||
if(physicalDeviceProperties.limits.timestampComputeAndGraphics){
|
||||
vk::QueryPoolCreateInfo queryPoolInfo;
|
||||
queryPoolInfo.setQueryCount(totalTimestamps);
|
||||
queryPoolInfo.setQueryType(vk::QueryType::eTimestamp);
|
||||
this->timestampQueryPool = std::make_shared<vk::QueryPool>(this->mDevice->createQueryPool(queryPoolInfo));
|
||||
|
||||
KP_LOG_DEBUG("Query pool for timestamps created");
|
||||
}
|
||||
else{
|
||||
throw std::runtime_error("Device does not support timestamps");
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::uint64_t>
|
||||
Sequence::getTimestamps()
|
||||
{
|
||||
if(!this->timestampQueryPool)
|
||||
throw std::runtime_error("Timestamp latching not enabled");
|
||||
|
||||
const auto n = this->mOperations.size()+1;
|
||||
std::vector<std::uint64_t> timestamps(n, 0);
|
||||
this->mDevice->getQueryPoolResults(*this->timestampQueryPool,
|
||||
0, n, timestamps.size()*sizeof(std::uint64_t), timestamps.data(),
|
||||
sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
|
||||
|
||||
return timestamps;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -60,9 +60,11 @@ class Manager
|
|||
* if it hasn't been destroyed by its reference count going to zero.
|
||||
*
|
||||
* @param queueIndex The queue to use from the available queues
|
||||
* @param totalTimestamps The maximum number of timestamps to allocate.
|
||||
* If zero (default), disables latching of timestamps.
|
||||
* @returns Shared pointer with initialised sequence
|
||||
*/
|
||||
std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0);
|
||||
std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0, uint32_t totalTimestamps = 0);
|
||||
|
||||
/**
|
||||
* Create a managed tensor that will be destroyed by this manager
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
#include "kompute/Core.hpp"
|
||||
|
||||
#include "kompute/operations/OpBase.hpp"
|
||||
#include "kompute/operations/OpAlgoDispatch.hpp"
|
||||
|
||||
namespace kp {
|
||||
|
||||
|
|
@ -20,11 +21,13 @@ class Sequence : public std::enable_shared_from_this<Sequence>
|
|||
* @param device Vulkan logical device
|
||||
* @param computeQueue Vulkan compute queue
|
||||
* @param queueIndex Vulkan compute queue index in device
|
||||
* @param totalTimestamps Maximum number of timestamps to allocate
|
||||
*/
|
||||
Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
|
||||
std::shared_ptr<vk::Device> device,
|
||||
std::shared_ptr<vk::Queue> computeQueue,
|
||||
uint32_t queueIndex);
|
||||
uint32_t queueIndex,
|
||||
uint32_t totalTimestamps = 0);
|
||||
/**
|
||||
* Destructor for sequence which is responsible for cleaning all subsequent
|
||||
* owned operations.
|
||||
|
|
@ -200,6 +203,12 @@ class Sequence : public std::enable_shared_from_this<Sequence>
|
|||
*/
|
||||
void clear();
|
||||
|
||||
/**
|
||||
* Return the timestamps that were latched at the beginning and
|
||||
* after each operation during the last eval() call.
|
||||
*/
|
||||
std::vector<std::uint64_t> getTimestamps();
|
||||
|
||||
/**
|
||||
* Begins recording commands for commands to be submitted into the command
|
||||
* buffer.
|
||||
|
|
@ -268,6 +277,7 @@ class Sequence : public std::enable_shared_from_this<Sequence>
|
|||
// -------------- ALWAYS OWNED RESOURCES
|
||||
vk::Fence mFence;
|
||||
std::vector<std::shared_ptr<OpBase>> mOperations;
|
||||
std::shared_ptr<vk::QueryPool> timestampQueryPool = nullptr;
|
||||
|
||||
// State
|
||||
bool mRecording = false;
|
||||
|
|
@ -276,6 +286,7 @@ class Sequence : public std::enable_shared_from_this<Sequence>
|
|||
// Create functions
|
||||
void createCommandPool();
|
||||
void createCommandBuffer();
|
||||
void createTimestampQueryPool(uint32_t totalTimestamps);
|
||||
};
|
||||
|
||||
} // End namespace kp
|
||||
|
|
|
|||
|
|
@ -100,3 +100,33 @@ TEST(TestSequence, RerecordSequence)
|
|||
|
||||
EXPECT_EQ(tensorB->data(), std::vector<float>({2, 8, 18}));
|
||||
}
|
||||
|
||||
|
||||
// Verifies that a sequence created with timestamping enabled reports one
// timestamp for the start of the recording plus one per recorded operation.
TEST(TestSequence, SequenceTimestamps)
{
    kp::Manager mgr;

    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor({ 0, 0, 0 });

    // Minimal compute shader that increments every element of the buffer.
    std::string shaderSource(R"(
#version 450
layout (local_size_x = 1) in;
layout(set = 0, binding = 0) buffer a { float pa[]; };
void main() {
uint index = gl_GlobalInvocationID.x;
pa[index] = pa[index] + 1;
})");

    std::vector<uint32_t> spirv = kp::Shader::compile_source(shaderSource);

    // Request a sequence on queue 0 with timestamping enabled (100 timestamps).
    auto timedSeq = mgr.sequence(0, 100);

    // Five operations recorded one after the other, then evaluated.
    timedSeq->record<kp::OpTensorSyncDevice>({ tensorA });
    timedSeq->record<kp::OpAlgoDispatch>(mgr.algorithm({ tensorA }, spirv));
    timedSeq->record<kp::OpAlgoDispatch>(mgr.algorithm({ tensorA }, spirv));
    timedSeq->record<kp::OpAlgoDispatch>(mgr.algorithm({ tensorA }, spirv));
    timedSeq->record<kp::OpTensorSyncLocal>({ tensorA });
    timedSeq->eval();

    const std::vector<uint64_t> timestamps = timedSeq->getTimestamps();

    // 1 timestamp at the start + 1 after each of the 5 operations.
    EXPECT_EQ(timestamps.size(), 6);
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue