Merge pull request #176 from alexander-g/timestamps

Support for Timestamping
This commit is contained in:
Alejandro Saucedo 2021-03-07 14:00:49 +00:00 committed by GitHub
commit cc1ec748a7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 221 additions and 42 deletions

View file

@ -13,7 +13,7 @@ VCPKG_WIN_PATH ?= "C:\\Users\\axsau\\Programming\\lib\\vcpkg\\scripts\\buildsyst
VCPKG_UNIX_PATH ?= "/c/Users/axsau/Programming/lib/vcpkg/scripts/buildsystems/vcpkg.cmake"
# Regex to pass to catch2 to filter tests
FILTER_TESTS ?= "-TestAsyncOperations.TestManagerParallelExecution"
FILTER_TESTS ?= "-TestAsyncOperations.TestManagerParallelExecution:TestSequence.SequenceTimestamps"
ifeq ($(OS),Windows_NT) # is Windows_NT on XP, 2000, 7, Vista, 10...
CMAKE_BIN ?= "C:\Program Files\CMake\bin\cmake.exe"

View file

@ -129,6 +129,7 @@ PYBIND11_MODULE(kp, m) {
.def("is_recording", &kp::Sequence::isRecording)
.def("is_running", &kp::Sequence::isRunning)
.def("is_init", &kp::Sequence::isInit)
.def("get_timestamps", &kp::Sequence::getTimestamps)
.def("clear", &kp::Sequence::clear)
.def("destroy", &kp::Sequence::destroy);
@ -139,7 +140,7 @@ PYBIND11_MODULE(kp, m) {
py::arg("device") = 0,
py::arg("family_queue_indices") = std::vector<uint32_t>(),
py::arg("desired_extensions") = std::vector<std::string>())
.def("sequence", &kp::Manager::sequence, py::arg("queueIndex") = 0)
.def("sequence", &kp::Manager::sequence, py::arg("queue_index") = 0, py::arg("total_timestamps") = 0)
.def("tensor", [np](kp::Manager& self,
const py::array_t<float> data,
kp::Tensor::TensorTypes tensor_type) {

View file

@ -820,12 +820,14 @@ class Tensor
};
/**
* Default constructor with data provided which would be used to create the
* Constructor with data provided which would be used to create the
* respective vulkan buffer and memory.
*
* @param physicalDevice The physical device to use to fetch properties
* @param device The device to use to create the buffer and memory from
* @param data Non-zero-sized vector of data that will be used by the
* tensor
* @param tensorType Type for the tensor which is of type TensorTypes
* @param tensorTypes Type for the tensor which is of type TensorTypes
*/
Tensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
std::shared_ptr<vk::Device> device,
@ -839,10 +841,11 @@ class Tensor
~Tensor();
/**
* Initialiser which calls the initialisation for all the respective tensors
* as well as creates the respective staging tensors. The staging tensors
* would only be created for the tensors of type TensorType::eDevice as
* otherwise there is no need to copy from host memory.
* Function to trigger reinitialisation of the tensor buffer and memory with
* new data as well as new potential device type.
*
* @param data Vector of data to use to initialise vector from
* @param tensorType The type to use for the tensor
*/
void rebuild(const std::vector<float>& data,
TensorTypes tensorType = TensorTypes::eDevice);
@ -852,6 +855,11 @@ class Tensor
*/
void destroy();
/**
* Check whether tensor is initialized based on the created gpu resources.
*
* @returns Boolean stating whether tensor is initialized
*/
bool isInit();
/**
@ -1210,6 +1218,8 @@ class OpBase
* The record function is intended to only send a record command or run
* commands that are expected to record operations that are to be submitted
* as a batch into the GPU.
*
* @param commandBuffer The command buffer to record the command into.
*/
virtual void record(const vk::CommandBuffer& commandBuffer) = 0;
@ -1220,6 +1230,8 @@ class OpBase
* there are situations where eval can be called multiple times, so the
* resources that are created should be idempotent in case it's called multiple
* times in a row.
*
* @param commandBuffer The command buffer to record the command into.
*/
virtual void preEval(const vk::CommandBuffer& commandBuffer) = 0;
@ -1230,6 +1242,8 @@ class OpBase
* there are situations where eval can be called multiple times, so the
* resources that are destroyed should not require a re-init unless explicitly
* provided by the user.
*
* @param commandBuffer The command buffer to record the command into.
*/
virtual void postEval(const vk::CommandBuffer& commandBuffer) = 0;
};
@ -1239,38 +1253,47 @@ class OpBase
namespace kp {
/**
Operation that copies the data from the first tensor to the rest of the tensors provided, using a record command for all the vectors. This operation does not own/manage the memory of the tensors passed to it. The operation must only receive tensors of type
* Operation that copies the data from the first tensor to the rest of the tensors
* provided, using a record command for all the vectors. This operation does not
* own/manage the memory of the tensors passed to it. The operation must only
* receive tensors of type
*/
class OpTensorCopy : public OpBase
{
public:
/**
* Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation.
* Default constructor with parameters that provides the core vulkan resources
* and the tensors that will be used in the operation.
*
* @param physicalDevice Vulkan physical device used to find device queues
* @param device Vulkan logical device for passing to Algorithm
* @param commandBuffer Vulkan Command Buffer to record commands into
* @param tensors Tensors that will be used in the operation.
*/
OpTensorCopy(const std::vector<std::shared_ptr<Tensor>>& tensors);
/**
* Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release.
* Default destructor. This class does not manage memory so it won't be
* expecting the parent to perform a release.
*/
~OpTensorCopy() override;
/**
* Records the copy commands from the first tensor into all the other tensors provided. Also optionally records a barrier.
* Records the copy commands from the first tensor into all the other
* tensors provided. Also optionally records a barrier.
*
* @param commandBuffer The command buffer to record the command into.
*/
void record(const vk::CommandBuffer& commandBuffer) override;
/**
* Does not perform any preEval commands.
*
* @param commandBuffer The command buffer to record the command into.
*/
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
/**
* Copies the local vectors for all the tensors to sync the data with the gpu.
*
* @param commandBuffer The command buffer to record the command into.
*/
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
@ -1284,17 +1307,20 @@ class OpTensorCopy : public OpBase
namespace kp {
/**
Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging.
* Operation that syncs tensor's device by mapping local data into the device memory.
* For TensorTypes::eDevice it will use a record operation for the memory to be synced
* into GPU memory which means that the operation will be done in sync with GPU commands.
* For TensorTypes::eHost it will only map the data into host memory which will
* happen during preEval before the recorded commands are dispatched.
*/
class OpTensorSyncDevice : public OpBase
{
public:
/**
* Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensos provided cannot be of type TensorTypes::eStorage.
* Default constructor with parameters that provides the core vulkan resources
* and the tensors that will be used in the operation. The tensors provided cannot
* be of type TensorTypes::eStorage.
*
* @param physicalDevice Vulkan physical device used to find device queues
* @param device Vulkan logical device for passing to Algorithm
* @param commandBuffer Vulkan Command Buffer to record commands into
* @param tensors Tensors that will be used in the operation.
*/
OpTensorSyncDevice(const std::vector<std::shared_ptr<Tensor>>& tensors);
@ -1305,17 +1331,24 @@ class OpTensorSyncDevice : public OpBase
~OpTensorSyncDevice() override;
/**
* For device tensors, it records the copy command for the tensor to copy the data from its staging to device memory.
* For device tensors, it records the copy command for the tensor to copy the
* data from its staging to device memory.
*
* @param commandBuffer The command buffer to record the command into.
*/
void record(const vk::CommandBuffer& commandBuffer) override;
/**
* Does not perform any preEval commands.
*
* @param commandBuffer The command buffer to record the command into.
*/
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
/**
* Does not perform any postEval commands.
*
* @param commandBuffer The command buffer to record the command into.
*/
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
@ -1329,38 +1362,50 @@ class OpTensorSyncDevice : public OpBase
namespace kp {
/**
Operation that syncs tensor's local memory by mapping device data into the local CPU memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging.
* Operation that syncs tensor's local memory by mapping device data into the
* local CPU memory. For TensorTypes::eDevice it will use a record operation
* for the memory to be synced into GPU memory which means that the operation
* will be done in sync with GPU commands. For TensorTypes::eHost it will
* only map the data into host memory which will happen during preEval before
* the recorded commands are dispatched.
*/
class OpTensorSyncLocal : public OpBase
{
public:
/**
* Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensors provided cannot be of type TensorTypes::eStorage.
* Default constructor with parameters that provides the core vulkan resources
* and the tensors that will be used in the operation. The tensors provided
* cannot be of type TensorTypes::eStorage.
*
* @param physicalDevice Vulkan physical device used to find device queues
* @param device Vulkan logical device for passing to Algorithm
* @param commandBuffer Vulkan Command Buffer to record commands into
* @param tensors Tensors that will be used in the operation.
*/
OpTensorSyncLocal(const std::vector<std::shared_ptr<Tensor>>& tensors);
/**
* Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release.
* Default destructor. This class does not manage memory so it won't be expecting
* the parent to perform a release.
*/
~OpTensorSyncLocal() override;
/**
* For device tensors, it records the copy command for the tensor to copy the data from its device to staging memory.
* For device tensors, it records the copy command for the tensor to copy the
* data from its device to staging memory.
*
* @param commandBuffer The command buffer to record the command into.
*/
void record(const vk::CommandBuffer& commandBuffer) override;
/**
* Does not perform any preEval commands.
*
* @param commandBuffer The command buffer to record the command into.
*/
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
/**
* For host tensors it performs the map command from the host memory into local memory.
*
* @param commandBuffer The command buffer to record the command into.
*/
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
@ -1383,6 +1428,13 @@ class OpAlgoDispatch : public OpBase
{
public:
/**
* Constructor that stores the algorithm to use as well as the relevant
* push constants to override when recording.
*
* @param algorithm The algorithm object to use for dispatch
* @param pushConstants The push constants to use for override
*/
OpAlgoDispatch(const std::shared_ptr<kp::Algorithm>& algorithm,
const kp::Constants& pushConstants = {});
@ -1399,18 +1451,22 @@ class OpAlgoDispatch : public OpBase
* shader processing to the gpu. This function also records the GPU memory
* copy of the output data for the staging buffer so it can be read by the
* host.
*
* @param commandBuffer The command buffer to record the command into.
*/
virtual void record(const vk::CommandBuffer& commandBuffer) override;
/**
* Does not perform any preEval commands.
*
* @param commandBuffer The command buffer to record the command into.
*/
virtual void preEval(const vk::CommandBuffer& commandBuffer) override;
/**
* Executes after the recorded commands are submitted, and performs a copy
* of the GPU Device memory into the staging buffer so the output data can
* be retrieved.
* Does not perform any postEval commands.
*
* @param commandBuffer The command buffer to record the command into.
*/
virtual void postEval(const vk::CommandBuffer& commandBuffer) override;
@ -1439,11 +1495,9 @@ class OpMult : public OpAlgoDispatch
* requirements for the operations to be able to create and manage their
* sub-components.
*
* @param physicalDevice Vulkan physical device used to find device queues
* @param device Vulkan logical device for passing to Algorithm
* @param commandBuffer Vulkan Command Buffer to record commands into
* @param tensors Tensors that are to be used in this operation
* @param komputeWorkgroup Optional parameter to specify the layout for processing
* @param algorithm An algorithm that will be overridden with the OpMult
* shader data and the tensors provided which are expected to be 3
*/
OpMult(std::vector<std::shared_ptr<Tensor>> tensors, std::shared_ptr<Algorithm> algorithm)
: OpAlgoDispatch(algorithm)
@ -1489,11 +1543,13 @@ class Sequence : public std::enable_shared_from_this<Sequence>
* @param device Vulkan logical device
* @param computeQueue Vulkan compute queue
* @param queueIndex Vulkan compute queue index in device
* @param totalTimestamps Maximum number of timestamps to allocate
*/
Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
std::shared_ptr<vk::Device> device,
std::shared_ptr<vk::Queue> computeQueue,
uint32_t queueIndex);
uint32_t queueIndex,
uint32_t totalTimestamps = 0);
/**
* Destructor for sequence which is responsible for cleaning all subsequent
* owned operations.
@ -1669,6 +1725,12 @@ class Sequence : public std::enable_shared_from_this<Sequence>
*/
void clear();
/**
* Return the timestamps that were latched at the beginning and
* after each operation during the last eval() call.
*/
std::vector<std::uint64_t> getTimestamps();
/**
* Begins recording commands for commands to be submitted into the command
* buffer.
@ -1737,6 +1799,7 @@ class Sequence : public std::enable_shared_from_this<Sequence>
// -------------- ALWAYS OWNED RESOURCES
vk::Fence mFence;
std::vector<std::shared_ptr<OpBase>> mOperations;
std::shared_ptr<vk::QueryPool> timestampQueryPool = nullptr;
// State
bool mRecording = false;
@ -1745,6 +1808,7 @@ class Sequence : public std::enable_shared_from_this<Sequence>
// Create functions
void createCommandPool();
void createCommandBuffer();
void createTimestampQueryPool(uint32_t totalTimestamps);
};
} // End namespace kp
@ -1805,9 +1869,11 @@ class Manager
* if it hasn't been destroyed by its reference count going to zero.
*
* @param queueIndex The queue to use from the available queues
* @param nrOfTimestamps The maximum number of timestamps to allocate.
* If zero (default), disables latching of timestamps.
* @returns Shared pointer with initialised sequence
*/
std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0);
std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0, uint32_t nrOfTimestamps = 0);
/**
* Create a managed tensor that will be destroyed by this manager

View file

@ -431,7 +431,7 @@ Manager::algorithm(const std::vector<std::shared_ptr<Tensor>>& tensors,
}
std::shared_ptr<Sequence>
Manager::sequence(uint32_t queueIndex)
Manager::sequence(uint32_t queueIndex, uint32_t totalTimestamps)
{
KP_LOG_DEBUG("Kompute Manager sequence() with queueIndex: {}", queueIndex);
@ -439,7 +439,8 @@ Manager::sequence(uint32_t queueIndex)
this->mPhysicalDevice,
this->mDevice,
this->mComputeQueues[queueIndex],
this->mComputeQueueFamilyIndices[queueIndex]) };
this->mComputeQueueFamilyIndices[queueIndex],
totalTimestamps) };
if (this->mManageResources) {
this->mManagedSequences.push_back(sq);

View file

@ -6,7 +6,8 @@ namespace kp {
Sequence::Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
std::shared_ptr<vk::Device> device,
std::shared_ptr<vk::Queue> computeQueue,
uint32_t queueIndex)
uint32_t queueIndex,
uint32_t totalTimestamps)
{
KP_LOG_DEBUG("Kompute Sequence Constructor with existing device & queue");
@ -17,6 +18,8 @@ Sequence::Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
this->createCommandPool();
this->createCommandBuffer();
if(totalTimestamps>0)
this->createTimestampQueryPool(totalTimestamps+1); //+1 for the first one
}
Sequence::~Sequence()
@ -44,6 +47,13 @@ Sequence::begin()
KP_LOG_INFO("Kompute Sequence command now started recording");
this->mCommandBuffer->begin(vk::CommandBufferBeginInfo());
this->mRecording = true;
//latch the first timestamp before any commands are submitted
if(this->timestampQueryPool)
this->mCommandBuffer->writeTimestamp(
vk::PipelineStageFlagBits::eAllCommands,
*this->timestampQueryPool, 0
);
}
void
@ -236,6 +246,16 @@ Sequence::destroy()
this->mOperations.clear();
}
if(this->timestampQueryPool){
KP_LOG_INFO("Destroying QueryPool");
this->mDevice->destroy(
*this->timestampQueryPool,
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
this->timestampQueryPool = nullptr;
KP_LOG_DEBUG("Kompute Sequence Destroyed QueryPool");
}
if (this->mDevice) {
this->mDevice = nullptr;
}
@ -261,6 +281,12 @@ Sequence::record(std::shared_ptr<OpBase> op)
this->mOperations.push_back(op);
if(this->timestampQueryPool)
this->mCommandBuffer->writeTimestamp(
vk::PipelineStageFlagBits::eAllCommands,
*this->timestampQueryPool, this->mOperations.size()
);
return shared_from_this();
}
@ -308,4 +334,46 @@ Sequence::createCommandBuffer()
KP_LOG_DEBUG("Kompute Sequence Command Buffer Created");
}
void
Sequence::createTimestampQueryPool(uint32_t totalTimestamps)
{
KP_LOG_DEBUG("Kompute Sequence creating query pool");
if (!this->isInit()) {
throw std::runtime_error("createTimestampQueryPool() called on uninitialized Sequence");
}
if (!this->mPhysicalDevice) {
throw std::runtime_error("Kompute Sequence physical device is null");
}
vk::PhysicalDeviceProperties physicalDeviceProperties =
this->mPhysicalDevice->getProperties();
if(physicalDeviceProperties.limits.timestampComputeAndGraphics){
vk::QueryPoolCreateInfo queryPoolInfo;
queryPoolInfo.setQueryCount(totalTimestamps);
queryPoolInfo.setQueryType(vk::QueryType::eTimestamp);
this->timestampQueryPool = std::make_shared<vk::QueryPool>(this->mDevice->createQueryPool(queryPoolInfo));
KP_LOG_DEBUG("Query pool for timestamps created");
}
else{
throw std::runtime_error("Device does not support timestamps");
}
}
std::vector<std::uint64_t>
Sequence::getTimestamps()
{
if(!this->timestampQueryPool)
throw std::runtime_error("Timestamp latching not enabled");
const auto n = this->mOperations.size()+1;
std::vector<std::uint64_t> timestamps(n, 0);
this->mDevice->getQueryPoolResults(*this->timestampQueryPool,
0, n, timestamps.size()*sizeof(std::uint64_t), timestamps.data(),
sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
return timestamps;
}
}

View file

@ -60,9 +60,11 @@ class Manager
* if it hasn't been destroyed by its reference count going to zero.
*
* @param queueIndex The queue to use from the available queues
* @param totalTimestamps The maximum number of timestamps to allocate.
* If zero (default), disables latching of timestamps.
* @returns Shared pointer with initialised sequence
*/
std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0);
std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0, uint32_t totalTimestamps = 0);
/**
* Create a managed tensor that will be destroyed by this manager

View file

@ -3,6 +3,7 @@
#include "kompute/Core.hpp"
#include "kompute/operations/OpBase.hpp"
#include "kompute/operations/OpAlgoDispatch.hpp"
namespace kp {
@ -20,11 +21,13 @@ class Sequence : public std::enable_shared_from_this<Sequence>
* @param device Vulkan logical device
* @param computeQueue Vulkan compute queue
* @param queueIndex Vulkan compute queue index in device
* @param totalTimestamps Maximum number of timestamps to allocate
*/
Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
std::shared_ptr<vk::Device> device,
std::shared_ptr<vk::Queue> computeQueue,
uint32_t queueIndex);
uint32_t queueIndex,
uint32_t totalTimestamps = 0);
/**
* Destructor for sequence which is responsible for cleaning all subsequent
* owned operations.
@ -200,6 +203,12 @@ class Sequence : public std::enable_shared_from_this<Sequence>
*/
void clear();
/**
* Return the timestamps that were latched at the beginning and
* after each operation during the last eval() call.
*/
std::vector<std::uint64_t> getTimestamps();
/**
* Begins recording commands for commands to be submitted into the command
* buffer.
@ -268,6 +277,7 @@ class Sequence : public std::enable_shared_from_this<Sequence>
// -------------- ALWAYS OWNED RESOURCES
vk::Fence mFence;
std::vector<std::shared_ptr<OpBase>> mOperations;
std::shared_ptr<vk::QueryPool> timestampQueryPool = nullptr;
// State
bool mRecording = false;
@ -276,6 +286,7 @@ class Sequence : public std::enable_shared_from_this<Sequence>
// Create functions
void createCommandPool();
void createCommandBuffer();
void createTimestampQueryPool(uint32_t totalTimestamps);
};
} // End namespace kp

View file

@ -100,3 +100,33 @@ TEST(TestSequence, RerecordSequence)
EXPECT_EQ(tensorB->data(), std::vector<float>({2, 8, 18}));
}
// Records five operations on a timestamp-enabled sequence and checks that
// getTimestamps() yields one value for the start plus one per operation.
TEST(TestSequence, SequenceTimestamps)
{
    kp::Manager mgr;

    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor({ 0, 0, 0 });

    // Minimal compute shader that increments every element of the buffer.
    std::string shaderSource(R"(
#version 450
layout (local_size_x = 1) in;
layout(set = 0, binding = 0) buffer a { float pa[]; };
void main() {
uint index = gl_GlobalInvocationID.x;
pa[index] = pa[index] + 1;
})");

    std::vector<uint32_t> compiledShader =
      kp::Shader::compile_source(shaderSource);

    // Request room for up to 100 timestamps on queue 0.
    auto timedSequence = mgr.sequence(0, 100);

    // Same five operations as a fluent chain, recorded one per statement.
    timedSequence->record<kp::OpTensorSyncDevice>({ tensorA });
    timedSequence->record<kp::OpAlgoDispatch>(
      mgr.algorithm({ tensorA }, compiledShader));
    timedSequence->record<kp::OpAlgoDispatch>(
      mgr.algorithm({ tensorA }, compiledShader));
    timedSequence->record<kp::OpAlgoDispatch>(
      mgr.algorithm({ tensorA }, compiledShader));
    timedSequence->record<kp::OpTensorSyncLocal>({ tensorA });
    timedSequence->eval();

    const std::vector<uint64_t> timestamps = timedSequence->getTimestamps();

    // 1 timestamp at start + 1 after each of the 5 recorded operations.
    EXPECT_EQ(timestamps.size(), 6);
}