Merge pull request #176 from alexander-g/timestamps

Support for Timestamping
2021-03-07 14:00:49 +00:00 · 2021-03-07 14:00:49 +00:00 · cc1ec748a7
commit cc1ec748a7
parent 515c3b6cd4 259d3f1d13
8 changed files with 221 additions and 42 deletions
--- a/2
+++ b/2
@ -13,7 +13,7 @@ VCPKG_WIN_PATH ?= "C:\\Users\\axsau\\Programming\\lib\\vcpkg\\scripts\\buildsyst
 VCPKG_UNIX_PATH ?= "/c/Users/axsau/Programming/lib/vcpkg/scripts/buildsystems/vcpkg.cmake"

 # Regext to pass to catch2 to filter tests
-FILTER_TESTS ?= "-TestAsyncOperations.TestManagerParallelExecution"
+FILTER_TESTS ?= "-TestAsyncOperations.TestManagerParallelExecution:TestSequence.SequenceTimestamps"

 ifeq ($(OS),Windows_NT)     # is Windows_NT on XP, 2000, 7, Vista, 10...
 	CMAKE_BIN ?= "C:\Program Files\CMake\bin\cmake.exe"
--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@ -129,6 +129,7 @@ PYBIND11_MODULE(kp, m) {
        .def("is_recording", &kp::Sequence::isRecording)
        .def("is_running", &kp::Sequence::isRunning)
        .def("is_init", &kp::Sequence::isInit)
+        .def("get_timestamps", &kp::Sequence::getTimestamps)
        .def("clear", &kp::Sequence::clear)
        .def("destroy", &kp::Sequence::destroy);

@ -139,7 +140,7 @@ PYBIND11_MODULE(kp, m) {
                py::arg("device") = 0,
                py::arg("family_queue_indices") = std::vector<uint32_t>(),
                py::arg("desired_extensions") = std::vector<std::string>())
-        .def("sequence", &kp::Manager::sequence, py::arg("queueIndex") = 0)
+        .def("sequence", &kp::Manager::sequence, py::arg("queue_index") = 0, py::arg("total_timestamps") = 0)
        .def("tensor", [np](kp::Manager& self,
                            const py::array_t<float> data,
                            kp::Tensor::TensorTypes tensor_type) {
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@ -820,12 +820,14 @@ class Tensor
    };

    /**
-     *  Default constructor with data provided which would be used to create the
+     *  Constructor with data provided which would be used to create the
     * respective vulkan buffer and memory.
     *
+     *  @param physicalDevice The physical device to use to fetch properties
+     *  @param device The device to use to create the buffer and memory from
     *  @param data Non-zero-sized vector of data that will be used by the
     * tensor
-     *  @param tensorType Type for the tensor which is of type TensorTypes
+     *  @param tensorTypes Type for the tensor which is of type TensorTypes
     */
    Tensor(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
           std::shared_ptr<vk::Device> device,
@ -839,10 +841,11 @@ class Tensor
    ~Tensor();

    /**
-     * Initialiser which calls the initialisation for all the respective tensors
-     * as well as creates the respective staging tensors. The staging tensors
-     * would only be created for the tensors of type TensorType::eDevice as
-     * otherwise there is no need to copy from host memory.
+     * Function to trigger reinitialisation of the tensor buffer and memory with
+     * new data as well as new potential device type.
+     *
+     * @param data Vector of data to use to initialise vector from
+     * @param tensorType The type to use for the tensor
     */
    void rebuild(const std::vector<float>& data,
                 TensorTypes tensorType = TensorTypes::eDevice);
@ -852,6 +855,11 @@ class Tensor
     */
    void destroy();

+    /**
+     * Check whether tensor is initialized based on the created gpu resources.
+     *
+     * @returns Boolean stating whether tensor is initialized
+     */
    bool isInit();

    /**
@ -1210,6 +1218,8 @@ class OpBase
     * The record function is intended to only send a record command or run
     * commands that are expected to record operations that are to be submitted
     * as a batch into the GPU.
+     *
+     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void record(const vk::CommandBuffer& commandBuffer) = 0;

@ -1220,6 +1230,8 @@ class OpBase
     * there are situations where eval can be called multiple times, so the 
     * resources that are created should be idempotent in case it's called multiple
     * times in a row.
+     *
+     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void preEval(const vk::CommandBuffer& commandBuffer) = 0;

@ -1230,6 +1242,8 @@ class OpBase
     * there are situations where eval can be called multiple times, so the 
     * resources that are destroyed should not require a re-init unless explicitly
     * provided by the user.
+     *
+     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void postEval(const vk::CommandBuffer& commandBuffer) = 0;
 };
@ -1239,38 +1253,47 @@ class OpBase
 namespace kp {

 /**
-    Operation that copies the data from the first tensor to the rest of the tensors provided, using a record command for all the vectors. This operation does not own/manage the memory of the tensors passed to it. The operation must only receive tensors of type 
+ * Operation that copies the data from the first tensor to the rest of the tensors 
+ * provided, using a record command for all the vectors. This operation does not 
+ * own/manage the memory of the tensors passed to it. The operation must only 
+ * receive tensors of type 
 */
 class OpTensorCopy : public OpBase
 {
  public:
    /**
-     * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation.
+     * Default constructor with parameters that provides the core vulkan resources 
+     * and the tensors that will be used in the operation.
     *
-     * @param physicalDevice Vulkan physical device used to find device queues
-     * @param device Vulkan logical device for passing to Algorithm
-     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that will be used to create in operation.
     */
    OpTensorCopy(const std::vector<std::shared_ptr<Tensor>>& tensors);

    /**
-     * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release.
+     * Default destructor. This class does not manage memory so it won't be 
+     * expecting the parent to perform a release.
     */
    ~OpTensorCopy() override;

    /**
-     * Records the copy commands from the first tensor into all the other tensors provided. Also optionally records a barrier.
+     * Records the copy commands from the first tensor into all the other 
+     * tensors provided. Also optionally records a barrier.
+     *
+     * @param commandBuffer The command buffer to record the command into.
     */
    void record(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Copies the local vectors for all the tensors to sync the data with the gpu.
+     *
+     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;

@ -1284,17 +1307,20 @@ class OpTensorCopy : public OpBase
 namespace kp {

 /**
-    Operation that syncs tensor's device by mapping local data into the device memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging.
+ * Operation that syncs tensor's device by mapping local data into the device memory. 
+ * For TensorTypes::eDevice it will use a record operation for the memory to be syncd 
+ * into GPU memory which means that the operation will be done in sync with GPU commands. 
+ * For TensorTypes::eHost it will only map the data into host memory which will 
+ * happen during preEval before the recorded commands are dispatched.
 */
 class OpTensorSyncDevice : public OpBase
 {
  public:
    /**
-     * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensos provided cannot be of type TensorTypes::eStorage.
+     * Default constructor with parameters that provides the core vulkan resources 
+     * and the tensors that will be used in the operation. The tensors provided cannot
+     * be of type TensorTypes::eStorage.
     *
-     * @param physicalDevice Vulkan physical device used to find device queues
-     * @param device Vulkan logical device for passing to Algorithm
-     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that will be used to create in operation.
     */
    OpTensorSyncDevice(const std::vector<std::shared_ptr<Tensor>>& tensors);
@ -1305,17 +1331,24 @@ class OpTensorSyncDevice : public OpBase
    ~OpTensorSyncDevice() override;

    /**
-     * For device tensors, it records the copy command for the tensor to copy the data from its staging to device memory.
+     * For device tensors, it records the copy command for the tensor to copy the 
+     * data from its staging to device memory.
+     *
+     * @param commandBuffer The command buffer to record the command into.
     */
    void record(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;

@ -1329,38 +1362,50 @@ class OpTensorSyncDevice : public OpBase
 namespace kp {

 /**
-    Operation that syncs tensor's local memory by mapping device data into the local CPU memory. For TensorTypes::eDevice it will use a record operation for the memory to be syncd into GPU memory which means that the operation will be done in sync with GPU commands. For TensorTypes::eStaging it will only map the data into host memory which will happen during preEval before the recorded commands are dispatched. This operation won't have any effect on TensorTypes::eStaging.
+ * Operation that syncs tensor's local memory by mapping device data into the 
+ * local CPU memory. For TensorTypes::eDevice it will use a record operation 
+ * for the memory to be syncd into GPU memory which means that the operation 
+ * will be done in sync with GPU commands. For TensorTypes::eHost it will 
+ * only map the data into host memory which will happen during preEval before 
+ * the recorded commands are dispatched.
 */
 class OpTensorSyncLocal : public OpBase
 {
  public:
    /**
-     * Default constructor with parameters that provides the core vulkan resources and the tensors that will be used in the operation. The tensors provided cannot be of type TensorTypes::eStorage.
+     * Default constructor with parameters that provides the core vulkan resources 
+     * and the tensors that will be used in the operation. The tensors provided 
+     * cannot be of type TensorTypes::eStorage.
     *
-     * @param physicalDevice Vulkan physical device used to find device queues
-     * @param device Vulkan logical device for passing to Algorithm
-     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that will be used to create in operation.
     */
    OpTensorSyncLocal(const std::vector<std::shared_ptr<Tensor>>& tensors);

    /**
-     * Default destructor. This class does not manage memory so it won't be expecting the parent to perform a release.
+     * Default destructor. This class does not manage memory so it won't be expecting 
+     * the parent to perform a release.
     */
    ~OpTensorSyncLocal() override;

    /**
-     * For device tensors, it records the copy command for the tensor to copy the data from its device to staging memory.
+     * For device tensors, it records the copy command for the tensor to copy the 
+     * data from its device to staging memory.
+     *
+     * @param commandBuffer The command buffer to record the command into.
     */
    void record(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;

    /**
     * For host tensors it performs the map command from the host memory into local memory.
+     *
+     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;

@ -1383,6 +1428,13 @@ class OpAlgoDispatch : public OpBase
 {
  public:

+    /**
+     * Constructor that stores the algorithm to use as well as the relevant
+     * push constants to override when recording.
+     *
+     * @param algorithm The algorithm object to use for dispatch
+     * @param pushConstants The push constants to use for override
+     */
    OpAlgoDispatch(const std::shared_ptr<kp::Algorithm>& algorithm,
            const kp::Constants& pushConstants = {});

@ -1399,18 +1451,22 @@ class OpAlgoDispatch : public OpBase
     * shader processing to the gpu. This function also records the GPU memory
     * copy of the output data for the staging buffer so it can be read by the
     * host.
+     *
+     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void record(const vk::CommandBuffer& commandBuffer) override;

    /**
     * Does not perform any preEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void preEval(const vk::CommandBuffer& commandBuffer) override;

    /**
-     * Executes after the recorded commands are submitted, and performs a copy
-     * of the GPU Device memory into the staging buffer so the output data can
-     * be retrieved.
+     * Does not perform any postEval commands.
+     *
+     * @param commandBuffer The command buffer to record the command into.
     */
    virtual void postEval(const vk::CommandBuffer& commandBuffer) override;

@ -1439,11 +1495,9 @@ class OpMult : public OpAlgoDispatch
     * requirements for the operations to be able to create and manage their
     * sub-components.
     *
-     * @param physicalDevice Vulkan physical device used to find device queues
-     * @param device Vulkan logical device for passing to Algorithm
-     * @param commandBuffer Vulkan Command Buffer to record commands into
     * @param tensors Tensors that are to be used in this operation
-     * @param komputeWorkgroup Optional parameter to specify the layout for processing
+     * @param algorithm An algorithm that will be overridden with the OpMult
+     * shader data and the tensors provided which are expected to be 3
     */
    OpMult(std::vector<std::shared_ptr<Tensor>> tensors, std::shared_ptr<Algorithm> algorithm)
        : OpAlgoDispatch(algorithm)
@ -1489,11 +1543,13 @@ class Sequence : public std::enable_shared_from_this<Sequence>
     * @param device Vulkan logical device
     * @param computeQueue Vulkan compute queue
     * @param queueIndex Vulkan compute queue index in device
+     * @param totalTimestamps Maximum number of timestamps to allocate
     */
    Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
             std::shared_ptr<vk::Device> device,
             std::shared_ptr<vk::Queue> computeQueue,
-             uint32_t queueIndex);
+             uint32_t queueIndex,
+             uint32_t totalTimestamps = 0);
    /**
     * Destructor for sequence which is responsible for cleaning all subsequent
     * owned operations.
@ -1669,6 +1725,12 @@ class Sequence : public std::enable_shared_from_this<Sequence>
     */
    void clear();

+    /**
+     * Return the timestamps that were latched at the beginning and
+     * after each operation during the last eval() call.
+     */
+    std::vector<std::uint64_t> getTimestamps();
+
    /**
     * Begins recording commands for commands to be submitted into the command
     * buffer.
@ -1737,6 +1799,7 @@ class Sequence : public std::enable_shared_from_this<Sequence>
    // -------------- ALWAYS OWNED RESOURCES
    vk::Fence mFence;
    std::vector<std::shared_ptr<OpBase>> mOperations;
+    std::shared_ptr<vk::QueryPool> timestampQueryPool = nullptr;

    // State
    bool mRecording = false;
@ -1745,6 +1808,7 @@ class Sequence : public std::enable_shared_from_this<Sequence>
    // Create functions
    void createCommandPool();
    void createCommandBuffer();
+    void createTimestampQueryPool(uint32_t totalTimestamps);
 };

 } // End namespace kp
@ -1805,9 +1869,11 @@ class Manager
     * if it hasn't been destroyed by its reference count going to zero.
     *
     * @param queueIndex The queue to use from the available queues
+     * @param nrOfTimestamps The maximum number of timestamps to allocate.
+     * If zero (default), disables latching of timestamps.
     * @returns Shared pointer with initialised sequence
     */
-    std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0);
+    std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0, uint32_t nrOfTimestamps = 0);

    /**
     * Create a managed tensor that will be destroyed by this manager
--- a/src/Manager.cpp
+++ b/src/Manager.cpp
@ -431,7 +431,7 @@ Manager::algorithm(const std::vector<std::shared_ptr<Tensor>>& tensors,
 }

 std::shared_ptr<Sequence>
-Manager::sequence(uint32_t queueIndex)
+Manager::sequence(uint32_t queueIndex, uint32_t totalTimestamps)
 {
    KP_LOG_DEBUG("Kompute Manager sequence() with queueIndex: {}", queueIndex);

@ -439,7 +439,8 @@ Manager::sequence(uint32_t queueIndex)
      this->mPhysicalDevice,
      this->mDevice,
      this->mComputeQueues[queueIndex],
-      this->mComputeQueueFamilyIndices[queueIndex]) };
+      this->mComputeQueueFamilyIndices[queueIndex],
+      totalTimestamps) };

    if (this->mManageResources) {
        this->mManagedSequences.push_back(sq);
--- a/src/Sequence.cpp
+++ b/src/Sequence.cpp
@ -6,7 +6,8 @@ namespace kp {
 Sequence::Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
                   std::shared_ptr<vk::Device> device,
                   std::shared_ptr<vk::Queue> computeQueue,
-                   uint32_t queueIndex)
+                   uint32_t queueIndex,
+                   uint32_t totalTimestamps)
 {
    KP_LOG_DEBUG("Kompute Sequence Constructor with existing device & queue");

@ -17,6 +18,8 @@ Sequence::Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,

    this->createCommandPool();
    this->createCommandBuffer();
+    if(totalTimestamps>0)
+        this->createTimestampQueryPool(totalTimestamps+1); //+1 for the first one
 }

 Sequence::~Sequence()
@ -44,6 +47,13 @@ Sequence::begin()
    KP_LOG_INFO("Kompute Sequence command now started recording");
    this->mCommandBuffer->begin(vk::CommandBufferBeginInfo());
    this->mRecording = true;
+
+    //latch the first timestamp before any commands are submitted
+    if(this->timestampQueryPool)
+        this->mCommandBuffer->writeTimestamp(
+            vk::PipelineStageFlagBits::eAllCommands,
+            *this->timestampQueryPool, 0
+        );
 }

 void
@ -236,6 +246,16 @@ Sequence::destroy()
        this->mOperations.clear();
    }

+    if(this->timestampQueryPool){
+        KP_LOG_INFO("Destroying QueryPool");
+        this->mDevice->destroy(
+            *this->timestampQueryPool,
+            (vk::Optional<const vk::AllocationCallbacks>)nullptr);
+        
+        this->timestampQueryPool = nullptr;
+        KP_LOG_DEBUG("Kompute Sequence Destroyed QueryPool");
+    }
+
    if (this->mDevice) {
        this->mDevice = nullptr;
    }
@ -261,6 +281,12 @@ Sequence::record(std::shared_ptr<OpBase> op)

    this->mOperations.push_back(op);

+    if(this->timestampQueryPool)
+      this->mCommandBuffer->writeTimestamp(
+                vk::PipelineStageFlagBits::eAllCommands,
+                *this->timestampQueryPool, this->mOperations.size()
+        );
+    
    return shared_from_this();
 }

@ -308,4 +334,46 @@ Sequence::createCommandBuffer()
    KP_LOG_DEBUG("Kompute Sequence Command Buffer Created");
 }

+/**
+ * Creates the Vulkan query pool used to latch timestamps and stores it in
+ * timestampQueryPool.
+ *
+ * @param totalTimestamps Number of timestamp queries the pool is created with
+ * @throws std::runtime_error if the sequence is not initialised, if the
+ * physical device pointer is null, or if the device limits report that
+ * timestamps are not supported for compute and graphics queues
+ */
+void
+Sequence::createTimestampQueryPool(uint32_t totalTimestamps)
+{
+    KP_LOG_DEBUG("Kompute Sequence creating query pool");
+
+    if (!this->isInit()) {
+        throw std::runtime_error("createTimestampQueryPool() called on uninitialized Sequence");
+    }
+    if (!this->mPhysicalDevice) {
+        throw std::runtime_error("Kompute Sequence physical device is null");
+    }
+
+    // Bail out early when the device cannot latch timestamps at all.
+    const vk::PhysicalDeviceLimits deviceLimits =
+      this->mPhysicalDevice->getProperties().limits;
+    if (!deviceLimits.timestampComputeAndGraphics) {
+        throw std::runtime_error("Device does not support timestamps");
+    }
+
+    vk::QueryPoolCreateInfo poolCreateInfo;
+    poolCreateInfo.setQueryType(vk::QueryType::eTimestamp);
+    poolCreateInfo.setQueryCount(totalTimestamps);
+
+    this->timestampQueryPool = std::make_shared<vk::QueryPool>(
+      this->mDevice->createQueryPool(poolCreateInfo));
+
+    KP_LOG_DEBUG("Query pool for timestamps created");
+}
+
+/**
+ * Reads the latched timestamps back from the timestamp query pool.
+ *
+ * One value is read for the timestamp written when recording begins plus one
+ * for every operation currently held in mOperations. The call waits until
+ * the queried results are available.
+ *
+ * @returns Vector with mOperations.size() + 1 raw 64-bit timestamp values
+ * @throws std::runtime_error if timestamp latching was not enabled for this
+ * sequence, or if the query results could not be retrieved
+ */
+std::vector<std::uint64_t>
+Sequence::getTimestamps()
+{
+    if (!this->timestampQueryPool)
+        throw std::runtime_error("Timestamp latching not enabled");
+
+    // Vulkan takes the query count as a 32-bit value; make the narrowing explicit.
+    const auto n = static_cast<uint32_t>(this->mOperations.size() + 1);
+    std::vector<std::uint64_t> timestamps(n, 0);
+
+    const vk::Result result = this->mDevice->getQueryPoolResults(
+      *this->timestampQueryPool,
+      0,
+      n,
+      timestamps.size() * sizeof(std::uint64_t),
+      timestamps.data(),
+      sizeof(std::uint64_t),
+      vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
+
+    // Do not hand back zero-filled data as if it were valid timestamps.
+    if (result != vk::Result::eSuccess) {
+        throw std::runtime_error("Kompute Sequence failed to retrieve timestamp query results");
+    }
+
+    return timestamps;
+}
+
 }
--- a/src/include/kompute/Manager.hpp
+++ b/src/include/kompute/Manager.hpp
@ -60,9 +60,11 @@ class Manager
     * if it hasn't been destroyed by its reference count going to zero.
     *
     * @param queueIndex The queue to use from the available queues
+     * @param totalTimestamps The maximum number of timestamps to allocate.
+     * If zero (default), disables latching of timestamps.
     * @returns Shared pointer with initialised sequence
     */
-    std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0);
+    std::shared_ptr<Sequence> sequence(uint32_t queueIndex = 0, uint32_t totalTimestamps = 0);

    /**
     * Create a managed tensor that will be destroyed by this manager
--- a/src/include/kompute/Sequence.hpp
+++ b/src/include/kompute/Sequence.hpp
@ -3,6 +3,7 @@
 #include "kompute/Core.hpp"

 #include "kompute/operations/OpBase.hpp"
+#include "kompute/operations/OpAlgoDispatch.hpp"

 namespace kp {

@ -20,11 +21,13 @@ class Sequence : public std::enable_shared_from_this<Sequence>
     * @param device Vulkan logical device
     * @param computeQueue Vulkan compute queue
     * @param queueIndex Vulkan compute queue index in device
+     * @param totalTimestamps Maximum number of timestamps to allocate
     */
    Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
             std::shared_ptr<vk::Device> device,
             std::shared_ptr<vk::Queue> computeQueue,
-             uint32_t queueIndex);
+             uint32_t queueIndex,
+             uint32_t totalTimestamps = 0);
    /**
     * Destructor for sequence which is responsible for cleaning all subsequent
     * owned operations.
@ -200,6 +203,12 @@ class Sequence : public std::enable_shared_from_this<Sequence>
     */
    void clear();

+    /**
+     * Return the timestamps that were latched at the beginning and
+     * after each operation during the last eval() call.
+     */
+    std::vector<std::uint64_t> getTimestamps();
+
    /**
     * Begins recording commands for commands to be submitted into the command
     * buffer.
@ -268,6 +277,7 @@ class Sequence : public std::enable_shared_from_this<Sequence>
    // -------------- ALWAYS OWNED RESOURCES
    vk::Fence mFence;
    std::vector<std::shared_ptr<OpBase>> mOperations;
+    std::shared_ptr<vk::QueryPool> timestampQueryPool = nullptr;

    // State
    bool mRecording = false;
@ -276,6 +286,7 @@ class Sequence : public std::enable_shared_from_this<Sequence>
    // Create functions
    void createCommandPool();
    void createCommandBuffer();
+    void createTimestampQueryPool(uint32_t totalTimestamps);
 };

 } // End namespace kp
--- a/test/TestSequence.cpp
+++ b/test/TestSequence.cpp
@ -100,3 +100,33 @@ TEST(TestSequence, RerecordSequence)

    EXPECT_EQ(tensorB->data(), std::vector<float>({2, 8, 18}));
 }
+
+
+// Records five operations on a sequence created with timestamping enabled
+// and checks that one timestamp is reported for the start of recording plus
+// one for each recorded operation.
+TEST(TestSequence, SequenceTimestamps)
+{
+    kp::Manager mgr;
+
+    std::shared_ptr<kp::Tensor> tensorA = mgr.tensor({ 0, 0, 0 });
+
+    // Shader that increments every element of the bound buffer by one.
+    std::string shader(R"(
+      #version 450
+      layout (local_size_x = 1) in;
+      layout(set = 0, binding = 0) buffer a { float pa[]; };
+      void main() {
+          uint index = gl_GlobalInvocationID.x;
+          pa[index] = pa[index] + 1;
+      })");
+
+    std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
+    
+    // Request room for up to 100 timestamps; only five operations are recorded.
+    auto seq = mgr.sequence(0, 100);
+    seq->record<kp::OpTensorSyncDevice>({ tensorA })
+        ->record<kp::OpAlgoDispatch>(mgr.algorithm({ tensorA }, spirv))
+        ->record<kp::OpAlgoDispatch>(mgr.algorithm({ tensorA }, spirv))
+        ->record<kp::OpAlgoDispatch>(mgr.algorithm({ tensorA }, spirv))
+        ->record<kp::OpTensorSyncLocal>({ tensorA })
+        ->eval();
+    const std::vector<uint64_t> timestamps = seq->getTimestamps();
+    
+    // 1 timestamp at start + 1 after each of the 5 operations. The unsigned
+    // literal avoids a signed/unsigned comparison against size().
+    EXPECT_EQ(timestamps.size(), 6u);
+}