From e68d09dbdcb7425512d48ac5155ddf5c94f3f637 Mon Sep 17 00:00:00 2001
From: Alejandro Saucedo <axsauze@gmail.com>
Date: Sun, 6 Sep 2020 15:51:31 +0100
Subject: [PATCH] Added functional optensorsyncDevice and optensorsynclocal

---
 docs/overview/advanced-examples.rst           | 12 ++-
 single_include/kompute/Kompute.hpp            | 69 ++++++++++++---
 src/OpTensorCopy.cpp                          | 10 ++-
 src/OpTensorCreate.cpp                        | 11 ++-
 src/OpTensorSyncDevice.cpp                    | 27 ++++--
 src/OpTensorSyncLocal.cpp                     | 15 ++--
 src/Sequence.cpp                              |  6 +-
 src/include/kompute/operations/OpAlgoBase.hpp | 17 +++-
 src/include/kompute/operations/OpBase.hpp     | 21 ++++-
 .../kompute/operations/OpTensorCopy.hpp       |  7 +-
 .../kompute/operations/OpTensorCreate.hpp     |  8 +-
 .../kompute/operations/OpTensorSyncDevice.hpp |  9 +-
 .../kompute/operations/OpTensorSyncLocal.hpp  | 10 ++-
 test/TestLogisticRegression.cpp               | 86 ++++++++++++++++++-
 test/TestOpTensorSyncDevice.cpp               |  0
 15 files changed, 258 insertions(+), 50 deletions(-)
 create mode 100644 test/TestOpTensorSyncDevice.cpp

diff --git a/docs/overview/advanced-examples.rst b/docs/overview/advanced-examples.rst
index cb52500b1..dc0e43583 100644
--- a/docs/overview/advanced-examples.rst
+++ b/docs/overview/advanced-examples.rst
@@ -121,7 +121,11 @@ Now that we have the inputs and outputs we will be able to use them in the proce
 
 Once we re-record, all the instructions that were recorded previosuly are cleared.
 
-Because of this we can record now the new command which is just the OpAlgoBase with the LR shader.
+Because of this we can now record the new commands, which consist of the following:
+
+1. Copy the tensor data from local to device
+2. Run the logistic regression shader
+3. Copy the output data from device to local
 
 .. code-block:: cpp
     :linenos:
@@ -131,11 +135,15 @@ Because of this we can record now the new command which is just the OpAlgoBase w
 
         sq->begin();
 
+        sq->record<kp::OpTensorSyncDevice>({wIn, bIn});
+
         sq->record<kp::OpAlgoBase<>>(
                 params, 
-                true, // Whether to copy output from device
+                false, // Whether to copy output from device
                 "test/shaders/glsl/test_logistic_regression.comp");
 
+        sq->record<kp::OpTensorSyncLocal>({wOutI, wOutJ, bOut});
+
         sq->end();
 
 4. Loop across number of iterations + 4-a. Submit algo operation on LR shader
diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp
index 83e97fdd8..4f3fe5ed9 100755
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@@ -470,11 +470,24 @@ class OpBase
     virtual void record() = 0;
 
     /**
-     * Post submit is called after the Sequence has submitted the commands to
-     * the GPU for processing, and can be used to perform any tear-down steps
-     * required as the computation iteration finishes.
+     * Pre eval is called by the Sequence at the start of eval, before the commands
+     * are submitted to the GPU for processing, and can be used to perform any
+     * per-eval setup steps required as the computation iteration begins. It's
+     * worth noting that eval can be called multiple times on the same recorded
+     * operations, so this function should be idempotent in case it's called
+     * multiple times in a row.
      */
-    virtual void postSubmit() = 0;
+    virtual void preEval() = 0;
+
+    /**
+     * Post eval is called by the Sequence at the end of eval, after the commands
+     * have been submitted to the GPU for processing, and can be used to perform
+     * any tear-down steps required as the computation iteration finishes. It's
+     * worth noting that eval can be called multiple times on the same recorded
+     * operations, so any resources destroyed here should not be needed by a
+     * subsequent eval unless they are explicitly re-created by the user.
+     */
+    virtual void postEval() = 0;
 
   protected:
     // -------------- NEVER OWNED RESOURCES
@@ -966,12 +979,17 @@ class OpAlgoBase : public OpBase
      */
     virtual void record() override;
 
+    /**
+     * Does not perform any preEval commands.
+     */
+    virtual void preEval() override;
+
     /**
      * Executes after the recorded commands are submitted, and performs a copy
      * of the GPU Device memory into the staging buffer so the output data can
      * be retrieved.
      */
-    virtual void postSubmit() override;
+    virtual void postEval() override;
 
   protected:
     // -------------- NEVER OWNED RESOURCES
@@ -1162,7 +1180,14 @@ OpAlgoBase<tX, tY, tZ>::record()
 
 template<uint32_t tX, uint32_t tY, uint32_t tZ>
 void
-OpAlgoBase<tX, tY, tZ>::postSubmit()
+OpAlgoBase<tX, tY, tZ>::preEval()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
+}
+
+template<uint32_t tX, uint32_t tY, uint32_t tZ>
+void
+OpAlgoBase<tX, tY, tZ>::postEval()
 {
     SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
 
@@ -1554,11 +1579,16 @@ class OpTensorCreate : public OpBase
      */
     void record() override;
 
+    /**
+     * Does not perform any preEval commands.
+     */
+    virtual void preEval() override;
+
     /**
      * Performs a copy back into the main tensor to ensure that the data
      * contained is the one that is now being stored in the GPU.
      */
-    void postSubmit() override;
+    virtual void postEval() override;
 
   private:
     // Never owned resources
@@ -1605,10 +1635,15 @@ class OpTensorCopy : public OpBase
      */
     void record() override;
 
+    /**
+     * Does not perform any preEval commands.
+     */
+    virtual void preEval() override;
+
     /**
      * Copies the local vectors for all the tensors to sync the data with the gpu.
      */
-    void postSubmit() override;
+    virtual void postEval() override;
 
   private:
 };
@@ -1654,9 +1689,14 @@ class OpTensorSyncDevice : public OpBase
     void record() override;
 
     /**
-     * Does not perform any further sync functions. Frees the staging tensors together with their respective memory.
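+     * Runs before every eval: for device tensors it copies the local data into the staging tensor and maps it into host memory; other tensors are synced through their own host memory mapping.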
      */
-    void postSubmit() override;
+    virtual void preEval() override;
+
+    /**
+     * Does not perform any postEval commands.
+     */
+    virtual void postEval() override;
 
   private:
     // Never owned resources
@@ -1704,9 +1744,14 @@ class OpTensorSyncLocal : public OpBase
     void record() override;
 
     /**
-     * For host tensors it performs the map command from the host memory into local memory. Frees the staging tensors together with their respective memory.
+     * Does not perform any preEval commands.
      */
-    void postSubmit() override;
+    virtual void preEval() override;
+
+    /**
+     * For host tensors it performs the map command from the host memory into local memory.
+     */
+    virtual void postEval() override;
 
   private:
     // Never owned resources
diff --git a/src/OpTensorCopy.cpp b/src/OpTensorCopy.cpp
index 50eb9c4c1..2b4ae52b2 100644
--- a/src/OpTensorCopy.cpp
+++ b/src/OpTensorCopy.cpp
@@ -57,9 +57,15 @@ OpTensorCopy::record()
 }
 
 void
-OpTensorCopy::postSubmit()
+OpTensorCopy::preEval()
 {
-    SPDLOG_DEBUG("Kompute OpTensorCopy postSubmit called");
+    SPDLOG_DEBUG("Kompute OpTensorCopy preEval called");
+}
+
+void
+OpTensorCopy::postEval()
+{
+    SPDLOG_DEBUG("Kompute OpTensorCopy postEval called");
 
     // Copy the data from the first tensor into all the tensors
     for (size_t i = 1; i < this->mTensors.size(); i++) {
diff --git a/src/OpTensorCreate.cpp b/src/OpTensorCreate.cpp
index 55dddb006..aac098220 100644
--- a/src/OpTensorCreate.cpp
+++ b/src/OpTensorCreate.cpp
@@ -80,12 +80,15 @@ OpTensorCreate::record()
 }
 
 void
-OpTensorCreate::postSubmit()
+OpTensorCreate::preEval()
 {
-    SPDLOG_DEBUG("Kompute OpTensorCreate postSubmit called");
+    SPDLOG_DEBUG("Kompute OpTensorCreate preEval called");
+}
 
-    SPDLOG_DEBUG("Kompute OpTensorCreate destroying staging tensors");
-    this->mStagingTensors.clear();
+void
+OpTensorCreate::postEval()
+{
+    SPDLOG_DEBUG("Kompute OpTensorCreate postEval called");
 }
 
 }
diff --git a/src/OpTensorSyncDevice.cpp b/src/OpTensorSyncDevice.cpp
index 7c87245cd..72d358f23 100644
--- a/src/OpTensorSyncDevice.cpp
+++ b/src/OpTensorSyncDevice.cpp
@@ -36,8 +36,8 @@ OpTensorSyncDevice::init()
     }
 
     for (std::shared_ptr<Tensor> tensor: this->mTensors) {
-        if (tensor->isInit()) {
-            throw std::runtime_error("Kompute OpTensorSyncDevice: Tensor has already been initialized");
+        if (!tensor->isInit()) {
+            throw std::runtime_error("Kompute OpTensorSyncDevice: Tensor param has not been initialized");
         }
         if (tensor->tensorType() == Tensor::TensorTypes::eStorage) {
             throw std::runtime_error("Kompute OpTensorSyncLocal tensor parameter is of type TensorTypes::eStorage and hence cannot be used to receive or pass data.");
@@ -78,14 +78,25 @@ OpTensorSyncDevice::record()
 }
 
 void
-OpTensorSyncDevice::postSubmit()
+OpTensorSyncDevice::preEval()
 {
-    SPDLOG_DEBUG("Kompute OpTensorSyncDevice postSubmit called");
+    SPDLOG_DEBUG("Kompute OpTensorSyncDevice preEval called");
 
-    // Remove all staging tensors as they are not required after operation
-    SPDLOG_DEBUG("Kompute OpTensorSyncDevice destroying staging tensors");
-    // TODO: This would cause issues if there is no CPU barrier
-    this->mStagingTensors.clear();
+    // Push the current local data into host memory on every eval (never pull from it), as eval can be called multiple times with same op
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
+            this->mStagingTensors[i]->setData(this->mTensors[i]->data());
+            this->mStagingTensors[i]->mapDataIntoHostMemory();
+        } else {
+            this->mTensors[i]->mapDataIntoHostMemory();
+        }
+    }
+}
+
+void
+OpTensorSyncDevice::postEval()
+{
+    SPDLOG_DEBUG("Kompute OpTensorSyncDevice postEval called");
 }
 
 }
diff --git a/src/OpTensorSyncLocal.cpp b/src/OpTensorSyncLocal.cpp
index 8412c5063..7946948c2 100644
--- a/src/OpTensorSyncLocal.cpp
+++ b/src/OpTensorSyncLocal.cpp
@@ -74,10 +74,17 @@ OpTensorSyncLocal::record()
 }
 
 void
-OpTensorSyncLocal::postSubmit()
+OpTensorSyncLocal::preEval()
 {
-    SPDLOG_DEBUG("Kompute OpTensorSyncLocal postSubmit called");
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal preEval called");
+}
 
+void
+OpTensorSyncLocal::postEval()
+{
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal postEval called");
+
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local");
     for (size_t i = 0; i < this->mTensors.size(); i++) {
         if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
             this->mStagingTensors[i]->mapDataFromHostMemory();
@@ -86,10 +93,6 @@ OpTensorSyncLocal::postSubmit()
             this->mTensors[i]->mapDataFromHostMemory();
         }
     }
-
-    // Remove all staging tensors as they are not required after operation
-    SPDLOG_DEBUG("Kompute OpTensorSyncLocal destroying staging tensors");
-    this->mStagingTensors.clear();
 }
 
 }
diff --git a/src/Sequence.cpp b/src/Sequence.cpp
index bdeef5763..e8ac59610 100644
--- a/src/Sequence.cpp
+++ b/src/Sequence.cpp
@@ -125,6 +125,10 @@ Sequence::eval()
         return false;
     }
 
+    for (size_t i = 0; i < this->mOperations.size(); i++) {
+        this->mOperations[i]->preEval();
+    }
+
     const vk::PipelineStageFlags waitStageMask =
       vk::PipelineStageFlagBits::eTransfer;
     vk::SubmitInfo submitInfo(
@@ -140,7 +144,7 @@ Sequence::eval()
     this->mDevice->destroy(fence);
 
     for (size_t i = 0; i < this->mOperations.size(); i++) {
-        this->mOperations[i]->postSubmit();
+        this->mOperations[i]->postEval();
     }
 
     SPDLOG_DEBUG("Kompute sequence EVAL success");
diff --git a/src/include/kompute/operations/OpAlgoBase.hpp b/src/include/kompute/operations/OpAlgoBase.hpp
index 92c7e607b..417a05550 100644
--- a/src/include/kompute/operations/OpAlgoBase.hpp
+++ b/src/include/kompute/operations/OpAlgoBase.hpp
@@ -120,12 +120,18 @@ class OpAlgoBase : public OpBase
      */
     virtual void record() override;
 
+
+    /**
+     * Does not perform any preEval commands.
+     */
+    virtual void preEval() override;
+
     /**
      * Executes after the recorded commands are submitted, and performs a copy
      * of the GPU Device memory into the staging buffer so the output data can
      * be retrieved.
      */
-    virtual void postSubmit() override;
+    virtual void postEval() override;
 
   protected:
     // -------------- NEVER OWNED RESOURCES
@@ -316,7 +322,14 @@ OpAlgoBase<tX, tY, tZ>::record()
 
 template<uint32_t tX, uint32_t tY, uint32_t tZ>
 void
-OpAlgoBase<tX, tY, tZ>::postSubmit()
+OpAlgoBase<tX, tY, tZ>::preEval()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
+}
+
+template<uint32_t tX, uint32_t tY, uint32_t tZ>
+void
+OpAlgoBase<tX, tY, tZ>::postEval()
 {
     SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
 
diff --git a/src/include/kompute/operations/OpBase.hpp b/src/include/kompute/operations/OpBase.hpp
index d4543cb4c..dc0da487f 100644
--- a/src/include/kompute/operations/OpBase.hpp
+++ b/src/include/kompute/operations/OpBase.hpp
@@ -90,11 +90,24 @@ class OpBase
     virtual void record() = 0;
 
     /**
-     * Post submit is called after the Sequence has submitted the commands to
-     * the GPU for processing, and can be used to perform any tear-down steps
-     * required as the computation iteration finishes.
+     * Pre eval is called by the Sequence at the start of eval, before the commands
+     * are submitted to the GPU for processing, and can be used to perform any
+     * per-eval setup steps required as the computation iteration begins. It's
+     * worth noting that eval can be called multiple times on the same recorded
+     * operations, so this function should be idempotent in case it's called
+     * multiple times in a row.
      */
-    virtual void postSubmit() = 0;
+    virtual void preEval() = 0;
+
+    /**
+     * Post eval is called by the Sequence at the end of eval, after the commands
+     * have been submitted to the GPU for processing, and can be used to perform
+     * any tear-down steps required as the computation iteration finishes. It's
+     * worth noting that eval can be called multiple times on the same recorded
+     * operations, so any resources destroyed here should not be needed by a
+     * subsequent eval unless they are explicitly re-created by the user.
+     */
+    virtual void postEval() = 0;
 
   protected:
     // -------------- NEVER OWNED RESOURCES
diff --git a/src/include/kompute/operations/OpTensorCopy.hpp b/src/include/kompute/operations/OpTensorCopy.hpp
index 244e22894..119466676 100644
--- a/src/include/kompute/operations/OpTensorCopy.hpp
+++ b/src/include/kompute/operations/OpTensorCopy.hpp
@@ -44,10 +44,15 @@ class OpTensorCopy : public OpBase
      */
     void record() override;
 
+    /**
+     * Does not perform any preEval commands.
+     */
+    virtual void preEval() override;
+
     /**
      * Copies the local vectors for all the tensors to sync the data with the gpu.
      */
-    void postSubmit() override;
+    virtual void postEval() override;
 
   private:
 };
diff --git a/src/include/kompute/operations/OpTensorCreate.hpp b/src/include/kompute/operations/OpTensorCreate.hpp
index 1702237eb..ca143b334 100644
--- a/src/include/kompute/operations/OpTensorCreate.hpp
+++ b/src/include/kompute/operations/OpTensorCreate.hpp
@@ -56,11 +56,17 @@ class OpTensorCreate : public OpBase
      */
     void record() override;
 
+    /**
+     * Does not perform any preEval commands.
+     */
+    virtual void preEval() override;
+
     /**
      * Performs a copy back into the main tensor to ensure that the data
      * contained is the one that is now being stored in the GPU.
      */
-    void postSubmit() override;
+    virtual void postEval() override;
+
 
   private:
     // Never owned resources
diff --git a/src/include/kompute/operations/OpTensorSyncDevice.hpp b/src/include/kompute/operations/OpTensorSyncDevice.hpp
index de57e0683..a19e40dca 100644
--- a/src/include/kompute/operations/OpTensorSyncDevice.hpp
+++ b/src/include/kompute/operations/OpTensorSyncDevice.hpp
@@ -45,9 +45,14 @@ class OpTensorSyncDevice : public OpBase
     void record() override;
 
     /**
-     * Does not perform any further sync functions. Frees the staging tensors together with their respective memory.
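+     * Runs before every eval: for device tensors it copies the local data into the staging tensor and maps it into host memory; other tensors are synced through their own host memory mapping.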
      */
-    void postSubmit() override;
+    virtual void preEval() override;
+
+    /**
+     * Does not perform any postEval commands.
+     */
+    virtual void postEval() override;
 
   private:
     // Never owned resources
diff --git a/src/include/kompute/operations/OpTensorSyncLocal.hpp b/src/include/kompute/operations/OpTensorSyncLocal.hpp
index d06629c29..caf0ec9b1 100644
--- a/src/include/kompute/operations/OpTensorSyncLocal.hpp
+++ b/src/include/kompute/operations/OpTensorSyncLocal.hpp
@@ -45,9 +45,15 @@ class OpTensorSyncLocal : public OpBase
     void record() override;
 
     /**
-     * For host tensors it performs the map command from the host memory into local memory. Frees the staging tensors together with their respective memory.
+     * Does not perform any preEval commands.
      */
-    void postSubmit() override;
+    virtual void preEval() override;
+
+    /**
+     * For host tensors it performs the map command from the host memory into local memory.
+     */
+    virtual void postEval() override;
+
 
   private:
     // Never owned resources
diff --git a/test/TestLogisticRegression.cpp b/test/TestLogisticRegression.cpp
index 68805e86d..603a49c7d 100644
--- a/test/TestLogisticRegression.cpp
+++ b/test/TestLogisticRegression.cpp
@@ -1,9 +1,89 @@
 
 #include "gtest/gtest.h"
 
+#include "fmt/ranges.h"
 #include "kompute/Kompute.hpp"
 
-TEST(LogisticRegressionAlgorithm, TestMainLogisticRegression) {
+TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression) {
+
+    uint32_t ITERATIONS = 100;
+
+    std::vector<float> wInVec = { 0.001, 0.001 };
+    std::vector<float> bInVec = { 0 };
+
+    std::shared_ptr<kp::Tensor> xI{ new kp::Tensor({ 0, 1, 1, 1, 1 })};
+    std::shared_ptr<kp::Tensor> xJ{ new kp::Tensor({ 0, 0, 0, 1, 1 })};
+
+    std::shared_ptr<kp::Tensor> y{ new kp::Tensor({ 0, 0, 0, 1, 1 })};
+
+    std::shared_ptr<kp::Tensor> wIn{ 
+        new kp::Tensor(wInVec)};
+    std::shared_ptr<kp::Tensor> wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
+    std::shared_ptr<kp::Tensor> wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
+
+    std::shared_ptr<kp::Tensor> bIn{ 
+        new kp::Tensor(bInVec)};
+    std::shared_ptr<kp::Tensor> bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
+
+    std::vector<std::shared_ptr<kp::Tensor>> params = 
+        {xI, xJ, y, wIn, wOutI, wOutJ, bIn, bOut};
+
+    {
+        kp::Manager mgr;
+
+        if (std::shared_ptr<kp::Sequence> sq = 
+                mgr.getOrCreateManagedSequence("createTensors").lock()) {
+
+            sq->begin();
+
+            sq->record<kp::OpTensorCreate>(params);
+
+            sq->end();
+            sq->eval();
+
+            // Record op algo base
+            sq->begin();
+
+            sq->record<kp::OpTensorSyncDevice>({wIn, bIn});
+
+            sq->record<kp::OpAlgoBase<>>(
+                    params, 
+                    false, // Whether to copy output from device
+                    "test/shaders/glsl/test_logistic_regression.comp");
+
+            sq->record<kp::OpTensorSyncLocal>({wOutI, wOutJ, bOut});
+
+            sq->end();
+
+            // Iterate across all expected iterations
+            for (size_t i = 0; i < ITERATIONS; i++) {
+
+                sq->eval();
+
+                for(size_t j = 0; j < bOut->size(); j++) {
+                    wIn->data()[0] -= wOutI->data()[j];
+                    wIn->data()[1] -= wOutJ->data()[j];
+                    bIn->data()[0] -= bOut->data()[j];
+                }
+            }
+        }
+    }
+
+
+    // Based on the inputs the outputs should be at least:
+    // * wi < 0.01
+    // * wj > 1.0
+    // * b < 0
+    // TODO: Add EXPECT_DOUBLE_EQ instead
+    EXPECT_LT(wIn->data()[0], 0.01);
+    EXPECT_GT(wIn->data()[1], 1.0);
+    EXPECT_LT(bIn->data()[0], 0.0);
+
+    SPDLOG_ERROR("Result wIn: {}, bIn: {}", 
+            wIn->data(), bIn->data());
+}
+
+TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy) {
 
     uint32_t ITERATIONS = 100;
 
@@ -76,6 +156,6 @@ TEST(LogisticRegressionAlgorithm, TestMainLogisticRegression) {
     EXPECT_GT(wIn->data()[1], 1.0);
     EXPECT_LT(bIn->data()[0], 0.0);
 
-    //SPDLOG_DEBUG("Result wIn: {}, bIn: {}", 
-    //        wIn->data(), bIn->data());
+    SPDLOG_ERROR("Result wIn: {}, bIn: {}", 
+            wIn->data(), bIn->data());
 }
diff --git a/test/TestOpTensorSyncDevice.cpp b/test/TestOpTensorSyncDevice.cpp
new file mode 100644
index 000000000..e69de29bb