Added functional OpTensorSyncDevice and OpTensorSyncLocal

2020-09-06 15:51:31 +01:00 · 2020-09-06 15:51:31 +01:00 · e68d09dbdc
commit e68d09dbdc
parent e6f4097acb
15 changed files with 258 additions and 50 deletions
--- a/docs/overview/advanced-examples.rst
+++ b/docs/overview/advanced-examples.rst
@ -121,7 +121,11 @@ Now that we have the inputs and outputs we will be able to use them in the proce

 Once we re-record, all the instructions that were recorded previosuly are cleared.

-Because of this we can record now the new command which is just the OpAlgoBase with the LR shader.
+Because of this we can record now the new commands which will consist of the following:
+
+1. Copy the tensor data from local to device
+2. Run the logistic regression shader
+3. Copy the output data from device to local

 .. code-block:: cpp
    :linenos:
@ -131,11 +135,15 @@ Because of this we can record now the new command which is just the OpAlgoBase w

        sq->begin();

+        sq->record<kp::OpTensorSyncDevice>({wIn, bIn});
+
        sq->record<kp::OpAlgoBase<>>(
                params, 
-                true, // Whether to copy output from device
+                false, // Whether to copy output from device
                "test/shaders/glsl/test_logistic_regression.comp");

+        sq->record<kp::OpTensorSyncLocal>({wOutI, wOutJ, bOut});
+
        sq->end();

 4. Loop across number of iterations + 4-a. Submit algo operation on LR shader
--- a/single_include/kompute/Kompute.hpp
+++ b/single_include/kompute/Kompute.hpp
@ -470,11 +470,24 @@ class OpBase
    virtual void record() = 0;

    /**
-     * Post submit is called after the Sequence has submitted the commands to
-     * the GPU for processing, and can be used to perform any tear-down steps
-     * required as the computation iteration finishes.
+     * Pre eval is called before the Sequence has called eval and submitted the commands to
+     * the GPU for processing, and can be used to perform any per-eval setup steps
+     * required as the computation iteration begins. It's worth noting that
+     * there are situations where eval can be called multiple times, so the
+     * setup performed here should be idempotent in case it's called multiple
+     * times in a row.
     */
-    virtual void postSubmit() = 0;
+    virtual void preEval() = 0;
+
+    /**
+     * Post eval is called after the Sequence has called eval and submitted the commands to
+     * the GPU for processing, and can be used to perform any tear-down steps
+     * required as the computation iteration finishes. It's worth noting that 
+     * there are situations where eval can be called multiple times, so the 
+     * resources that are destroyed should not require a re-init unless explicitly
+     * provided by the user.
+     */
+    virtual void postEval() = 0;

  protected:
    // -------------- NEVER OWNED RESOURCES
@ -966,12 +979,17 @@ class OpAlgoBase : public OpBase
     */
    virtual void record() override;

+    /**
+     * Does not perform any preEval commands.
+     */
+    virtual void preEval() override;
+
    /**
     * Executes after the recorded commands are submitted, and performs a copy
     * of the GPU Device memory into the staging buffer so the output data can
     * be retrieved.
     */
-    virtual void postSubmit() override;
+    virtual void postEval() override;

  protected:
    // -------------- NEVER OWNED RESOURCES
@ -1162,7 +1180,14 @@ OpAlgoBase<tX, tY, tZ>::record()

 template<uint32_t tX, uint32_t tY, uint32_t tZ>
 void
-OpAlgoBase<tX, tY, tZ>::postSubmit()
+OpAlgoBase<tX, tY, tZ>::preEval()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
+}
+
+template<uint32_t tX, uint32_t tY, uint32_t tZ>
+void
+OpAlgoBase<tX, tY, tZ>::postEval()
 {
    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");

@ -1554,11 +1579,16 @@ class OpTensorCreate : public OpBase
     */
    void record() override;

+    /**
+     * Does not perform any preEval commands.
+     */
+    virtual void preEval() override;
+
    /**
     * Performs a copy back into the main tensor to ensure that the data
     * contained is the one that is now being stored in the GPU.
     */
-    void postSubmit() override;
+    virtual void postEval() override;

  private:
    // Never owned resources
@ -1605,10 +1635,15 @@ class OpTensorCopy : public OpBase
     */
    void record() override;

+    /**
+     * Does not perform any preEval commands.
+     */
+    virtual void preEval() override;
+
    /**
     * Copies the local vectors for all the tensors to sync the data with the gpu.
     */
-    void postSubmit() override;
+    virtual void postEval() override;

  private:
 };
@ -1654,9 +1689,14 @@ class OpTensorSyncDevice : public OpBase
    void record() override;

    /**
-     * Does not perform any further sync functions. Frees the staging tensors together with their respective memory.
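+     * Re-syncs the tensor data (through the staging tensors for device tensors) before every eval, as eval can be called multiple times with the same op.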
     */
-    void postSubmit() override;
+    virtual void preEval() override;
+
+    /**
+     * Does not perform any postEval commands.
+     */
+    virtual void postEval() override;

  private:
    // Never owned resources
@ -1704,9 +1744,14 @@ class OpTensorSyncLocal : public OpBase
    void record() override;

    /**
-     * For host tensors it performs the map command from the host memory into local memory. Frees the staging tensors together with their respective memory.
+     * Does not perform any preEval commands.
     */
-    void postSubmit() override;
+    virtual void preEval() override;
+
+    /**
+     * For host tensors it performs the map command from the host memory into local memory.
+     */
+    virtual void postEval() override;

  private:
    // Never owned resources
--- a/src/OpTensorCopy.cpp
+++ b/src/OpTensorCopy.cpp
@ -57,9 +57,15 @@ OpTensorCopy::record()
 }

 void
-OpTensorCopy::postSubmit()
+OpTensorCopy::preEval()
 {
-    SPDLOG_DEBUG("Kompute OpTensorCopy postSubmit called");
+    SPDLOG_DEBUG("Kompute OpTensorCopy preEval called");
+}
+
+void
+OpTensorCopy::postEval()
+{
+    SPDLOG_DEBUG("Kompute OpTensorCopy postEval called");

    // Copy the data from the first tensor into all the tensors
    for (size_t i = 1; i < this->mTensors.size(); i++) {
--- a/src/OpTensorCreate.cpp
+++ b/src/OpTensorCreate.cpp
@ -80,12 +80,15 @@ OpTensorCreate::record()
 }

 void
-OpTensorCreate::postSubmit()
+OpTensorCreate::preEval()
 {
-    SPDLOG_DEBUG("Kompute OpTensorCreate postSubmit called");
+    SPDLOG_DEBUG("Kompute OpTensorCreate preEval called");
+}

-    SPDLOG_DEBUG("Kompute OpTensorCreate destroying staging tensors");
-    this->mStagingTensors.clear();
+void
+OpTensorCreate::postEval()
+{
+    SPDLOG_DEBUG("Kompute OpTensorCreate postEval called");
 }

 }
--- a/src/OpTensorSyncDevice.cpp
+++ b/src/OpTensorSyncDevice.cpp
@ -36,8 +36,8 @@ OpTensorSyncDevice::init()
    }

    for (std::shared_ptr<Tensor> tensor: this->mTensors) {
-        if (tensor->isInit()) {
-            throw std::runtime_error("Kompute OpTensorSyncDevice: Tensor has already been initialized");
+        if (!tensor->isInit()) {
+            throw std::runtime_error("Kompute OpTensorSyncDevice: Tensor param has not been initialized");
        }
        if (tensor->tensorType() == Tensor::TensorTypes::eStorage) {
            throw std::runtime_error("Kompute OpTensorSyncLocal tensor parameter is of type TensorTypes::eStorage and hence cannot be used to receive or pass data.");
@ -78,14 +78,25 @@ OpTensorSyncDevice::record()
 }

 void
-OpTensorSyncDevice::postSubmit()
+OpTensorSyncDevice::preEval()
 {
-    SPDLOG_DEBUG("Kompute OpTensorSyncDevice postSubmit called");
+    SPDLOG_DEBUG("Kompute OpTensorSyncDevice preEval called");

-    // Remove all staging tensors as they are not required after operation
-    SPDLOG_DEBUG("Kompute OpTensorSyncDevice destroying staging tensors");
-    // TODO: This would cause issues if there is no CPU barrier
-    this->mStagingTensors.clear();
+    // Push the local data towards the device on every eval, as eval can be called multiple times with same op
+    for (size_t i = 0; i < this->mTensors.size(); i++) {
+        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { // device tensors go through their staging tensor
+            this->mStagingTensors[i]->setData(this->mTensors[i]->data());
+            this->mStagingTensors[i]->mapDataIntoHostMemory();
+        } else { // other tensors are mapped directly; must be "Into" (local -> host memory), "From" is the read-back direction
+            this->mTensors[i]->mapDataIntoHostMemory();
+        }
+    }
+}
+
+void
+OpTensorSyncDevice::postEval()
+{
+    SPDLOG_DEBUG("Kompute OpTensorSyncDevice postEval called");
 }

 }
--- a/src/OpTensorSyncLocal.cpp
+++ b/src/OpTensorSyncLocal.cpp
@ -74,10 +74,17 @@ OpTensorSyncLocal::record()
 }

 void
-OpTensorSyncLocal::postSubmit()
+OpTensorSyncLocal::preEval()
 {
-    SPDLOG_DEBUG("Kompute OpTensorSyncLocal postSubmit called");
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal preEval called");
+}

+void
+OpTensorSyncLocal::postEval()
+{
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal postEval called");
+
+    SPDLOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local");
    for (size_t i = 0; i < this->mTensors.size(); i++) {
        if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
            this->mStagingTensors[i]->mapDataFromHostMemory();
@ -86,10 +93,6 @@ OpTensorSyncLocal::postSubmit()
            this->mTensors[i]->mapDataFromHostMemory();
        }
    }
-
-    // Remove all staging tensors as they are not required after operation
-    SPDLOG_DEBUG("Kompute OpTensorSyncLocal destroying staging tensors");
-    this->mStagingTensors.clear();
 }

 }
--- a/src/Sequence.cpp
+++ b/src/Sequence.cpp
@ -125,6 +125,10 @@ Sequence::eval()
        return false;
    }

+    for (size_t i = 0; i < this->mOperations.size(); i++) {
+        this->mOperations[i]->preEval();
+    }
+
    const vk::PipelineStageFlags waitStageMask =
      vk::PipelineStageFlagBits::eTransfer;
    vk::SubmitInfo submitInfo(
@ -140,7 +144,7 @@ Sequence::eval()
    this->mDevice->destroy(fence);

    for (size_t i = 0; i < this->mOperations.size(); i++) {
-        this->mOperations[i]->postSubmit();
+        this->mOperations[i]->postEval();
    }

    SPDLOG_DEBUG("Kompute sequence EVAL success");
--- a/src/include/kompute/operations/OpAlgoBase.hpp
+++ b/src/include/kompute/operations/OpAlgoBase.hpp
@ -120,12 +120,18 @@ class OpAlgoBase : public OpBase
     */
    virtual void record() override;

+
+    /**
+     * Does not perform any preEval commands.
+     */
+    virtual void preEval() override;
+
    /**
     * Executes after the recorded commands are submitted, and performs a copy
     * of the GPU Device memory into the staging buffer so the output data can
     * be retrieved.
     */
-    virtual void postSubmit() override;
+    virtual void postEval() override;

  protected:
    // -------------- NEVER OWNED RESOURCES
@ -316,7 +322,14 @@ OpAlgoBase<tX, tY, tZ>::record()

 template<uint32_t tX, uint32_t tY, uint32_t tZ>
 void
-OpAlgoBase<tX, tY, tZ>::postSubmit()
+OpAlgoBase<tX, tY, tZ>::preEval()
+{
+    SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
+}
+
+template<uint32_t tX, uint32_t tY, uint32_t tZ>
+void
+OpAlgoBase<tX, tY, tZ>::postEval()
 {
    SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");

--- a/src/include/kompute/operations/OpBase.hpp
+++ b/src/include/kompute/operations/OpBase.hpp
@ -90,11 +90,24 @@ class OpBase
    virtual void record() = 0;

    /**
-     * Post submit is called after the Sequence has submitted the commands to
-     * the GPU for processing, and can be used to perform any tear-down steps
-     * required as the computation iteration finishes.
+     * Pre eval is called before the Sequence has called eval and submitted the commands to
+     * the GPU for processing, and can be used to perform any per-eval setup steps
+     * required as the computation iteration begins. It's worth noting that
+     * there are situations where eval can be called multiple times, so the
+     * setup performed here should be idempotent in case it's called multiple
+     * times in a row.
     */
-    virtual void postSubmit() = 0;
+    virtual void preEval() = 0;
+
+    /**
+     * Post eval is called after the Sequence has called eval and submitted the commands to
+     * the GPU for processing, and can be used to perform any tear-down steps
+     * required as the computation iteration finishes. It's worth noting that 
+     * there are situations where eval can be called multiple times, so the 
+     * resources that are destroyed should not require a re-init unless explicitly
+     * provided by the user.
+     */
+    virtual void postEval() = 0;

  protected:
    // -------------- NEVER OWNED RESOURCES
--- a/src/include/kompute/operations/OpTensorCopy.hpp
+++ b/src/include/kompute/operations/OpTensorCopy.hpp
@ -44,10 +44,15 @@ class OpTensorCopy : public OpBase
     */
    void record() override;

+    /**
+     * Does not perform any preEval commands.
+     */
+    virtual void preEval() override;
+
    /**
     * Copies the local vectors for all the tensors to sync the data with the gpu.
     */
-    void postSubmit() override;
+    virtual void postEval() override;

  private:
 };
--- a/src/include/kompute/operations/OpTensorCreate.hpp
+++ b/src/include/kompute/operations/OpTensorCreate.hpp
@ -56,11 +56,17 @@ class OpTensorCreate : public OpBase
     */
    void record() override;

+    /**
+     * Does not perform any preEval commands.
+     */
+    virtual void preEval() override;
+
    /**
     * Performs a copy back into the main tensor to ensure that the data
     * contained is the one that is now being stored in the GPU.
     */
-    void postSubmit() override;
+    virtual void postEval() override;
+

  private:
    // Never owned resources
--- a/src/include/kompute/operations/OpTensorSyncDevice.hpp
+++ b/src/include/kompute/operations/OpTensorSyncDevice.hpp
@ -45,9 +45,14 @@ class OpTensorSyncDevice : public OpBase
    void record() override;

    /**
-     * Does not perform any further sync functions. Frees the staging tensors together with their respective memory.
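+     * Re-syncs the tensor data (through the staging tensors for device tensors) before every eval, as eval can be called multiple times with the same op.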
     */
-    void postSubmit() override;
+    virtual void preEval() override;
+
+    /**
+     * Does not perform any postEval commands.
+     */
+    virtual void postEval() override;

  private:
    // Never owned resources
--- a/src/include/kompute/operations/OpTensorSyncLocal.hpp
+++ b/src/include/kompute/operations/OpTensorSyncLocal.hpp
@ -45,9 +45,15 @@ class OpTensorSyncLocal : public OpBase
    void record() override;

    /**
-     * For host tensors it performs the map command from the host memory into local memory. Frees the staging tensors together with their respective memory.
+     * Does not perform any preEval commands.
     */
-    void postSubmit() override;
+    virtual void preEval() override;
+
+    /**
+     * For host tensors it performs the map command from the host memory into local memory.
+     */
+    virtual void postEval() override;
+

  private:
    // Never owned resources
--- a/test/TestLogisticRegression.cpp
+++ b/test/TestLogisticRegression.cpp
@ -1,9 +1,89 @@

 #include "gtest/gtest.h"

+#include "fmt/ranges.h"
 #include "kompute/Kompute.hpp"

-TEST(LogisticRegressionAlgorithm, TestMainLogisticRegression) {
+TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression) {
+    // Evaluates the logistic regression shader ITERATIONS times, recording OpTensorSyncDevice / OpTensorSyncLocal around it instead of letting OpAlgoBase copy the output.
+    uint32_t ITERATIONS = 100;
+
+    std::vector<float> wInVec = { 0.001, 0.001 };
+    std::vector<float> bInVec = { 0 };
+    // Five-element input tensors; presumably xI / xJ are the two features and y the expected labels - confirm against the shader
+    std::shared_ptr<kp::Tensor> xI{ new kp::Tensor({ 0, 1, 1, 1, 1 })};
+    std::shared_ptr<kp::Tensor> xJ{ new kp::Tensor({ 0, 0, 0, 1, 1 })};
+
+    std::shared_ptr<kp::Tensor> y{ new kp::Tensor({ 0, 0, 0, 1, 1 })};
+    // Two-element weight input (wIn) plus one five-element output tensor per weight (wOutI, wOutJ)
+    std::shared_ptr<kp::Tensor> wIn{ 
+        new kp::Tensor(wInVec)};
+    std::shared_ptr<kp::Tensor> wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
+    std::shared_ptr<kp::Tensor> wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
+    // Single-element bias input (bIn) and its five-element output tensor (bOut)
+    std::shared_ptr<kp::Tensor> bIn{ 
+        new kp::Tensor(bInVec)};
+    std::shared_ptr<kp::Tensor> bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
+    // Every tensor is handed to both OpTensorCreate and OpAlgoBase below, in this order
+    std::vector<std::shared_ptr<kp::Tensor>> params = 
+        {xI, xJ, y, wIn, wOutI, wOutJ, bIn, bOut};
+
+    {
+        kp::Manager mgr;
+
+        if (std::shared_ptr<kp::Sequence> sq = 
+                mgr.getOrCreateManagedSequence("createTensors").lock()) {
+            // First recording: only the tensor creation op, evaluated once
+            sq->begin();
+
+            sq->record<kp::OpTensorCreate>(params);
+
+            sq->end();
+            sq->eval();
+
+            // Second recording on the same sequence: sync wIn / bIn, run the LR shader, sync the three output tensors
+            sq->begin();
+
+            sq->record<kp::OpTensorSyncDevice>({wIn, bIn});
+
+            sq->record<kp::OpAlgoBase<>>(
+                    params, 
+                    false, // Whether to copy output from device
+                    "test/shaders/glsl/test_logistic_regression.comp");
+
+            sq->record<kp::OpTensorSyncLocal>({wOutI, wOutJ, bOut});
+
+            sq->end();
+
+            // Evaluate the recorded sequence once per iteration; it is recorded only once above
+            for (size_t i = 0; i < ITERATIONS; i++) {
+
+                sq->eval();
+                // Subtract every element of the output tensors from the matching weight / bias held in wIn / bIn
+                for(size_t j = 0; j < bOut->size(); j++) {
+                    wIn->data()[0] -= wOutI->data()[j];
+                    wIn->data()[1] -= wOutJ->data()[j];
+                    bIn->data()[0] -= bOut->data()[j];
+                }
+            }
+        }
+    }
+
+
+    // Based on the inputs the outputs should be at least:
+    // * wi < 0.01
+    // * wj > 1.0
+    // * b < 0
+    // TODO: Add EXPECT_DOUBLE_EQ instead
+    EXPECT_LT(wIn->data()[0], 0.01);
+    EXPECT_GT(wIn->data()[1], 1.0);
+    EXPECT_LT(bIn->data()[0], 0.0);
+    // NOTE(review): logged at error level although nothing failed - presumably to make the result visible in the test output; confirm
+    SPDLOG_ERROR("Result wIn: {}, bIn: {}", 
+            wIn->data(), bIn->data());
+}
+
+TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy) {

    uint32_t ITERATIONS = 100;

@ -76,6 +156,6 @@ TEST(LogisticRegressionAlgorithm, TestMainLogisticRegression) {
    EXPECT_GT(wIn->data()[1], 1.0);
    EXPECT_LT(bIn->data()[0], 0.0);

-    //SPDLOG_DEBUG("Result wIn: {}, bIn: {}", 
-    //        wIn->data(), bIn->data());
+    SPDLOG_ERROR("Result wIn: {}, bIn: {}", 
+            wIn->data(), bIn->data());
 }
--- a/test/TestOpTensorSyncDevice.cpp
+++ b/test/TestOpTensorSyncDevice.cpp