From e68d09dbdcb7425512d48ac5155ddf5c94f3f637 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Sun, 6 Sep 2020 15:51:31 +0100 Subject: [PATCH] Added functional optensorsyncDevice and optensorsynclocal --- docs/overview/advanced-examples.rst | 12 ++- single_include/kompute/Kompute.hpp | 69 ++++++++++++--- src/OpTensorCopy.cpp | 10 ++- src/OpTensorCreate.cpp | 11 ++- src/OpTensorSyncDevice.cpp | 27 ++++-- src/OpTensorSyncLocal.cpp | 15 ++-- src/Sequence.cpp | 6 +- src/include/kompute/operations/OpAlgoBase.hpp | 17 +++- src/include/kompute/operations/OpBase.hpp | 21 ++++- .../kompute/operations/OpTensorCopy.hpp | 7 +- .../kompute/operations/OpTensorCreate.hpp | 8 +- .../kompute/operations/OpTensorSyncDevice.hpp | 9 +- .../kompute/operations/OpTensorSyncLocal.hpp | 10 ++- test/TestLogisticRegression.cpp | 86 ++++++++++++++++++- test/TestOpTensorSyncDevice.cpp | 0 15 files changed, 258 insertions(+), 50 deletions(-) create mode 100644 test/TestOpTensorSyncDevice.cpp diff --git a/docs/overview/advanced-examples.rst b/docs/overview/advanced-examples.rst index cb52500b1..dc0e43583 100644 --- a/docs/overview/advanced-examples.rst +++ b/docs/overview/advanced-examples.rst @@ -121,7 +121,11 @@ Now that we have the inputs and outputs we will be able to use them in the proce Once we re-record, all the instructions that were recorded previosuly are cleared. -Because of this we can record now the new command which is just the OpAlgoBase with the LR shader. +Because of this we can record now the new commands which will consist of the following: + +1. Copy the tensor data from local to device +2. Run the logistic regression shader +3. Copy the output data .. code-block:: cpp :linenos: @@ -131,11 +135,15 @@ Because of this we can record now the new command which is just the OpAlgoBase w sq->begin(); + sq->record({wIn, bIn}); + sq->record>( params, - true, // Whether to copy output from device + false, // Whether to copy output from device "test/shaders/glsl/test_logistic_regression.comp"); + sq->record({wOutI, wOutJ, bOut}); + sq->end(); 4. Loop across number of iterations + 4-a. Submit algo operation on LR shader diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index 83e97fdd8..4f3fe5ed9 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -470,11 +470,24 @@ class OpBase virtual void record() = 0; /** - * Post submit is called after the Sequence has submitted the commands to - * the GPU for processing, and can be used to perform any tear-down steps - * required as the computation iteration finishes. + * Pre eval is called before the Sequence has called eval and submitted the commands to + * the GPU for processing, and can be used to perform any per-eval setup steps + * required as the computation iteration begins. It's worth noting that + * there are situations where eval can be called multiple times, so the + * resources that are created should be idempotent in case it's called multiple + * times in a row. */ - virtual void postSubmit() = 0; + virtual void preEval() = 0; + + /** + * Post eval is called after the Sequence has called eval and submitted the commands to + * the GPU for processing, and can be used to perform any tear-down steps + * required as the computation iteration finishes. It's worth noting that + * there are situations where eval can be called multiple times, so the + * resources that are destroyed should not require a re-init unless explicitly + * provided by the user. + */ + virtual void postEval() = 0; protected: // -------------- NEVER OWNED RESOURCES @@ -966,12 +979,17 @@ class OpAlgoBase : public OpBase */ virtual void record() override; + /** + * Does not perform any preEval commands. + */ + virtual void preEval() override; + /** * Executes after the recorded commands are submitted, and performs a copy * of the GPU Device memory into the staging buffer so the output data can * be retrieved. */ - virtual void postSubmit() override; + virtual void postEval() override; protected: // -------------- NEVER OWNED RESOURCES @@ -1162,7 +1180,14 @@ OpAlgoBase::record() template void -OpAlgoBase::postSubmit() +OpAlgoBase::preEval() +{ + SPDLOG_DEBUG("Kompute OpAlgoBase preEval called"); +} + +template +void +OpAlgoBase::postEval() { SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called"); @@ -1554,11 +1579,16 @@ class OpTensorCreate : public OpBase */ void record() override; + /** + * Does not perform any preEval commands. + */ + virtual void preEval() override; + /** * Performs a copy back into the main tensor to ensure that the data * contained is the one that is now being stored in the GPU. */ - void postSubmit() override; + virtual void postEval() override; private: // Never owned resources @@ -1605,10 +1635,15 @@ class OpTensorCopy : public OpBase */ void record() override; + /** + * Does not perform any preEval commands. + */ + virtual void preEval() override; + /** * Copies the local vectors for all the tensors to sync the data with the gpu. */ - void postSubmit() override; + virtual void postEval() override; private: }; @@ -1654,9 +1689,14 @@ class OpTensorSyncDevice : public OpBase void record() override; /** - * Does not perform any further sync functions. Frees the staging tensors together with their respective memory. + * Does not perform any preEval commands. */ - void postSubmit() override; + virtual void preEval() override; + + /** + * Does not perform any postEval commands. + */ + virtual void postEval() override; private: // Never owned resources @@ -1704,9 +1744,14 @@ class OpTensorSyncLocal : public OpBase void record() override; /** - * For host tensors it performs the map command from the host memory into local memory. Frees the staging tensors together with their respective memory. + * Does not perform any preEval commands. */ - void postSubmit() override; + virtual void preEval() override; + + /** + * For host tensors it performs the map command from the host memory into local memory. + */ + virtual void postEval() override; private: // Never owned resources diff --git a/src/OpTensorCopy.cpp b/src/OpTensorCopy.cpp index 50eb9c4c1..2b4ae52b2 100644 --- a/src/OpTensorCopy.cpp +++ b/src/OpTensorCopy.cpp @@ -57,9 +57,15 @@ OpTensorCopy::record() } void -OpTensorCopy::postSubmit() +OpTensorCopy::preEval() { - SPDLOG_DEBUG("Kompute OpTensorCopy postSubmit called"); + SPDLOG_DEBUG("Kompute OpTensorCopy preEval called"); +} + +void +OpTensorCopy::postEval() +{ + SPDLOG_DEBUG("Kompute OpTensorCopy postEval called"); // Copy the data from the first tensor into all the tensors for (size_t i = 1; i < this->mTensors.size(); i++) { diff --git a/src/OpTensorCreate.cpp b/src/OpTensorCreate.cpp index 55dddb006..aac098220 100644 --- a/src/OpTensorCreate.cpp +++ b/src/OpTensorCreate.cpp @@ -80,12 +80,15 @@ OpTensorCreate::record() } void -OpTensorCreate::postSubmit() +OpTensorCreate::preEval() { - SPDLOG_DEBUG("Kompute OpTensorCreate postSubmit called"); + SPDLOG_DEBUG("Kompute OpTensorCreate preEval called"); +} - SPDLOG_DEBUG("Kompute OpTensorCreate destroying staging tensors"); - this->mStagingTensors.clear(); +void +OpTensorCreate::postEval() +{ + SPDLOG_DEBUG("Kompute OpTensorCreate postEval called"); } } diff --git a/src/OpTensorSyncDevice.cpp b/src/OpTensorSyncDevice.cpp index 7c87245cd..72d358f23 100644 --- a/src/OpTensorSyncDevice.cpp +++ b/src/OpTensorSyncDevice.cpp @@ -36,8 +36,8 @@ OpTensorSyncDevice::init() } for (std::shared_ptr tensor: this->mTensors) { - if (tensor->isInit()) { - throw std::runtime_error("Kompute OpTensorSyncDevice: Tensor has already been initialized"); + if (!tensor->isInit()) { + throw std::runtime_error("Kompute OpTensorSyncDevice: Tensor param has not been initialized"); } if (tensor->tensorType() == Tensor::TensorTypes::eStorage) { throw std::runtime_error("Kompute OpTensorSyncLocal tensor parameter is of type TensorTypes::eStorage and hence cannot be used to receive or pass data."); @@ -78,14 +78,25 @@ OpTensorSyncDevice::record() } void -OpTensorSyncDevice::postSubmit() +OpTensorSyncDevice::preEval() { - SPDLOG_DEBUG("Kompute OpTensorSyncDevice postSubmit called"); + SPDLOG_DEBUG("Kompute OpTensorSyncDevice preEval called"); - // Remove all staging tensors as they are not required after operation - SPDLOG_DEBUG("Kompute OpTensorSyncDevice destroying staging tensors"); - // TODO: This would cause issues if there is no CPU barrier - this->mStagingTensors.clear(); + // Performing sync of data as eval can be called multiple times with same op + for (size_t i = 0; i < this->mTensors.size(); i++) { + if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { + this->mStagingTensors[i]->setData(this->mTensors[i]->data()); + this->mStagingTensors[i]->mapDataIntoHostMemory(); + } else { + this->mTensors[i]->mapDataFromHostMemory(); + } + } +} + +void +OpTensorSyncDevice::postEval() +{ + SPDLOG_DEBUG("Kompute OpTensorSyncDevice postEval called"); } } diff --git a/src/OpTensorSyncLocal.cpp b/src/OpTensorSyncLocal.cpp index 8412c5063..7946948c2 100644 --- a/src/OpTensorSyncLocal.cpp +++ b/src/OpTensorSyncLocal.cpp @@ -74,10 +74,17 @@ OpTensorSyncLocal::record() } void -OpTensorSyncLocal::postSubmit() +OpTensorSyncLocal::preEval() { - SPDLOG_DEBUG("Kompute OpTensorSyncLocal postSubmit called"); + SPDLOG_DEBUG("Kompute OpTensorSyncLocal preEval called"); +} +void +OpTensorSyncLocal::postEval() +{ + SPDLOG_DEBUG("Kompute OpTensorSyncLocal postEval called"); + + SPDLOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local"); for (size_t i = 0; i < this->mTensors.size(); i++) { if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { this->mStagingTensors[i]->mapDataFromHostMemory(); @@ -86,10 +93,6 @@ OpTensorSyncLocal::postSubmit() this->mTensors[i]->mapDataFromHostMemory(); } } - - // Remove all staging tensors as they are not required after operation - SPDLOG_DEBUG("Kompute OpTensorSyncLocal destroying staging tensors"); - this->mStagingTensors.clear(); } } diff --git a/src/Sequence.cpp b/src/Sequence.cpp index bdeef5763..e8ac59610 100644 --- a/src/Sequence.cpp +++ b/src/Sequence.cpp @@ -125,6 +125,10 @@ Sequence::eval() return false; } + for (size_t i = 0; i < this->mOperations.size(); i++) { + this->mOperations[i]->preEval(); + } + const vk::PipelineStageFlags waitStageMask = vk::PipelineStageFlagBits::eTransfer; vk::SubmitInfo submitInfo( @@ -140,7 +144,7 @@ Sequence::eval() this->mDevice->destroy(fence); for (size_t i = 0; i < this->mOperations.size(); i++) { - this->mOperations[i]->postSubmit(); + this->mOperations[i]->postEval(); } SPDLOG_DEBUG("Kompute sequence EVAL success"); diff --git a/src/include/kompute/operations/OpAlgoBase.hpp b/src/include/kompute/operations/OpAlgoBase.hpp index 92c7e607b..417a05550 100644 --- a/src/include/kompute/operations/OpAlgoBase.hpp +++ b/src/include/kompute/operations/OpAlgoBase.hpp @@ -120,12 +120,18 @@ class OpAlgoBase : public OpBase */ virtual void record() override; + + /** + * Does not perform any preEval commands. + */ + virtual void preEval() override; + /** * Executes after the recorded commands are submitted, and performs a copy * of the GPU Device memory into the staging buffer so the output data can * be retrieved. */ - virtual void postSubmit() override; + virtual void postEval() override; protected: // -------------- NEVER OWNED RESOURCES @@ -316,7 +322,14 @@ OpAlgoBase::record() template void -OpAlgoBase::postSubmit() +OpAlgoBase::preEval() +{ + SPDLOG_DEBUG("Kompute OpAlgoBase preEval called"); +} + +template +void +OpAlgoBase::postEval() { SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called"); diff --git a/src/include/kompute/operations/OpBase.hpp b/src/include/kompute/operations/OpBase.hpp index d4543cb4c..dc0da487f 100644 --- a/src/include/kompute/operations/OpBase.hpp +++ b/src/include/kompute/operations/OpBase.hpp @@ -90,11 +90,24 @@ class OpBase virtual void record() = 0; /** - * Post submit is called after the Sequence has submitted the commands to - * the GPU for processing, and can be used to perform any tear-down steps - * required as the computation iteration finishes. + * Pre eval is called before the Sequence has called eval and submitted the commands to + * the GPU for processing, and can be used to perform any per-eval setup steps + * required as the computation iteration begins. It's worth noting that + * there are situations where eval can be called multiple times, so the + * resources that are created should be idempotent in case it's called multiple + * times in a row. */ - virtual void postSubmit() = 0; + virtual void preEval() = 0; + + /** + * Post eval is called after the Sequence has called eval and submitted the commands to + * the GPU for processing, and can be used to perform any tear-down steps + * required as the computation iteration finishes. It's worth noting that + * there are situations where eval can be called multiple times, so the + * resources that are destroyed should not require a re-init unless explicitly + * provided by the user. + */ + virtual void postEval() = 0; protected: // -------------- NEVER OWNED RESOURCES diff --git a/src/include/kompute/operations/OpTensorCopy.hpp b/src/include/kompute/operations/OpTensorCopy.hpp index 244e22894..119466676 100644 --- a/src/include/kompute/operations/OpTensorCopy.hpp +++ b/src/include/kompute/operations/OpTensorCopy.hpp @@ -44,10 +44,15 @@ class OpTensorCopy : public OpBase */ void record() override; + /** + * Does not perform any preEval commands. + */ + virtual void preEval() override; + /** * Copies the local vectors for all the tensors to sync the data with the gpu. */ - void postSubmit() override; + virtual void postEval() override; private: }; diff --git a/src/include/kompute/operations/OpTensorCreate.hpp b/src/include/kompute/operations/OpTensorCreate.hpp index 1702237eb..ca143b334 100644 --- a/src/include/kompute/operations/OpTensorCreate.hpp +++ b/src/include/kompute/operations/OpTensorCreate.hpp @@ -56,11 +56,17 @@ class OpTensorCreate : public OpBase */ void record() override; + /** + * Does not perform any preEval commands. + */ + virtual void preEval() override; + /** * Performs a copy back into the main tensor to ensure that the data * contained is the one that is now being stored in the GPU. */ - void postSubmit() override; + virtual void postEval() override; + private: // Never owned resources diff --git a/src/include/kompute/operations/OpTensorSyncDevice.hpp b/src/include/kompute/operations/OpTensorSyncDevice.hpp index de57e0683..a19e40dca 100644 --- a/src/include/kompute/operations/OpTensorSyncDevice.hpp +++ b/src/include/kompute/operations/OpTensorSyncDevice.hpp @@ -45,9 +45,14 @@ class OpTensorSyncDevice : public OpBase void record() override; /** - * Does not perform any further sync functions. Frees the staging tensors together with their respective memory. + * Does not perform any preEval commands. */ - void postSubmit() override; + virtual void preEval() override; + + /** + * Does not perform any postEval commands. + */ + virtual void postEval() override; private: // Never owned resources diff --git a/src/include/kompute/operations/OpTensorSyncLocal.hpp b/src/include/kompute/operations/OpTensorSyncLocal.hpp index d06629c29..caf0ec9b1 100644 --- a/src/include/kompute/operations/OpTensorSyncLocal.hpp +++ b/src/include/kompute/operations/OpTensorSyncLocal.hpp @@ -45,9 +45,15 @@ class OpTensorSyncLocal : public OpBase void record() override; /** - * For host tensors it performs the map command from the host memory into local memory. Frees the staging tensors together with their respective memory. + * Does not perform any preEval commands. */ - void postSubmit() override; + virtual void preEval() override; + + /** + * For host tensors it performs the map command from the host memory into local memory. + */ + virtual void postEval() override; + private: // Never owned resources diff --git a/test/TestLogisticRegression.cpp b/test/TestLogisticRegression.cpp index 68805e86d..603a49c7d 100644 --- a/test/TestLogisticRegression.cpp +++ b/test/TestLogisticRegression.cpp @@ -1,9 +1,89 @@ #include "gtest/gtest.h" +#include "fmt/ranges.h" #include "kompute/Kompute.hpp" -TEST(LogisticRegressionAlgorithm, TestMainLogisticRegression) { +TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression) { + + uint32_t ITERATIONS = 100; + + std::vector wInVec = { 0.001, 0.001 }; + std::vector bInVec = { 0 }; + + std::shared_ptr xI{ new kp::Tensor({ 0, 1, 1, 1, 1 })}; + std::shared_ptr xJ{ new kp::Tensor({ 0, 0, 0, 1, 1 })}; + + std::shared_ptr y{ new kp::Tensor({ 0, 0, 0, 1, 1 })}; + + std::shared_ptr wIn{ + new kp::Tensor(wInVec)}; + std::shared_ptr wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 })}; + std::shared_ptr wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 })}; + + std::shared_ptr bIn{ + new kp::Tensor(bInVec)}; + std::shared_ptr bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 })}; + + std::vector> params = + {xI, xJ, y, wIn, wOutI, wOutJ, bIn, bOut}; + + { + kp::Manager mgr; + + if (std::shared_ptr sq = + mgr.getOrCreateManagedSequence("createTensors").lock()) { + + sq->begin(); + + sq->record(params); + + sq->end(); + sq->eval(); + + // Record op algo base + sq->begin(); + + sq->record({wIn, bIn}); + + sq->record>( + params, + false, // Whether to copy output from device + "test/shaders/glsl/test_logistic_regression.comp"); + + sq->record({wOutI, wOutJ, bOut}); + + sq->end(); + + // Iterate across all expected iterations + for (size_t i = 0; i < ITERATIONS; i++) { + + sq->eval(); + + for(size_t j = 0; j < bOut->size(); j++) { + wIn->data()[0] -= wOutI->data()[j]; + wIn->data()[1] -= wOutJ->data()[j]; + bIn->data()[0] -= bOut->data()[j]; + } + } + } + } + + + // Based on the inputs the outputs should be at least: + // * wi < 0.01 + // * wj > 1.0 + // * b < 0 + // TODO: Add EXPECT_DOUBLE_EQ instead + EXPECT_LT(wIn->data()[0], 0.01); + EXPECT_GT(wIn->data()[1], 1.0); + EXPECT_LT(bIn->data()[0], 0.0); + + SPDLOG_ERROR("Result wIn: {}, bIn: {}", + wIn->data(), bIn->data()); +} + +TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy) { uint32_t ITERATIONS = 100; @@ -76,6 +156,6 @@ TEST(LogisticRegressionAlgorithm, TestMainLogisticRegression) { EXPECT_GT(wIn->data()[1], 1.0); EXPECT_LT(bIn->data()[0], 0.0); - //SPDLOG_DEBUG("Result wIn: {}, bIn: {}", - // wIn->data(), bIn->data()); + SPDLOG_ERROR("Result wIn: {}, bIn: {}", + wIn->data(), bIn->data()); } diff --git a/test/TestOpTensorSyncDevice.cpp b/test/TestOpTensorSyncDevice.cpp new file mode 100644 index 000000000..e69de29bb