Added functional OpTensorSyncDevice and OpTensorSyncLocal
This commit is contained in:
parent
e6f4097acb
commit
e68d09dbdc
15 changed files with 258 additions and 50 deletions
|
|
@ -121,7 +121,11 @@ Now that we have the inputs and outputs we will be able to use them in the proce
|
|||
|
||||
Once we re-record, all the instructions that were recorded previously are cleared.
|
||||
|
||||
Because of this we can record now the new command which is just the OpAlgoBase with the LR shader.
|
||||
Because of this we can now record the new commands, which consist of the following:
|
||||
|
||||
1. Copy the tensor data from local to device
|
||||
2. Run the logistic regression shader
|
||||
3. Copy the output data
|
||||
|
||||
.. code-block:: cpp
|
||||
:linenos:
|
||||
|
|
@ -131,11 +135,15 @@ Because of this we can record now the new command which is just the OpAlgoBase w
|
|||
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorSyncDevice>({wIn, bIn});
|
||||
|
||||
sq->record<kp::OpAlgoBase<>>(
|
||||
params,
|
||||
true, // Whether to copy output from device
|
||||
false, // Whether to copy output from device
|
||||
"test/shaders/glsl/test_logistic_regression.comp");
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({wOutI, wOutJ, bOut});
|
||||
|
||||
sq->end();
|
||||
|
||||
4. Loop across number of iterations + 4-a. Submit algo operation on LR shader
|
||||
|
|
|
|||
|
|
@ -470,11 +470,24 @@ class OpBase
|
|||
virtual void record() = 0;
|
||||
|
||||
/**
|
||||
* Post submit is called after the Sequence has submitted the commands to
|
||||
* the GPU for processing, and can be used to perform any tear-down steps
|
||||
* required as the computation iteration finishes.
|
||||
* Pre eval is called before the Sequence has called eval and submitted the commands to
|
||||
* the GPU for processing, and can be used to perform any per-eval setup steps
|
||||
* required as the computation iteration begins. It's worth noting that
|
||||
* there are situations where eval can be called multiple times, so the
|
||||
* resources that are created should be idempotent in case it's called multiple
|
||||
* times in a row.
|
||||
*/
|
||||
virtual void postSubmit() = 0;
|
||||
virtual void preEval() = 0;
|
||||
|
||||
/**
|
||||
* Post eval is called after the Sequence has called eval and submitted the commands to
|
||||
* the GPU for processing, and can be used to perform any tear-down steps
|
||||
* required as the computation iteration finishes. It's worth noting that
|
||||
* there are situations where eval can be called multiple times, so the
|
||||
* resources that are destroyed should not require a re-init unless explicitly
|
||||
* provided by the user.
|
||||
*/
|
||||
virtual void postEval() = 0;
|
||||
|
||||
protected:
|
||||
// -------------- NEVER OWNED RESOURCES
|
||||
|
|
@ -966,12 +979,17 @@ class OpAlgoBase : public OpBase
|
|||
*/
|
||||
virtual void record() override;
|
||||
|
||||
/**
|
||||
* Does not perform any preEval commands.
|
||||
*/
|
||||
virtual void preEval() override;
|
||||
|
||||
/**
|
||||
* Executes after the recorded commands are submitted, and performs a copy
|
||||
* of the GPU Device memory into the staging buffer so the output data can
|
||||
* be retrieved.
|
||||
*/
|
||||
virtual void postSubmit() override;
|
||||
virtual void postEval() override;
|
||||
|
||||
protected:
|
||||
// -------------- NEVER OWNED RESOURCES
|
||||
|
|
@ -1162,7 +1180,14 @@ OpAlgoBase<tX, tY, tZ>::record()
|
|||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoBase<tX, tY, tZ>::postSubmit()
|
||||
OpAlgoBase<tX, tY, tZ>::preEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoBase<tX, tY, tZ>::postEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
|
||||
|
||||
|
|
@ -1554,11 +1579,16 @@ class OpTensorCreate : public OpBase
|
|||
*/
|
||||
void record() override;
|
||||
|
||||
/**
|
||||
* Does not perform any preEval commands.
|
||||
*/
|
||||
virtual void preEval() override;
|
||||
|
||||
/**
|
||||
* Performs a copy back into the main tensor to ensure that the data
|
||||
* contained is the one that is now being stored in the GPU.
|
||||
*/
|
||||
void postSubmit() override;
|
||||
virtual void postEval() override;
|
||||
|
||||
private:
|
||||
// Never owned resources
|
||||
|
|
@ -1605,10 +1635,15 @@ class OpTensorCopy : public OpBase
|
|||
*/
|
||||
void record() override;
|
||||
|
||||
/**
|
||||
* Does not perform any preEval commands.
|
||||
*/
|
||||
virtual void preEval() override;
|
||||
|
||||
/**
|
||||
* Copies the local vectors for all the tensors to sync the data with the gpu.
|
||||
*/
|
||||
void postSubmit() override;
|
||||
virtual void postEval() override;
|
||||
|
||||
private:
|
||||
};
|
||||
|
|
@ -1654,9 +1689,14 @@ class OpTensorSyncDevice : public OpBase
|
|||
void record() override;
|
||||
|
||||
/**
|
||||
* Does not perform any further sync functions. Frees the staging tensors together with their respective memory.
|
||||
* Performs the sync of the tensor data ahead of each eval, as eval can be called multiple times with the same op.
|
||||
*/
|
||||
void postSubmit() override;
|
||||
virtual void preEval() override;
|
||||
|
||||
/**
|
||||
* Does not perform any postEval commands.
|
||||
*/
|
||||
virtual void postEval() override;
|
||||
|
||||
private:
|
||||
// Never owned resources
|
||||
|
|
@ -1704,9 +1744,14 @@ class OpTensorSyncLocal : public OpBase
|
|||
void record() override;
|
||||
|
||||
/**
|
||||
* For host tensors it performs the map command from the host memory into local memory. Frees the staging tensors together with their respective memory.
|
||||
* Does not perform any preEval commands.
|
||||
*/
|
||||
void postSubmit() override;
|
||||
virtual void preEval() override;
|
||||
|
||||
/**
|
||||
* For host tensors it performs the map command from the host memory into local memory.
|
||||
*/
|
||||
virtual void postEval() override;
|
||||
|
||||
private:
|
||||
// Never owned resources
|
||||
|
|
|
|||
|
|
@ -57,9 +57,15 @@ OpTensorCopy::record()
|
|||
}
|
||||
|
||||
void
|
||||
OpTensorCopy::postSubmit()
|
||||
OpTensorCopy::preEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpTensorCopy postSubmit called");
|
||||
SPDLOG_DEBUG("Kompute OpTensorCopy preEval called");
|
||||
}
|
||||
|
||||
void
|
||||
OpTensorCopy::postEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpTensorCopy postEval called");
|
||||
|
||||
// Copy the data from the first tensor into all the tensors
|
||||
for (size_t i = 1; i < this->mTensors.size(); i++) {
|
||||
|
|
|
|||
|
|
@ -80,12 +80,15 @@ OpTensorCreate::record()
|
|||
}
|
||||
|
||||
void
|
||||
OpTensorCreate::postSubmit()
|
||||
OpTensorCreate::preEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpTensorCreate postSubmit called");
|
||||
SPDLOG_DEBUG("Kompute OpTensorCreate preEval called");
|
||||
}
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpTensorCreate destroying staging tensors");
|
||||
this->mStagingTensors.clear();
|
||||
void
|
||||
OpTensorCreate::postEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpTensorCreate postEval called");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -36,8 +36,8 @@ OpTensorSyncDevice::init()
|
|||
}
|
||||
|
||||
for (std::shared_ptr<Tensor> tensor: this->mTensors) {
|
||||
if (tensor->isInit()) {
|
||||
throw std::runtime_error("Kompute OpTensorSyncDevice: Tensor has already been initialized");
|
||||
if (!tensor->isInit()) {
|
||||
throw std::runtime_error("Kompute OpTensorSyncDevice: Tensor param has not been initialized");
|
||||
}
|
||||
if (tensor->tensorType() == Tensor::TensorTypes::eStorage) {
|
||||
throw std::runtime_error("Kompute OpTensorSyncLocal tensor parameter is of type TensorTypes::eStorage and hence cannot be used to receive or pass data.");
|
||||
|
|
@ -78,14 +78,25 @@ OpTensorSyncDevice::record()
|
|||
}
|
||||
|
||||
void
|
||||
OpTensorSyncDevice::postSubmit()
|
||||
OpTensorSyncDevice::preEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpTensorSyncDevice postSubmit called");
|
||||
SPDLOG_DEBUG("Kompute OpTensorSyncDevice preEval called");
|
||||
|
||||
// Remove all staging tensors as they are not required after operation
|
||||
SPDLOG_DEBUG("Kompute OpTensorSyncDevice destroying staging tensors");
|
||||
// TODO: This would cause issues if there is no CPU barrier
|
||||
this->mStagingTensors.clear();
|
||||
// Performing sync of data as eval can be called multiple times with same op
|
||||
for (size_t i = 0; i < this->mTensors.size(); i++) {
|
||||
if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
|
||||
this->mStagingTensors[i]->setData(this->mTensors[i]->data());
|
||||
this->mStagingTensors[i]->mapDataIntoHostMemory();
|
||||
} else {
|
||||
this->mTensors[i]->mapDataFromHostMemory();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
OpTensorSyncDevice::postEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpTensorSyncDevice postEval called");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -74,10 +74,17 @@ OpTensorSyncLocal::record()
|
|||
}
|
||||
|
||||
void
|
||||
OpTensorSyncLocal::postSubmit()
|
||||
OpTensorSyncLocal::preEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpTensorSyncLocal postSubmit called");
|
||||
SPDLOG_DEBUG("Kompute OpTensorSyncLocal preEval called");
|
||||
}
|
||||
|
||||
void
|
||||
OpTensorSyncLocal::postEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpTensorSyncLocal postEval called");
|
||||
|
||||
SPDLOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local");
|
||||
for (size_t i = 0; i < this->mTensors.size(); i++) {
|
||||
if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
|
||||
this->mStagingTensors[i]->mapDataFromHostMemory();
|
||||
|
|
@ -86,10 +93,6 @@ OpTensorSyncLocal::postSubmit()
|
|||
this->mTensors[i]->mapDataFromHostMemory();
|
||||
}
|
||||
}
|
||||
|
||||
// Remove all staging tensors as they are not required after operation
|
||||
SPDLOG_DEBUG("Kompute OpTensorSyncLocal destroying staging tensors");
|
||||
this->mStagingTensors.clear();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -125,6 +125,10 @@ Sequence::eval()
|
|||
return false;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < this->mOperations.size(); i++) {
|
||||
this->mOperations[i]->preEval();
|
||||
}
|
||||
|
||||
const vk::PipelineStageFlags waitStageMask =
|
||||
vk::PipelineStageFlagBits::eTransfer;
|
||||
vk::SubmitInfo submitInfo(
|
||||
|
|
@ -140,7 +144,7 @@ Sequence::eval()
|
|||
this->mDevice->destroy(fence);
|
||||
|
||||
for (size_t i = 0; i < this->mOperations.size(); i++) {
|
||||
this->mOperations[i]->postSubmit();
|
||||
this->mOperations[i]->postEval();
|
||||
}
|
||||
|
||||
SPDLOG_DEBUG("Kompute sequence EVAL success");
|
||||
|
|
|
|||
|
|
@ -120,12 +120,18 @@ class OpAlgoBase : public OpBase
|
|||
*/
|
||||
virtual void record() override;
|
||||
|
||||
|
||||
/**
|
||||
* Does not perform any preEval commands.
|
||||
*/
|
||||
virtual void preEval() override;
|
||||
|
||||
/**
|
||||
* Executes after the recorded commands are submitted, and performs a copy
|
||||
* of the GPU Device memory into the staging buffer so the output data can
|
||||
* be retrieved.
|
||||
*/
|
||||
virtual void postSubmit() override;
|
||||
virtual void postEval() override;
|
||||
|
||||
protected:
|
||||
// -------------- NEVER OWNED RESOURCES
|
||||
|
|
@ -316,7 +322,14 @@ OpAlgoBase<tX, tY, tZ>::record()
|
|||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoBase<tX, tY, tZ>::postSubmit()
|
||||
OpAlgoBase<tX, tY, tZ>::preEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase preEval called");
|
||||
}
|
||||
|
||||
template<uint32_t tX, uint32_t tY, uint32_t tZ>
|
||||
void
|
||||
OpAlgoBase<tX, tY, tZ>::postEval()
|
||||
{
|
||||
SPDLOG_DEBUG("Kompute OpAlgoBase postSubmit called");
|
||||
|
||||
|
|
|
|||
|
|
@ -90,11 +90,24 @@ class OpBase
|
|||
virtual void record() = 0;
|
||||
|
||||
/**
|
||||
* Post submit is called after the Sequence has submitted the commands to
|
||||
* the GPU for processing, and can be used to perform any tear-down steps
|
||||
* required as the computation iteration finishes.
|
||||
* Pre eval is called before the Sequence has called eval and submitted the commands to
|
||||
* the GPU for processing, and can be used to perform any per-eval setup steps
|
||||
* required as the computation iteration begins. It's worth noting that
|
||||
* there are situations where eval can be called multiple times, so the
|
||||
* resources that are created should be idempotent in case it's called multiple
|
||||
* times in a row.
|
||||
*/
|
||||
virtual void postSubmit() = 0;
|
||||
virtual void preEval() = 0;
|
||||
|
||||
/**
|
||||
* Post eval is called after the Sequence has called eval and submitted the commands to
|
||||
* the GPU for processing, and can be used to perform any tear-down steps
|
||||
* required as the computation iteration finishes. It's worth noting that
|
||||
* there are situations where eval can be called multiple times, so the
|
||||
* resources that are destroyed should not require a re-init unless explicitly
|
||||
* provided by the user.
|
||||
*/
|
||||
virtual void postEval() = 0;
|
||||
|
||||
protected:
|
||||
// -------------- NEVER OWNED RESOURCES
|
||||
|
|
|
|||
|
|
@ -44,10 +44,15 @@ class OpTensorCopy : public OpBase
|
|||
*/
|
||||
void record() override;
|
||||
|
||||
/**
|
||||
* Does not perform any preEval commands.
|
||||
*/
|
||||
virtual void preEval() override;
|
||||
|
||||
/**
|
||||
* Copies the local vectors for all the tensors to sync the data with the gpu.
|
||||
*/
|
||||
void postSubmit() override;
|
||||
virtual void postEval() override;
|
||||
|
||||
private:
|
||||
};
|
||||
|
|
|
|||
|
|
@ -56,11 +56,17 @@ class OpTensorCreate : public OpBase
|
|||
*/
|
||||
void record() override;
|
||||
|
||||
/**
|
||||
* Does not perform any preEval commands.
|
||||
*/
|
||||
virtual void preEval() override;
|
||||
|
||||
/**
|
||||
* Performs a copy back into the main tensor to ensure that the data
|
||||
* contained is the one that is now being stored in the GPU.
|
||||
*/
|
||||
void postSubmit() override;
|
||||
virtual void postEval() override;
|
||||
|
||||
|
||||
private:
|
||||
// Never owned resources
|
||||
|
|
|
|||
|
|
@ -45,9 +45,14 @@ class OpTensorSyncDevice : public OpBase
|
|||
void record() override;
|
||||
|
||||
/**
|
||||
* Does not perform any further sync functions. Frees the staging tensors together with their respective memory.
|
||||
* Performs the sync of the tensor data ahead of each eval, as eval can be called multiple times with the same op.
|
||||
*/
|
||||
void postSubmit() override;
|
||||
virtual void preEval() override;
|
||||
|
||||
/**
|
||||
* Does not perform any postEval commands.
|
||||
*/
|
||||
virtual void postEval() override;
|
||||
|
||||
private:
|
||||
// Never owned resources
|
||||
|
|
|
|||
|
|
@ -45,9 +45,15 @@ class OpTensorSyncLocal : public OpBase
|
|||
void record() override;
|
||||
|
||||
/**
|
||||
* For host tensors it performs the map command from the host memory into local memory. Frees the staging tensors together with their respective memory.
|
||||
* Does not perform any preEval commands.
|
||||
*/
|
||||
void postSubmit() override;
|
||||
virtual void preEval() override;
|
||||
|
||||
/**
|
||||
* For host tensors it performs the map command from the host memory into local memory.
|
||||
*/
|
||||
virtual void postEval() override;
|
||||
|
||||
|
||||
private:
|
||||
// Never owned resources
|
||||
|
|
|
|||
|
|
@ -1,9 +1,89 @@
|
|||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "fmt/ranges.h"
|
||||
#include "kompute/Kompute.hpp"
|
||||
|
||||
TEST(LogisticRegressionAlgorithm, TestMainLogisticRegression) {
|
||||
TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegression) {
|
||||
|
||||
uint32_t ITERATIONS = 100;
|
||||
|
||||
std::vector<float> wInVec = { 0.001, 0.001 };
|
||||
std::vector<float> bInVec = { 0 };
|
||||
|
||||
std::shared_ptr<kp::Tensor> xI{ new kp::Tensor({ 0, 1, 1, 1, 1 })};
|
||||
std::shared_ptr<kp::Tensor> xJ{ new kp::Tensor({ 0, 0, 0, 1, 1 })};
|
||||
|
||||
std::shared_ptr<kp::Tensor> y{ new kp::Tensor({ 0, 0, 0, 1, 1 })};
|
||||
|
||||
std::shared_ptr<kp::Tensor> wIn{
|
||||
new kp::Tensor(wInVec)};
|
||||
std::shared_ptr<kp::Tensor> wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
|
||||
std::shared_ptr<kp::Tensor> wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
|
||||
|
||||
std::shared_ptr<kp::Tensor> bIn{
|
||||
new kp::Tensor(bInVec)};
|
||||
std::shared_ptr<kp::Tensor> bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
|
||||
|
||||
std::vector<std::shared_ptr<kp::Tensor>> params =
|
||||
{xI, xJ, y, wIn, wOutI, wOutJ, bIn, bOut};
|
||||
|
||||
{
|
||||
kp::Manager mgr;
|
||||
|
||||
if (std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.getOrCreateManagedSequence("createTensors").lock()) {
|
||||
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorCreate>(params);
|
||||
|
||||
sq->end();
|
||||
sq->eval();
|
||||
|
||||
// Record op algo base
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorSyncDevice>({wIn, bIn});
|
||||
|
||||
sq->record<kp::OpAlgoBase<>>(
|
||||
params,
|
||||
false, // Whether to copy output from device
|
||||
"test/shaders/glsl/test_logistic_regression.comp");
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({wOutI, wOutJ, bOut});
|
||||
|
||||
sq->end();
|
||||
|
||||
// Iterate across all expected iterations
|
||||
for (size_t i = 0; i < ITERATIONS; i++) {
|
||||
|
||||
sq->eval();
|
||||
|
||||
for(size_t j = 0; j < bOut->size(); j++) {
|
||||
wIn->data()[0] -= wOutI->data()[j];
|
||||
wIn->data()[1] -= wOutJ->data()[j];
|
||||
bIn->data()[0] -= bOut->data()[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Based on the inputs the outputs should be at least:
|
||||
// * wi < 0.01
|
||||
// * wj > 1.0
|
||||
// * b < 0
|
||||
// TODO: Add EXPECT_DOUBLE_EQ instead
|
||||
EXPECT_LT(wIn->data()[0], 0.01);
|
||||
EXPECT_GT(wIn->data()[1], 1.0);
|
||||
EXPECT_LT(bIn->data()[0], 0.0);
|
||||
|
||||
SPDLOG_ERROR("Result wIn: {}, bIn: {}",
|
||||
wIn->data(), bIn->data());
|
||||
}
|
||||
|
||||
TEST(TestLogisticRegressionAlgorithm, TestMainLogisticRegressionManualCopy) {
|
||||
|
||||
uint32_t ITERATIONS = 100;
|
||||
|
||||
|
|
@ -76,6 +156,6 @@ TEST(LogisticRegressionAlgorithm, TestMainLogisticRegression) {
|
|||
EXPECT_GT(wIn->data()[1], 1.0);
|
||||
EXPECT_LT(bIn->data()[0], 0.0);
|
||||
|
||||
//SPDLOG_DEBUG("Result wIn: {}, bIn: {}",
|
||||
// wIn->data(), bIn->data());
|
||||
SPDLOG_ERROR("Result wIn: {}, bIn: {}",
|
||||
wIn->data(), bIn->data());
|
||||
}
|
||||
|
|
|
|||
0
test/TestOpTensorSyncDevice.cpp
Normal file
0
test/TestOpTensorSyncDevice.cpp
Normal file
Loading…
Add table
Add a link
Reference in a new issue