diff --git a/python/src/main.cpp b/python/src/main.cpp index cb538112b..36be7ac7a 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -54,17 +54,20 @@ PYBIND11_MODULE(kp, m) { py::class_>(m, "OpBase"); - py::class_>(m, "OpTensorSyncDevice") + py::class_>(m, "OpTensorSyncDevice", py::base()) .def(py::init>&>()); - py::class_>(m, "OpTensorSyncLocal") + py::class_>(m, "OpTensorSyncLocal", py::base()) .def(py::init>&>()); - py::class_>(m, "OpTensorCopy") + py::class_>(m, "OpTensorCopy", py::base()) .def(py::init>&>()); - py::class_>(m, "OpAlgoDispatch") - .def(py::init&, bool>()); + py::class_>(m, "OpAlgoDispatch", py::base()) + .def(py::init&>()); + + py::class_>(m, "OpMult", py::base()) + .def(py::init>&,const std::shared_ptr&>()); py::class_>(m, "Algorithm") .def("get_tensors", &kp::Algorithm::getTensors) @@ -112,8 +115,7 @@ PYBIND11_MODULE(kp, m) { .def("__len__", &kp::Tensor::size, "Retrieves the size of the Tensor data as per the local Tensor memory.") .def("tensor_type", &kp::Tensor::tensorType, "Retreves the memory type of the tensor.") .def("is_init", &kp::Tensor::isInit, "Checks whether the tensor GPU memory has been initialised.") - .def("map_data_from_host", &kp::Tensor::mapDataFromHostMemory, "Maps data into GPU memory from tensor local data.") - .def("map_data_into_host", &kp::Tensor::mapDataIntoHostMemory, "Maps data from GPU memory into tensor local data."); + .def("destroy", &kp::Tensor::destroy, "Destroy tensor GPU resources."); py::class_>(m, "Sequence") .def("record", [](kp::Sequence& self, std::shared_ptr op) { return self.record(op); }) @@ -147,15 +149,17 @@ PYBIND11_MODULE(kp, m) { .def("algorithm", [](kp::Manager& self, const std::vector>& tensors, const py::bytes& spirv, - const kp::Workgroup& workgroup = {}, - const kp::Constants& spec_consts = {}, - const kp::Constants& push_consts = {}) { + const kp::Workgroup& workgroup, + const kp::Constants& spec_consts, + const kp::Constants& push_consts) { py::buffer_info info(py::buffer(spirv).request()); const char *data = reinterpret_cast(info.ptr); size_t length = static_cast(info.size); std::vector spirvVec((uint32_t*)data, (uint32_t*)(data + length)); return self.algorithm(tensors, spirvVec, workgroup, spec_consts, push_consts); - }); + }, + "Algorithm initialisation function", + py::arg("tensors"), py::arg("spirv"), py::arg("workgroup") = kp::Workgroup(), py::arg("spec_consts") = kp::Constants(), py::arg("push_consts") = kp::Constants()); #ifdef VERSION_INFO m.attr("__version__") = VERSION_INFO; diff --git a/python/test/test_array_multiplication.py b/python/test/test_array_multiplication.py index bcad405a6..55d764805 100644 --- a/python/test/test_array_multiplication.py +++ b/python/test/test_array_multiplication.py @@ -9,29 +9,26 @@ def test_array_multiplication(): mgr = kp.Manager() # 2. Create Kompute Tensors to hold data - tensor_in_a = kp.Tensor([2, 2, 2]) - tensor_in_b = kp.Tensor([1, 2, 3]) - tensor_out = kp.Tensor([0, 0, 0]) + tensor_in_a = mgr.tensor([2, 2, 2]) + tensor_in_b = mgr.tensor([1, 2, 3]) + tensor_out = mgr.tensor([0, 0, 0]) - # 3. Initialise the Kompute Tensors in the GPU - mgr.rebuild([tensor_in_a, tensor_in_b, tensor_out]) + params = [tensor_in_a, tensor_in_b, tensor_out] # 4. Define the multiplication shader code to run on the GPU @ps.python2shader - def compute_shader_multiply(index=("input", "GlobalInvocationId", ps.ivec3), + def compute_mult(index=("input", "GlobalInvocationId", ps.ivec3), data1=("buffer", 0, ps.Array(ps.f32)), data2=("buffer", 1, ps.Array(ps.f32)), data3=("buffer", 2, ps.Array(ps.f32))): i = index.x data3[i] = data1[i] * data2[i] - # 5. Run shader code against our previously defined tensors - mgr.eval_algo_data_def( - [tensor_in_a, tensor_in_b, tensor_out], - compute_shader_multiply.to_spirv()) - - # 6. Sync tensor data from GPU back to local - mgr.eval_tensor_sync_local_def([tensor_out]) + (mgr.sequence() + .record(kp.OpTensorSyncDevice(params)) + .record(kp.OpAlgoDispatch(mgr.algorithm(params, compute_mult.to_spirv()))) + .record(kp.OpTensorSyncLocal([tensor_out])) + .eval()) assert tensor_out.data() == [2.0, 4.0, 6.0] assert np.all(tensor_out.numpy() == [2.0, 4.0, 6.0]) diff --git a/python/test/test_kompute.py b/python/test/test_kompute.py index e923c6393..ad4b77391 100644 --- a/python/test/test_kompute.py +++ b/python/test/test_kompute.py @@ -7,6 +7,8 @@ import pyshader as ps DIRNAME = os.path.dirname(os.path.abspath(__file__)) +kp_log = logging.getLogger("kp") + # TODO: Add example with file #def test_opalgobase_file(): # """ @@ -62,9 +64,9 @@ void main() algo = mgr.algorithm(params, spirv) (mgr.sequence() - .record(kp.OpTensorSyncLocal(params)) - .record(kp.OpAlgoDispatch(algo)) .record(kp.OpTensorSyncDevice(params)) + .record(kp.OpAlgoDispatch(algo)) + .record(kp.OpTensorSyncLocal(params)) .eval()) assert tensor_out.data() == [2.0, 4.0, 6.0] @@ -102,9 +104,9 @@ def test_sequence(): sq = mgr.sequence() - sq.record(kp.OpTensorSyncLocal(params)) - sq.record(kp.OpAlgoDispatch(algo)) sq.record(kp.OpTensorSyncDevice(params)) + sq.record(kp.OpAlgoDispatch(algo)) + sq.record(kp.OpTensorSyncLocal(params)) sq.eval() @@ -141,16 +143,14 @@ def test_workgroup(): data1[i] = f32(gl_idx.x) data2[i] = f32(gl_idx.y) - algo = mgr.algorithm([tensor_a, tensor_b], compute_shader_wg.to_spirv(), (16,8,1), [], []) + algo = mgr.algorithm([tensor_a, tensor_b], compute_shader_wg.to_spirv(), (16,8,1)) (mgr.sequence() .record(kp.OpTensorSyncDevice([tensor_a, tensor_b])) .record(kp.OpAlgoDispatch(algo)) - .record(kp.OpAlgoTensorSyncLocal([tensor_a, tensor_b])) + .record(kp.OpTensorSyncLocal([tensor_a, tensor_b])) .eval()) - assert sq.is_init() == False - print(tensor_a.numpy()) print(tensor_b.numpy()) diff --git a/python/test/test_logistic_regression.py b/python/test/test_logistic_regression.py index 6783bbc87..4bd0c28fa 100644 --- a/python/test/test_logistic_regression.py +++ b/python/test/test_logistic_regression.py @@ -46,45 +46,39 @@ def test_logistic_regression(): mgr = kp.Manager(0) # First we create input and ouput tensors for shader - tensor_x_i = kp.Tensor([0.0, 1.0, 1.0, 1.0, 1.0]) - tensor_x_j = kp.Tensor([0.0, 0.0, 0.0, 1.0, 1.0]) + tensor_x_i = mgr.tensor([0.0, 1.0, 1.0, 1.0, 1.0]) + tensor_x_j = mgr.tensor([0.0, 0.0, 0.0, 1.0, 1.0]) - tensor_y = kp.Tensor([0.0, 0.0, 0.0, 1.0, 1.0]) + tensor_y = mgr.tensor([0.0, 0.0, 0.0, 1.0, 1.0]) - tensor_w_in = kp.Tensor([0.001, 0.001]) - tensor_w_out_i = kp.Tensor([0.0, 0.0, 0.0, 0.0, 0.0]) - tensor_w_out_j = kp.Tensor([0.0, 0.0, 0.0, 0.0, 0.0]) + tensor_w_in = mgr.tensor([0.001, 0.001]) + tensor_w_out_i = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0]) + tensor_w_out_j = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0]) - tensor_b_in = kp.Tensor([0.0]) - tensor_b_out = kp.Tensor([0.0, 0.0, 0.0, 0.0, 0.0]) + tensor_b_in = mgr.tensor([0.0]) + tensor_b_out = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0]) - tensor_l_out = kp.Tensor([0.0, 0.0, 0.0, 0.0, 0.0]) + tensor_l_out = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0]) - tensor_m = kp.Tensor([ tensor_y.size() ]) + tensor_m = mgr.tensor([ tensor_y.size() ]) # We store them in an array for easier interaction params = [tensor_x_i, tensor_x_j, tensor_y, tensor_w_in, tensor_w_out_i, tensor_w_out_j, tensor_b_in, tensor_b_out, tensor_l_out, tensor_m] - mgr.rebuild(params) + mgr.sequence().eval(kp.OpTensorSyncDevice(params)) # Create a managed sequence sq = mgr.sequence() - # Clear previous operations and begin recording for new operations - sq.begin() - # Record operation to sync memory from local to GPU memory - sq.record_tensor_sync_device([tensor_w_in, tensor_b_in]) + sq.record(kp.OpTensorSyncDevice([tensor_w_in, tensor_b_in])) # Record operation to execute GPU shader against all our parameters - sq.record_algo_data(params, compute_shader.to_spirv()) + sq.record(kp.OpAlgoDispatch(mgr.algorithm(params, compute_shader.to_spirv()))) # Record operation to sync memory from GPU to local memory - sq.record_tensor_sync_local([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out]) - - # Stop recording operations - sq.end() + sq.record(kp.OpTensorSyncLocal([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out])) ITERATIONS = 100 learning_rate = 0.1 diff --git a/single_include/AggregateHeaders.cpp b/single_include/AggregateHeaders.cpp index 23372873b..25dc04edb 100644 --- a/single_include/AggregateHeaders.cpp +++ b/single_include/AggregateHeaders.cpp @@ -6,10 +6,10 @@ #include "kompute/Tensor.hpp" #include "kompute/Algorithm.hpp" #include "kompute/operations/OpBase.hpp" -#include "kompute/operations/OpMult.hpp" #include "kompute/operations/OpTensorCopy.hpp" #include "kompute/operations/OpTensorSyncDevice.hpp" #include "kompute/operations/OpTensorSyncLocal.hpp" #include "kompute/operations/OpAlgoDispatch.hpp" +#include "kompute/operations/OpMult.hpp" #include "kompute/Sequence.hpp" #include "kompute/Manager.hpp" diff --git a/single_include/kompute/Kompute.hpp b/single_include/kompute/Kompute.hpp index b1d278081..16c4d6266 100755 --- a/single_include/kompute/Kompute.hpp +++ b/single_include/kompute/Kompute.hpp @@ -1247,106 +1247,6 @@ class OpBase } // End namespace kp -#include - -namespace kp { - -/** - * Operation that provides a general abstraction that simplifies the use of - * algorithm and parameter components which can be used with shaders. - * By default it enables the user to provide a dynamic number of tensors - * which are then passed as inputs. - */ -class OpAlgoDispatch : public OpBase -{ - public: - - OpAlgoDispatch(const std::shared_ptr& algorithm, bool skipAlgoCheck = false); - - /** - * Default destructor, which is in charge of destroying the algorithm - * components but does not destroy the underlying tensors - */ - virtual ~OpAlgoDispatch() override; - - /** - * This records the commands that are to be sent to the GPU. This includes - * the barriers that ensure the memory has been copied before going in and - * out of the shader, as well as the dispatch operation that sends the - * shader processing to the gpu. This function also records the GPU memory - * copy of the output data for the staging buffer so it can be read by the - * host. - */ - virtual void record(std::shared_ptr commandBuffer) override; - - /** - * Does not perform any preEval commands. - */ - virtual void preEval() override; - - /** - * Executes after the recorded commands are submitted, and performs a copy - * of the GPU Device memory into the staging buffer so the output data can - * be retrieved. - */ - virtual void postEval() override; - -private: - // -------------- ALWAYS OWNED RESOURCES - std::shared_ptr mAlgorithm; -}; - -} // End namespace kp - -namespace kp { - -/** - * Operation that performs multiplication on two tensors and outpus on third - * tensor. - */ -class OpMult : public OpAlgoDispatch -{ - public: - - /** - * Default constructor with parameters that provides the bare minimum - * requirements for the operations to be able to create and manage their - * sub-components. - * - * @param physicalDevice Vulkan physical device used to find device queues - * @param device Vulkan logical device for passing to Algorithm - * @param commandBuffer Vulkan Command Buffer to record commands into - * @param tensors Tensors that are to be used in this operation - * @param komputeWorkgroup Optional parameter to specify the layout for processing - */ - OpMult(std::vector> tensors, std::shared_ptr algorithm) - : OpAlgoDispatch(algorithm, true) - { - KP_LOG_DEBUG("Kompute OpMult constructor with params"); - - if (tensors.size() != 3) { - throw std::runtime_error("Kompute OpMult expected 3 tensors but got " + tensors.size()); - } - - std::vector spirv( - (uint32_t*)shader_data::shaders_glsl_opmult_comp_spv, - (uint32_t*)(shader_data::shaders_glsl_opmult_comp_spv + - kp::shader_data::shaders_glsl_opmult_comp_spv_len)); - - algorithm->rebuild(tensors, spirv); - } - - /** - * Default destructor, which is in charge of destroying the algorithm - * components but does not destroy the underlying tensors - */ - virtual ~OpMult() override { - KP_LOG_DEBUG("Kompute OpMult destructor started"); - } -}; - -} // End namespace kp - namespace kp { /** @@ -1484,6 +1384,106 @@ class OpTensorSyncLocal : public OpBase namespace kp { +/** + * Operation that provides a general abstraction that simplifies the use of + * algorithm and parameter components which can be used with shaders. + * By default it enables the user to provide a dynamic number of tensors + * which are then passed as inputs. + */ +class OpAlgoDispatch : public OpBase +{ + public: + + OpAlgoDispatch(const std::shared_ptr& algorithm); + + /** + * Default destructor, which is in charge of destroying the algorithm + * components but does not destroy the underlying tensors + */ + virtual ~OpAlgoDispatch() override; + + /** + * This records the commands that are to be sent to the GPU. This includes + * the barriers that ensure the memory has been copied before going in and + * out of the shader, as well as the dispatch operation that sends the + * shader processing to the gpu. This function also records the GPU memory + * copy of the output data for the staging buffer so it can be read by the + * host. + */ + virtual void record(std::shared_ptr commandBuffer) override; + + /** + * Does not perform any preEval commands. + */ + virtual void preEval() override; + + /** + * Executes after the recorded commands are submitted, and performs a copy + * of the GPU Device memory into the staging buffer so the output data can + * be retrieved. + */ + virtual void postEval() override; + +private: + // -------------- ALWAYS OWNED RESOURCES + std::shared_ptr mAlgorithm; +}; + +} // End namespace kp + +#include + +namespace kp { + +/** + * Operation that performs multiplication on two tensors and outpus on third + * tensor. + */ +class OpMult : public OpAlgoDispatch +{ + public: + + /** + * Default constructor with parameters that provides the bare minimum + * requirements for the operations to be able to create and manage their + * sub-components. + * + * @param physicalDevice Vulkan physical device used to find device queues + * @param device Vulkan logical device for passing to Algorithm + * @param commandBuffer Vulkan Command Buffer to record commands into + * @param tensors Tensors that are to be used in this operation + * @param komputeWorkgroup Optional parameter to specify the layout for processing + */ + OpMult(std::vector> tensors, std::shared_ptr algorithm) + : OpAlgoDispatch(algorithm) + { + KP_LOG_DEBUG("Kompute OpMult constructor with params"); + + if (tensors.size() != 3) { + throw std::runtime_error("Kompute OpMult expected 3 tensors but got " + tensors.size()); + } + + std::vector spirv( + (uint32_t*)shader_data::shaders_glsl_opmult_comp_spv, + (uint32_t*)(shader_data::shaders_glsl_opmult_comp_spv + + kp::shader_data::shaders_glsl_opmult_comp_spv_len)); + + algorithm->rebuild(tensors, spirv); + } + + /** + * Default destructor, which is in charge of destroying the algorithm + * components but does not destroy the underlying tensors + */ + virtual ~OpMult() override { + KP_LOG_DEBUG("Kompute OpMult destructor started"); + } +}; + +} // End namespace kp + +namespace kp { + /** * Container of operations that can be sent to GPU as batch */ diff --git a/src/Algorithm.cpp b/src/Algorithm.cpp index 4e532e269..f8ee78e3e 100644 --- a/src/Algorithm.cpp +++ b/src/Algorithm.cpp @@ -78,7 +78,7 @@ Algorithm::destroy() { return; } - if (this->mFreePipeline) { + if (this->mFreePipeline && this->mPipeline) { KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline"); if (!this->mPipeline) { KP_LOG_WARN("Kompute Algorithm Error requested to destroy " @@ -90,7 +90,7 @@ Algorithm::destroy() { this->mPipeline = nullptr; } - if (this->mFreePipelineCache) { + if (this->mFreePipelineCache && this->mPipelineCache) { KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline cache"); if (!this->mPipelineCache) { KP_LOG_WARN("Kompute Algorithm Error requested to destroy " @@ -102,7 +102,7 @@ Algorithm::destroy() { this->mPipelineCache = nullptr; } - if (this->mFreePipelineLayout) { + if (this->mFreePipelineLayout && this->mPipelineLayout) { KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline layout"); if (!this->mPipelineLayout) { KP_LOG_WARN("Kompute Algorithm Error requested to destroy " @@ -114,7 +114,7 @@ Algorithm::destroy() { this->mPipelineLayout = nullptr; } - if (this->mFreeShaderModule) { + if (this->mFreeShaderModule && this->mShaderModule) { KP_LOG_DEBUG("Kompute Algorithm Destroying shader module"); if (!this->mShaderModule) { KP_LOG_WARN("Kompute Algorithm Error requested to destroy shader " @@ -129,7 +129,7 @@ Algorithm::destroy() { // We don't call freeDescriptorSet as the descriptor pool is not created with // VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT more at // (https://www.khronos.org/registry/vulkan/specs/1.0/html/vkspec.html#VUID-vkFreeDescriptorSets-descriptorPool-00312)) - //if (this->mFreeDescriptorSet) { + //if (this->mFreeDescriptorSet && this->mDescriptorSet) { // KP_LOG_DEBUG("Kompute Algorithm Freeing Descriptor Set"); // if (!this->mDescriptorSet) { // KP_LOG_WARN( @@ -140,7 +140,7 @@ Algorithm::destroy() { // this->mDescriptorSet = nullptr; //} - if (this->mFreeDescriptorSetLayout) { + if (this->mFreeDescriptorSetLayout && this->mDescriptorSetLayout) { KP_LOG_DEBUG("Kompute Algorithm Destroying Descriptor Set Layout"); if (!this->mDescriptorSetLayout) { KP_LOG_WARN("Kompute Algorithm Error requested to destroy " @@ -152,7 +152,7 @@ Algorithm::destroy() { this->mDescriptorSetLayout = nullptr; } - if (this->mFreeDescriptorPool) { + if (this->mFreeDescriptorPool && this->mDescriptorPool) { KP_LOG_DEBUG("Kompute Algorithm Destroying Descriptor Pool"); if (!this->mDescriptorPool) { KP_LOG_WARN("Kompute Algorithm Error requested to destroy " diff --git a/src/Manager.cpp b/src/Manager.cpp index 92ffaea9c..2ee94a2ee 100644 --- a/src/Manager.cpp +++ b/src/Manager.cpp @@ -4,8 +4,6 @@ #include "kompute/Manager.hpp" -#include "kompute/operations/OpAlgoDispatch.hpp" - namespace kp { #if DEBUG diff --git a/src/OpAlgoDispatch.cpp b/src/OpAlgoDispatch.cpp index 09050fba0..a20900189 100644 --- a/src/OpAlgoDispatch.cpp +++ b/src/OpAlgoDispatch.cpp @@ -4,14 +4,10 @@ namespace kp { -OpAlgoDispatch::OpAlgoDispatch(const std::shared_ptr& algorithm, bool skipAlgoCheck) +OpAlgoDispatch::OpAlgoDispatch(const std::shared_ptr& algorithm) { KP_LOG_DEBUG("Kompute OpAlgoDispatch constructor"); - if (!skipAlgoCheck && !algorithm->isInit()) { - throw std::runtime_error("Kompute OpAlgoDispatch constructor with non initialised algorithm"); - } - this->mAlgorithm = algorithm; } diff --git a/src/include/kompute/operations/OpAlgoDispatch.hpp b/src/include/kompute/operations/OpAlgoDispatch.hpp index d80c6bd91..1b5ab1bf0 100644 --- a/src/include/kompute/operations/OpAlgoDispatch.hpp +++ b/src/include/kompute/operations/OpAlgoDispatch.hpp @@ -17,7 +17,7 @@ class OpAlgoDispatch : public OpBase { public: - OpAlgoDispatch(const std::shared_ptr& algorithm, bool skipAlgoCheck = false); + OpAlgoDispatch(const std::shared_ptr& algorithm); /** * Default destructor, which is in charge of destroying the algorithm diff --git a/src/include/kompute/operations/OpMult.hpp b/src/include/kompute/operations/OpMult.hpp index fea38bdee..992b0e8a0 100644 --- a/src/include/kompute/operations/OpMult.hpp +++ b/src/include/kompute/operations/OpMult.hpp @@ -33,7 +33,7 @@ class OpMult : public OpAlgoDispatch * @param komputeWorkgroup Optional parameter to specify the layout for processing */ OpMult(std::vector> tensors, std::shared_ptr algorithm) - : OpAlgoDispatch(algorithm, true) + : OpAlgoDispatch(algorithm) { KP_LOG_DEBUG("Kompute OpMult constructor with params");