From 44fd0e2f1a599cb705ded60f9a4c699b06621a4c Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Thu, 14 Jan 2021 19:58:41 +0000 Subject: [PATCH] Added simplified examples on readme --- README.md | 278 ++++++++++++++++-------------------------------------- 1 file changed, 79 insertions(+), 199 deletions(-) diff --git a/README.md b/README.md index 64025b883..462c263f3 100644 --- a/README.md +++ b/README.md @@ -80,16 +80,23 @@ You are able to try out the interactive Colab Notebooks which allow you to use a -### Your First Kompute (Simple Version) +### Your First Kompute -This simple example will show the basics of Kompute through the high level API. +Below you can find both the C++ and Python version of a simple GPU multiplication snippet with Kompute. + +In both examples the steps carried out will include: 1. Create Kompute Manager with default settings (device 0 and first compute compatible queue) 2. Create and initialise Kompute Tensors through manager -3. Run multiplication operation synchronously -4. Map results back from GPU memory to print the results +3. Specify "multiply shader" code (can also be raw string, spir-v bytes or file path) +4. Run multiplication operation synchronously +5. Map results back from GPU memory to print the results -View the [extended version](#your-first-kompute-extended-version) or [more examples](#simple-examples). +#### Simple C++ Example + +The C++ interface provides lower level access to the native components of Kompute and Vulkan, enabling for advanced optimizations as well as extension of components. + +To see a full breakdown you can read further in the [C++ Class Reference](https://kompute.cc/overview/reference.html). ```c++ int main() { @@ -102,141 +109,77 @@ int main() { auto tensorInB = mgr.buildTensor({ 1., 2., 3. }); auto tensorOut = mgr.buildTensor({ 0., 0., 0. }); + // 3. Specify "multiply shader" code (can also be raw string, spir-v bytes or file path) + std::string shaderString = (R"( + #version 450 + + layout (local_size_x = 1) in; + + // The input tensors bind index is relative to index in parameter passed + layout(set = 0, binding = 0) buffer bina { float tina[]; }; + layout(set = 0, binding = 1) buffer binb { float tinb[]; }; + layout(set = 0, binding = 2) buffer bout { float tout[]; }; + + void main() { + uint index = gl_GlobalInvocationID.x; + tout[index] = tina[index] * tinb[index]; + } + )"); + // 3. Run multiplication operation synchronously mgr.evalOpDefault( - { tensorInA, tensorInB, tensorOut }); + { tensorInA, tensorInB, tensorOut }, + std::vector(shaderString.begin(), shaderString.end())); // 4. Map results back from GPU memory to print the results mgr.evalOpDefault({ tensorInA, tensorInB, tensorOut }); // Prints the output which is Output: { 2, 4, 6 } - std::cout << "Output: { "; - for (const float& elem : tensorOut->data()) { - std::cout << elem << " "; - } - std::cout << "}" << std::endl; + for (const float& elem : tensorOut->data()) std::cout << elem << " "; } + ``` -## Your First Kompute (Extended Version) +#### Simple Python Example -We will now show the [same example as above](#your-first-kompute-simple-version) but leveraging more advanced Kompute features: +The Python interface provides a higher level interactive interface that enables for experimentation whilst ensuring high performance and fast development workflows. -1. Create Kompute Manager with explicit device 0 and single queue of familyIndex 2 -2. Explicitly create Kompute Tensors without initializing in GPU -3. Initialise the Kompute Tensor in GPU memory and map data into GPU -4. Run operation with custom compute shader code asynchronously with explicit dispatch layout -5. Create managed sequence to submit batch operations to the CPU -6. Map data back to host by running the sequence of batch operations +For further details you can read the [Python Package documentation](https://kompute.cc/overview/python-package.html) or the [Python Class Reference documentation](https://kompute.cc/overview/python-reference.html). -View [more examples](https://kompute.cc/overview/advanced-examples.html#simple-examples). +```python +mgr = Manager() -```c++ -int main() { +# Can be initialized with List[] or np.Array +tensor_in_a = Tensor([2, 2, 2]) +tensor_in_b = Tensor([1, 2, 3]) +tensor_out = Tensor([0, 0, 0]) - // 1. Create Kompute Manager with explicit device 0 and single queue of familyIndex 2 - kp::Manager mgr(0, { 2 }); +mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out]) - // 2. Explicitly create Kompute Tensors without initializing in GPU - auto tensorInA = std::make_shared(kp::Tensor({ 2., 2., 2. })); - auto tensorInB = std::make_shared(kp::Tensor({ 1., 2., 3. })); - auto tensorOut = std::make_shared(kp::Tensor({ 0., 0., 0. })); +# Define the function via PyShader or directly as glsl string or spirv bytes +@python2shader +def compute_shader_multiply(index=("input", "GlobalInvocationId", ivec3), + data1=("buffer", 0, Array(f32)), + data2=("buffer", 1, Array(f32)), + data3=("buffer", 2, Array(f32))): + i = index.x + data3[i] = data1[i] * data2[i] - // 3. Initialise the Kompute Tensor in GPU memory and map data into GPU - mgr.evalOpDefault({ tensorInA, tensorInB, tensorOut }); +# Run shader operation synchronously +mgr.eval_algo_data_def( + [tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv()) - // 4. Run operation with custom compute shader code asynchronously with explicit dispatch layout - mgr.evalOpAsyncDefault( - { tensorInA, tensorInB, tensorOut }, - shaderData); // "shaderData" defined is below and can be glsl/spirv string, or path to file +# Alternatively can pass raw string/bytes: +# shaderFileData = """ shader code here... """ +# mgr.eval_algo_data_def([tensor_in_a, tensor_in_b, tensor_out], shaderFileData) - // 4.1. Before submitting sequence batch we wait for the async operation - mgr.evalOpAwaitDefault(); +mgr.eval_await_def() - // 5. Create managed sequence to submit batch operations to the CPU - std::shared_ptr sq = mgr.getOrCreateManagedSequence("seq"); +mgr.eval_tensor_sync_local_def([tensor_out]) - // 5.1. Explicitly begin recording batch commands - sq->begin(); - - // 5.2. Record batch commands - sq->record({ tensorInA }); - sq->record({ tensorInB }); - sq->record({ tensorOut }); - - // 5.3. Explicitly stop recording batch commands - sq->end(); - - // 6. Map data back to host by running the sequence of batch operations - sq->eval(); - - // Prints the output which is Output: { 2, 4, 6 } - std::cout << "Output: { "; - for (const float& elem : tensorOut->data()) { - std::cout << elem << " "; - } - std::cout << "}" << std::endl; -} +assert tensor_out.data() == [2.0, 4.0, 6.0] ``` -Your shader can be provided as raw glsl/hlsl string, SPIR-V bytes array (using our CLI), or string path to file containing either. Below are the examples of the valid ways of providing shader. - -#### Passing raw GLSL/HLSL string - -```c++ -static std::string shaderString = (R"( - #version 450 - - layout (local_size_x = 1) in; - - // The input tensors bind index is relative to index in parameter passed - layout(set = 0, binding = 0) buffer bina { float tina[]; }; - layout(set = 0, binding = 1) buffer binb { float tinb[]; }; - layout(set = 0, binding = 2) buffer bout { float tout[]; }; - - void main() { - uint index = gl_GlobalInvocationID.x; - tout[index] = tina[index] * tinb[index]; - } -)"); -static std::vector shaderData(shaderString.begin(), shaderString.end()); -``` - -#### Passing SPIR-V Bytes array - -You can use the Kompute [shader-to-cpp-header CLI](https://kompute.cc/overview/shaders-to-headers.html) to convert your GLSL/HLSL or SPIR-V shader into C++ header file (see documentation link for more info). This is useful if you want your binary to be compiled with all relevant artifacts. - -```c++ -static std::vector shaderData = { 0x03, //... spirv bytes go here) -``` - -#### Path to file containing raw glsl/hlsl or SPIRV bytes - -```c++ -static std::string shaderData = "path/to/shader.glsl"; -// Or SPIR-V -static std::string shaderData = "path/to/shader.glsl.spv"; -``` - -## More examples - -### Simple examples - -* [Pass shader as raw string](https://kompute.cc/overview/advanced-examples.html#simple-shader-example) -* [Record batch commands with a Kompute Sequence](https://kompute.cc/overview/advanced-examples.html#record-batch-commands) -* [Run Asynchronous Operations](https://kompute.cc/overview/advanced-examples.html#asynchronous-operations) -* [Run Parallel Operations Across Multiple GPU Queues](https://kompute.cc/overview/advanced-examples.html#parallel-operations) -* [Create your custom Kompute Operations](https://kompute.cc/overview/advanced-examples.html#your-custom-kompute-operation) -* [Implementing logistic regression from scratch](https://kompute.cc/overview/advanced-examples.html#logistic-regression-example) - -### End-to-end examples - -* [Machine Learning Logistic Regression Implementation](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) -* [Parallelizing GPU-intensive Workloads via Multi-Queue Operations](https://towardsdatascience.com/parallelizing-heavy-gpu-workloads-via-multi-queue-operations-50a38b15a1dc) -* [Android NDK Mobile Kompute ML Application](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617) -* [Game Development Kompute ML in Godot Engine](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0) - - ## Architectural Overview The core architecture of Kompute includes the following: @@ -313,6 +256,24 @@ You can also access the