Merge pull request #97 from EthicalML/python_extensions

Python extensions for end to end example
This commit is contained in:
Alejandro Saucedo 2020-11-11 08:46:46 +00:00 committed by GitHub
commit 5a3e4da916
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 223 additions and 157 deletions

BIN
docs/images/binder-python.jpg Executable file

Binary file not shown.

After

Width:  |  Height:  |  Size: 424 KiB

View file

@ -2,13 +2,28 @@
Python Package Overview
=======================
This section provides an overview of the Python Package from a functionality perspective. If you wish to see all the classes and their respective functions you can find that in the `Python Class Reference Section <python-reference>`_.
This section provides an overview of the Python Package from a functionality perspective. If you wish to see all the classes and their respective functions you can find that in the `Python Class Reference Section <python-reference.html>`_.
Below is a diagram that provides insights on the relationship between Vulkan Kompute objects and Vulkan resources, which primarily encompass ownership of either CPU and/or GPU memory.
.. image:: ../images/kompute-architecture.jpg
:width: 70%
Package Installation
^^^^^^^^^^^^^^^^^^^^
Once you set up the package dependencies, you can install Kompute from ``PyPI`` using ``pip`` by running:
.. code-block:: bash
pip install kp
You can also install from master branch using:
.. code-block:: bash
pip install git+https://github.com/EthicalML/vulkan-kompute.git@master
Core Python Components
^^^^^^^^^^^^^^^^^^^^^^
@ -272,28 +287,16 @@ Similar to the logistic regression implementation in the C++ examples section, b
print(tensor_b_in.data())
Package Installation
^^^^^^^^^
The package can be installed through the top level `setup.py` by running:
```
pip install kp
```
You can also install from master branch using:
```
pip install git+git://github.com/EthicalML/vulkan-kompute.git@master
```
Log Level Configuration
^^^^^^^^^^^^^^^^^^^^^^^
You can configure the log level with the function ``kp.log_level`` as outlined below.
The values are TRACE=0, DEBUG=1, INFO=2, WARN=3, ERROR=4. Kompute defaults to INFO.
```
import kp
kp.log_level(1)
```
.. code-block:: python
:linenos:
import kp
kp.log_level(1)

17
examples/python/README.md Normal file
View file

@ -0,0 +1,17 @@
# Kompute Python Example
This folder contains the accompanying code for the article "High Performance Python for GPU Accelerated Machine Learning in Cross-Vendor GPUs".
The easiest way to try this example is by using the [Google Colab Notebook](https://colab.research.google.com/drive/15uQ7qMZuOyk8JcXF-3SB2R5yNFW21I4P), which lets you use a GPU for free and runs without much setup.
<a href="https://colab.research.google.com/drive/15uQ7qMZuOyk8JcXF-3SB2R5yNFW21I4P">
<img src="https://raw.githubusercontent.com/EthicalML/vulkan-kompute/python_extensions/docs/images/binder-python.jpg">
</a>
Alternatively, if you want to test the example yourself locally, you can get set up and started through the following links:
1. Install the [Kompute Python Package](https://kompute.cc/overview/python-package.html#package-installation)
2. Run the [Array Multiplication Code](https://github.com/EthicalML/vulkan-kompute/blob/python_extensions/python/test/test_array_multiplication.py)
3. Run the [Logistic Regression Code](https://github.com/EthicalML/vulkan-kompute/blob/python_extensions/python/test/test_logistic_regression.py)

View file

@ -39,12 +39,33 @@ PYBIND11_MODULE(kp, m) {
return std::unique_ptr<kp::Tensor>(new kp::Tensor(data, tensorTypes));
}), "Initialiser with list of data components and tensor GPU memory type.")
.def("data", &kp::Tensor::data, DOC(kp, Tensor, data))
.def("get", [](kp::Tensor &self, uint32_t index) -> float { return self.data()[index]; },
.def("__getitem__", [](kp::Tensor &self, size_t index) -> float {
        // Reject out-of-range indices instead of reading past the end of
        // the local data vector; pybind11 translates py::index_error into
        // a Python IndexError.
        if (index >= self.data().size()) {
            throw py::index_error("Tensor index out of range");
        }
        return self.data()[index];
    },
    "When only an index is necessary")
.def("set", [](kp::Tensor &self, uint32_t index, float value) {
.def("__setitem__", [](kp::Tensor &self, size_t index, float value) {
        // Reject out-of-range indices instead of writing past the end of
        // the local data vector; pybind11 translates py::index_error into
        // a Python IndexError.
        if (index >= self.data().size()) {
            throw py::index_error("Tensor index out of range");
        }
        self.data()[index] = value; })
.def("set", &kp::Tensor::setData, "Overrides the data in the local Tensor memory.")
.def("set_data", &kp::Tensor::setData, "Overrides the data in the local Tensor memory.")
.def("__iter__", [](kp::Tensor &self) {
return py::make_iterator(self.data().begin(), self.data().end());
}, py::keep_alive<0, 1>(), // Required to keep alive iterator while exists
"Iterator to enable looping within data structure as required.")
.def("__contains__", [](kp::Tensor &self, float v) {
        // Linear scan of the tensor's local data for an exact float match.
        for (const float element : self.data()) {
            if (element == v) {
                return true;
            }
        }
        return false;
    })
.def("__reversed__", [](kp::Tensor &self) {
        // Build a new vector holding the local data in back-to-front order.
        const auto &values = self.data();
        return std::vector<float>(values.rbegin(), values.rend());
    })
.def("size", &kp::Tensor::size, "Retrieves the size of the Tensor data as per the local Tensor memory.")
.def("__len__", &kp::Tensor::size, "Retrieves the size of the Tensor data as per the local Tensor memory.")
.def("tensor_type", &kp::Tensor::tensorType, "Retreves the memory type of the tensor.")
.def("is_init", &kp::Tensor::isInit, "Checks whether the tensor GPU memory has been initialised.")
.def("map_data_from_host", &kp::Tensor::mapDataFromHostMemory, "Maps data into GPU memory from tensor local data.")

View file

@ -0,0 +1,35 @@
import pyshader as ps
import kp
def test_array_multiplication():
    """Multiply two tensors element-wise with a pyshader-compiled shader and check the result."""
    # 1. Create the Kompute Manager (selects device 0 by default)
    manager = kp.Manager()

    # 2. Two input tensors and one tensor that receives the product
    lhs = kp.Tensor([2, 2, 2])
    rhs = kp.Tensor([1, 2, 3])
    product = kp.Tensor([0, 0, 0])
    shader_tensors = [lhs, rhs, product]

    # 3. Initialise the tensors in the GPU
    manager.eval_tensor_create_def(shader_tensors)

    # 4. Shader that writes data1[i] * data2[i] into data3[i]; the buffer
    #    bindings 0, 1, 2 follow the order of the tensors passed in step 5
    @ps.python2shader
    def compute_shader_multiply(index=("input", "GlobalInvocationId", ps.ivec3),
                                data1=("buffer", 0, ps.Array(ps.f32)),
                                data2=("buffer", 1, ps.Array(ps.f32)),
                                data3=("buffer", 2, ps.Array(ps.f32))):
        i = index.x
        data3[i] = data1[i] * data2[i]

    # 5. Compile the shader to SPIR-V and run it against the tensors
    spirv = compute_shader_multiply.to_spirv()
    manager.eval_algo_data_def(shader_tensors, spirv)

    # 6. Sync the output tensor from the GPU back to local memory
    manager.eval_tensor_sync_local_def([product])

    expected = [2.0, 4.0, 6.0]
    assert product.data() == expected

View file

@ -1,9 +1,6 @@
import os
from pyshader import python2shader, f32, ivec3, Array
from pyshader.stdlib import exp, log
from kp import Tensor, Manager, Sequence
import kp
DIRNAME = os.path.dirname(os.path.abspath(__file__))
@ -12,11 +9,11 @@ def test_opmult():
Test basic OpMult operation
"""
tensor_in_a = Tensor([2, 2, 2])
tensor_in_b = Tensor([1, 2, 3])
tensor_out = Tensor([0, 0, 0])
tensor_in_a = kp.Tensor([2, 2, 2])
tensor_in_b = kp.Tensor([1, 2, 3])
tensor_out = kp.Tensor([0, 0, 0])
mgr = Manager()
mgr = kp.Manager()
mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
@ -31,11 +28,11 @@ def test_opalgobase_data():
Test basic OpAlgoBase operation
"""
tensor_in_a = Tensor([2, 2, 2])
tensor_in_b = Tensor([1, 2, 3])
tensor_out = Tensor([0, 0, 0])
tensor_in_a = kp.Tensor([2, 2, 2])
tensor_in_b = kp.Tensor([1, 2, 3])
tensor_out = kp.Tensor([0, 0, 0])
mgr = Manager()
mgr = kp.Manager()
shaderData = """
#version 450
@ -67,11 +64,11 @@ def test_opalgobase_file():
Test basic OpAlgoBase operation
"""
tensor_in_a = Tensor([2, 2, 2])
tensor_in_b = Tensor([1, 2, 3])
tensor_out = Tensor([0, 0, 0])
tensor_in_a = kp.Tensor([2, 2, 2])
tensor_in_b = kp.Tensor([1, 2, 3])
tensor_out = kp.Tensor([0, 0, 0])
mgr = Manager()
mgr = kp.Manager()
shaderFilePath = os.path.join(DIRNAME, "../../shaders/glsl/opmult.comp")
@ -87,11 +84,11 @@ def test_sequence():
"""
Test basic OpAlgoBase operation
"""
mgr = Manager(0, [2])
mgr = kp.Manager(0, [2])
tensor_in_a = Tensor([2, 2, 2])
tensor_in_b = Tensor([1, 2, 3])
tensor_out = Tensor([0, 0, 0])
tensor_in_a = kp.Tensor([2, 2, 2])
tensor_in_b = kp.Tensor([1, 2, 3])
tensor_out = kp.Tensor([0, 0, 0])
mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
@ -109,118 +106,3 @@ def test_sequence():
seq.eval()
assert tensor_out.data() == [2.0, 4.0, 6.0]
def test_pyshader_pyshader():
@python2shader
def compute_shader_multiply(index=("input", "GlobalInvocationId", ivec3),
data1=("buffer", 0, Array(f32)),
data2=("buffer", 1, Array(f32)),
data3=("buffer", 2, Array(f32))):
i = index.x
data3[i] = data1[i] * data2[i]
tensor_in_a = Tensor([2, 2, 2])
tensor_in_b = Tensor([1, 2, 3])
tensor_out = Tensor([0, 0, 0])
mgr = Manager()
mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out])
mgr.eval_algo_data_def([tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv())
mgr.eval_tensor_sync_local_def([tensor_out])
assert tensor_out.data() == [2.0, 4.0, 6.0]
def test_logistic_regression_pyshader():
@python2shader
def compute_shader(
index = ("input", "GlobalInvocationId", ivec3),
x_i = ("buffer", 0, Array(f32)),
x_j = ("buffer", 1, Array(f32)),
y = ("buffer", 2, Array(f32)),
w_in = ("buffer", 3, Array(f32)),
w_out_i = ("buffer", 4, Array(f32)),
w_out_j = ("buffer", 5, Array(f32)),
b_in = ("buffer", 6, Array(f32)),
b_out = ("buffer", 7, Array(f32)),
l_out = ("buffer", 8, Array(f32)),
M = ("buffer", 9, Array(f32))):
i = index.x
m = M[0]
w_curr = vec2(w_in[0], w_in[1])
b_curr = b_in[0]
x_curr = vec2(x_i[i], x_j[i])
y_curr = y[i]
z_dot = w_curr @ x_curr
z = z_dot + b_curr
y_hat = 1.0 / (1.0 + exp(-z))
d_z = y_hat - y_curr
d_w = (1.0 / m) * x_curr * d_z
d_b = (1.0 / m) * d_z
loss = -((y_curr * log(y_hat)) + ((1.0 + y_curr) * log(1.0 - y_hat)))
w_out_i[i] = d_w.x
w_out_j[i] = d_w.y
b_out[i] = d_b
l_out[i] = loss
# First we create input and ouput tensors for shader
tensor_x_i = Tensor([0.0, 1.0, 1.0, 1.0, 1.0])
tensor_x_j = Tensor([0.0, 0.0, 0.0, 1.0, 1.0])
tensor_y = Tensor([0.0, 0.0, 0.0, 1.0, 1.0])
tensor_w_in = Tensor([0.001, 0.001])
tensor_w_out_i = Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
tensor_w_out_j = Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
tensor_b_in = Tensor([0.0])
tensor_b_out = Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
tensor_l_out = Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
tensor_m = Tensor([ 5.0 ])
# We store them in an array for easier interaction
params = [tensor_x_i, tensor_x_j, tensor_y, tensor_w_in, tensor_w_out_i,
tensor_w_out_j, tensor_b_in, tensor_b_out, tensor_l_out, tensor_m]
mgr = Manager()
mgr.eval_tensor_create_def(params)
# Record commands for efficient evaluation
sq = mgr.create_sequence()
sq.begin()
sq.record_tensor_sync_device([tensor_w_in, tensor_b_in])
sq.record_algo_data(params, compute_shader.to_spirv())
sq.record_tensor_sync_local([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out])
sq.end()
ITERATIONS = 100
learning_rate = 0.1
# Perform machine learning training and inference across all input X and Y
for i_iter in range(ITERATIONS):
sq.eval()
# Calculate the parameters based on the respective derivatives calculated
for j_iter in range(tensor_b_out.size()):
tensor_w_in.set(0, tensor_w_in.get(0) - learning_rate * tensor_w_out_i.data()[j_iter])
tensor_w_in.set(1, tensor_w_in.get(1) - learning_rate * tensor_w_out_j.data()[j_iter])
tensor_b_in.set(0, tensor_b_in.get(0) - learning_rate * tensor_b_out.data()[j_iter])
assert tensor_w_in.data()[0] < 0.01
assert tensor_w_in.data()[0] > 0.0
assert tensor_w_in.data()[1] > 1.5
assert tensor_b_in.data()[0] < 0.7

View file

@ -0,0 +1,108 @@
import pyshader as ps
import kp
def test_logistic_regression():
    """Train a two-feature logistic regression on the GPU and check the learned parameters.

    The shader computes, per sample, the gradient contributions for the two
    weights and the bias together with the loss. The host loop then applies
    those contributions to the weights and bias after every GPU evaluation.
    """

    @ps.python2shader
    def compute_shader(
            index=("input", "GlobalInvocationId", ps.ivec3),
            x_i=("buffer", 0, ps.Array(ps.f32)),
            x_j=("buffer", 1, ps.Array(ps.f32)),
            y=("buffer", 2, ps.Array(ps.f32)),
            w_in=("buffer", 3, ps.Array(ps.f32)),
            w_out_i=("buffer", 4, ps.Array(ps.f32)),
            w_out_j=("buffer", 5, ps.Array(ps.f32)),
            b_in=("buffer", 6, ps.Array(ps.f32)),
            b_out=("buffer", 7, ps.Array(ps.f32)),
            l_out=("buffer", 8, ps.Array(ps.f32)),
            M=("buffer", 9, ps.Array(ps.f32))):

        i = index.x

        m = M[0]

        w_curr = vec2(w_in[0], w_in[1])
        b_curr = b_in[0]

        x_curr = vec2(x_i[i], x_j[i])
        y_curr = y[i]

        # Forward pass: linear combination followed by the sigmoid
        z_dot = w_curr @ x_curr
        z = z_dot + b_curr
        y_hat = 1.0 / (1.0 + exp(-z))

        # Per-sample gradient contributions, scaled by 1/m
        d_z = y_hat - y_curr
        d_w = (1.0 / m) * x_curr * d_z
        d_b = (1.0 / m) * d_z

        # Binary cross-entropy: the second term is weighted by (1 - y),
        # not (1 + y)
        loss = -((y_curr * log(y_hat)) + ((1.0 - y_curr) * log(1.0 - y_hat)))

        w_out_i[i] = d_w.x
        w_out_j[i] = d_w.y
        b_out[i] = d_b
        l_out[i] = loss

    mgr = kp.Manager(0)

    # First we create input and output tensors for the shader
    tensor_x_i = kp.Tensor([0.0, 1.0, 1.0, 1.0, 1.0])
    tensor_x_j = kp.Tensor([0.0, 0.0, 0.0, 1.0, 1.0])

    tensor_y = kp.Tensor([0.0, 0.0, 0.0, 1.0, 1.0])

    tensor_w_in = kp.Tensor([0.001, 0.001])
    tensor_w_out_i = kp.Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
    tensor_w_out_j = kp.Tensor([0.0, 0.0, 0.0, 0.0, 0.0])

    tensor_b_in = kp.Tensor([0.0])
    tensor_b_out = kp.Tensor([0.0, 0.0, 0.0, 0.0, 0.0])

    tensor_l_out = kp.Tensor([0.0, 0.0, 0.0, 0.0, 0.0])

    # Number of samples, read by the shader as M[0]
    tensor_m = kp.Tensor([tensor_y.size()])

    # We store them in a list for easier interaction; the order matches the
    # buffer bindings 0..9 declared in the shader signature
    params = [tensor_x_i, tensor_x_j, tensor_y, tensor_w_in, tensor_w_out_i,
              tensor_w_out_j, tensor_b_in, tensor_b_out, tensor_l_out, tensor_m]

    mgr.eval_tensor_create_def(params)

    # Create a managed sequence
    sq = mgr.create_sequence()

    # Clear previous operations and begin recording for new operations
    sq.begin()

    # Record operation to sync memory from local to GPU memory
    sq.record_tensor_sync_device([tensor_w_in, tensor_b_in])

    # Record operation to execute GPU shader against all our parameters
    sq.record_algo_data(params, compute_shader.to_spirv())

    # Record operation to sync memory from GPU to local memory
    sq.record_tensor_sync_local([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out])

    # Stop recording operations
    sq.end()

    ITERATIONS = 100
    learning_rate = 0.1

    # Perform machine learning training and inference across all input X and Y
    for i_iter in range(ITERATIONS):

        # Execute an iteration of the algorithm
        sq.eval()

        # Apply every sample's gradient contribution to the parameters;
        # the tensors are indexed directly, matching the in-place updates
        for j_iter in range(tensor_b_out.size()):
            tensor_w_in[0] -= learning_rate * tensor_w_out_i[j_iter]
            tensor_w_in[1] -= learning_rate * tensor_w_out_j[j_iter]
            tensor_b_in[0] -= learning_rate * tensor_b_out[j_iter]

    assert tensor_w_in.data()[0] < 0.01
    assert tensor_w_in.data()[0] > 0.0
    assert tensor_w_in.data()[1] > 1.5
    assert tensor_b_in.data()[0] < 0.7