diff --git a/examples/neural_network_vgg7/.gitignore b/examples/neural_network_vgg7/.gitignore
new file mode 100644
index 000000000..8a402f0be
--- /dev/null
+++ b/examples/neural_network_vgg7/.gitignore
@@ -0,0 +1 @@
+model-kipper
diff --git a/examples/neural_network_vgg7/README.md b/examples/neural_network_vgg7/README.md
new file mode 100644
index 000000000..187da42e3
--- /dev/null
+++ b/examples/neural_network_vgg7/README.md
@@ -0,0 +1,12 @@
+# Waifu2x VGG7 implementation
+
+This demonstrates performing image upscaling using Python and vulkan-kompute.
+
+To import an existing VGG7 model (assuming you have https://github.com/nagadomi/waifu2x/ cloned somewhere):
+
+`python3 import_vgg7.py waifu2x/models/vgg_7/art/scale2.0x_model.json`
+
+To execute that model (no tiling is performed, so be careful about image sizes):
+
+`python3 run_vgg7.py w2wbinit.png out.png`
+
diff --git a/examples/neural_network_vgg7/import_vgg7.py b/examples/neural_network_vgg7/import_vgg7.py
new file mode 100644
index 000000000..c86ff36c8
--- /dev/null
+++ b/examples/neural_network_vgg7/import_vgg7.py
@@ -0,0 +1,30 @@
+import numpy
+import json
+import os
+import sys
+import time
+import sh_common
+
+if len(sys.argv) != 2:
+    print("import_vgg7.py JSONPATH")
+    print(" i.e. import_vgg7.py /home/you/Documents/External/waifu2x/models/vgg_7/art/scale2.0x_model.json")
+    sys.exit(1)
+
+# Create the output directory; an already-existing one is fine, other errors surface.
+os.makedirs("model-kipper", exist_ok=True)
+
+# Close the JSON file deterministically instead of leaking the handle.
+with open(sys.argv[1], "rb") as model_file:
+    data_list = json.load(model_file)
+
+# Each of the 7 layers is stored as two parameter files:
+# index (i * 2) + 0 holds the weights, index (i * 2) + 1 holds the biases.
+for i in range(7):
+    layer = data_list[i]
+    # NOTE: the weights are saved exactly as laid out in the JSON. An earlier
+    # reshape/transpose call here discarded its result and had no effect.
+    w = numpy.array(layer["weight"])
+    b = numpy.array(layer["bias"])
+    sh_common.save_param("kipper", (i * 2) + 0, w)
+    sh_common.save_param("kipper", (i * 2) + 1, b)
+
diff --git a/examples/neural_network_vgg7/run_vgg7.py b/examples/neural_network_vgg7/run_vgg7.py
new file mode 100644
index 000000000..f5d88e841
--- /dev/null
+++ b/examples/neural_network_vgg7/run_vgg7.py
@@ -0,0 +1,125 @@
+import kp
+import numpy
+import os
+import sys
+import time
+import sh_conv
+import sh_common
+
+if len(sys.argv) != 3:
+    print("run_vgg7.py INPUT OUTPUT")
+    print(" Tiling is not implemented, but padding is implemented")
+    sys.exit(1)
+
+# NOTES:
+# + Tiling is not implemented, but padding is implemented
+#   So don't run anything too big through it
+
+if False:
+    kpm = kp.Manager(1)
+    if kpm.get_device_properties()["device_name"].count("RAVEN") > 0:
+        raise RuntimeError("Safety cut-out triggered. Sorry!")
+else:
+    kpm = kp.Manager()
+
+image = sh_common.image_load(sys.argv[1])
+image = image.repeat(2, 0).repeat(2, 1)
+image = numpy.pad(image, [[7, 7], [7, 7], [0, 0]], mode = "edge")
+
+# Ensure image has 4 channels even though they will be unused.
+# This is because of vectorization vec4 magic.
+while image.shape[2] < sh_common.VSZ:
+    image = numpy.pad(image, [[0, 0], [0, 0], [0, 1]], mode = "constant")
+
+# sh_common.image_save("pad.png", image)
+
+# Prepare the initial tensor.
+
+tensor_in = kpm.tensor(image)
+tensor_in_h = image.shape[0]
+tensor_in_w = image.shape[1]
+tensor_in_cg = 1
+tensor_in_c = 3
+
+# Run things.
+channels = [32, 32, 64, 64, 128, 128, 3]
+
+for i in range(7):
+    # Prepare tensors.
+    # 'c' is the total amount of channels, while 'cg' is the amount of vec4s (channel-groups).
+    # This is important because weights have to be padded for the shader.
+    tensor_out_h = tensor_in_h - 2
+    tensor_out_w = tensor_in_w - 2
+    tensor_out_c = channels[i]
+    tensor_out_cg = (channels[i] + (sh_common.VSZ - 1)) // sh_common.VSZ
+    # TODO: How to produce a blank tensor we don't care about the contents of?
+    # This isn't being synced, and from experience so far that should handle most of it,
+    # but what about memory usage?
+    # *Most* of these tensors live entirely on-device except when debugging.
+    # Can that be handled? (Also good question: Does it even need to be handled?)
+    tensor_out = kpm.tensor(numpy.zeros((tensor_out_h * tensor_out_w * tensor_out_cg * sh_common.VSZ)))
+    weight = kpm.tensor(sh_common.load_weights_padded("kipper", (i * 2) + 0, tensor_out_c, tensor_in_c, 3))
+    bias = kpm.tensor(sh_common.load_biases_padded("kipper", (i * 2) + 1, tensor_out_c))
+    # Compute. Workgroup counts are rounded up to cover the shader's 8x2 local size.
+    # TODO: It'd be nice to wrap this up into a class for optimization purposes.
+    workgroup = ((tensor_out_w + 7) // 8, (tensor_out_h + 1) // 2, tensor_out_cg)
+    alg = kpm.algorithm(
+        # tensors (order matches shader bindings 0-3: image, bias, weight, output)
+        [tensor_in, bias, weight, tensor_out],
+        # spirv
+        sh_conv.conv_shader,
+        # workgroup
+        workgroup,
+        # spec_consts (order matches constant_id 0-5 in the shader)
+        [tensor_in_w, tensor_in_h, tensor_in_cg, tensor_out_w, tensor_out_h, tensor_out_cg],
+        # push_consts
+        []
+    )
+
+    print("Step complexity " + str(workgroup))
+    print("Step channel layout " + str(tensor_in_cg) + " " + str(tensor_out_cg))
+
+    # Do this first. Keep in mind "syncs" are copies.
+    last_seq = kpm.sequence()
+    things_to_sync_to_device = [bias, weight]
+    if i == 0:
+        # For first layer, the input isn't on-device yet
+        things_to_sync_to_device.append(tensor_in)
+    last_seq.eval_async(kp.OpTensorSyncDevice(things_to_sync_to_device))
+    last_seq.eval_await()
+
+    # Prepare: record the shader dispatch into a fresh sequence
+    seq = (kpm.sequence()
+        .record(kp.OpAlgoDispatch(alg, []))
+    )
+    # Run
+    seq.eval()
+
+    print("Done with step")
+
+    if False:
+        # DEBUG:
+        # We want to see the output, copy it to local
+        last_seq = kpm.sequence()
+        last_seq.eval_async(kp.OpTensorSyncLocal([tensor_out]))
+        last_seq.eval_await()
+        tensor_out.data().astype(" numpy.ndarray:
+    """
+    Loads an image from path into a numpy array.
+    Doesn't Tensor it, in case you need to do further work with it.
+    Shape is (h, w, 3); values are float32 scaled to 0..1.
+    """
+    # read the file into a numpy array
+    na = numpy.array(Image.open(path))
+    # convert to float32 and scale from 0..255 down to 0..1
+    na = na.astype("float32") / 255.0
+    return na
+
+def image_save(path, na: numpy.ndarray):
+    """
+    Saves an image to path.
+    However, note this expects a numpy array (values in 0..1), not a Tensor.
+    Shape is (h, w, 3).
+    """
+    # scale from 0..1 up to 0..255, clamp to that range, convert to uint8
+    na = numpy.fmax(numpy.fmin(na * 255.0, 255), 0).astype("uint8")
+    # write the array out as an image file
+    Image.fromarray(na).save(path)
+
+def load_param(mdl, idx, expected):
+    npa = numpy.fromfile("model-" + mdl + "/snoop_bin_" + str(idx) + ".bin", "
+    # [outputCGroups][kernelH][kernelW][inputCGroups][outputChannels][inputChannels]
+    weight_na = load_param(mdl, idx, tensor_out_c * tensor_in_c * weight_s * weight_s)
+    # start by putting in the initial shape
+    weight_na = weight_na.reshape(tensor_out_c, tensor_in_c, weight_s, weight_s)
+    # then by padding
+    # NOTE: It is *critically important* that weight padding is done with zeros (numpy.pad mode "constant").
+    # The shader WILL NOT ignore these values, but zeroing them causes them to have no effect.
+    if (tensor_in_c & 3) != 0:
+        weight_na = numpy.pad(weight_na, [[0, 0], [0, 4 - (tensor_in_c & 3)], [0, 0], [0, 0]], mode = "constant")
+    if (tensor_out_c & 3) != 0:
+        weight_na = numpy.pad(weight_na, [[0, 4 - (tensor_out_c & 3)], [0, 0], [0, 0], [0, 0]], mode = "constant")
+    # reshape to finish splitting things up
+    weight_na = weight_na.reshape(tensor_out_cg, 4, tensor_in_cg, 4, weight_s, weight_s)
+    # result is:
+    # [outputCGroups][outputChannels][inputCGroups][inputChannels][kernelH][kernelW]
+    # and move output channels to the right...
+    weight_na = numpy.moveaxis(weight_na, 1, 5)
+    # result is:
+    # [outputCGroups][inputCGroups][inputChannels][kernelH][kernelW][outputChannels]
+    # and move input channels to the right...
+    weight_na = numpy.moveaxis(weight_na, 2, 5)
+    # result is:
+    # [outputCGroups][inputCGroups][kernelH][kernelW][outputChannels][inputChannels]
+    # and move input cgroups to the right, which gives the final layout:
+    # [outputCGroups][kernelH][kernelW][inputCGroups][outputChannels][inputChannels]
+    weight_na = numpy.moveaxis(weight_na, 1, 3)
+    return weight_na
+
+def load_biases_padded(mdl, idx, tensor_out_c):
+    """
+    Loads the biases stored as parameter idx of model mdl.
+    When tensor_out_c is not a multiple of 4, the array is padded at the end
+    with zeros so that its length becomes a multiple of 4.
+    """
+    # [outputCGroups][outputChannels]
+    # biases merely need padding
+    # Again, has to be zero
+    bias_na = load_param(mdl, idx, tensor_out_c)
+    if (tensor_out_c & 3) != 0:
+        bias_na = numpy.pad(bias_na, [[0, 4 - (tensor_out_c & 3)]], mode = "constant")
+    return bias_na
+
diff --git a/examples/neural_network_vgg7/sh_conv.py b/examples/neural_network_vgg7/sh_conv.py
new file mode 100644
index 000000000..dea3722cf
--- /dev/null
+++ b/examples/neural_network_vgg7/sh_conv.py
@@ -0,0 +1,70 @@
+import kp
+
+# This is the convolution & leakyrelu shader.
+global conv_shader
+conv_shader = kp.Shader.compile_source("""
+#version 450
+
+layout (local_size_x = 8, local_size_y = 2) in;
+
+// [y][x][group] (vec4: channels)
+layout (set = 0, binding = 0) buffer buf_in_image { readonly restrict vec4 in_image[]; };
+// [outputCGroups] (vec4: output channels)
+layout (set = 0, binding = 1) buffer buf_in_bias { readonly restrict vec4 in_bias[]; };
+// [outputCGroups][kernelH][kernelW][inputCGroups] (mat4: input & output channels)
+layout (set = 0, binding = 2) buffer buf_in_weight { readonly restrict mat4 in_weight[]; };
+// [y][x][group] (vec4: channels)
+layout (set = 0, binding = 3) buffer buf_out_image { writeonly restrict vec4 out_image[]; };
+
+// Specialization constants. The 'cg' values measure in cgroups (vec4s), not single channels.
+// Some maths changes as a result. They are declared float and cast to uint at each use.
+layout (constant_id = 0) const float in_w = 0;
+layout (constant_id = 1) const float in_h = 0;
+layout (constant_id = 2) const float in_cg = 0;
+layout (constant_id = 3) const float out_w = 0;
+layout (constant_id = 4) const float out_h = 0;
+layout (constant_id = 5) const float out_cg = 0;
+
+uint index_in_no_ic(uvec2 pos) {
+    return (pos.x + (pos.y * uint(in_w))) * uint(in_cg);
+}
+
+uint index_out(uvec2 pos) {
+    return ((pos.x + (pos.y * uint(out_w))) * uint(out_cg)) + gl_GlobalInvocationID.z;
+}
+
+void main() {
+    // out x/y is gl_GlobalInvocationID.xy, and z selects the output cgroup
+    // we need to account for workgroupy padding *here*
+    // so long as we aren't trying to output to a pixel that doesn't exist,
+    // we won't read from any pixels that don't exist
+    if (
+        (gl_GlobalInvocationID.x < (uint(in_w) - 2)) &&
+        (gl_GlobalInvocationID.y < (uint(in_h) - 2))
+    ) {
+        vec4 value = in_bias[gl_GlobalInvocationID.z];
+        for (uint x = 0; x < 3; x++) {
+            for (uint y = 0; y < 3; y++) {
+                uint weight_ptr = ((gl_GlobalInvocationID.z * 9) + (x + (y * 3))) * uint(in_cg);
+                // specific pixel
+                // important to note is that since in position has a border around it,
+                // no further transformation is necessary (the - is implied)
+                uvec2 in_pos = gl_GlobalInvocationID.xy + uvec2(x, y);
+                uint in_ptr = index_in_no_ic(in_pos);
+                for (uint icg = 0; icg < uint(in_cg); icg++) {
+                    // input channel group
+                    vec4 iCG = in_image[in_ptr];
+                    // vec4 * mat4: handles all 4 input components for 4 output channels at once
+                    value += iCG * in_weight[weight_ptr];
+                    weight_ptr += 1;
+                    in_ptr += 1;
+                }
+            }
+        }
+        // leakyrelu slope 0.1: positive values pass through unchanged, negative values are scaled by 0.1
+        value = (max(value, 0.0) * 0.9) + (value * 0.1);
+        out_image[index_out(gl_GlobalInvocationID.xy)] = value;
+    }
+}
+""")
+
diff --git a/examples/neural_network_vgg7/w2wbinit.png b/examples/neural_network_vgg7/w2wbinit.png
new file mode 100644
index 000000000..fc3a908e5
Binary files /dev/null and b/examples/neural_network_vgg7/w2wbinit.png differ