VGG7 example
This commit is contained in:
parent
20365e333d
commit
865fd4b5cd
7 changed files with 320 additions and 0 deletions
1
examples/neural_network_vgg7/.gitignore
vendored
Normal file
1
examples/neural_network_vgg7/.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
model-kipper
|
||||
12
examples/neural_network_vgg7/README.md
Normal file
12
examples/neural_network_vgg7/README.md
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
# Waifu2x VGG7 implementation
|
||||
|
||||
This demonstrates performing image upscaling using Python and vulkan-kompute.
|
||||
|
||||
To import an existing VGG7 model (assuming you have https://github.com/nagadomi/waifu2x/ cloned somewhere):
|
||||
|
||||
`python3 import_vgg7.py waifu2x/models/vgg_7/art/scale2.0x_model.json`
|
||||
|
||||
To execute that model (no tiling is performed, so be careful about image sizes):
|
||||
|
||||
`python3 run_vgg7.py w2wbinit.png out.png`
|
||||
|
||||
30
examples/neural_network_vgg7/import_vgg7.py
Normal file
30
examples/neural_network_vgg7/import_vgg7.py
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
import numpy
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import sh_common
|
||||
|
||||
# Converts a waifu2x VGG7 JSON model into the flat little-endian float32
# parameter files (model-kipper/snoop_bin_<n>.bin) that sh_common loads.
if len(sys.argv) != 2:
    print("import_vgg7.py JSONPATH")
    print(" i.e. import_vgg7.py /home/you/Documents/External/waifu2x/models/vgg_7/art/scale2.0x_model.json")
    sys.exit(1)

# An already-existing output directory is fine; any other failure
# (permissions, a regular file in the way, ...) is reported instead of being
# silently swallowed by a bare except.
os.makedirs("model-kipper", exist_ok=True)

# Close the JSON file deterministically instead of leaking the handle.
with open(sys.argv[1], "rb") as model_file:
    data_list = json.load(model_file)

# Every layer produces two parameter files: weights at an even index,
# biases at the odd index that follows.
idx = 0
for i in range(7):
    layer = data_list[i]
    # The weights are written in the order the JSON stores them. A
    # reshape/transpose expression used to sit here, but its result was
    # discarded, so it never affected the saved data; it has been removed
    # rather than "fixed" to keep the on-disk layout unchanged.
    w = numpy.array(layer["weight"])
    b = numpy.array(layer["bias"])
    sh_common.save_param("kipper", idx, w)
    idx += 1
    sh_common.save_param("kipper", idx, b)
    idx += 1
|
||||
|
||||
125
examples/neural_network_vgg7/run_vgg7.py
Normal file
125
examples/neural_network_vgg7/run_vgg7.py
Normal file
|
|
@ -0,0 +1,125 @@
|
|||
import kp
|
||||
import numpy
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import sh_conv
|
||||
import sh_common
|
||||
|
||||
# Command-line check: exactly an input path and an output path are required.
if len(sys.argv) != 3:
    print("run_vgg7.py INPUT OUTPUT")
    print(" Tiling is not implemented, but padding is implemented")
    sys.exit(1)

# NOTES:
# + Tiling is not implemented, but padding is implemented
#   So don't run anything too big through it

# Manager selection. The first branch is a disabled debugging path: it builds
# the manager with argument 1 (presumably a device index -- confirm against
# the kp API) and refuses to continue when the device name contains "RAVEN".
# Change the condition to enable it.
if False:
    kpm = kp.Manager(1)
    if kpm.get_device_properties()["device_name"].count("RAVEN") > 0:
        # Raising a bare string is a TypeError in Python 3; raise a real
        # exception carrying the same message.
        raise RuntimeError("Safety cut-out triggered. Sorry!")
else:
    kpm = kp.Manager()
|
||||
|
||||
# Load the picture and double its size by repeating every pixel along the
# height axis and then along the width axis.
image = sh_common.image_load(sys.argv[1])
for axis in (0, 1):
    image = image.repeat(2, axis)

# Seven rows/columns of edge-replicated border on every side.
image = numpy.pad(image, [[7, 7], [7, 7], [0, 0]], mode="edge")

# The shader works on vec4s, so the channel axis must hold at least
# sh_common.VSZ entries; any channels added here are zero and unused.
missing_channels = sh_common.VSZ - image.shape[2]
if missing_channels > 0:
    image = numpy.pad(image, [[0, 0], [0, 0], [0, missing_channels]], mode="constant")

# sh_common.image_save("pad.png", image)

# Initial tensor plus the bookkeeping that describes it: height, width,
# number of channel-groups ('cg') and number of real channels ('c').
tensor_in = kpm.tensor(image)
tensor_in_h, tensor_in_w = image.shape[0], image.shape[1]
tensor_in_cg = 1
tensor_in_c = 3
|
||||
|
||||
# Run things.
# Output channel count of each of the seven convolution layers, in order.
channels = [32, 32, 64, 64, 128, 128, 3]

# One iteration per layer: build the output/weight/bias tensors, dispatch the
# convolution shader, then make this layer's output the next layer's input.
for i in range(7):
    # Prepare tensors.
    # 'c' is the total amount of channels, while 'cg' is the amount of vec4s (channel-groups).
    # This is important because weights have to be padded for the shader.
    # Each layer's output is 2 smaller than its input in both dimensions.
    tensor_out_h = tensor_in_h - 2
    tensor_out_w = tensor_in_w - 2
    tensor_out_c = channels[i]
    # Round the channel count up to a whole number of channel-groups.
    tensor_out_cg = (channels[i] + (sh_common.VSZ - 1)) // sh_common.VSZ
    # TODO: How to produce a blank tensor we don't care about the contents of?
    # This isn't being synced, and from experience so far that should handle most of it,
    # but what about memory usage?
    # *Most* of these tensors live entirely on-device except when debugging.
    # Can that be handled? (Also good question: Does it even need to be handled?)
    tensor_out = kpm.tensor(numpy.zeros((tensor_out_h * tensor_out_w * tensor_out_cg * sh_common.VSZ)))
    # Parameter files alternate: even index = weights, odd index = biases.
    weight = kpm.tensor(sh_common.load_weights_padded("kipper", (i * 2) + 0, tensor_out_c, tensor_in_c, 3))
    bias = kpm.tensor(sh_common.load_biases_padded("kipper", (i * 2) + 1, tensor_out_c))
    # Compute.
    # TODO: It'd be nice to wrap this up into a class for optimization purposes.
    # Ceiling-divide the output size by 8 (x) and 2 (y), the local size the
    # shader declares; z covers one output channel-group per invocation.
    workgroup = ((tensor_out_w + 7) // 8, (tensor_out_h + 1) // 2, tensor_out_cg)
    alg = kpm.algorithm(
        # tensors
        # (order matches the shader's bindings 0..3)
        [tensor_in, bias, weight, tensor_out],
        # spirv
        sh_conv.conv_shader,
        # workgroup
        workgroup,
        # spec_consts
        # (order matches the shader's constant_id 0..5)
        [tensor_in_w, tensor_in_h, tensor_in_cg, tensor_out_w, tensor_out_h, tensor_out_cg],
        # push_consts
        []
    )

    print("Step complexity " + str(workgroup))
    print("Step channel layout " + str(tensor_in_cg) + " " + str(tensor_out_cg))

    # Do this first. Keep in mind "syncs" are copies.
    last_seq = kpm.sequence()
    things_to_sync_to_device = [bias, weight]
    if i == 0:
        # For first layer, the input isn't on-device yet
        things_to_sync_to_device.append(tensor_in)
    last_seq.eval_async(kp.OpTensorSyncDevice(things_to_sync_to_device))
    last_seq.eval_await()

    # Prepare
    seq = (kpm.sequence()
        .record(kp.OpAlgoDispatch(alg, []))
    )
    # Run
    seq.eval()

    print("Done with step")

    # Disabled debugging aid: dumps this layer's output to raw<i>.bin.
    if False:
        # DEBUG:
        # We want to see the output, copy it to local
        last_seq = kpm.sequence()
        last_seq.eval_async(kp.OpTensorSyncLocal([tensor_out]))
        last_seq.eval_await()
        tensor_out.data().astype("<f4", "C").tofile("raw" + str(i) + ".bin")

    # Swap over.
    # This layer's output (and its dimensions) becomes the next layer's input.
    tensor_in = tensor_out
    tensor_in_h = tensor_out_h
    tensor_in_w = tensor_out_w
    tensor_in_c = tensor_out_c
    tensor_in_cg = tensor_out_cg
|
||||
|
||||
# Copy the final layer's result from the device back into host memory.
fin_seq = kpm.sequence()
fin_seq.eval_async(kp.OpTensorSyncLocal([tensor_in]))
fin_seq.eval_await()

# Give the flat buffer its (height, width, channel) shape, keep only the
# first three channels (dropping the unused 'alpha' slot), and write the file.
out_shape = (tensor_in_h, tensor_in_w, tensor_in_cg * sh_common.VSZ)
out_na = tensor_in.data().reshape(out_shape)[:, :, 0:3]
sh_common.image_save(sys.argv[2], out_na)
|
||||
|
||||
82
examples/neural_network_vgg7/sh_common.py
Normal file
82
examples/neural_network_vgg7/sh_common.py
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
from PIL import Image
|
||||
import numpy
|
||||
|
||||
# Vector size: how many channels are packed into one channel-group.
# The run script pads the image's channel axis up to this value.
VSZ = 4
|
||||
|
||||
def image_load(path) -> numpy.ndarray:
    """
    Loads an image.
    Doesn't Tensor it, in case you need to do further work with it.
    Shape is (h, w, 3).

    The file is converted to RGB before being turned into an array, so the
    documented three-channel shape also holds for greyscale, palette and
    RGBA inputs. Values are float32, scaled from 0..255 down to 0.0..1.0.
    """
    # file
    # The context manager closes the underlying file once the pixels have
    # been copied into the array.
    with Image.open(path) as img:
        na = numpy.array(img.convert("RGB"))
    # change type
    return na.astype("float32") / 255.0
|
||||
|
||||
def image_save(path, na: numpy.ndarray):
    """
    Writes a numpy array to disk as an image.

    The array (shape (h, w, 3)) is multiplied by 255, limited to the range
    0..255 and converted to uint8 before being handed to PIL.
    """
    scaled = na * 255.0
    capped = numpy.fmin(scaled, 255)
    floored = numpy.fmax(capped, 0)
    pixels = floored.astype("uint8")
    Image.fromarray(pixels).save(path)
|
||||
|
||||
def load_param(mdl, idx, expected):
    """
    Loads one parameter file as a flat array of little-endian float32 values.

    mdl      -- model name; the file is read from the directory "model-<mdl>"
    idx      -- parameter index, used in the file name snoop_bin_<idx>.bin
    expected -- number of values the file must contain

    Returns the flat numpy array.
    Raises ValueError when the file does not hold exactly `expected` values.
    """
    path = "model-" + mdl + "/snoop_bin_" + str(idx) + ".bin"
    npa = numpy.fromfile(path, "<f4")
    # An explicit check rather than `assert`, so a mismatched model file is
    # still rejected when Python runs with -O (which strips asserts).
    if npa.shape[0] != expected:
        raise ValueError(
            "{}: expected {} values, found {}".format(path, expected, npa.shape[0])
        )
    return npa
|
||||
|
||||
def save_param(mdl, idx, data):
    """
    Writes `data` as C-ordered little-endian float32 values to
    model-<mdl>/snoop_bin_<idx>.bin. The directory must already exist.
    """
    target = "model-" + mdl + "/snoop_bin_" + str(idx) + ".bin"
    as_f32 = data.astype("<f4", "C")
    as_f32.tofile(target)
|
||||
|
||||
def load_weights_padded(mdl, idx, tensor_out_c, tensor_in_c, weight_s):
    """
    Loads a layer's convolution weights and rearranges them for the shader.

    The file holds the weights as
        [outputChannels][inputChannels][kernelH][kernelW]
    and the returned array is laid out as
        [outputCGroups][kernelH][kernelW][inputCGroups][outputChannels][inputChannels]
    where a channel-group is a run of 4 channels.

    mdl, idx     -- which parameter file to read (see load_param)
    tensor_out_c -- number of output channels
    tensor_in_c  -- number of input channels
    weight_s     -- kernel width/height
    """
    out_groups = (tensor_out_c + 3) // 4
    in_groups = (tensor_in_c + 3) // 4

    flat = load_param(mdl, idx, tensor_out_c * tensor_in_c * weight_s * weight_s)
    kernels = flat.reshape(tensor_out_c, tensor_in_c, weight_s, weight_s)

    # Round both channel axes up to a multiple of 4.
    # NOTE: the padding MUST be zeros. The shader does not skip the padded
    # entries; they only have no effect because they are zero.
    in_rem = tensor_in_c & 3
    out_rem = tensor_out_c & 3
    pad_in = 4 - in_rem if in_rem != 0 else 0
    pad_out = 4 - out_rem if out_rem != 0 else 0
    if pad_in or pad_out:
        kernels = numpy.pad(
            kernels,
            [[0, pad_out], [0, pad_in], [0, 0], [0, 0]],
            mode="constant",
        )

    # Split each channel axis into (group, channel-within-group):
    # [outputCGroups][outputChannels][inputCGroups][inputChannels][kernelH][kernelW]
    grouped = kernels.reshape(out_groups, 4, in_groups, 4, weight_s, weight_s)

    # Reorder the axes in one step to
    # [outputCGroups][kernelH][kernelW][inputCGroups][outputChannels][inputChannels]
    return grouped.transpose(0, 4, 5, 2, 1, 3)
|
||||
|
||||
def load_biases_padded(mdl, idx, tensor_out_c):
    """
    Loads a layer's biases, laid out as [outputCGroups][outputChannels].

    The flat bias vector only needs its length rounded up to a multiple of 4;
    the extra entries have to be zero.

    mdl, idx     -- which parameter file to read (see load_param)
    tensor_out_c -- number of output channels
    """
    biases = load_param(mdl, idx, tensor_out_c)
    remainder = tensor_out_c & 3
    if remainder == 0:
        # Already a whole number of channel-groups.
        return biases
    return numpy.pad(biases, [[0, 4 - remainder]], mode="constant")
|
||||
|
||||
70
examples/neural_network_vgg7/sh_conv.py
Normal file
70
examples/neural_network_vgg7/sh_conv.py
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
import kp
|
||||
|
||||
# This is the convolution & leakyrelu shader.
#
# Interface, as declared in the GLSL source below:
#   * bindings 0..3: input image, bias, weight, output image
#   * specialization constants 0..5: in_w, in_h, in_cg, out_w, out_h, out_cg
#     (declared as float and cast to uint where they are used)
#   * local workgroup size is 8 x 2; each invocation writes one output vec4,
#     at position gl_GlobalInvocationID.xy for channel-group .z
#   * invocations outside (in_w - 2, in_h - 2) write nothing, which covers the
#     workgroup count being rounded up by the caller
#
# NOTE: a `global` statement at module scope has no effect; the assignment
# alone creates the module-level name.
global conv_shader
conv_shader = kp.Shader.compile_source("""
#version 450

layout (local_size_x = 8, local_size_y = 2) in;

// [y][x][group] (vec4: channels)
layout (set = 0, binding = 0) buffer buf_in_image { readonly restrict vec4 in_image[]; };
// [outputCGroups] (vec4: output channels)
layout (set = 0, binding = 1) buffer buf_in_bias { readonly restrict vec4 in_bias[]; };
// [outputCGroups][kernelH][kernelW][inputCGroups] (mat4: input & output channels)
layout (set = 0, binding = 2) buffer buf_in_weight { readonly restrict mat4 in_weight[]; };
// [y][x][group] (vec4: channels)
layout (set = 0, binding = 3) buffer buf_out_image { writeonly restrict vec4 out_image[]; };

// The 'c' measures in cgroups.
// Some maths changes as a result.
layout (constant_id = 0) const float in_w = 0;
layout (constant_id = 1) const float in_h = 0;
layout (constant_id = 2) const float in_cg = 0;
layout (constant_id = 3) const float out_w = 0;
layout (constant_id = 4) const float out_h = 0;
layout (constant_id = 5) const float out_cg = 0;

uint index_in_no_ic(uvec2 pos) {
return (pos.x + (pos.y * uint(in_w))) * uint(in_cg);
}

uint index_out(uvec2 pos) {
return ((pos.x + (pos.y * uint(out_w))) * uint(out_cg)) + gl_GlobalInvocationID.z;
}

void main() {
// out x/y is gl_GlobalInvocationID.xy
// we need to account for workgroupy padding *here*
// so long as we aren't trying to output to a pixel that doesn't exist,
// we won't read from any pixels that don't exist
if (
(gl_GlobalInvocationID.x < (uint(in_w) - 2)) &&
(gl_GlobalInvocationID.y < (uint(in_h) - 2))
) {
vec4 value = in_bias[gl_GlobalInvocationID.z];
for (uint x = 0; x < 3; x++) {
for (uint y = 0; y < 3; y++) {
uint weight_ptr = ((gl_GlobalInvocationID.z * 9) + (x + (y * 3))) * uint(in_cg);
// specific pixel
// important to note is that since in position has a border around it,
// no further transformation is necessary (the - is implied)
uvec2 in_pos = gl_GlobalInvocationID.xy + uvec2(x, y);
uint in_ptr = index_in_no_ic(in_pos);
for (uint icg = 0; icg < uint(in_cg); icg++) {
// input channel group
vec4 iCG = in_image[in_ptr];
// handle all 4 input components
value += iCG * in_weight[weight_ptr];
weight_ptr += 1;
in_ptr += 1;
}
}
}
// leakyrelu slope 0.1
value = (max(value, 0.0) * 0.9) + (value * 0.1);
out_image[index_out(gl_GlobalInvocationID.xy)] = value;
}
}
""")
|
||||
|
||||
BIN
examples/neural_network_vgg7/w2wbinit.png
Normal file
BIN
examples/neural_network_vgg7/w2wbinit.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 14 KiB |
Loading…
Add table
Add a link
Reference in a new issue