VGG7 example

This commit is contained in:
20kdc 2021-05-30 18:42:35 +01:00
parent 20365e333d
commit 865fd4b5cd
7 changed files with 320 additions and 0 deletions

View file

@ -0,0 +1 @@
model-kipper

View file

@ -0,0 +1,12 @@
# Waifu2x VGG7 implementation
This demonstrates performing image upscaling using Python and vulkan-kompute.
To import an existing VGG7 model (assuming you have https://github.com/nagadomi/waifu2x/ cloned somewhere):
`python3 import_vgg7.py waifu2x/models/vgg_7/art/scale2.0x_model.json`
To execute that model (no tiling is performed, so the whole image is processed on the GPU in one go; keep input images small):
`python3 run_vgg7.py w2wbinit.png out.png`

View file

@ -0,0 +1,30 @@
import numpy
import json
import os
import sys
import time
import sh_common
# Command-line entry point: converts a waifu2x VGG7 JSON model into the flat
# binary parameter files stored under "model-kipper/".
if len(sys.argv) != 2:
    print("import_vgg7.py JSONPATH")
    print(" i.e. import_vgg7.py /home/you/Documents/External/waifu2x/models/vgg_7/art/scale2.0x_model.json")
    sys.exit(1)

# Create the output directory. Only "already exists" is tolerated; any other
# failure (e.g. permissions) is reported here instead of being hidden.
try:
    os.mkdir("model-kipper")
except FileExistsError:
    pass

# Parse the model and close the file straight away.
with open(sys.argv[1], "rb") as model_file:
    data_list = json.load(model_file)

# The first seven entries of the JSON list are the convolution layers.
# Layer i produces two parameter files: index 2*i holds the weights and
# index 2*i + 1 holds the biases.
for i in range(7):
    layer = data_list[i]
    # Weights are written in the order the JSON stores them; no axis
    # reordering is applied at import time (save_param flattens in C order).
    w = numpy.array(layer["weight"])
    b = numpy.array(layer["bias"])
    sh_common.save_param("kipper", i * 2, w)
    sh_common.save_param("kipper", (i * 2) + 1, b)

View file

@ -0,0 +1,125 @@
import kp
import numpy
import os
import sys
import time
import sh_conv
import sh_common
# Command-line handling: exactly one input path and one output path.
if len(sys.argv) != 3:
    print("run_vgg7.py INPUT OUTPUT")
    print(" Tiling is not implemented, but padding is implemented")
    sys.exit(1)

# NOTES:
# + Tiling is not implemented, but padding is implemented
# So don't run anything too big through it

# Manager creation. The first branch is a manual switch (disabled by default):
# it builds the manager with the argument 1 (presumably a device index -
# confirm against the kompute API) and refuses to continue when the reported
# device name contains "RAVEN".
if False:
    kpm = kp.Manager(1)
    if kpm.get_device_properties()["device_name"].count("RAVEN") > 0:
        # Python 3 only allows raising exception instances/classes; raising
        # a plain string would itself fail with a TypeError.
        raise RuntimeError("Safety cut-out triggered. Sorry!")
else:
    kpm = kp.Manager()
# Load the picture and double its size by repeating every row and column.
image = numpy.repeat(numpy.repeat(sh_common.image_load(sys.argv[1]), 2, axis=0), 2, axis=1)
# Each of the 7 layers trims one pixel per edge, so add a 7 pixel border
# (replicating the edge pixels) to end up at the doubled size again.
image = numpy.pad(image, ((7, 7), (7, 7), (0, 0)), mode="edge")
# Ensure image has 4 channels even though the extras will be unused.
# This is because of vectorization vec4 magic.
missing_channels = sh_common.VSZ - image.shape[2]
if missing_channels > 0:
    image = numpy.pad(image, ((0, 0), (0, 0), (0, missing_channels)), mode="constant")
# sh_common.image_save("pad.png", image)
# Prepare the initial tensor and the bookkeeping that describes it:
# height, width, number of vec4 channel-groups and number of real channels.
tensor_in = kpm.tensor(image)
tensor_in_h, tensor_in_w = image.shape[:2]
tensor_in_cg = 1
tensor_in_c = 3
# Run things.
# Output channel count of each of the seven convolution layers, in order.
channels = [32, 32, 64, 64, 128, 128, 3]
for i, tensor_out_c in enumerate(channels):
    # Prepare tensors.
    # The convolution is 3x3 without padding, so every edge loses one pixel.
    tensor_out_w = tensor_in_w - 2
    tensor_out_h = tensor_in_h - 2
    # 'c' is the total amount of channels, while 'cg' is the amount of vec4s
    # (channel-groups) needed to hold them: c / VSZ, rounded up.
    # This is important because weights have to be padded for the shader.
    tensor_out_cg = -(-tensor_out_c // sh_common.VSZ)
    # TODO: How to produce a blank tensor we don't care about the contents of?
    # This isn't being synced, and from experience so far that should handle
    # most of it, but what about memory usage?
    # *Most* of these tensors live entirely on-device except when debugging.
    # Can that be handled? (Also good question: Does it even need to be handled?)
    out_element_count = tensor_out_h * tensor_out_w * tensor_out_cg * sh_common.VSZ
    tensor_out = kpm.tensor(numpy.zeros(out_element_count))
    # Parameter files alternate: even index = weights, odd index = biases.
    param_idx = i * 2
    weight = kpm.tensor(sh_common.load_weights_padded("kipper", param_idx, tensor_out_c, tensor_in_c, 3))
    bias = kpm.tensor(sh_common.load_biases_padded("kipper", param_idx + 1, tensor_out_c))
    # Compute.
    # TODO: It'd be nice to wrap this up into a class for optimization purposes.
    # One workgroup covers 8x2 output pixels (the shader's local size), so
    # round the pixel counts up; z selects the output channel-group.
    workgroup = ((tensor_out_w + 7) // 8, (tensor_out_h + 1) // 2, tensor_out_cg)
    # Tensor order matches shader bindings 0..3; the spec constants are the
    # input and output dimensions, in the order the shader declares them.
    shader_tensors = [tensor_in, bias, weight, tensor_out]
    shader_dims = [tensor_in_w, tensor_in_h, tensor_in_cg, tensor_out_w, tensor_out_h, tensor_out_cg]
    alg = kpm.algorithm(shader_tensors, sh_conv.conv_shader, workgroup, shader_dims, [])
    print("Step complexity " + str(workgroup))
    print("Step channel layout " + str(tensor_in_cg) + " " + str(tensor_out_cg))
    # Do this first. Keep in mind "syncs" are copies.
    # For the first layer the input isn't on-device yet, so it is uploaded too;
    # later layers reuse the previous layer's output, which is already there.
    if i == 0:
        things_to_sync_to_device = [bias, weight, tensor_in]
    else:
        things_to_sync_to_device = [bias, weight]
    last_seq = kpm.sequence()
    last_seq.eval_async(kp.OpTensorSyncDevice(things_to_sync_to_device))
    last_seq.eval_await()
    # Prepare and run the dispatch itself.
    seq = kpm.sequence().record(kp.OpAlgoDispatch(alg, []))
    seq.eval()
    print("Done with step")
    if False:
        # DEBUG:
        # We want to see the output, so copy it to local memory and dump it.
        last_seq = kpm.sequence()
        last_seq.eval_async(kp.OpTensorSyncLocal([tensor_out]))
        last_seq.eval_await()
        tensor_out.data().astype("<f4", "C").tofile("raw" + str(i) + ".bin")
    # Swap over: this layer's output becomes the next layer's input.
    tensor_in = tensor_out
    tensor_in_h, tensor_in_w = tensor_out_h, tensor_out_w
    tensor_in_c, tensor_in_cg = tensor_out_c, tensor_out_cg
# Download output: copy the final tensor from the device to local memory.
fin_seq = kpm.sequence()
fin_seq.eval_async(kp.OpTensorSyncLocal([tensor_in]))
fin_seq.eval_await()
# Output: restore the (height, width, channels) shape of the flat data.
final_shape = (tensor_in_h, tensor_in_w, tensor_in_cg * sh_common.VSZ)
# Crop off 'alpha' (the padding channel) so only three channels remain.
out_na = tensor_in.data().reshape(final_shape)[:, :, :3]
sh_common.image_save(sys.argv[2], out_na)

View file

@ -0,0 +1,82 @@
from PIL import Image
import numpy
# Vector size: how many channels are packed into one channel-group.
# Channel counts are rounded up to a multiple of this value.
VSZ = 4
def image_load(path) -> numpy.ndarray:
    """
    Loads an image as floating-point RGB.
    Doesn't Tensor it, in case you need to do further work with it.

    path: location of the image file, handed to PIL's Image.open.
    Returns a float32 array of shape (h, w, 3) with values scaled by 1/255.
    """
    # The context manager releases the underlying file once the pixels have
    # been copied into the array.
    with Image.open(path) as img:
        # Convert first: greyscale, palette and RGBA sources would otherwise
        # produce arrays that do not have the documented (h, w, 3) shape.
        na = numpy.array(img.convert("RGB"))
    # change type
    return na.astype("float32") / 255.0
def image_save(path, na: numpy.ndarray):
    """
    Writes a numpy array out as an image file.

    path: destination handed to PIL's save().
    na: array of shape (h, w, 3); values are multiplied by 255, limited to
    the range 0..255 and then cast to uint8 (the cast drops the fraction).
    """
    scaled = na * 255.0
    # Upper bound first, then lower bound. fmin/fmax are kept deliberately:
    # unlike clip they do not propagate NaN.
    bounded = numpy.fmax(numpy.fmin(scaled, 255), 0)
    pixels = bounded.astype("uint8")
    Image.fromarray(pixels).save(path)
def load_param(mdl, idx, expected):
    """
    Loads one parameter array of a model.

    mdl: model name; the file is read from the directory "model-<mdl>".
    idx: parameter index, used in the file name "snoop_bin_<idx>.bin".
    expected: number of values the file has to contain.
    Returns the contents as a flat array of little-endian float32 values.
    Raises ValueError when the file holds a different number of values.
    """
    path = "model-" + mdl + "/snoop_bin_" + str(idx) + ".bin"
    npa = numpy.fromfile(path, "<f4")
    # An explicit check rather than assert, so it still runs under python -O.
    if npa.shape[0] != expected:
        raise ValueError(
            "{}: expected {} values but found {}".format(path, expected, npa.shape[0])
        )
    return npa
def save_param(mdl, idx, data):
    """
    Stores one parameter array of a model.

    mdl: model name; the file goes into the directory "model-<mdl>".
    idx: parameter index, used in the file name "snoop_bin_<idx>.bin".
    data: numpy array; written as little-endian float32 in C order.
    """
    target = "model-" + mdl + "/snoop_bin_" + str(idx) + ".bin"
    as_le_f32 = data.astype("<f4", "C")
    as_le_f32.tofile(target)
def load_weights_padded(mdl, idx, tensor_out_c, tensor_in_c, weight_s):
    """
    Loads convolution weights and rearranges them for the shader.

    mdl, idx: which parameter file to read (see load_param).
    tensor_out_c, tensor_in_c: real output/input channel counts.
    weight_s: kernel size (the kernel is weight_s x weight_s).

    The file layout is
    [outputChannels][inputChannels][kernelH][kernelW]
    and the returned array is laid out as
    [outputCGroups][kernelH][kernelW][inputCGroups][outputChannels][inputChannels]
    where a "CGroup" is a group of 4 channels.
    """
    out_groups = (tensor_out_c + 3) // 4
    in_groups = (tensor_in_c + 3) // 4
    flat = load_param(mdl, idx, tensor_out_c * tensor_in_c * weight_s * weight_s)
    # start by putting in the initial shape
    weights = flat.reshape(tensor_out_c, tensor_in_c, weight_s, weight_s)
    # Pad both channel axes up to a multiple of 4.
    # NOTE: It is *critically important* that the padding is zeros.
    # The shader WILL NOT ignore these values, but zeroing them causes them
    # to have no effect.
    out_fill = (out_groups * 4) - tensor_out_c
    in_fill = (in_groups * 4) - tensor_in_c
    if out_fill or in_fill:
        weights = numpy.pad(
            weights,
            [[0, out_fill], [0, in_fill], [0, 0], [0, 0]],
            mode="constant",
        )
    # Split each channel axis into (group, channel-within-group):
    # [outputCGroups][outputChannels][inputCGroups][inputChannels][kernelH][kernelW]
    grouped = weights.reshape(out_groups, 4, in_groups, 4, weight_s, weight_s)
    # Reorder to
    # [outputCGroups][kernelH][kernelW][inputCGroups][outputChannels][inputChannels]
    return grouped.transpose(0, 4, 5, 2, 1, 3)
def load_biases_padded(mdl, idx, tensor_out_c):
    """
    Loads the biases of one layer, padded for the shader.

    mdl, idx: which parameter file to read (see load_param).
    tensor_out_c: real number of output channels.
    Returns a flat array whose length is tensor_out_c rounded up to a
    multiple of 4, i.e. [outputCGroups][outputChannels].
    """
    biases = load_param(mdl, idx, tensor_out_c)
    leftover = tensor_out_c & 3
    if leftover == 0:
        # Already a whole number of channel-groups.
        return biases
    # Biases merely need padding; again, it has to be zero.
    return numpy.pad(biases, [[0, 4 - leftover]], mode="constant")

View file

@ -0,0 +1,70 @@
import kp
# This is the convolution & leakyrelu shader.
# The GLSL source below is compiled when this module is imported, via
# kp.Shader.compile_source, and the result is kept in `conv_shader`.
# Each invocation computes one vec4 (one output channel-group) of one output
# pixel: it starts from the bias, adds the contribution of every input
# channel-group over the 3x3 neighbourhood, then applies leaky ReLU with
# slope 0.1.
# Buffer bindings 0..3 are: input image, bias, weights, output image.
# Specialization constants 0..5 are: in_w, in_h, in_cg, out_w, out_h, out_cg.
# NOTE(review): a `global` statement at module level has no effect.
global conv_shader
conv_shader = kp.Shader.compile_source("""
#version 450
layout (local_size_x = 8, local_size_y = 2) in;
// [y][x][group] (vec4: channels)
layout (set = 0, binding = 0) buffer buf_in_image { readonly restrict vec4 in_image[]; };
// [outputCGroups] (vec4: output channels)
layout (set = 0, binding = 1) buffer buf_in_bias { readonly restrict vec4 in_bias[]; };
// [outputCGroups][kernelH][kernelW][inputCGroups] (mat4: input & output channels)
layout (set = 0, binding = 2) buffer buf_in_weight { readonly restrict mat4 in_weight[]; };
// [y][x][group] (vec4: channels)
layout (set = 0, binding = 3) buffer buf_out_image { writeonly restrict vec4 out_image[]; };
// The 'c' measures in cgroups.
// Some maths changes as a result.
layout (constant_id = 0) const float in_w = 0;
layout (constant_id = 1) const float in_h = 0;
layout (constant_id = 2) const float in_cg = 0;
layout (constant_id = 3) const float out_w = 0;
layout (constant_id = 4) const float out_h = 0;
layout (constant_id = 5) const float out_cg = 0;
uint index_in_no_ic(uvec2 pos) {
return (pos.x + (pos.y * uint(in_w))) * uint(in_cg);
}
uint index_out(uvec2 pos) {
return ((pos.x + (pos.y * uint(out_w))) * uint(out_cg)) + gl_GlobalInvocationID.z;
}
void main() {
// out x/y is gl_GlobalInvocationID.xy
// we need to account for workgroupy padding *here*
// so long as we aren't trying to output to a pixel that doesn't exist,
// we won't read from any pixels that don't exist
if (
(gl_GlobalInvocationID.x < (uint(in_w) - 2)) &&
(gl_GlobalInvocationID.y < (uint(in_h) - 2))
) {
vec4 value = in_bias[gl_GlobalInvocationID.z];
for (uint x = 0; x < 3; x++) {
for (uint y = 0; y < 3; y++) {
uint weight_ptr = ((gl_GlobalInvocationID.z * 9) + (x + (y * 3))) * uint(in_cg);
// specific pixel
// important to note is that since in position has a border around it,
// no further transformation is necessary (the - is implied)
uvec2 in_pos = gl_GlobalInvocationID.xy + uvec2(x, y);
uint in_ptr = index_in_no_ic(in_pos);
for (uint icg = 0; icg < uint(in_cg); icg++) {
// input channel group
vec4 iCG = in_image[in_ptr];
// handle all 4 input components
value += iCG * in_weight[weight_ptr];
weight_ptr += 1;
in_ptr += 1;
}
}
}
// leakyrelu slope 0.1
value = (max(value, 0.0) * 0.9) + (value * 0.1);
out_image[index_out(gl_GlobalInvocationID.xy)] = value;
}
}
""")

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB