diff --git a/examples/neural_network_vgg7/.gitignore b/examples/neural_network_vgg7/.gitignore
new file mode 100644
index 000000000..8a402f0be
--- /dev/null
+++ b/examples/neural_network_vgg7/.gitignore
@@ -0,0 +1 @@
+model-kipper
diff --git a/examples/neural_network_vgg7/README.md b/examples/neural_network_vgg7/README.md
new file mode 100644
index 000000000..187da42e3
--- /dev/null
+++ b/examples/neural_network_vgg7/README.md
@@ -0,0 +1,12 @@
+# Waifu2x VGG7 implementation
+
+This example demonstrates image upscaling with the waifu2x VGG7 network, using Python and vulkan-kompute.
+
+To import an existing VGG7 model (assuming you have https://github.com/nagadomi/waifu2x/ cloned somewhere):
+
+`python3 import_vgg7.py waifu2x/models/vgg_7/art/scale2.0x_model.json`
+
+To execute that model (no tiling is performed, so be careful about image sizes):
+
+`python3 run_vgg7.py w2wbinit.png out.png`
+
diff --git a/examples/neural_network_vgg7/import_vgg7.py b/examples/neural_network_vgg7/import_vgg7.py
new file mode 100644
index 000000000..c86ff36c8
--- /dev/null
+++ b/examples/neural_network_vgg7/import_vgg7.py
@@ -0,0 +1,32 @@
+"""Imports a waifu2x VGG7 JSON model into the model-kipper parameter files."""
+
+import numpy
+import json
+import os
+import sys
+import time
+import sh_common
+
+if len(sys.argv) != 2:
+    print("import_vgg7.py JSONPATH")
+    print(" i.e. import_vgg7.py /home/you/Documents/External/waifu2x/models/vgg_7/art/scale2.0x_model.json")
+    sys.exit(1)
+
+# Re-running the import is fine: an already existing directory is reused.
+os.makedirs("model-kipper", exist_ok=True)
+
+# Close the JSON file as soon as it has been parsed.
+with open(sys.argv[1], "rb") as model_file:
+    data_list = json.load(model_file)
+
+# Each of the 7 layers becomes two parameter files: weights at index 2*i
+# and biases at index 2*i + 1, which is the order run_vgg7.py loads them in.
+for i in range(7):
+    layer = data_list[i]
+    # The weights are saved exactly as stored in the JSON; save_param flattens
+    # them in C order.
+    w = numpy.array(layer["weight"])
+    b = numpy.array(layer["bias"])
+    sh_common.save_param("kipper", (i * 2) + 0, w)
+    sh_common.save_param("kipper", (i * 2) + 1, b)
+
diff --git a/examples/neural_network_vgg7/run_vgg7.py b/examples/neural_network_vgg7/run_vgg7.py
new file mode 100644
index 000000000..f5d88e841
--- /dev/null
+++ b/examples/neural_network_vgg7/run_vgg7.py
@@ -0,0 +1,133 @@
+"""Upscales an image 2x with the imported VGG7 model, one shader dispatch per layer."""
+
+import kp
+import numpy
+import os
+import sys
+import time
+import sh_conv
+import sh_common
+
+if len(sys.argv) != 3:
+    print("run_vgg7.py INPUT OUTPUT")
+    print(" Tiling is not implemented, but padding is implemented")
+    sys.exit(1)
+
+# NOTES:
+# + Tiling is not implemented, but padding is implemented
+#   So don't run anything too big through it
+
+if False:
+    kpm = kp.Manager(1)
+    if kpm.get_device_properties()["device_name"].count("RAVEN") > 0:
+        # Python 3 can only raise exception instances, not bare strings.
+        raise RuntimeError("Safety cut-out triggered. Sorry!")
+else:
+    kpm = kp.Manager()
+
+image = sh_common.image_load(sys.argv[1])
+# Nearest-neighbour 2x enlargement of the input before it goes through the layers.
+image = image.repeat(2, 0).repeat(2, 1)
+# Each of the 7 layers trims 1 pixel from every edge, so pad 7 pixels per side
+# to make the final output exactly twice the size of the input.
+image = numpy.pad(image, [[7, 7], [7, 7], [0, 0]], mode = "edge")
+
+# Ensure image has 4 channels even though they will be unused.
+# This is because of vectorization vec4 magic.
+while image.shape[2] < sh_common.VSZ:
+    image = numpy.pad(image, [[0, 0], [0, 0], [0, 1]], mode = "constant")
+
+# sh_common.image_save("pad.png", image)
+
+# Prepare the initial tensor.
+
+tensor_in = kpm.tensor(image)
+tensor_in_h = image.shape[0]
+tensor_in_w = image.shape[1]
+tensor_in_cg = 1
+tensor_in_c = 3
+
+# Run things.
+# Output channel count of each layer, in order.
+channels = [32, 32, 64, 64, 128, 128, 3]
+
+for i, tensor_out_c in enumerate(channels):
+    # Prepare tensors.
+    # 'c' is the total amount of channels, while 'cg' is the amount of vec4s (channel-groups).
+    # This is important because weights have to be padded for the shader.
+    tensor_out_h = tensor_in_h - 2
+    tensor_out_w = tensor_in_w - 2
+    tensor_out_cg = (tensor_out_c + (sh_common.VSZ - 1)) // sh_common.VSZ
+    # TODO: How to produce a blank tensor we don't care about the contents of?
+    # This isn't being synced, and from experience so far that should handle most of it,
+    #  but what about memory usage?
+    # *Most* of these tensors live entirely on-device except when debugging.
+    # Can that be handled? (Also good question: Does it even need to be handled?)
+    tensor_out = kpm.tensor(numpy.zeros((tensor_out_h * tensor_out_w * tensor_out_cg * sh_common.VSZ)))
+    weight = kpm.tensor(sh_common.load_weights_padded("kipper", (i * 2) + 0, tensor_out_c, tensor_in_c, 3))
+    bias = kpm.tensor(sh_common.load_biases_padded("kipper", (i * 2) + 1, tensor_out_c))
+    # Compute.
+    # TODO: It'd be nice to wrap this up into a class for optimization purposes.
+    # One invocation per output pixel and channel-group, rounded up to whole
+    # workgroups of 8x2 (the local size declared in the shader).
+    workgroup = ((tensor_out_w + 7) // 8, (tensor_out_h + 1) // 2, tensor_out_cg)
+    alg = kpm.algorithm(
+        # tensors
+        [tensor_in, bias, weight, tensor_out],
+        # spirv
+        sh_conv.conv_shader,
+        # workgroup
+        workgroup,
+        # spec_consts
+        [tensor_in_w, tensor_in_h, tensor_in_cg, tensor_out_w, tensor_out_h, tensor_out_cg],
+        # push_consts
+        []
+    )
+
+    print("Step complexity " + str(workgroup))
+    print("Step channel layout " + str(tensor_in_cg) + " " + str(tensor_out_cg))
+
+    # Do this first. Keep in mind "syncs" are copies.
+    last_seq = kpm.sequence()
+    things_to_sync_to_device = [bias, weight]
+    if i == 0:
+        # For first layer, the input isn't on-device yet
+        things_to_sync_to_device.append(tensor_in)
+    last_seq.eval_async(kp.OpTensorSyncDevice(things_to_sync_to_device))
+    last_seq.eval_await()
+
+    # Prepare
+    seq = (kpm.sequence()
+        .record(kp.OpAlgoDispatch(alg, []))
+    )
+    # Run
+    seq.eval()
+
+    print("Done with step")
+
+    if False:
+        # DEBUG:
+        # We want to see the output, copy it to local
+        last_seq = kpm.sequence()
+        last_seq.eval_async(kp.OpTensorSyncLocal([tensor_out]))
+        last_seq.eval_await()
+        tensor_out.data().astype("<f4", "C").tofile("raw" + str(i) + ".bin")
+
+    # Swap over.
+    tensor_in = tensor_out
+    tensor_in_h = tensor_out_h
+    tensor_in_w = tensor_out_w
+    tensor_in_c = tensor_out_c
+    tensor_in_cg = tensor_out_cg
+
+# Download output
+fin_seq = kpm.sequence()
+fin_seq.eval_async(kp.OpTensorSyncLocal([tensor_in]))
+fin_seq.eval_await()
+
+# Output
+out_na = tensor_in.data().reshape((tensor_in_h, tensor_in_w, tensor_in_cg * sh_common.VSZ))
+# Crop off 'alpha'
+out_na = out_na[:, :, 0:3]
+sh_common.image_save(sys.argv[2], out_na)
+
diff --git a/examples/neural_network_vgg7/sh_common.py b/examples/neural_network_vgg7/sh_common.py
new file mode 100644
index 000000000..7f98c0d37
--- /dev/null
+++ b/examples/neural_network_vgg7/sh_common.py
@@ -0,0 +1,118 @@
+"""Shared helpers for the VGG7 example: image I/O and model parameter files."""
+
+from PIL import Image
+import numpy
+
+# Number of channels packed into one channel-group (one vec4 in the shader).
+VSZ = 4
+
+
+def image_load(path) -> numpy.ndarray:
+    """
+    Loads an image as a float32 array scaled to the range [0, 1].
+    Doesn't Tensor it, in case you need to do further work with it.
+    Shape is (h, w, 3); any other pixel format is converted to RGB first.
+    """
+    # The context manager closes the underlying file once the pixels are copied.
+    with Image.open(path) as img:
+        na = numpy.array(img.convert("RGB"))
+    # change type
+    return na.astype("float32") / 255.0
+
+
+def image_save(path, na: numpy.ndarray):
+    """
+    Saves an image.
+    However, note this expects a numpy array of floats in the range [0, 1];
+    values outside that range are clamped.
+    Shape is (h, w, 3).
+    """
+    # change type (scale to 0..255, clamp, then truncate to bytes)
+    na = numpy.fmax(numpy.fmin(na * 255.0, 255), 0).astype("uint8")
+    # file
+    Image.fromarray(na).save(path)
+
+
+def _param_path(mdl, idx):
+    """Returns the path of parameter file number idx of model mdl."""
+    return "model-" + mdl + "/snoop_bin_" + str(idx) + ".bin"
+
+
+def load_param(mdl, idx, expected):
+    """
+    Loads parameter file idx of model mdl as a flat array of little-endian float32.
+    Asserts that it holds exactly `expected` values.
+    """
+    npa = numpy.fromfile(_param_path(mdl, idx), "<f4")
+    assert npa.shape[0] == expected, (
+        "parameter " + str(idx) + " has " + str(npa.shape[0])
+        + " values, expected " + str(expected)
+    )
+    return npa
+
+
+def save_param(mdl, idx, data):
+    """
+    Saves data as parameter file idx of model mdl (little-endian float32, C order).
+    The model-<mdl> directory must already exist.
+    """
+    data.astype("<f4", "C").tofile(_param_path(mdl, idx))
+
+
+def _groups(channels):
+    """Returns how many channel-groups are needed to hold `channels` channels."""
+    return (channels + (VSZ - 1)) // VSZ
+
+
+def _group_padding(channels):
+    """Returns how many channels must be appended to fill the last channel-group."""
+    return (-channels) % VSZ
+
+
+def load_weights_padded(mdl, idx, tensor_out_c, tensor_in_c, weight_s):
+    """
+    Loads convolution weights and rearranges them into the layout the shader reads.
+    Input layout:  [outputChannels][inputChannels][kernelH][kernelW]
+    Output layout: [outputCGroups][kernelH][kernelW][inputCGroups][outputChannels][inputChannels]
+    Both channel counts are zero-padded up to a multiple of VSZ.
+    """
+    tensor_out_cg = _groups(tensor_out_c)
+    tensor_in_cg = _groups(tensor_in_c)
+    weight_na = load_param(mdl, idx, tensor_out_c * tensor_in_c * weight_s * weight_s)
+    # start by putting in the initial shape
+    weight_na = weight_na.reshape(tensor_out_c, tensor_in_c, weight_s, weight_s)
+    # then by padding
+    # NOTE: It is *critically important* that weight padding is done with the "zero" mode.
+    # The shader WILL NOT ignore these values, but zeroing them causes them to have no effect.
+    weight_na = numpy.pad(
+        weight_na,
+        [[0, _group_padding(tensor_out_c)], [0, _group_padding(tensor_in_c)], [0, 0], [0, 0]],
+        mode="constant",
+    )
+    # reshape to finish splitting things up
+    weight_na = weight_na.reshape(tensor_out_cg, VSZ, tensor_in_cg, VSZ, weight_s, weight_s)
+    # result is:
+    # [outputCGroups][outputChannels][inputCGroups][inputChannels][kernelH][kernelW]
+    # and move output channels to the right...
+    weight_na = numpy.moveaxis(weight_na, 1, 5)
+    # result is:
+    # [outputCGroups][inputCGroups][inputChannels][kernelH][kernelW][outputChannels]
+    # and move input channels to the right...
+    weight_na = numpy.moveaxis(weight_na, 2, 5)
+    # result is:
+    # [outputCGroups][inputCGroups][kernelH][kernelW][outputChannels][inputChannels]
+    # and move input cgroups to the right...
+    weight_na = numpy.moveaxis(weight_na, 1, 3)
+    return weight_na
+
+
+def load_biases_padded(mdl, idx, tensor_out_c):
+    """
+    Loads biases, zero-padded up to a multiple of VSZ.
+    Layout is [outputCGroups][outputChannels], flattened.
+    """
+    # biases merely need padding
+    # Again, has to be zero
+    bias_na = load_param(mdl, idx, tensor_out_c)
+    return numpy.pad(bias_na, [[0, _group_padding(tensor_out_c)]], mode="constant")
+
diff --git a/examples/neural_network_vgg7/sh_conv.py b/examples/neural_network_vgg7/sh_conv.py
new file mode 100644
index 000000000..dea3722cf
--- /dev/null
+++ b/examples/neural_network_vgg7/sh_conv.py
@@ -0,0 +1,70 @@
+import kp
+
+# Convolution & leakyrelu shader (3x3 kernel, bias add, slope 0.1), compiled from the GLSL source below at import time.
+global conv_shader  # has no effect at module level; the assignment below already creates the module global
+conv_shader = kp.Shader.compile_source("""
+#version 450
+
+layout (local_size_x = 8, local_size_y = 2) in;
+
+// [y][x][group] (vec4: channels)
+layout (set = 0, binding = 0) buffer buf_in_image { readonly restrict vec4 in_image[]; };
+// [outputCGroups] (vec4: output channels)
+layout (set = 0, binding = 1) buffer buf_in_bias { readonly restrict vec4 in_bias[]; };
+// [outputCGroups][kernelH][kernelW][inputCGroups] (mat4: input & output channels)
+layout (set = 0, binding = 2) buffer buf_in_weight { readonly restrict mat4 in_weight[]; };
+// [y][x][group] (vec4: channels)
+layout (set = 0, binding = 3) buffer buf_out_image { writeonly restrict vec4 out_image[]; };
+
+// The 'c' measures in cgroups.
+// Some maths changes as a result.
+layout (constant_id = 0) const float in_w = 0;
+layout (constant_id = 1) const float in_h = 0;
+layout (constant_id = 2) const float in_cg = 0;
+layout (constant_id = 3) const float out_w = 0;
+layout (constant_id = 4) const float out_h = 0;
+layout (constant_id = 5) const float out_cg = 0;
+
+uint index_in_no_ic(uvec2 pos) {
+    return (pos.x + (pos.y * uint(in_w))) * uint(in_cg);
+}
+
+uint index_out(uvec2 pos) {
+    return ((pos.x + (pos.y * uint(out_w))) * uint(out_cg)) + gl_GlobalInvocationID.z;
+}
+
+void main() {
+    // out x/y is gl_GlobalInvocationID.xy
+    // we need to account for workgroupy padding *here*
+    // so long as we aren't trying to output to a pixel that doesn't exist,
+    //  we won't read from any pixels that don't exist
+    if (
+        (gl_GlobalInvocationID.x < (uint(in_w) - 2)) &&
+        (gl_GlobalInvocationID.y < (uint(in_h) - 2))
+    ) {
+        vec4 value = in_bias[gl_GlobalInvocationID.z];
+        for (uint x = 0; x < 3; x++) {
+            for (uint y = 0; y < 3; y++) {
+                uint weight_ptr = ((gl_GlobalInvocationID.z * 9) + (x + (y * 3))) * uint(in_cg);
+                // specific pixel
+                // important to note is that since in position has a border around it,
+                // no further transformation is necessary (the - is implied)
+                uvec2 in_pos = gl_GlobalInvocationID.xy + uvec2(x, y);
+                uint in_ptr = index_in_no_ic(in_pos);
+                for (uint icg = 0; icg < uint(in_cg); icg++) {
+                    // input channel group
+                    vec4 iCG = in_image[in_ptr];
+                    // handle all 4 input components
+                    value += iCG * in_weight[weight_ptr];
+                    weight_ptr += 1;
+                    in_ptr += 1;
+                }
+            }
+        }
+        // leakyrelu slope 0.1
+        value = (max(value, 0.0) * 0.9) + (value * 0.1);
+        out_image[index_out(gl_GlobalInvocationID.xy)] = value;
+    }
+}
+""")
+
diff --git a/examples/neural_network_vgg7/w2wbinit.png b/examples/neural_network_vgg7/w2wbinit.png
new file mode 100644
index 000000000..fc3a908e5
Binary files /dev/null and b/examples/neural_network_vgg7/w2wbinit.png differ