#if defined(_WIN32) #pragma comment(linker, "/subsystem:console") #endif // SPDLOG_ACTIVE_LEVEL must be defined before spdlog.h import #if DEBUG #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG #endif #include #include #include #include #include #include // ranges.h must come after spdlog.h #include #include #include #include "Manager.hpp" #include "OpCreateTensor.hpp" #include "OpMult.hpp" #include "Tensor.hpp" #define BUFFER_ELEMENTS 32 #if DEBUG static VKAPI_ATTR VkBool32 VKAPI_CALL debugMessageCallback(VkDebugReportFlagsEXT flags, VkDebugReportObjectTypeEXT objectType, uint64_t object, size_t location, int32_t messageCode, const char* pLayerPrefix, const char* pMessage, void* pUserData) { SPDLOG_DEBUG("[VALIDATION]: {} - {}", pLayerPrefix, pMessage); return VK_FALSE; } #endif class VulkanCompute { public: vk::Instance mInstance; vk::PhysicalDevice mPhysicalDevice; vk::Device mDevice; vk::Queue mComputeQueue; vk::DescriptorPool mDescriptorPool; vk::DescriptorSetLayout mDescriptorSetLayout; vk::PipelineLayout mPipelineLayout; vk::DescriptorSet mDescriptorSet; vk::PipelineCache mPipelineCache; vk::ShaderModule mShaderModule; vk::Pipeline mPipeline; vk::CommandPool mCommandPool; vk::CommandBuffer mCommandBuffer; uint32_t mComputeQueueFamilyIndex; #if DEBUG vk::DebugReportCallbackEXT mDebugReportCallback; vk::DispatchLoaderDynamic mDebugDispatcher; #endif void createBuffer(const vk::BufferUsageFlags& aUsageFlags, const vk::MemoryPropertyFlags& aMemoryPropertyFlags, vk::Buffer* aBuffer, vk::DeviceMemory* aMemory, vk::DeviceSize aSize, void* data = nullptr) const { SPDLOG_DEBUG("Creating buffer: {}, {}, {}", vk::to_string(aUsageFlags), vk::to_string(aMemoryPropertyFlags), aSize); vk::BufferCreateInfo bufferCreateInfo(vk::BufferCreateFlags(), aSize, aUsageFlags, vk::SharingMode::eExclusive); *aBuffer = this->mDevice.createBuffer(bufferCreateInfo); vk::PhysicalDeviceMemoryProperties deviceMemoryProperties = this->mPhysicalDevice.getMemoryProperties(); vk::MemoryRequirements memReqs = this->mDevice.getBufferMemoryRequirements(*aBuffer); uint32_t memoryTypeIndex = -1; for (uint32_t i = 0; i < deviceMemoryProperties.memoryTypeCount; i++) { if (memReqs.memoryTypeBits & (1 << i)) { if ((deviceMemoryProperties.memoryTypes[i].propertyFlags & aMemoryPropertyFlags) == aMemoryPropertyFlags) { memoryTypeIndex = i; break; } } } if (memoryTypeIndex < 0) { throw std::runtime_error( "Memory type index for buffer creation not found"); } vk::MemoryAllocateInfo memoryAllocateInfo(memReqs.size, memoryTypeIndex); *aMemory = this->mDevice.allocateMemory(memoryAllocateInfo); this->mDevice.bindBufferMemory(*aBuffer, *aMemory, 0); } VulkanCompute() { vk::ApplicationInfo applicationInfo; applicationInfo.pApplicationName = "Vulkan compute"; applicationInfo.pEngineName = "VulkanCompute"; applicationInfo.apiVersion = VK_API_VERSION_1_2; std::vector applicationExtensions; applicationExtensions.push_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME); vk::InstanceCreateInfo computeInstanceCreateInfo; computeInstanceCreateInfo.pApplicationInfo = &applicationInfo; if (!applicationExtensions.empty()) { computeInstanceCreateInfo.enabledExtensionCount = (uint32_t)applicationExtensions.size(); computeInstanceCreateInfo.ppEnabledExtensionNames = applicationExtensions.data(); } #if DEBUG // We'll identify the layers that are supported std::vector validLayerNames; std::vector desiredLayerNames = { "VK_LAYER_LUNARG_assistant_layer", "VK_LAYER_LUNARG_standard_validation" }; // Identify the valid layer names based on the desiredLayerNames { std::set uniqueLayerNames; std::vector availableLayerProperties = vk::enumerateInstanceLayerProperties(); for (vk::LayerProperties layerProperties : availableLayerProperties) { std::string layerName(layerProperties.layerName); uniqueLayerNames.insert(layerName); } for (const char* desiredLayerName : desiredLayerNames) { if (uniqueLayerNames.count(desiredLayerName) != 0) { validLayerNames.push_back(desiredLayerName); } } } if (validLayerNames.size() > 0) { computeInstanceCreateInfo.enabledLayerCount = (uint32_t)validLayerNames.size(); computeInstanceCreateInfo.ppEnabledLayerNames = validLayerNames.data(); } #endif this->mInstance = vk::createInstance(computeInstanceCreateInfo); #if DEBUG if (validLayerNames.size() > 0) { vk::DebugReportFlagsEXT debugFlags = vk::DebugReportFlagBitsEXT::eError | vk::DebugReportFlagBitsEXT::eWarning; vk::DebugReportCallbackCreateInfoEXT debugCreateInfo = {}; debugCreateInfo.pfnCallback = (PFN_vkDebugReportCallbackEXT)debugMessageCallback; debugCreateInfo.flags = debugFlags; this->mDebugDispatcher.init(this->mInstance, &vkGetInstanceProcAddr); this->mDebugReportCallback = this->mInstance.createDebugReportCallbackEXT( debugCreateInfo, nullptr, this->mDebugDispatcher); } #endif // Find device (currently only pick first device) { std::vector physicalDevices = this->mInstance.enumeratePhysicalDevices(); this->mPhysicalDevice = physicalDevices[0]; vk::PhysicalDeviceProperties physicalDeviceProperties = this->mPhysicalDevice.getProperties(); spdlog::info("Device {}", physicalDeviceProperties.deviceName); } { spdlog::info("Finding compute queue"); // Find compute queue std::vector allQueueFamilyProperties = this->mPhysicalDevice.getQueueFamilyProperties(); this->mComputeQueueFamilyIndex = -1; for (uint32_t i = 0; i < allQueueFamilyProperties.size(); i++) { vk::QueueFamilyProperties queueFamilyProperties = allQueueFamilyProperties[i]; if (queueFamilyProperties.queueFlags & vk::QueueFlagBits::eCompute) { this->mComputeQueueFamilyIndex = i; break; } } if (this->mComputeQueueFamilyIndex < 0) { spdlog::critical("Compute queue is not supported"); } const float defaultQueuePriority(0.0f); const uint32_t defaultQueueCount(1); vk::DeviceQueueCreateInfo deviceQueueCreateInfo( vk::DeviceQueueCreateFlags(), this->mComputeQueueFamilyIndex, defaultQueueCount, &defaultQueuePriority); vk::DeviceCreateInfo deviceCreateInfo( vk::DeviceCreateFlags(), 1, // Number of deviceQueueCreateInfo &deviceQueueCreateInfo); this->mDevice = this->mPhysicalDevice.createDevice(deviceCreateInfo); this->mComputeQueue = this->mDevice.getQueue(this->mComputeQueueFamilyIndex, 0); } /* Create command pool */ { vk::CommandPoolCreateInfo commandPoolInfo( vk::CommandPoolCreateFlags(), this->mComputeQueueFamilyIndex); this->mCommandPool = this->mDevice.createCommandPool(commandPoolInfo); } /* Prepare storage buffers */ std::vector computeInput(BUFFER_ELEMENTS); std::vector computeOutput(BUFFER_ELEMENTS); // Fill input data uint32_t n = 0; std::generate( computeInput.begin(), computeInput.end(), [&n] { return n++; }); const VkDeviceSize bufferSize = BUFFER_ELEMENTS * sizeof(uint32_t); vk::Buffer hostBuffer, deviceBuffer; vk::DeviceMemory hostMemory, deviceMemory; { createBuffer(vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst, vk::MemoryPropertyFlagBits::eHostVisible, &hostBuffer, &hostMemory, bufferSize); createBuffer(vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst, vk::MemoryPropertyFlagBits::eDeviceLocal, &deviceBuffer, &deviceMemory, bufferSize); } /* Copy data to host memory */ { void* mapped = this->mDevice.mapMemory( hostMemory, 0, bufferSize, vk::MemoryMapFlags()); memcpy(mapped, computeInput.data(), bufferSize); vk::MappedMemoryRange mappedRange(hostMemory, 0, bufferSize); this->mDevice.flushMappedMemoryRanges(1, &mappedRange); this->mDevice.unmapMemory(hostMemory); } /* Copy data from host memory to staging buffer */ { spdlog::info("Copying data from host memory to staging buffer"); vk::CommandBufferAllocateInfo commandBufferAllocateInfo( this->mCommandPool, vk::CommandBufferLevel::ePrimary, 1); std::vector copyCommandBuffers = this->mDevice.allocateCommandBuffers(commandBufferAllocateInfo); vk::CommandBuffer copyCommandBuffer = copyCommandBuffers[0]; copyCommandBuffer.begin(vk::CommandBufferBeginInfo()); { vk::BufferCopy copyRegion(0, 0, bufferSize); copyCommandBuffer.copyBuffer( hostBuffer, deviceBuffer, copyRegion); } copyCommandBuffer.end(); const vk::PipelineStageFlags waitStageMask = vk::PipelineStageFlagBits::eTransfer; vk::SubmitInfo submitInfo( 0, nullptr, &waitStageMask, 1, ©CommandBuffer); vk::Fence fence = this->mDevice.createFence(vk::FenceCreateInfo()); this->mComputeQueue.submit(1, &submitInfo, fence); this->mDevice.waitForFences(1, &fence, VK_TRUE, UINT64_MAX); this->mDevice.destroy(fence); this->mDevice.freeCommandBuffers( this->mCommandPool, 1, ©CommandBuffer); } { std::vector poolSizes = { vk::DescriptorPoolSize(vk::DescriptorType::eStorageBuffer, 1) }; vk::DescriptorPoolCreateInfo descriptorPoolInfo( vk::DescriptorPoolCreateFlags(), 1, static_cast(poolSizes.size()), poolSizes.data()); this->mDescriptorPool = this->mDevice.createDescriptorPool(descriptorPoolInfo); std::vector setLayoutBindings = { vk::DescriptorSetLayoutBinding( 0, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute) }; vk::DescriptorSetLayoutCreateInfo descriptorSetLayoutInfo( vk::DescriptorSetLayoutCreateFlags(), static_cast(setLayoutBindings.size()), setLayoutBindings.data()); this->mDescriptorSetLayout = this->mDevice.createDescriptorSetLayout(descriptorSetLayoutInfo); // For simplicity we don't create an array and pass a single // descriptorSetLayout vk::PipelineLayoutCreateInfo pipelineLayoutCreateInfo( vk::PipelineLayoutCreateFlags(), 1, &this->mDescriptorSetLayout); this->mPipelineLayout = this->mDevice.createPipelineLayout(pipelineLayoutCreateInfo); vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo( this->mDescriptorPool, 1, &this->mDescriptorSetLayout); std::vector descriptorSets = this->mDevice.allocateDescriptorSets(descriptorSetAllocateInfo); this->mDescriptorSet = descriptorSets[0]; vk::DescriptorBufferInfo descriptorBufferInfo( deviceBuffer, 0, VK_WHOLE_SIZE); std::vector computeWriteDescriptorSets = { vk::WriteDescriptorSet(this->mDescriptorSet, 0, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptorBufferInfo) }; this->mDevice.updateDescriptorSets(computeWriteDescriptorSets, nullptr); } { struct SpecializationData { uint32_t BUFFER_ELEMENT_COUNT = BUFFER_ELEMENTS; } specializationData; vk::SpecializationMapEntry specializationMapEntry( 0, 0, sizeof(SpecializationData)); vk::SpecializationInfo specializationInfo( 1, &specializationMapEntry, sizeof(SpecializationData), &specializationData); const std::string shadersPath = "shaders/glsl/"; const std::string shaderFilePath = shadersPath + "computeheadless.comp.spv"; spdlog::info("Shader file path: {}", shaderFilePath); SPDLOG_DEBUG("Reading file"); std::ifstream fileStream( shaderFilePath, std::ios::binary | std::ios::in | std::ios::ate); size_t shaderFileSize = fileStream.tellg(); fileStream.seekg(0, std::ios::beg); char* shaderFileData = new char[shaderFileSize]; fileStream.read(shaderFileData, shaderFileSize); fileStream.close(); SPDLOG_DEBUG("Converting the read file into module"); vk::ShaderModuleCreateInfo shaderModuleInfo( vk::ShaderModuleCreateFlags(), shaderFileSize, (uint32_t*)shaderFileData); this->mShaderModule = this->mDevice.createShaderModule(shaderModuleInfo); SPDLOG_DEBUG("Converting to shader stage"); vk::PipelineShaderStageCreateInfo shaderStage( vk::PipelineShaderStageCreateFlags(), vk::ShaderStageFlagBits::eCompute, this->mShaderModule, "main", &specializationInfo); vk::PipelineCacheCreateInfo pipelineCacheCreateInfo; this->mPipelineCache = this->mDevice.createPipelineCache(pipelineCacheCreateInfo); vk::ComputePipelineCreateInfo computePipelineCreateInfo( vk::PipelineCreateFlags(), shaderStage, this->mPipelineLayout, vk::Pipeline(), 0); vk::ResultValue pipelineResult = this->mDevice.createComputePipeline(this->mPipelineCache, computePipelineCreateInfo); if (pipelineResult.result != vk::Result::eSuccess) { throw std::runtime_error("Failed to create pipeline result: " + vk::to_string(pipelineResult.result)); } this->mPipeline = pipelineResult.value; } { vk::CommandBufferAllocateInfo cmdBufferAllocInfo( this->mCommandPool, vk::CommandBufferLevel::ePrimary, 1); std::vector cmdBuffers = this->mDevice.allocateCommandBuffers(cmdBufferAllocInfo); this->mCommandBuffer = cmdBuffers[0]; this->mCommandBuffer.begin(vk::CommandBufferBeginInfo()); { // Barrier to ensure input transfer is finished before compute // shader reads from it vk::BufferMemoryBarrier bufferMemoryBarrier; bufferMemoryBarrier.buffer = deviceBuffer; bufferMemoryBarrier.size = VK_WHOLE_SIZE; bufferMemoryBarrier.srcAccessMask = vk::AccessFlagBits::eHostWrite; bufferMemoryBarrier.dstAccessMask = vk::AccessFlagBits::eShaderRead; bufferMemoryBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; bufferMemoryBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; this->mCommandBuffer.pipelineBarrier( vk::PipelineStageFlagBits::eHost, vk::PipelineStageFlagBits::eComputeShader, vk::DependencyFlags(), nullptr, bufferMemoryBarrier, nullptr); this->mCommandBuffer.bindPipeline( vk::PipelineBindPoint::eCompute, this->mPipeline); this->mCommandBuffer.bindDescriptorSets( vk::PipelineBindPoint::eCompute, this->mPipelineLayout, 0, this->mDescriptorSet, nullptr); this->mCommandBuffer.dispatch(BUFFER_ELEMENTS / 4, 1, 1); // Barrier to ensure that shader writes are finished before // buffer is read back from GPU bufferMemoryBarrier.srcAccessMask = vk::AccessFlagBits::eShaderWrite; bufferMemoryBarrier.dstAccessMask = vk::AccessFlagBits::eTransferRead; this->mCommandBuffer.pipelineBarrier( vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlags(), nullptr, bufferMemoryBarrier, nullptr); // Read back to host visible buffer vk::BufferCopy copyRegion(0, 0, bufferSize); this->mCommandBuffer.copyBuffer( deviceBuffer, hostBuffer, copyRegion); // Barrier to ensure that buffer copy is finished before host // reading from it bufferMemoryBarrier.srcAccessMask = vk::AccessFlagBits::eTransferWrite; bufferMemoryBarrier.dstAccessMask = vk::AccessFlagBits::eHostRead; bufferMemoryBarrier.buffer = hostBuffer; this->mCommandBuffer.pipelineBarrier( vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eHost, vk::DependencyFlags(), nullptr, bufferMemoryBarrier, nullptr); } this->mCommandBuffer.end(); } { vk::Fence fence = this->mDevice.createFence(vk::FenceCreateInfo()); const vk::PipelineStageFlags waitStageMask = vk::PipelineStageFlagBits::eTransfer; vk::SubmitInfo computeSubmitInfo( 0, nullptr, &waitStageMask, 1, &this->mCommandBuffer); this->mComputeQueue.submit(computeSubmitInfo, fence); this->mDevice.waitForFences(fence, VK_TRUE, UINT64_MAX); this->mDevice.destroy(fence); } { // Make device writes visible to host void* mapped = this->mDevice.mapMemory( hostMemory, 0, VK_WHOLE_SIZE, vk::MemoryMapFlags()); vk::MappedMemoryRange mappedMemoryRange( hostMemory, 0, VK_WHOLE_SIZE); this->mDevice.invalidateMappedMemoryRanges(mappedMemoryRange); memcpy(computeOutput.data(), mapped, bufferSize); this->mDevice.unmapMemory(hostMemory); } { this->mComputeQueue.waitIdle(); spdlog::info("Compute input: {}", computeInput); spdlog::info("Compute output: {}", computeOutput); } { this->mDevice.destroy(deviceBuffer); this->mDevice.freeMemory(deviceMemory); this->mDevice.destroy(hostBuffer); this->mDevice.freeMemory(hostMemory); } } ~VulkanCompute() { this->mDevice.destroy(this->mPipelineLayout); this->mDevice.destroy(this->mDescriptorSetLayout); this->mDevice.destroy(this->mDescriptorPool); this->mDevice.destroy(this->mPipeline); this->mDevice.destroy(this->mPipelineCache); this->mDevice.destroy(this->mCommandPool); this->mDevice.destroy(this->mShaderModule); this->mDevice.destroy(); #if DEBUG if (this->mDebugReportCallback) { this->mInstance.destroyDebugReportCallbackEXT( this->mDebugReportCallback, nullptr, this->mDebugDispatcher); } #endif this->mInstance.destroy(); } }; int main() { #if DEBUG spdlog::set_level(spdlog::level::debug); #else spdlog::set_level(spdlog::level::info); #endif try { // VulkanCompute* vulkanExample = new VulkanCompute(); // spdlog::info("Finished."); // delete (vulkanExample); // Run Kompute spdlog::info("Creating manager"); kp::Manager mgr; spdlog::info("Creating first tensor"); std::shared_ptr tensorLHS{ new kp::Tensor( { 0.0, 1.0, 2.0 }) }; mgr.evalOp({ tensorLHS }); spdlog::info("Creating second tensor"); std::shared_ptr tensorRHS{ new kp::Tensor( { 2.0, 4.0, 6.0 }) }; mgr.evalOp({ tensorRHS }); // TODO: Add capabilities for just output tensor types spdlog::info("Creating output tensor"); std::shared_ptr tensorOutput{ new kp::Tensor( { 0.0, 0.0, 0.0 }) }; mgr.evalOp({ tensorOutput }); spdlog::info("OpCreateTensor success for tensors"); spdlog::info("Tensor one: {}", tensorLHS->data()); spdlog::info("Tensor two: {}", tensorRHS->data()); spdlog::info("Tensor output: {}", tensorOutput->data()); spdlog::info("Calling op mult"); mgr.evalOp({ tensorLHS, tensorRHS, tensorOutput }); spdlog::info("OpMult call success"); spdlog::info("Tensor output: {}", tensorOutput->data()); spdlog::info("Called manager eval success END PROGRAM"); return 0; } catch (const std::exception& exc) { spdlog::error("Exception caught: {}", exc.what()); return 1; } catch (...) { spdlog::error("Uncaught exception"); return 1; } }