Add compute benchmarks
This change adds Vulkan compute benchmarks for simple operations like
sqrt, sin, cos, exp, and log. A no-op 'mov' benchmark is also added as
a comparison point for the overhead of dispatching compute commands,
multi-threaded task scheduling, and memory bandwidth.
A single-threaded scalar C++ equivalent for these operations has also
been added for comparison purposes.
Bug: b/158231104
Change-Id: Ibe151d699ed5019c9b34fa9e038149435d781773
Kokoro-Result: kokoro <>
Reviewed-by: Alexis Hétu <>
Tested-by: Nicolas Capens <>
diff --git a/tests/VulkanBenchmarks/CMakeLists.txt b/tests/VulkanBenchmarks/CMakeLists.txt
index 62e699f..76ac1d7 100644
--- a/tests/VulkanBenchmarks/CMakeLists.txt
+++ b/tests/VulkanBenchmarks/CMakeLists.txt
@@ -23,6 +23,7 @@
+ ComputeBenchmarks.cpp
diff --git a/tests/VulkanBenchmarks/ComputeBenchmarks.cpp b/tests/VulkanBenchmarks/ComputeBenchmarks.cpp
new file mode 100644
index 0000000..f2a0d16
--- /dev/null
+++ b/tests/VulkanBenchmarks/ComputeBenchmarks.cpp
@@ -0,0 +1,332 @@
+// Copyright 2021 The SwiftShader Authors. All Rights Reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "Util.hpp"
+#include "VulkanTester.hpp"
+#include "benchmark/benchmark.h"
+#include "spirv-tools/libspirv.hpp"
+#include <cmath>
+#include <cstring>
+#include <sstream>
+// C++ reference implementation for single-threaded 'compute' operations.
+//
+// Allocates an input and an output array of state.range(0) floats, fills the
+// input with init(i) for each index i, and on every benchmark iteration stores
+// op(bufferIn[i]) into bufferOut[i] for all elements. Both arrays are freed
+// before returning.
+//
+// state - benchmark state; range(0) supplies the element count.
+// init  - callable taking the element index and returning the input value.
+// op    - callable applied to each input float.
+//
+// NOTE(review): the malloc results are not checked, and bufferOut is never
+// read back - presumably an optimizing compiler could drop the stores; confirm
+// whether benchmark::DoNotOptimize/ClobberMemory is needed here.
+template<typename Init, typename Func>
+void CppCompute(benchmark::State &state, Init init, Func op)
+ int64_t numElements = state.range(0);
+ float *bufferIn = (float *)malloc(numElements * sizeof(float));
+ float *bufferOut = (float *)malloc(numElements * sizeof(float));
+ // Prepare the input once, before entering the benchmark loop.
+ for(int64_t i = 0; i < numElements; i++)
+ {
+ bufferIn[i] = init(i);
+ }
+ // One full pass over the array per benchmark iteration.
+ for(auto _ : state)
+ {
+ for(int64_t i = 0; i < numElements; i++)
+ {
+ bufferOut[i] = op(bufferIn[i]);
+ }
+ }
+ free(bufferIn);
+ free(bufferOut);
+// Input initializer passed to CppCompute: returns 0.0f for every index (the
+// index argument is ignored).
+float zero(int64_t i)
+ return 0.0f;
+// Input initializer passed to CppCompute: returns 1.0f for every index (the
+// index argument is ignored).
+float one(int64_t i)
+ return 1.0f;
+// Registers the C++ reference benchmark once per operation, each over
+// 4 * 1024 * 1024 elements and reported in milliseconds. 'mov' uses an identity
+// lambda; sqrt and log are fed an all-ones input, the others an all-zeros input.
+BENCHMARK_CAPTURE(CppCompute, mov, zero, [](float x) { return x; })->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(CppCompute, sqrt, one, sqrtf)->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(CppCompute, sin, zero, sinf)->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(CppCompute, cos, zero, cosf)->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(CppCompute, exp, zero, expf)->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(CppCompute, log, one, logf)->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
+// Common base of the Vulkan compute benchmarks: holds a VulkanTester and calls
+// its initialize() when constructed.
+class ComputeBenchmark
+ ComputeBenchmark()
+ {
+ tester.initialize();
+ }
+ // Source of the device (and physical device) handles used by derived classes.
+ VulkanTester tester;
+// Base class for compute benchmarks that read from an input buffer and write to an
+// output buffer of the same length.
+//
+// Construction only caches the device handle. initialize() creates the Vulkan
+// objects and records the command buffer; run() submits that command buffer.
+class BufferToBufferComputeBenchmark : public ComputeBenchmark
+ // Keeps a reference to `state` (it must therefore outlive this object) and
+ // copies the tester's device handle into `device`.
+ BufferToBufferComputeBenchmark(const benchmark::State &state)
+ : state(state)
+ {
+ device = tester.getDevice();
+ }
+ // Destroys every handle listed under "Owned resources" below.
+ // NOTE(review): this runs unconditionally - presumably initialize() is
+ // always called before destruction; confirm.
+ virtual ~BufferToBufferComputeBenchmark()
+ {
+ device.destroyCommandPool(commandPool);
+ device.destroyDescriptorPool(descriptorPool);
+ device.destroyPipeline(pipeline);
+ device.destroyDescriptorSetLayout(descriptorSetLayout);
+ device.destroyBuffer(bufferIn);
+ device.destroyBuffer(bufferOut);
+ device.freeMemory(deviceMemory);
+ }
+ // Submits the command buffer recorded by initialize() and waits for the queue to go idle.
+ void run();
+ // Builds all Vulkan state for the given GLSL compute shader source.
+ void initialize(const std::string &glslShader);
+ // initialize() divides the element count by localSizeX to get the number of
+ // workgroups to dispatch.
+ uint32_t localSizeX = 128;
+ uint32_t localSizeY = 1;
+ uint32_t localSizeZ = 1;
+ // initialize() reads state.range(0) as the number of elements per buffer.
+ const benchmark::State &state;
+ // Weak references
+ vk::Device device;
+ vk::Queue queue;
+ vk::CommandBuffer commandBuffer;
+ // Owned resources
+ vk::CommandPool commandPool;
+ vk::DescriptorPool descriptorPool;
+ vk::Pipeline pipeline;
+ vk::DescriptorSetLayout descriptorSetLayout;
+ vk::DeviceMemory deviceMemory;
+ vk::Buffer bufferIn;
+ vk::Buffer bufferOut;
+// Compiles `glslShader` (GLSL compute source) to SPIR-V and creates all the
+// Vulkan state that run() relies on:
+//  - one device memory allocation holding 2 * state.range(0) uint32_t values,
+//    zero-filled, with the first half then set to 0, 1, 2, ... as input data;
+//  - bufferIn bound to the first half and bufferOut to the second half;
+//  - a compute pipeline with both buffers as storage buffers at bindings 0 and 1;
+//  - a command buffer with a single dispatch already recorded.
+//
+// The shader module and the pipeline layout are destroyed again before
+// returning; everything else is released by the destructor.
+void BufferToBufferComputeBenchmark::initialize(const std::string &glslShader)
+{
+	auto code = Util::compileGLSLtoSPIRV(glslShader.c_str(), EShLanguage::EShLangCompute);
+
+	auto &device = tester.getDevice();
+	auto &physicalDevice = tester.getPhysicalDevice();
+	queue = device.getQueue(0, 0);  // TODO: Don't assume this queue can do compute.
+
+	// Element offsets of the two buffers within the shared allocation.
+	size_t numElements = state.range(0);
+	size_t inOffset = 0;
+	size_t outOffset = numElements;
+	size_t buffersTotalElements = 2 * numElements;
+	size_t buffersSize = sizeof(uint32_t) * buffersTotalElements;
+
+	// TODO: vk::MemoryRequirements memoryRequirements = device.getBufferMemoryRequirements(buffer);
+	vk::MemoryAllocateInfo allocateInfo;
+	allocateInfo.allocationSize = buffersSize;  // TODO: memoryRequirements.size
+	allocateInfo.memoryTypeIndex = 0;           // TODO: memoryRequirements.memoryTypeBits
+	deviceMemory = device.allocateMemory(allocateInfo);
+
+	// Zero the whole allocation, then write the element index into each input slot.
+	uint32_t *buffers = (uint32_t *)device.mapMemory(deviceMemory, 0, buffersSize);
+	memset(buffers, 0, buffersSize);
+	for(size_t i = 0; i < numElements; i++)
+	{
+		buffers[inOffset + i] = (uint32_t)i;
+	}
+	device.unmapMemory(deviceMemory);
+	buffers = nullptr;
+
+	// Both buffers share one create-info: same size and usage.
+	vk::BufferCreateInfo bufferCreateInfo({}, sizeof(uint32_t) * numElements, vk::BufferUsageFlagBits::eStorageBuffer);
+	bufferIn = device.createBuffer(bufferCreateInfo);
+	device.bindBufferMemory(bufferIn, deviceMemory, sizeof(uint32_t) * inOffset);
+	bufferOut = device.createBuffer(bufferCreateInfo);
+	device.bindBufferMemory(bufferOut, deviceMemory, sizeof(uint32_t) * outOffset);
+
+	// codeSize is expressed in bytes, pCode points at the SPIR-V words.
+	vk::ShaderModuleCreateInfo moduleCreateInfo;
+	moduleCreateInfo.codeSize = code.size() * sizeof(uint32_t);
+	moduleCreateInfo.pCode = (uint32_t *)code.data();
+	vk::ShaderModule shaderModule = device.createShaderModule(moduleCreateInfo);
+
+	// Binding 0: input storage buffer. Binding 1: output storage buffer.
+	vk::DescriptorSetLayoutBinding in;
+	in.binding = 0;
+	in.descriptorCount = 1;
+	in.descriptorType = vk::DescriptorType::eStorageBuffer;
+	in.stageFlags = vk::ShaderStageFlagBits::eCompute;
+
+	vk::DescriptorSetLayoutBinding out;
+	out.binding = 1;
+	out.descriptorCount = 1;
+	out.descriptorType = vk::DescriptorType::eStorageBuffer;
+	out.stageFlags = vk::ShaderStageFlagBits::eCompute;
+
+	std::vector<vk::DescriptorSetLayoutBinding> setLayoutBindings = { in, out };
+	vk::DescriptorSetLayoutCreateInfo layoutInfo;
+	layoutInfo.bindingCount = static_cast<uint32_t>(setLayoutBindings.size());
+	layoutInfo.pBindings = setLayoutBindings.data();
+	descriptorSetLayout = device.createDescriptorSetLayout(layoutInfo);
+
+	vk::PipelineLayoutCreateInfo pipelineLayoutCreateInfo;
+	pipelineLayoutCreateInfo.setLayoutCount = 1;
+	pipelineLayoutCreateInfo.pSetLayouts = &descriptorSetLayout;
+	vk::PipelineLayout pipelineLayout = device.createPipelineLayout(pipelineLayoutCreateInfo);
+
+	vk::ComputePipelineCreateInfo computePipelineCreateInfo;
+	computePipelineCreateInfo.layout = pipelineLayout;
+	computePipelineCreateInfo.stage.stage = vk::ShaderStageFlagBits::eCompute;
+	computePipelineCreateInfo.stage.module = shaderModule;
+	computePipelineCreateInfo.stage.pName = "main";
+	pipeline = device.createComputePipeline({}, computePipelineCreateInfo).value;
+
+	// "A shader module can be destroyed while pipelines created using its shaders are still in use."
+	device.destroyShaderModule(shaderModule);
+
+	// One descriptor set holding the two storage-buffer descriptors.
+	std::array<vk::DescriptorPoolSize, 1> poolSizes = {};
+	poolSizes[0].type = vk::DescriptorType::eStorageBuffer;
+	poolSizes[0].descriptorCount = 2;
+	vk::DescriptorPoolCreateInfo descriptorPoolCreateInfo;
+	descriptorPoolCreateInfo.maxSets = 1;
+	descriptorPoolCreateInfo.poolSizeCount = static_cast<uint32_t>(poolSizes.size());
+	descriptorPoolCreateInfo.pPoolSizes = poolSizes.data();
+	descriptorPool = device.createDescriptorPool(descriptorPoolCreateInfo);
+
+	vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo;
+	descriptorSetAllocateInfo.descriptorPool = descriptorPool;
+	descriptorSetAllocateInfo.descriptorSetCount = 1;
+	descriptorSetAllocateInfo.pSetLayouts = &descriptorSetLayout;
+	auto descriptorSets = device.allocateDescriptorSets(descriptorSetAllocateInfo);
+
+	vk::DescriptorBufferInfo inBufferInfo;
+	inBufferInfo.buffer = bufferIn;
+	inBufferInfo.offset = 0;
+	inBufferInfo.range = VK_WHOLE_SIZE;
+
+	vk::DescriptorBufferInfo outBufferInfo;
+	outBufferInfo.buffer = bufferOut;
+	outBufferInfo.offset = 0;
+	outBufferInfo.range = VK_WHOLE_SIZE;
+
+	std::array<vk::WriteDescriptorSet, 2> descriptorWrites = {};
+
+	descriptorWrites[0].dstSet = descriptorSets[0];
+	descriptorWrites[0].dstBinding = 0;
+	descriptorWrites[0].dstArrayElement = 0;
+	descriptorWrites[0].descriptorType = vk::DescriptorType::eStorageBuffer;
+	descriptorWrites[0].descriptorCount = 1;
+	descriptorWrites[0].pBufferInfo = &inBufferInfo;
+
+	descriptorWrites[1].dstSet = descriptorSets[0];
+	descriptorWrites[1].dstBinding = 1;
+	descriptorWrites[1].dstArrayElement = 0;
+	descriptorWrites[1].descriptorType = vk::DescriptorType::eStorageBuffer;
+	descriptorWrites[1].descriptorCount = 1;
+	descriptorWrites[1].pBufferInfo = &outBufferInfo;
+
+	device.updateDescriptorSets(static_cast<uint32_t>(descriptorWrites.size()), descriptorWrites.data(), 0, nullptr);
+
+	vk::CommandPoolCreateInfo commandPoolCreateInfo;
+	commandPoolCreateInfo.queueFamilyIndex = 0;  // TODO: Don't assume queue family 0 can do compute.
+	commandPoolCreateInfo.flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer;
+	commandPool = device.createCommandPool(commandPoolCreateInfo);
+
+	vk::CommandBufferAllocateInfo commandBufferAllocateInfo;
+	commandBufferAllocateInfo.commandPool = commandPool;
+	commandBufferAllocateInfo.commandBufferCount = 1;
+	commandBufferAllocateInfo.level = vk::CommandBufferLevel::ePrimary;
+	auto commandBuffers = device.allocateCommandBuffers(commandBufferAllocateInfo);
+
+	// Record the command buffer
+	commandBuffer = commandBuffers[0];
+
+	vk::CommandBufferBeginInfo commandBufferBeginInfo;
+	commandBuffer.begin(commandBufferBeginInfo);
+
+	commandBuffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline);
+	commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0, 1, &descriptorSets[0], 0, nullptr);
+
+	// numElements / localSizeX workgroups: every element is only covered when the
+	// shader's local_size_x equals localSizeX and numElements is a multiple of it.
+	commandBuffer.dispatch((uint32_t)(numElements / localSizeX), 1, 1);
+
+	commandBuffer.end();
+
+	// Destroy objects we don't have to hold on to after command buffer recording.
+	// "A VkPipelineLayout object must not be destroyed while any command buffer that uses it is in the recording state."
+	device.destroyPipelineLayout(pipelineLayout);
+}
+// Submits the member command buffer to `queue` and blocks until the queue is
+// idle, i.e. one call performs one complete execution of the recorded work.
+void BufferToBufferComputeBenchmark::run()
+ vk::SubmitInfo submitInfo;
+ submitInfo.commandBufferCount = 1;
+ submitInfo.pCommandBuffers = &commandBuffer;
+ queue.submit(submitInfo);
+ queue.waitIdle();
+// Performs an operation `op` on each element.
+//
+// Generates a GLSL compute shader that reads one float from the input buffer
+// (binding 0), applies `op` to it and writes the result to the same index of
+// the output buffer (binding 1), then hands the source to initialize().
+// An empty `op` yields a plain copy.
+class ComputeOp : public BufferToBufferComputeBenchmark
+{
+public:
+	// state - benchmark state, forwarded to the base class.
+	// op    - name of a GLSL function taking and returning a float (e.g. "sqrt"),
+	//         or "" for no operation.
+	ComputeOp(const benchmark::State &state, const char *op)
+	    : BufferToBufferComputeBenchmark(state)
+	{
+		std::stringstream src;
+
+		// The workgroup size is taken from localSizeX/Y/Z rather than hard-coded:
+		// initialize() dispatches numElements / localSizeX groups, so a shader
+		// declaring a smaller local size would leave most elements unprocessed.
+		src << "#version 450\n"
+		    << "layout(local_size_x = " << localSizeX
+		    << ", local_size_y = " << localSizeY
+		    << ", local_size_z = " << localSizeZ << ") in;\n"
+		    << R"(
+			layout(binding = 0, std430) buffer InBuffer
+			{
+				float Data[];
+			} In;
+			layout(binding = 1, std430) buffer OutBuffer
+			{
+				float Data[];
+			} Out;
+			void main()
+			{
+				float x = In.Data[gl_GlobalInvocationID.x];
+				Out.Data[gl_GlobalInvocationID.x] = )"
+		    << op << R"( (x);
+			})";
+
+		initialize(src.str());
+	}
+};
+// Benchmark entry point: builds a ComputeOp for the GLSL operation `op` and
+// times one submit-and-wait of its command buffer per benchmark iteration.
+static void Compute(benchmark::State &state, const char *op)
+{
+	ComputeOp benchmark(state, op);
+
+	// Execute once to have the Reactor routine generated.
+	benchmark.run();
+
+	for(auto _ : state)
+	{
+		benchmark.run();
+	}
+}
+// Registers the Vulkan compute benchmark once per shader operation, reported in
+// milliseconds with process CPU time measurement enabled. 'mov' passes an empty
+// operation string and sweeps element counts from 128 to 4 * 1024 * 1024 in
+// powers of two; the other operations run at 4 * 1024 * 1024 elements only.
+BENCHMARK_CAPTURE(Compute, mov, "")->RangeMultiplier(2)->Range(128, 4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
+BENCHMARK_CAPTURE(Compute, sqrt, "sqrt")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
+BENCHMARK_CAPTURE(Compute, sin, "sin")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
+BENCHMARK_CAPTURE(Compute, cos, "cos")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
+BENCHMARK_CAPTURE(Compute, exp, "exp")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
+BENCHMARK_CAPTURE(Compute, log, "log")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
diff --git a/tests/VulkanUnitTests/Device.cpp b/tests/VulkanUnitTests/Device.cpp
index 462492f..7cffc43 100644
--- a/tests/VulkanUnitTests/Device.cpp
+++ b/tests/VulkanUnitTests/Device.cpp
@@ -446,7 +446,7 @@
nullptr, // pSignalSemaphores
- VkResult result = driver->vkQueueSubmit(queue, 1, &info, 0);
+ VkResult result = driver->vkQueueSubmit(queue, 1, &info, VK_NULL_HANDLE);
if(result != VK_SUCCESS)
return result;