Add compute benchmarks

This change adds Vulkan compute benchmarks for simple operations like
sqrt, sin, cos, exp, and log. A no-op 'mov' benchmark is also added as
a comparison point for the overhead of dispatching compute commands,
multi-threaded task scheduling, and memory bandwidth.

A single-threaded scalar C++ equivalent for these operations has also
been added for comparison purposes.

Bug: b/158231104
Change-Id: Ibe151d699ed5019c9b34fa9e038149435d781773
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/62128
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/tests/VulkanBenchmarks/CMakeLists.txt b/tests/VulkanBenchmarks/CMakeLists.txt
index 62e699f..76ac1d7 100644
--- a/tests/VulkanBenchmarks/CMakeLists.txt
+++ b/tests/VulkanBenchmarks/CMakeLists.txt
@@ -23,6 +23,7 @@
 
 set(VULKAN_BENCHMARKS_SRC_FILES
     ClearImageBenchmarks.cpp
+    ComputeBenchmarks.cpp
     main.cpp
     TriangleBenchmarks.cpp
 )
diff --git a/tests/VulkanBenchmarks/ComputeBenchmarks.cpp b/tests/VulkanBenchmarks/ComputeBenchmarks.cpp
new file mode 100644
index 0000000..f2a0d16
--- /dev/null
+++ b/tests/VulkanBenchmarks/ComputeBenchmarks.cpp
@@ -0,0 +1,332 @@
+// Copyright 2021 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Util.hpp"
+#include "VulkanTester.hpp"
+
+#include "benchmark/benchmark.h"
+#include "spirv-tools/libspirv.hpp"
+
+#include <cmath>
+#include <cstring>
+#include <sstream>
+
+// C++ reference implementation for single-threaded 'compute' operations.
+template<typename Init, typename Func>
+void CppCompute(benchmark::State &state, Init init, Func op)
+{
+	int64_t numElements = state.range(0);
+	float *bufferIn = (float *)malloc(numElements * sizeof(float));
+	float *bufferOut = (float *)malloc(numElements * sizeof(float));
+
+	for(int64_t i = 0; i < numElements; i++)
+	{
+		bufferIn[i] = init(i);
+	}
+
+	for(auto _ : state)
+	{
+		for(int64_t i = 0; i < numElements; i++)
+		{
+			bufferOut[i] = op(bufferIn[i]);
+		}
+	}
+
+	free(bufferIn);
+	free(bufferOut);
+}
+
+float zero(int64_t i)
+{
+	return 0.0f;
+}
+
+float one(int64_t i)
+{
+	return 1.0f;
+}
+
+BENCHMARK_CAPTURE(CppCompute, mov, zero, [](float x) { return x; })->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(CppCompute, sqrt, one, sqrtf)->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(CppCompute, sin, zero, sinf)->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(CppCompute, cos, zero, cosf)->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(CppCompute, exp, zero, expf)->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(CppCompute, log, one, logf)->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond);
+
+class ComputeBenchmark
+{
+protected:
+	ComputeBenchmark()
+	{
+		tester.initialize();
+	}
+
+	VulkanTester tester;
+};
+
+// Base class for compute benchmarks that read from an input buffer and write to an
+// output buffer of the same length.
+class BufferToBufferComputeBenchmark : public ComputeBenchmark
+{
+public:
+	BufferToBufferComputeBenchmark(const benchmark::State &state)
+	    : state(state)
+	{
+		device = tester.getDevice();
+	}
+
+	virtual ~BufferToBufferComputeBenchmark()
+	{
+		device.destroyCommandPool(commandPool);
+		device.destroyDescriptorPool(descriptorPool);
+		device.destroyPipeline(pipeline);
+		device.destroyDescriptorSetLayout(descriptorSetLayout);
+		device.destroyBuffer(bufferIn);
+		device.destroyBuffer(bufferOut);
+		device.freeMemory(deviceMemory);
+	}
+
+	void run();
+
+protected:
+	void initialize(const std::string &glslShader);
+
+	uint32_t localSizeX = 128;
+	uint32_t localSizeY = 1;
+	uint32_t localSizeZ = 1;
+
+private:
+	const benchmark::State &state;
+
+	// Weak references
+	vk::Device device;
+	vk::Queue queue;
+	vk::CommandBuffer commandBuffer;
+
+	// Owned resources
+	vk::CommandPool commandPool;
+	vk::DescriptorPool descriptorPool;
+	vk::Pipeline pipeline;
+	vk::DescriptorSetLayout descriptorSetLayout;
+	vk::DeviceMemory deviceMemory;
+	vk::Buffer bufferIn;
+	vk::Buffer bufferOut;
+};
+
+void BufferToBufferComputeBenchmark::initialize(const std::string &glslShader)
+{
+	auto code = Util::compileGLSLtoSPIRV(glslShader.c_str(), EShLanguage::EShLangCompute);
+
+	auto &device = tester.getDevice();
+	auto &physicalDevice = tester.getPhysicalDevice();
+	queue = device.getQueue(0, 0);  // TODO: Don't assume this queue can do compute.
+
+	size_t numElements = state.range(0);
+	size_t inOffset = 0;
+	size_t outOffset = numElements;
+	size_t buffersTotalElements = 2 * numElements;
+	size_t buffersSize = sizeof(uint32_t) * buffersTotalElements;
+
+	// TODO: vk::MemoryRequirements memoryRequirements = device.getBufferMemoryRequirements(buffer);
+	vk::MemoryAllocateInfo allocateInfo;
+	allocateInfo.allocationSize = buffersSize;  // TODO: memoryRequirements.size
+	allocateInfo.memoryTypeIndex = 0;           // TODO: memoryRequirements.memoryTypeBits
+	deviceMemory = device.allocateMemory(allocateInfo);
+
+	uint32_t *buffers = (uint32_t *)device.mapMemory(deviceMemory, 0, buffersSize);
+	memset(buffers, 0, buffersSize);
+
+	for(size_t i = 0; i < numElements; i++)
+	{
+		buffers[inOffset + i] = (uint32_t)i;
+	}
+
+	device.unmapMemory(deviceMemory);
+	buffers = nullptr;
+
+	vk::BufferCreateInfo bufferCreateInfo({}, sizeof(uint32_t) * numElements, vk::BufferUsageFlagBits::eStorageBuffer);
+	bufferIn = device.createBuffer(bufferCreateInfo);
+	device.bindBufferMemory(bufferIn, deviceMemory, sizeof(uint32_t) * inOffset);
+
+	bufferOut = device.createBuffer(bufferCreateInfo);
+	device.bindBufferMemory(bufferOut, deviceMemory, sizeof(uint32_t) * outOffset);
+
+	vk::ShaderModuleCreateInfo moduleCreateInfo;
+	moduleCreateInfo.codeSize = code.size() * sizeof(uint32_t);
+	moduleCreateInfo.pCode = (uint32_t *)code.data();
+	vk::ShaderModule shaderModule = device.createShaderModule(moduleCreateInfo);
+
+	vk::DescriptorSetLayoutBinding in;
+	in.binding = 0;
+	in.descriptorCount = 1;
+	in.descriptorType = vk::DescriptorType::eStorageBuffer;
+	in.stageFlags = vk::ShaderStageFlagBits::eCompute;
+
+	vk::DescriptorSetLayoutBinding out;
+	out.binding = 1;
+	out.descriptorCount = 1;
+	out.descriptorType = vk::DescriptorType::eStorageBuffer;
+	out.stageFlags = vk::ShaderStageFlagBits::eCompute;
+
+	std::vector<vk::DescriptorSetLayoutBinding> setLayoutBindings = { in, out };
+	vk::DescriptorSetLayoutCreateInfo layoutInfo;
+	layoutInfo.bindingCount = static_cast<uint32_t>(setLayoutBindings.size());
+	layoutInfo.pBindings = setLayoutBindings.data();
+	descriptorSetLayout = device.createDescriptorSetLayout(layoutInfo);
+
+	vk::PipelineLayoutCreateInfo pipelineLayoutCreateInfo;
+	pipelineLayoutCreateInfo.setLayoutCount = 1;
+	pipelineLayoutCreateInfo.pSetLayouts = &descriptorSetLayout;
+	vk::PipelineLayout pipelineLayout = device.createPipelineLayout(pipelineLayoutCreateInfo);
+
+	vk::ComputePipelineCreateInfo computePipelineCreateInfo;
+	computePipelineCreateInfo.layout = pipelineLayout;
+	computePipelineCreateInfo.stage.stage = vk::ShaderStageFlagBits::eCompute;
+	computePipelineCreateInfo.stage.module = shaderModule;
+	computePipelineCreateInfo.stage.pName = "main";
+	pipeline = device.createComputePipeline({}, computePipelineCreateInfo).value;
+
+	// "A shader module can be destroyed while pipelines created using its shaders are still in use."
+	device.destroyShaderModule(shaderModule);
+
+	std::array<vk::DescriptorPoolSize, 1> poolSizes = {};
+	poolSizes[0].type = vk::DescriptorType::eStorageBuffer;
+	poolSizes[0].descriptorCount = 2;
+	vk::DescriptorPoolCreateInfo descriptorPoolCreateInfo;
+	descriptorPoolCreateInfo.maxSets = 1;
+	descriptorPoolCreateInfo.poolSizeCount = static_cast<uint32_t>(poolSizes.size());
+	descriptorPoolCreateInfo.pPoolSizes = poolSizes.data();
+
+	descriptorPool = device.createDescriptorPool(descriptorPoolCreateInfo);
+
+	vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo;
+	descriptorSetAllocateInfo.descriptorPool = descriptorPool;
+	descriptorSetAllocateInfo.descriptorSetCount = 1;
+	descriptorSetAllocateInfo.pSetLayouts = &descriptorSetLayout;
+	auto descriptorSets = device.allocateDescriptorSets(descriptorSetAllocateInfo);
+
+	vk::DescriptorBufferInfo inBufferInfo;
+	inBufferInfo.buffer = bufferIn;
+	inBufferInfo.offset = 0;
+	inBufferInfo.range = VK_WHOLE_SIZE;
+
+	vk::DescriptorBufferInfo outBufferInfo;
+	outBufferInfo.buffer = bufferOut;
+	outBufferInfo.offset = 0;
+	outBufferInfo.range = VK_WHOLE_SIZE;
+
+	std::array<vk::WriteDescriptorSet, 2> descriptorWrites = {};
+
+	descriptorWrites[0].dstSet = descriptorSets[0];
+	descriptorWrites[0].dstBinding = 0;
+	descriptorWrites[0].dstArrayElement = 0;
+	descriptorWrites[0].descriptorType = vk::DescriptorType::eStorageBuffer;
+	descriptorWrites[0].descriptorCount = 1;
+	descriptorWrites[0].pBufferInfo = &inBufferInfo;
+
+	descriptorWrites[1].dstSet = descriptorSets[0];
+	descriptorWrites[1].dstBinding = 1;
+	descriptorWrites[1].dstArrayElement = 0;
+	descriptorWrites[1].descriptorType = vk::DescriptorType::eStorageBuffer;
+	descriptorWrites[1].descriptorCount = 1;
+	descriptorWrites[1].pBufferInfo = &outBufferInfo;
+
+	device.updateDescriptorSets(static_cast<uint32_t>(descriptorWrites.size()), descriptorWrites.data(), 0, nullptr);
+
+	vk::CommandPoolCreateInfo commandPoolCreateInfo;
+	commandPoolCreateInfo.queueFamilyIndex = 0;  // TODO: Don't assume queue family 0 can do compute.
+	commandPoolCreateInfo.flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer;
+	commandPool = device.createCommandPool(commandPoolCreateInfo);
+
+	vk::CommandBufferAllocateInfo commandBufferAllocateInfo;
+	commandBufferAllocateInfo.commandPool = commandPool;
+	commandBufferAllocateInfo.commandBufferCount = 1;
+	commandBufferAllocateInfo.level = vk::CommandBufferLevel::ePrimary;
+	auto commandBuffers = device.allocateCommandBuffers(commandBufferAllocateInfo);
+
+	// Record the command buffer
+	commandBuffer = commandBuffers[0];
+
+	vk::CommandBufferBeginInfo commandBufferBeginInfo;
+	commandBuffer.begin(commandBufferBeginInfo);
+
+	commandBuffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline);
+	commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0, 1, &descriptorSets[0], 0, nullptr);
+
+	commandBuffer.dispatch((uint32_t)(numElements / localSizeX), 1, 1);
+
+	commandBuffer.end();
+
+	// Destroy objects we don't have to hold on to after command buffer recording.
+	// "A VkPipelineLayout object must not be destroyed while any command buffer that uses it is in the recording state."
+	device.destroyPipelineLayout(pipelineLayout);
+}
+
+void BufferToBufferComputeBenchmark::run()
+{
+	vk::SubmitInfo submitInfo;
+	submitInfo.commandBufferCount = 1;
+	submitInfo.pCommandBuffers = &commandBuffer;
+	queue.submit(submitInfo);
+	queue.waitIdle();
+}
+
+// Performs an operation `op` on each element.
+class ComputeOp : public BufferToBufferComputeBenchmark
+{
+public:
+	ComputeOp(const benchmark::State &state, const char *op)
+	    : BufferToBufferComputeBenchmark(state)
+	{
+		std::stringstream src;
+		src << R"(#version 450
+			layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+			layout(binding = 0, std430) buffer InBuffer
+			{
+				float Data[];
+			} In;
+			layout(binding = 1, std430) buffer OutBuffer
+			{
+				float Data[];
+			} Out;
+			void main()
+			{
+				float x = In.Data[gl_GlobalInvocationID.x];
+				Out.Data[gl_GlobalInvocationID.x] = )"
+		    << op << R"( (x);
+			})";
+
+		initialize(src.str());
+	}
+};
+
+static void Compute(benchmark::State &state, const char *op)
+{
+	ComputeOp benchmark(state, op);
+
+	// Execute once to have the Reactor routine generated.
+	benchmark.run();
+
+	for(auto _ : state)
+	{
+		benchmark.run();
+	}
+}
+
+BENCHMARK_CAPTURE(Compute, mov, "")->RangeMultiplier(2)->Range(128, 4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
+BENCHMARK_CAPTURE(Compute, sqrt, "sqrt")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
+BENCHMARK_CAPTURE(Compute, sin, "sin")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
+BENCHMARK_CAPTURE(Compute, cos, "cos")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
+BENCHMARK_CAPTURE(Compute, exp, "exp")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
+BENCHMARK_CAPTURE(Compute, log, "log")->Arg(4 * 1024 * 1024)->Unit(benchmark::kMillisecond)->MeasureProcessCPUTime();
\ No newline at end of file
diff --git a/tests/VulkanUnitTests/Device.cpp b/tests/VulkanUnitTests/Device.cpp
index 462492f..7cffc43 100644
--- a/tests/VulkanUnitTests/Device.cpp
+++ b/tests/VulkanUnitTests/Device.cpp
@@ -446,7 +446,7 @@
 		nullptr,                        // pSignalSemaphores
 	};
 
-	VkResult result = driver->vkQueueSubmit(queue, 1, &info, 0);
+	VkResult result = driver->vkQueueSubmit(queue, 1, &info, VK_NULL_HANDLE);
 	if(result != VK_SUCCESS)
 	{
 		return result;