Add support for push constants

- Proper support for calculating offsets in explicit-layout storage
classes (push constant, uniform, and storage buffer) according to the
Offset, ArrayStride and MatrixStride decorations.
- Plumb a block of push constant data throughout the pipeline
- Implement push constant update commands

Bug: b/128690261
Bug: b/128872954
Test: dEQP-VK.*push_constant*
Test: dEQP-VK.glsl.*
Test: dEQP-VK.spirv_assembly.*
Change-Id: I7d5a66ac4aafd6b637b4693eb6ce96a327b4904e
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/27528
Tested-by: Chris Forbes <chrisforbes@google.com>
Presubmit-Ready: Chris Forbes <chrisforbes@google.com>
Reviewed-by: Ben Clayton <bclayton@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
diff --git a/src/Device/Context.hpp b/src/Device/Context.hpp
index 2405224..8645a1a 100644
--- a/src/Device/Context.hpp
+++ b/src/Device/Context.hpp
@@ -15,6 +15,7 @@
 #ifndef sw_Context_hpp
 #define sw_Context_hpp
 
+#include "Vulkan/VkConfig.h"
 #include "Sampler.hpp"
 #include "Stream.hpp"
 #include "Point.hpp"
@@ -107,6 +108,11 @@
 		TRANSPARENCY_LAST = TRANSPARENCY_ALPHA_TO_COVERAGE
 	};
 
+	struct PushConstantStorage  // Fixed-size block of raw push constant bytes.
+	{
+		unsigned char data[vk::MAX_PUSH_CONSTANT_SIZE];  // sized by the vk::MAX_PUSH_CONSTANT_SIZE limit
+	};
+
 	class Context
 	{
 	public:
@@ -220,6 +226,8 @@
 		unsigned int sampleMask;
 		unsigned int multiSampleMask;
 		int sampleCount;
+
+		PushConstantStorage pushConstants;
 	};
 }
 
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index 2038d50..042d68d 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -444,6 +444,11 @@
 			data->scissorY1 = scissor.offset.y + scissor.extent.height;
 		}
 
+		// Push constants
+		{
+			data->pushConstants = context->pushConstants;
+		}
+
 		draw->primitive = 0;
 		draw->count = count;
 
diff --git a/src/Device/Renderer.hpp b/src/Device/Renderer.hpp
index c31676c..eb93485 100644
--- a/src/Device/Renderer.hpp
+++ b/src/Device/Renderer.hpp
@@ -187,6 +187,8 @@
 		float4 a2c1;
 		float4 a2c2;
 		float4 a2c3;
+
+		PushConstantStorage pushConstants;
 	};
 
 	class Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor
diff --git a/src/Pipeline/ComputeProgram.cpp b/src/Pipeline/ComputeProgram.cpp
index a119a3c..d15b8d4 100644
--- a/src/Pipeline/ComputeProgram.cpp
+++ b/src/Pipeline/ComputeProgram.cpp
@@ -52,6 +52,8 @@
 			routine.descriptorSets[i] = descriptorSetsIn[i];
 		}
 
+		routine.pushConstants = Pointer<Byte>(data + OFFSET(Data, pushConstants));
+
 		auto &modes = shader->getModes();
 
 		int localSize[3] = {modes.WorkgroupSizeX, modes.WorkgroupSizeY, modes.WorkgroupSizeZ};
@@ -167,7 +169,7 @@
 	}
 
 	void ComputeProgram::run(
-		Routine *routine, void** descriptorSets,
+		Routine *routine, void** descriptorSets, PushConstantStorage const &pushConstants,
 		uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
 	{
 		auto runWorkgroup = (void(*)(void*))(routine->getEntry());
@@ -178,6 +180,7 @@
 		data.numWorkgroups[Y] = groupCountY;
 		data.numWorkgroups[Z] = groupCountZ;
 		data.numWorkgroups[3] = 0;
+		data.pushConstants = pushConstants;
 
 		// TODO(bclayton): Split work across threads.
 		for (uint32_t groupZ = 0; groupZ < groupCountZ; groupZ++)
diff --git a/src/Pipeline/ComputeProgram.hpp b/src/Pipeline/ComputeProgram.hpp
index ef3a4de..6b63233 100644
--- a/src/Pipeline/ComputeProgram.hpp
+++ b/src/Pipeline/ComputeProgram.hpp
@@ -18,6 +18,7 @@
 #include "SpirvShader.hpp"
 
 #include "Reactor/Reactor.hpp"
+#include "Device/Context.hpp"
 
 #include <functional>
 
@@ -47,7 +48,7 @@
 		// run executes the compute shader routine for all workgroups.
 		// TODO(bclayton): This probably does not belong here. Consider moving.
 		static void run(
-			Routine *routine, void** descriptorSets,
+			Routine *routine, void** descriptorSets, PushConstantStorage const &pushConstants,
 			uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ);
 
 	protected:
@@ -62,6 +63,7 @@
 			void** descriptorSets;
 			uint4 numWorkgroups;
 			uint4 workgroupID;
+			PushConstantStorage pushConstants;
 		};
 
 		SpirvRoutine routine;
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index 57df1cd..7d78e69 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -31,6 +31,8 @@
 	{
 		enableIndex = 0;
 
+		routine.pushConstants = data + OFFSET(DrawData, pushConstants);
+
 		spirvShader->emit(&routine);
 		spirvShader->emitEpilog(&routine);
 
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index 7a858b0..46a8fd5 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -181,6 +181,7 @@
 					break;
 				case spv::StorageClassUniform:
 				case spv::StorageClassStorageBuffer:
+				case spv::StorageClassPushConstant:
 					object.kind = Object::Kind::PhysicalPointer;
 					break;
 
@@ -192,7 +193,6 @@
 				case spv::StorageClassWorkgroup:
 				case spv::StorageClassCrossWorkgroup:
 				case spv::StorageClassGeneric:
-				case spv::StorageClassPushConstant:
 				case spv::StorageClassAtomicCounter:
 				case spv::StorageClassImage:
 					UNIMPLEMENTED("StorageClass %d not yet implemented", (int)storageClass);
@@ -653,6 +653,7 @@
 		{
 		case spv::StorageClassUniform:
 		case spv::StorageClassStorageBuffer:
+		case spv::StorageClassPushConstant:
 			return false;
 		default:
 			return true;
@@ -742,10 +743,88 @@
 		VisitInterfaceInner<F>(def.word(1), d, f);
 	}
 
+	SIMD::Int SpirvShader::WalkExplicitLayoutAccessChain(Object::ID id, uint32_t numIndexes, uint32_t const *indexIds, SpirvRoutine *routine) const
+	{
+		// Produce an offset into external memory in sizeof(float) units, using the Offset, ArrayStride and MatrixStride decorations.
+
+		int constantOffset = 0;  // contributions from struct members and constant indexes
+		SIMD::Int dynamicOffset = SIMD::Int(0);  // contributions from non-constant indexes
+		auto &baseObject = getObject(id);
+		Type::ID typeId = getType(baseObject.type).element;
+
+		// If the <base> operand is itself an intermediate value (i.e. produced by a previous OpAccessChain),
+		// start with its offset and build from there.
+		if (baseObject.kind == Object::Kind::Value)
+		{
+			dynamicOffset += routine->getIntermediate(id).Int(0);
+		}
+
+		for (auto i = 0u; i < numIndexes; i++)
+		{
+			auto & type = getType(typeId);
+			switch (type.definition.opcode())
+			{
+			case spv::OpTypeStruct:
+			{
+				int memberIndex = GetConstantInt(indexIds[i]);  // struct member indexes are always read as constants
+				Decorations d{};
+				ApplyDecorationsForIdMember(&d, typeId, memberIndex);
+				ASSERT(d.HasOffset);
+				constantOffset += d.Offset / sizeof(float);  // convert the Offset decoration to sizeof(float) units
+				typeId = type.definition.word(2u + memberIndex);  // descend into the selected member's type
+				break;
+			}
+			case spv::OpTypeArray:
+			case spv::OpTypeRuntimeArray:
+			{
+				// TODO: b/127950082: Check bounds.
+				Decorations d{};
+				ApplyDecorationsForId(&d, typeId);
+				ASSERT(d.HasArrayStride);
+				auto & obj = getObject(indexIds[i]);
+				if (obj.kind == Object::Kind::Constant)
+					constantOffset += d.ArrayStride/sizeof(float) * GetConstantInt(indexIds[i]);
+				else
+					dynamicOffset += SIMD::Int(d.ArrayStride / sizeof(float)) * routine->getIntermediate(indexIds[i]).Int(0);
+				typeId = type.element;
+				break;
+			}
+			case spv::OpTypeMatrix:
+			{
+				// TODO: b/127950082: Check bounds.
+				Decorations d{};
+				ApplyDecorationsForId(&d, typeId);
+				ASSERT(d.HasMatrixStride);  // NOTE(review): the SPIR-V spec applies MatrixStride to struct members, not matrix types - confirm this lookup by type id can find it
+				auto & obj = getObject(indexIds[i]);
+				if (obj.kind == Object::Kind::Constant)
+					constantOffset += d.MatrixStride/sizeof(float) * GetConstantInt(indexIds[i]);
+				else
+					dynamicOffset += SIMD::Int(d.MatrixStride / sizeof(float)) * routine->getIntermediate(indexIds[i]).Int(0);
+				typeId = type.element;
+				break;
+			}
+			case spv::OpTypeVector:
+			{
+				auto & obj = getObject(indexIds[i]);
+				if (obj.kind == Object::Kind::Constant)
+					constantOffset += GetConstantInt(indexIds[i]);  // each component advances the offset by one sizeof(float) unit
+				else
+					dynamicOffset += routine->getIntermediate(indexIds[i]).Int(0);
+				typeId = type.element;
+				break;
+			}
+			default:
+				UNIMPLEMENTED("Unexpected type '%s' in WalkExplicitLayoutAccessChain", OpcodeName(type.definition.opcode()).c_str());
+			}
+		}
+
+		return dynamicOffset + SIMD::Int(constantOffset);  // the constant part is added to the dynamic part
+	}
+
 	SIMD::Int SpirvShader::WalkAccessChain(Object::ID id, uint32_t numIndexes, uint32_t const *indexIds, SpirvRoutine *routine) const
 	{
-		// TODO: think about explicit layout (UBO/SSBO) storage classes
 		// TODO: avoid doing per-lane work in some cases if we can?
+		// Produce a *component* offset into location-oriented memory
 
 		int constantOffset = 0;
 		SIMD::Int dynamicOffset = SIMD::Int(0);
@@ -1275,6 +1354,11 @@
 			routine->physicalPointers[resultId] = address;
 			break;
 		}
+		case spv::StorageClassPushConstant:
+		{
+			routine->physicalPointers[resultId] = routine->pushConstants;
+			break;
+		}
 		default:
 			break;
 		}
@@ -1372,7 +1456,17 @@
 		ASSERT(getObject(baseId).pointerBase == getObject(objectId).pointerBase);
 
 		auto &dst = routine->createIntermediate(objectId, type.sizeInComponents);
-		dst.emplace(0, WalkAccessChain(baseId, insn.wordCount() - 4, insn.wordPointer(4), routine));
+
+		if (type.storageClass == spv::StorageClassPushConstant ||
+			type.storageClass == spv::StorageClassUniform ||
+			type.storageClass == spv::StorageClassStorageBuffer)
+		{
+			dst.emplace(0, WalkExplicitLayoutAccessChain(baseId, insn.wordCount() - 4, insn.wordPointer(4), routine));
+		}
+		else
+		{
+			dst.emplace(0, WalkAccessChain(baseId, insn.wordCount() - 4, insn.wordPointer(4), routine));
+		}
 	}
 
 	void SpirvShader::EmitStore(InsnIterator insn, SpirvRoutine *routine) const
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index 1f5e075..c3bd154 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -471,6 +471,7 @@
 
 		void ProcessInterfaceVariable(Object &object);
 
+		SIMD::Int WalkExplicitLayoutAccessChain(Object::ID id, uint32_t numIndexes, uint32_t const *indexIds, SpirvRoutine *routine) const;
 		SIMD::Int WalkAccessChain(Object::ID id, uint32_t numIndexes, uint32_t const *indexIds, SpirvRoutine *routine) const;
 		uint32_t WalkLiteralAccessChain(Type::ID id, uint32_t numIndexes, uint32_t const *indexes) const;
 
@@ -524,6 +525,7 @@
 		SIMD::Int activeLaneMask = SIMD::Int(0xFFFFFFFF);
 
 		std::array<Pointer<Byte>, vk::MAX_BOUND_DESCRIPTOR_SETS> descriptorSets;
+		Pointer<Byte> pushConstants;
 
 		void createLvalue(SpirvShader::Object::ID id, uint32_t size)
 		{
diff --git a/src/Pipeline/VertexProgram.cpp b/src/Pipeline/VertexProgram.cpp
index 182e7bd..9a40f79 100644
--- a/src/Pipeline/VertexProgram.cpp
+++ b/src/Pipeline/VertexProgram.cpp
@@ -43,6 +43,8 @@
 			routine.getValue(it->second.Id)[it->second.FirstComponent] =
 					As<Float4>(Int4((*Pointer<Int>(data + OFFSET(DrawData, instanceID)))));
 		}
+
+		routine.pushConstants = data + OFFSET(DrawData, pushConstants);
 	}
 
 	VertexProgram::~VertexProgram()
diff --git a/src/Vulkan/VkCommandBuffer.cpp b/src/Vulkan/VkCommandBuffer.cpp
index 0421780..2bf215d 100644
--- a/src/Vulkan/VkCommandBuffer.cpp
+++ b/src/Vulkan/VkCommandBuffer.cpp
@@ -139,7 +139,8 @@
 			executionState.pipelines[VK_PIPELINE_BIND_POINT_COMPUTE]);
 		pipeline->run(groupCountX, groupCountY, groupCountZ,
 			MAX_BOUND_DESCRIPTOR_SETS,
-			executionState.boundDescriptorSets[VK_PIPELINE_BIND_POINT_COMPUTE]);
+			executionState.boundDescriptorSets[VK_PIPELINE_BIND_POINT_COMPUTE],
+			executionState.pushConstants);
 	}
 
 private:
@@ -241,6 +242,8 @@
 			}
 		}
 
+		context.pushConstants = executionState.pushConstants;
+
 		executionState.renderer->setContext(context);
 		executionState.renderer->setScissor(pipeline->getScissor());
 		executionState.renderer->setViewport(pipeline->getViewport());
@@ -288,6 +291,8 @@
 			}
 		}
 
+		context.pushConstants = executionState.pushConstants;
+
 		context.indexBuffer = Cast(executionState.indexBufferBinding.buffer)->getOffsetPointer(
 				executionState.indexBufferBinding.offset + firstIndex * (executionState.indexType == VK_INDEX_TYPE_UINT16 ? 2 : 4));
 
@@ -571,6 +576,28 @@
 	const VkDescriptorSet descriptorSet;
 };
 
+struct SetPushConstants : public CommandBuffer::Command  // Command that writes a byte range of the push constant block when played.
+{
+	SetPushConstants(uint32_t offset, uint32_t size, void const *pValues)
+		: offset(offset), size(size)
+	{
+		ASSERT(offset < MAX_PUSH_CONSTANT_SIZE);
+		ASSERT(size <= MAX_PUSH_CONSTANT_SIZE - offset);  // same bound as offset + size <= MAX, but cannot wrap around in uint32_t
+
+		memcpy(data, pValues, size);  // copy the values at construction; pValues itself is not retained
+	}
+
+	void play(CommandBuffer::ExecutionState& executionState)
+	{
+		memcpy(&executionState.pushConstants.data[offset], data, size);  // overwrite bytes [offset, offset + size) of the execution state
+	}
+
+private:
+	uint32_t offset;
+	uint32_t size;
+	unsigned char data[MAX_PUSH_CONSTANT_SIZE];  // only the first `size` bytes are written by the constructor
+};
+
 CommandBuffer::CommandBuffer(VkCommandBufferLevel pLevel) : level(pLevel)
 {
 	// FIXME (b/119409619): replace this vector by an allocator so we can control all memory allocations
@@ -740,7 +767,7 @@
 void CommandBuffer::pushConstants(VkPipelineLayout layout, VkShaderStageFlags stageFlags,
 	uint32_t offset, uint32_t size, const void* pValues)
 {
-	UNIMPLEMENTED("pushConstants");
+	addCommand<SetPushConstants>(offset, size, pValues);
 }
 
 void CommandBuffer::setViewport(uint32_t firstViewport, uint32_t viewportCount, const VkViewport* pViewports)
diff --git a/src/Vulkan/VkCommandBuffer.hpp b/src/Vulkan/VkCommandBuffer.hpp
index 38d8e59..4f86071 100644
--- a/src/Vulkan/VkCommandBuffer.hpp
+++ b/src/Vulkan/VkCommandBuffer.hpp
@@ -17,6 +17,7 @@
 
 #include "VkConfig.h"
 #include "VkObject.hpp"
+#include "Device/Context.hpp"
 #include <memory>
 #include <vector>
 
@@ -126,6 +127,7 @@
 		Framebuffer* renderPassFramebuffer = nullptr;
 		Pipeline* pipelines[VK_PIPELINE_BIND_POINT_RANGE_SIZE] = {};
 		VkDescriptorSet boundDescriptorSets[VK_PIPELINE_BIND_POINT_RANGE_SIZE][MAX_BOUND_DESCRIPTOR_SETS] = { { VK_NULL_HANDLE } };
+		sw::PushConstantStorage pushConstants;
 
 		struct VertexInputBinding
 		{
diff --git a/src/Vulkan/VkConfig.h b/src/Vulkan/VkConfig.h
index 0e4f10c..772f772 100644
--- a/src/Vulkan/VkConfig.h
+++ b/src/Vulkan/VkConfig.h
@@ -57,6 +57,7 @@
 {
 	MAX_BOUND_DESCRIPTOR_SETS = 4,
 	MAX_VERTEX_INPUT_BINDINGS = 16,
+	MAX_PUSH_CONSTANT_SIZE = 128,
 };
 
 enum
diff --git a/src/Vulkan/VkPhysicalDevice.cpp b/src/Vulkan/VkPhysicalDevice.cpp
index 95cf71d..7dddd76 100644
--- a/src/Vulkan/VkPhysicalDevice.cpp
+++ b/src/Vulkan/VkPhysicalDevice.cpp
@@ -147,7 +147,7 @@
 		65536, // maxTexelBufferElements
 		16384, // maxUniformBufferRange
 		(1ul << 27), // maxStorageBufferRange
-		128, // maxPushConstantsSize
+		vk::MAX_PUSH_CONSTANT_SIZE, // maxPushConstantsSize
 		4096, // maxMemoryAllocationCount
 		4000, // maxSamplerAllocationCount
 		131072, // bufferImageGranularity
diff --git a/src/Vulkan/VkPipeline.cpp b/src/Vulkan/VkPipeline.cpp
index e619a03..1a28242 100644
--- a/src/Vulkan/VkPipeline.cpp
+++ b/src/Vulkan/VkPipeline.cpp
@@ -546,11 +546,11 @@
 }
 
 void ComputePipeline::run(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ,
-	size_t numDescriptorSets, VkDescriptorSet *descriptorSets)
+	size_t numDescriptorSets, VkDescriptorSet *descriptorSets, sw::PushConstantStorage const &pushConstants)
 {
 	ASSERT_OR_RETURN(routine != nullptr);
 	sw::ComputeProgram::run(
-		routine, reinterpret_cast<void**>(descriptorSets),
+		routine, reinterpret_cast<void**>(descriptorSets), pushConstants,
 		groupCountX, groupCountY, groupCountZ);
 }
 
diff --git a/src/Vulkan/VkPipeline.hpp b/src/Vulkan/VkPipeline.hpp
index 84b6ef5..c43a8e9 100644
--- a/src/Vulkan/VkPipeline.hpp
+++ b/src/Vulkan/VkPipeline.hpp
@@ -104,7 +104,7 @@
 	void compileShaders(const VkAllocationCallbacks* pAllocator, const VkComputePipelineCreateInfo* pCreateInfo);
 
 	void run(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ,
-		size_t numDescriptorSets, VkDescriptorSet *descriptorSets);
+		size_t numDescriptorSets, VkDescriptorSet *descriptorSets, sw::PushConstantStorage const &pushConstants);
 
 protected:
 	sw::SpirvShader *shader = nullptr;