Implement descriptor indexing for storage buffers

Descriptor indexing is a part of the 2022 Vulkan roadmap. It allows
shaders to use non-constant, non-uniform indexes for descriptor arrays
with various types. This CL allows the use of non-uniform indices for
storage buffers.

Tests: dEQP-VK.descriptor_indexing.*
Bug: b/206633340
Change-Id: Ice05f8ff8e5a12298a0c42806204e2c9379ac9aa
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/64928
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Commit-Queue: Sean Risser <srisser@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Tested-by: Sean Risser <srisser@google.com>
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index 99a2d44..1a7dff9 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -447,6 +447,9 @@
 				case spv::CapabilityStencilExportEXT: capabilities.StencilExportEXT = true; break;
 				case spv::CapabilityVulkanMemoryModel: capabilities.VulkanMemoryModel = true; break;
 				case spv::CapabilityVulkanMemoryModelDeviceScope: capabilities.VulkanMemoryModelDeviceScope = true; break;
+				case spv::CapabilityShaderNonUniform: capabilities.ShaderNonUniform = true; break;
+				case spv::CapabilityRuntimeDescriptorArray: capabilities.RuntimeDescriptorArray = true; break;
+				case spv::CapabilityStorageBufferArrayNonUniformIndexing: capabilities.StorageBufferArrayNonUniformIndexing = true; break;
 				default:
 					UNSUPPORTED("Unsupported capability %u", insn.word(1));
 				}
@@ -810,6 +813,7 @@
 				if(!strcmp(ext, "SPV_GOOGLE_decorate_string")) break;
 				if(!strcmp(ext, "SPV_GOOGLE_hlsl_functionality1")) break;
 				if(!strcmp(ext, "SPV_GOOGLE_user_type")) break;
+				if(!strcmp(ext, "SPV_EXT_descriptor_indexing")) break;
 				UNSUPPORTED("SPIR-V Extension: %s", ext);
 			}
 			break;
@@ -1243,15 +1247,15 @@
 	}
 }
 
-SIMD::Pointer SpirvShader::WalkExplicitLayoutAccessChain(Object::ID baseId, Object::ID elementId, const Span &indexIds, const EmitState *state) const
+SIMD::Pointer SpirvShader::WalkExplicitLayoutAccessChain(Object::ID baseId, Object::ID elementId, const Span &indexIds, bool nonUniform, const EmitState *state) const
 {
 	// Produce a offset into external memory in sizeof(float) units
 
 	auto &baseObject = getObject(baseId);
 	Type::ID typeId = getType(baseObject).element;
 	Decorations d = GetDecorationsForId(baseObject.typeId());
+	SIMD::Int arrayIndex = 0;
 
-	Int arrayIndex = 0;
 	uint32_t start = 0;
 	if(baseObject.kind == Object::Kind::DescriptorSet)
 	{
@@ -1266,8 +1270,8 @@
 			}
 			else
 			{
-				// Note: the value of indexIds[0] must be dynamically uniform.
-				arrayIndex = Extract(state->getIntermediate(indexIds[0]).Int(0), 0);
+				nonUniform |= GetDecorationsForId(indexIds[0]).NonUniform;
+				arrayIndex = state->getIntermediate(indexIds[0]).Int(0);
 			}
 
 			start = 1;
@@ -1275,7 +1279,7 @@
 		}
 	}
 
-	auto ptr = GetPointerToData(baseId, arrayIndex, state);
+	auto ptr = GetPointerToData(baseId, arrayIndex, nonUniform, state);
 	OffsetToElement(ptr, elementId, d.ArrayStride, state);
 
 	int constantOffset = 0;
@@ -1536,6 +1540,10 @@
 	case spv::DecorationColMajor:
 		HasRowMajor = true;
 		RowMajor = false;
+		break;
+	case spv::DecorationNonUniform:
+		NonUniform = true;
+		break;
 	default:
 		// Intentionally partial, there are many decorations we just don't care about.
 		break;
@@ -1594,6 +1602,7 @@
 	BufferBlock |= src.BufferBlock;
 	RelaxedPrecision |= src.RelaxedPrecision;
 	InsideMatrix |= src.InsideMatrix;
+	NonUniform |= src.NonUniform;
 }
 
 void SpirvShader::DescriptorDecorations::Apply(const sw::SpirvShader::DescriptorDecorations &src)
@@ -2206,6 +2215,7 @@
 {
 	Type::ID typeId = insn.word(1);
 	Object::ID resultId = insn.word(2);
+	bool nonUniform = GetDecorationsForId(resultId).NonUniform;
 	Object::ID baseId = insn.word(3);
 	auto &type = getType(typeId);
 	ASSERT(type.componentCount == 1);
@@ -2218,7 +2228,7 @@
 	   type.storageClass == spv::StorageClassUniform ||
 	   type.storageClass == spv::StorageClassStorageBuffer)
 	{
-		auto ptr = WalkExplicitLayoutAccessChain(baseId, elementId, Span(insn, indexId, insn.wordCount() - indexId), state);
+		auto ptr = WalkExplicitLayoutAccessChain(baseId, elementId, Span(insn, indexId, insn.wordCount() - indexId), nonUniform, state);
 		state->createPointer(resultId, ptr);
 	}
 	else
@@ -2564,7 +2574,7 @@
 	auto arrayId = Type::ID(structTy.definition.word(2 + arrayFieldIdx));
 
 	auto &result = state->createIntermediate(insn.resultId(), 1);
-	auto structBase = GetPointerToData(structPtrId, 0, state);
+	auto structBase = GetPointerToData(structPtrId, 0, false, state);
 
 	Decorations structDecorations = {};
 	ApplyDecorationsForIdMember(&structDecorations, structPtrTy.element, arrayFieldIdx);
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index 8cc7bc8..ebc34cb 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -723,6 +723,9 @@
 		bool StencilExportEXT : 1;
 		bool VulkanMemoryModel : 1;
 		bool VulkanMemoryModelDeviceScope : 1;
+		bool ShaderNonUniform : 1;
+		bool RuntimeDescriptorArray : 1;
+		bool StorageBufferArrayNonUniformIndexing : 1;
 	};
 
 	const Capabilities &getUsedCapabilities() const
@@ -805,6 +808,7 @@
 		bool RelaxedPrecision : 1;
 		bool RowMajor : 1;      // RowMajor if true; ColMajor if false
 		bool InsideMatrix : 1;  // pseudo-decoration for whether we're inside a matrix.
+		bool NonUniform : 1;
 
 		Decorations()
 		    : Location{ -1 }
@@ -828,6 +832,7 @@
 		    , RelaxedPrecision{ false }
 		    , RowMajor{ false }
 		    , InsideMatrix{ false }
+		    , NonUniform{ false }
 		{
 		}
 
@@ -1280,12 +1285,12 @@
 	//  - Pointer
 	//  - InterfaceVariable
 	// Calling GetPointerToData with objects of any other kind will assert.
-	SIMD::Pointer GetPointerToData(Object::ID id, Int arrayIndex, EmitState const *state) const;
+	SIMD::Pointer GetPointerToData(Object::ID id, SIMD::Int arrayIndex, bool nonUniform, EmitState const *state) const;
 	void OffsetToElement(SIMD::Pointer &ptr, Object::ID elementId, int32_t arrayStride, EmitState const *state) const;
 
 	OutOfBoundsBehavior getOutOfBoundsBehavior(Object::ID pointerId, EmitState const *state) const;
 
-	SIMD::Pointer WalkExplicitLayoutAccessChain(Object::ID id, Object::ID elementId, const Span &indexIds, const EmitState *state) const;
+	SIMD::Pointer WalkExplicitLayoutAccessChain(Object::ID id, Object::ID elementId, const Span &indexIds, bool nonUniform, const EmitState *state) const;
 	SIMD::Pointer WalkAccessChain(Object::ID id, Object::ID elementId, const Span &indexIds, const EmitState *state) const;
 
 	// Returns the *component* offset in the literal for the given access chain.
@@ -1569,9 +1574,9 @@
 	std::array<SIMD::Float, 4> fragCoord;
 	std::array<SIMD::Float, 4> pointCoord;
 	SIMD::Int helperInvocation;
-	Int4 numWorkgroups;
-	Int4 workgroupID;
-	Int4 workgroupSize;
+	SIMD::Int numWorkgroups;
+	SIMD::Int workgroupID;
+	SIMD::Int workgroupSize;
 	Int subgroupsPerWorkgroup;
 	Int invocationsPerSubgroup;
 	Int subgroupIndex;
diff --git a/src/Pipeline/SpirvShaderMemory.cpp b/src/Pipeline/SpirvShaderMemory.cpp
index c99c86c..0fec9c9 100644
--- a/src/Pipeline/SpirvShaderMemory.cpp
+++ b/src/Pipeline/SpirvShaderMemory.cpp
@@ -53,7 +53,7 @@
 		memoryOrder = MemoryOrder(memorySemantics);
 	}
 
-	auto ptr = GetPointerToData(pointerId, 0, state);
+	auto ptr = GetPointerToData(pointerId, 0, false, state);
 	bool interleavedByLane = IsStorageInterleavedByLane(pointerTy.storageClass);
 	auto &dst = state->createIntermediate(resultId, resultTy.componentCount);
 	auto robustness = getOutOfBoundsBehavior(pointerId, state);
@@ -98,7 +98,7 @@
 
 	ASSERT(!atomic || elementTy.opcode() == spv::OpTypeInt);  // Vulkan 1.1: "Atomic instructions must declare a scalar 32-bit integer type, for the value pointed to by Pointer."
 
-	auto ptr = GetPointerToData(pointerId, 0, state);
+	auto ptr = GetPointerToData(pointerId, 0, false, state);
 	bool interleavedByLane = IsStorageInterleavedByLane(pointerTy.storageClass);
 	auto robustness = getOutOfBoundsBehavior(pointerId, state);
 
@@ -222,7 +222,7 @@
 		case spv::StorageClassWorkgroup:
 			{
 				bool interleavedByLane = IsStorageInterleavedByLane(objectTy.storageClass);
-				auto ptr = GetPointerToData(resultId, 0, state);
+				auto ptr = GetPointerToData(resultId, 0, false, state);
 				Operand initialValue(this, state, initializerId);
 				VisitMemoryObject(resultId, [&](const MemoryElement &el) {
 					auto p = ptr + el.offset;
@@ -256,8 +256,8 @@
 
 	bool dstInterleavedByLane = IsStorageInterleavedByLane(dstPtrTy.storageClass);
 	bool srcInterleavedByLane = IsStorageInterleavedByLane(srcPtrTy.storageClass);
-	auto dstPtr = GetPointerToData(dstPtrId, 0, state);
-	auto srcPtr = GetPointerToData(srcPtrId, 0, state);
+	auto dstPtr = GetPointerToData(dstPtrId, 0, false, state);
+	auto srcPtr = GetPointerToData(srcPtrId, 0, false, state);
 
 	std::unordered_map<uint32_t, uint32_t> srcOffsets;
 
@@ -378,7 +378,7 @@
 	}
 }
 
-SIMD::Pointer SpirvShader::GetPointerToData(Object::ID id, Int arrayIndex, EmitState const *state) const
+SIMD::Pointer SpirvShader::GetPointerToData(Object::ID id, SIMD::Int arrayIndices, bool nonUniform, EmitState const *state) const
 {
 	auto routine = state->routine;
 	auto &object = getObject(id);
@@ -397,37 +397,64 @@
 
 			uint32_t bindingOffset = routine->pipelineLayout->getBindingOffset(d.DescriptorSet, d.Binding);
 			uint32_t descriptorSize = routine->pipelineLayout->getDescriptorSize(d.DescriptorSet, d.Binding);
-			Int descriptorOffset = bindingOffset + descriptorSize * arrayIndex;
 
 			auto set = state->getPointer(id);
-			Pointer<Byte> descriptor = set.getUniformPointer() + descriptorOffset;  // BufferDescriptor* or inline uniform block
+			if(nonUniform)
+			{
+				SIMD::Int descriptorOffset = bindingOffset + descriptorSize * arrayIndices;
+				auto robustness = getOutOfBoundsBehavior(id, state);
+				ASSERT(routine->pipelineLayout->getDescriptorType(d.DescriptorSet, d.Binding) != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
 
-			auto descriptorType = routine->pipelineLayout->getDescriptorType(d.DescriptorSet, d.Binding);
-			if(descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT)
-			{
-				// Note: there is no bounds checking for inline uniform blocks.
-				// MAX_INLINE_UNIFORM_BLOCK_SIZE represents the maximum size of
-				// an inline uniform block, but this value should remain unused.
-				return SIMD::Pointer(descriptor, vk::MAX_INLINE_UNIFORM_BLOCK_SIZE);
-			}
-			else
-			{
-				Pointer<Byte> data = *Pointer<Pointer<Byte>>(descriptor + OFFSET(vk::BufferDescriptor, ptr));  // void*
-				Int size = *Pointer<Int>(descriptor + OFFSET(vk::BufferDescriptor, sizeInBytes));
+				std::array<Pointer<Byte>, 4> pointers;
+				for(int i = 0; i < SIMD::Width; i++)
+				{
+					pointers[i] = *Pointer<Pointer<Byte>>(set.getPointerForLane(i) + Extract(descriptorOffset, i) + OFFSET(vk::BufferDescriptor, ptr));
+				}
+
+				SIMD::Pointer ptr(pointers);
 
 				if(routine->pipelineLayout->isDescriptorDynamic(d.DescriptorSet, d.Binding))
 				{
-					Int dynamicOffsetIndex =
-					    routine->pipelineLayout->getDynamicOffsetIndex(d.DescriptorSet, d.Binding) +
-					    arrayIndex;
-					Int offset = routine->descriptorDynamicOffsets[dynamicOffsetIndex];
-					Int robustnessSize = *Pointer<Int>(descriptor + OFFSET(vk::BufferDescriptor, robustnessSize));
+					SIMD::Int dynamicOffsetIndex = SIMD::Int(routine->pipelineLayout->getDynamicOffsetIndex(d.DescriptorSet, d.Binding) + arrayIndices);
+					SIMD::Pointer routineDynamicOffsets = SIMD::Pointer(routine->descriptorDynamicOffsets, 0, sizeof(int) * dynamicOffsetIndex);
+					SIMD::Int dynamicOffsets = routineDynamicOffsets.Load<SIMD::Int>(robustness, state->activeLaneMask());
+					ptr += dynamicOffsets;
+				}
+				return ptr;
+			}
+			else
+			{
+				rr::Int arrayIdx = Extract(arrayIndices, 0);
+				rr::Int descriptorOffset = bindingOffset + descriptorSize * arrayIdx;
+				Pointer<Byte> descriptor = set.getUniformPointer() + descriptorOffset;  // BufferDescriptor* or inline uniform block
 
-					return SIMD::Pointer(data + offset, Min(size, robustnessSize - offset));
+				auto descriptorType = routine->pipelineLayout->getDescriptorType(d.DescriptorSet, d.Binding);
+				if(descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT)
+				{
+					// Note: there is no bounds checking for inline uniform blocks.
+					// MAX_INLINE_UNIFORM_BLOCK_SIZE represents the maximum size of
+					// an inline uniform block, but this value should remain unused.
+					return SIMD::Pointer(descriptor, vk::MAX_INLINE_UNIFORM_BLOCK_SIZE);
 				}
 				else
 				{
-					return SIMD::Pointer(data, size);
+					Pointer<Byte> data = *Pointer<Pointer<Byte>>(descriptor + OFFSET(vk::BufferDescriptor, ptr));  // void*
+					rr::Int size = *Pointer<Int>(descriptor + OFFSET(vk::BufferDescriptor, sizeInBytes));
+
+					if(routine->pipelineLayout->isDescriptorDynamic(d.DescriptorSet, d.Binding))
+					{
+						rr::Int dynamicOffsetIndex =
+						    routine->pipelineLayout->getDynamicOffsetIndex(d.DescriptorSet, d.Binding) +
+						    arrayIdx;
+						rr::Int offset = routine->descriptorDynamicOffsets[dynamicOffsetIndex];
+						rr::Int robustnessSize = *Pointer<rr::Int>(descriptor + OFFSET(vk::BufferDescriptor, robustnessSize));
+
+						return SIMD::Pointer(data + offset, Min(size, robustnessSize - offset));
+					}
+					else
+					{
+						return SIMD::Pointer(data, size);
+					}
 				}
 			}
 		}