Robust buffer access for vertex inputs

Robust buffer access for vertex inputs/attributes
was not implemented. This CL fixes it by adding
robustBufferAccess to the Context structure, adding a
check in Renderer::advanceInstanceAttributes() to prevent
advancing a buffer past its end, and adding out-of-bounds
checks in VertexRoutine::readStream().
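
For illustration only, the out-of-bounds clamp added to
VertexRoutine::readStream() is conceptually equivalent to the scalar
C++ sketch below. The real code operates on 4-wide Reactor vectors and
only applies the clamp when robustBufferAccess is enabled; the function
and parameter names here are made up:

    #include <cstdint>

    // Sketch: if reading 'bytesPerAttrib' bytes at 'index * stride' would
    // run past 'robustnessSize' (the bytes remaining in the bound vertex
    // buffer), redirect the load to a zero-filled local instead of the
    // vertex buffer.
    const uint8_t *clampedSource(const uint8_t *buffer, uint32_t index,
                                 uint32_t stride, uint32_t bytesPerAttrib,
                                 uint32_t robustnessSize,
                                 const uint32_t zero[4])
    {
        uint32_t offset = index * stride;

        if(offset + bytesPerAttrib > robustnessSize)
        {
            return reinterpret_cast<const uint8_t *>(zero);
        }

        return buffer + offset;
    }

Returning zeros for out-of-bounds reads is one of the behaviors
permitted by the robustBufferAccess feature.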

This affects dEQP-VK.robustness.vertex_access.* tests,
which will be enabled when the vertexPipelineStoresAndAtomics
feature is turned on.

For now, this change should be a no-op in test results.

Bug: b/140294254 b/131224163

Change-Id: Ib4d4cdb73f48495a556b6bd05b1d0e3ce800ad15
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/36268
Tested-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Device/Context.cpp b/src/Device/Context.cpp
index 6825d01..34a22a7 100644
--- a/src/Device/Context.cpp
+++ b/src/Device/Context.cpp
@@ -99,6 +99,8 @@
 		frontStencil = {};
 		backStencil = {};
 
+		robustBufferAccess = false;
+
 		rasterizerDiscard = false;
 
 		depthCompareMode = VK_COMPARE_OP_LESS;
diff --git a/src/Device/Context.hpp b/src/Device/Context.hpp
index 610ec88..2736a9e 100644
--- a/src/Device/Context.hpp
+++ b/src/Device/Context.hpp
@@ -107,6 +107,7 @@
 		vk::DescriptorSet::Bindings descriptorSets = {};
 		vk::DescriptorSet::DynamicOffsets descriptorDynamicOffsets = {};
 		Stream input[MAX_INTERFACE_COMPONENTS / 4];
+		bool robustBufferAccess;
 
 		vk::ImageView *renderTarget[RENDERTARGETS];
 		vk::ImageView *depthBuffer;
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index 8b0ab36..de4e9af 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -277,6 +277,7 @@
 		for(int i = 0; i < MAX_INTERFACE_COMPONENTS/4; i++)
 		{
 			data->input[i] = context->input[i].buffer;
+			data->robustnessSize[i] = context->input[i].robustnessSize;
 			data->stride[i] = context->input[i].vertexStride;
 		}
 
@@ -1057,10 +1058,11 @@
 		for(uint32_t i = 0; i < vk::MAX_VERTEX_INPUT_BINDINGS; i++)
 		{
 			auto &attrib = inputs[i];
-			if (attrib.count && attrib.instanceStride)
+			if (attrib.count && attrib.instanceStride && (attrib.instanceStride < attrib.robustnessSize))
 			{
 				// Under the casts: attrib.buffer += attrib.instanceStride
 				attrib.buffer = (void const *)((uintptr_t)attrib.buffer + attrib.instanceStride);
+				attrib.robustnessSize -= attrib.instanceStride;
 			}
 		}
 	}
diff --git a/src/Device/Renderer.hpp b/src/Device/Renderer.hpp
index 63b4587..9b45f74 100644
--- a/src/Device/Renderer.hpp
+++ b/src/Device/Renderer.hpp
@@ -66,6 +66,7 @@
 		vk::DescriptorSet::DynamicOffsets descriptorDynamicOffsets = {};
 
 		const void *input[MAX_INTERFACE_COMPONENTS / 4];
+		unsigned int robustnessSize[MAX_INTERFACE_COMPONENTS / 4];
 		unsigned int stride[MAX_INTERFACE_COMPONENTS / 4];
 		const void *indices;
 
diff --git a/src/Device/Stream.hpp b/src/Device/Stream.hpp
index cdf5c69..b6bb56c 100644
--- a/src/Device/Stream.hpp
+++ b/src/Device/Stream.hpp
@@ -39,6 +39,7 @@
 	struct Stream
 	{
 		const void *buffer = nullptr;
+		unsigned int robustnessSize = 0;
 		unsigned int vertexStride = 0;
 		unsigned int instanceStride = 0;
 		StreamType type = STREAMTYPE_FLOAT;
diff --git a/src/Device/VertexProcessor.cpp b/src/Device/VertexProcessor.cpp
index 72e4cf4..80988cd 100644
--- a/src/Device/VertexProcessor.cpp
+++ b/src/Device/VertexProcessor.cpp
@@ -44,6 +44,32 @@
 		return hash;
 	}
 
+	unsigned int VertexProcessor::States::Input::bytesPerAttrib() const
+	{
+		switch(type)
+		{
+		case STREAMTYPE_FLOAT:
+		case STREAMTYPE_INT:
+		case STREAMTYPE_UINT:
+			return count * sizeof(uint32_t);
+		case STREAMTYPE_HALF:
+		case STREAMTYPE_SHORT:
+		case STREAMTYPE_USHORT:
+			return count * sizeof(uint16_t);
+		case STREAMTYPE_BYTE:
+		case STREAMTYPE_SBYTE:
+			return count * sizeof(uint8_t);
+		case STREAMTYPE_COLOR:
+		case STREAMTYPE_2_10_10_10_INT:
+		case STREAMTYPE_2_10_10_10_UINT:
+			return sizeof(int);
+		default:
+			UNSUPPORTED("stream.type %d", int(type));
+		}
+
+		return 0;
+	}
+
 	bool VertexProcessor::State::operator==(const State &state) const
 	{
 		if(hash != state.hash)
@@ -78,6 +104,7 @@
 		State state;
 
 		state.shaderID = context->vertexShader->getSerialID();
+		state.robustBufferAccess = context->robustBufferAccess;
 		state.isPoint = context->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
 
 		for(int i = 0; i < MAX_INTERFACE_COMPONENTS / 4; i++)
diff --git a/src/Device/VertexProcessor.hpp b/src/Device/VertexProcessor.hpp
index 247e8a3..e574e77 100644
--- a/src/Device/VertexProcessor.hpp
+++ b/src/Device/VertexProcessor.hpp
@@ -68,6 +68,8 @@
 					return count != 0;
 				}
 
+				unsigned int bytesPerAttrib() const;
+
 				StreamType type    : BITS(STREAMTYPE_LAST);
 				unsigned int count : 3;
 				bool normalized    : 1;
@@ -75,6 +77,7 @@
 			};
 
 			Input input[MAX_INTERFACE_COMPONENTS / 4];
+			bool robustBufferAccess : 1;
 			bool isPoint : 1;
 		};
 
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index be84300..acc13cd 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -94,8 +94,13 @@
 			{
 				Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void*) * (i / 4));
 				UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(uint32_t) * (i / 4));
+				UInt robustnessSize(0);
+				if(state.robustBufferAccess)
+				{
+					robustnessSize = *Pointer<UInt>(data + OFFSET(DrawData, robustnessSize) + sizeof(uint32_t) * (i / 4));
+				}
 
-				auto value = readStream(input, stride, state.input[i / 4], batch);
+				auto value = readStream(input, stride, state.input[i / 4], batch, state.robustBufferAccess, robustnessSize);
 				routine.inputs[i + 0] = value.x;
 				routine.inputs[i + 1] = value.y;
 				routine.inputs[i + 2] = value.z;
@@ -137,14 +142,28 @@
 		clipFlags |= Pointer<Int>(constants + OFFSET(Constants,fini))[SignMask(finiteXYZ)];
 	}
 
-	Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch)
+	Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch,
+	                                   bool robustBufferAccess, UInt & robustnessSize)
 	{
 		Vector4f v;
+		UInt4 offsets = *Pointer<UInt4>(As<Pointer<UInt4>>(batch)) * UInt4(stride);
 
-		Pointer<Byte> source0 = buffer + batch[0] * stride;
-		Pointer<Byte> source1 = buffer + batch[1] * stride;
-		Pointer<Byte> source2 = buffer + batch[2] * stride;
-		Pointer<Byte> source3 = buffer + batch[3] * stride;
+		Pointer<Byte> source0 = buffer + offsets.x;
+		Pointer<Byte> source1 = buffer + offsets.y;
+		Pointer<Byte> source2 = buffer + offsets.z;
+		Pointer<Byte> source3 = buffer + offsets.w;
+
+		UInt4 zero(0);
+		if (robustBufferAccess)
+		{
+			// TODO(b/141124876): Optimize for wide-vector gather operations.
+			UInt4 limits = offsets + UInt4(stream.bytesPerAttrib());
+			Pointer<Byte> zeroSource = As<Pointer<Byte>>(&zero);
+			source0 = IfThenElse(limits.x <= robustnessSize, source0, zeroSource);
+			source1 = IfThenElse(limits.y <= robustnessSize, source1, zeroSource);
+			source2 = IfThenElse(limits.z <= robustnessSize, source2, zeroSource);
+			source3 = IfThenElse(limits.w <= robustnessSize, source3, zeroSource);
+		}
 
 		bool isNativeFloatAttrib = (stream.attribType == SpirvShader::ATTRIBTYPE_FLOAT) || stream.normalized;
 
diff --git a/src/Pipeline/VertexRoutine.hpp b/src/Pipeline/VertexRoutine.hpp
index 5a0a6e2..3be1484 100644
--- a/src/Pipeline/VertexRoutine.hpp
+++ b/src/Pipeline/VertexRoutine.hpp
@@ -66,7 +66,8 @@
 
 		typedef VertexProcessor::State::Input Stream;
 
-		Vector4f readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch);
+		Vector4f readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch,
+		                    bool robustBufferAccess, UInt& robustnessSize);
 		void readInput(Pointer<UInt> &batch);
 		void computeClipFlags();
 		void writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch);
diff --git a/src/Vulkan/VkCommandBuffer.cpp b/src/Vulkan/VkCommandBuffer.cpp
index e3a9ae7..8562c2b 100644
--- a/src/Vulkan/VkCommandBuffer.cpp
+++ b/src/Vulkan/VkCommandBuffer.cpp
@@ -419,8 +419,13 @@
 		if (attrib.count)
 		{
 			const auto &vertexInput = vertexInputBindings[attrib.binding];
-			attrib.buffer = vertexInput.buffer ? vertexInput.buffer->getOffsetPointer(
-					attrib.offset + vertexInput.offset + attrib.vertexStride * firstVertex + attrib.instanceStride * firstInstance) : nullptr;
+			VkDeviceSize offset = attrib.offset + vertexInput.offset +
+			                      attrib.vertexStride * firstVertex +
+			                      attrib.instanceStride * firstInstance;
+			attrib.buffer = vertexInput.buffer ? vertexInput.buffer->getOffsetPointer(offset) : nullptr;
+
+			VkDeviceSize size = vertexInput.buffer ? vertexInput.buffer->getSize() : 0;
+			attrib.robustnessSize = (size > offset) ? size - offset : 0;
 		}
 	}
 }
diff --git a/src/Vulkan/VkPipeline.cpp b/src/Vulkan/VkPipeline.cpp
index 86ea342..7c55daa 100644
--- a/src/Vulkan/VkPipeline.cpp
+++ b/src/Vulkan/VkPipeline.cpp
@@ -283,6 +283,8 @@
 GraphicsPipeline::GraphicsPipeline(const VkGraphicsPipelineCreateInfo* pCreateInfo, void* mem, const Device *device)
 	: Pipeline(vk::Cast(pCreateInfo->layout), device)
 {
+	context.robustBufferAccess = robustBufferAccess;
+
 	if(((pCreateInfo->flags &
 		~(VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT |
 	      VK_PIPELINE_CREATE_DERIVATIVE_BIT |