Process independent vertex elements

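Previously, vertices were processed in groups of four consecutive
indices (the SSE/NEON SIMD width). Now the four indices of each group
are read individually from the index buffer, so they no longer have to
be consecutive. Reading the input was already a gather operation, but
with a constant stride. Writing to the vertex cache is now a scatter:
each vertex goes to the cache entry selected by its own index. The
vertices are written in reverse order so that, when several of them map
to the same cache entry, the first vertex in a group is always present
in the cache.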

Also use 2^32-1 as the invalid vertex cache tag (it corresponds to the
primitive restart index) instead of 0x80000000, since
maxDrawIndexedIndexValue is UINT32_MAX and 0x80000000 can therefore be
a valid vertex index.

Bug: b/27351835
Test: dEQP-VK.glsl.loops.special.do_while_dynamic_iterations.dowhile_trap_vertex
Change-Id: Ic69dbf53c67cbda50e44913ccae91aaca2b86e21
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/32609
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index dc0c3f5..3d4581b 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -824,7 +824,7 @@
 			task->vertexCache.drawCall = primitiveDrawCall;
 		}
 
-		unsigned int batch[128][3];   // FIXME: Adjust to dynamic batch size
+		unsigned int batch[128 + 1][3];  // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size.
 		VkPrimitiveTopology topology = static_cast<VkPrimitiveTopology>(static_cast<int>(draw->topology));
 
 		if(!indices)
@@ -862,6 +862,11 @@
 			}
 		}
 
+		// Repeat the last index to allow for SIMD width overrun.
+		batch[triangleCount][0] = batch[triangleCount - 1][2];
+		batch[triangleCount][1] = batch[triangleCount - 1][2];
+		batch[triangleCount][2] = batch[triangleCount - 1][2];
+
 		task->primitiveStart = start;
 		task->vertexCount = triangleCount * 3;
 		vertexRoutine(&triangle->v0, (unsigned int*)&batch, task, data);
diff --git a/src/Device/VertexProcessor.cpp b/src/Device/VertexProcessor.cpp
index 82c4547..5c66309 100644
--- a/src/Device/VertexProcessor.cpp
+++ b/src/Device/VertexProcessor.cpp
@@ -25,9 +25,9 @@
 {
 	void VertexCache::clear()
 	{
-		for(int i = 0; i < 16; i++)
+		for(uint32_t i = 0; i < SIZE; i++)
 		{
-			tag[i] = 0x80000000;
+			tag[i] = 0xFFFFFFFF;
 		}
 	}
 
diff --git a/src/Device/VertexProcessor.hpp b/src/Device/VertexProcessor.hpp
index 811ac32..a17e86a 100644
--- a/src/Device/VertexProcessor.hpp
+++ b/src/Device/VertexProcessor.hpp
@@ -25,12 +25,16 @@
 {
 	struct DrawData;
 
-	struct VertexCache   // FIXME: Variable size
+	// Basic direct mapped vertex cache.
+	struct VertexCache
 	{
+		static constexpr uint32_t SIZE = 64;  // TODO: Variable size?
+		static constexpr uint32_t TAG_MASK = SIZE - 1;  // Size must be power of 2.
+
 		void clear();
 
-		Vertex vertex[16][4];
-		unsigned int tag[16];
+		Vertex vertex[SIZE];
+		uint32_t tag[SIZE];
 
 		int drawCall;
 	};
diff --git a/src/Pipeline/VertexProgram.cpp b/src/Pipeline/VertexProgram.cpp
index 647ff3a..e240e7f 100644
--- a/src/Pipeline/VertexProgram.cpp
+++ b/src/Pipeline/VertexProgram.cpp
@@ -73,17 +73,23 @@
 	{
 	}
 
-	void VertexProgram::program(UInt &index)
+	void VertexProgram::program(Pointer<UInt> &batch)
 	{
 		auto it = spirvShader->inputBuiltins.find(spv::BuiltInVertexIndex);
 		if (it != spirvShader->inputBuiltins.end())
 		{
 			assert(it->second.SizeInComponents == 1);
+
+			Int4 indices;
+			indices = Insert(indices, As<Int>(batch[0]), 0);
+			indices = Insert(indices, As<Int>(batch[1]), 1);
+			indices = Insert(indices, As<Int>(batch[2]), 2);
+			indices = Insert(indices, As<Int>(batch[3]), 3);
 			routine.getVariable(it->second.Id)[it->second.FirstComponent] =
-					As<Float4>(Int4(As<Int>(index) + *Pointer<Int>(data + OFFSET(DrawData, baseVertex))) + Int4(0, 1, 2, 3));
+					As<Float4>(indices + Int4(*Pointer<Int>(data + OFFSET(DrawData, baseVertex))));
 		}
 
-		auto activeLaneMask = SIMD::Int(0xFFFFFFFF); // TODO: Control this.
+		auto activeLaneMask = SIMD::Int(0xFFFFFFFF);
 		spirvShader->emit(&routine, activeLaneMask, descriptorSets);
 
 		spirvShader->emitEpilog(&routine);
diff --git a/src/Pipeline/VertexProgram.hpp b/src/Pipeline/VertexProgram.hpp
index 765575d..3c8a664 100644
--- a/src/Pipeline/VertexProgram.hpp
+++ b/src/Pipeline/VertexProgram.hpp
@@ -34,7 +34,7 @@
 		virtual ~VertexProgram();
 
 	private:
-		void program(UInt &index) override;
+		void program(Pointer<UInt> &batch) override;
 
 		const vk::DescriptorSet::Bindings &descriptorSets;
 	};
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index ae65feb..8ada5d4 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -42,33 +42,32 @@
 	{
 		Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
 		Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
-		Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
+		Pointer<UInt> tagCache = Pointer<UInt>(cache + OFFSET(VertexCache,tag));
 
 		UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
 
 		constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
 
+		// Check the cache one vertex index at a time. If a hit occurs, copy from the cache to the 'vertex' output buffer.
+		// On a cache miss, process a SIMD width of consecutive indices from the input batch. They're written to the cache
+		// in reverse order to guarantee that the first one doesn't get evicted and can be written out.
+
 		Do
 		{
 			UInt index = *batch;
-			UInt tagIndex = index & 0x0000003C;
-			UInt indexQ = index & 0xFFFFFFFC;
+			UInt cacheIndex = index & VertexCache::TAG_MASK;
 
-			If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
+			If(tagCache[cacheIndex] != index)
 			{
-				*Pointer<UInt>(tagCache + tagIndex) = indexQ;
-
-				readInput(indexQ);
-				program(indexQ);
+				readInput(batch);
+				program(batch);
 				computeClipFlags();
 
-				Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
-				writeCache(cacheLine0);
+				writeCache(vertexCache, tagCache, batch);
 			}
 
-			UInt cacheIndex = index & 0x0000003F;
-			Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
-			writeVertex(vertex, cacheLine);
+			Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
+			writeVertex(vertex, cacheEntry);
 
 			vertex += sizeof(Vertex);
 			batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
@@ -79,7 +78,7 @@
 		Return();
 	}
 
-	void VertexRoutine::readInput(UInt &index)
+	void VertexRoutine::readInput(Pointer<UInt> &batch)
 	{
 		for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
 		{
@@ -88,11 +87,10 @@
 			   spirvShader->inputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
 			   spirvShader->inputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
 			{
-
 				Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void*) * (i / 4));
 				UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(uint32_t) * (i / 4));
 
-				auto value = readStream(input, stride, state.input[i / 4], index);
+				auto value = readStream(input, stride, state.input[i / 4], batch);
 				routine.inputs[i + 0] = value.x;
 				routine.inputs[i + 1] = value.y;
 				routine.inputs[i + 2] = value.z;
@@ -134,14 +132,14 @@
 		clipFlags |= Pointer<Int>(constants + OFFSET(Constants,fini))[SignMask(finiteXYZ)];
 	}
 
-	Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
+	Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch)
 	{
 		Vector4f v;
 
-		Pointer<Byte> source0 = buffer + index * stride;
-		Pointer<Byte> source1 = source0 + stride;
-		Pointer<Byte> source2 = source1 + stride;
-		Pointer<Byte> source3 = source2 + stride;
+		Pointer<Byte> source0 = buffer + batch[0] * stride;
+		Pointer<Byte> source1 = buffer + batch[1] * stride;
+		Pointer<Byte> source2 = buffer + batch[2] * stride;
+		Pointer<Byte> source3 = buffer + batch[3] * stride;
 
 		bool isNativeFloatAttrib = (stream.attribType == SpirvShader::ATTRIBTYPE_FLOAT) || stream.normalized;
 
@@ -486,8 +484,25 @@
 		return v;
 	}
 
-	void VertexRoutine::writeCache(Pointer<Byte> &cacheLine)
+	void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch)
 	{
+		UInt index0 = batch[0];
+		UInt index1 = batch[1];
+		UInt index2 = batch[2];
+		UInt index3 = batch[3];
+
+		UInt cacheIndex0 = index0 & VertexCache::TAG_MASK;
+		UInt cacheIndex1 = index1 & VertexCache::TAG_MASK;
+		UInt cacheIndex2 = index2 & VertexCache::TAG_MASK;
+		UInt cacheIndex3 = index3 & VertexCache::TAG_MASK;
+
+		// We processed a SIMD group of vertices, with the first one being the one that missed the cache tag check.
+		// Write them out in reverse order here and below to ensure the first one is now guaranteed to be in the cache.
+		tagCache[cacheIndex3] = index3;
+		tagCache[cacheIndex2] = index2;
+		tagCache[cacheIndex1] = index1;
+		tagCache[cacheIndex0] = index0;
+
 		auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
 		assert(it != spirvShader->outputBuiltins.end());
 		assert(it->second.SizeInComponents == 4);
@@ -511,10 +526,10 @@
 
 		transpose4x4(pos.x, pos.y, pos.z, pos.w);
 
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 0, 16) = pos.x;
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 1, 16) = pos.y;
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 2, 16) = pos.z;
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 3, 16) = pos.w;
+		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,position), 16) = pos.w;
+		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,position), 16) = pos.z;
+		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,position), 16) = pos.y;
+		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,position), 16) = pos.x;
 
 		it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
 		if(it != spirvShader->outputBuiltins.end())
@@ -522,23 +537,23 @@
 			assert(it->second.SizeInComponents == 1);
 			auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent];
 
-			*Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 0) = Extract(psize, 0);
-			*Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 1) = Extract(psize, 1);
-			*Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 2) = Extract(psize, 2);
-			*Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 3) = Extract(psize, 3);
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,pointSize)) = Extract(psize, 3);
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,pointSize)) = Extract(psize, 2);
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,pointSize)) = Extract(psize, 1);
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,pointSize)) = Extract(psize, 0);
 		}
 
-		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0)  & 0x0000000FF;
-		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8)  & 0x0000000FF;
-		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF;
-		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;
+		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 24) & 0x0000000FF;
+		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 16) & 0x0000000FF;
+		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 8)  & 0x0000000FF;
+		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 0)  & 0x0000000FF;
 
 		transpose4x4(proj.x, proj.y, proj.z, proj.w);
 
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 0, 16) = proj.x;
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 1, 16) = proj.y;
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 2, 16) = proj.z;
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 3, 16) = proj.w;
+		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,projected), 16) = proj.w;
+		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,projected), 16) = proj.z;
+		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,projected), 16) = proj.y;
+		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,projected), 16) = proj.x;
 
 		for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
 		{
@@ -555,10 +570,10 @@
 
 				transpose4x4(v.x, v.y, v.z, v.w);
 
-				*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x;
-				*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
-				*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
-				*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
+				*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,v[i]), 16) = v.w;
+				*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,v[i]), 16) = v.z;
+				*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,v[i]), 16) = v.y;
+				*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,v[i]), 16) = v.x;
 			}
 		}
 	}
diff --git a/src/Pipeline/VertexRoutine.hpp b/src/Pipeline/VertexRoutine.hpp
index 2e71343..dc22ecb 100644
--- a/src/Pipeline/VertexRoutine.hpp
+++ b/src/Pipeline/VertexRoutine.hpp
@@ -62,14 +62,14 @@
 		SpirvShader const * const spirvShader;
 
 	private:
-		virtual void program(UInt &index) = 0;
+		virtual void program(Pointer<UInt> &batch) = 0;
 
 		typedef VertexProcessor::State::Input Stream;
 
-		Vector4f readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index);
-		void readInput(UInt &index);
+		Vector4f readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch);
+		void readInput(Pointer<UInt> &batch);
 		void computeClipFlags();
-		void writeCache(Pointer<Byte> &cacheLine);
+		void writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch);
 		void writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry);
 	};
 }