Prevent extra vertices from being processed

Extra vertices are added at the end of the vertex list when
processing a batch in order to avoid SIMD width overrun.
A single comparison against the vertex count lets us disable
these extra lanes in the stores and atomics mask, so that
they cannot produce any stores or atomic operations.

This would fix the following tests once the
vertexPipelineStoresAndAtomics feature is enabled:
dEQP-VK.synchronization.op.single_queue.*

Note that these tests are affected because they wrongly
assume vertices won't be processed more than once. The
tests themselves should still be fixed.

Bug: b/140294254

Change-Id: I04185b899a9770537c3d10bcfd87e00e314582de
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/36368
Presubmit-Ready: Alexis Hétu <sugoi@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Pipeline/VertexProgram.cpp b/src/Pipeline/VertexProgram.cpp
index dedf800..5bb3918 100644
--- a/src/Pipeline/VertexProgram.cpp
+++ b/src/Pipeline/VertexProgram.cpp
@@ -63,7 +63,7 @@
 	{
 	}
 
-	void VertexProgram::program(Pointer<UInt> &batch)
+	void VertexProgram::program(Pointer<UInt> &batch, UInt& vertexCount)
 	{
 		auto it = spirvShader->inputBuiltins.find(spv::BuiltInVertexIndex);
 		if (it != spirvShader->inputBuiltins.end())
@@ -80,7 +80,8 @@
 		}
 
 		auto activeLaneMask = SIMD::Int(0xFFFFFFFF);
-		spirvShader->emit(&routine, activeLaneMask, activeLaneMask, descriptorSets);
+		Int4 storesAndAtomicsMask = CmpGE(UInt4(vertexCount), UInt4(1, 2, 3, 4));
+		spirvShader->emit(&routine, activeLaneMask, storesAndAtomicsMask, descriptorSets);
 
 		spirvShader->emitEpilog(&routine);
 	}
diff --git a/src/Pipeline/VertexProgram.hpp b/src/Pipeline/VertexProgram.hpp
index 3c8a664..7baee79 100644
--- a/src/Pipeline/VertexProgram.hpp
+++ b/src/Pipeline/VertexProgram.hpp
@@ -34,7 +34,7 @@
 		virtual ~VertexProgram();
 
 	private:
-		void program(Pointer<UInt> &batch) override;
+		void program(Pointer<UInt> &batch, UInt& vertexCount) override;
 
 		const vk::DescriptorSet::Bindings &descriptorSets;
 	};
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index e43e96b..be84300 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -60,7 +60,7 @@
 			If(tagCache[cacheIndex] != index)
 			{
 				readInput(batch);
-				program(batch);
+				program(batch, vertexCount);
 				computeClipFlags();
 
 				writeCache(vertexCache, tagCache, batch);
diff --git a/src/Pipeline/VertexRoutine.hpp b/src/Pipeline/VertexRoutine.hpp
index dc22ecb..5a0a6e2 100644
--- a/src/Pipeline/VertexRoutine.hpp
+++ b/src/Pipeline/VertexRoutine.hpp
@@ -62,7 +62,7 @@
 		SpirvShader const * const spirvShader;
 
 	private:
-		virtual void program(Pointer<UInt> &batch) = 0;
+		virtual void program(Pointer<UInt> &batch, UInt& vertexCount) = 0;
 
 		typedef VertexProcessor::State::Input Stream;