Enable fragmentStoresAndAtomics

A few changes were made to make this work:
- activeLaneMask now contains cMask. Since it's only used in
  loads, stores and atomics, it should be fine to include it
- Added a storesAndAtomicsMask, which also contains sMask
  (stencil) and zMask (depth) for early fragment tests.
  The mask affects all atomic operations and store operations
  into storage buffer and images.
- Added support for spv::BuiltInHelperInvocation

Bug b/140294254

Test: dEQP-VK.*

Change-Id: I42b97a766ddfe331bb2767d80d4360104a221482
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/34114
Presubmit-Ready: Alexis Hétu <sugoi@google.com>
Tested-by: Alexis Hétu <sugoi@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
diff --git a/src/Pipeline/ComputeProgram.cpp b/src/Pipeline/ComputeProgram.cpp
index a209339..2a86cc4 100644
--- a/src/Pipeline/ComputeProgram.cpp
+++ b/src/Pipeline/ComputeProgram.cpp
@@ -188,7 +188,7 @@
 
 			setSubgroupBuiltins(data, routine, workgroupID, localInvocationIndex, subgroupIndex);
 
-			shader->emit(routine, activeLaneMask, descriptorSets);
+			shader->emit(routine, activeLaneMask, activeLaneMask, descriptorSets);
 		}
 	}
 
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index d9a7bb5..d38e245 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -20,7 +20,43 @@
 
 namespace sw
 {
-	void PixelProgram::setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w)
+	// Union all cMask and return it as 4 booleans
+	Int4 PixelProgram::maskAny(Int cMask[4]) const
+	{
+		// See if at least 1 sample is used
+		Int maskUnion = cMask[0];
+		for(auto i = 1u; i < state.multiSample; i++)
+		{
+			maskUnion |= cMask[i];
+		}
+
+		// Convert to 4 booleans
+		Int4 laneBits = Int4(1, 2, 4, 8);
+		Int4 laneShiftsToMSB = Int4(31, 30, 29, 28);
+		Int4 mask(maskUnion);
+		mask = ((mask & laneBits) << laneShiftsToMSB) >> Int4(31);
+		return mask;
+	}
+
+	// Union all cMask/sMask/zMask and return it as 4 booleans
+	Int4 PixelProgram::maskAny(Int cMask[4], Int sMask[4], Int zMask[4]) const
+	{
+		// See if at least 1 sample is used
+		Int maskUnion = cMask[0] & sMask[0] & zMask[0];
+		for(auto i = 1u; i < state.multiSample; i++)
+		{
+			maskUnion |= (cMask[i] & sMask[i] & zMask[i]);
+		}
+
+		// Convert to 4 booleans
+		Int4 laneBits = Int4(1, 2, 4, 8);
+		Int4 laneShiftsToMSB = Int4(31, 30, 29, 28);
+		Int4 mask(maskUnion);
+		mask = ((mask & laneBits) << laneShiftsToMSB) >> Int4(31);
+		return mask;
+	}
+
+	void PixelProgram::setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w, Int cMask[4])
 	{
 		routine.setImmutableInputBuiltins(spirvShader);
 
@@ -54,12 +90,18 @@
 			value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(SIMD::Width));
 		});
 
+		routine.setInputBuiltin(spirvShader, spv::BuiltInHelperInvocation, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+		{
+			assert(builtin.SizeInComponents == 1);
+			value[builtin.FirstComponent] = As<SIMD::Float>(~maskAny(cMask));
+		});
+
 		routine.windowSpacePosition[0] = x + SIMD::Int(0,1,0,1);
 		routine.windowSpacePosition[1] = y + SIMD::Int(0,0,1,1);
 		routine.viewID = *Pointer<Int>(data + OFFSET(DrawData, viewID));
 	}
 
-	void PixelProgram::applyShader(Int cMask[4])
+	void PixelProgram::applyShader(Int cMask[4], Int sMask[4], Int zMask[4])
 	{
 		routine.descriptorSets = data + OFFSET(DrawData, descriptorSets);
 		routine.descriptorDynamicOffsets = data + OFFSET(DrawData, descriptorDynamicOffsets);
@@ -96,9 +138,10 @@
 		// Note: all lanes initially active to facilitate derivatives etc. Actual coverage is
 		// handled separately, through the cMask.
 		auto activeLaneMask = SIMD::Int(0xFFFFFFFF);
+		auto storesAndAtomicsMask = maskAny(cMask, sMask, zMask);
 		routine.killMask = 0;
 
-		spirvShader->emit(&routine, activeLaneMask, descriptorSets);
+		spirvShader->emit(&routine, activeLaneMask, storesAndAtomicsMask, descriptorSets);
 		spirvShader->emitEpilog(&routine);
 
 		for(int i = 0; i < RENDERTARGETS; i++)
diff --git a/src/Pipeline/PixelProgram.hpp b/src/Pipeline/PixelProgram.hpp
index 9f6d14f..3555aee 100644
--- a/src/Pipeline/PixelProgram.hpp
+++ b/src/Pipeline/PixelProgram.hpp
@@ -34,8 +34,8 @@
 		virtual ~PixelProgram() {}
 
 	protected:
-		virtual void setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w);
-		virtual void applyShader(Int cMask[4]);
+		virtual void setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w, Int cMask[4]);
+		virtual void applyShader(Int cMask[4], Int sMask[4], Int zMask[4]);
 		virtual Bool alphaTest(Int cMask[4]);
 		virtual void rasterOperation(Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4]);
 
@@ -46,6 +46,8 @@
 		// Raster operations
 		void clampColor(Vector4f oC[RENDERTARGETS]);
 
+		Int4 maskAny(Int cMask[4]) const;
+		Int4 maskAny(Int cMask[4], Int sMask[4], Int zMask[4]) const;
 		Float4 linearToSRGB(const Float4 &x);
 	};
 }
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index a06b4d1..186a44a 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -53,7 +53,7 @@
 	void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
 	{
 		// TODO: consider shader which modifies sample mask in general
-		const bool earlyDepthTest = !spirvShader || (!spirvShader->getModes().DepthReplacing && !state.alphaToCoverage);
+		const bool earlyDepthTest = !spirvShader || (spirvShader->getModes().EarlyFragmentTests && !spirvShader->getModes().DepthReplacing && !state.alphaToCoverage);
 
 		Int zMask[4];   // Depth mask
 		Int sMask[4];   // Stencil mask
@@ -161,14 +161,15 @@
 					}
 				}
 
-				setBuiltins(x, y, z, w);
+				setBuiltins(x, y, z, w, cMask);
 			}
 
 			Bool alphaPass = true;
 
 			if (spirvShader)
 			{
-				applyShader(cMask);
+				bool earlyFragTests = (spirvShader && spirvShader->getModes().EarlyFragmentTests);
+				applyShader(cMask, earlyFragTests ? sMask : cMask, earlyDepthTest ? zMask : cMask);
 			}
 
 			alphaPass = alphaTest(cMask);
diff --git a/src/Pipeline/PixelRoutine.hpp b/src/Pipeline/PixelRoutine.hpp
index c4d2423..b9486af 100644
--- a/src/Pipeline/PixelRoutine.hpp
+++ b/src/Pipeline/PixelRoutine.hpp
@@ -43,8 +43,8 @@
 		// Depth output
 		Float4 oDepth;
 
-		virtual void setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w) = 0;
-		virtual void applyShader(Int cMask[4]) = 0;
+		virtual void setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w, Int cMask[4]) = 0;
+		virtual void applyShader(Int cMask[4], Int sMask[4], Int zMask[4]) = 0;
 		virtual Bool alphaTest(Int cMask[4]) = 0;
 		virtual void rasterOperation(Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4]) = 0;
 
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index d47fd87..63d403c 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -1367,6 +1367,19 @@
 		}
 	}
 
+	bool SpirvShader::StoresInHelperInvocation(spv::StorageClass storageClass)
+	{
+		switch (storageClass)
+		{
+		case spv::StorageClassUniform:
+		case spv::StorageClassStorageBuffer:
+		case spv::StorageClassImage:
+			return false;
+		default:
+			return true;
+		}
+	}
+
 	bool SpirvShader::IsExplicitLayout(spv::StorageClass storageClass)
 	{
 		switch (storageClass)
@@ -2136,9 +2149,9 @@
 		}
 	}
 
-	void SpirvShader::emit(SpirvRoutine *routine, RValue<SIMD::Int> const &activeLaneMask, const vk::DescriptorSet::Bindings &descriptorSets) const
+	void SpirvShader::emit(SpirvRoutine *routine, RValue<SIMD::Int> const &activeLaneMask, RValue<SIMD::Int> const &storesAndAtomicsMask, const vk::DescriptorSet::Bindings &descriptorSets) const
 	{
-		EmitState state(routine, entryPoint, activeLaneMask, descriptorSets, robustBufferAccess, executionModel);
+		EmitState state(routine, entryPoint, activeLaneMask, storesAndAtomicsMask, descriptorSets, robustBufferAccess, executionModel);
 
 		// Emit everything up to the first label
 		// TODO: Separate out dispatch of block from non-block instructions?
@@ -2975,6 +2988,12 @@
 		bool interleavedByLane = IsStorageInterleavedByLane(pointerTy.storageClass);
 		auto robustness = state->getOutOfBoundsBehavior(pointerTy.storageClass);
 
+		SIMD::Int mask = state->activeLaneMask();
+		if (!StoresInHelperInvocation(pointerTy.storageClass))
+		{
+			mask = mask & state->storesAndAtomicsMask();
+		}
+
 		if (object.kind == Object::Kind::Constant)
 		{
 			// Constant source data.
@@ -2983,7 +3002,7 @@
 			{
 				auto p = ptr + offset;
 				if (interleavedByLane) { p = interleaveByLane(p); }
-				SIMD::Store(p, SIMD::Float(src[i]), robustness, state->activeLaneMask(), atomic, memoryOrder);
+				SIMD::Store(p, SIMD::Float(src[i]), robustness, mask, atomic, memoryOrder);
 			});
 		}
 		else
@@ -2994,7 +3013,7 @@
 			{
 				auto p = ptr + offset;
 				if (interleavedByLane) { p = interleaveByLane(p); }
-				SIMD::Store(p, src.Float(i), robustness, state->activeLaneMask(), atomic, memoryOrder);
+				SIMD::Store(p, src.Float(i), robustness, mask, atomic, memoryOrder);
 			});
 		}
 
@@ -5843,10 +5862,11 @@
 		auto ptr = state->getPointer(insn.word(3));
 		auto ptrOffsets = ptr.offsets();
 
-		SIMD::UInt x;
+		SIMD::UInt x(0);
+		auto mask = state->activeLaneMask() & state->storesAndAtomicsMask();
 		for (int j = 0; j < SIMD::Width; j++)
 		{
-			If(Extract(state->activeLaneMask(), j) != 0)
+			If(Extract(mask, j) != 0)
 			{
 				auto offset = Extract(ptrOffsets, j);
 				auto laneValue = Extract(value, j);
@@ -5914,10 +5934,11 @@
 		auto ptr = state->getPointer(insn.word(3));
 		auto ptrOffsets = ptr.offsets();
 
-		SIMD::UInt x;
+		SIMD::UInt x(0);
+		auto mask = state->activeLaneMask() & state->storesAndAtomicsMask();
 		for (int j = 0; j < SIMD::Width; j++)
 		{
-			If(Extract(state->activeLaneMask(), j) != 0)
+			If(Extract(mask, j) != 0)
 			{
 				auto offset = Extract(ptrOffsets, j);
 				auto laneValue = Extract(value.UInt(0), j);
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index 0cc4373..a48e3b3 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -889,7 +889,7 @@
 		std::vector<InterfaceComponent> outputs;
 
 		void emitProlog(SpirvRoutine *routine) const;
-		void emit(SpirvRoutine *routine, RValue<SIMD::Int> const &activeLaneMask, const vk::DescriptorSet::Bindings &descriptorSets) const;
+		void emit(SpirvRoutine *routine, RValue<SIMD::Int> const &activeLaneMask, RValue<SIMD::Int> const &storesAndAtomicsMask, const vk::DescriptorSet::Bindings &descriptorSets) const;
 		void emitEpilog(SpirvRoutine *routine) const;
 
 		using BuiltInHash = std::hash<std::underlying_type<spv::BuiltIn>::type>;
@@ -967,6 +967,9 @@
 		//
 		static bool IsStorageInterleavedByLane(spv::StorageClass storageClass);
 		static bool IsExplicitLayout(spv::StorageClass storageClass);
+	
+		// Output storage buffers and images should not be affected by helper invocations
+		static bool StoresInHelperInvocation(spv::StorageClass storageClass);
 
 		template<typename F>
 		int VisitInterfaceInner(Type::ID id, Decorations d, F f) const;
@@ -991,12 +994,14 @@
 			EmitState(SpirvRoutine *routine,
 					Function::ID function,
 					RValue<SIMD::Int> activeLaneMask,
+					RValue<SIMD::Int> storesAndAtomicsMask,
 					const vk::DescriptorSet::Bindings &descriptorSets,
 					bool robustBufferAccess,
 					spv::ExecutionModel executionModel)
 				: routine(routine),
 				  function(function),
 				  activeLaneMaskValue(activeLaneMask.value),
+				  storesAndAtomicsMaskValue(storesAndAtomicsMask.value),
 				  descriptorSets(descriptorSets),
 				  robustBufferAccess(robustBufferAccess),
 				  executionModel(executionModel)
@@ -1010,6 +1015,12 @@
 				return RValue<SIMD::Int>(activeLaneMaskValue);
 			}
 
+			RValue<SIMD::Int> storesAndAtomicsMask() const
+			{
+				ASSERT(storesAndAtomicsMaskValue != nullptr);
+				return RValue<SIMD::Int>(storesAndAtomicsMaskValue);
+			}
+
 			void setActiveLaneMask(RValue<SIMD::Int> mask)
 			{
 				activeLaneMaskValue = mask.value;
@@ -1030,6 +1041,7 @@
 			Function::ID function; // The current function being built.
 			Block::ID block; // The current block being built.
 			rr::Value *activeLaneMaskValue = nullptr; // The current active lane mask.
+			rr::Value *storesAndAtomicsMaskValue = nullptr; // The current stores and atomics mask.
 			Block::Set visited; // Blocks already built.
 			std::unordered_map<Block::Edge, RValue<SIMD::Int>, Block::Edge::Hash> edgeActiveLaneMasks;
 			std::deque<Block::ID> *pending;
diff --git a/src/Pipeline/VertexProgram.cpp b/src/Pipeline/VertexProgram.cpp
index c7f020c..dedf800 100644
--- a/src/Pipeline/VertexProgram.cpp
+++ b/src/Pipeline/VertexProgram.cpp
@@ -80,7 +80,7 @@
 		}
 
 		auto activeLaneMask = SIMD::Int(0xFFFFFFFF);
-		spirvShader->emit(&routine, activeLaneMask, descriptorSets);
+		spirvShader->emit(&routine, activeLaneMask, activeLaneMask, descriptorSets);
 
 		spirvShader->emitEpilog(&routine);
 	}
diff --git a/src/Vulkan/VkPhysicalDevice.cpp b/src/Vulkan/VkPhysicalDevice.cpp
index 4abb0f8..8e6b8e7 100644
--- a/src/Vulkan/VkPhysicalDevice.cpp
+++ b/src/Vulkan/VkPhysicalDevice.cpp
@@ -57,7 +57,7 @@
 		VK_FALSE,  // occlusionQueryPrecise
 		VK_FALSE,  // pipelineStatisticsQuery
 		VK_FALSE,  // vertexPipelineStoresAndAtomics
-		VK_FALSE,  // fragmentStoresAndAtomics
+		VK_TRUE,   // fragmentStoresAndAtomics
 		VK_FALSE,  // shaderTessellationAndGeometryPointSize
 		VK_FALSE,  // shaderImageGatherExtended
 		VK_FALSE,  // shaderStorageImageExtendedFormats