Per sample shading

This cl introduces per sample shading in the fragment shader.
Rather than call the fragment shader multiple times per sample,
this cl adds a potential loop in the fragment shader where each
sample is processes in one of the loop's iteration.

- Each multisample related loop now processes either all samples,
  like before, or the current sample, if per sample shading is
  enabled
- A new per sample PixelProgram::maskAny() function was added
- emitEpilog() now has an option not to clear phis in order to be
  able to only clear them on the last sample
- The routine's fragCoord values are set per sample, with the
  proper sample offsets
- Similarly, the xxxx and yyyy values used for interpolation are
  now offset with the proper sample offsets when per sample
  shading is enabled

Bug: b/171415086
Change-Id: Ibd0c1bad23e2d81f7fa97240ebb50f88f1fee36e
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/51733
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
Commit-Queue: Alexis Hétu <sugoi@google.com>
Tested-by: Alexis Hétu <sugoi@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index 0deffa9..5066996 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "PixelProgram.hpp"
+#include "Constants.hpp"
 
 #include "SamplerCore.hpp"
 #include "Device/Primitive.hpp"
@@ -56,14 +57,37 @@
 	return mask;
 }
 
-void PixelProgram::setBuiltins(Int &x, Int &y, Float4 (&z)[4], Float4 &w, Int cMask[4])
+Int4 PixelProgram::maskAny(Int cMask, Int sMask, Int zMask) const
+{
+	Int maskUnion = cMask & sMask & zMask;
+
+	// Convert to 4 booleans
+	Int4 laneBits = Int4(1, 2, 4, 8);
+	Int4 laneShiftsToMSB = Int4(31, 30, 29, 28);
+	Int4 mask(maskUnion);
+	mask = ((mask & laneBits) << laneShiftsToMSB) >> Int4(31);
+	return mask;
+}
+
+void PixelProgram::setBuiltins(Int &x, Int &y, Float4 (&z)[4], Float4 &w, Int cMask[4], int sampleId)
 {
 	routine.setImmutableInputBuiltins(spirvShader);
 
 	// TODO(b/146486064): Consider only assigning these to the SpirvRoutine iff
 	// they are ever going to be read.
-	routine.fragCoord[0] = SIMD::Float(Float(x)) + SIMD::Float(0.5f, 1.5f, 0.5f, 1.5f);
-	routine.fragCoord[1] = SIMD::Float(Float(y)) + SIMD::Float(0.5f, 0.5f, 1.5f, 1.5f);
+	float x0 = 0.5f;
+	float y0 = 0.5f;
+	float x1 = 1.5f;
+	float y1 = 1.5f;
+	if((state.multiSampleCount > 1) && (sampleId >= 0))
+	{
+		x0 = Constants::VkSampleLocations4[sampleId][0];
+		y0 = Constants::VkSampleLocations4[sampleId][1];
+		x1 = 1.0f + x0;
+		y1 = 1.0f + y0;
+	}
+	routine.fragCoord[0] = SIMD::Float(Float(x)) + SIMD::Float(x0, x1, x0, x1);
+	routine.fragCoord[1] = SIMD::Float(Float(y)) + SIMD::Float(y0, y0, y1, y1);
 	routine.fragCoord[2] = z[0];  // sample 0
 	routine.fragCoord[3] = w;
 
@@ -109,8 +133,11 @@
 	});
 }
 
-void PixelProgram::applyShader(Int cMask[4], Int sMask[4], Int zMask[4])
+void PixelProgram::applyShader(Int cMask[4], Int sMask[4], Int zMask[4], int sampleId)
 {
+	unsigned int sampleLoopInit = (sampleId >= 0) ? sampleId : 0;
+	unsigned int sampleLoopEnd = (sampleId >= 0) ? sampleId + 1 : state.multiSampleCount;
+
 	routine.descriptorSets = data + OFFSET(DrawData, descriptorSets);
 	routine.descriptorDynamicOffsets = data + OFFSET(DrawData, descriptorDynamicOffsets);
 	routine.pushConstants = data + OFFSET(DrawData, pushConstants);
@@ -130,8 +157,8 @@
 		static_assert(SIMD::Width == 4, "Expects SIMD width to be 4");
 		Int4 laneBits = Int4(1, 2, 4, 8);
 
-		Int4 inputSampleMask = Int4(1) & CmpNEQ(Int4(cMask[0]) & laneBits, Int4(0));
-		for(auto i = 1u; i < state.multiSampleCount; i++)
+		Int4 inputSampleMask = 0;
+		for(auto i = sampleLoopInit; i < sampleLoopEnd; i++)
 		{
 			inputSampleMask |= Int4(1 << i) & CmpNEQ(Int4(cMask[i]) & laneBits, Int4(0));
 		}
@@ -146,11 +173,15 @@
 	// Note: all lanes initially active to facilitate derivatives etc. Actual coverage is
 	// handled separately, through the cMask.
 	auto activeLaneMask = SIMD::Int(0xFFFFFFFF);
-	auto storesAndAtomicsMask = maskAny(cMask, sMask, zMask);
+	auto storesAndAtomicsMask = (sampleId >= 0) ? maskAny(cMask[sampleId], sMask[sampleId], zMask[sampleId]) : maskAny(cMask, sMask, zMask);
 	routine.killMask = 0;
 
 	spirvShader->emit(&routine, activeLaneMask, storesAndAtomicsMask, descriptorSets);
 	spirvShader->emitEpilog(&routine);
+	if((sampleId < 0) || (sampleId == static_cast<int>(state.multiSampleCount - 1)))
+	{
+		spirvShader->clearPhis(&routine);
+	}
 
 	for(int i = 0; i < RENDERTARGETS; i++)
 	{
@@ -168,7 +199,7 @@
 
 	if(spirvShader->getModes().ContainsKill)
 	{
-		for(auto i = 0u; i < state.multiSampleCount; i++)
+		for(auto i = sampleLoopInit; i < sampleLoopEnd; i++)
 		{
 			cMask[i] &= ~routine.killMask;
 		}
@@ -179,7 +210,7 @@
 	{
 		auto outputSampleMask = As<SIMD::Int>(routine.getVariable(it->second.Id)[it->second.FirstComponent]);
 
-		for(auto i = 0u; i < state.multiSampleCount; i++)
+		for(auto i = sampleLoopInit; i < sampleLoopEnd; i++)
 		{
 			cMask[i] &= SignMask(CmpNEQ(outputSampleMask & SIMD::Int(1 << i), SIMD::Int(0)));
 		}
@@ -192,14 +223,19 @@
 	}
 }
 
-Bool PixelProgram::alphaTest(Int cMask[4])
+Bool PixelProgram::alphaTest(Int cMask[4], int sampleId)
 {
 	if(!state.alphaToCoverage)
 	{
 		return true;
 	}
 
-	alphaToCoverage(cMask, c[0].w);
+	alphaToCoverage(cMask, c[0].w, sampleId);
+
+	if(sampleId >= 0)
+	{
+		return cMask[sampleId] != 0x0;
+	}
 
 	Int pass = cMask[0];
 
@@ -211,8 +247,11 @@
 	return pass != 0x0;
 }
 
-void PixelProgram::rasterOperation(Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4])
+void PixelProgram::rasterOperation(Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4], int sampleId)
 {
+	unsigned int sampleLoopInit = (sampleId >= 0) ? sampleId : 0;
+	unsigned int sampleLoopEnd = (sampleId >= 0) ? sampleId + 1 : state.multiSampleCount;
+
 	for(int index = 0; index < RENDERTARGETS; index++)
 	{
 		if(!state.colorWriteActive(index))
@@ -237,7 +276,7 @@
 			case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
 			case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
 			case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
-				for(unsigned int q = 0; q < state.multiSampleCount; q++)
+				for(unsigned int q = sampleLoopInit; q < sampleLoopEnd; q++)
 				{
 					if(state.multiSampleMask & (1 << q))
 					{
@@ -283,7 +322,7 @@
 			case VK_FORMAT_A8B8G8R8_SINT_PACK32:
 			case VK_FORMAT_A2B10G10R10_UINT_PACK32:
 			case VK_FORMAT_A2R10G10B10_UINT_PACK32:
-				for(unsigned int q = 0; q < state.multiSampleCount; q++)
+				for(unsigned int q = sampleLoopInit; q < sampleLoopEnd; q++)
 				{
 					if(state.multiSampleMask & (1 << q))
 					{