Add support for D16_UNORM depth test & write

D16_UNORM depth operations are performed in fixed point to avoid the
quirks of a ushort <-> float round trip.

Some slight oddities to work around missing primitives:
- We only have CmpEQ, CmpGT. This matches SSE hardware reality.
  Everything else is made out of negations and operand swaps.
- We only have *signed* CmpGT. Make an unsigned CmpGT by biasing both
  16-bit operands by -0x8000.
- In non-quad-layout we don't have a good way to separate .xy and .zw
  halves; we read via masked short4 reads and write via individual
  component writes.

Bug: b/128363727
Test: dEQP-VK.pipeline.depth.format.*
Change-Id: I9c8b249470a9c91589c91135988dc4d1a58bbc5f
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/27030
Tested-by: Chris Forbes <chrisforbes@google.com>
Presubmit-Ready: Chris Forbes <chrisforbes@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index b27fab9..6c528d9 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -363,13 +363,8 @@
 		}
 	}
 
-	Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
+	Bool PixelRoutine::depthTest32F(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
 	{
-		if(!state.depthTestActive)
-		{
-			return true;
-		}
-
 		Float4 Z = z;
 
 		if(spirvShader && spirvShader->getModes().DepthReplacing)
@@ -464,6 +459,119 @@
 		return zMask != 0;
 	}
 
+	Bool PixelRoutine::depthTest16(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)  // 16-bit depth test done on Short4 integers; stores the pass mask in zMask and returns whether it is non-zero
+	{
+		Short4 Z = convertFixed16(z, true);  // fragment depth as 16-bit values; NOTE(review): presumably unsigned-normalized fixed point, confirm against convertFixed16
+
+		if(spirvShader && spirvShader->getModes().DepthReplacing)
+		{
+			Z = convertFixed16(oDepth, true);  // shader declares DepthReplacing: test oDepth instead of z
+		}
+
+		Pointer<Byte> buffer;
+		Int pitch;  // only assigned (and only used) in the non-quad layout
+
+		if(!state.quadLayoutDepthBuffer)
+		{
+			buffer = zBuffer + 2 * x;  // 2 bytes per 16-bit depth value
+			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+		}
+		else
+		{
+			buffer = zBuffer + 4 * x;  // NOTE(review): presumably all four values of a 2x2 quad are stored contiguously, hence 4 bytes per x - confirm
+		}
+
+		if(q > 0)
+		{
+			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));  // advance by q slices of depthSliceB bytes
+		}
+
+		Short4 zValue;  // depth values currently stored in the buffer
+
+		if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))  // when false, the buffer is not read and zValue stays unassigned
+		{
+			if(!state.quadLayoutDepthBuffer)
+			{
+				// FIXME: Does this properly optimize? Lanes 0-1 are taken from buffer, lanes 2-3 from buffer + pitch.
+				zValue = *Pointer<Short4>(buffer) & Short4(-1, -1, 0, 0);
+				zValue = zValue | (*Pointer<Short4>(buffer + pitch - 4) & Short4(0, 0, -1, -1));  // load starts 4 bytes early so the shorts at buffer + pitch land in lanes 2-3
+			}
+			else
+			{
+				zValue = *Pointer<Short4>(buffer, 8);
+			}
+		}
+
+		Int4 zTest;  // per-lane comparison result, widened from the Short4 compare
+
+		// Bias both operands by 0x8000 so that Reactor's signed-only compares (a consequence of SSE) order them as unsigned values
+		zValue = zValue - Short4(0x8000);
+		Z = Z - Short4(0x8000);
+
+		switch(state.depthCompareMode)
+		{
+		case VK_COMPARE_OP_ALWAYS:
+			// Optimized: no comparison needed, zMask is set directly in the switch below
+			break;
+		case VK_COMPARE_OP_NEVER:
+			// Optimized: no comparison needed, zMask is set directly in the switch below
+			break;
+		case VK_COMPARE_OP_EQUAL:
+			zTest = Int4(CmpEQ(zValue, Z));
+			break;
+		case VK_COMPARE_OP_NOT_EQUAL:
+			zTest = ~Int4(CmpEQ(zValue, Z));
+			break;
+		case VK_COMPARE_OP_LESS:
+			zTest = Int4(CmpGT(zValue, Z));  // Z < zValue
+			break;
+		case VK_COMPARE_OP_GREATER_OR_EQUAL:
+			zTest = ~Int4(CmpGT(zValue, Z));  // !(Z < zValue)
+			break;
+		case VK_COMPARE_OP_LESS_OR_EQUAL:
+			zTest = ~Int4(CmpGT(Z, zValue));  // !(Z > zValue)
+			break;
+		case VK_COMPARE_OP_GREATER:
+			zTest = Int4(CmpGT(Z, zValue));  // Z > zValue
+			break;
+		default:
+			ASSERT(false);  // compare mode not handled above
+		}
+
+		switch(state.depthCompareMode)
+		{
+		case VK_COMPARE_OP_ALWAYS:
+			zMask = cMask;
+			break;
+		case VK_COMPARE_OP_NEVER:
+			zMask = 0x0;
+			break;
+		default:
+			zMask = SignMask(zTest) & cMask;  // sign bits of zTest, restricted by cMask
+			break;
+		}
+
+		if(state.stencilActive)
+		{
+			zMask &= sMask;  // with stencil active, additionally restrict by sMask
+		}
+
+		return zMask != 0;  // true if at least one bit remains set
+	}
+
+	Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)  // dispatches the depth test to the implementation matching state.depthFormat
+	{
+		if(!state.depthTestActive)
+		{
+			return true;  // depth test inactive: report a pass without modifying zMask
+		}
+
+		if (state.depthFormat == VK_FORMAT_D16_UNORM)
+			return depthTest16(zBuffer, q, x, z, sMask, zMask, cMask);  // 16-bit integer path
+		else
+			return depthTest32F(zBuffer, q, x, z, sMask, zMask, cMask);  // every other format takes the float path
+	}
+
 	void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
 	{
 		Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
@@ -482,13 +590,8 @@
 		cMask[3] &= aMask3;
 	}
 
-	void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
+	void PixelRoutine::writeDepth32F(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
 	{
-		if(!state.depthWriteEnable)
-		{
-			return;
-		}
-
 		Float4 Z = z;
 
 		if(spirvShader && spirvShader->getModes().DepthReplacing)
@@ -546,6 +649,80 @@
 		}
 	}
 
+	void PixelRoutine::writeDepth16(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)  // 16-bit depth write: merges new depth with the stored values according to zMask and stores the result
+	{
+		Short4 Z = As<Short4>(convertFixed16(z, true));  // fragment depth as 16-bit values; NOTE(review): presumably unsigned-normalized fixed point, confirm against convertFixed16
+
+		if(spirvShader && spirvShader->getModes().DepthReplacing)
+		{
+			Z = As<Short4>(convertFixed16(oDepth, true));  // shader declares DepthReplacing: write oDepth instead of z
+		}
+
+		Pointer<Byte> buffer;
+		Int pitch;  // only assigned (and only used) in the non-quad layout
+
+		if(!state.quadLayoutDepthBuffer)
+		{
+			buffer = zBuffer + 2 * x;  // 2 bytes per 16-bit depth value
+			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+		}
+		else
+		{
+			buffer = zBuffer + 4 * x;  // NOTE(review): presumably all four values of a 2x2 quad are stored contiguously, hence 4 bytes per x - confirm
+		}
+
+		if(q > 0)
+		{
+			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));  // advance by q slices of depthSliceB bytes
+		}
+
+		Short4 zValue;  // depth values currently stored in the buffer
+
+		if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))  // NOTE(review): when this is false zValue is not loaded, yet it is still merged into Z below - confirm that is harmless
+		{
+			if(!state.quadLayoutDepthBuffer)
+			{
+				// FIXME: Does this properly optimize? Lanes 0-1 are taken from buffer, lanes 2-3 from buffer + pitch.
+				zValue = *Pointer<Short4>(buffer) & Short4(-1, -1, 0, 0);
+				zValue = zValue | (*Pointer<Short4>(buffer + pitch - 4) & Short4(0, 0, -1, -1));  // load starts 4 bytes early so the shorts at buffer + pitch land in lanes 2-3
+			}
+			else
+			{
+				zValue = *Pointer<Short4>(buffer, 8);
+			}
+		}
+
+		Z = Z & *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q) + zMask * 8, 8);  // table entry selected by zMask (8 bytes each); presumably a per-lane keep mask for the new depth - confirm
+		zValue = zValue & *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q) + zMask * 8, 8);  // presumably the complementary mask, keeping the stored depth in the other lanes - confirm
+		Z = Z | zValue;  // merged result to store
+
+		if(!state.quadLayoutDepthBuffer)
+		{
+			// FIXME: Does this properly optimize? Each lane is stored individually: lanes 0-1 at buffer, lanes 2-3 at buffer + pitch.
+			*Pointer<Short>(buffer) = Extract(Z, 0);
+			*Pointer<Short>(buffer+2) = Extract(Z, 1);
+			*Pointer<Short>(buffer+pitch) = Extract(Z, 2);
+			*Pointer<Short>(buffer+pitch+2) = Extract(Z, 3);
+		}
+		else
+		{
+			*Pointer<Short4>(buffer, 8) = Z;  // quad layout: all four values stored with a single Short4 write
+		}
+	}
+
+	void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)  // dispatches the depth write to the implementation matching state.depthFormat
+	{
+		if(!state.depthWriteEnable)
+		{
+			return;  // depth writes disabled: nothing to store
+		}
+
+		if (state.depthFormat == VK_FORMAT_D16_UNORM)
+			writeDepth16(zBuffer, q, x, z, zMask);  // 16-bit integer path
+		else
+			writeDepth32F(zBuffer, q, x, z, zMask);  // every other format takes the float path
+	}
+
 	void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
 	{
 		if(!state.stencilActive)