VK_EXT_shader_stencil_export support

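VK_EXT_shader_stencil_export is a simple extension to implement in
SwiftShader. Added PixelRoutine::stencilReplaceRef(), which supplies the
reference value for VK_STENCIL_OP_REPLACE: the shader's FragStencilRefEXT
output when the shader declares one, the DrawData reference otherwise.
It can be trivially modified to output Byte4 instead of Byte8, should
the stencil code be refactored.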

Verified that these tests still pass with SwANGLE:
dEQP-GLES3.functional.fbo.blit.depth_stencil.depth32f_stencil8_basic
dEQP-GLES3.functional.fbo.blit.depth_stencil.depth32f_stencil8_stencil_only
dEQP-GLES3.functional.fbo.blit.depth_stencil.depth24_stencil8_basic
dEQP-GLES3.functional.fbo.blit.depth_stencil.depth24_stencil8_stencil_only

Change-Id: Ia5829489ccba75ee7c41e365ca2d3d586c987c2d
Tests: dEQP-VK.pipeline.shader_stencil_export.op_replace
Bug: b/148175198
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/41469
Presubmit-Ready: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 8543a5e..26ed5b5 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -750,6 +750,29 @@
 	}
 }
 
+Byte8 PixelRoutine::stencilReplaceRef(bool isBack)  // Reference value written by VK_STENCIL_OP_REPLACE, as 8 bytes.
+{
+	if(spirvShader)
+	{
+		auto it = spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT);  // Did the shader declare a stencil reference output?
+		if(it != spirvShader->outputBuiltins.end())
+		{
+			UInt4 sRef = As<UInt4>(routine.getVariable(it->second.Id)[it->second.FirstComponent]) & UInt4(0xff);  // Keep the low 8 bits of each lane.
+			// TODO (b/148295813): Could be done with a single pshufb instruction. Optimize the
+			//                     following line either by adding a rr::Shuffle() variant that
+			//                     does it explicitly, or by adding a Byte4(Int4) constructor.
+			sRef.x = rr::UInt(sRef.x) | (rr::UInt(sRef.y) << 8) | (rr::UInt(sRef.z) << 16) | (rr::UInt(sRef.w) << 24);  // Pack x,y,z,w into bytes 0..3 of x.
+
+			UInt2 sRefDuplicated;  // Both 32-bit elements are overwritten by the Insert() calls below.
+			sRefDuplicated = Insert(sRefDuplicated, sRef.x, 0);
+			sRefDuplicated = Insert(sRefDuplicated, sRef.x, 1);  // Same 4 packed bytes in both halves of the Byte8.
+			return As<Byte8>(sRefDuplicated);
+		}
+	}
+
+	return *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceQ));  // No shader-written value: use the reference stored in DrawData.
+}
+
 void PixelRoutine::stencilOperation(Byte8 &output, const Byte8 &bufferValue, VkStencilOp operation, bool isBack)
 {
 	switch(operation)
@@ -761,7 +784,7 @@
 			output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
 			break;
 		case VK_STENCIL_OP_REPLACE:
-			output = *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceQ));
+			output = stencilReplaceRef(isBack);
 			break;
 		case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
 			output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
diff --git a/src/Pipeline/PixelRoutine.hpp b/src/Pipeline/PixelRoutine.hpp
index 34722ad..72d4253 100644
--- a/src/Pipeline/PixelRoutine.hpp
+++ b/src/Pipeline/PixelRoutine.hpp
@@ -65,6 +65,7 @@
 
 private:
 	Float4 interpolateCentroid(const Float4 &x, const Float4 &y, const Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective);
+	Byte8 stencilReplaceRef(bool isBack);
 	void stencilTest(const Pointer<Byte> &sBuffer, int q, const Int &x, Int &sMask, const Int &cMask);
 	void stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack);
 	void stencilOperation(Byte8 &newValue, const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask);
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index 5026373..4e50f2d 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -381,6 +381,7 @@
 					case spv::CapabilityGroupNonUniformShuffleRelative: capabilities.GroupNonUniformShuffleRelative = true; break;
 					case spv::CapabilityDeviceGroup: capabilities.DeviceGroup = true; break;
 					case spv::CapabilityMultiView: capabilities.MultiView = true; break;
+					case spv::CapabilityStencilExportEXT: capabilities.StencilExportEXT = true; break;
 					default:
 						UNSUPPORTED("Unsupported capability %u", insn.word(1));
 				}
@@ -719,6 +720,7 @@
 				if(!strcmp(ext, "SPV_KHR_variable_pointers")) break;
 				if(!strcmp(ext, "SPV_KHR_device_group")) break;
 				if(!strcmp(ext, "SPV_KHR_multiview")) break;
+				if(!strcmp(ext, "SPV_EXT_shader_stencil_export")) break;
 				UNSUPPORTED("SPIR-V Extension: %s", ext);
 				break;
 			}
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index 406fe97..e9876c7 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -544,6 +544,7 @@
 		bool GroupNonUniformArithmetic : 1;
 		bool DeviceGroup : 1;
 		bool MultiView : 1;
+		bool StencilExportEXT : 1;
 	};
 
 	Capabilities const &getUsedCapabilities() const
diff --git a/src/Shader/SamplerCore.cpp b/src/Shader/SamplerCore.cpp
index 5690cfa..810874b 100644
--- a/src/Shader/SamplerCore.cpp
+++ b/src/Shader/SamplerCore.cpp
@@ -1915,7 +1915,7 @@
 					Int c1 = Int(*Pointer<Byte>(buffer[f1] + index[1]));
 					Int c2 = Int(*Pointer<Byte>(buffer[f2] + index[2]));
 					Int c3 = Int(*Pointer<Byte>(buffer[f3] + index[3]));
-					c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
+					c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24); // TODO (b/148295813) : Optimize with pshufb
 
 					switch(state.textureFormat)
 					{
@@ -2067,7 +2067,7 @@
 			Int c1 = Int(buffer[0][index[1]]);
 			Int c2 = Int(buffer[0][index[2]]);
 			Int c3 = Int(buffer[0][index[3]]);
-			c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
+			c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24); // TODO (b/148295813) : Optimize with pshufb
 			UShort4 Y = As<UShort4>(Unpack(As<Byte4>(c0)));
 
 			computeIndices(index, uuuu, vvvv, wwww, offset, mipmap + sizeof(Mipmap), function);
@@ -2075,14 +2075,14 @@
 			c1 = Int(buffer[1][index[1]]);
 			c2 = Int(buffer[1][index[2]]);
 			c3 = Int(buffer[1][index[3]]);
-			c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
+			c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24); // TODO (b/148295813) : Optimize with pshufb
 			UShort4 V = As<UShort4>(Unpack(As<Byte4>(c0)));
 
 			c0 = Int(buffer[2][index[0]]);
 			c1 = Int(buffer[2][index[1]]);
 			c2 = Int(buffer[2][index[2]]);
 			c3 = Int(buffer[2][index[3]]);
-			c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
+			c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24); // TODO (b/148295813) : Optimize with pshufb
 			UShort4 U = As<UShort4>(Unpack(As<Byte4>(c0)));
 
 			const UShort4 yY = UShort4(iround(Yy * 0x4000));
diff --git a/src/Vulkan/libVulkan.cpp b/src/Vulkan/libVulkan.cpp
index 92a123a..937f482 100644
--- a/src/Vulkan/libVulkan.cpp
+++ b/src/Vulkan/libVulkan.cpp
@@ -321,6 +321,8 @@
 	{ VK_EXT_QUEUE_FAMILY_FOREIGN_EXTENSION_NAME, VK_EXT_QUEUE_FAMILY_FOREIGN_SPEC_VERSION },
 	// The following extension is only used to add support for Bresenham lines
 	{ VK_EXT_LINE_RASTERIZATION_EXTENSION_NAME, VK_EXT_LINE_RASTERIZATION_SPEC_VERSION },
+	// The following extension is used by ANGLE to emulate blitting the stencil buffer
+	{ VK_EXT_SHADER_STENCIL_EXPORT_EXTENSION_NAME, VK_EXT_SHADER_STENCIL_EXPORT_SPEC_VERSION },
 #ifndef __ANDROID__
 	// We fully support the KHR_swapchain v70 additions, so just track the spec version.
 	{ VK_KHR_SWAPCHAIN_EXTENSION_NAME, VK_KHR_SWAPCHAIN_SPEC_VERSION },