Stencil buffer texture sampling

In order to perform a blitFramebuffer operation, ANGLE uses
a compute shader that reads the stencil from a depth+stencil
image into a buffer, and then copies this buffer to the stencil
of the destination image.

This CL treats VK_FORMAT_S8_UINT as VK_FORMAT_R8_UINT
with quad layout for the purpose of texture sampling.

Fixes the following tests:
dEQP-GLES3.functional.fbo.blit.depth_stencil.depth32f_stencil8_basic
dEQP-GLES3.functional.fbo.blit.depth_stencil.depth32f_stencil8_stencil_only
dEQP-GLES3.functional.fbo.blit.depth_stencil.depth24_stencil8_basic
dEQP-GLES3.functional.fbo.blit.depth_stencil.depth24_stencil8_stencil_only

Bug: b/142385547
Change-Id: Ib2ea7fa81496ceca1c24ea7f065b1c2cd05596ee
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/37188
Presubmit-Ready: Alexis Hétu <sugoi@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index 629b5be..5278f5d 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp
@@ -1032,6 +1032,7 @@
 		case VK_FORMAT_R8G8_UINT:
 			c = Insert(c, Int(*Pointer<Byte>(element + 1)), 1);
 		case VK_FORMAT_R8_UINT:
+		case VK_FORMAT_S8_UINT:
 			c = Insert(c, Int(*Pointer<Byte>(element)), 0);
 			break;
 		case VK_FORMAT_R16G16B16A16_SINT:
@@ -1090,6 +1091,7 @@
 		case VK_FORMAT_R8G8B8_USCALED:
 		case VK_FORMAT_R8G8_USCALED:
 		case VK_FORMAT_R8_USCALED:
+		case VK_FORMAT_S8_UINT:
 			c = Min(As<UInt4>(c), UInt4(0xFF));
 			break;
 		case VK_FORMAT_R16G16B16A16_UINT:
@@ -1215,6 +1217,7 @@
 			if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
 		case VK_FORMAT_R8_UINT:
 		case VK_FORMAT_R8_USCALED:
+		case VK_FORMAT_S8_UINT:
 			if(writeR) { *Pointer<Byte>(element) = Byte(Extract(c, 0)); }
 			break;
 		case VK_FORMAT_R16G16B16A16_SINT:
@@ -1358,7 +1361,7 @@
 		{
 			// (x & ~1) * 2 + (x & 1) == (x - (x & 1)) * 2 + (x & 1) == x * 2 - (x & 1) * 2 + (x & 1) == x * 2 - (x & 1)
 			return (y & Int(~1)) * pitchB +
-			       ((y & Int(1)) * 2 + x * 2 - (x & Int(1))) * bytes;
+			       ((((y & Int(1)) + x) << 1) - (x & Int(1))) * bytes;
 		}
 	}
 
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index 055d9ff..62e3cef 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -45,6 +45,13 @@
 		default: ASSERT(false);
 		}
 	}
+
+	template <typename T>  // T must support &, +, -, << and construction from an int constant
+	void applyQuadLayout(T& x, T& y)  // Rewrites the (x, y) coordinate pair in place for quad-layout addressing.
+	{
+		x = (((y & T(1)) + x) << 1) - (x & T(1));  // == 2 * x + 2 * (y & 1) - (x & 1)
+		y &= T(~1);  // clear the low bit of y; it has been folded into x above
+	}
 }
 
 namespace sw
@@ -895,6 +902,11 @@
 		address(v, y0, y1, fv, mipmap, offset.y, filter, OFFSET(Mipmap, height), state.addressingModeV, function);
 		address(w, z0, z0, fw, mipmap, offset.z, filter, OFFSET(Mipmap, depth), state.addressingModeW, function);
 
+		if(hasQuadLayout())
+		{
+			::applyQuadLayout(x0, y0);
+		}
+
 		Int4 pitchP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, pitchP), 16);
 		y0 *= pitchP;
 		if(state.addressingModeW != ADDRESSING_UNUSED)
@@ -908,6 +920,11 @@
 		}
 		else
 		{
+			if(hasQuadLayout())
+			{
+				::applyQuadLayout(x1, y1);
+			}
+
 			y1 *= pitchP;
 
 			Vector4f c00 = sampleTexel(x0, y0, z0, q, mipmap, buffer, function);
@@ -971,6 +988,11 @@
 		address(v, y0, y1, fv, mipmap, offset.y, filter, OFFSET(Mipmap, height), state.addressingModeV, function);
 		address(w, z0, z1, fw, mipmap, offset.z, filter, OFFSET(Mipmap, depth), state.addressingModeW, function);
 
+		if(hasQuadLayout())
+		{
+			::applyQuadLayout(x0, y0);
+		}
+
 		Int4 pitchP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, pitchP), 16);
 		Int4 sliceP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, sliceP), 16);
 		y0 *= pitchP;
@@ -982,6 +1004,11 @@
 		}
 		else
 		{
+			if(hasQuadLayout())
+			{
+				::applyQuadLayout(x1, y1);
+			}
+
 			y1 *= pitchP;
 			z1 *= sliceP;
 
@@ -1280,6 +1307,11 @@
 			                   texelFetch ? ADDRESSING_TEXELFETCH : state.addressingModeV);
 		}
 
+		if(hasQuadLayout())
+		{
+			::applyQuadLayout(uuuu, vvvv);
+		}
+
 		Short4 uuu2 = uuuu;
 		uuuu = As<Short4>(UnpackLow(uuuu, vvvv));
 		uuu2 = As<Short4>(UnpackHigh(uuu2, vvvv));
@@ -1491,6 +1523,7 @@
 					{
 					case VK_FORMAT_R8_SINT:
 					case VK_FORMAT_R8_UINT:
+					case VK_FORMAT_S8_UINT:
 						{
 							Int zero(0);
 							c.x = Unpack(As<Byte4>(c0), As<Byte4>(zero));
@@ -2386,6 +2419,11 @@
 		return state.textureFormat.has32bitIntegerTextureComponents();
 	}
 
+	bool SamplerCore::hasQuadLayout() const  // Reports whether the sampled texture format uses quad layout.
+	{
+		return state.textureFormat.hasQuadLayout();  // forwards the query to the texture format stored in the sampler state
+	}
+
 	bool SamplerCore::isYcbcrFormat() const
 	{
 		return state.textureFormat.isYcbcrFormat();
diff --git a/src/Pipeline/SamplerCore.hpp b/src/Pipeline/SamplerCore.hpp
index de32eed..19e99b1 100644
--- a/src/Pipeline/SamplerCore.hpp
+++ b/src/Pipeline/SamplerCore.hpp
@@ -104,6 +104,7 @@
 		bool has8bitTextureComponents() const;
 		bool has16bitTextureComponents() const;
 		bool has32bitIntegerTextureComponents() const;
+		bool hasQuadLayout() const;
 		bool isYcbcrFormat() const;
 		bool isRGBComponent(int component) const;
 		bool borderModeActive() const;
diff --git a/src/Vulkan/VkFormat.cpp b/src/Vulkan/VkFormat.cpp
index 3746c08..9489401 100644
--- a/src/Vulkan/VkFormat.cpp
+++ b/src/Vulkan/VkFormat.cpp
@@ -127,6 +127,7 @@
 	case VK_FORMAT_R64G64_UINT:
 	case VK_FORMAT_R64G64B64_UINT:
 	case VK_FORMAT_R64G64B64A64_UINT:
+	case VK_FORMAT_S8_UINT:
 		return true;
 	default:
 		return false;
@@ -1978,6 +1979,7 @@
 	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
 	case VK_FORMAT_D16_UNORM:
+	case VK_FORMAT_S8_UINT:
 		return false;
 	default:
 		UNIMPLEMENTED("Format: %d", int(format));
@@ -2009,6 +2011,7 @@
 	case VK_FORMAT_R8G8_UINT:
 	case VK_FORMAT_R8G8B8A8_SINT:
 	case VK_FORMAT_R8G8B8A8_UINT:
+	case VK_FORMAT_S8_UINT:
 		return true;
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
@@ -2092,6 +2095,7 @@
 	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
 	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+	case VK_FORMAT_S8_UINT:
 		return false;
 	case VK_FORMAT_R16_UNORM:
 	case VK_FORMAT_R16_SNORM:
@@ -2166,6 +2170,7 @@
 	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
 	case VK_FORMAT_D16_UNORM:
+	case VK_FORMAT_S8_UINT:
 		return false;
 	case VK_FORMAT_R32_SINT:
 	case VK_FORMAT_R32_UINT:
@@ -2239,6 +2244,7 @@
 		return component < 3;
 	case VK_FORMAT_D32_SFLOAT:
 	case VK_FORMAT_D16_UNORM:
+	case VK_FORMAT_S8_UINT:
 		return false;
 	default:
 		UNIMPLEMENTED("Format: %d", int(format));