Handle VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16 sampling

The change in
https://swiftshader-review.googlesource.com/c/SwiftShader/+/63748
was incomplete and introduced some failures, as seen in
https://swiftshader-review.googlesource.com/c/SwiftShader/+/63628.

Updates `SamplerCore::sampleTexel()` to handle both 8-bit and 10-bit
formats by reading the 8-bit or 10-bit buffer values without 8.8
packing (8.8 packing replicated the raw 8-bit `u` value from the
buffer into both the high 8 bits and the low 8 bits of `U`) and by
using more generic range expansion (as opposed to the hardcoded
constants for 8-bit).

Bug: b/219756793
Test: dEQP-VK.ycbcr.*
Change-Id: I6e1681cf2f40dbbc389652136711bc2f20b413a1
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/64268
Presubmit-Ready: Jason Macnak <natsu@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Tested-by: Jason Macnak <natsu@google.com>
Commit-Queue: Jason Macnak <natsu@google.com>
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index 4268810..f126b4b 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -1727,12 +1727,14 @@
 
 	if(isYcbcrFormat())
 	{
+		// Generates 15-bit output.
+
 		// Pointers to the planes of YCbCr images are stored in consecutive mipmap levels.
 		Pointer<Byte> bufferY = buffer;                                                                         // *Pointer<Pointer<Byte>>(mipmap + 0 * sizeof(Mipmap) + OFFSET(Mipmap, buffer));
 		Pointer<Byte> bufferU = *Pointer<Pointer<Byte>>(mipmap + 1 * sizeof(Mipmap) + OFFSET(Mipmap, buffer));  // U/V for 2-plane interleaved formats.
 		Pointer<Byte> bufferV = *Pointer<Pointer<Byte>>(mipmap + 2 * sizeof(Mipmap) + OFFSET(Mipmap, buffer));
 
-		// Luminance
+		// Luminance (either 8-bit or 10-bit in bottom bits).
 		UShort4 Y;
 		{
 			switch(state.textureFormat)
@@ -1740,24 +1742,20 @@
 			case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
 			case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
 				{
-					Int c0 = Int(bufferY[index[0]]);
-					Int c1 = Int(bufferY[index[1]]);
-					Int c2 = Int(bufferY[index[2]]);
-					Int c3 = Int(bufferY[index[3]]);
-					c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
-					Y = As<UShort4>(Unpack(As<Byte4>(c0)));
+					Y = Insert(Y, UShort(bufferY[index[0]]), 0);
+					Y = Insert(Y, UShort(bufferY[index[1]]), 1);
+					Y = Insert(Y, UShort(bufferY[index[2]]), 2);
+					Y = Insert(Y, UShort(bufferY[index[3]]), 3);
 				}
 				break;
 			case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16:
 				{
-					Y = Insert(Y, Pointer<UShort>(bufferY)[index[0]], 0);  // TODO: Insert(UShort4, UShort)
+					Y = Insert(Y, Pointer<UShort>(bufferY)[index[0]], 0);
 					Y = Insert(Y, Pointer<UShort>(bufferY)[index[1]], 1);
 					Y = Insert(Y, Pointer<UShort>(bufferY)[index[2]], 2);
 					Y = Insert(Y, Pointer<UShort>(bufferY)[index[3]], 3);
 					// Top 10 bits of each 16 bits:
 					Y = (Y & UShort4(0xFFC0u)) >> 6;
-					// Scale from 10 bits to 16 bits:
-					Y = Y << 6;
 				}
 				break;
 			default:
@@ -1766,7 +1764,7 @@
 			}
 		}
 
-		// Chroma
+		// Chroma (either 8-bit or 10-bit in bottom bits).
 		UShort4 Cb, Cr;
 		{
 			computeIndices(index, uuuu, vvvv, wwww, layerIndex, offset, sample, mipmap + sizeof(Mipmap));
@@ -1776,30 +1774,27 @@
 			{
 			case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
 				{
-					Int c0 = Int(bufferU[index[0]]);
-					Int c1 = Int(bufferU[index[1]]);
-					Int c2 = Int(bufferU[index[2]]);
-					Int c3 = Int(bufferU[index[3]]);
-					c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
-					U = As<UShort4>(Unpack(As<Byte4>(c0)));
+					U = Insert(U, UShort(bufferU[index[0]]), 0);
+					U = Insert(U, UShort(bufferU[index[1]]), 1);
+					U = Insert(U, UShort(bufferU[index[2]]), 2);
+					U = Insert(U, UShort(bufferU[index[3]]), 3);
 
-					c0 = Int(bufferV[index[0]]);
-					c1 = Int(bufferV[index[1]]);
-					c2 = Int(bufferV[index[2]]);
-					c3 = Int(bufferV[index[3]]);
-					c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
-					V = As<UShort4>(Unpack(As<Byte4>(c0)));
+					V = Insert(V, UShort(bufferV[index[0]]), 0);
+					V = Insert(V, UShort(bufferV[index[1]]), 1);
+					V = Insert(V, UShort(bufferV[index[2]]), 2);
+					V = Insert(V, UShort(bufferV[index[3]]), 3);
 				}
 				break;
 			case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
 				{
-					Short4 UV;
-					UV = Insert(UV, Pointer<Short>(bufferU)[index[0]], 0);  // TODO: Insert(UShort4, UShort)
-					UV = Insert(UV, Pointer<Short>(bufferU)[index[1]], 1);
-					UV = Insert(UV, Pointer<Short>(bufferU)[index[2]], 2);
-					UV = Insert(UV, Pointer<Short>(bufferU)[index[3]], 3);
-					U = (UV & Short4(0x00FFu)) | (UV << 8);
-					V = (UV & Short4(0xFF00u)) | As<Short4>(As<UShort4>(UV) >> 8);
+					UShort4 UV;
+					UV = Insert(UV, Pointer<UShort>(bufferU)[index[0]], 0);
+					UV = Insert(UV, Pointer<UShort>(bufferU)[index[1]], 1);
+					UV = Insert(UV, Pointer<UShort>(bufferU)[index[2]], 2);
+					UV = Insert(UV, Pointer<UShort>(bufferU)[index[3]], 3);
+
+					U = (UV & UShort4(0x00FFu));
+					V = (UV & UShort4(0xFF00u)) >> 8;
 				}
 				break;
 			case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16:
@@ -1809,13 +1804,10 @@
 					UV = Insert(UV, Pointer<UInt>(bufferU)[index[1]], 1);
 					UV = Insert(UV, Pointer<UInt>(bufferU)[index[2]], 2);
 					UV = Insert(UV, Pointer<UInt>(bufferU)[index[3]], 3);
-					// Top 10 bits of first 16 bits:
-					U = UShort4((UV & UInt4(0x0000FFC0u)) >> 6);  // TODO: UnpackLower(UInt4)
-					// Top 10 bits of second 16 bits:
-					V = UShort4((UV & UInt4(0xFFC00000u)) >> 22);  // TODO: UnpackUpper(UInt4)
-					// Scale from 10 bits to 16 bits:
-					U = U << 6;
-					V = V << 6;
+					// Top 10 bits of first 16-bits:
+					U = UShort4((UV & UInt4(0x0000FFC0u)) >> 6);
+					// Top 10 bits of second 16-bits:
+					V = UShort4((UV & UInt4(0xFFC00000u)) >> 22);
 				}
 				break;
 			default:
@@ -1835,31 +1827,65 @@
 			}
 		}
 
+		uint8_t lumaBits = 8;
+		uint8_t chromaBits = 8;
+		switch(state.textureFormat)
+		{
+		case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
+		case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
+			lumaBits = 8;
+			chromaBits = 8;
+			break;
+		case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16:
+			lumaBits = 10;
+			chromaBits = 10;
+			break;
+		default:
+			UNSUPPORTED("state.textureFormat %d", (int)state.textureFormat);
+			break;
+		}
+
 		if(state.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY)
 		{
-			// YCbCr formats are treated as signed 15-bit.
-			c.x = Cr >> 1;
-			c.y = Y >> 1;
-			c.z = Cb >> 1;
+			// Scale to the output 15-bit.
+			c.x = Cr << (15 - chromaBits);
+			c.y = Y << (15 - lumaBits);
+			c.z = Cb << (15 - chromaBits);
 		}
 		else
 		{
-			// Scaling and bias for studio-swing range: Y = [16 .. 235], U/V = [16 .. 240]
-			// Scale down by 0x0101 to normalize the 8.8 samples, and up by 0x7FFF for signed 15-bit output.
-			float yOffset = static_cast<float>(state.studioSwing ? 16 * 0x0101 : 0);
-			float uvOffset = static_cast<float>(128 * 0x0101);
-			float yFactor = static_cast<float>(0x7FFF) / static_cast<float>(state.studioSwing ? 219 * 0x0101 : 255 * 0x0101);
-			float uvFactor = static_cast<float>(0x7FFF) / static_cast<float>(state.studioSwing ? 224 * 0x0101 : 255 * 0x0101);
+			const float twoPowLumaBits = static_cast<float>(0x1u << lumaBits);
+			const float twoPowLumaBitsMinus8 = static_cast<float>(0x1u << (lumaBits - 8));
+			const float twoPowChromaBits = static_cast<float>(0x1u << chromaBits);
+			const float twoPowChromaBitsMinus1 = static_cast<float>(0x1u << (chromaBits - 1));
+			const float twoPowChromaBitsMinus8 = static_cast<float>(0x1u << (chromaBits - 8));
 
-			Float4 y = (Float4(Y) - Float4(yOffset)) * Float4(yFactor);
-			Float4 u = (Float4(Cb) - Float4(uvOffset)) * Float4(uvFactor);
-			Float4 v = (Float4(Cr) - Float4(uvOffset)) * Float4(uvFactor);
+			Float4 y = Float4(Y);
+			Float4 u = Float4(Cb);
+			Float4 v = Float4(Cr);
+
+			if(state.studioSwing)
+			{
+				// See https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.html#QUANTIZATION_NARROW
+				y = ((y / Float4(twoPowLumaBitsMinus8)) - Float4(16.0f)) / Float4(219.0f);
+				u = ((u / Float4(twoPowChromaBitsMinus8)) - Float4(128.0f)) / Float4(224.0f);
+				v = ((v / Float4(twoPowChromaBitsMinus8)) - Float4(128.0f)) / Float4(224.0f);
+			}
+			else
+			{
+				// See https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.html#QUANTIZATION_FULL
+				y = y / Float4(twoPowLumaBits - 1.0f);
+				u = (u - Float4(twoPowChromaBitsMinus1)) / Float4(twoPowChromaBits - 1.0f);
+				v = (v - Float4(twoPowChromaBitsMinus1)) / Float4(twoPowChromaBits - 1.0f);
+			}
+
+			// Now, `y` is in [0, 1] and `u` and `v` are in [-0.5, 0.5].
 
 			if(state.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_IDENTITY)
 			{
-				c.x = Short4(v);
-				c.y = Short4(y);
-				c.z = Short4(u);
+				c.x = Short4(v * static_cast<float>(0x7FFF));
+				c.y = Short4(y * static_cast<float>(0x7FFF));
+				c.z = Short4(u * static_cast<float>(0x7FFF));
 			}
 			else
 			{
@@ -1900,9 +1926,9 @@
 				Float4 g = y + Float4(Gb) * u + Float4(Gr) * v;
 				Float4 b = y + Float4(Bb) * u;
 
-				c.x = Short4(r);
-				c.y = Short4(g);
-				c.z = Short4(b);
+				c.x = Short4(r * static_cast<float>(0x7FFF));
+				c.y = Short4(g * static_cast<float>(0x7FFF));
+				c.z = Short4(b * static_cast<float>(0x7FFF));
 			}
 		}
 	}
diff --git a/src/Reactor/Reactor.cpp b/src/Reactor/Reactor.cpp
index 41268a9..3d6459f 100644
--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp
@@ -1061,6 +1061,13 @@
 	storeValue(integer);
 }
 
+UShort::UShort(RValue<Byte> cast)
+{
+	Value *integer = Nucleus::createZExt(cast.value(), UShort::type());
+
+	storeValue(integer);
+}
+
 UShort::UShort(unsigned short x)
 {
 	storeValue(Nucleus::createConstantShort(x));
@@ -1331,6 +1338,11 @@
 	return store(rhs.load());
 }
 
+RValue<Byte4> Insert(RValue<Byte4> val, RValue<Byte> element, int i)
+{
+	return RValue<Byte4>(Nucleus::createInsertElement(val.value(), element.value(), i));
+}
+
 Byte8::Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
 {
 	int64_t constantVector[8] = { x0, x1, x2, x3, x4, x5, x6, x7 };
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index 453684e..aeaad54 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -561,6 +561,7 @@
 
 	explicit UShort(RValue<UInt> cast);
 	explicit UShort(RValue<Int> cast);
+	explicit UShort(RValue<Byte> cast);
 
 	UShort() = default;
 	UShort(unsigned short x);
@@ -632,6 +633,8 @@
 	static Type *type();
 };
 
+RValue<Byte4> Insert(RValue<Byte4> val, RValue<Byte> element, int i);
+
 //	RValue<Byte4> operator+(RValue<Byte4> lhs, RValue<Byte4> rhs);
 //	RValue<Byte4> operator-(RValue<Byte4> lhs, RValue<Byte4> rhs);
 //	RValue<Byte4> operator*(RValue<Byte4> lhs, RValue<Byte4> rhs);