Improve 1010102 blend precision

Extending 1010102 (A2B10G10R10) values to the full 16-bit range
fixes 51 of the 53 failures found in ToT dEQP-VK.*a2b10*

Also added a utility function (PackFields) that shifts each element
of an int vector by a per-element amount and ORs the results together,
which allows us to use more vector operations (as opposed to scalar)
while improving readability.

Bug: b/146633956
Change-Id: If8b946c45cf27f5868d7a97166e21dba565ed72f
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/39768
Tested-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index 2394fb2..958da63 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp
@@ -24,6 +24,16 @@
 
 #include <utility>
 
+namespace {
+rr::RValue<rr::Int> PackFields(rr::Int4 const &ints, const sw::int4 shifts)  // Packs the four lanes of 'ints' into one Int: (x << shifts[0]) | (y << shifts[1]) | (z << shifts[2]) | (w << shifts[3]).
+{  // No masking is done here: each lane is ORed in as-is after its shift, so any bits above a lane's intended field width end up in the result too.
+	return (rr::Int(ints.x) << shifts[0]) |
+	       (rr::Int(ints.y) << shifts[1]) |
+	       (rr::Int(ints.z) << shifts[2]) |
+	       (rr::Int(ints.w) << shifts[3]);
+}
+}  // namespace
+
 namespace sw {
 
 Blitter::Blitter()
@@ -825,28 +835,21 @@
 		case VK_FORMAT_R5G6B5_UNORM_PACK16:
 			if(writeR && writeG && writeB)
 			{
-				*Pointer<UShort>(element) = UShort(RoundInt(Float(c.z)) |
-				                                   (RoundInt(Float(c.y)) << Int(5)) |
-				                                   (RoundInt(Float(c.x)) << Int(11)));
+				*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c.xyzz), { 11, 5, 0, 0 }));
 			}
 			else
 			{
 				unsigned short mask = (writeB ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeR ? 0xF800 : 0x0000);
 				unsigned short unmask = ~mask;
 				*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
-				                            (UShort(RoundInt(Float(c.z)) |
-				                                    (RoundInt(Float(c.y)) << Int(5)) |
-				                                    (RoundInt(Float(c.x)) << Int(11))) &
+				                            (UShort(PackFields(RoundInt(c.xyzz), { 11, 5, 0, 0 })) &
 				                             UShort(mask));
 			}
 			break;
 		case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
 			if(writeRGBA)
 			{
-				*Pointer<UShort>(element) = UShort(RoundInt(Float(c.w)) |
-				                                   (RoundInt(Float(c.z)) << Int(1)) |
-				                                   (RoundInt(Float(c.y)) << Int(6)) |
-				                                   (RoundInt(Float(c.x)) << Int(11)));
+				*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 11, 6, 1, 0 }));
 			}
 			else
 			{
@@ -856,20 +859,14 @@
 				                      (writeB ? 0x001F : 0x0000);
 				unsigned short unmask = ~mask;
 				*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
-				                            (UShort(RoundInt(Float(c.w)) |
-				                                    (RoundInt(Float(c.z)) << Int(1)) |
-				                                    (RoundInt(Float(c.y)) << Int(6)) |
-				                                    (RoundInt(Float(c.x)) << Int(11))) &
+				                            (UShort(PackFields(RoundInt(c), { 11, 6, 1, 0 })) &
 				                             UShort(mask));
 			}
 			break;
 		case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
 			if(writeRGBA)
 			{
-				*Pointer<UShort>(element) = UShort(RoundInt(Float(c.w)) |
-				                                   (RoundInt(Float(c.x)) << Int(1)) |
-				                                   (RoundInt(Float(c.y)) << Int(6)) |
-				                                   (RoundInt(Float(c.z)) << Int(11)));
+				*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 1, 6, 11, 0 }));
 			}
 			else
 			{
@@ -879,20 +876,14 @@
 				                      (writeB ? 0x001F : 0x0000);
 				unsigned short unmask = ~mask;
 				*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
-				                            (UShort(RoundInt(Float(c.w)) |
-				                                    (RoundInt(Float(c.x)) << Int(1)) |
-				                                    (RoundInt(Float(c.y)) << Int(6)) |
-				                                    (RoundInt(Float(c.z)) << Int(11))) &
+				                            (UShort(PackFields(RoundInt(c), { 1, 6, 11, 0 })) &
 				                             UShort(mask));
 			}
 			break;
 		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 			if(writeRGBA)
 			{
-				*Pointer<UShort>(element) = UShort(RoundInt(Float(c.z)) |
-				                                   (RoundInt(Float(c.y)) << Int(5)) |
-				                                   (RoundInt(Float(c.x)) << Int(10)) |
-				                                   (RoundInt(Float(c.w)) << Int(15)));
+				*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 10, 5, 0, 15 }));
 			}
 			else
 			{
@@ -902,10 +893,7 @@
 				                      (writeB ? 0x001F : 0x0000);
 				unsigned short unmask = ~mask;
 				*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
-				                            (UShort(RoundInt(Float(c.z)) |
-				                                    (RoundInt(Float(c.y)) << Int(5)) |
-				                                    (RoundInt(Float(c.x)) << Int(10)) |
-				                                    (RoundInt(Float(c.w)) << Int(15))) &
+				                            (UShort(PackFields(RoundInt(c), { 10, 5, 0, 15 })) &
 				                             UShort(mask));
 			}
 			break;
@@ -914,10 +902,7 @@
 		case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
 			if(writeRGBA)
 			{
-				*Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) |
-				                               (RoundInt(Float(c.y)) << 10) |
-				                               (RoundInt(Float(c.z)) << 20) |
-				                               (RoundInt(Float(c.w)) << 30));
+				*Pointer<UInt>(element) = As<UInt>(PackFields(RoundInt(c), { 0, 10, 20, 30 }));
 			}
 			else
 			{
@@ -927,10 +912,7 @@
 				                    (writeR ? 0x000003FF : 0x0000);
 				unsigned int unmask = ~mask;
 				*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
-				                          (UInt(RoundInt(Float(c.x)) |
-				                                (RoundInt(Float(c.y)) << 10) |
-				                                (RoundInt(Float(c.z)) << 20) |
-				                                (RoundInt(Float(c.w)) << 30)) &
+				                          (As<UInt>(PackFields(RoundInt(c), { 0, 10, 20, 30 })) &
 				                           UInt(mask));
 			}
 			break;
@@ -939,10 +921,7 @@
 		case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
 			if(writeRGBA)
 			{
-				*Pointer<UInt>(element) = UInt(RoundInt(Float(c.z)) |
-				                               (RoundInt(Float(c.y)) << 10) |
-				                               (RoundInt(Float(c.x)) << 20) |
-				                               (RoundInt(Float(c.w)) << 30));
+				*Pointer<UInt>(element) = As<UInt>(PackFields(RoundInt(c), { 20, 10, 0, 30 }));
 			}
 			else
 			{
@@ -952,10 +931,7 @@
 				                    (writeB ? 0x000003FF : 0x0000);
 				unsigned int unmask = ~mask;
 				*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
-				                          (UInt(RoundInt(Float(c.z)) |
-				                                (RoundInt(Float(c.y)) << 10) |
-				                                (RoundInt(Float(c.x)) << 20) |
-				                                (RoundInt(Float(c.w)) << 30)) &
+				                          (As<UInt>(PackFields(RoundInt(c), { 20, 10, 0, 30 })) &
 				                           UInt(mask));
 			}
 			break;
@@ -1133,8 +1109,7 @@
 		case VK_FORMAT_A2B10G10R10_SSCALED_PACK32:
 			if(writeRGBA)
 			{
-				*Pointer<UInt>(element) =
-				    UInt((Extract(c, 0)) | (Extract(c, 1) << 10) | (Extract(c, 2) << 20) | (Extract(c, 3) << 30));
+				*Pointer<UInt>(element) = As<UInt>(PackFields(c, { 0, 10, 20, 30 }));
 			}
 			else
 			{
@@ -1144,7 +1119,7 @@
 				                    (writeR ? 0x000003FF : 0x0000);
 				unsigned int unmask = ~mask;
 				*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
-				                          (UInt(Extract(c, 0) | (Extract(c, 1) << 10) | (Extract(c, 2) << 20) | (Extract(c, 3) << 30)) & UInt(mask));
+				                          (As<UInt>(PackFields(c, { 0, 10, 20, 30 })) & UInt(mask));
 			}
 			break;
 		case VK_FORMAT_A2R10G10B10_UINT_PACK32:
@@ -1153,8 +1128,7 @@
 		case VK_FORMAT_A2R10G10B10_SSCALED_PACK32:
 			if(writeRGBA)
 			{
-				*Pointer<UInt>(element) =
-				    UInt((Extract(c, 2)) | (Extract(c, 1) << 10) | (Extract(c, 0) << 20) | (Extract(c, 3) << 30));
+				*Pointer<UInt>(element) = As<UInt>(PackFields(c, { 20, 10, 0, 30 }));
 			}
 			else
 			{
@@ -1164,7 +1138,7 @@
 				                    (writeB ? 0x000003FF : 0x0000);
 				unsigned int unmask = ~mask;
 				*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
-				                          (UInt(Extract(c, 2) | (Extract(c, 1) << 10) | (Extract(c, 0) << 20) | (Extract(c, 3) << 30)) & UInt(mask));
+				                          (As<UInt>(PackFields(c, { 20, 10, 0, 30 })) & UInt(mask));
 			}
 			break;
 		case VK_FORMAT_B8G8R8A8_UINT:
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index b2ae9d0..be79715 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -1055,10 +1055,7 @@
 			v = Insert(v, *Pointer<Int>(buffer + 4 * x), 2);
 			v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 3);
 
-			pixel.x = Short4(v << 6) & Short4(0xFFC0u);
-			pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
-			pixel.z = Short4(v >> 14) & Short4(0xFFC0u);
-			pixel.w = Short4(v >> 16) & Short4(0xC000u);
+			a2b10g10r10Unpack(v, pixel);
 		}
 		break;
 		default:
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index 10d848e..63b15f9 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -1699,16 +1699,7 @@
 		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
 		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);
 
-		// shift each 10 bit field left 6, and replicate 6 high bits into bottom 6
-		c.x = Short4(((cc << 6) & Int4(0xFFC0)) | ((cc >> 4) & Int4(0x3F)));
-		c.y = Short4(((cc >> 4) & Int4(0xFFC0)) | ((cc >> 14) & Int4(0x3F)));
-		c.z = Short4(((cc >> 14) & Int4(0xFFC0)) | ((cc >> 24) & Int4(0x3F)));
-		c.w = Short4(((cc >> 16) & Int4(0xC000)));
-
-		// replicate 2 bit alpha component all the way down
-		c.w |= (c.w >> 8) & Short4(0xc0);
-		c.w |= (c.w >> 4) & Short4(0x0c0c);
-		c.w |= (c.w >> 2) & Short4(0x3333);
+		a2b10g10r10Unpack(cc, c);
 	}
 	else if(state.textureFormat == VK_FORMAT_A2B10G10R10_UINT_PACK32)
 	{
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index 534ed8d..87f33c8 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -616,6 +616,22 @@
 	return (UInt(halfBits.x) >> 20) | (UInt(halfBits.y) >> 9) | (UInt(halfBits.z) << 1);
 }
 
+void a2b10g10r10Unpack(Int4 &value, Vector4s &result)  // Splits each packed 10:10:10:2 word in 'value' into four 16-bit channels written to result.x/y/z/w.
+{  // NOTE(review): the masks below assume Short4(Int4) keeps the low 16 bits of each lane - confirm against Reactor.
+	result.x = Short4(value << 6) & Short4(0xFFC0u);  // field at bits 0-9, moved to the top 10 bits of the short
+	result.y = Short4(value >> 4) & Short4(0xFFC0u);  // field at bits 10-19, moved to the top 10 bits of the short
+	result.z = Short4(value >> 14) & Short4(0xFFC0u);  // field at bits 20-29, moved to the top 10 bits of the short
+	result.w = Short4(value >> 16) & Short4(0xC000u);  // field at bits 30-31, moved to the top 2 bits of the short
+
+	// Expand to the full 16-bit range by replicating each field's high bits into its zeroed low bits (shifts go through UShort4, presumably to get logical rather than arithmetic right shifts)
+	result.x |= As<Short4>(As<UShort4>(result.x) >> 10);  // top 6 of the 10 bits fill the low 6 bits
+	result.y |= As<Short4>(As<UShort4>(result.y) >> 10);  // same replication for the second field
+	result.z |= As<Short4>(As<UShort4>(result.z) >> 10);  // same replication for the third field
+	result.w |= As<Short4>(As<UShort4>(result.w) >> 2);  // 2 bits -> 4 bits
+	result.w |= As<Short4>(As<UShort4>(result.w) >> 4);  // 4 bits -> 8 bits
+	result.w |= As<Short4>(As<UShort4>(result.w) >> 8);  // 8 bits -> 16 bits
+}
+
 rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints)
 {
 	return rr::SignMask(ints) != 0;
diff --git a/src/Pipeline/ShaderCore.hpp b/src/Pipeline/ShaderCore.hpp
index a911be2..e19fdbd 100644
--- a/src/Pipeline/ShaderCore.hpp
+++ b/src/Pipeline/ShaderCore.hpp
@@ -208,6 +208,7 @@
 sw::SIMD::UInt floatToHalfBits(sw::SIMD::UInt floatBits, bool storeInUpperBits);
 sw::SIMD::Float r11g11b10Unpack(UInt r11g11b10bits);
 UInt r11g11b10Pack(sw::SIMD::Float &value);
+void a2b10g10r10Unpack(Int4 &value, Vector4s &result);  // Unpacks packed 10:10:10:2 words in 'value' into 16-bit channels in 'result', expanded to the full 16-bit range.
 
 rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints);