Implement VK_EXT_4444_formats

This adds support for the following formats:
 * VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT
 * VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT

These formats are useful for D3D emulation.

Tests: dEQP-VK.*pack16_ext*
Bug: b/198764346
Change-Id: Ice16c52f9e672d1b63d82e96a943229e470184fa
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/57428
Tested-by: Sean Risser <srisser@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Commit-Queue: Sean Risser <srisser@google.com>
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index 9043038..40e9bb5 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -159,6 +159,8 @@
 				break;
 			case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 			case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+			case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+			case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 				c.x *= Float4(1.0f / 0xF000);
 				c.y *= Float4(1.0f / 0xF000);
 				c.z *= Float4(1.0f / 0xF000);
@@ -225,6 +227,8 @@
 			break;
 		case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+		case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 			c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF000);
 			c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xF000);
 			c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF000);
@@ -1551,6 +1555,18 @@
 			c.y = (c.x << 4) & Short4(0xF000u);
 			c.x = (c.x << 8) & Short4(0xF000u);
 			break;
+		case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+			c.w = (c.x) & Short4(0xF000u);
+			c.z = (c.x << 12) & Short4(0xF000u);
+			c.y = (c.x << 8) & Short4(0xF000u);
+			c.x = (c.x << 4) & Short4(0xF000u);
+			break;
+		case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
+			c.w = (c.x) & Short4(0xF000u);
+			c.z = (c.x << 4) & Short4(0xF000u);
+			c.y = (c.x << 8) & Short4(0xF000u);
+			c.x = (c.x << 12) & Short4(0xF000u);
+			break;
 		case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
 			c.w = (c.x << 15) & Short4(0x8000u);
 			c.z = (c.x << 10) & Short4(0xF800u);