Implement VK_EXT_4444_formats

This adds the formats:
 * VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT
 * VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT

These formats are useful for D3D emulation

Tests: dEQP-VK.*pack16_ext*
Bug: b/198764346
Change-Id: Ice16c52f9e672d1b63d82e96a943229e470184fa
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/57428
Tested-by: Sean Risser <srisser@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Commit-Queue: Sean Risser <srisser@google.com>
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index 152b7bd..8a33854 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp
@@ -472,6 +472,18 @@
 		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
 		c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
 		break;
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
+		c.w = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
+		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
+		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
+		c.x = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
+		break;
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+		c.w = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
+		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
+		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
+		c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
+		break;
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
 		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
@@ -595,6 +607,52 @@
 			                             UShort(mask));
 		}
 		break;
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+		if(writeRGBA)
+		{
+			*Pointer<UShort>(element) = UShort(RoundInt(Float(c.z)) & Int(0xF)) |
+			                            UShort((RoundInt(Float(c.y)) & Int(0xF)) << 4) |
+			                            UShort((RoundInt(Float(c.x)) & Int(0xF)) << 8) |
+			                            UShort((RoundInt(Float(c.w)) & Int(0xF)) << 12);
+		}
+		else
+		{
+			unsigned short mask = (writeB ? 0x000F : 0x0000) |
+			                      (writeG ? 0x00F0 : 0x0000) |
+			                      (writeR ? 0x0F00 : 0x0000) |
+			                      (writeA ? 0xF000 : 0x0000);
+			unsigned short unmask = ~mask;
+			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
+			                            ((UShort(RoundInt(Float(c.z)) & Int(0xF)) |
+			                              UShort((RoundInt(Float(c.y)) & Int(0xF)) << 4) |
+			                              UShort((RoundInt(Float(c.x)) & Int(0xF)) << 8) |
+			                              UShort((RoundInt(Float(c.w)) & Int(0xF)) << 12)) &
+			                             UShort(mask));
+		}
+		break;
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
+		if(writeRGBA)
+		{
+			*Pointer<UShort>(element) = UShort(RoundInt(Float(c.x)) & Int(0xF)) |
+			                            UShort((RoundInt(Float(c.y)) & Int(0xF)) << 4) |
+			                            UShort((RoundInt(Float(c.z)) & Int(0xF)) << 8) |
+			                            UShort((RoundInt(Float(c.w)) & Int(0xF)) << 12);
+		}
+		else
+		{
+			unsigned short mask = (writeR ? 0x000F : 0x0000) |
+			                      (writeG ? 0x00F0 : 0x0000) |
+			                      (writeB ? 0x0F00 : 0x0000) |
+			                      (writeA ? 0xF000 : 0x0000);
+			unsigned short unmask = ~mask;
+			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
+			                            ((UShort(RoundInt(Float(c.x)) & Int(0xF)) |
+			                              UShort((RoundInt(Float(c.y)) & Int(0xF)) << 4) |
+			                              UShort((RoundInt(Float(c.z)) & Int(0xF)) << 8) |
+			                              UShort((RoundInt(Float(c.w)) & Int(0xF)) << 12)) &
+			                             UShort(mask));
+		}
+		break;
 	case VK_FORMAT_B8G8R8A8_SRGB:
 	case VK_FORMAT_B8G8R8A8_UNORM:
 		if(writeRGBA)
diff --git a/src/Pipeline/Constants.cpp b/src/Pipeline/Constants.cpp
index 5855fce..9f82e2d 100644
--- a/src/Pipeline/Constants.cpp
+++ b/src/Pipeline/Constants.cpp
@@ -256,6 +256,8 @@
 		maskb5g5r5a1Q[i] = word4((i & 0x1 ? 0xF800 : 0) | (i & 0x2 ? 0x07C0 : 0) | (i & 0x4 ? 0x003E : 0) | (i & 8 ? 0x0001 : 0));
 		mask4rgbaQ[i] = word4((i & 0x1 ? 0x00F0 : 0) | (i & 0x2 ? 0x0F00 : 0) | (i & 0x4 ? 0xF000 : 0) | (i & 8 ? 0x000F : 0));
 		mask4bgraQ[i] = word4((i & 0x1 ? 0xF000 : 0) | (i & 0x2 ? 0x0F00 : 0) | (i & 0x4 ? 0x00F0 : 0) | (i & 8 ? 0x000F : 0));
+		mask4abgrQ[i] = word4((i & 0x1 ? 0x0F00 : 0) | (i & 0x2 ? 0x00F0 : 0) | (i & 0x4 ? 0x000F : 0) | (i & 8 ? 0xF000 : 0));
+		mask4argbQ[i] = word4((i & 0x1 ? 0x000F : 0) | (i & 0x2 ? 0x00F0 : 0) | (i & 0x4 ? 0x0F00 : 0) | (i & 8 ? 0xF000 : 0));
 	}
 
 	for(int i = 0; i < 4; i++)
diff --git a/src/Pipeline/Constants.hpp b/src/Pipeline/Constants.hpp
index dc66a9c..7f3f2a9 100644
--- a/src/Pipeline/Constants.hpp
+++ b/src/Pipeline/Constants.hpp
@@ -74,6 +74,8 @@
 	word4 maskb5g5r5a1Q[16];  // 4 bit writemask -> B5G5R5A1 bit patterns, replicated 4x
 	word4 mask4rgbaQ[16];     // 4 bit writemask -> R4G4B4A4 bit patterns, replicated 4x
 	word4 mask4bgraQ[16];     // 4 bit writemask -> B4G4R4A4 bit patterns, replicated 4x
+	word4 mask4abgrQ[16];     // 4 bit writemask -> A4B4G4R4 bit patterns, replicated 4x
+	word4 mask4argbQ[16];     // 4 bit writemask -> A4R4G4B4 bit patterns, replicated 4x
 	dword4 mask11X[8];        // 3 bit writemask -> B10G11R11 bit patterns, replicated 4x
 
 	unsigned short sRGBtoLinearFF_FF00[256];
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index bde61eb..ba49834 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -277,6 +277,8 @@
 		{
 		case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+		case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 		case VK_FORMAT_B5G6R5_UNORM_PACK16:
 		case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
 		case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
@@ -367,6 +369,8 @@
 			break;
 		case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+		case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 		case VK_FORMAT_B5G6R5_UNORM_PACK16:
 		case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
 		case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 3e8104c..8046fdd 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -21,6 +21,7 @@
 #include "Device/Renderer.hpp"
 #include "System/Debug.hpp"
 #include "Vulkan/VkPipelineLayout.hpp"
+#include "Vulkan/VkStringify.hpp"
 
 namespace sw {
 
@@ -1117,6 +1118,46 @@
 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
 		break;
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
+		buffer += 2 * x;
+		buffer2 = buffer + pitchB;
+		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
+
+		pixel.w = (c01 & Short4(0xF000u));
+		pixel.z = (c01 & Short4(0x0F00u)) << 4;
+		pixel.y = (c01 & Short4(0x00F0u)) << 8;
+		pixel.x = (c01 & Short4(0x000Fu)) << 12;
+
+		// Expand to 16 bit range
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
+		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
+		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
+		break;
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+		buffer += 2 * x;
+		buffer2 = buffer + pitchB;
+		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
+
+		pixel.w = (c01 & Short4(0xF000u));
+		pixel.x = (c01 & Short4(0x0F00u)) << 4;
+		pixel.y = (c01 & Short4(0x00F0u)) << 8;
+		pixel.z = (c01 & Short4(0x000Fu)) << 12;
+
+		// Expand to 16 bit range
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
+		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
+		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
+		break;
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
 		buffer += 2 * x;
 		buffer2 = buffer + pitchB;
@@ -1477,6 +1518,8 @@
 		break;
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 4) + Short4(0x0800);
 		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 4) + Short4(0x0800);
 		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 4) + Short4(0x0800);
@@ -1525,6 +1568,26 @@
 			current.x = current.x | current.y | current.z | current.w;
 		}
 		break;
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+		{
+			current.w = As<UShort4>(current.w & Short4(0xF000));
+			current.x = As<UShort4>(current.x & Short4(0xF000)) >> 4;
+			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 8;
+			current.z = As<UShort4>(current.z & Short4(0xF000u)) >> 12;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
+		{
+			current.w = As<UShort4>(current.w & Short4(0xF000));
+			current.z = As<UShort4>(current.z & Short4(0xF000)) >> 4;
+			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 8;
+			current.x = As<UShort4>(current.x & Short4(0xF000u)) >> 12;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
 		{
 			current.x = As<UShort4>(current.x & Short4(0xF800));
@@ -1722,38 +1785,31 @@
 	switch(state.colorFormat[index])
 	{
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
-		{
-			buffer += 2 * x;
-			Int value = *Pointer<Int>(buffer);
-
-			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[bgraWriteMask & 0xF][0]));
-
-			Int c01 = Extract(As<Int2>(current.x), 0);
-			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
-			if(bgraWriteMask != 0x0000000F)
-			{
-				mask01 &= channelMask;
-			}
-			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
-
-			buffer += pitchB;
-			value = *Pointer<Int>(buffer);
-
-			Int c23 = Extract(As<Int2>(current.x), 1);
-			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
-			if(bgraWriteMask != 0x0000000F)
-			{
-				mask23 &= channelMask;
-			}
-			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
-		}
-		break;
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
 		{
 			buffer += 2 * x;
 			Int value = *Pointer<Int>(buffer);
 
-			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4bgraQ[bgraWriteMask & 0xF][0]));
+			Int channelMask;
+			switch(state.colorFormat[index])
+			{
+			case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[bgraWriteMask & 0xF][0]));
+				break;
+			case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4bgraQ[bgraWriteMask & 0xF][0]));
+				break;
+			case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4argbQ[bgraWriteMask & 0xF][0]));
+				break;
+			case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
+				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4abgrQ[bgraWriteMask & 0xF][0]));
+				break;
+			default:
+				UNREACHABLE("Format: %s", vk::Stringify(state.colorFormat[index]).c_str());
+			}
 
 			Int c01 = Extract(As<Int2>(current.x), 0);
 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index 9043038..40e9bb5 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -159,6 +159,8 @@
 				break;
 			case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 			case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+			case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+			case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 				c.x *= Float4(1.0f / 0xF000);
 				c.y *= Float4(1.0f / 0xF000);
 				c.z *= Float4(1.0f / 0xF000);
@@ -225,6 +227,8 @@
 			break;
 		case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+		case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 			c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF000);
 			c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xF000);
 			c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF000);
@@ -1551,6 +1555,18 @@
 			c.y = (c.x << 4) & Short4(0xF000u);
 			c.x = (c.x << 8) & Short4(0xF000u);
 			break;
+		case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+			c.w = (c.x) & Short4(0xF000u);
+			c.z = (c.x << 12) & Short4(0xF000u);
+			c.y = (c.x << 8) & Short4(0xF000u);
+			c.x = (c.x << 4) & Short4(0xF000u);
+			break;
+		case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
+			c.w = (c.x) & Short4(0xF000u);
+			c.z = (c.x << 4) & Short4(0xF000u);
+			c.y = (c.x << 8) & Short4(0xF000u);
+			c.x = (c.x << 12) & Short4(0xF000u);
+			break;
 		case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
 			c.w = (c.x << 15) & Short4(0x8000u);
 			c.z = (c.x << 10) & Short4(0xF800u);
diff --git a/src/Pipeline/SpirvShaderImage.cpp b/src/Pipeline/SpirvShaderImage.cpp
index d195ac0..d13ed5a 100644
--- a/src/Pipeline/SpirvShaderImage.cpp
+++ b/src/Pipeline/SpirvShaderImage.cpp
@@ -991,6 +991,18 @@
 		dst.move(2, SIMD::Float((packed[0] >> 12) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
 		dst.move(3, SIMD::Float((packed[0]) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
 		break;
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+		dst.move(0, SIMD::Float((packed[0] >> 8) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
+		dst.move(1, SIMD::Float((packed[0] >> 4) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
+		dst.move(2, SIMD::Float((packed[0]) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
+		dst.move(3, SIMD::Float((packed[0] >> 12) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
+		break;
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
+		dst.move(0, SIMD::Float((packed[0]) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
+		dst.move(1, SIMD::Float((packed[0] >> 4) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
+		dst.move(2, SIMD::Float((packed[0] >> 8) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
+		dst.move(3, SIMD::Float((packed[0] >> 12) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
+		break;
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 		dst.move(0, SIMD::Float((packed[0] >> 11) & SIMD::Int(0x1F)) * SIMD::Float(1.0f / 0x1F));
 		dst.move(1, SIMD::Float((packed[0] >> 5) & SIMD::Int(0x3F)) * SIMD::Float(1.0f / 0x3F));
diff --git a/src/Vulkan/VkFormat.cpp b/src/Vulkan/VkFormat.cpp
index 24ba6a2..2923f0a 100644
--- a/src/Vulkan/VkFormat.cpp
+++ b/src/Vulkan/VkFormat.cpp
@@ -26,6 +26,8 @@
 	case VK_FORMAT_R4G4_UNORM_PACK8:
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
@@ -331,6 +333,8 @@
 	case VK_FORMAT_R4G4_UNORM_PACK8:
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
@@ -641,6 +645,8 @@
 	// 16 - bit, Block size 2 bytes, 1 texel / block
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
@@ -1206,6 +1212,8 @@
 		return 3;
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
@@ -1311,6 +1319,8 @@
 	case VK_FORMAT_R4G4_UNORM_PACK8:
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
@@ -1504,6 +1514,8 @@
 		return 1;
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
@@ -1865,6 +1877,8 @@
 	case VK_FORMAT_R4G4_UNORM_PACK8:
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 		return sw::float4(0xF, 0xF, 0xF, 0xF);
 	case VK_FORMAT_R8_UNORM:
 	case VK_FORMAT_R8G8_UNORM:
@@ -2011,6 +2025,8 @@
 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
@@ -2031,6 +2047,8 @@
 	{
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
@@ -2091,6 +2109,8 @@
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
@@ -2143,6 +2163,8 @@
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
@@ -2222,6 +2244,8 @@
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
@@ -2329,6 +2353,8 @@
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
@@ -2372,7 +2398,7 @@
 
 static constexpr uint8_t pack(VkFormat format)
 {
-	if(format > VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM)
+	if(format > VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT)
 	{
 		return 0;
 	}
@@ -2401,6 +2427,12 @@
 		return uint8_t(format - VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK_EXT + 227);
 	}
 
+	// 100034000x -> 241 - 242
+	if(format >= VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT && format <= VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT)
+	{
+		return uint8_t(format - VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT + 241);
+	}
+
 	return 0;
 }
 
@@ -2412,6 +2444,8 @@
 static_assert(pack(VK_FORMAT_PVRTC2_4BPP_SRGB_BLOCK_IMG) == 226, "Incorrect VkFormat packed value");
 static_assert(pack(VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK_EXT) == 227, "Incorrect VkFormat packed value");
 static_assert(pack(VK_FORMAT_ASTC_12x12_SFLOAT_BLOCK_EXT) == 240, "Incorrect VkFormat packed value");
+static_assert(pack(VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT) == 241, "Incorrect VkFormat packed value");
+static_assert(pack(VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT) == 242, "Incorrect VkFormat packed value");
 
 static constexpr VkFormat unpack(uint8_t format)
 {
@@ -2439,6 +2473,12 @@
 		return static_cast<VkFormat>(VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK_EXT + (format - 227));
 	}
 
+	// 241 - 242 -> 100034000x
+	if(format >= 241 && format <= 242)
+	{
+		return static_cast<VkFormat>(VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT + (format - 241));
+	}
+
 	return VK_FORMAT_UNDEFINED;
 }
 
@@ -2450,11 +2490,13 @@
 static_assert(unpack(226) == VK_FORMAT_PVRTC2_4BPP_SRGB_BLOCK_IMG, "Incorrect VkFormat unpacked value");
 static_assert(unpack(227) == VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK_EXT, "Incorrect VkFormat unpacked value");
 static_assert(unpack(240) == VK_FORMAT_ASTC_12x12_SFLOAT_BLOCK_EXT, "Incorrect VkFormat unpacked value");
-static_assert(unpack(241) == VK_FORMAT_UNDEFINED, "Incorrect VkFormat unpacked value");
+static_assert(unpack(241) == VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT, "Incorrect VkFormat unpacked value");
+static_assert(unpack(242) == VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT, "Incorrect VkFormat unpacked value");
+static_assert(unpack(243) == VK_FORMAT_UNDEFINED, "Incorrect VkFormat unpacked value");
 
 uint8_t Format::mapTo8bit(VkFormat format)
 {
-	ASSERT(format <= VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM);
+	ASSERT(format <= VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT);
 	uint8_t packed = pack(format);
 	ASSERT_MSG(packed > 0, "Update VkFormat to uint8_t mapping");
 	return packed;
@@ -2462,7 +2504,7 @@
 
 VkFormat Format::mapFrom8bit(uint8_t format)
 {
-	ASSERT(format <= 240);
+	ASSERT(format <= 242);
 	VkFormat unpacked = unpack(format);
 	ASSERT_MSG(unpacked != VK_FORMAT_UNDEFINED, "Update uint8_t to VkFormat mapping");
 	return unpacked;
diff --git a/src/Vulkan/VkPhysicalDevice.cpp b/src/Vulkan/VkPhysicalDevice.cpp
index 4751cf2..c553eec 100644
--- a/src/Vulkan/VkPhysicalDevice.cpp
+++ b/src/Vulkan/VkPhysicalDevice.cpp
@@ -332,6 +332,12 @@
 	features->customBorderColorWithoutFormat = VK_TRUE;
 }
 
+static void getPhysicalDevice4444FormatsFeaturesExt(VkPhysicalDevice4444FormatsFeaturesEXT *features)
+{
+	features->formatA4R4G4B4 = VK_TRUE;
+	features->formatA4B4G4R4 = VK_TRUE;
+}
+
 void PhysicalDevice::getFeatures2(VkPhysicalDeviceFeatures2 *features) const
 {
 	features->features = getFeatures();
@@ -424,6 +430,9 @@
 		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT:
 			getPhysicalDevicCustomBorderColorFeaturesExt(reinterpret_cast<VkPhysicalDeviceCustomBorderColorFeaturesEXT *>(curExtension));
 			break;
+		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_4444_FORMATS_FEATURES_EXT:
+			getPhysicalDevice4444FormatsFeaturesExt(reinterpret_cast<struct VkPhysicalDevice4444FormatsFeaturesEXT *>(curExtension));
+			break;
 		default:
 			LOG_TRAP("curExtension->pNext->sType = %s", vk::Stringify(curExtension->sType).c_str());
 			break;
@@ -1108,6 +1117,8 @@
 	// Formats which can be sampled *and* filtered
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
@@ -1318,6 +1329,8 @@
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
diff --git a/src/Vulkan/VkStringify.cpp b/src/Vulkan/VkStringify.cpp
index 9a812dd..036c0a9 100644
--- a/src/Vulkan/VkStringify.cpp
+++ b/src/Vulkan/VkStringify.cpp
@@ -52,4 +52,13 @@
 #endif
 }
 
+std::string Stringify(VkFormat value)
+{
+#ifndef NDEBUG
+	return vkhpp::to_string(static_cast<vkhpp::Format>(value));
+#else
+	return std::to_string(static_cast<int>(value));
+#endif
+}
+
 }  // namespace vk
diff --git a/src/Vulkan/VkStringify.hpp b/src/Vulkan/VkStringify.hpp
index f46ec92..371813f 100644
--- a/src/Vulkan/VkStringify.hpp
+++ b/src/Vulkan/VkStringify.hpp
@@ -23,7 +23,8 @@
 namespace vk {
 
 std::string Stringify(VkStructureType value);
+std::string Stringify(VkFormat format);
 
-}
+}  // namespace vk
 
 #endif
diff --git a/src/Vulkan/libVulkan.cpp b/src/Vulkan/libVulkan.cpp
index 4efba76..cfd7191 100644
--- a/src/Vulkan/libVulkan.cpp
+++ b/src/Vulkan/libVulkan.cpp
@@ -365,6 +365,8 @@
 	// The following extension is used by ANGLE to emulate blitting the stencil buffer
 	{ { VK_EXT_SHADER_STENCIL_EXPORT_EXTENSION_NAME, VK_EXT_SHADER_STENCIL_EXPORT_SPEC_VERSION } },
 	{ { VK_EXT_IMAGE_ROBUSTNESS_EXTENSION_NAME, VK_EXT_IMAGE_ROBUSTNESS_SPEC_VERSION } },
+	// Useful for D3D emulation
+	{ { VK_EXT_4444_FORMATS_EXTENSION_NAME, VK_EXT_4444_FORMATS_SPEC_VERSION } },
 #ifndef __ANDROID__
 	// We fully support the KHR_swapchain v70 additions, so just track the spec version.
 	{ { VK_KHR_SWAPCHAIN_EXTENSION_NAME, VK_KHR_SWAPCHAIN_SPEC_VERSION } },