Fully support all 16-bit packed texture formats

Vulkan defines the following 16-bit packed formats:
 * VK_FORMAT_R4G4B4A4_UNORM_PACK16
 * VK_FORMAT_B4G4R4A4_UNORM_PACK16
 * VK_FORMAT_R5G6B5_UNORM_PACK16
 * VK_FORMAT_B5G6R5_UNORM_PACK16
 * VK_FORMAT_R5G5B5A1_UNORM_PACK16
 * VK_FORMAT_B5G5R5A1_UNORM_PACK16
 * VK_FORMAT_A1R5G5B5_UNORM_PACK16

We already support VK_FORMAT_A1R5G5B5_UNORM_PACK16, so we can use it as a
cross-reference to see which tests we expect to pass.
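
For reference, each of these formats packs one texel into a single
16-bit word; channels are extracted by masking and shifting, then
normalized by the maximum value of the field. A standalone sketch
(illustrative only, not part of this change) of unpacking an
R5G5B5A1 texel the same way the SpirvShaderImage change below does:

    #include <cstdint>

    struct RGBA { float r, g, b, a; };

    // Hypothetical helper: unpack one VK_FORMAT_R5G5B5A1_UNORM_PACK16
    // texel. R occupies bits 15..11, G bits 10..6, B bits 5..1, A bit 0.
    RGBA unpackR5G5B5A1(uint16_t texel)
    {
        RGBA c;
        c.r = ((texel >> 11) & 0x1F) * (1.0f / 0x1F);
        c.g = ((texel >> 6) & 0x1F) * (1.0f / 0x1F);
        c.b = ((texel >> 1) & 0x1F) * (1.0f / 0x1F);
        c.a = static_cast<float>(texel & 0x1);
        return c;
    }

The B-swapped variants use the same field widths with the red and
blue shifts exchanged.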

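The new Constants tables map a 4-bit per-channel write mask to the
packed bits covered by the writable channels, so masked color writes
can be done with a single 16-bit read-modify-write. A scalar sketch
of the R5G5B5A1 table entry (illustrative only; it mirrors the
maskr5g5b5a1Q initialization added in Constants.cpp):

    #include <cstdint>

    // Illustrative sketch. The mask index appears to follow BGRA order
    // (the pixel routine indexes these tables with bgraWriteMask):
    // bit 0 = blue, bit 1 = green, bit 2 = red, bit 3 = alpha.
    // R5G5B5A1 fields: R = 0xF800, G = 0x07C0, B = 0x003E, A = 0x0001.
    uint16_t maskR5G5B5A1(unsigned writeMask)
    {
        return (writeMask & 0x1 ? 0x003E : 0) |  // blue field
               (writeMask & 0x2 ? 0x07C0 : 0) |  // green field
               (writeMask & 0x4 ? 0xF800 : 0) |  // red field
               (writeMask & 0x8 ? 0x0001 : 0);   // alpha bit
    }
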
Bug: b/139351376
Tests: dEQP-VK.*pack16*

Change-Id: Ia7b0b9efe92c7f67c803579fee7c8e0d62ef081f
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/57048
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Tested-by: Sean Risser <srisser@google.com>
Commit-Queue: Sean Risser <srisser@google.com>
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index 1a70606..152b7bd 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp
@@ -466,11 +466,34 @@
 		    Float(1.0f / (1 << 24)));
 		c.w = 1.0f;
 		break;
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
+		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
+		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
+		c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
+		break;
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
 		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
 		c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
 		break;
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
+		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
+		c.x = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
+		break;
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
+		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07C0)) >> UShort(6)));
+		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x003E)) >> UShort(1)));
+		c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x0001)));
+		break;
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
+		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
+		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07C0)) >> UShort(6)));
+		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x003E)) >> UShort(1)));
+		c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x0001)));
+		break;
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 		c.w = Float(Int((*Pointer<UShort>(element) & UShort(0x8000)) >> UShort(15)));
 		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x7C00)) >> UShort(10)));
@@ -907,6 +930,20 @@
 			                             UShort(mask));
 		}
 		break;
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+		if(writeR && writeG && writeB)
+		{
+			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c.zyxx), { 11, 5, 0, 0 }));
+		}
+		else
+		{
+			unsigned short mask = (writeR ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeB ? 0xF800 : 0x0000);
+			unsigned short unmask = ~mask;
+			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
+			                            (UShort(PackFields(RoundInt(c.zyxx), { 11, 5, 0, 0 })) &
+			                             UShort(mask));
+		}
+		break;
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
 		if(writeRGBA)
 		{
diff --git a/src/Pipeline/Constants.cpp b/src/Pipeline/Constants.cpp
index ce921e6..5855fce 100644
--- a/src/Pipeline/Constants.cpp
+++ b/src/Pipeline/Constants.cpp
@@ -252,6 +252,10 @@
 	for(int i = 0; i < 16; i++)
 	{
 		mask5551Q[i] = word4((i & 0x1 ? 0x001F : 0) | (i & 0x2 ? 0x03E0 : 0) | (i & 0x4 ? 0x7C00 : 0) | (i & 8 ? 0x8000 : 0));
+		maskr5g5b5a1Q[i] = word4((i & 0x1 ? 0x003E : 0) | (i & 0x2 ? 0x07C0 : 0) | (i & 0x4 ? 0xF800 : 0) | (i & 8 ? 0x0001 : 0));
+		maskb5g5r5a1Q[i] = word4((i & 0x1 ? 0xF800 : 0) | (i & 0x2 ? 0x07C0 : 0) | (i & 0x4 ? 0x003E : 0) | (i & 8 ? 0x0001 : 0));
+		mask4rgbaQ[i] = word4((i & 0x1 ? 0x00F0 : 0) | (i & 0x2 ? 0x0F00 : 0) | (i & 0x4 ? 0xF000 : 0) | (i & 8 ? 0x000F : 0));
+		mask4bgraQ[i] = word4((i & 0x1 ? 0xF000 : 0) | (i & 0x2 ? 0x0F00 : 0) | (i & 0x4 ? 0x00F0 : 0) | (i & 8 ? 0x000F : 0));
 	}
 
 	for(int i = 0; i < 4; i++)
diff --git a/src/Pipeline/Constants.hpp b/src/Pipeline/Constants.hpp
index 5bb1f53..dc66a9c 100644
--- a/src/Pipeline/Constants.hpp
+++ b/src/Pipeline/Constants.hpp
@@ -68,9 +68,13 @@
 	word4 maskW01Q[4];
 	dword4 maskD01X[4];
 	word4 mask565Q[8];
-	dword2 mask10Q[16];   // 4 bit writemask -> A2B10G10R10 bit patterns, replicated 2x
-	word4 mask5551Q[16];  // 4 bit writemask -> A1R5G5B5 bit patterns, replicated 4x
-	dword4 mask11X[8];    // 3 bit writemask -> B10G11R11 bit patterns, replicated 4x
+	dword2 mask10Q[16];       // 4 bit writemask -> A2B10G10R10 bit patterns, replicated 2x
+	word4 mask5551Q[16];      // 4 bit writemask -> A1R5G5B5 bit patterns, replicated 4x
+	word4 maskr5g5b5a1Q[16];  // 4 bit writemask -> R5G5B5A1 bit patterns, replicated 4x
+	word4 maskb5g5r5a1Q[16];  // 4 bit writemask -> B5G5R5A1 bit patterns, replicated 4x
+	word4 mask4rgbaQ[16];     // 4 bit writemask -> R4G4B4A4 bit patterns, replicated 4x
+	word4 mask4bgraQ[16];     // 4 bit writemask -> B4G4R4A4 bit patterns, replicated 4x
+	dword4 mask11X[8];        // 3 bit writemask -> B10G11R11 bit patterns, replicated 4x
 
 	unsigned short sRGBtoLinearFF_FF00[256];
 
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index 9b321f9..60e6805 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -279,6 +279,11 @@
 		auto format = state.colorFormat[index];
 		switch(format)
 		{
+		case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		case VK_FORMAT_B5G6R5_UNORM_PACK16:
+		case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+		case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
 		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 		case VK_FORMAT_R5G6B5_UNORM_PACK16:
 		case VK_FORMAT_B8G8R8A8_UNORM:
@@ -364,6 +369,11 @@
 		{
 		case VK_FORMAT_UNDEFINED:
 			break;
+		case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		case VK_FORMAT_B5G6R5_UNORM_PACK16:
+		case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+		case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
 		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 		case VK_FORMAT_R5G6B5_UNORM_PACK16:
 		case VK_FORMAT_B8G8R8A8_UNORM:
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index c24ca31..46af146 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -1082,6 +1082,82 @@
 
 	switch(state.colorFormat[index])
 	{
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+		buffer += 2 * x;
+		buffer2 = buffer + pitchB;
+		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
+
+		pixel.x = (c01 & Short4(0xF000u));
+		pixel.y = (c01 & Short4(0x0F00u)) << 4;
+		pixel.z = (c01 & Short4(0x00F0u)) << 8;
+		pixel.w = (c01 & Short4(0x000Fu)) << 12;
+
+		// Expand to 16 bit range
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
+		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
+		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
+		break;
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		buffer += 2 * x;
+		buffer2 = buffer + pitchB;
+		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
+
+		pixel.z = (c01 & Short4(0xF000u));
+		pixel.y = (c01 & Short4(0x0F00u)) << 4;
+		pixel.x = (c01 & Short4(0x00F0u)) << 8;
+		pixel.w = (c01 & Short4(0x000Fu)) << 12;
+
+		// Expand to 16 bit range
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
+		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
+		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
+		break;
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+		buffer += 2 * x;
+		buffer2 = buffer + pitchB;
+		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
+
+		pixel.x = (c01 & Short4(0xF800u));
+		pixel.y = (c01 & Short4(0x07C0u)) << 5;
+		pixel.z = (c01 & Short4(0x003Eu)) << 10;
+		pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
+
+		// Expand to 16 bit range
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
+		break;
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
+		buffer += 2 * x;
+		buffer2 = buffer + pitchB;
+		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
+
+		pixel.z = (c01 & Short4(0xF800u));
+		pixel.y = (c01 & Short4(0x07C0u)) << 5;
+		pixel.x = (c01 & Short4(0x003Eu)) << 10;
+		pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
+
+		// Expand to 16 bit range
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
+		break;
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 		buffer += 2 * x;
 		buffer2 = buffer + pitchB;
@@ -1118,6 +1194,24 @@
 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
 		break;
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+		buffer += 2 * x;
+		buffer2 = buffer + pitchB;
+		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
+
+		pixel.z = c01 & Short4(0xF800u);
+		pixel.y = (c01 & Short4(0x07E0u)) << 5;
+		pixel.x = (c01 & Short4(0x001Fu)) << 11;
+		pixel.w = Short4(0xFFFFu);
+
+		// Expand to 16 bit range
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
+		break;
 	case VK_FORMAT_B8G8R8A8_UNORM:
 	case VK_FORMAT_B8G8R8A8_SRGB:
 		buffer += 4 * x;
@@ -1386,12 +1480,22 @@
 		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 10) + Short4(0x0020);
 		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 2) + Short4(0x2000);
 		break;
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 4) + Short4(0x0800);
+		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 4) + Short4(0x0800);
+		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 4) + Short4(0x0800);
+		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 4) + Short4(0x0800);
+		break;
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
 		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 5) + Short4(0x0400);
 		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
 		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 1) + Short4(0x4000);
 		break;
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
 		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 6) + Short4(0x0200);
@@ -1406,6 +1510,46 @@
 
 	switch(state.colorFormat[index])
 	{
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+		{
+			current.x = As<UShort4>(current.x & Short4(0xF000));
+			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 4;
+			current.z = As<UShort4>(current.z & Short4(0xF000)) >> 8;
+			current.w = As<UShort4>(current.w & Short4(0xF000u)) >> 12;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		{
+			current.z = As<UShort4>(current.z & Short4(0xF000));
+			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 4;
+			current.x = As<UShort4>(current.x & Short4(0xF000)) >> 8;
+			current.w = As<UShort4>(current.w & Short4(0xF000u)) >> 12;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+		{
+			current.x = As<UShort4>(current.x & Short4(0xF800));
+			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 5;
+			current.z = As<UShort4>(current.z & Short4(0xF800)) >> 10;
+			current.w = As<UShort4>(current.w & Short4(0x8000u)) >> 15;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
+		{
+			current.z = As<UShort4>(current.z & Short4(0xF800));
+			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 5;
+			current.x = As<UShort4>(current.x & Short4(0xF800)) >> 10;
+			current.w = As<UShort4>(current.w & Short4(0x8000u)) >> 15;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 		{
 			current.w = current.w & Short4(0x8000u);
@@ -1425,6 +1569,15 @@
 			current.x = current.x | current.y | current.z;
 		}
 		break;
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+		{
+			current.z = current.z & Short4(0xF800u);
+			current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
+			current.x = As<UShort4>(current.x) >> 11;
+
+			current.x = current.x | current.y | current.z;
+		}
+		break;
 	case VK_FORMAT_B8G8R8A8_UNORM:
 	case VK_FORMAT_B8G8R8A8_SRGB:
 		if(rgbaWriteMask == 0x7)
@@ -1573,6 +1726,114 @@
 
 	switch(state.colorFormat[index])
 	{
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+		{
+			buffer += 2 * x;
+			Int value = *Pointer<Int>(buffer);
+
+			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[bgraWriteMask & 0xF][0]));
+
+			Int c01 = Extract(As<Int2>(current.x), 0);
+			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
+			if(bgraWriteMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += pitchB;
+			value = *Pointer<Int>(buffer);
+
+			Int c23 = Extract(As<Int2>(current.x), 1);
+			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
+			if(bgraWriteMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		{
+			buffer += 2 * x;
+			Int value = *Pointer<Int>(buffer);
+
+			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4bgraQ[bgraWriteMask & 0xF][0]));
+
+			Int c01 = Extract(As<Int2>(current.x), 0);
+			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
+			if(bgraWriteMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += pitchB;
+			value = *Pointer<Int>(buffer);
+
+			Int c23 = Extract(As<Int2>(current.x), 1);
+			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
+			if(bgraWriteMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+		{
+			buffer += 2 * x;
+			Int value = *Pointer<Int>(buffer);
+
+			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskr5g5b5a1Q[bgraWriteMask & 0xF][0]));
+
+			Int c01 = Extract(As<Int2>(current.x), 0);
+			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
+			if(bgraWriteMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += pitchB;
+			value = *Pointer<Int>(buffer);
+
+			Int c23 = Extract(As<Int2>(current.x), 1);
+			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
+			if(bgraWriteMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
+		{
+			buffer += 2 * x;
+			Int value = *Pointer<Int>(buffer);
+
+			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskb5g5r5a1Q[bgraWriteMask & 0xF][0]));
+
+			Int c01 = Extract(As<Int2>(current.x), 0);
+			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
+			if(bgraWriteMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += pitchB;
+			value = *Pointer<Int>(buffer);
+
+			Int c23 = Extract(As<Int2>(current.x), 1);
+			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
+			if(bgraWriteMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 		{
 			buffer += 2 * x;
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index d821e46..9043038 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -152,16 +152,20 @@
 			switch(state.textureFormat)
 			{
 			case VK_FORMAT_R5G6B5_UNORM_PACK16:
+			case VK_FORMAT_B5G6R5_UNORM_PACK16:
 				c.x *= Float4(1.0f / 0xF800);
 				c.y *= Float4(1.0f / 0xFC00);
 				c.z *= Float4(1.0f / 0xF800);
 				break;
+			case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 			case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
 				c.x *= Float4(1.0f / 0xF000);
 				c.y *= Float4(1.0f / 0xF000);
 				c.z *= Float4(1.0f / 0xF000);
 				c.w *= Float4(1.0f / 0xF000);
 				break;
+			case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+			case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
 			case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 				c.x *= Float4(1.0f / 0xF800);
 				c.y *= Float4(1.0f / 0xF800);
@@ -214,16 +218,20 @@
 		switch(state.textureFormat)
 		{
 		case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		case VK_FORMAT_B5G6R5_UNORM_PACK16:
 			c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF800);
 			c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xFC00);
 			c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF800);
 			break;
+		case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
 			c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF000);
 			c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xF000);
 			c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF000);
 			c.w = Float4(As<UShort4>(cs.w)) * Float4(1.0f / 0xF000);
 			break;
+		case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+		case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
 		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 			c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF800);
 			c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xF800);
@@ -1526,12 +1534,35 @@
 			c.y = (c.x & Short4(0x07E0u)) << 5;
 			c.x = (c.x & Short4(0xF800u));
 			break;
+		case VK_FORMAT_B5G6R5_UNORM_PACK16:
+			c.z = (c.x & Short4(0xF800u));
+			c.y = (c.x & Short4(0x07E0u)) << 5;
+			c.x = (c.x & Short4(0x001Fu)) << 11;
+			break;
+		case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+			c.w = (c.x << 12) & Short4(0xF000u);
+			c.z = (c.x << 8) & Short4(0xF000u);
+			c.y = (c.x << 4) & Short4(0xF000u);
+			c.x = (c.x) & Short4(0xF000u);
+			break;
 		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
 			c.w = (c.x << 12) & Short4(0xF000u);
 			c.z = (c.x) & Short4(0xF000u);
 			c.y = (c.x << 4) & Short4(0xF000u);
 			c.x = (c.x << 8) & Short4(0xF000u);
 			break;
+		case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+			c.w = (c.x << 15) & Short4(0x8000u);
+			c.z = (c.x << 10) & Short4(0xF800u);
+			c.y = (c.x << 5) & Short4(0xF800u);
+			c.x = (c.x) & Short4(0xF800u);
+			break;
+		case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
+			c.w = (c.x << 15) & Short4(0x8000u);
+			c.z = (c.x) & Short4(0xF800u);
+			c.y = (c.x << 5) & Short4(0xF800u);
+			c.x = (c.x << 10) & Short4(0xF800u);
+			break;
 		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 			c.w = (c.x) & Short4(0x8000u);
 			c.z = (c.x << 11) & Short4(0xF800u);
diff --git a/src/Pipeline/SpirvShaderImage.cpp b/src/Pipeline/SpirvShaderImage.cpp
index 067eaea..245d110 100644
--- a/src/Pipeline/SpirvShaderImage.cpp
+++ b/src/Pipeline/SpirvShaderImage.cpp
@@ -979,12 +979,42 @@
 		dst.move(0, SIMD::Float((packed[0] >> 20) & SIMD::Int(0x3FF)) * SIMD::Float(1.0f / 0x3FF));
 		dst.move(3, SIMD::Float((packed[0] >> 30) & SIMD::Int(0x3)) * SIMD::Float(1.0f / 0x3));
 		break;
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+		dst.move(0, SIMD::Float((packed[0] >> 12) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
+		dst.move(1, SIMD::Float((packed[0] >> 8) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
+		dst.move(2, SIMD::Float((packed[0] >> 4) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
+		dst.move(3, SIMD::Float((packed[0]) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
+		break;
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		dst.move(0, SIMD::Float((packed[0] >> 4) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
+		dst.move(1, SIMD::Float((packed[0] >> 8) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
+		dst.move(2, SIMD::Float((packed[0] >> 12) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
+		dst.move(3, SIMD::Float((packed[0]) & SIMD::Int(0xF)) * SIMD::Float(1.0f / 0xF));
+		break;
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 		dst.move(0, SIMD::Float((packed[0] >> 11) & SIMD::Int(0x1F)) * SIMD::Float(1.0f / 0x1F));
 		dst.move(1, SIMD::Float((packed[0] >> 5) & SIMD::Int(0x3F)) * SIMD::Float(1.0f / 0x3F));
 		dst.move(2, SIMD::Float((packed[0]) & SIMD::Int(0x1F)) * SIMD::Float(1.0f / 0x1F));
 		dst.move(3, SIMD::Float(1.0f));
 		break;
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+		dst.move(0, SIMD::Float((packed[0]) & SIMD::Int(0x1F)) * SIMD::Float(1.0f / 0x1F));
+		dst.move(1, SIMD::Float((packed[0] >> 5) & SIMD::Int(0x3F)) * SIMD::Float(1.0f / 0x3F));
+		dst.move(2, SIMD::Float((packed[0] >> 11) & SIMD::Int(0x1F)) * SIMD::Float(1.0f / 0x1F));
+		dst.move(3, SIMD::Float(1.0f));
+		break;
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+		dst.move(0, SIMD::Float((packed[0] >> 11) & SIMD::Int(0x1F)) * SIMD::Float(1.0f / 0x1F));
+		dst.move(1, SIMD::Float((packed[0] >> 6) & SIMD::Int(0x1F)) * SIMD::Float(1.0f / 0x1F));
+		dst.move(2, SIMD::Float((packed[0] >> 1) & SIMD::Int(0x1F)) * SIMD::Float(1.0f / 0x1F));
+		dst.move(3, SIMD::Float((packed[0]) & SIMD::Int(0x1)));
+		break;
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
+		dst.move(0, SIMD::Float((packed[0] >> 1) & SIMD::Int(0x1F)) * SIMD::Float(1.0f / 0x1F));
+		dst.move(1, SIMD::Float((packed[0] >> 6) & SIMD::Int(0x1F)) * SIMD::Float(1.0f / 0x1F));
+		dst.move(2, SIMD::Float((packed[0] >> 11) & SIMD::Int(0x1F)) * SIMD::Float(1.0f / 0x1F));
+		dst.move(3, SIMD::Float((packed[0]) & SIMD::Int(0x1)));
+		break;
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 		dst.move(0, SIMD::Float((packed[0] >> 10) & SIMD::Int(0x1F)) * SIMD::Float(1.0f / 0x1F));
 		dst.move(1, SIMD::Float((packed[0] >> 5) & SIMD::Int(0x1F)) * SIMD::Float(1.0f / 0x1F));
diff --git a/src/Vulkan/VkFormat.cpp b/src/Vulkan/VkFormat.cpp
index 48340ab..24ba6a2 100644
--- a/src/Vulkan/VkFormat.cpp
+++ b/src/Vulkan/VkFormat.cpp
@@ -226,6 +226,8 @@
 		case VK_FORMAT_R8G8_UNORM:
 		case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
 			return VK_FORMAT_R8G8_UNORM;
+		case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+			return VK_FORMAT_R4G4B4A4_UNORM_PACK16;
 		default:
 			UNSUPPORTED("format %d", int(format));
 			break;
@@ -2007,6 +2009,11 @@
 	case VK_FORMAT_R32G32_SFLOAT:
 	case VK_FORMAT_R32G32B32A32_SFLOAT:
 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
 		return true;
 	default:
 		return false;
@@ -2022,8 +2029,12 @@
 
 	switch(format)
 	{
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 		return true;
 	case VK_FORMAT_R8G8_SINT:
@@ -2079,6 +2090,10 @@
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
 	case VK_FORMAT_R32_SFLOAT:
 	case VK_FORMAT_R32G32_SFLOAT:
 	case VK_FORMAT_R32G32B32A32_SFLOAT:
@@ -2127,6 +2142,10 @@
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
 	case VK_FORMAT_R8_UNORM:
 	case VK_FORMAT_R8_SNORM:
 	case VK_FORMAT_R8_UINT:
@@ -2202,6 +2221,10 @@
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
 	case VK_FORMAT_R8_UNORM:
 	case VK_FORMAT_R8_SNORM:
 	case VK_FORMAT_R8_UINT:
@@ -2305,6 +2328,10 @@
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
 	case VK_FORMAT_R8G8B8A8_UNORM:
 	case VK_FORMAT_R8G8B8A8_SNORM:
 	case VK_FORMAT_R8G8B8A8_UINT:
diff --git a/src/Vulkan/VkPhysicalDevice.cpp b/src/Vulkan/VkPhysicalDevice.cpp
index 3eaff75..5468209 100644
--- a/src/Vulkan/VkPhysicalDevice.cpp
+++ b/src/Vulkan/VkPhysicalDevice.cpp
@@ -1105,8 +1105,12 @@
 	switch(format)
 	{
 	// Formats which can be sampled *and* filtered
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 	case VK_FORMAT_R8_UNORM:
 	case VK_FORMAT_R8_SRGB:
@@ -1311,6 +1315,11 @@
 	{
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
 	case VK_FORMAT_R8_UNORM:
 	case VK_FORMAT_R8G8_UNORM:
 	case VK_FORMAT_R8G8B8A8_UNORM: