Add ROP support for VK_FORMAT_A1R5G5B5_UNORM_PACK16

Tested with ANGLE running dEQP-GLES2.*rbo_rgb5_a1*

Bug: b/139411772
Change-Id: I5681b008ae7bfb2df2705189f28667dac8d4c983
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/35812
Reviewed-by: Nicolas Capens <nicolas.capens@gmail.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Chris Forbes <chrisforbes@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
diff --git a/src/Pipeline/Constants.cpp b/src/Pipeline/Constants.cpp
index ac9d636..5902c4b 100644
--- a/src/Pipeline/Constants.cpp
+++ b/src/Pipeline/Constants.cpp
@@ -249,6 +249,14 @@
 			mask565Q[i][3] = (i & 0x1 ? 0x001F : 0) | (i & 0x2 ? 0x07E0 : 0) | (i & 0x4 ? 0xF800 : 0);
 		}
 
+		for (int i = 0; i < 16; i++)	// One table entry per 4-bit write mask value
+		{
+			mask5551Q[i][0] =	// The same 16-bit pattern is stored in all four words of the entry
+			mask5551Q[i][1] =
+			mask5551Q[i][2] =
+			mask5551Q[i][3] = (i & 0x1 ? 0x001F : 0) | (i & 0x2 ? 0x03E0 : 0) | (i & 0x4 ? 0x7C00 : 0) | (i & 8 ? 0x8000 : 0);	// Mask bits 0..3 enable bit fields 0-4, 5-9, 10-14 and 15 (B, G, R, A going by the A1R5G5B5 name)
+		}
+
 		for(int i = 0; i < 4; i++)
 		{
 			maskW01Q[i][0] =  -(i >> 0 & 1);
diff --git a/src/Pipeline/Constants.hpp b/src/Pipeline/Constants.hpp
index 7f6c951..861887c 100644
--- a/src/Pipeline/Constants.hpp
+++ b/src/Pipeline/Constants.hpp
@@ -67,6 +67,7 @@
 		dword4 maskD01X[4];
 		word4 mask565Q[8];
 		dword2 mask10Q[16];		// 4 bit writemask -> A2B10G10R10 bit patterns, replicated 2x
+		word4 mask5551Q[16];	// 4 bit writemask -> A1R5G5B5 bit patterns, replicated 4x
 
 		unsigned short sRGBtoLinear8_16[256];
 
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index 2391e19..d9a7bb5 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -168,6 +168,7 @@
 			auto format = state.targetFormat[index];
 			switch(format)
 			{
+			case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 			case VK_FORMAT_R5G6B5_UNORM_PACK16:
 			case VK_FORMAT_B8G8R8A8_UNORM:
 			case VK_FORMAT_B8G8R8A8_SRGB:
@@ -185,7 +186,14 @@
 					Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
 					Vector4s color;
 
-					if(format == VK_FORMAT_R5G6B5_UNORM_PACK16)
+					if(format == VK_FORMAT_A1R5G5B5_UNORM_PACK16)	// Scale the float color to 16-bit lanes for the A1R5G5B5 target
+					{
+						color.x = UShort4(c[index].x * Float4(0xFBFF), false);	// 0xFBFF == 0xFFFF - 0x0400; presumably headroom for the 0x0400 rounding bias applied to 5-bit channels - TODO confirm
+						color.y = UShort4(c[index].y * Float4(0xFBFF), false);	// Same scale as x (the R5G6B5 branch uses 0xFDFF for y instead)
+						color.z = UShort4(c[index].z * Float4(0xFBFF), false);
+						color.w = UShort4(c[index].w * Float4(0xFFFF), false);	// w uses the full 16-bit scale
+					}
+					else if(format == VK_FORMAT_R5G6B5_UNORM_PACK16)
 					{
 						color.x = UShort4(c[index].x * Float4(0xFBFF), false);
 						color.y = UShort4(c[index].y * Float4(0xFDFF), false);
@@ -265,6 +273,7 @@
 			{
 			case VK_FORMAT_UNDEFINED:
 				break;
+			case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 			case VK_FORMAT_R5G6B5_UNORM_PACK16:
 			case VK_FORMAT_B8G8R8A8_UNORM:
 			case VK_FORMAT_B8G8R8A8_SRGB:
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 188ac73..a06b4d1 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -959,6 +959,16 @@
 
 		switch(state.targetFormat[index])
 		{
+		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:	// Load packed 16-bit pixels and split them into per-channel 16-bit lanes
+			buffer = cBuffer + 2 * x;	// 2 bytes per pixel
+			buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));	// Second address is colorPitchB[index] bytes further (presumably the next row)
+			c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));	// One 32-bit load (two 16-bit pixels) from each address
+
+			pixel.x = (c01 & Short4(0x7C00u)) << 1;	// Field at bits 10-14 moved up to bits 11-15
+			pixel.y = (c01 & Short4(0x03E0u)) << 6;	// Field at bits 5-9 moved up to bits 11-15
+			pixel.z = (c01 & Short4(0x001Fu)) << 11;	// Field at bits 0-4 moved up to bits 11-15
+			pixel.w = (c01 & Short4(0x8000u));	// NOTE(review): a set alpha bit yields 0x8000 here, whereas the direct-write path scales w by 0xFFFF - confirm consumers treat 0x8000 as full alpha
+			break;
 		case VK_FORMAT_R5G6B5_UNORM_PACK16:
 			buffer = cBuffer + 2 * x;
 			buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
@@ -1190,6 +1200,11 @@
 
 		switch(state.targetFormat[index])
 		{
+		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:	// Bias x, y and z with a saturating add; w is left unchanged
+			current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));	// 0x0400 is half of the 0x0800 step of a value kept in the top 5 bits; presumably round-to-nearest before truncation
+			current.y = AddSat(As<UShort4>(current.y), UShort4(0x0400));	// Same bias as x (the R5G6B5 case uses 0x0200 for y instead)
+			current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
+			break;
 		case VK_FORMAT_R5G6B5_UNORM_PACK16:
 			current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
 			current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
@@ -1217,6 +1232,16 @@
 
 		switch(state.targetFormat[index])
 		{
+		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:	// Pack the four 16-bit channel lanes into one 16-bit word per pixel
+			{
+				current.w = current.w & Short4(0x8000u);	// Keep only the top bit of w; it stays at bit 15
+				current.x = As<UShort4>(current.x & Short4(0xF800)) >> 1;	// Top 5 bits of x land in bits 10-14 (unsigned shift)
+				current.y = As<UShort4>(current.y & Short4(0xF800)) >> 6;	// Top 5 bits of y land in bits 5-9
+				current.z = As<UShort4>(current.z & Short4(0xF800)) >> 11;	// Top 5 bits of z land in bits 0-4
+
+				current.x = current.x | current.y | current.z | current.w;	// The combined word is left in current.x
+			}
+			break;
 		case VK_FORMAT_R5G6B5_UNORM_PACK16:
 			{
 				current.x = current.x & Short4(0xF800u);
@@ -1358,6 +1383,45 @@
 
 		switch(state.targetFormat[index])
 		{
+		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:	// Store packed pixels from current.x, merging with the destination per bgraWriteMask and xMask
+			{
+				Pointer<Byte> buffer = cBuffer + 2 * x;	// 2 bytes per pixel
+				Int value = *Pointer<Int>(buffer);	// Existing 32 bits (two pixels) at the destination
+
+				Int c01 = Extract(As<Int2>(current.x), 0);	// First 32-bit half of the packed result
+
+				if(bgraWriteMask != 0x0000000F)	// Skipped when all four mask bits are set
+				{
+					Int masked = value;
+					c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask5551Q[bgraWriteMask][0]));	// Keep the write-enabled bit fields of the new value
+					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask5551Q[~bgraWriteMask & 0xF][0]));	// Keep the complementary bit fields of the old value
+					c01 |= masked;
+				}
+
+				c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);	// Table entry chosen by xMask (8 bytes per entry); presumably per-pixel coverage - TODO confirm
+				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);	// Inverse table keeps the old contents where the new value was masked out
+				c01 |= value;
+				*Pointer<Int>(buffer) = c01;
+
+				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));	// Advance by colorPitchB[index] bytes (presumably to the next row)
+				value = *Pointer<Int>(buffer);
+
+				Int c23 = Extract(As<Int2>(current.x), 1);	// Second 32-bit half of the packed result
+
+				if(bgraWriteMask != 0x0000000F)	// Same write-mask merge as for the first half
+				{
+					Int masked = value;
+					c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask5551Q[bgraWriteMask][0]));
+					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask5551Q[~bgraWriteMask & 0xF][0]));
+					c23 |= masked;
+				}
+
+				c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);	// Offset [0][2] reads the second pair of words of the selected entry
+				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
+				c23 |= value;
+				*Pointer<Int>(buffer) = c23;
+			}
+			break;
 		case VK_FORMAT_R5G6B5_UNORM_PACK16:
 			{
 				Pointer<Byte> buffer = cBuffer + 2 * x;