Implement Float16 formats and additional blend factors.

The float16 loads and stores are horrendously unoptimized, but this can be addressed in a later pass.

Tests: dEQP-VK.renderpass.suballocation.formats.r16_sfloat.*
Tests: dEQP-VK.renderpass.suballocation.formats.r16g16_sfloat.*
Tests: dEQP-VK.renderpass.suballocation.formats.r16g16b16a16_sfloat.*
Tests: dEQP-VK.pipeline.blend.format.r16_sfloat.*
Tests: dEQP-VK.pipeline.blend.format.r16g16_sfloat.*
Tests: dEQP-VK.pipeline.blend.format.r16g16b16a16_sfloat.*
Bug: b/132434966
Bug: b/132433217
Change-Id: Ifa8feaeecefa1926b1f500e6c9d23e6c242a6844
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/31113
Tested-by: Ben Clayton <bclayton@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index 57aead8..a90a398 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -187,6 +187,9 @@
 					}
 				}
 				break;
+			case VK_FORMAT_R16_SFLOAT:
+			case VK_FORMAT_R16G16_SFLOAT:
+			case VK_FORMAT_R16G16B16A16_SFLOAT:
 			case VK_FORMAT_R32_SFLOAT:
 			case VK_FORMAT_R32G32_SFLOAT:
 			case VK_FORMAT_R32G32B32A32_SFLOAT:
@@ -268,6 +271,9 @@
 			case VK_FORMAT_R32_UINT:
 			case VK_FORMAT_R32G32_UINT:
 			case VK_FORMAT_R32G32B32A32_UINT:
+			case VK_FORMAT_R16_SFLOAT:
+			case VK_FORMAT_R16G16_SFLOAT:
+			case VK_FORMAT_R16G16B16A16_SFLOAT:
 			case VK_FORMAT_R16_SINT:
 			case VK_FORMAT_R16G16_SINT:
 			case VK_FORMAT_R16G16B16A16_SINT:
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 1cb8b23..616647c 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -1728,11 +1728,22 @@
 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
 			break;
+		case VK_BLEND_FACTOR_CONSTANT_ALPHA:
+			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
+			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
+			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
+			break;
 		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
 			break;
+		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
+			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
+			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
+			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
+			break;
+
 		default:
 			UNIMPLEMENTED("VkBlendFactor: %d", int(blendFactorActive));
 		}
@@ -1776,9 +1787,11 @@
 			blendFactor.w = Float4(1.0f);
 			break;
 		case VK_BLEND_FACTOR_CONSTANT_COLOR:
+		case VK_BLEND_FACTOR_CONSTANT_ALPHA:
 			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
 			break;
 		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
+		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
 			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
 			break;
 		default:
@@ -1794,6 +1807,11 @@
 		}
 
 		Pointer<Byte> buffer;
+
+		// pixel holds four texel color values.
+		// Note: Despite the type being Vector4f, the colors may be stored as
+		// integers. Half-floats are stored as full 32-bit floats.
+		// Non-float and non-fixed point formats are not alpha blended.
 		Vector4f pixel;
 
 		Vector4s color;
@@ -1850,6 +1868,48 @@
 			pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
 			break;
+		case VK_FORMAT_R16_SFLOAT:
+			buffer = cBuffer;
+			pixel.x.x = Float(*Pointer<Half>(buffer + 2 * x + 0));
+			pixel.x.y = Float(*Pointer<Half>(buffer + 2 * x + 2));
+			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+			pixel.x.z = Float(*Pointer<Half>(buffer + 2 * x + 0));
+			pixel.x.w = Float(*Pointer<Half>(buffer + 2 * x + 2));
+			pixel.y = pixel.z = pixel.w = one;
+			break;
+		case VK_FORMAT_R16G16_SFLOAT:
+			buffer = cBuffer;
+			pixel.x.x = Float(*Pointer<Half>(buffer + 4 * x + 0));
+			pixel.y.x = Float(*Pointer<Half>(buffer + 4 * x + 2));
+			pixel.x.y = Float(*Pointer<Half>(buffer + 4 * x + 4));
+			pixel.y.y = Float(*Pointer<Half>(buffer + 4 * x + 6));
+			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+			pixel.x.z = Float(*Pointer<Half>(buffer + 4 * x + 0));
+			pixel.y.z = Float(*Pointer<Half>(buffer + 4 * x + 2));
+			pixel.x.w = Float(*Pointer<Half>(buffer + 4 * x + 4));
+			pixel.y.w = Float(*Pointer<Half>(buffer + 4 * x + 6));
+			pixel.z = pixel.w = one;
+			break;
+		case VK_FORMAT_R16G16B16A16_SFLOAT:
+			buffer = cBuffer;
+			pixel.x.x = Float(*Pointer<Half>(buffer + 8 * x + 0x0));
+			pixel.y.x = Float(*Pointer<Half>(buffer + 8 * x + 0x2));
+			pixel.z.x = Float(*Pointer<Half>(buffer + 8 * x + 0x4));
+			pixel.w.x = Float(*Pointer<Half>(buffer + 8 * x + 0x6));
+			pixel.x.y = Float(*Pointer<Half>(buffer + 8 * x + 0x8));
+			pixel.y.y = Float(*Pointer<Half>(buffer + 8 * x + 0xa));
+			pixel.z.y = Float(*Pointer<Half>(buffer + 8 * x + 0xc));
+			pixel.w.y = Float(*Pointer<Half>(buffer + 8 * x + 0xe));
+			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+			pixel.x.z = Float(*Pointer<Half>(buffer + 8 * x + 0x0));
+			pixel.y.z = Float(*Pointer<Half>(buffer + 8 * x + 0x2));
+			pixel.z.z = Float(*Pointer<Half>(buffer + 8 * x + 0x4));
+			pixel.w.z = Float(*Pointer<Half>(buffer + 8 * x + 0x6));
+			pixel.x.w = Float(*Pointer<Half>(buffer + 8 * x + 0x8));
+			pixel.y.w = Float(*Pointer<Half>(buffer + 8 * x + 0xa));
+			pixel.z.w = Float(*Pointer<Half>(buffer + 8 * x + 0xc));
+			pixel.w.w = Float(*Pointer<Half>(buffer + 8 * x + 0xe));
+			break;
 		default:
 			UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
 		}
@@ -1975,6 +2035,7 @@
 	{
 		switch(state.targetFormat[index])
 		{
+		case VK_FORMAT_R16_SFLOAT:
 		case VK_FORMAT_R32_SFLOAT:
 		case VK_FORMAT_R32_SINT:
 		case VK_FORMAT_R32_UINT:
@@ -1984,6 +2045,7 @@
 		case VK_FORMAT_R8_UINT:
 		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
 			break;
+		case VK_FORMAT_R16G16_SFLOAT:
 		case VK_FORMAT_R32G32_SFLOAT:
 		case VK_FORMAT_R32G32_SINT:
 		case VK_FORMAT_R32G32_UINT:
@@ -1996,6 +2058,7 @@
 			oC.z = UnpackHigh(oC.z, oC.y);
 			oC.y = oC.z;
 			break;
+		case VK_FORMAT_R16G16B16A16_SFLOAT:
 		case VK_FORMAT_R32G32B32A32_SFLOAT:
 		case VK_FORMAT_R32G32B32A32_SINT:
 		case VK_FORMAT_R32G32B32A32_UINT:
@@ -2029,10 +2092,12 @@
 			xMask &= sMask;
 		}
 
+		auto targetFormat = state.targetFormat[index];
+
 		Pointer<Byte> buffer;
 		Float4 value;
 
-		switch(state.targetFormat[index])
+		switch(targetFormat)
 		{
 		case VK_FORMAT_R32_SFLOAT:
 		case VK_FORMAT_R32_SINT:
@@ -2066,6 +2131,32 @@
 				*Pointer<Float>(buffer + 4) = oC.x.y;
 			}
 			break;
+		case VK_FORMAT_R16_SFLOAT:
+			if(rgbaWriteMask & 0x00000001)
+			{
+				buffer = cBuffer + 2 * x;
+
+				value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 0);
+				value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 1);
+
+				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+				value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 2);
+				value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 3);
+
+				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
+				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
+				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
+
+				*Pointer<Half>(buffer + 0) = Half(oC.x.z);
+				*Pointer<Half>(buffer + 2) = Half(oC.x.w);
+
+				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+				*Pointer<Half>(buffer + 0) = Half(oC.x.x);
+				*Pointer<Half>(buffer + 2) = Half(oC.x.y);
+			}
+			break;
 		case VK_FORMAT_R16_SINT:
 		case VK_FORMAT_R16_UINT:
 			if(rgbaWriteMask & 0x00000001)
@@ -2084,7 +2175,7 @@
 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
 
-				if(state.targetFormat[index] == VK_FORMAT_R16_SINT)
+				if(targetFormat == VK_FORMAT_R16_SINT)
 				{
 					Float component = oC.x.z;
 					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
@@ -2127,7 +2218,7 @@
 				xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
 
 				Short4 tmpCol = Short4(As<Int4>(oC.x));
-				if(state.targetFormat[index] == VK_FORMAT_R8_SINT)
+				if(targetFormat == VK_FORMAT_R8_SINT)
 				{
 					tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
 				}
@@ -2184,6 +2275,39 @@
 			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
 			*Pointer<Float4>(buffer) = oC.y;
 			break;
+		case VK_FORMAT_R16G16_SFLOAT:
+			if((rgbaWriteMask & 0x00000003) != 0x0)
+			{
+				buffer = cBuffer + 4 * x;
+
+				UInt2 rgbaMask;
+				UInt2 packedCol;
+				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.y))) << 16) | UInt(As<UShort>(Half(oC.x.x))), 0);
+				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.w))) << 16) | UInt(As<UShort>(Half(oC.x.z))), 1);
+
+				UShort4 value = *Pointer<UShort4>(buffer);
+				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
+				if((rgbaWriteMask & 0x3) != 0x3)
+				{
+					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
+					rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
+					mergedMask &= rgbaMask;
+				}
+				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
+
+				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.y))) << 16) | UInt(As<UShort>(Half(oC.y.x))), 0);
+				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.w))) << 16) | UInt(As<UShort>(Half(oC.y.z))), 1);
+				value = *Pointer<UShort4>(buffer);
+				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
+				if((rgbaWriteMask & 0x3) != 0x3)
+				{
+					mergedMask &= rgbaMask;
+				}
+				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
+			}
+			break;
 		case VK_FORMAT_R16G16_SINT:
 		case VK_FORMAT_R16G16_UINT:
 			if((rgbaWriteMask & 0x00000003) != 0x0)
@@ -2226,7 +2350,7 @@
 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
 				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
 
-				if(state.targetFormat[index] == VK_FORMAT_R8G8_SINT)
+				if(targetFormat == VK_FORMAT_R8G8_SINT)
 				{
 					packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
 				}
@@ -2325,6 +2449,42 @@
 				*Pointer<Float4>(buffer + 16, 16) = oC.w;
 			}
 			break;
+		case VK_FORMAT_R16G16B16A16_SFLOAT:
+			if((rgbaWriteMask & 0x0000000F) != 0x0)
+			{
+				buffer = cBuffer + 8 * x;
+
+				UInt4 rgbaMask;
+				UInt4 value = *Pointer<UInt4>(buffer);
+				UInt4 packedCol;
+				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.y))) << 16) | UInt(As<UShort>(Half(oC.x.x))), 0);
+				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.w))) << 16) | UInt(As<UShort>(Half(oC.x.z))), 1);
+				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.y))) << 16) | UInt(As<UShort>(Half(oC.y.x))), 2);
+				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.w))) << 16) | UInt(As<UShort>(Half(oC.y.z))), 3);
+				UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
+				if((rgbaWriteMask & 0xF) != 0xF)
+				{
+					UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
+					rgbaMask = UInt4(tmpMask, tmpMask);
+					mergedMask &= rgbaMask;
+				}
+				*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
+
+				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+				value = *Pointer<UInt4>(buffer);
+				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.z.y))) << 16) | UInt(As<UShort>(Half(oC.z.x))), 0);
+				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.z.w))) << 16) | UInt(As<UShort>(Half(oC.z.z))), 1);
+				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.w.y))) << 16) | UInt(As<UShort>(Half(oC.w.x))), 2);
+				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.w.w))) << 16) | UInt(As<UShort>(Half(oC.w.z))), 3);
+				mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
+				if((rgbaWriteMask & 0xF) != 0xF)
+				{
+					mergedMask &= rgbaMask;
+				}
+				*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
+			}
+			break;
 		case VK_FORMAT_R16G16B16A16_SINT:
 		case VK_FORMAT_R16G16B16A16_UINT:
 			if((rgbaWriteMask & 0x0000000F) != 0x0)
@@ -2365,7 +2525,7 @@
 
 				buffer = cBuffer + 4 * x;
 
-				bool isSigned = state.targetFormat[index] == VK_FORMAT_R8G8B8A8_SINT || state.targetFormat[index] == VK_FORMAT_A8B8G8R8_SINT_PACK32;
+				bool isSigned = targetFormat == VK_FORMAT_R8G8B8A8_SINT || targetFormat == VK_FORMAT_A8B8G8R8_SINT_PACK32;
 
 				if(isSigned)
 				{
@@ -2432,7 +2592,7 @@
 			}
 			break;
 		default:
-			UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
+			UNIMPLEMENTED("VkFormat: %d", int(targetFormat));
 		}
 	}
 
diff --git a/src/Reactor/Reactor.cpp b/src/Reactor/Reactor.cpp
index c4740a7..f9a0072 100644
--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp
@@ -3176,6 +3176,16 @@
 		return RValue<UInt2>(Nucleus::createNot(val.value));
 	}
 
+	RValue<UInt> Extract(RValue<UInt2> val, int i)
+	{
+		return RValue<UInt>(Nucleus::createExtractElement(val.value, UInt::getType(), i));
+	}
+
+	RValue<UInt2> Insert(RValue<UInt2> val, RValue<UInt> element, int i)
+	{
+		return RValue<UInt2>(Nucleus::createInsertElement(val.value, element.value, i));
+	}
+
 	Int4::Int4() : XYZW(this)
 	{
 	}
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index 6fbc061..5cf1be0 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -1399,6 +1399,8 @@
 //	RValue<Bool> operator==(RValue<UInt2> lhs, RValue<UInt2> rhs);
 
 //	RValue<UInt2> RoundInt(RValue<Float4> cast);
+	RValue<UInt> Extract(RValue<UInt2> val, int i);
+	RValue<UInt2> Insert(RValue<UInt2> val, RValue<UInt> element, int i);
 
 	template<class T>
 	struct Scalar;
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 4310aba..ba18fd2 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -2689,16 +2689,6 @@
 		return T(Type_v2i32);
 	}
 
-	RValue<UInt> Extract(RValue<UInt2> val, int i)
-	{
-		return RValue<UInt>(Nucleus::createExtractElement(val.value, UInt::getType(), i));
-	}
-
-	RValue<UInt2> Insert(RValue<UInt2> val, RValue<UInt> element, int i)
-	{
-		return RValue<UInt2>(Nucleus::createInsertElement(val.value, element.value, i));
-	}
-
 	RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
 	{
 		if(emulateIntrinsics)