R11G11B10F support

- Moved the R11G11B10F conversion code from the Blitter to
  ShaderCore
- Used the conversion code in PixelRoutine where appropriate
  to make R11G11B10F work as a renderable format
- Added the VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT and
  VK_FORMAT_FEATURE_BLIT_DST_BIT to the
  VK_FORMAT_B10G11R11_UFLOAT_PACK32 format

This allows ANGLE to expose the GL_EXT_color_buffer_float extension
(and GL_EXT_color_buffer_half_float, which depends on
GL_EXT_color_buffer_float in ANGLE, due to a driver issue).

Bug: b/146223877
Tests: dEQP-VK.*b10g11r11*
Change-Id: I04ad29f7b7d497705186ae290a05868abfc13c42
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/39550
Presubmit-Ready: Alexis Hétu <sugoi@google.com>
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index 0b4bdab..3f42166 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp
@@ -398,19 +398,7 @@
 		c.x = Float(*Pointer<Half>(element));
 		break;
 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
-		// 10 (or 11) bit float formats are unsigned formats with a 5 bit exponent and a 5 (or 6) bit mantissa.
-		// Since the Half float format also has a 5 bit exponent, we can convert these formats to half by
-		// copy/pasting the bits so the the exponent bits and top mantissa bits are aligned to the half format.
-		// In this case, we have:
-		//              B B B B B B B B B B G G G G G G G G G G G R R R R R R R R R R R
-		// 1st Short:                                  |xxxxxxxxxx---------------------|
-		// 2nd Short:                  |xxxx---------------------xxxxxx|
-		// 3rd Short: |--------------------xxxxxxxxxxxx|
-		// These memory reads overlap, but each of them contains an entire channel, so we can read this without
-		// any int -> short conversion.
-		c.x = Float(As<Half>((*Pointer<UShort>(element + 0) & UShort(0x07FF)) << UShort(4)));
-		c.y = Float(As<Half>((*Pointer<UShort>(element + 1) & UShort(0x3FF8)) << UShort(1)));
-		c.z = Float(As<Half>((*Pointer<UShort>(element + 2) & UShort(0xFFC0)) >> UShort(1)));
+		c = r11g11b10Unpack(*Pointer<UInt>(element));
 		break;
 	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
 		// This type contains a common 5 bit exponent (E) and a 9 bit the mantissa for R, G and B.
@@ -621,16 +609,7 @@
 		break;
 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
 		{
-			// 10 (or 11) bit float formats are unsigned formats with a 5 bit exponent and a 5 (or 6) bit mantissa.
-			// Since the 16-bit half-precision float format also has a 5 bit exponent, we can extract these minifloats from them.
-
-			// FIXME(b/138944025): Handle negative values, Inf, and NaN.
-			// FIXME(b/138944025): Perform rounding before truncating the mantissa.
-			UInt r = (UInt(As<UShort>(Half(c.x))) & 0x00007FF0) >> 4;
-			UInt g = (UInt(As<UShort>(Half(c.y))) & 0x00007FF0) << 7;
-			UInt b = (UInt(As<UShort>(Half(c.z))) & 0x00007FE0) << 17;
-
-			UInt rgb = r | g | b;
+			UInt rgb = r11g11b10Pack(c);
 
 			UInt old = *Pointer<UInt>(element);
 
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index 59f9cbb..81c137e 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -244,6 +244,7 @@
 		case VK_FORMAT_R16_SFLOAT:
 		case VK_FORMAT_R16G16_SFLOAT:
 		case VK_FORMAT_R16G16B16A16_SFLOAT:
+		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
 		case VK_FORMAT_R32_SFLOAT:
 		case VK_FORMAT_R32G32_SFLOAT:
 		case VK_FORMAT_R32G32B32A32_SFLOAT:
@@ -329,6 +330,7 @@
 		case VK_FORMAT_R16_SFLOAT:
 		case VK_FORMAT_R16G16_SFLOAT:
 		case VK_FORMAT_R16G16B16A16_SFLOAT:
+		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
 		case VK_FORMAT_R16_SINT:
 		case VK_FORMAT_R16G16_SINT:
 		case VK_FORMAT_R16G16B16A16_SINT:
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 13a9e0c..db57ae4 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -924,13 +924,13 @@
 {
 	Short4 c01;
 	Short4 c23;
-	Pointer<Byte> buffer;
+	Pointer<Byte> buffer = cBuffer;
 	Pointer<Byte> buffer2;
 
 	switch(state.targetFormat[index])
 	{
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
-		buffer = cBuffer + 2 * x;
+		buffer += 2 * x;
 		buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
 
@@ -948,7 +948,7 @@
 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
 		break;
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
-		buffer = cBuffer + 2 * x;
+		buffer += 2 * x;
 		buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
 
@@ -967,7 +967,7 @@
 		break;
 	case VK_FORMAT_B8G8R8A8_UNORM:
 	case VK_FORMAT_B8G8R8A8_SRGB:
-		buffer = cBuffer + 4 * x;
+		buffer += 4 * x;
 		c01 = *Pointer<Short4>(buffer);
 		buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
 		c23 = *Pointer<Short4>(buffer);
@@ -987,7 +987,7 @@
 		break;
 	case VK_FORMAT_R8G8B8A8_UNORM:
 	case VK_FORMAT_R8G8B8A8_SRGB:
-		buffer = cBuffer + 4 * x;
+		buffer += 4 * x;
 		c01 = *Pointer<Short4>(buffer);
 		buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
 		c23 = *Pointer<Short4>(buffer);
@@ -1006,7 +1006,7 @@
 		pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
 		break;
 	case VK_FORMAT_R8_UNORM:
-		buffer = cBuffer + 1 * x;
+		buffer += 1 * x;
 		pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
 		buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
 		pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
@@ -1016,7 +1016,7 @@
 		pixel.w = Short4(0xFFFFu);
 		break;
 	case VK_FORMAT_R8G8_UNORM:
-		buffer = cBuffer + 2 * x;
+		buffer += 2 * x;
 		c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
 		buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
 		c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
@@ -1026,7 +1026,6 @@
 		pixel.w = Short4(0xFFFFu);
 		break;
 	case VK_FORMAT_R16G16B16A16_UNORM:
-		buffer = cBuffer;
 		pixel.x = *Pointer<Short4>(buffer + 8 * x);
 		pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
 		buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
@@ -1035,7 +1034,6 @@
 		transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
 		break;
 	case VK_FORMAT_R16G16_UNORM:
-		buffer = cBuffer;
 		pixel.x = *Pointer<Short4>(buffer + 4 * x);
 		buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
 		pixel.y = *Pointer<Short4>(buffer + 4 * x);
@@ -1050,7 +1048,6 @@
 		break;
 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
 	{
-		buffer = cBuffer;
 		Int4 v = Int4(0);
 		v = Insert(v, *Pointer<Int>(buffer + 4 * x), 0);
 		v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 1);
@@ -1372,11 +1369,13 @@
 		xMask &= sMask;
 	}
 
+	Pointer<Byte> buffer = cBuffer;
+
 	switch(state.targetFormat[index])
 	{
 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 		{
-			Pointer<Byte> buffer = cBuffer + 2 * x;
+			buffer += 2 * x;
 			Int value = *Pointer<Int>(buffer);
 
 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants,mask5551Q[bgraWriteMask & 0xF][0]));
@@ -1403,7 +1402,7 @@
 		break;
 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 		{
-			Pointer<Byte> buffer = cBuffer + 2 * x;
+			buffer += 2 * x;
 			Int value = *Pointer<Int>(buffer);
 
 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
@@ -1431,7 +1430,7 @@
 	case VK_FORMAT_B8G8R8A8_UNORM:
 	case VK_FORMAT_B8G8R8A8_SRGB:
 		{
-			Pointer<Byte> buffer = cBuffer + x * 4;
+			buffer += x * 4;
 			Short4 value = *Pointer<Short4>(buffer);
 			Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
 
@@ -1458,7 +1457,7 @@
 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
 		{
-			Pointer<Byte> buffer = cBuffer + x * 4;
+			buffer += x * 4;
 			Short4 value = *Pointer<Short4>(buffer);
 			Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
 
@@ -1483,7 +1482,7 @@
 	case VK_FORMAT_R8G8_UNORM:
 		if((rgbaWriteMask & 0x00000003) != 0x0)
 		{
-			Pointer<Byte> buffer = cBuffer + 2 * x;
+			buffer += 2 * x;
 			Int2 value;
 			value = Insert(value, *Pointer<Int>(buffer), 0);
 			Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
@@ -1508,7 +1507,7 @@
 	case VK_FORMAT_R8_UNORM:
 		if(rgbaWriteMask & 0x00000001)
 		{
-			Pointer<Byte> buffer = cBuffer + 1 * x;
+			buffer += 1 * x;
 			Short4 value;
 			value = Insert(value, *Pointer<Short>(buffer), 0);
 			Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
@@ -1524,7 +1523,7 @@
 		break;
 	case VK_FORMAT_R16G16_UNORM:
 		{
-			Pointer<Byte> buffer = cBuffer + 4 * x;
+			buffer += 4 * x;
 
 			Short4 value = *Pointer<Short4>(buffer);
 
@@ -1561,7 +1560,7 @@
 		break;
 	case VK_FORMAT_R16G16B16A16_UNORM:
 		{
-			Pointer<Byte> buffer = cBuffer + 8 * x;
+			buffer += 8 * x;
 
 			{
 				Short4 value = *Pointer<Short4>(buffer);
@@ -1636,9 +1635,8 @@
 		break;
 		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
 		{
-			Pointer<Byte> buffer = cBuffer + 4 * x;
+			buffer += 4 * x;
 
-			buffer = cBuffer + 4 * x;
 			Int2 value = *Pointer<Int2>(buffer, 16);
 			Int2 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
 			if (rgbaWriteMask != 0xF)
@@ -1806,7 +1804,7 @@
 		return;
 	}
 
-	Pointer<Byte> buffer;
+	Pointer<Byte> buffer = cBuffer;
 
 	// pixel holds four texel color values.
 	// Note: Despite the type being Vector4f, the colors may be stored as
@@ -1834,7 +1832,6 @@
 	case VK_FORMAT_R32_SINT:
 	case VK_FORMAT_R32_UINT:
 	case VK_FORMAT_R32_SFLOAT:
-		buffer = cBuffer;
 		// FIXME: movlps
 		pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
 		pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
@@ -1847,7 +1844,6 @@
 	case VK_FORMAT_R32G32_SINT:
 	case VK_FORMAT_R32G32_UINT:
 	case VK_FORMAT_R32G32_SFLOAT:
-		buffer = cBuffer;
 		pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
 		buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
 		pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
@@ -1860,7 +1856,6 @@
 	case VK_FORMAT_R32G32B32A32_SFLOAT:
 	case VK_FORMAT_R32G32B32A32_SINT:
 	case VK_FORMAT_R32G32B32A32_UINT:
-		buffer = cBuffer;
 		pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
 		pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
 		buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
@@ -1869,7 +1864,6 @@
 		transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
 		break;
 	case VK_FORMAT_R16_SFLOAT:
-		buffer = cBuffer;
 		pixel.x.x = Float(*Pointer<Half>(buffer + 2 * x + 0));
 		pixel.x.y = Float(*Pointer<Half>(buffer + 2 * x + 2));
 		buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
@@ -1878,7 +1872,6 @@
 		pixel.y = pixel.z = pixel.w = one;
 		break;
 	case VK_FORMAT_R16G16_SFLOAT:
-		buffer = cBuffer;
 		pixel.x.x = Float(*Pointer<Half>(buffer + 4 * x + 0));
 		pixel.y.x = Float(*Pointer<Half>(buffer + 4 * x + 2));
 		pixel.x.y = Float(*Pointer<Half>(buffer + 4 * x + 4));
@@ -1891,7 +1884,6 @@
 		pixel.z = pixel.w = one;
 		break;
 	case VK_FORMAT_R16G16B16A16_SFLOAT:
-		buffer = cBuffer;
 		pixel.x.x = Float(*Pointer<Half>(buffer + 8 * x + 0x0));
 		pixel.y.x = Float(*Pointer<Half>(buffer + 8 * x + 0x2));
 		pixel.z.x = Float(*Pointer<Half>(buffer + 8 * x + 0x4));
@@ -1910,6 +1902,14 @@
 		pixel.z.w = Float(*Pointer<Half>(buffer + 8 * x + 0xc));
 		pixel.w.w = Float(*Pointer<Half>(buffer + 8 * x + 0xe));
 		break;
+	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+		pixel.x = r11g11b10Unpack(*Pointer<UInt>(buffer + 4 * x + 0));
+		pixel.y = r11g11b10Unpack(*Pointer<UInt>(buffer + 4 * x + 4));
+		buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+		pixel.z = r11g11b10Unpack(*Pointer<UInt>(buffer + 4 * x + 0));
+		pixel.w = r11g11b10Unpack(*Pointer<UInt>(buffer + 4 * x + 4));
+		transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
+		break;
 	default:
 		UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
 	}
@@ -2039,6 +2039,7 @@
 		oC.y = oC.z;
 		break;
 	case VK_FORMAT_R16G16B16A16_SFLOAT:
+	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
 	case VK_FORMAT_R32G32B32A32_SFLOAT:
 	case VK_FORMAT_R32G32B32A32_SINT:
 	case VK_FORMAT_R32G32B32A32_UINT:
@@ -2074,7 +2075,7 @@
 
 	auto targetFormat = state.targetFormat[index];
 
-	Pointer<Byte> buffer;
+	Pointer<Byte> buffer = cBuffer;
 	Float4 value;
 
 	switch(targetFormat)
@@ -2084,7 +2085,7 @@
 	case VK_FORMAT_R32_UINT:
 		if(rgbaWriteMask & 0x00000001)
 		{
-			buffer = cBuffer + 4 * x;
+			buffer += 4 * x;
 
 			// FIXME: movlps
 			value.x = *Pointer<Float>(buffer + 0);
@@ -2114,7 +2115,7 @@
 	case VK_FORMAT_R16_SFLOAT:
 		if(rgbaWriteMask & 0x00000001)
 		{
-			buffer = cBuffer + 2 * x;
+			buffer += 2 * x;
 
 			value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 0);
 			value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 1);
@@ -2141,7 +2142,7 @@
 	case VK_FORMAT_R16_UINT:
 		if(rgbaWriteMask & 0x00000001)
 		{
-			buffer = cBuffer + 2 * x;
+			buffer += 2 * x;
 
 			UShort4 xyzw;
 			xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
@@ -2189,7 +2190,7 @@
 	case VK_FORMAT_R8_UINT:
 		if(rgbaWriteMask & 0x00000001)
 		{
-			buffer = cBuffer + x;
+			buffer += x;
 
 			UInt xyzw, packedCol;
 
@@ -2219,7 +2220,7 @@
 	case VK_FORMAT_R32G32_SFLOAT:
 	case VK_FORMAT_R32G32_SINT:
 	case VK_FORMAT_R32G32_UINT:
-		buffer = cBuffer + 8 * x;
+		buffer += 8 * x;
 
 		value = *Pointer<Float4>(buffer);
 
@@ -2258,7 +2259,7 @@
 	case VK_FORMAT_R16G16_SFLOAT:
 		if((rgbaWriteMask & 0x00000003) != 0x0)
 		{
-			buffer = cBuffer + 4 * x;
+			buffer += 4 * x;
 
 			UInt2 rgbaMask;
 			UInt2 packedCol;
@@ -2292,7 +2293,7 @@
 	case VK_FORMAT_R16G16_UINT:
 		if((rgbaWriteMask & 0x00000003) != 0x0)
 		{
-			buffer = cBuffer + 4 * x;
+			buffer += 4 * x;
 
 			UInt2 rgbaMask;
 			UShort4 packedCol = UShort4(As<Int4>(oC.x));
@@ -2322,7 +2323,7 @@
 	case VK_FORMAT_R8G8_UINT:
 		if((rgbaWriteMask & 0x00000003) != 0x0)
 		{
-			buffer = cBuffer + 2 * x;
+			buffer += 2 * x;
 
 			Int2 xyzw, packedCol;
 
@@ -2357,7 +2358,7 @@
 	case VK_FORMAT_R32G32B32A32_SFLOAT:
 	case VK_FORMAT_R32G32B32A32_SINT:
 	case VK_FORMAT_R32G32B32A32_UINT:
-		buffer = cBuffer + 16 * x;
+		buffer += 16 * x;
 
 		{
 			value = *Pointer<Float4>(buffer, 16);
@@ -2432,7 +2433,7 @@
 	case VK_FORMAT_R16G16B16A16_SFLOAT:
 		if((rgbaWriteMask & 0x0000000F) != 0x0)
 		{
-			buffer = cBuffer + 8 * x;
+			buffer += 8 * x;
 
 			UInt4 rgbaMask;
 			UInt4 value = *Pointer<UInt4>(buffer);
@@ -2465,11 +2466,31 @@
 			*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
 		}
 		break;
+	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+		if((rgbaWriteMask & 0x7) != 0x0)
+		{
+			buffer += 4 * x;
+
+			unsigned int mask = ((rgbaWriteMask & 0x1) ? 0x000007FF : 0) |
+								((rgbaWriteMask & 0x2) ? 0x003FF800 : 0) |
+								((rgbaWriteMask & 0x4) ? 0xFFC00000 : 0);
+			UInt2 mergedMask(mask, mask);
+
+			UInt2 value;
+			value = Insert(value, r11g11b10Pack(oC.x), 0);
+			value = Insert(value, r11g11b10Pack(oC.y), 1);
+			*Pointer<UInt2>(buffer) = (value & mergedMask) | ((*Pointer<UInt2>(buffer)) & ~mergedMask);
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			value = Insert(value, r11g11b10Pack(oC.z), 0);
+			value = Insert(value, r11g11b10Pack(oC.w), 1);
+			*Pointer<UInt2>(buffer) = (value & mergedMask) | ((*Pointer<UInt2>(buffer)) & ~mergedMask);
+		}
+		break;
 	case VK_FORMAT_R16G16B16A16_SINT:
 	case VK_FORMAT_R16G16B16A16_UINT:
 		if((rgbaWriteMask & 0x0000000F) != 0x0)
 		{
-			buffer = cBuffer + 8 * x;
+			buffer += 8 * x;
 
 			UInt4 rgbaMask;
 			UShort8 value = *Pointer<UShort8>(buffer);
@@ -2503,7 +2524,7 @@
 		{
 			UInt2 value, packedCol, mergedMask;
 
-			buffer = cBuffer + 4 * x;
+			buffer += 4 * x;
 
 			bool isSigned = targetFormat == VK_FORMAT_R8G8B8A8_SINT || targetFormat == VK_FORMAT_A8B8G8R8_SINT_PACK32;
 
@@ -2551,7 +2572,7 @@
 					((As<Int4>(oC.y) & Int4(0x3ff)) << 10) |
 					((As<Int4>(oC.x) & Int4(0x3ff)));
 
-			buffer = cBuffer + 4 * x;
+			buffer += 4 * x;
 			value = *Pointer<Int2>(buffer, 16);
 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
 			if (rgbaWriteMask != 0xF)
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index 0f23096..10502fd 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -546,27 +546,72 @@
 	}
 }
 
-UInt4 halfToFloatBits(UInt4 halfBits)
+SIMD::UInt halfToFloatBits(SIMD::UInt halfBits)
 {
-	auto magic = UInt4(126 << 23);
+	auto magic = SIMD::UInt(126 << 23);
 
-	auto sign16 = halfBits & UInt4(0x8000);
-	auto man16  = halfBits & UInt4(0x3FF);
-	auto exp16  = halfBits & UInt4(0x7C00);
+	auto sign16 = halfBits & SIMD::UInt(0x8000);
+	auto man16  = halfBits & SIMD::UInt(0x03FF);
+	auto exp16  = halfBits & SIMD::UInt(0x7C00);
 
-	auto isDnormOrZero = CmpEQ(exp16, UInt4(0));
-	auto isInfOrNaN = CmpEQ(exp16, UInt4(0x7C00));
+	auto isDnormOrZero = CmpEQ(exp16, SIMD::UInt(0));
+	auto isInfOrNaN = CmpEQ(exp16, SIMD::UInt(0x7C00));
 
 	auto sign32 = sign16 << 16;
 	auto man32  = man16 << 13;
-	auto exp32  = (exp16 + UInt4(0x1C000)) << 13;
-	auto norm32 = (man32 | exp32) | (isInfOrNaN & UInt4(0x7F800000));
+	auto exp32  = (exp16 + SIMD::UInt(0x1C000)) << 13;
+	auto norm32 = (man32 | exp32) | (isInfOrNaN & SIMD::UInt(0x7F800000));
 
-	auto denorm32 = As<UInt4>(As<Float4>(magic + man16) - As<Float4>(magic));
+	auto denorm32 = As<SIMD::UInt>(As<SIMD::Float>(magic + man16) - As<SIMD::Float>(magic));
 
 	return sign32 | (norm32 & ~isDnormOrZero) | (denorm32 & isDnormOrZero);
 }
 
+SIMD::UInt floatToHalfBits(SIMD::UInt floatBits, bool storeInUpperBits)
+{
+	static const uint32_t mask_sign = 0x80000000u;
+	static const uint32_t mask_round = ~0xfffu;
+	static const uint32_t c_f32infty = 255 << 23;
+	static const uint32_t c_magic = 15 << 23;
+	static const uint32_t c_nanbit = 0x200;
+	static const uint32_t c_infty_as_fp16 = 0x7c00;
+	static const uint32_t c_clamp = (31 << 23) - 0x1000;
+
+	SIMD::UInt justsign = SIMD::UInt(mask_sign) & floatBits;
+	SIMD::UInt absf = floatBits ^ justsign;
+	SIMD::UInt b_isnormal = CmpNLE(SIMD::UInt(c_f32infty), absf);
+
+	// Note: this version doesn't round to the nearest even in case of a tie as defined by IEEE 754-2008, it rounds to +inf
+	//       instead of nearest even, since that's fine for GLSL ES 3.0's needs (see section 2.1.1 Floating-Point Computation)
+	SIMD::UInt joined = ((((As<SIMD::UInt>(Min(As<SIMD::Float>(absf & SIMD::UInt(mask_round)) * As<SIMD::Float>(SIMD::UInt(c_magic)),
+	                                           As<SIMD::Float>(SIMD::UInt(c_clamp))))) - SIMD::UInt(mask_round)) >> 13) & b_isnormal) |
+	                    ((b_isnormal ^ SIMD::UInt(0xFFFFFFFF)) &
+	                     ((CmpNLE(absf, SIMD::UInt(c_f32infty)) & SIMD::UInt(c_nanbit)) | SIMD::UInt(c_infty_as_fp16)));
+
+	return storeInUpperBits ? ((joined << 16) | justsign) : joined | (justsign >> 16);
+}
+
+sw::SIMD::Float r11g11b10Unpack(UInt r11g11b10bits)
+{
+	// 10 (or 11) bit float formats are unsigned formats with a 5 bit exponent and a 5 (or 6) bit mantissa.
+	// Since the Half float format also has a 5 bit exponent, we can convert these formats to half by
+	// copy/pasting the bits so the exponent bits and top mantissa bits are aligned to the half format.
+	// In this case, we have:
+	// MSB | B B B B B B B B B B G G G G G G G G G G G R R R R R R R R R R R | LSB
+	SIMD::UInt halfBits;
+	halfBits = Insert(halfBits, (r11g11b10bits & UInt(0x000007FFu)) << 4, 0);
+	halfBits = Insert(halfBits, (r11g11b10bits & UInt(0x003FF800u)) >> 7, 1);
+	halfBits = Insert(halfBits, (r11g11b10bits & UInt(0xFFC00000u)) >> 17, 2);
+	halfBits = Insert(halfBits, UInt(0x00003C00u), 3);
+	return As<sw::SIMD::Float>(halfToFloatBits(halfBits));
+}
+
+UInt r11g11b10Pack(sw::SIMD::Float &value)
+{
+	SIMD::UInt halfBits = floatToHalfBits(As<SIMD::UInt>(value), true) &
+	                      SIMD::UInt(0x7FF00000, 0x7FF00000, 0x7FE00000, 0);
+	return (UInt(halfBits.x) >> 20)  | (UInt(halfBits.y) >> 9) | (UInt(halfBits.z) << 1);
+}
 
 rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints)
 {
diff --git a/src/Pipeline/ShaderCore.hpp b/src/Pipeline/ShaderCore.hpp
index 52522a5..fb056ff 100644
--- a/src/Pipeline/ShaderCore.hpp
+++ b/src/Pipeline/ShaderCore.hpp
@@ -190,7 +190,10 @@
 void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
 void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);
 
-UInt4 halfToFloatBits(UInt4 halfBits);
+sw::SIMD::UInt halfToFloatBits(sw::SIMD::UInt halfBits);
+sw::SIMD::UInt floatToHalfBits(sw::SIMD::UInt floatBits, bool storeInUpperBits);
+sw::SIMD::Float r11g11b10Unpack(UInt r11g11b10bits);
+UInt r11g11b10Pack(sw::SIMD::Float &value);
 
 rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints);
 
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index 1876dec..154ffd7 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -1087,8 +1087,6 @@
 	// Helper as we often need to take dot products as part of doing other things.
 	SIMD::Float Dot(unsigned numComponents, GenericValue const & x, GenericValue const & y) const;
 
-	SIMD::UInt FloatToHalfBits(SIMD::UInt floatBits, bool storeInUpperBits) const;
-
 	// Splits x into a floating-point significand in the range [0.5, 1.0)
 	// and an integral exponent of two, such that:
 	//   x = significand * 2^exponent
diff --git a/src/Pipeline/SpirvShaderArithmetic.cpp b/src/Pipeline/SpirvShaderArithmetic.cpp
index 947a109..5d9b980 100644
--- a/src/Pipeline/SpirvShaderArithmetic.cpp
+++ b/src/Pipeline/SpirvShaderArithmetic.cpp
@@ -544,30 +544,6 @@
 	return d;
 }
 
-SIMD::UInt SpirvShader::FloatToHalfBits(SIMD::UInt floatBits, bool storeInUpperBits) const
-{
-	static const uint32_t mask_sign = 0x80000000u;
-	static const uint32_t mask_round = ~0xfffu;
-	static const uint32_t c_f32infty = 255 << 23;
-	static const uint32_t c_magic = 15 << 23;
-	static const uint32_t c_nanbit = 0x200;
-	static const uint32_t c_infty_as_fp16 = 0x7c00;
-	static const uint32_t c_clamp = (31 << 23) - 0x1000;
-
-	SIMD::UInt justsign = SIMD::UInt(mask_sign) & floatBits;
-	SIMD::UInt absf = floatBits ^ justsign;
-	SIMD::UInt b_isnormal = CmpNLE(SIMD::UInt(c_f32infty), absf);
-
-	// Note: this version doesn't round to the nearest even in case of a tie as defined by IEEE 754-2008, it rounds to +inf
-	//       instead of nearest even, since that's fine for GLSL ES 3.0's needs (see section 2.1.1 Floating-Point Computation)
-	SIMD::UInt joined = ((((As<SIMD::UInt>(Min(As<SIMD::Float>(absf & SIMD::UInt(mask_round)) * As<SIMD::Float>(SIMD::UInt(c_magic)),
-										As<SIMD::Float>(SIMD::UInt(c_clamp))))) - SIMD::UInt(mask_round)) >> 13) & b_isnormal) |
-					((b_isnormal ^ SIMD::UInt(0xFFFFFFFF)) & ((CmpNLE(absf, SIMD::UInt(c_f32infty)) & SIMD::UInt(c_nanbit)) |
-														SIMD::UInt(c_infty_as_fp16)));
-
-	return storeInUpperBits ? ((joined << 16) | justsign) : joined | (justsign >> 16);
-}
-
 std::pair<SIMD::Float, SIMD::Int> SpirvShader::Frexp(RValue<SIMD::Float> val) const
 {
 	// Assumes IEEE 754
diff --git a/src/Pipeline/SpirvShaderGLSLstd450.cpp b/src/Pipeline/SpirvShaderGLSLstd450.cpp
index f9485ca..858765f 100644
--- a/src/Pipeline/SpirvShaderGLSLstd450.cpp
+++ b/src/Pipeline/SpirvShaderGLSLstd450.cpp
@@ -431,7 +431,7 @@
 	case GLSLstd450PackHalf2x16:
 	{
 		auto val = GenericValue(this, state, insn.word(5));
-		dst.move(0, FloatToHalfBits(val.UInt(0), false) | FloatToHalfBits(val.UInt(1), true));
+		dst.move(0, floatToHalfBits(val.UInt(0), false) | floatToHalfBits(val.UInt(1), true));
 		break;
 	}
 	case GLSLstd450UnpackSnorm4x8:
diff --git a/src/Pipeline/SpirvShaderImage.cpp b/src/Pipeline/SpirvShaderImage.cpp
index c64b317..f2828f8 100644
--- a/src/Pipeline/SpirvShaderImage.cpp
+++ b/src/Pipeline/SpirvShaderImage.cpp
@@ -900,8 +900,8 @@
 		break;
 	case spv::ImageFormatRgba16f:
 		texelSize = 8;
-		packed[0] = FloatToHalfBits(texel.UInt(0), false) | FloatToHalfBits(texel.UInt(1), true);
-		packed[1] = FloatToHalfBits(texel.UInt(2), false) | FloatToHalfBits(texel.UInt(3), true);
+		packed[0] = floatToHalfBits(texel.UInt(0), false) | floatToHalfBits(texel.UInt(1), true);
+		packed[1] = floatToHalfBits(texel.UInt(2), false) | floatToHalfBits(texel.UInt(3), true);
 		numPackedElements = 2;
 		break;
 	case spv::ImageFormatRgba16i:
diff --git a/src/Vulkan/VkPhysicalDevice.cpp b/src/Vulkan/VkPhysicalDevice.cpp
index b60e56e..61d3bf7 100644
--- a/src/Vulkan/VkPhysicalDevice.cpp
+++ b/src/Vulkan/VkPhysicalDevice.cpp
@@ -630,6 +630,7 @@
 	case VK_FORMAT_R32G32B32A32_UINT:
 	case VK_FORMAT_R32G32B32A32_SINT:
 	case VK_FORMAT_R32G32B32A32_SFLOAT:
+	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
 		pFormatProperties->optimalTilingFeatures |=
 			VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
 			VK_FORMAT_FEATURE_BLIT_DST_BIT;