Implement fp16 texture formats
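
Add sampling and shader image read support for VK_FORMAT_R16_SFLOAT,
VK_FORMAT_R16G16_SFLOAT and VK_FORMAT_R16G16B16A16_SFLOAT, teach
VkFormat about the new formats, and move the half-to-float bit
conversion out of SpirvShader into ShaderCore so SamplerCore can share
it.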

TODO: Figure out what is happening with linear filtering. This may be pre-existing breakage: enabling filtering support for R32_SFLOAT fails in the same way.
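
The conversion itself is the standard branch-free bit trick: shift the
half's exponent/mantissa field into float position, multiply by 2^112
to rebase the exponent (which also normalizes half denormals), then
patch the sign and Inf/NaN cases back in. A scalar sketch of the same
math for illustration (halfToFloatBitsScalar is not part of this
change):

    #include <cstdint>
    #include <cstring>

    // Scalar sketch of ShaderCore's halfToFloatBits(), illustration only.
    uint32_t halfToFloatBitsScalar(uint32_t halfBits)
    {
        const uint32_t mask_nosign = 0x7FFF;      // everything but the sign bit
        const uint32_t magic = (254 - 15) << 23;  // 2^112 as float bits
        const uint32_t was_infnan = 0x7BFF;       // largest finite half, sign stripped
        const uint32_t exp_infnan = 255 << 23;    // float Inf/NaN exponent field

        uint32_t expmant = halfBits & mask_nosign;
        uint32_t shifted = expmant << 13;         // exponent/mantissa in float position

        float f, scale;
        std::memcpy(&f, &shifted, 4);
        std::memcpy(&scale, &magic, 4);
        f *= scale;                               // rebase the exponent; denormals normalize

        uint32_t result;
        std::memcpy(&result, &f, 4);
        result |= (halfBits ^ expmant) << 16;     // reattach the sign bit
        if (expmant > was_infnan)
            result |= exp_infnan;                 // keep Inf/NaN as Inf/NaN
        return result;
    }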

Test: dEQP-VK.texture.*
Test: dEQP-VK.image.*
Test: dEQP-VK.pipeline.*
Change-Id: Ia461418d772eb5aceb101b84eaa239b0c0bce2c0
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/30288
Tested-by: Chris Forbes <chrisforbes@google.com>
Presubmit-Ready: Chris Forbes <chrisforbes@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index c29a185..e1c6485 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -195,11 +195,14 @@
 				if(componentCount < 4) c.w = Float4(1.0f);
 				break;
 			case VK_FORMAT_R32_SFLOAT:
+			case VK_FORMAT_R16_SFLOAT:
 				c.y = Float4(0.0f);
 			case VK_FORMAT_R32G32_SFLOAT:
+			case VK_FORMAT_R16G16_SFLOAT:
 				c.z = Float4(0.0f);
 				c.w = Float4(1.0f);
 			case VK_FORMAT_R32G32B32A32_SFLOAT:
+			case VK_FORMAT_R16G16B16A16_SFLOAT:
 				break;
 			default:
 				ASSERT(false);
@@ -1830,42 +1833,101 @@
 			int f2 = state.textureType == TEXTURE_CUBE ? 2 : 0;
 			int f3 = state.textureType == TEXTURE_CUBE ? 3 : 0;
 
-			// Read texels
-			switch(textureComponentCount())
+			if(has16bitTextureComponents())
 			{
-			case 4:
-				c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
-				c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
-				c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
-				c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
-				transpose4x4(c.x, c.y, c.z, c.w);
-				break;
-			case 3:
-				c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
-				c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
-				c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
-				c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
-				transpose4x3(c.x, c.y, c.z, c.w);
-				break;
-			case 2:
-				// FIXME: Optimal shuffling?
-				c.x.xy = *Pointer<Float4>(buffer[f0] + index[0] * 8);
-				c.x.zw = *Pointer<Float4>(buffer[f1] + index[1] * 8 - 8);
-				c.z.xy = *Pointer<Float4>(buffer[f2] + index[2] * 8);
-				c.z.zw = *Pointer<Float4>(buffer[f3] + index[3] * 8 - 8);
-				c.y = c.x;
-				c.x = Float4(c.x.xz, c.z.xz);
-				c.y = Float4(c.y.yw, c.z.yw);
-				break;
-			case 1:
-				// FIXME: Optimal shuffling?
-				c.x.x = *Pointer<Float>(buffer[f0] + index[0] * 4);
-				c.x.y = *Pointer<Float>(buffer[f1] + index[1] * 4);
-				c.x.z = *Pointer<Float>(buffer[f2] + index[2] * 4);
-				c.x.w = *Pointer<Float>(buffer[f3] + index[3] * 4);
-				break;
-			default:
-				ASSERT(false);
+				switch(textureComponentCount())
+				{
+				case 4:
+				{
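+					// Four halfs (8 bytes) per texel: zero-extend to 32-bit lanes,
+					// expand the bits to float, then transpose AoS to SoA.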
+					UInt4 t0 = Int4(*Pointer<UShort4>(buffer[f0] + index[0] * 8));
+					UInt4 t1 = Int4(*Pointer<UShort4>(buffer[f1] + index[1] * 8));
+					UInt4 t2 = Int4(*Pointer<UShort4>(buffer[f2] + index[2] * 8));
+					UInt4 t3 = Int4(*Pointer<UShort4>(buffer[f3] + index[3] * 8));
+
+					c.x = As<Float4>(halfToFloatBits(t0));
+					c.y = As<Float4>(halfToFloatBits(t1));
+					c.z = As<Float4>(halfToFloatBits(t2));
+					c.w = As<Float4>(halfToFloatBits(t3));
+					transpose4x4(c.x, c.y, c.z, c.w);
+					break;
+				}
+				case 2:
+				{
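+					// Two halfs (4 bytes) per texel: each 8-byte load spans two
+					// texels, so only x and y are meaningful after the transpose.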
+					UInt4 t0 = Int4(*Pointer<UShort4>(buffer[f0] + index[0] * 4));
+					UInt4 t1 = Int4(*Pointer<UShort4>(buffer[f1] + index[1] * 4));
+					UInt4 t2 = Int4(*Pointer<UShort4>(buffer[f2] + index[2] * 4));
+					UInt4 t3 = Int4(*Pointer<UShort4>(buffer[f3] + index[3] * 4));
+
+					// FIXME: Use explicit shuffles instead of a full transpose?
+					c.x = As<Float4>(halfToFloatBits(t0));
+					c.y = As<Float4>(halfToFloatBits(t1));
+					c.z = As<Float4>(halfToFloatBits(t2));
+					c.w = As<Float4>(halfToFloatBits(t3));
+					transpose4x4(c.x, c.y, c.z, c.w);
+					break;
+				}
+				case 1:
+				{
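+					// One half (2 bytes) per texel: convert each load and keep
+					// lane 0, which holds the addressed texel.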
+					UInt4 t0 = Int4(*Pointer<UShort4>(buffer[f0] + index[0] * 2));
+					UInt4 t1 = Int4(*Pointer<UShort4>(buffer[f1] + index[1] * 2));
+					UInt4 t2 = Int4(*Pointer<UShort4>(buffer[f2] + index[2] * 2));
+					UInt4 t3 = Int4(*Pointer<UShort4>(buffer[f3] + index[3] * 2));
+
+					c.x.x = Extract(As<Float4>(halfToFloatBits(t0)), 0);
+					c.x.y = Extract(As<Float4>(halfToFloatBits(t1)), 0);
+					c.x.z = Extract(As<Float4>(halfToFloatBits(t2)), 0);
+					c.x.w = Extract(As<Float4>(halfToFloatBits(t3)), 0);
+					break;
+				}
+				default:
+					UNIMPLEMENTED("fp16 sampling %d components", textureComponentCount());
+				}
+			}
+			else
+			{
+				// Read texels
+				switch(textureComponentCount())
+				{
+				case 4:
+					c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
+					c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
+					c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
+					c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
+					transpose4x4(c.x, c.y, c.z, c.w);
+					break;
+				case 3:
+					c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
+					c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
+					c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
+					c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
+					transpose4x3(c.x, c.y, c.z, c.w);
+					break;
+				case 2:
+					// FIXME: Optimal shuffling?
+					c.x.xy = *Pointer<Float4>(buffer[f0] + index[0] * 8);
+					c.x.zw = *Pointer<Float4>(buffer[f1] + index[1] * 8 - 8);
+					c.z.xy = *Pointer<Float4>(buffer[f2] + index[2] * 8);
+					c.z.zw = *Pointer<Float4>(buffer[f3] + index[3] * 8 - 8);
+					c.y = c.x;
+					c.x = Float4(c.x.xz, c.z.xz);
+					c.y = Float4(c.y.yw, c.z.yw);
+					break;
+				case 1:
+					// FIXME: Optimal shuffling?
+					c.x.x = *Pointer<Float>(buffer[f0] + index[0] * 4);
+					c.x.y = *Pointer<Float>(buffer[f1] + index[1] * 4);
+					c.x.z = *Pointer<Float>(buffer[f2] + index[2] * 4);
+					c.x.w = *Pointer<Float>(buffer[f3] + index[3] * 4);
+					break;
+				default:
+					ASSERT(false);
+				}
 			}
 
 			if(state.compare != COMPARE_BYPASS)
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index e2ebf83..4f7886f 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -559,4 +559,17 @@
 		case 4: transpose4x4(row0, row1, row2, row3); break;
 		}
 	}
+
+	UInt4 halfToFloatBits(UInt4 halfBits)
+	{
+		static const uint32_t mask_nosign = 0x7FFF;       // everything but the sign bit
+		static const uint32_t magic = (254 - 15) << 23;   // 2^112: rebases the exponent from half bias (15) to float bias (127)
+		static const uint32_t was_infnan = 0x7BFF;        // largest finite half value, sign stripped
+		static const uint32_t exp_infnan = 255 << 23;     // float exponent field for Inf/NaN
+
+		UInt4 expmant = halfBits & UInt4(mask_nosign);
+		return As<UInt4>(As<Float4>(expmant << 13) * As<Float4>(UInt4(magic))) |
+		       ((halfBits ^ UInt4(expmant)) << 16) |
+		       (CmpNLE(As<UInt4>(expmant), UInt4(was_infnan)) & UInt4(exp_infnan));
+	}
 }
diff --git a/src/Pipeline/ShaderCore.hpp b/src/Pipeline/ShaderCore.hpp
index 3f0ad27..e1da028 100644
--- a/src/Pipeline/ShaderCore.hpp
+++ b/src/Pipeline/ShaderCore.hpp
@@ -90,6 +90,8 @@
 	void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
 	void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
 	void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);
+
+	UInt4 halfToFloatBits(UInt4 halfBits);
 }
 
 #endif   // sw_ShaderCore_hpp
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index 0ed371a..11cf404 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -3818,8 +3818,8 @@
 		case GLSLstd450UnpackHalf2x16:
 		{
 			auto val = GenericValue(this, routine, insn.word(5));
-			dst.move(0, HalfToFloatBits(val.UInt(0) & SIMD::UInt(0x0000FFFF)));
-			dst.move(1, HalfToFloatBits((val.UInt(0) & SIMD::UInt(0xFFFF0000)) >> 16));
+			dst.move(0, halfToFloatBits(val.UInt(0) & SIMD::UInt(0x0000FFFF)));
+			dst.move(1, halfToFloatBits((val.UInt(0) & SIMD::UInt(0xFFFF0000)) >> 16));
 			break;
 		}
 		case GLSLstd450Fma:
@@ -4325,19 +4325,6 @@
 		return storeInUpperBits ? ((joined << 16) | justsign) : joined | (justsign >> 16);
 	}
 
-	SIMD::UInt SpirvShader::HalfToFloatBits(SIMD::UInt halfBits) const
-	{
-		static const uint32_t mask_nosign = 0x7FFF;
-		static const uint32_t magic = (254 - 15) << 23;
-		static const uint32_t was_infnan = 0x7BFF;
-		static const uint32_t exp_infnan = 255 << 23;
-
-		SIMD::UInt expmant = halfBits & SIMD::UInt(mask_nosign);
-		return As<SIMD::UInt>(As<SIMD::Float>(expmant << 13) * As<SIMD::Float>(SIMD::UInt(magic))) |
-						 ((halfBits ^ SIMD::UInt(expmant)) << 16) |
-						 (CmpNLE(As<SIMD::UInt>(expmant), SIMD::UInt(was_infnan)) & SIMD::UInt(exp_infnan));
-	}
-
 	std::pair<SIMD::Float, SIMD::Int> SpirvShader::Frexp(RValue<SIMD::Float> val) const
 	{
 		// Assumes IEEE 754
@@ -4868,10 +4855,10 @@
 			dst.move(3, (packed[1] >> 16) & SIMD::Int(0xffff));
 			break;
 		case VK_FORMAT_R16G16B16A16_SFLOAT:
-			dst.move(0, HalfToFloatBits(As<SIMD::UInt>(packed[0]) & SIMD::UInt(0x0000FFFF)));
-			dst.move(1, HalfToFloatBits((As<SIMD::UInt>(packed[0]) & SIMD::UInt(0xFFFF0000)) >> 16));
-			dst.move(2, HalfToFloatBits(As<SIMD::UInt>(packed[1]) & SIMD::UInt(0x0000FFFF)));
-			dst.move(3, HalfToFloatBits((As<SIMD::UInt>(packed[1]) & SIMD::UInt(0xFFFF0000)) >> 16));
+			dst.move(0, halfToFloatBits(As<SIMD::UInt>(packed[0]) & SIMD::UInt(0x0000FFFF)));
+			dst.move(1, halfToFloatBits((As<SIMD::UInt>(packed[0]) & SIMD::UInt(0xFFFF0000)) >> 16));
+			dst.move(2, halfToFloatBits(As<SIMD::UInt>(packed[1]) & SIMD::UInt(0x0000FFFF)));
+			dst.move(3, halfToFloatBits((As<SIMD::UInt>(packed[1]) & SIMD::UInt(0xFFFF0000)) >> 16));
 			break;
 		case VK_FORMAT_R8G8B8A8_SNORM:
 			dst.move(0, Min(Max(SIMD::Float(((packed[0]<<24) & SIMD::Int(0xFF000000))) * SIMD::Float(1.0f / float(0x7f000000)), SIMD::Float(-1.0f)), SIMD::Float(1.0f)));
@@ -4956,7 +4943,7 @@
 			dst.move(3, SIMD::Int(1));
 			break;
 		case VK_FORMAT_R16_SFLOAT:
-			dst.move(0, HalfToFloatBits(As<SIMD::UInt>(packed[0]) & SIMD::UInt(0x0000FFFF)));
+			dst.move(0, halfToFloatBits(As<SIMD::UInt>(packed[0]) & SIMD::UInt(0x0000FFFF)));
 			dst.move(1, SIMD::Float(0));
 			dst.move(2, SIMD::Float(0));
 			dst.move(3, SIMD::Float(1));
@@ -4974,8 +4961,8 @@
 			dst.move(3, SIMD::Int(1));
 			break;
 		case VK_FORMAT_R16G16_SFLOAT:
-			dst.move(0, HalfToFloatBits(As<SIMD::UInt>(packed[0]) & SIMD::UInt(0x0000FFFF)));
-			dst.move(1, HalfToFloatBits((As<SIMD::UInt>(packed[0]) & SIMD::UInt(0xFFFF0000)) >> 16));
+			dst.move(0, halfToFloatBits(As<SIMD::UInt>(packed[0]) & SIMD::UInt(0x0000FFFF)));
+			dst.move(1, halfToFloatBits((As<SIMD::UInt>(packed[0]) & SIMD::UInt(0xFFFF0000)) >> 16));
 			dst.move(2, SIMD::Float(0));
 			dst.move(3, SIMD::Float(1));
 			break;
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index 29372f3..99b70e6 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -918,7 +918,6 @@
 		SIMD::Float Dot(unsigned numComponents, GenericValue const & x, GenericValue const & y) const;
 
 		SIMD::UInt FloatToHalfBits(SIMD::UInt floatBits, bool storeInUpperBits) const;
-		SIMD::UInt HalfToFloatBits(SIMD::UInt halfBits) const;
 
 		// Splits x into a floating-point significand in the range [0.5, 1.0)
 		// and an integral exponent of two, such that:
diff --git a/src/Vulkan/VkFormat.cpp b/src/Vulkan/VkFormat.cpp
index 325b61a..80624a6 100644
--- a/src/Vulkan/VkFormat.cpp
+++ b/src/Vulkan/VkFormat.cpp
@@ -1727,10 +1727,13 @@
 	case VK_FORMAT_R16G16B16A16_UNORM:
 	case VK_FORMAT_R16_SINT:
 	case VK_FORMAT_R16_UINT:
+	case VK_FORMAT_R16_SFLOAT:
 	case VK_FORMAT_R16G16_SINT:
 	case VK_FORMAT_R16G16_UINT:
+	case VK_FORMAT_R16G16_SFLOAT:
 	case VK_FORMAT_R16G16B16A16_SINT:
 	case VK_FORMAT_R16G16B16A16_UINT:
+	case VK_FORMAT_R16G16B16A16_SFLOAT:
 	case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
 		return false;
 	default:
@@ -1773,10 +1776,13 @@
 	case VK_FORMAT_R32G32B32A32_UINT:
 	case VK_FORMAT_R16_SINT:
 	case VK_FORMAT_R16_UINT:
+	case VK_FORMAT_R16_SFLOAT:
 	case VK_FORMAT_R16G16_SINT:
 	case VK_FORMAT_R16G16_UINT:
+	case VK_FORMAT_R16G16_SFLOAT:
 	case VK_FORMAT_R16G16B16A16_SINT:
 	case VK_FORMAT_R16G16B16A16_UINT:
+	case VK_FORMAT_R16G16B16A16_SFLOAT:
 	case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
 		return false;
 	default:
@@ -1820,10 +1826,13 @@
 	case VK_FORMAT_R16G16B16A16_UNORM:
 	case VK_FORMAT_R16_SINT:
 	case VK_FORMAT_R16_UINT:
+	case VK_FORMAT_R16_SFLOAT:
 	case VK_FORMAT_R16G16_SINT:
 	case VK_FORMAT_R16G16_UINT:
+	case VK_FORMAT_R16G16_SFLOAT:
 	case VK_FORMAT_R16G16B16A16_SINT:
 	case VK_FORMAT_R16G16B16A16_UINT:
+	case VK_FORMAT_R16G16B16A16_SFLOAT:
 		return true;
 	default:
 		UNIMPLEMENTED("Format: %d", int(format));
@@ -1854,10 +1863,13 @@
 	case VK_FORMAT_R16G16B16A16_UNORM:
 	case VK_FORMAT_R16_SINT:
 	case VK_FORMAT_R16_UINT:
+	case VK_FORMAT_R16_SFLOAT:
 	case VK_FORMAT_R16G16_SINT:
 	case VK_FORMAT_R16G16_UINT:
+	case VK_FORMAT_R16G16_SFLOAT:
 	case VK_FORMAT_R16G16B16A16_SINT:
 	case VK_FORMAT_R16G16B16A16_UINT:
+	case VK_FORMAT_R16G16B16A16_SFLOAT:
 	case VK_FORMAT_R32_SFLOAT:
 	case VK_FORMAT_R32G32_SFLOAT:
 	case VK_FORMAT_R32G32B32A32_SFLOAT:
@@ -1912,10 +1924,13 @@
 	case VK_FORMAT_R16G16B16A16_UNORM:
 	case VK_FORMAT_R16_SINT:
 	case VK_FORMAT_R16_UINT:
+	case VK_FORMAT_R16_SFLOAT:
 	case VK_FORMAT_R16G16_SINT:
 	case VK_FORMAT_R16G16_UINT:
+	case VK_FORMAT_R16G16_SFLOAT:
 	case VK_FORMAT_R16G16B16A16_SINT:
 	case VK_FORMAT_R16G16B16A16_UINT:
+	case VK_FORMAT_R16G16B16A16_SFLOAT:
 		return false;
 	default:
 		UNIMPLEMENTED("Format: %d", int(format));
@@ -1934,6 +1949,7 @@
 	case VK_FORMAT_R8_UINT:
 	case VK_FORMAT_R16_SINT:
 	case VK_FORMAT_R16_UINT:
+	case VK_FORMAT_R16_SFLOAT:
 	case VK_FORMAT_R32_SINT:
 	case VK_FORMAT_R32_UINT:
 	case VK_FORMAT_R32_SFLOAT:
@@ -1945,6 +1961,7 @@
 	case VK_FORMAT_R16G16_SINT:
 	case VK_FORMAT_R16G16_UINT:
 	case VK_FORMAT_R16G16_UNORM:
+	case VK_FORMAT_R16G16_SFLOAT:
 	case VK_FORMAT_R32G32_SINT:
 	case VK_FORMAT_R32G32_UINT:
 	case VK_FORMAT_R32G32_SFLOAT:
@@ -1959,6 +1976,7 @@
 	case VK_FORMAT_R16G16B16A16_UNORM:
 	case VK_FORMAT_R16G16B16A16_SINT:
 	case VK_FORMAT_R16G16B16A16_UINT:
+	case VK_FORMAT_R16G16B16A16_SFLOAT:
 	case VK_FORMAT_R32G32B32A32_SINT:
 	case VK_FORMAT_R32G32B32A32_UINT:
 	case VK_FORMAT_R32G32B32A32_SFLOAT: