Add support for D32_SFLOAT and D16_UNORM input attachments

D16_UNORM is the first format supported in this path with a texel size
of less than 4 bytes. Adjust the load loop to round the texel size up to a
whole number of 32-bit words, so that we always emit at least one load.

Test: dEQP-VK.renderpass*
Bug: b/131171141
Change-Id: I985b85017ace84cb8b18c36989fea99ba8e0f44f
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/29729
Presubmit-Ready: Chris Forbes <chrisforbes@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Chris Forbes <chrisforbes@google.com>
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index d63c97b..59f2b12 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -4656,7 +4656,10 @@
 		auto texelPtr = GetTexelAddress(state->routine, basePtr, coordinate, imageType, binding, texelSize);
 
 		SIMD::Int packed[4];
-		for (auto i = 0; i < texelSize/4; i++)
+		// Round the texel size up to a whole number of 32-bit words: for formats smaller than 32 bits
+		// per texel, the (overlapping) 32-bit loads fetch extra bytes and each lane uses only the low bits it needs.
+		// TODO: specialize for small formats?
+		for (auto i = 0; i < (texelSize + 3)/4; i++)
 		{
 			packed[i] = SIMD::Load<SIMD::Int>(texelPtr, state->activeLaneMask());
 			texelPtr += sizeof(float);
@@ -4681,12 +4684,20 @@
 			dst.move(3, SIMD::Int(1));
 			break;
 		case VK_FORMAT_R32_SFLOAT:
+		case VK_FORMAT_D32_SFLOAT:
+		//case VK_FORMAT_D32_SFLOAT_S8_UINT:
 			dst.move(0, packed[0]);
 			// Fill remaining channels with 0,0,1 (of the correct type)
 			dst.move(1, SIMD::Float(0));
 			dst.move(2, SIMD::Float(0));
 			dst.move(3, SIMD::Float(1));
 			break;
+		case VK_FORMAT_D16_UNORM:
+			dst.move(0, SIMD::Float(packed[0] & SIMD::Int(0xffff)) * SIMD::Float(1.0f / 65535.0f));
+			dst.move(1, SIMD::Float(0));
+			dst.move(2, SIMD::Float(0));
+			dst.move(3, SIMD::Float(1));
+			break;
 		case VK_FORMAT_R16G16B16A16_SINT:
 			dst.move(0, (packed[0] << 16) >> 16);
 			dst.move(1, (packed[0]) >> 16);