Implement scatter/gather for 1- and 2-byte texels

Fixes out-of-bounds reads/writes and also avoids unaligned accesses.

This is a generic reference implementation. Future optimizations could
use unaligned 4-byte accesses if they're known to be safe (e.g. due to
padding) and efficient. We can also eliminate the per-lane active mask
checks if we know we're in a basic block that post-dominates the entry
block and the number of active invocations is a multiple of the SIMD
width.

Bug: b/160531165
Change-Id: I892cfd3c7da8d8891cabe80695e5f35c57da73b4
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/46168
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
diff --git a/src/Pipeline/SpirvShaderImage.cpp b/src/Pipeline/SpirvShaderImage.cpp
index 2b7e239..c26790a 100644
--- a/src/Pipeline/SpirvShaderImage.cpp
+++ b/src/Pipeline/SpirvShaderImage.cpp
@@ -640,15 +640,45 @@
 	auto texelSize = vk::Format(vkFormat).bytes();
 	auto texelPtr = GetTexelAddress(state, imageBase, imageSizeInBytes, coordinate, imageType, binding, texelSize, sampleId, useStencilAspect, robustness);
 
+	// Gather packed texel data. Texels larger than 4 bytes occupy multiple SIMD::Int elements.
+	// TODO(b/160531165): Provide gather abstractions for various element sizes.
 	SIMD::Int packed[4];
-	// Round up texel size: for formats smaller than 32 bits per texel, we will emit a bunch
-	// of (overlapping) 32b loads here, and each lane will pick out what it needs from the low bits.
-	// TODO: specialize for small formats?
-	for(auto i = 0; i < (texelSize + 3) / 4; i++)
+	if(texelSize == 4 || texelSize == 8 || texelSize == 16)
 	{
-		packed[i] = texelPtr.Load<SIMD::Int>(robustness, state->activeLaneMask(), false, std::memory_order_relaxed, std::min(texelSize, 4));
-		texelPtr += sizeof(float);
+		for(auto i = 0; i < texelSize / 4; i++)
+		{
+			packed[i] = texelPtr.Load<SIMD::Int>(robustness, state->activeLaneMask());
+			texelPtr += sizeof(float);
+		}
 	}
+	else if(texelSize == 2)
+	{
+		SIMD::Int offsets = texelPtr.offsets();
+		SIMD::Int mask = state->activeLaneMask() & texelPtr.isInBounds(2, robustness);
+
+		for(int i = 0; i < SIMD::Width; i++)
+		{
+			If(Extract(mask, i) != 0)
+			{
+				packed[0] = Insert(packed[0], Int(*Pointer<Short>(texelPtr.base + Extract(offsets, i))), i);
+			}
+		}
+	}
+	else if(texelSize == 1)
+	{
+		SIMD::Int offsets = texelPtr.offsets();
+		SIMD::Int mask = state->activeLaneMask() & texelPtr.isInBounds(1, robustness);
+
+		for(int i = 0; i < SIMD::Width; i++)
+		{
+			If(Extract(mask, i) != 0)
+			{
+				packed[0] = Insert(packed[0], Int(*Pointer<Byte>(texelPtr.base + Extract(offsets, i))), i);
+			}
+		}
+	}
+	else
+		UNREACHABLE("texelSize: %d", int(texelSize));
 
 	// Format support requirements here come from two sources:
 	// - Minimum required set of formats for loads from storage images
@@ -918,7 +948,6 @@
 	auto imageSizeInBytes = *Pointer<Int>(binding + OFFSET(vk::StorageImageDescriptor, sizeInBytes));
 
 	SIMD::Int packed[4];
-	auto numPackedElements = 0u;
 	int texelSize = 0;
 	auto format = static_cast<spv::ImageFormat>(imageType.definition.word(8));
 	switch(format)
@@ -931,14 +960,12 @@
 			packed[1] = texel.Int(1);
 			packed[2] = texel.Int(2);
 			packed[3] = texel.Int(3);
-			numPackedElements = 4;
 			break;
 		case spv::ImageFormatR32f:
 		case spv::ImageFormatR32i:
 		case spv::ImageFormatR32ui:
 			texelSize = 4;
 			packed[0] = texel.Int(0);
-			numPackedElements = 1;
 			break;
 		case spv::ImageFormatRgba8:
 			texelSize = 4;
@@ -946,7 +973,6 @@
 			            ((SIMD::UInt(Round(Min(Max(texel.Float(1), SIMD::Float(0.0f)), SIMD::Float(1.0f)) * SIMD::Float(255.0f)))) << 8) |
 			            ((SIMD::UInt(Round(Min(Max(texel.Float(2), SIMD::Float(0.0f)), SIMD::Float(1.0f)) * SIMD::Float(255.0f)))) << 16) |
 			            ((SIMD::UInt(Round(Min(Max(texel.Float(3), SIMD::Float(0.0f)), SIMD::Float(1.0f)) * SIMD::Float(255.0f)))) << 24);
-			numPackedElements = 1;
 			break;
 		case spv::ImageFormatRgba8Snorm:
 			texelSize = 4;
@@ -961,7 +987,6 @@
 			            ((SIMD::Int(Round(Min(Max(texel.Float(3), SIMD::Float(-1.0f)), SIMD::Float(1.0f)) * SIMD::Float(127.0f))) &
 			              SIMD::Int(0xFF))
 			             << 24);
-			numPackedElements = 1;
 			break;
 		case spv::ImageFormatRgba8i:
 		case spv::ImageFormatRgba8ui:
@@ -970,20 +995,17 @@
 			            (SIMD::UInt(texel.UInt(1) & SIMD::UInt(0xff)) << 8) |
 			            (SIMD::UInt(texel.UInt(2) & SIMD::UInt(0xff)) << 16) |
 			            (SIMD::UInt(texel.UInt(3) & SIMD::UInt(0xff)) << 24);
-			numPackedElements = 1;
 			break;
 		case spv::ImageFormatRgba16f:
 			texelSize = 8;
 			packed[0] = floatToHalfBits(texel.UInt(0), false) | floatToHalfBits(texel.UInt(1), true);
 			packed[1] = floatToHalfBits(texel.UInt(2), false) | floatToHalfBits(texel.UInt(3), true);
-			numPackedElements = 2;
 			break;
 		case spv::ImageFormatRgba16i:
 		case spv::ImageFormatRgba16ui:
 			texelSize = 8;
 			packed[0] = SIMD::UInt(texel.UInt(0) & SIMD::UInt(0xFFFF)) | (SIMD::UInt(texel.UInt(1) & SIMD::UInt(0xFFFF)) << 16);
 			packed[1] = SIMD::UInt(texel.UInt(2) & SIMD::UInt(0xFFFF)) | (SIMD::UInt(texel.UInt(3) & SIMD::UInt(0xFFFF)) << 16);
-			numPackedElements = 2;
 			break;
 		case spv::ImageFormatRg32f:
 		case spv::ImageFormatRg32i:
@@ -991,18 +1013,15 @@
 			texelSize = 8;
 			packed[0] = texel.Int(0);
 			packed[1] = texel.Int(1);
-			numPackedElements = 2;
 			break;
 		case spv::ImageFormatRg16f:
 			texelSize = 4;
 			packed[0] = floatToHalfBits(texel.UInt(0), false) | floatToHalfBits(texel.UInt(1), true);
-			numPackedElements = 1;
 			break;
 		case spv::ImageFormatRg16i:
 		case spv::ImageFormatRg16ui:
 			texelSize = 4;
 			packed[0] = SIMD::UInt(texel.UInt(0) & SIMD::UInt(0xFFFF)) | (SIMD::UInt(texel.UInt(1) & SIMD::UInt(0xFFFF)) << 16);
-			numPackedElements = 1;
 			break;
 
 		case spv::ImageFormatR11fG11fB10f:
@@ -1041,11 +1060,44 @@
 
 	auto texelPtr = GetTexelAddress(state, imageBase, imageSizeInBytes, coordinate, imageType, binding, texelSize, 0, false, robustness);
 
-	for(auto i = 0u; i < numPackedElements; i++)
+	// Scatter packed texel data.
+	// TODO(b/160531165): Provide scatter abstractions for various element sizes.
+	if(texelSize == 4 || texelSize == 8 || texelSize == 16)
 	{
-		texelPtr.Store(packed[i], robustness, state->activeLaneMask());
-		texelPtr += sizeof(float);
+		for(auto i = 0; i < texelSize / 4; i++)
+		{
+			texelPtr.Store(packed[i], robustness, state->activeLaneMask());
+			texelPtr += sizeof(float);
+		}
 	}
+	else if(texelSize == 2)
+	{
+		SIMD::Int offsets = texelPtr.offsets();
+		SIMD::Int mask = state->activeLaneMask() & texelPtr.isInBounds(2, robustness);
+
+		for(int i = 0; i < SIMD::Width; i++)
+		{
+			If(Extract(mask, i) != 0)
+			{
+				*Pointer<Short>(texelPtr.base + Extract(offsets, i)) = Short(Extract(packed[0], i));
+			}
+		}
+	}
+	else if(texelSize == 1)
+	{
+		SIMD::Int offsets = texelPtr.offsets();
+		SIMD::Int mask = state->activeLaneMask() & texelPtr.isInBounds(1, robustness);
+
+		for(int i = 0; i < SIMD::Width; i++)
+		{
+			If(Extract(mask, i) != 0)
+			{
+				*Pointer<Byte>(texelPtr.base + Extract(offsets, i)) = Byte(Extract(packed[0], i));
+			}
+		}
+	}
+	else
+		UNREACHABLE("texelSize: %d", int(texelSize));
 
 	return EmitResult::Continue;
 }