Take advantage of return value optimization

Just a refactoring to improve syntax and avoid non-const references.

Also don't assume SIMD types are 4-wide.

Also add [[fallthrough]] comments to switch cases found in the process.

Bug: b/143351714
Bug: b/142661203
Change-Id: I12a0178338ce4c711bcbf62825d230580f3e92f0
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/40288
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index 99e9d3a..946eafc 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp
@@ -611,10 +611,13 @@
 			break;
 		case VK_FORMAT_R16G16B16A16_SFLOAT:
 			if(writeA) { *Pointer<Half>(element + 6) = Half(c.w); }
+			// [[fallthrough]]
 		case VK_FORMAT_R16G16B16_SFLOAT:
 			if(writeB) { *Pointer<Half>(element + 4) = Half(c.z); }
+			// [[fallthrough]]
 		case VK_FORMAT_R16G16_SFLOAT:
 			if(writeG) { *Pointer<Half>(element + 2) = Half(c.y); }
+			// [[fallthrough]]
 		case VK_FORMAT_R16_SFLOAT:
 			if(writeR) { *Pointer<Half>(element) = Half(c.x); }
 			break;
@@ -690,14 +693,17 @@
 		case VK_FORMAT_R8G8B8A8_SSCALED:
 		case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
 			if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
+			// [[fallthrough]]
 		case VK_FORMAT_R8G8B8_SINT:
 		case VK_FORMAT_R8G8B8_SNORM:
 		case VK_FORMAT_R8G8B8_SSCALED:
 			if(writeB) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.z))); }
+			// [[fallthrough]]
 		case VK_FORMAT_R8G8_SINT:
 		case VK_FORMAT_R8G8_SNORM:
 		case VK_FORMAT_R8G8_SSCALED:
 			if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
+			// [[fallthrough]]
 		case VK_FORMAT_R8_SINT:
 		case VK_FORMAT_R8_SNORM:
 		case VK_FORMAT_R8_SSCALED:
@@ -708,11 +714,13 @@
 		case VK_FORMAT_R8G8B8_USCALED:
 		case VK_FORMAT_R8G8B8_SRGB:
 			if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
+			// [[fallthrough]]
 		case VK_FORMAT_R8G8_UINT:
 		case VK_FORMAT_R8G8_UNORM:
 		case VK_FORMAT_R8G8_USCALED:
 		case VK_FORMAT_R8G8_SRGB:
 			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+			// [[fallthrough]]
 		case VK_FORMAT_R8_UINT:
 		case VK_FORMAT_R8_UNORM:
 		case VK_FORMAT_R8_USCALED:
@@ -814,8 +822,10 @@
 			break;
 		case VK_FORMAT_R32G32B32_SINT:
 			if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
+			// [[fallthrough]]
 		case VK_FORMAT_R32G32_SINT:
 			if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
+			// [[fallthrough]]
 		case VK_FORMAT_R32_SINT:
 			if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
 			break;
@@ -834,8 +844,10 @@
 			break;
 		case VK_FORMAT_R32G32B32_UINT:
 			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
+			// [[fallthrough]]
 		case VK_FORMAT_R32G32_UINT:
 			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
+			// [[fallthrough]]
 		case VK_FORMAT_R32_UINT:
 			if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
 			break;
@@ -970,8 +982,10 @@
 		case VK_FORMAT_R8G8B8A8_SINT:
 			c = Insert(c, Int(*Pointer<SByte>(element + 3)), 3);
 			c = Insert(c, Int(*Pointer<SByte>(element + 2)), 2);
+			// [[fallthrough]]
 		case VK_FORMAT_R8G8_SINT:
 			c = Insert(c, Int(*Pointer<SByte>(element + 1)), 1);
+			// [[fallthrough]]
 		case VK_FORMAT_R8_SINT:
 			c = Insert(c, Int(*Pointer<SByte>(element)), 0);
 			break;
@@ -991,8 +1005,10 @@
 		case VK_FORMAT_R8G8B8A8_UINT:
 			c = Insert(c, Int(*Pointer<Byte>(element + 3)), 3);
 			c = Insert(c, Int(*Pointer<Byte>(element + 2)), 2);
+			// [[fallthrough]]
 		case VK_FORMAT_R8G8_UINT:
 			c = Insert(c, Int(*Pointer<Byte>(element + 1)), 1);
+			// [[fallthrough]]
 		case VK_FORMAT_R8_UINT:
 		case VK_FORMAT_S8_UINT:
 			c = Insert(c, Int(*Pointer<Byte>(element)), 0);
@@ -1000,16 +1016,20 @@
 		case VK_FORMAT_R16G16B16A16_SINT:
 			c = Insert(c, Int(*Pointer<Short>(element + 6)), 3);
 			c = Insert(c, Int(*Pointer<Short>(element + 4)), 2);
+			// [[fallthrough]]
 		case VK_FORMAT_R16G16_SINT:
 			c = Insert(c, Int(*Pointer<Short>(element + 2)), 1);
+			// [[fallthrough]]
 		case VK_FORMAT_R16_SINT:
 			c = Insert(c, Int(*Pointer<Short>(element)), 0);
 			break;
 		case VK_FORMAT_R16G16B16A16_UINT:
 			c = Insert(c, Int(*Pointer<UShort>(element + 6)), 3);
 			c = Insert(c, Int(*Pointer<UShort>(element + 4)), 2);
+			// [[fallthrough]]
 		case VK_FORMAT_R16G16_UINT:
 			c = Insert(c, Int(*Pointer<UShort>(element + 2)), 1);
+			// [[fallthrough]]
 		case VK_FORMAT_R16_UINT:
 			c = Insert(c, Int(*Pointer<UShort>(element)), 0);
 			break;
@@ -1020,6 +1040,7 @@
 		case VK_FORMAT_R32G32_SINT:
 		case VK_FORMAT_R32G32_UINT:
 			c = Insert(c, *Pointer<Int>(element + 4), 1);
+			// [[fallthrough]]
 		case VK_FORMAT_R32_SINT:
 		case VK_FORMAT_R32_UINT:
 			c = Insert(c, *Pointer<Int>(element), 0);
@@ -1096,6 +1117,7 @@
 		case VK_FORMAT_B8G8R8A8_SINT:
 		case VK_FORMAT_B8G8R8A8_SSCALED:
 			if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
+			// [[fallthrough]]
 		case VK_FORMAT_B8G8R8_SINT:
 		case VK_FORMAT_B8G8R8_SSCALED:
 			if(writeB) { *Pointer<SByte>(element) = SByte(Extract(c, 2)); }
@@ -1107,12 +1129,15 @@
 		case VK_FORMAT_R8G8B8A8_SSCALED:
 		case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
 			if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
+			// [[fallthrough]]
 		case VK_FORMAT_R8G8B8_SINT:
 		case VK_FORMAT_R8G8B8_SSCALED:
 			if(writeB) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 2)); }
+			// [[fallthrough]]
 		case VK_FORMAT_R8G8_SINT:
 		case VK_FORMAT_R8G8_SSCALED:
 			if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
+			// [[fallthrough]]
 		case VK_FORMAT_R8_SINT:
 		case VK_FORMAT_R8_SSCALED:
 			if(writeR) { *Pointer<SByte>(element) = SByte(Extract(c, 0)); }
@@ -1158,6 +1183,7 @@
 		case VK_FORMAT_B8G8R8A8_UINT:
 		case VK_FORMAT_B8G8R8A8_USCALED:
 			if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
+			// [[fallthrough]]
 		case VK_FORMAT_B8G8R8_UINT:
 		case VK_FORMAT_B8G8R8_USCALED:
 		case VK_FORMAT_B8G8R8_SRGB:
@@ -1170,12 +1196,15 @@
 		case VK_FORMAT_R8G8B8A8_USCALED:
 		case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
 			if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
+			// [[fallthrough]]
 		case VK_FORMAT_R8G8B8_UINT:
 		case VK_FORMAT_R8G8B8_USCALED:
 			if(writeB) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 2)); }
+			// [[fallthrough]]
 		case VK_FORMAT_R8G8_UINT:
 		case VK_FORMAT_R8G8_USCALED:
 			if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
+			// [[fallthrough]]
 		case VK_FORMAT_R8_UINT:
 		case VK_FORMAT_R8_USCALED:
 		case VK_FORMAT_S8_UINT:
@@ -1184,12 +1213,15 @@
 		case VK_FORMAT_R16G16B16A16_SINT:
 		case VK_FORMAT_R16G16B16A16_SSCALED:
 			if(writeA) { *Pointer<Short>(element + 6) = Short(Extract(c, 3)); }
+			// [[fallthrough]]
 		case VK_FORMAT_R16G16B16_SINT:
 		case VK_FORMAT_R16G16B16_SSCALED:
 			if(writeB) { *Pointer<Short>(element + 4) = Short(Extract(c, 2)); }
+			// [[fallthrough]]
 		case VK_FORMAT_R16G16_SINT:
 		case VK_FORMAT_R16G16_SSCALED:
 			if(writeG) { *Pointer<Short>(element + 2) = Short(Extract(c, 1)); }
+			// [[fallthrough]]
 		case VK_FORMAT_R16_SINT:
 		case VK_FORMAT_R16_SSCALED:
 			if(writeR) { *Pointer<Short>(element) = Short(Extract(c, 0)); }
@@ -1197,12 +1229,15 @@
 		case VK_FORMAT_R16G16B16A16_UINT:
 		case VK_FORMAT_R16G16B16A16_USCALED:
 			if(writeA) { *Pointer<UShort>(element + 6) = UShort(Extract(c, 3)); }
+			// [[fallthrough]]
 		case VK_FORMAT_R16G16B16_UINT:
 		case VK_FORMAT_R16G16B16_USCALED:
 			if(writeB) { *Pointer<UShort>(element + 4) = UShort(Extract(c, 2)); }
+			// [[fallthrough]]
 		case VK_FORMAT_R16G16_UINT:
 		case VK_FORMAT_R16G16_USCALED:
 			if(writeG) { *Pointer<UShort>(element + 2) = UShort(Extract(c, 1)); }
+			// [[fallthrough]]
 		case VK_FORMAT_R16_UINT:
 		case VK_FORMAT_R16_USCALED:
 			if(writeR) { *Pointer<UShort>(element) = UShort(Extract(c, 0)); }
@@ -1247,8 +1282,10 @@
 			break;
 		case VK_FORMAT_R32G32B32_UINT:
 			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
+			// [[fallthrough]]
 		case VK_FORMAT_R32G32_UINT:
 			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
+			// [[fallthrough]]
 		case VK_FORMAT_R32_UINT:
 			if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
 			break;
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 652d2ae..4aa7f03 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -1060,7 +1060,7 @@
 			v = Insert(v, *Pointer<Int>(buffer + 0), 2);
 			v = Insert(v, *Pointer<Int>(buffer + 4), 3);
 
-			a2b10g10r10Unpack(v, pixel);
+			pixel = a2b10g10r10Unpack(v);
 		}
 		break;
 		case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
@@ -1072,7 +1072,7 @@
 			v = Insert(v, *Pointer<Int>(buffer + 4 * x), 2);
 			v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 3);
 
-			a2r10g10b10Unpack(v, pixel);
+			pixel = a2r10g10b10Unpack(v);
 		}
 		break;
 		default:
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index 9b20c1d..6026600 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -1699,7 +1699,7 @@
 		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
 		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);
 
-		a2b10g10r10Unpack(cc, c);
+		c = a2b10g10r10Unpack(cc);
 	}
 	else if(state.textureFormat == VK_FORMAT_A2R10G10B10_UNORM_PACK32)
 	{
@@ -1709,7 +1709,7 @@
 		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
 		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);
 
-		a2r10g10b10Unpack(cc, c);
+		c = a2r10g10b10Unpack(cc);
 	}
 	else if(state.textureFormat == VK_FORMAT_A2B10G10R10_UINT_PACK32)
 	{
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index 00d02a0..a13f0d3 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -601,7 +601,7 @@
 	// copy/pasting the bits so the the exponent bits and top mantissa bits are aligned to the half format.
 	// In this case, we have:
 	// MSB | B B B B B B B B B B G G G G G G G G G G G R R R R R R R R R R R | LSB
-	SIMD::UInt halfBits;
+	UInt4 halfBits;
 	halfBits = Insert(halfBits, (r11g11b10bits & UInt(0x000007FFu)) << 4, 0);
 	halfBits = Insert(halfBits, (r11g11b10bits & UInt(0x003FF800u)) >> 7, 1);
 	halfBits = Insert(halfBits, (r11g11b10bits & UInt(0xFFC00000u)) >> 17, 2);
@@ -609,15 +609,18 @@
 	return As<Float4>(halfToFloatBits(halfBits));
 }
 
-UInt r11g11b10Pack(sw::SIMD::Float &value)
+UInt r11g11b10Pack(const Float4 &value)
 {
-	SIMD::UInt halfBits = floatToHalfBits(As<SIMD::UInt>(value), true) &
-	                      SIMD::UInt(0x7FF00000, 0x7FF00000, 0x7FE00000, 0);
-	return (UInt(halfBits.x) >> 20) | (UInt(halfBits.y) >> 9) | (UInt(halfBits.z) << 1);
+	auto halfBits = floatToHalfBits(As<UInt4>(value), true);
+	// Truncates instead of rounding. See b/147900455
+	UInt4 truncBits = halfBits & UInt4(0x7FF00000, 0x7FF00000, 0x7FE00000, 0);
+	return (UInt(truncBits.x) >> 20) | (UInt(truncBits.y) >> 9) | (UInt(truncBits.z) << 1);
 }
 
-void a2b10g10r10Unpack(Int4 &value, Vector4s &result)
+Vector4s a2b10g10r10Unpack(const Int4 &value)
 {
+	Vector4s result;
+
 	result.x = Short4(value << 6) & Short4(0xFFC0u);
 	result.y = Short4(value >> 4) & Short4(0xFFC0u);
 	result.z = Short4(value >> 14) & Short4(0xFFC0u);
@@ -630,10 +633,14 @@
 	result.w |= As<Short4>(As<UShort4>(result.w) >> 2);
 	result.w |= As<Short4>(As<UShort4>(result.w) >> 4);
 	result.w |= As<Short4>(As<UShort4>(result.w) >> 8);
+
+	return result;
 }
 
-void a2r10g10b10Unpack(Int4 &value, Vector4s &result)
+Vector4s a2r10g10b10Unpack(const Int4 &value)
 {
+	Vector4s result;
+
 	result.x = Short4(value >> 14) & Short4(0xFFC0u);
 	result.y = Short4(value >> 4) & Short4(0xFFC0u);
 	result.z = Short4(value << 6) & Short4(0xFFC0u);
@@ -646,6 +653,8 @@
 	result.w |= As<Short4>(As<UShort4>(result.w) >> 2);
 	result.w |= As<Short4>(As<UShort4>(result.w) >> 4);
 	result.w |= As<Short4>(As<UShort4>(result.w) >> 8);
+
+	return result;
 }
 
 rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints)
diff --git a/src/Pipeline/ShaderCore.hpp b/src/Pipeline/ShaderCore.hpp
index 9a1ce32..60a32b4 100644
--- a/src/Pipeline/ShaderCore.hpp
+++ b/src/Pipeline/ShaderCore.hpp
@@ -206,10 +206,10 @@
 
 sw::SIMD::UInt halfToFloatBits(sw::SIMD::UInt halfBits);
 sw::SIMD::UInt floatToHalfBits(sw::SIMD::UInt floatBits, bool storeInUpperBits);
-sw::SIMD::Float r11g11b10Unpack(UInt r11g11b10bits);
-UInt r11g11b10Pack(sw::SIMD::Float &value);
-void a2b10g10r10Unpack(Int4 &value, Vector4s &result);
-void a2r10g10b10Unpack(Int4 &value, Vector4s &result);
+Float4 r11g11b10Unpack(UInt r11g11b10bits);
+UInt r11g11b10Pack(const Float4 &value);
+Vector4s a2b10g10r10Unpack(const Int4 &value);
+Vector4s a2r10g10b10Unpack(const Int4 &value);
 
 rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints);