Add unit test for 11/10-bit minifloat denormal underflow The reference code for conversion from 32-bit floating-point to 11- and 10-bit minifloat formats supports producing denormals for values smaller than what can be represented as normalized representations. The arithmetic can underflow to produce zero for values too small to be represented as denormals. This arithmetic contains a 32-bit shift operation which can shift by an amount of 32 or more, which has undefined behavior in C++ but produces zero on x86 processors. This change adds unit tests for the intended behavior around the cutoff between the smallest denormal and zero, to help validate the fix for the UB in a future change. Bug: b/147900455 Bug: chromium:1117433 Change-Id: Ic5e495dd822231d52a5551ee12733a616728d486 Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/48068 Reviewed-by: Antonio Maiorano <amaiorano@google.com> Tested-by: Nicolas Capens <nicolascapens@google.com> Kokoro-Result: kokoro <noreply+kokoro@google.com>

commit: 558540feea2ad57e9b8daecce1cb522df7d9266a [log] [tgz]
author: Nicolas Capens <capn@google.com> Fri Aug 21 11:20:11 2020 -0400
committer: Nicolas Capens <nicolascapens@google.com> Tue Sep 01 20:40:21 2020 +0000
tree: b53a6b44cbe6f1c48d6e23f07aeb3117cf66e861
parent: 2d5bbdc4d9de7bf12bb3da9cf87ca9c353ab6523 [diff]
diff --git a/src/System/Half.hpp b/src/System/Half.hpp
index 6f73b4d..5775141 100644
--- a/src/System/Half.hpp
+++ b/src/System/Half.hpp

@@ -128,9 +128,25 @@
 
 class R11G11B10F
 {
-	unsigned int R : 11;
-	unsigned int G : 11;
-	unsigned int B : 10;
+public:
+	R11G11B10F(float rgb[3])
+	{
+		R = float32ToFloat11(rgb[0]);
+		G = float32ToFloat11(rgb[1]);
+		B = float32ToFloat10(rgb[2]);
+	}
+
+	operator unsigned int() const
+	{
+		return *reinterpret_cast<const unsigned int *>(this);
+	}
+
+	void toRGB16F(half rgb[3]) const
+	{
+		rgb[0] = float11ToFloat16(R);
+		rgb[1] = float11ToFloat16(G);
+		rgb[2] = float10ToFloat16(B);
+	}
 
 	static inline half float11ToFloat16(unsigned short fp11)
 	{
@@ -142,7 +158,7 @@
 		return shortAsHalf(fp10 << 5);  // Sign bit 0
 	}
 
-	inline unsigned short float32ToFloat11(float fp32)
+	static inline unsigned short float32ToFloat11(float fp32)
 	{
 		const unsigned int float32MantissaMask = 0x7FFFFF;
 		const unsigned int float32ExponentMask = 0x7F800000;
@@ -215,7 +231,7 @@
 		}
 	}
 
-	inline unsigned short float32ToFloat10(float fp32)
+	static inline unsigned short float32ToFloat10(float fp32)
 	{
 		const unsigned int float32MantissaMask = 0x7FFFFF;
 		const unsigned int float32ExponentMask = 0x7F800000;
@@ -249,7 +265,7 @@
 			}
 			else if(float32Sign)
 			{
-				// -INF is clamped to 0 since float11 is positive only
+				// -INF is clamped to 0 since float10 is positive only
 				return 0;
 			}
 			else
@@ -264,14 +280,14 @@
 		}
 		else if(float32Val > float32Maxfloat10)
 		{
-			// The number is too large to be represented as a float11, set to max
+			// The number is too large to be represented as a float10, set to max
 			return float10Max;
 		}
 		else
 		{
 			if(float32Val < float32Minfloat10)
 			{
-				// The number is too small to be represented as a normalized float11
+				// The number is too small to be represented as a normalized float10
 				// Convert it to a denormalized value.
 				const unsigned int shift = (float32ExponentBias - float10ExponentBias) -
 				                           (float32Val >> float32ExponentFirstBit);
@@ -280,7 +296,7 @@
 			}
 			else
 			{
-				// Rebias the exponent to represent the value as a normalized float11
+				// Rebias the exponent to represent the value as a normalized float10
 				float32Val += 0xC8000000;
 			}
 
@@ -288,25 +304,10 @@
 		}
 	}
 
-public:
-	R11G11B10F(float rgb[3])
-	{
-		R = float32ToFloat11(rgb[0]);
-		G = float32ToFloat11(rgb[1]);
-		B = float32ToFloat10(rgb[2]);
-	}
-
-	operator unsigned int() const
-	{
-		return *reinterpret_cast<const unsigned int *>(this);
-	}
-
-	void toRGB16F(half rgb[3]) const
-	{
-		rgb[0] = float11ToFloat16(R);
-		rgb[1] = float11ToFloat16(G);
-		rgb[2] = float10ToFloat16(B);
-	}
+private:
+	unsigned int R : 11;
+	unsigned int G : 11;
+	unsigned int B : 10;
 };
 
 }  // namespace sw

diff --git a/tests/MathUnitTests/unittests.cpp b/tests/MathUnitTests/unittests.cpp
index 04f836c..1ce35c3 100644
--- a/tests/MathUnitTests/unittests.cpp
+++ b/tests/MathUnitTests/unittests.cpp

@@ -21,6 +21,18 @@
 
 using namespace sw;
 
+TEST(MathTest, UnsignedFloat11_10)
+{
+	// Test the largest value which causes underflow to 0, and the smallest value
+	// which produces a denormalized result.
+
+	EXPECT_EQ(R11G11B10F::float32ToFloat11(bit_cast<float>(0x3500007F)), 0x0000);
+	EXPECT_EQ(R11G11B10F::float32ToFloat11(bit_cast<float>(0x35000080)), 0x0001);
+
+	EXPECT_EQ(R11G11B10F::float32ToFloat10(bit_cast<float>(0x3580003F)), 0x0000);
+	EXPECT_EQ(R11G11B10F::float32ToFloat10(bit_cast<float>(0x35800040)), 0x0001);
+}
+
 // Clamps to the [0, hi] range. NaN input produces 0, hi must be non-NaN.
 float clamp0hi(float x, float hi)
 {
commit	558540feea2ad57e9b8daecce1cb522df7d9266a	[log] [tgz]
author	Nicolas Capens <capn@google.com>	Fri Aug 21 11:20:11 2020 -0400
committer	Nicolas Capens <nicolascapens@google.com>	Tue Sep 01 20:40:21 2020 +0000
tree	b53a6b44cbe6f1c48d6e23f07aeb3117cf66e861
parent	2d5bbdc4d9de7bf12bb3da9cf87ca9c353ab6523 [diff]