Improve the precision and performance of highp exp2()

This algorithm is based on the relaxed precision exp2() algorithm, with
a higher degree polynomial and using integer addition to correct the
exponent bias.

It eliminates a shift operation and loading a constant.

The code for the relaxed precision exp2() unit test was refactored for
readability.

Bug: b/169754022
Change-Id: Ie103484f88d7f324a4d6dbd4ba7e9893c5a0e2d7
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/64088
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index 832ce0f..1ce667f 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -349,24 +349,22 @@
 
 	if(!relaxedPrecision)  // highp
 	{
-		// This implementation is based on 2^(i + f) = 2^i * 2^f,
-		// where i is the integer part of x and f is the fraction.
-	
-		// For 2^i we can put the integer part directly in the exponent of the IEEE-754 floating-point number.
-		Int4 i = Int4(xi);
-		Float4 ii = As<Float4>((i + Int4(127)) << 23);  // Add single-precision bias, and shift into exponent.
-
-		// For the fractional part use a polynomial which approximates 2^f in the 0 to 1 range.
-		// To be exact at integers it uses the form f(x) * x + 1.
+		// Polynomial which approximates (2^x-x-1)/x. Multiplying with x
+		// gives us a correction term to be added to 1+x to obtain 2^x.
 		const Float4 a = 1.8852974e-3f;
 		const Float4 b = 8.9733787e-3f;
 		const Float4 c = 5.5835927e-2f;
 		const Float4 d = 2.4015281e-1f;
-		const Float4 e = 6.9315247e-1f;
+		const Float4 e = -3.0684753e-1f;
 
-		Float4 ff = MulAdd(MulAdd(MulAdd(MulAdd(MulAdd(a, f, b), f, c), f, d), f, e), f, 1.0f);
+		Float4 r = MulAdd(MulAdd(MulAdd(MulAdd(a, f, b), f, c), f, d), f, e);
 
-		return ii * ff;
+		// bit_cast<float>(int(x * 2^23)) is a piecewise linear approximation of 2^x.
+		// See "Fast Exponential Computation on SIMD Architectures" by Malossi et al.
+		Float4 y = MulAdd(r, f, x0);
+		Int4 i = Int4(y * (1 << 23)) + (127 << 23);
+
+		return As<Float4>(i);
 	}
 	else  // RelaxedPrecision / mediump
 	{
diff --git a/tests/MathUnitTests/unittests.cpp b/tests/MathUnitTests/unittests.cpp
index 1873d90..a4f497e 100644
--- a/tests/MathUnitTests/unittests.cpp
+++ b/tests/MathUnitTests/unittests.cpp
@@ -136,9 +136,9 @@
 	CPUID::setFlushToZero(false);
 }
 
-// lolremez --float - d 2 - r "0:1" "(2^x-x-1)/x" "1/x"
+// lolremez --float -d 2 -r "0:1" "(2^x-x-1)/x" "1/x"
 // ULP-16: 0.130859017
-float P(float x)
+float Pr(float x)
 {
 	float u = 7.8145574e-2f;
 	u = u * x + 2.2617357e-1f;
@@ -147,18 +147,17 @@
 
 float Exp2Relaxed(float x)
 {
-	float x0 = x;
-	x0 = min(x0, 128.0f);
-	x0 = max(x0, bit_cast<float>(int(0xC2FDFFFF)));  // -126.999992
-
-	const float f = x0 - floor(x0);
+	x = min(x, 128.0f);
+	x = max(x, bit_cast<float>(int(0xC2FDFFFF)));  // -126.999992
 
 	// 2^f - f - 1 as P(f) * f
-	float y = P(f) * f + x0;
+	// This is a correction term to be added to 1+x to obtain 2^x.
+	float f = x - floor(x);
+	float y = Pr(f) * f + x;
 
-	int i = (int)((1 << 23) * y + (127 << 23));
-
-	return bit_cast<float>(i);
+	// bit_cast<float>(int(x * 2^23)) is a piecewise linear approximation of 2^(x-127).
+	// See "Fast Exponential Computation on SIMD Architectures" by Malossi et al.
+	return bit_cast<float>(int((1 << 23) * y + (127 << 23)));
 }
 
 TEST(MathTest, Exp2RelaxedExhaustive)
@@ -316,39 +315,30 @@
 	return ii * ff;
 }
 
-// lolremez --float -d 4 -r "0:1" "(2^x-1)/x" "1/x"
-// ULP_32: 2.65837669, Vulkan margin: 0.847366512
-float f_r(float x)
+// lolremez --float -d 4 -r "0:1" "(2^x-x-1)/x" "1/x"
+// ULP_32: 2.14694786, Vulkan margin: 0.686957061
+float P(float x)
 {
 	float u = 1.8852974e-3f;
 	u = u * x + 8.9733787e-3f;
 	u = u * x + 5.5835927e-2f;
 	u = u * x + 2.4015281e-1f;
-	return u * x + 6.9315247e-1f;
+	return u * x + -3.0684753e-1f;
 }
 
 float Exp2(float x)
 {
-	// This implementation is based on 2^(i + f) = 2^i * 2^f,
-	// where i is the integer part of x and f is the fraction.
+	x = min(x, 128.0f);
+	x = max(x, bit_cast<float>(0xC2FDFFFF));  // -126.999992
 
-	// For 2^i we can put the integer part directly in the exponent of
-	// the IEEE-754 floating-point number. Clamp to prevent overflow
-	// past the representation of infinity.
-	float x0 = x;
-	x0 = min(x0, bit_cast<float>(int(0x4300FFFF)));  // 128.999985
-	x0 = max(x0, bit_cast<float>(int(0xC2FDFFFF)));  // -126.999992
+	// 2^f - f - 1 as P(f) * f
+	// This is a correction term to be added to 1+x to obtain 2^x.
+	float f = x - floor(x);
+	float y = P(f) * f + x;
 
-	float xi = floor(x0);
-	int i = int(xi);
-	float ii = bit_cast<float>((i + int(127)) << 23);  // Add single-precision bias, and shift into exponent.
-
-	// For the fractional part use a polynomial which approximates 2^f in the 0 to 1 range.
-	// To be exact at integers it uses the form f(x) * x + 1.
-	float f = x0 - xi;
-	float ff = f_r(f) * f + 1.0f;
-
-	return ii * ff;
+	// bit_cast<float>(int(x * 2^23)) is a piecewise linear approximation of 2^(x-127).
+	// See "Fast Exponential Computation on SIMD Architectures" by Malossi et al.
+	return bit_cast<float>(int(y * (1 << 23)) + (127 << 23));
 }
 
 TEST(MathTest, Exp2Exhaustive)