Correct reciprocal approximation for power-of-two values.

Intel's reciprocal approximation instruction is not exact for power-of-two
values. It provides 12 bits of mantissa precision and keeps a balance between
positive and negative errors, but the reciprocal of 2^x is not 2^-x. This
affects conformance tests which expect varyings not to be affected by the
perspective division. Correct for this by multiplying the result by the inverse
of the instruction's approximated reciprocal of 1.0.

Bug 27165393

Change-Id: Ie52ec511a14a4f447adc47ce9c875bbad03cd274
Reviewed-on: https://swiftshader-review.googlesource.com/4903
Tested-by: Nicolas Capens <capn@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <capn@google.com>
diff --git a/src/Reactor/Nucleus.cpp b/src/Reactor/Nucleus.cpp
index 2325c7f..ce48b52 100644
--- a/src/Reactor/Nucleus.cpp
+++ b/src/Reactor/Nucleus.cpp
@@ -33,6 +33,7 @@
 #include "Thread.hpp"
 #include "Memory.hpp"
 
+#include <xmmintrin.h>
 #include <fstream>
 
 #if defined(__x86_64__) && defined(_WIN32)
@@ -4635,9 +4636,9 @@
 			Constant *shuffle[2];
 			shuffle[0] = Nucleus::createConstantInt(0);
 			shuffle[1] = Nucleus::createConstantInt(1);
-	
+
 			Value *packed = Nucleus::createShuffleVector(Nucleus::createBitCast(lo.value, VectorType::get(Int::getType(), 1)), Nucleus::createBitCast(hi.value, VectorType::get(Int::getType(), 1)), Nucleus::createConstantVector(shuffle, 2));
-	
+
 			storeValue(Nucleus::createBitCast(packed, Int2::getType()));
 		}
 	}
@@ -5199,7 +5200,7 @@
 		Value *element = Nucleus::createBitCast(cast.value, Long::getType());
 		long2 = Nucleus::createInsertElement(long2, element, 0);
 		RValue<Int4> vector = RValue<Int4>(Nucleus::createBitCast(long2, Int4::getType()));
-		
+
 		if(CPUID::supportsSSE4_1())
 		{
 			storeValue(x86::pmovsxwd(vector).value);
@@ -6069,9 +6070,18 @@
 		return IfThenElse(x < y, x, y);
 	}
 
-	RValue<Float> Rcp_pp(RValue<Float> x)
+	RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
 	{
-		return x86::rcpss(x);
+		if(exactAtPow2)
+		{
+			// rcpss uses a piecewise-linear approximation which minimizes the relative error
+			// but is not exact at power-of-two values. Rectify by multiplying by the inverse of the approximated reciprocal of 1.0f.
+			return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+		}
+		else
+		{
+			return x86::rcpss(x);
+		}
 	}
 
 	RValue<Float> RcpSqrt_pp(RValue<Float> x)
@@ -6580,9 +6590,18 @@
 		return x86::minps(x, y);
 	}
 
-	RValue<Float4> Rcp_pp(RValue<Float4> x)
+	RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
 	{
-		return x86::rcpps(x);
+		if(exactAtPow2)
+		{
+			// rcpps uses a piecewise-linear approximation which minimizes the relative error
+			// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+			return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+		}
+		else
+		{
+			return x86::rcpps(x);
+		}
 	}
 
 	RValue<Float4> RcpSqrt_pp(RValue<Float4> x)