Optimize reciprocal operation

This change deprecates rr::Rcp_pp with rr::Rcp, which makes sure to
correctly compute the reciprocal using the Newton-Rhapson refinement
only if the current target supports the required instrinsic, otherwise
using 1 / x. Currently, only LLVM on Intel will use NR. Note that
passing in Precision::Relaxed will produce a faster, but less precise
reciprocal.

Also removed PixelProgram::linearToSRGB as it's unused.

Bug: b/169760262
Bug: b/149574741
Change-Id: I4a2f943aa60116c4397d7a8ae18583a260824788
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/50648
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Antonio Maiorano <amaiorano@google.com>
Commit-Queue: Antonio Maiorano <amaiorano@google.com>
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index 16aebec..0deffa9 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -373,12 +373,4 @@
 	}
 }
 
-Float4 PixelProgram::linearToSRGB(const Float4 &x)  // Approximates x^(1.0/2.2)
-{
-	Float4 sqrtx = Rcp_pp(RcpSqrt_pp(x));
-	Float4 sRGB = sqrtx * Float4(1.14f) - x * Float4(0.14f);
-
-	return Min(Max(sRGB, Float4(0.0f)), Float4(1.0f));
-}
-
 }  // namespace sw
diff --git a/src/Pipeline/PixelProgram.hpp b/src/Pipeline/PixelProgram.hpp
index 59994fb..40f8b93 100644
--- a/src/Pipeline/PixelProgram.hpp
+++ b/src/Pipeline/PixelProgram.hpp
@@ -48,7 +48,6 @@
 
 	Int4 maskAny(Int cMask[4]) const;
 	Int4 maskAny(Int cMask[4], Int sMask[4], Int zMask[4]) const;
-	Float4 linearToSRGB(const Float4 &x);
 };
 
 }  // namespace sw
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index d9e50c3..3b4488b 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -133,7 +133,7 @@
 				WWWW += *Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]);
 			}
 
-			WWWW = Rcp_pp(WWWW);
+			WWWW = Rcp(WWWW, Precision::Relaxed);
 			XXXX *= WWWW;
 			YYYY *= WWWW;
 
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index f38ea35..2a80211 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -1226,10 +1226,10 @@
 		uDelta = As<Float4>((As<Int4>(dudx) & mask) | ((As<Int4>(dudy) & ~mask)));
 		vDelta = As<Float4>((As<Int4>(dvdx) & mask) | ((As<Int4>(dvdy) & ~mask)));
 
-		anisotropy = lod * Rcp_pp(det);
+		anisotropy = lod * Rcp(det, Precision::Relaxed);
 		anisotropy = Min(anisotropy, state.maxAnisotropy);
 
-		lod *= Rcp_pp(anisotropy * anisotropy);
+		lod *= Rcp(anisotropy * anisotropy, Precision::Relaxed);
 	}
 
 	lod = log2sqrt(lod);  // log2(sqrt(lod))
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index b010131..4807ad1 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -224,22 +224,7 @@
 
 Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2)
 {
-	//return Float4(1.0f) / x;
-
-	Float4 rcp = Rcp_pp(x, exactAtPow2);
-
-	if(!pp)
-	{
-		rcp = (rcp + rcp) - (x * rcp * rcp);
-	}
-
-	if(finite)
-	{
-		int big = 0x7F7FFFFF;
-		rcp = Min(rcp, Float4((float &)big));
-	}
-
-	return rcp;
+	return Rcp(x, pp ? Precision::Relaxed : Precision::Full, finite, exactAtPow2);
 }
 
 Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index d4030d6..0840845 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -2865,6 +2865,47 @@
 #endif
 }
 
+bool HasRcpApprox()
+{
+#if defined(__i386__) || defined(__x86_64__)
+	return true;
+#else
+	return false;
+#endif
+}
+
+RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
+{
+#if defined(__i386__) || defined(__x86_64__)
+	if(exactAtPow2)
+	{
+		// rcpps uses a piecewise-linear approximation which minimizes the relative error
+		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+		return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+	}
+	return x86::rcpps(x);
+#else
+	UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
+	return { 0.0f };
+#endif
+}
+
+RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
+{
+#if defined(__i386__) || defined(__x86_64__)
+	if(exactAtPow2)
+	{
+		// rcpss uses a piecewise-linear approximation which minimizes the relative error
+		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+		return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+	}
+	return x86::rcpss(x);
+#else
+	UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
+	return { 0.0f };
+#endif
+}
+
 RValue<Float> Sqrt(RValue<Float> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
diff --git a/src/Reactor/Reactor.cpp b/src/Reactor/Reactor.cpp
index 80b9612..226792a 100644
--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp
@@ -14,6 +14,7 @@
 
 #include "Reactor.hpp"
 
+#include "CPUID.hpp"
 #include "Debug.hpp"
 #include "Print.hpp"
 
@@ -4659,4 +4660,56 @@
 
 #endif  // ENABLE_RR_PRINT
 
+// Functions implemented by backends
+bool HasRcpApprox();
+RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2 = false);
+RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2 = false);
+
+template<typename T>
+static RValue<T> DoRcp(RValue<T> x, Precision p, bool finite, bool exactAtPow2)
+{
+#if defined(__i386__) || defined(__x86_64__)  // On x86, 1/x is fast enough, except for lower precision
+	bool approx = HasRcpApprox() && (p != Precision::Full);
+#else
+	bool approx = HasRcpApprox();
+#endif
+
+	T rcp;
+
+	if(approx)
+	{
+		rcp = RcpApprox(x, exactAtPow2);
+
+		if(p == Precision::Full)
+		{
+			// Perform one more iteration of Newton-Rhapson division to increase precision
+			rcp = (rcp + rcp) - (x * rcp * rcp);
+		}
+	}
+	else
+	{
+		rcp = T(1.0f) / x;
+	}
+
+	if(finite)
+	{
+		constexpr int big = 0x7F7FFFFF;
+		rcp = Min(rcp, T((float &)big));
+	}
+
+	return rcp;
+}
+
+RValue<Float4> Rcp(RValue<Float4> x, Precision p, bool finite, bool exactAtPow2)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return DoRcp(x, p, finite, exactAtPow2);
+}
+
+RValue<Float> Rcp(RValue<Float> x, Precision p, bool finite, bool exactAtPow2)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return DoRcp(x, p, finite, exactAtPow2);
+}
+
 }  // namespace rr
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index a4a5f49..c9306c8 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -66,6 +66,16 @@
 
 namespace rr {
 
+// These generally map to the precision types as specified by the Vulkan specification.
+// See https://www.khronos.org/registry/vulkan/specs/1.2/html/chap37.html#spirvenv-precision-operation
+enum class Precision
+{
+	/*Exact,*/  // 0 ULP with correct rounding (i.e. Math.h)
+	Full,       // Single precision, but not relaxed
+	Relaxed,    // Single precision, relaxed
+	/*Half,*/   // Half precision
+};
+
 std::string BackendName();
 
 struct Capabilities
@@ -2155,8 +2165,11 @@
 RValue<Float> Abs(RValue<Float> x);
 RValue<Float> Max(RValue<Float> x, RValue<Float> y);
 RValue<Float> Min(RValue<Float> x, RValue<Float> y);
+// Deprecated: use Rcp
+// TODO(b/147516027): Remove when GLES frontend is removed
 RValue<Float> Rcp_pp(RValue<Float> val, bool exactAtPow2 = false);
 RValue<Float> RcpSqrt_pp(RValue<Float> val);
+RValue<Float> Rcp(RValue<Float> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false);
 RValue<Float> Sqrt(RValue<Float> x);
 
 //	RValue<Int4> IsInf(RValue<Float> x);
@@ -2319,8 +2332,12 @@
 RValue<Float4> Abs(RValue<Float4> x);
 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y);
 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y);
+
+// Deprecated: use Rcp
+// TODO(b/147516027): Remove when GLES frontend is removed
 RValue<Float4> Rcp_pp(RValue<Float4> val, bool exactAtPow2 = false);
 RValue<Float4> RcpSqrt_pp(RValue<Float4> val);
+RValue<Float4> Rcp(RValue<Float4> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false);
 RValue<Float4> Sqrt(RValue<Float4> x);
 RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i);
 RValue<Float> Extract(RValue<Float4> x, int i);
@@ -2372,13 +2389,6 @@
 RValue<Float4> Floor(RValue<Float4> x);
 RValue<Float4> Ceil(RValue<Float4> x);
 
-enum class Precision
-{
-	Full,
-	Relaxed,
-	//Half,
-};
-
 // Trigonometric functions
 // TODO: Currently unimplemented for Subzero.
 RValue<Float4> Sin(RValue<Float4> x);
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 21787b8..5622170 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -3932,6 +3932,26 @@
 	return Rcp_pp(Sqrt(x));
 }
 
+bool HasRcpApprox()
+{
+	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
+	return false;
+}
+
+RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
+{
+	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
+	UNREACHABLE("RValue<Float4> RcpApprox()");
+	return { 0.0f };
+}
+
+RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
+{
+	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
+	UNREACHABLE("RValue<Float> RcpApprox()");
+	return { 0.0f };
+}
+
 RValue<Float4> Sqrt(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();