Optimize reciprocal sqrt operation

This change deprecates rr::RcpSqrt_pp in favor of rr::RcpSqrt. As with Rcp,
RcpSqrt computes the result using Newton-Raphson if it's faster and the
initial approximation intrinsic is available on the current target.
Currently, only LLVM on Intel will use NR for RelaxedPrecision. Note
that passing in Precision::Relaxed will produce a faster, but less
precise reciprocal sqrt.

Also made it so that the SpirvShader instruction GLSLstd450InverseSqrt now
invokes RcpSqrt(x, Precision::Full) instead of performing 1/sqrt(x).
Note that the Vulkan spec states that inversesqrt()'s precision is 2
ULP, and sqrt()'s precision is inherited from 1.0 / inversesqrt();
however, our rr::Sqrt is implemented in terms of x86's sqrt intrinsic on
x86, or as calls to sqrt from Math.h.

Bug: b/169760262
Change-Id: I65ba9a64d1db934c523dda11c1a2c186059d220b
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/51268
Commit-Queue: Antonio Maiorano <amaiorano@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Antonio Maiorano <amaiorano@google.com>
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index 4807ad1..7a01ffc 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -236,25 +236,7 @@
 		abs = Abs(abs);
 	}
 
-	Float4 rsq;
-
-	if(!pp)
-	{
-		rsq = Float4(1.0f) / Sqrt(abs);
-	}
-	else
-	{
-		rsq = RcpSqrt_pp(abs);
-
-		if(!pp)
-		{
-			rsq = rsq * (Float4(3.0f) - rsq * rsq * abs) * Float4(0.5f);
-		}
-
-		rsq = As<Float4>(CmpNEQ(As<Int4>(abs), Int4(0x7F800000)) & As<Int4>(rsq));
-	}
-
-	return rsq;
+	return RcpSqrt(abs, pp ? Precision::Relaxed : Precision::Full);  // reciprocal *square root*, matching the removed 1/Sqrt(abs) and RcpSqrt_pp(abs) paths
 }
 
 Float4 modulo(RValue<Float4> x, RValue<Float4> y)
diff --git a/src/Pipeline/SpirvShaderGLSLstd450.cpp b/src/Pipeline/SpirvShaderGLSLstd450.cpp
index 02281d1..1d82a5b 100644
--- a/src/Pipeline/SpirvShaderGLSLstd450.cpp
+++ b/src/Pipeline/SpirvShaderGLSLstd450.cpp
@@ -750,19 +750,10 @@
 			auto val = Operand(this, state, insn.word(5));
 			Decorations d;
 			ApplyDecorationsForId(&d, insn.word(5));
-			if(d.RelaxedPrecision)
+
+			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				for(auto i = 0u; i < type.componentCount; i++)
-				{
-					dst.move(i, RcpSqrt_pp(val.Float(i)));
-				}
-			}
-			else
-			{
-				for(auto i = 0u; i < type.componentCount; i++)
-				{
-					dst.move(i, SIMD::Float(1.0f) / Sqrt(val.Float(i)));
-				}
+				dst.move(i, RcpSqrt(val.Float(i), d.RelaxedPrecision ? Precision::Relaxed : Precision::Full));
 			}
 			break;
 		}
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 0840845..2cfd9cd 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -2906,6 +2906,35 @@
 #endif
 }
 
+bool HasRcpSqrtApprox()  // Reports whether RcpSqrtApprox() is usable in this build: true only when compiled for x86 (32- or 64-bit)
+{
+#if defined(__i386__) || defined(__x86_64__)
+	return true;
+#else
+	return false;
+#endif
+}
+
+RValue<Float4> RcpSqrtApprox(RValue<Float4> x)  // 4-wide approximate reciprocal sqrt via x86::rsqrtps; hits UNREACHABLE on non-x86 builds
+{
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::rsqrtps(x);
+#else
+	UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
+	return { 0.0f };
+#endif
+}
+
+RValue<Float> RcpSqrtApprox(RValue<Float> x)  // Scalar approximate reciprocal sqrt via x86::rsqrtss; hits UNREACHABLE on non-x86 builds
+{
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::rsqrtss(x);
+#else
+	UNREACHABLE("RValue<Float> RcpSqrtApprox() not available on this platform");
+	return { 0.0f };
+#endif
+}
+
 RValue<Float> Sqrt(RValue<Float> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
diff --git a/src/Reactor/Reactor.cpp b/src/Reactor/Reactor.cpp
index 226792a..0853519 100644
--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp
@@ -4712,4 +4712,69 @@
 	return DoRcp(x, p, finite, exactAtPow2);
 }
 
+// Functions implemented by backends. The backends' RcpSqrtApprox() overloads hit UNREACHABLE wherever HasRcpSqrtApprox() returns false.
+bool HasRcpSqrtApprox();
+RValue<Float4> RcpSqrtApprox(RValue<Float4> x);
+RValue<Float> RcpSqrtApprox(RValue<Float> x);
+
+template<typename T>  // Trait mapping a Reactor float type to its integer counterpart (Float4 -> Int4, Float -> Int)
+struct CastToIntType;
+
+template<>
+struct CastToIntType<Float4>
+{
+	using type = Int4;
+};
+
+template<>
+struct CastToIntType<Float>
+{
+	using type = Int;
+};
+
+// Scalar Int compare: yields an all-ones mask (~0) when x != y, otherwise 0. TODO: move to Reactor.hpp?
+RValue<Int> CmpNEQ(RValue<Int> x, RValue<Int> y)
+{
+	return IfThenElse(x != y, Int(~0), Int(0));
+}
+
+template<typename T>
+static RValue<T> DoRcpSqrt(RValue<T> x, Precision p)  // Shared 1/sqrt(x) for Float and Float4: approximation intrinsic when available and worthwhile, else 1/Sqrt(x)
+{
+#if defined(__i386__) || defined(__x86_64__)  // On x86, 1/sqrt(x) is fast enough, except for lower precision
+	bool approx = HasRcpSqrtApprox() && (p != Precision::Full);
+#else
+	bool approx = HasRcpSqrtApprox();
+#endif
+
+	if(approx)
+	{
+		using IntType = typename CastToIntType<T>::type;
+
+		T rsq = RcpSqrtApprox(x);
+
+		if(p == Precision::Full)
+		{
+			rsq = rsq * (T(3.0f) - rsq * rsq * x) * T(0.5f);  // one Newton-Raphson refinement step
+			rsq = As<T>(CmpNEQ(As<IntType>(x), IntType(0x7F800000)) & As<IntType>(rsq));  // 0x7F800000 is +inf: zero all result bits when x is +inf
+		}
+
+		return rsq;
+	}
+	else
+	{
+		return T(1.0f) / Sqrt(x);
+	}
+}
+
+RValue<Float4> RcpSqrt(RValue<Float4> x, Precision p)  // Float4 entry point; forwards to the shared DoRcpSqrt template
+{
+	return DoRcpSqrt(x, p);
+}
+
+RValue<Float> RcpSqrt(RValue<Float> x, Precision p)  // Scalar Float entry point; forwards to the shared DoRcpSqrt template
+{
+	return DoRcpSqrt(x, p);
+}
+
 }  // namespace rr
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index c9306c8..fbaceaa 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -2168,8 +2168,11 @@
 // Deprecated: use Rcp
 // TODO(b/147516027): Remove when GLES frontend is removed
 RValue<Float> Rcp_pp(RValue<Float> val, bool exactAtPow2 = false);
+// Deprecated: use RcpSqrt
+// TODO(b/147516027): Remove when GLES frontend is removed
 RValue<Float> RcpSqrt_pp(RValue<Float> val);
 RValue<Float> Rcp(RValue<Float> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false);
+RValue<Float> RcpSqrt(RValue<Float> x, Precision p = Precision::Full);
 RValue<Float> Sqrt(RValue<Float> x);
 
 //	RValue<Int4> IsInf(RValue<Float> x);
@@ -2336,8 +2339,11 @@
 // Deprecated: use Rcp
 // TODO(b/147516027): Remove when GLES frontend is removed
 RValue<Float4> Rcp_pp(RValue<Float4> val, bool exactAtPow2 = false);
+// Deprecated: use RcpSqrt
+// TODO(b/147516027): Remove when GLES frontend is removed
 RValue<Float4> RcpSqrt_pp(RValue<Float4> val);
 RValue<Float4> Rcp(RValue<Float4> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false);
+RValue<Float4> RcpSqrt(RValue<Float4> x, Precision p = Precision::Full);
 RValue<Float4> Sqrt(RValue<Float4> x);
 RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i);
 RValue<Float> Extract(RValue<Float4> x, int i);
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 5622170..bcc359f 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -3952,6 +3952,25 @@
 	return { 0.0f };
 }
 
+bool HasRcpSqrtApprox()  // Always false for Subzero: the RcpSqrtApprox() overloads below are unimplemented placeholders
+{
+	return false;
+}
+
+RValue<Float4> RcpSqrtApprox(RValue<Float4> x)  // Placeholder: hits UNREACHABLE if called; HasRcpSqrtApprox() is false for this backend
+{
+	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
+	UNREACHABLE("RValue<Float4> RcpSqrtApprox()");
+	return { 0.0f };
+}
+
+RValue<Float> RcpSqrtApprox(RValue<Float> x)  // Placeholder: hits UNREACHABLE if called; HasRcpSqrtApprox() is false for this backend
+{
+	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
+	UNREACHABLE("RValue<Float> RcpSqrtApprox()");
+	return { 0.0f };
+}
+
 RValue<Float4> Sqrt(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();