Optimize reciprocal sqrt operation
This change deprecates rr::RcpSqrt_pp in favor of rr::RcpSqrt. As with Rcp,
RcpSqrt computes the result using Newton-Raphson if it's faster and the
initial approximation intrinsic is available on the current target.
Currently, only LLVM on Intel will use NR for RelaxedPrecision. Note
that passing in Precision::Relaxed will produce a faster, but less
precise reciprocal sqrt.
Also made it so that the SpirvShader instruction GLSLstd450InverseSqrt now
invokes RcpSqrt(x, Precision::Full) instead of performing 1/sqrt(x) when
the operand is not decorated with RelaxedPrecision.
Note that the Vulkan spec states that inversesqrt()'s precision is 2
ULP, and sqrt()'s precision is inherited from 1.0 / inversesqrt();
however, our rr::Sqrt is implemented in terms of x86's sqrt intrinsic on
x86, or as calls to sqrt from Math.h.
Bug: b/169760262
Change-Id: I65ba9a64d1db934c523dda11c1a2c186059d220b
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/51268
Commit-Queue: Antonio Maiorano <amaiorano@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Antonio Maiorano <amaiorano@google.com>
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index 4807ad1..7a01ffc 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -236,25 +236,7 @@
abs = Abs(abs);
}
- Float4 rsq;
-
- if(!pp)
- {
- rsq = Float4(1.0f) / Sqrt(abs);
- }
- else
- {
- rsq = RcpSqrt_pp(abs);
-
- if(!pp)
- {
- rsq = rsq * (Float4(3.0f) - rsq * rsq * abs) * Float4(0.5f);
- }
-
- rsq = As<Float4>(CmpNEQ(As<Int4>(abs), Int4(0x7F800000)) & As<Int4>(rsq));
- }
-
- return rsq;
+	// Reciprocal *square root* of abs (replaces the removed 1/Sqrt(abs) and
+	// RcpSqrt_pp paths); pp selects relaxed precision.
+	return RcpSqrt(abs, pp ? Precision::Relaxed : Precision::Full);
}
Float4 modulo(RValue<Float4> x, RValue<Float4> y)
diff --git a/src/Pipeline/SpirvShaderGLSLstd450.cpp b/src/Pipeline/SpirvShaderGLSLstd450.cpp
index 02281d1..1d82a5b 100644
--- a/src/Pipeline/SpirvShaderGLSLstd450.cpp
+++ b/src/Pipeline/SpirvShaderGLSLstd450.cpp
@@ -750,19 +750,10 @@
auto val = Operand(this, state, insn.word(5));
Decorations d;
ApplyDecorationsForId(&d, insn.word(5));
- if(d.RelaxedPrecision)
+
+ for(auto i = 0u; i < type.componentCount; i++)
{
- for(auto i = 0u; i < type.componentCount; i++)
- {
- dst.move(i, RcpSqrt_pp(val.Float(i)));
- }
- }
- else
- {
- for(auto i = 0u; i < type.componentCount; i++)
- {
- dst.move(i, SIMD::Float(1.0f) / Sqrt(val.Float(i)));
- }
+ dst.move(i, RcpSqrt(val.Float(i), d.RelaxedPrecision ? Precision::Relaxed : Precision::Full));
}
break;
}
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 0840845..2cfd9cd 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -2906,6 +2906,35 @@
#endif
}
+// Reports whether this backend provides RcpSqrtApprox().
+// True when compiled for x86 / x86-64, false on every other target.
+bool HasRcpSqrtApprox()
+{
+#if !defined(__i386__) && !defined(__x86_64__)
+	return false;
+#else
+	return true;
+#endif
+}
+
+// Approximate reciprocal square root of a 4-wide float vector.
+// On x86 / x86-64 this forwards to x86::rsqrtps(); on any other target the
+// function is not available and reaching it is reported via UNREACHABLE.
+RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
+{
+#if !defined(__i386__) && !defined(__x86_64__)
+	UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
+	return { 0.0f };
+#else
+	return x86::rsqrtps(x);
+#endif
+}
+
+// Approximate reciprocal square root of a scalar float.
+// On x86 / x86-64 this forwards to x86::rsqrtss(); on any other target the
+// function is not available and reaching it is reported via UNREACHABLE.
+RValue<Float> RcpSqrtApprox(RValue<Float> x)
+{
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::rsqrtss(x);
+#else
+	// The message names this (scalar) overload so a failure points at the right function.
+	UNREACHABLE("RValue<Float> RcpSqrtApprox() not available on this platform");
+	return { 0.0f };
+#endif
+}
+
RValue<Float> Sqrt(RValue<Float> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
diff --git a/src/Reactor/Reactor.cpp b/src/Reactor/Reactor.cpp
index 226792a..0853519 100644
--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp
@@ -4712,4 +4712,69 @@
return DoRcp(x, p, finite, exactAtPow2);
}
+// Functions implemented by backends
+// HasRcpSqrtApprox() reports whether the backend provides the RcpSqrtApprox()
+// overloads. Backends without the intrinsic define RcpSqrtApprox() as
+// UNREACHABLE, so it must only be called when HasRcpSqrtApprox() is true.
+bool HasRcpSqrtApprox();
+RValue<Float4> RcpSqrtApprox(RValue<Float4> x);
+RValue<Float> RcpSqrtApprox(RValue<Float> x);
+
+// Type trait mapping a Reactor floating-point type to the integer type used
+// when reinterpreting its bits with As<>(): Float -> Int, Float4 -> Int4.
+// The primary template is intentionally left undefined; only the
+// specializations below can be used.
+template<typename T>
+struct CastToIntType;
+
+// Scalar float reinterprets as scalar int.
+template<>
+struct CastToIntType<Float>
+{
+	using type = Int;
+};
+
+// 4-wide float vector reinterprets as 4-wide int vector.
+template<>
+struct CastToIntType<Float4>
+{
+	using type = Int4;
+};
+
+// TODO: move to Reactor.hpp?
+// Scalar integer "not equal" comparison that yields a bit mask:
+// all bits set (~0) when x != y, zero when they are equal.
+RValue<Int> CmpNEQ(RValue<Int> x, RValue<Int> y)
+{
+	const Int allBitsSet(~0);
+	const Int noBitsSet(0);
+
+	return IfThenElse(x != y, allBitsSet, noBitsSet);
+}
+
+// Shared implementation of RcpSqrt() for Float and Float4: computes 1 / sqrt(x).
+//
+// When the backend provides RcpSqrtApprox() and the approximation path is
+// selected, the result is built from that approximation; otherwise it is
+// computed as T(1.0f) / Sqrt(x).
+//
+//   x: input value(s).
+//   p: requested precision. With Precision::Full the approximation is refined
+//      by one Newton-Raphson step; otherwise it is returned unrefined.
+//
+// Note: on x86 the approximation path is only taken when p != Precision::Full,
+// so the refinement branch below is only reachable on other targets.
+template<typename T>
+static RValue<T> DoRcpSqrt(RValue<T> x, Precision p)
+{
+	// RcpSqrtApprox() may only be called when the backend reports support for
+	// it, so guard on HasRcpSqrtApprox() rather than HasRcpApprox().
+#if defined(__i386__) || defined(__x86_64__)  // On x86, 1/sqrt(x) is fast enough, except for lower precision
+	bool approx = HasRcpSqrtApprox() && (p != Precision::Full);
+#else
+	bool approx = HasRcpSqrtApprox();
+#endif
+
+	if(approx)
+	{
+		using IntType = typename CastToIntType<T>::type;
+
+		T rsq = RcpSqrtApprox(x);
+
+		if(p == Precision::Full)
+		{
+			// One Newton-Raphson iteration: rsq' = rsq * (3 - rsq^2 * x) / 2
+			rsq = rsq * (T(3.0f) - rsq * rsq * x) * T(0.5f);
+
+			// Force the result to zero where x has the bit pattern 0x7F800000 (+infinity).
+			// NOTE(review): presumably needed because the refinement step would
+			// otherwise evaluate 0 * inf and yield NaN for that input - confirm.
+			rsq = As<T>(CmpNEQ(As<IntType>(x), IntType(0x7F800000)) & As<IntType>(rsq));
+		}
+
+		return rsq;
+	}
+	else
+	{
+		return T(1.0f) / Sqrt(x);
+	}
+}
+
+// Reciprocal square root of a 4-wide float vector at precision p.
+// Forwards to the shared DoRcpSqrt implementation.
+RValue<Float4> RcpSqrt(RValue<Float4> x, Precision p)
+{
+	return DoRcpSqrt<Float4>(x, p);
+}
+
+// Reciprocal square root of a scalar float at precision p.
+// Forwards to the shared DoRcpSqrt implementation.
+RValue<Float> RcpSqrt(RValue<Float> x, Precision p)
+{
+	return DoRcpSqrt<Float>(x, p);
+}
+
} // namespace rr
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index c9306c8..fbaceaa 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -2168,8 +2168,11 @@
// Deprecated: use Rcp
// TODO(b/147516027): Remove when GLES frontend is removed
RValue<Float> Rcp_pp(RValue<Float> val, bool exactAtPow2 = false);
+// Deprecated: use RcpSqrt
+// TODO(b/147516027): Remove when GLES frontend is removed
RValue<Float> RcpSqrt_pp(RValue<Float> val);
RValue<Float> Rcp(RValue<Float> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false);
+// Reciprocal square root of x at the requested precision (defaults to Precision::Full).
+// Precision::Relaxed produces a faster, but less precise result.
+RValue<Float> RcpSqrt(RValue<Float> x, Precision p = Precision::Full);
RValue<Float> Sqrt(RValue<Float> x);
// RValue<Int4> IsInf(RValue<Float> x);
@@ -2336,8 +2339,11 @@
// Deprecated: use Rcp
// TODO(b/147516027): Remove when GLES frontend is removed
RValue<Float4> Rcp_pp(RValue<Float4> val, bool exactAtPow2 = false);
+// Deprecated: use RcpSqrt
+// TODO(b/147516027): Remove when GLES frontend is removed
RValue<Float4> RcpSqrt_pp(RValue<Float4> val);
RValue<Float4> Rcp(RValue<Float4> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false);
+// Per-component reciprocal square root of x at the requested precision (defaults to Precision::Full).
+// Precision::Relaxed produces a faster, but less precise result.
+RValue<Float4> RcpSqrt(RValue<Float4> x, Precision p = Precision::Full);
RValue<Float4> Sqrt(RValue<Float4> x);
RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i);
RValue<Float> Extract(RValue<Float4> x, int i);
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 5622170..bcc359f 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -3952,6 +3952,25 @@
return { 0.0f };
}
+// Reports whether this backend provides RcpSqrtApprox().
+// Always false for Subzero: both RcpSqrtApprox() overloads in this file are UNREACHABLE.
+bool HasRcpSqrtApprox()
+{
+ return false;
+}
+
+// Approximate reciprocal square root of a 4-wide float vector.
+// Not implemented for this backend: reaching this function is reported via
+// UNREACHABLE; the return statement that follows only provides a value of the
+// declared return type.
+RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
+{
+ // TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
+ UNREACHABLE("RValue<Float4> RcpSqrtApprox()");
+ return { 0.0f };
+}
+
+// Approximate reciprocal square root of a scalar float.
+// Not implemented for this backend: reaching this function is reported via
+// UNREACHABLE; the return statement that follows only provides a value of the
+// declared return type.
+RValue<Float> RcpSqrtApprox(RValue<Float> x)
+{
+ // TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
+ UNREACHABLE("RValue<Float> RcpSqrtApprox()");
+ return { 0.0f };
+}
+
RValue<Float4> Sqrt(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();