Optimize reciprocal operation
This change deprecates rr::Rcp_pp with rr::Rcp, which makes sure to
correctly compute the reciprocal using the Newton-Rhapson refinement
only if the current target supports the required instrinsic, otherwise
using 1 / x. Currently, only LLVM on Intel will use NR. Note that
passing in Precision::Relaxed will produce a faster, but less precise
reciprocal.
Also removed PixelProgram::linearToSRGB as it's unused.
Bug: b/169760262
Bug: b/149574741
Change-Id: I4a2f943aa60116c4397d7a8ae18583a260824788
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/50648
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Antonio Maiorano <amaiorano@google.com>
Commit-Queue: Antonio Maiorano <amaiorano@google.com>
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index 16aebec..0deffa9 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -373,12 +373,4 @@
}
}
-Float4 PixelProgram::linearToSRGB(const Float4 &x) // Approximates x^(1.0/2.2)
-{
- Float4 sqrtx = Rcp_pp(RcpSqrt_pp(x));
- Float4 sRGB = sqrtx * Float4(1.14f) - x * Float4(0.14f);
-
- return Min(Max(sRGB, Float4(0.0f)), Float4(1.0f));
-}
-
} // namespace sw
diff --git a/src/Pipeline/PixelProgram.hpp b/src/Pipeline/PixelProgram.hpp
index 59994fb..40f8b93 100644
--- a/src/Pipeline/PixelProgram.hpp
+++ b/src/Pipeline/PixelProgram.hpp
@@ -48,7 +48,6 @@
Int4 maskAny(Int cMask[4]) const;
Int4 maskAny(Int cMask[4], Int sMask[4], Int zMask[4]) const;
- Float4 linearToSRGB(const Float4 &x);
};
} // namespace sw
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index d9e50c3..3b4488b 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -133,7 +133,7 @@
WWWW += *Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]);
}
- WWWW = Rcp_pp(WWWW);
+ WWWW = Rcp(WWWW, Precision::Relaxed);
XXXX *= WWWW;
YYYY *= WWWW;
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index f38ea35..2a80211 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -1226,10 +1226,10 @@
uDelta = As<Float4>((As<Int4>(dudx) & mask) | ((As<Int4>(dudy) & ~mask)));
vDelta = As<Float4>((As<Int4>(dvdx) & mask) | ((As<Int4>(dvdy) & ~mask)));
- anisotropy = lod * Rcp_pp(det);
+ anisotropy = lod * Rcp(det, Precision::Relaxed);
anisotropy = Min(anisotropy, state.maxAnisotropy);
- lod *= Rcp_pp(anisotropy * anisotropy);
+ lod *= Rcp(anisotropy * anisotropy, Precision::Relaxed);
}
lod = log2sqrt(lod); // log2(sqrt(lod))
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index b010131..4807ad1 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -224,22 +224,7 @@
Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2)
{
- //return Float4(1.0f) / x;
-
- Float4 rcp = Rcp_pp(x, exactAtPow2);
-
- if(!pp)
- {
- rcp = (rcp + rcp) - (x * rcp * rcp);
- }
-
- if(finite)
- {
- int big = 0x7F7FFFFF;
- rcp = Min(rcp, Float4((float &)big));
- }
-
- return rcp;
+ return Rcp(x, pp ? Precision::Relaxed : Precision::Full, finite, exactAtPow2);
}
Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index d4030d6..0840845 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -2865,6 +2865,47 @@
#endif
}
+bool HasRcpApprox()
+{
+#if defined(__i386__) || defined(__x86_64__)
+ return true;
+#else
+ return false;
+#endif
+}
+
+RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
+{
+#if defined(__i386__) || defined(__x86_64__)
+ if(exactAtPow2)
+ {
+ // rcpps uses a piecewise-linear approximation which minimizes the relative error
+ // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+ return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+ }
+ return x86::rcpps(x);
+#else
+ UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
+ return { 0.0f };
+#endif
+}
+
+RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
+{
+#if defined(__i386__) || defined(__x86_64__)
+ if(exactAtPow2)
+ {
+ // rcpss uses a piecewise-linear approximation which minimizes the relative error
+ // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+ return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+ }
+ return x86::rcpss(x);
+#else
+ UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
+ return { 0.0f };
+#endif
+}
+
RValue<Float> Sqrt(RValue<Float> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
diff --git a/src/Reactor/Reactor.cpp b/src/Reactor/Reactor.cpp
index 80b9612..226792a 100644
--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp
@@ -14,6 +14,7 @@
#include "Reactor.hpp"
+#include "CPUID.hpp"
#include "Debug.hpp"
#include "Print.hpp"
@@ -4659,4 +4660,56 @@
#endif // ENABLE_RR_PRINT
+// Functions implemented by backends
+bool HasRcpApprox();
+RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2 = false);
+RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2 = false);
+
+template<typename T>
+static RValue<T> DoRcp(RValue<T> x, Precision p, bool finite, bool exactAtPow2)
+{
+#if defined(__i386__) || defined(__x86_64__) // On x86, 1/x is fast enough, except for lower precision
+ bool approx = HasRcpApprox() && (p != Precision::Full);
+#else
+ bool approx = HasRcpApprox();
+#endif
+
+ T rcp;
+
+ if(approx)
+ {
+ rcp = RcpApprox(x, exactAtPow2);
+
+ if(p == Precision::Full)
+ {
+ // Perform one more iteration of Newton-Rhapson division to increase precision
+ rcp = (rcp + rcp) - (x * rcp * rcp);
+ }
+ }
+ else
+ {
+ rcp = T(1.0f) / x;
+ }
+
+ if(finite)
+ {
+ constexpr int big = 0x7F7FFFFF;
+ rcp = Min(rcp, T((float &)big));
+ }
+
+ return rcp;
+}
+
+RValue<Float4> Rcp(RValue<Float4> x, Precision p, bool finite, bool exactAtPow2)
+{
+ RR_DEBUG_INFO_UPDATE_LOC();
+ return DoRcp(x, p, finite, exactAtPow2);
+}
+
+RValue<Float> Rcp(RValue<Float> x, Precision p, bool finite, bool exactAtPow2)
+{
+ RR_DEBUG_INFO_UPDATE_LOC();
+ return DoRcp(x, p, finite, exactAtPow2);
+}
+
} // namespace rr
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index a4a5f49..c9306c8 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -66,6 +66,16 @@
namespace rr {
+// These generally map to the precision types as specified by the Vulkan specification.
+// See https://www.khronos.org/registry/vulkan/specs/1.2/html/chap37.html#spirvenv-precision-operation
+enum class Precision
+{
+ /*Exact,*/ // 0 ULP with correct rounding (i.e. Math.h)
+ Full, // Single precision, but not relaxed
+ Relaxed, // Single precision, relaxed
+ /*Half,*/ // Half precision
+};
+
std::string BackendName();
struct Capabilities
@@ -2155,8 +2165,11 @@
RValue<Float> Abs(RValue<Float> x);
RValue<Float> Max(RValue<Float> x, RValue<Float> y);
RValue<Float> Min(RValue<Float> x, RValue<Float> y);
+// Deprecated: use Rcp
+// TODO(b/147516027): Remove when GLES frontend is removed
RValue<Float> Rcp_pp(RValue<Float> val, bool exactAtPow2 = false);
RValue<Float> RcpSqrt_pp(RValue<Float> val);
+RValue<Float> Rcp(RValue<Float> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false);
RValue<Float> Sqrt(RValue<Float> x);
// RValue<Int4> IsInf(RValue<Float> x);
@@ -2319,8 +2332,12 @@
RValue<Float4> Abs(RValue<Float4> x);
RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y);
RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y);
+
+// Deprecated: use Rcp
+// TODO(b/147516027): Remove when GLES frontend is removed
RValue<Float4> Rcp_pp(RValue<Float4> val, bool exactAtPow2 = false);
RValue<Float4> RcpSqrt_pp(RValue<Float4> val);
+RValue<Float4> Rcp(RValue<Float4> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false);
RValue<Float4> Sqrt(RValue<Float4> x);
RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i);
RValue<Float> Extract(RValue<Float4> x, int i);
@@ -2372,13 +2389,6 @@
RValue<Float4> Floor(RValue<Float4> x);
RValue<Float4> Ceil(RValue<Float4> x);
-enum class Precision
-{
- Full,
- Relaxed,
- //Half,
-};
-
// Trigonometric functions
// TODO: Currently unimplemented for Subzero.
RValue<Float4> Sin(RValue<Float4> x);
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 21787b8..5622170 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -3932,6 +3932,26 @@
return Rcp_pp(Sqrt(x));
}
+bool HasRcpApprox()
+{
+ // TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
+ return false;
+}
+
+RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
+{
+ // TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
+ UNREACHABLE("RValue<Float4> RcpApprox()");
+ return { 0.0f };
+}
+
+RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
+{
+ // TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
+ UNREACHABLE("RValue<Float> RcpApprox()");
+ return { 0.0f };
+}
+
RValue<Float4> Sqrt(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();