Support specifying math precision through a template argument

The use of a Boolean parameter to select relaxed precision is fine for
functions such as Asin(), which are only called from the SpirvShader
code and are passed a self-describing d.RelaxedPrecision argument.
However, functions like Sqrt() are also used in places like the
VK_BLEND_OP_SOFTLIGHT_EXT implementation, and Pow() is used in sRGB
conversion, where there is no concept of relaxed precision decorations
and passing in a bare true or false would hurt readability.

To create a reasonably elegant syntax, a Precision enum with Highp and
Mediump enumerators has been defined, and these can be used as a
template argument for these functions, e.g. Sqrt<Mediump>(x).

The Vulkan spec states for VK_EXT_blend_operation_advanced that the
"blending precision may be limited to 16-bit floating-point", so we can
use Sqrt<Mediump>() there. Likewise, for sRGB conversion Pow<Mediump>()
suffices, since this non-linear fixed-point encoding only makes sense
for relatively low bit width color components.

Bug: b/222218659
Change-Id: Id3bc4fa68f38574ff23b125befbac68072d39ee1
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/63768
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index 5a4a9ba..24e90bd 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp
@@ -1487,7 +1487,7 @@
 Float4 Blitter::LinearToSRGB(const Float4 &c)
 {
 	Float4 lc = Min(c, 0.0031308f) * 12.92f;
-	Float4 ec = Float4(1.055f) * sw::Pow(c, (1.0f / 2.4f)) - 0.055f;
+	Float4 ec = Float4(1.055f) * Pow<Mediump>(c, (1.0f / 2.4f)) - 0.055f;  // TODO(b/149574741): Use a custom approximation.
 
 	Float4 s = c;
 	s.xyz = Max(lc, ec);
@@ -1498,7 +1498,7 @@
 Float4 Blitter::sRGBtoLinear(const Float4 &c)
 {
 	Float4 lc = c * (1.0f / 12.92f);
-	Float4 ec = sw::Pow((c + 0.055f) * (1.0f / 1.055f), 2.4f);
+	Float4 ec = Pow<Mediump>((c + 0.055f) * (1.0f / 1.055f), 2.4f);  // TODO(b/149574741): Use a custom approximation.
 
 	Int4 linear = CmpLT(c, 0.04045f);
 
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 6788340..fb3172d 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -2027,7 +2027,7 @@
 	return As<Float4>(
 	    (~largeSrc & As<Int4>(dst - ((1.0f - (2.0f * src)) * dst * (1.0f - dst)))) |
 	    (largeSrc & ((~largeDst & As<Int4>(dst + (((2.0f * src) - 1.0f) * dst * ((((16.0f * dst) - 12.0f) * dst) + 3.0f)))) |
-	                 (largeDst & As<Int4>(dst + (((2.0f * src) - 1.0f) * (Sqrt(dst) - dst)))))));
+	                 (largeDst & As<Int4>(dst + (((2.0f * src) - 1.0f) * (Sqrt<Mediump>(dst) - dst)))))));
 }
 
 Float4 PixelRoutine::maxRGB(Vector4f &c)
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index 01ec942..dee7a30 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -402,7 +402,7 @@
 	return 6.93147181e-1f * sw::Log2(x);  // ln(2)
 }
 
-Float4 Pow(RValue<Float4> x, RValue<Float4> y)
+Float4 Pow(RValue<Float4> x, RValue<Float4> y, bool relaxedPrecision)
 {
 	Float4 log = sw::Log2(x);
 	log *= y;
@@ -441,6 +441,11 @@
 	return sw::Log((1.0f + x) / (1.0f - x)) * 0.5f;
 }
 
+RValue<Float4> Sqrt(RValue<Float4> x, bool relaxedPrecision)
+{
+	return rr::Sqrt(x);  // TODO(b/222218659): Optimize for relaxed precision.
+}
+
 Float4 reciprocal(RValue<Float4> x, bool pp, bool exactAtPow2)
 {
 	return Rcp(x, pp, exactAtPow2);
diff --git a/src/Pipeline/ShaderCore.hpp b/src/Pipeline/ShaderCore.hpp
index e33c4c0..7f7251d 100644
--- a/src/Pipeline/ShaderCore.hpp
+++ b/src/Pipeline/ShaderCore.hpp
@@ -195,13 +195,33 @@
 Float4 Log2(RValue<Float4> x);
 Float4 Exp(RValue<Float4> x);
 Float4 Log(RValue<Float4> x);
-Float4 Pow(RValue<Float4> x, RValue<Float4> y);
+Float4 Pow(RValue<Float4> x, RValue<Float4> y, bool relaxedPrecision);
 Float4 Sinh(RValue<Float4> x);
 Float4 Cosh(RValue<Float4> x);
 Float4 Tanh(RValue<Float4> x);
 Float4 Asinh(RValue<Float4> x);
 Float4 Acosh(RValue<Float4> x);
 Float4 Atanh(RValue<Float4> x);
+RValue<Float4> Sqrt(RValue<Float4> x, bool relaxedPrecision);
+
+// Math functions with uses outside of shaders can be invoked using a verbose template argument instead
+// of a Boolean argument to indicate precision. For example Sqrt<Mediump>(x) equals Sqrt(x, true).
+enum Precision
+{
+	Highp,
+	Relaxed,
+	Mediump = Relaxed,  // GLSL defines mediump and lowp as corresponding with SPIR-V's RelaxedPrecision
+};
+
+// clang-format off
+template<Precision precision> RValue<Float4> Sqrt(RValue<Float4> x);
+template<> inline RValue<Float4> Sqrt<Highp>(RValue<Float4> x) { return Sqrt(x, false); }
+template<> inline RValue<Float4> Sqrt<Mediump>(RValue<Float4> x) { return Sqrt(x, true); }
+
+template<Precision precision> RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
+template<> inline RValue<Float4> Pow<Highp>(RValue<Float4> x, RValue<Float4> y) { return Pow(x, y, false); }
+template<> inline RValue<Float4> Pow<Mediump>(RValue<Float4> x, RValue<Float4> y) { return Pow(x, y, true); }
+// clang-format on
 
 Float4 reciprocal(RValue<Float4> x, bool pp = false, bool exactAtPow2 = false);
 Float4 reciprocalSquareRoot(RValue<Float4> x, bool abs, bool pp = false);
diff --git a/src/Pipeline/SpirvShaderGLSLstd450.cpp b/src/Pipeline/SpirvShaderGLSLstd450.cpp
index 87205ae..a58ea03 100644
--- a/src/Pipeline/SpirvShaderGLSLstd450.cpp
+++ b/src/Pipeline/SpirvShaderGLSLstd450.cpp
@@ -714,9 +714,11 @@
 		{
 			auto x = Operand(this, state, insn.word(5));
 			auto y = Operand(this, state, insn.word(6));
+			Decorations d = GetDecorationsForId(insn.resultId());
+
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Pow(x.Float(i), y.Float(i)));
+				dst.move(i, sw::Pow(x.Float(i), y.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
diff --git a/src/Pipeline/SpirvShaderImage.cpp b/src/Pipeline/SpirvShaderImage.cpp
index bc01ab6..76ac670 100644
--- a/src/Pipeline/SpirvShaderImage.cpp
+++ b/src/Pipeline/SpirvShaderImage.cpp
@@ -23,6 +23,8 @@
 
 namespace {
 
+using namespace sw;
+
 vk::Format SpirvFormatToVulkanFormat(spv::ImageFormat format)
 {
 	switch(format)
@@ -74,14 +76,14 @@
 	}
 }
 
-sw::SIMD::Float sRGBtoLinear(sw::SIMD::Float c)
+SIMD::Float sRGBtoLinear(SIMD::Float c)
 {
-	sw::SIMD::Float lc = c * (1.0f / 12.92f);
-	sw::SIMD::Float ec = sw::Pow((c + 0.055f) * (1.0f / 1.055f), 2.4f);  // TODO(b/149574741): Use an optimized approximation.
+	SIMD::Float lc = c * (1.0f / 12.92f);
+	SIMD::Float ec = Pow<Mediump>((c + 0.055f) * (1.0f / 1.055f), 2.4f);  // TODO(b/149574741): Use a custom approximation.
 
-	sw::SIMD::Int linear = CmpLT(c, 0.04045f);
+	SIMD::Int linear = CmpLT(c, 0.04045f);
 
-	return rr::As<sw::SIMD::Float>((linear & rr::As<sw::SIMD::Int>(lc)) | (~linear & rr::As<sw::SIMD::Int>(ec)));  // TODO: IfThenElse()
+	return rr::As<SIMD::Float>((linear & rr::As<SIMD::Int>(lc)) | (~linear & rr::As<SIMD::Int>(ec)));  // TODO: IfThenElse()
 }
 
 }  // anonymous namespace
diff --git a/tests/PipelineBenchmarks/PipelineBenchmarks.cpp b/tests/PipelineBenchmarks/PipelineBenchmarks.cpp
index 8b5803d..2d063c4 100644
--- a/tests/PipelineBenchmarks/PipelineBenchmarks.cpp
+++ b/tests/PipelineBenchmarks/PipelineBenchmarks.cpp
@@ -20,6 +20,7 @@
 #include <vector>
 
 using namespace rr;
+using namespace sw;
 
 BENCHMARK_MAIN();
 
@@ -106,11 +107,11 @@
 BENCHMARK_CAPTURE(Transcendental1, sw_Tan, sw::Tan)->Arg(REPS);
 
 BENCHMARK_CAPTURE(Transcendental1, rr_Asin, rr::Asin)->Arg(REPS);
-BENCHMARK_CAPTURE(Transcendental1, sw_Asin_highpp, sw::Asin, false /* relaxedPrecision */)->Arg(REPS);
-BENCHMARK_CAPTURE(Transcendental1, sw_Asin_relaxedp, sw::Asin, true /* relaxedPrecision */)->Arg(REPS);
+BENCHMARK_CAPTURE(Transcendental1, sw_Asin_highp, sw::Asin, false /* relaxedPrecision */)->Arg(REPS);
+BENCHMARK_CAPTURE(Transcendental1, sw_Asin_mediump, sw::Asin, true /* relaxedPrecision */)->Arg(REPS);
 BENCHMARK_CAPTURE(Transcendental1, rr_Acos, rr::Acos)->Arg(REPS);
 BENCHMARK_CAPTURE(Transcendental1, sw_Acos_highp, sw::Acos, false /* relaxedPrecision */)->Arg(REPS);
-BENCHMARK_CAPTURE(Transcendental1, sw_Acos_relaxedp, sw::Acos, true /* relaxedPrecision */)->Arg(REPS);
+BENCHMARK_CAPTURE(Transcendental1, sw_Acos_mediump, sw::Acos, true /* relaxedPrecision */)->Arg(REPS);
 
 BENCHMARK_CAPTURE(Transcendental1, rr_Atan, rr::Atan)->Arg(REPS);
 BENCHMARK_CAPTURE(Transcendental1, sw_Atan, sw::Atan)->Arg(REPS);
@@ -131,7 +132,8 @@
 BENCHMARK_CAPTURE(Transcendental2, sw_Atan2, sw::Atan2)->Arg(REPS);
 
 BENCHMARK_CAPTURE(Transcendental2, rr_Pow, rr::Pow)->Arg(REPS);
-BENCHMARK_CAPTURE(Transcendental2, sw_Pow, sw::Pow)->Arg(REPS);
+BENCHMARK_CAPTURE(Transcendental2, sw_Pow_highp, sw::Pow<Highp>)->Arg(REPS);
+BENCHMARK_CAPTURE(Transcendental2, sw_Pow_mediump, sw::Pow<Mediump>)->Arg(REPS);
 BENCHMARK_CAPTURE(Transcendental1, rr_Exp, rr::Exp)->Arg(REPS);
 BENCHMARK_CAPTURE(Transcendental1, sw_Exp, sw::Exp)->Arg(REPS);
 BENCHMARK_CAPTURE(Transcendental1, rr_Log, rr::Log)->Arg(REPS);