Implement MulAdd() which may perform FMA
rr::MulAdd(x, y, z) computes `x * y + z`, which may be performed as a
single fused operation to produce a higher-precision result. AVX2 features
FMA instructions which perform a Fused Multiply-Add using just one
micro-operation, reducing latency compared to separate multiplication
and addition.
This corresponds directly with LLVM's fmuladd intrinsic:
https://llvm.org/docs/LangRef.html#llvm-fmuladd-intrinsic
The LLVM backend detects whether FMA is available and more efficient.
Subzero currently does not support AVX2, so it always implements MulAdd()
as just `x * y + z`.
Bug: b/214591655
Change-Id: Ia5e0d0adc0933e212a2f3f289c2cb75ecdea37cc
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/61848
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 9cadafc..b2faab9 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -3159,6 +3159,12 @@
storeValue(replicate);
}
+RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
+{
+ auto *fmuladd = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fmuladd, { T(Float4::type()) });
+ return RValue<Float4>(V(jit->builder->CreateCall(fmuladd, { V(x.value()), V(y.value()), V(z.value()) })));
+}
+
RValue<Float4> Abs(RValue<Float4> x)
{
auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fabs, { V(x.value())->getType() });
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index 3b3d08b..f31304c 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -2343,6 +2343,9 @@
RValue<Float4> operator+(RValue<Float4> val);
RValue<Float4> operator-(RValue<Float4> val);
+// Computes `x * y + z`, which may be fused into one operation to produce a higher-precision result.
+RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z);
+
RValue<Float4> Abs(RValue<Float4> x);
RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y);
RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y);
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index cfd3d3a..5c2e7e7 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -3934,6 +3934,12 @@
storeValue(replicate);
}
+RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
+{
+ // Separate multiply and add (not fused). TODO(b/214591655): Use FMA when available.
+ return (x * y) + z;
+}
+
RValue<Float4> Abs(RValue<Float4> x)
{
// TODO: Optimize.