Implement MulAdd() which may perform FMA. rr::MulAdd(x, y, z) computes `x * y + z`, which may be fused into a single operation to produce a higher-precision result. AVX2 features FMA instructions which perform a fused multiply-add using just one micro-operation, reducing latency compared to separate multiplication and addition. This corresponds directly with LLVM's fmuladd intrinsic: https://llvm.org/docs/LangRef.html#llvm-fmuladd-intrinsic The backend detects whether FMA is available and more efficient. Subzero currently does not support AVX2, so it always implements MulAdd() as just `x * y + z`. Bug: b/214591655 Change-Id: Ia5e0d0adc0933e212a2f3f289c2cb75ecdea37cc Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/61848 Kokoro-Result: kokoro <noreply+kokoro@google.com> Tested-by: Nicolas Capens <nicolascapens@google.com> Reviewed-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp index 9cadafc..b2faab9 100644 --- a/src/Reactor/LLVMReactor.cpp +++ b/src/Reactor/LLVMReactor.cpp
@@ -3159,6 +3159,12 @@ storeValue(replicate); } +RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z) +{ + auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fmuladd, { T(Float4::type()) }); + return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) }))); +} + RValue<Float4> Abs(RValue<Float4> x) { auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fabs, { V(x.value())->getType() });
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp index 3b3d08b..f31304c 100644 --- a/src/Reactor/Reactor.hpp +++ b/src/Reactor/Reactor.hpp
@@ -2343,6 +2343,9 @@ RValue<Float4> operator+(RValue<Float4> val); RValue<Float4> operator-(RValue<Float4> val); +// Computes `x * y + z`, which may be fused into one operation to produce a higher-precision result. +RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z); + RValue<Float4> Abs(RValue<Float4> x); RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y); RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y);
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp index cfd3d3a..5c2e7e7 100644 --- a/src/Reactor/SubzeroReactor.cpp +++ b/src/Reactor/SubzeroReactor.cpp
@@ -3934,6 +3934,12 @@ storeValue(replicate); } +RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z) +{ + // TODO(b/214591655): Use FMA when available. + return x * y + z; +} + RValue<Float4> Abs(RValue<Float4> x) { // TODO: Optimize.