Implement MulAdd() which may perform FMA

rr::MulAdd(x, y, z) computes `x * y + z`, which may be fused into a single operation to produce a higher-precision result.

AVX2 features FMA instructions which perform a Fused Multiply-Add using just one micro-operation, reducing latency compared to separate multiplication and addition.

This corresponds directly with LLVM's fmuladd intrinsic: https://llvm.org/docs/LangRef.html#llvm-fmuladd-intrinsic
The backend detects whether FMA is available and more efficient.

Subzero currently does not support AVX2 so it always implements MulAdd() as just `x * y + z`.

Bug: b/214591655
Change-Id: Ia5e0d0adc0933e212a2f3f289c2cb75ecdea37cc
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/61848
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>

diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 9cadafc..b2faab9 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp

@@ -3159,6 +3159,12 @@
 	storeValue(replicate);
 }
 
+RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
+{
+	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fmuladd, { T(Float4::type()) });  // Declaration of the llvm.fmuladd intrinsic for the Float4 type; the multiply and add may or may not end up fused.
+	return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));  // Emit the call with operands (x, y, z), i.e. x * y + z.
+}
+
 RValue<Float4> Abs(RValue<Float4> x)
 {
 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fabs, { V(x.value())->getType() });

diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index 3b3d08b..f31304c 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp

@@ -2343,6 +2343,9 @@
 RValue<Float4> operator+(RValue<Float4> val);
 RValue<Float4> operator-(RValue<Float4> val);
 
+// Computes `x * y + z`, which may be fused into one operation to produce a higher-precision result. Whether fusion happens depends on the Reactor backend, so the result's rounding may differ between backends.
+RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z);
+
 RValue<Float4> Abs(RValue<Float4> x);
 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y);
 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y);

diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index cfd3d3a..5c2e7e7 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp

@@ -3934,6 +3934,12 @@
 	storeValue(replicate);
 }
 
+RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
+{
+	// TODO(b/214591655): Use FMA when available. Until then this is a separate multiply followed by an add.
+	return x * y + z;
+}
+
 RValue<Float4> Abs(RValue<Float4> x)
 {
 	// TODO: Optimize.