Eliminate EmulatedIntrinsics

Reactor's LLVM backend only used two 'emulated' intrinsics: Sinh() and
Cosh(), which can both just call the C/C++ math functions.

All of the Subzero transcendental intrinsics now also directly call the
math.h functions, and the gather/scatter and atomics implementations
have been moved into SubzeroReactor.cpp so that EmulatedIntrinsics can
be eliminated entirely.

Bug: b/169755552
Fixes: b/149110874
Change-Id: I9fdbf32c8a08200addc5763fd13d3da4a13696c8
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/62291
Reviewed-by: Alexis Hétu <sugoi@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/CMakeSettings.json b/CMakeSettings.json
index c3d2919..23c622f 100644
--- a/CMakeSettings.json
+++ b/CMakeSettings.json
@@ -10,7 +10,13 @@
       "cmakeCommandArgs": "",
       "buildCommandArgs": "-v",
       "ctestCommandArgs": "",
-      "variables": []
+      "variables": [
+        {
+          "name": "REACTOR_BACKEND",
+          "value": "Subzero",
+          "type": "STRING"
+        }
+      ]
     },
     {
       "name": "x86-Debug",
@@ -21,8 +27,7 @@
       "cmakeCommandArgs": "",
       "buildCommandArgs": "-v",
       "ctestCommandArgs": "",
-      "inheritEnvironments": [ "msvc_x86" ],
-      "variables": []
+      "inheritEnvironments": [ "msvc_x86" ]
     },
     {
       "name": "x86-Release",
@@ -33,8 +38,7 @@
       "cmakeCommandArgs": "",
       "buildCommandArgs": "-v",
       "ctestCommandArgs": "",
-      "inheritEnvironments": [ "msvc_x86" ],
-      "variables": []
+      "inheritEnvironments": [ "msvc_x86" ]
     },
     {
       "name": "x64-Release",
@@ -45,8 +49,7 @@
       "cmakeCommandArgs": "",
       "buildCommandArgs": "-v",
       "ctestCommandArgs": "",
-      "inheritEnvironments": [ "msvc_x64_x64" ],
-      "variables": []
+      "inheritEnvironments": [ "msvc_x64_x64" ]
     }
   ]
 }
\ No newline at end of file
diff --git a/src/Android.bp b/src/Android.bp
index b87eaed..a2f8d8a 100644
--- a/src/Android.bp
+++ b/src/Android.bp
@@ -34,7 +34,6 @@
         "Reactor/Assert.cpp",
         "Reactor/CPUID.cpp",
         "Reactor/Debug.cpp",
-        "Reactor/EmulatedIntrinsics.cpp",
         "Reactor/ExecutableMemory.cpp",
         "Reactor/LLVMJIT.cpp",
         "Reactor/LLVMReactor.cpp",
diff --git a/src/Reactor/BUILD.gn b/src/Reactor/BUILD.gn
index e89fe65..676f99f 100644
--- a/src/Reactor/BUILD.gn
+++ b/src/Reactor/BUILD.gn
@@ -40,7 +40,6 @@
   sources = [
     "Assert.cpp",
     "Debug.cpp",
-    "EmulatedIntrinsics.cpp",
     "ExecutableMemory.cpp",
     "Pragma.cpp",
     "Reactor.cpp",
diff --git a/src/Reactor/CMakeLists.txt b/src/Reactor/CMakeLists.txt
index a69a5af..8eb2034 100644
--- a/src/Reactor/CMakeLists.txt
+++ b/src/Reactor/CMakeLists.txt
@@ -22,8 +22,6 @@
     Assert.hpp
     Debug.cpp
     Debug.hpp
-    EmulatedIntrinsics.cpp
-    EmulatedIntrinsics.hpp
     ExecutableMemory.cpp
     ExecutableMemory.hpp
     Nucleus.hpp
diff --git a/src/Reactor/EmulatedIntrinsics.cpp b/src/Reactor/EmulatedIntrinsics.cpp
deleted file mode 100644
index d259cea..0000000
--- a/src/Reactor/EmulatedIntrinsics.cpp
+++ /dev/null
@@ -1,299 +0,0 @@
-// Copyright 2019 The SwiftShader Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "EmulatedIntrinsics.hpp"
-
-#include <algorithm>
-#include <cmath>
-#include <functional>
-#include <mutex>
-#include <utility>
-
-namespace rr {
-namespace {
-
-template<typename T>
-struct UnderlyingType
-{
-	using Type = typename decltype(rr::Extract(std::declval<RValue<T>>(), 0))::rvalue_underlying_type;
-};
-
-template<typename T>
-using UnderlyingTypeT = typename UnderlyingType<T>::Type;
-
-// Call single arg function on a vector type
-template<typename Func, typename T>
-RValue<T> call4(Func func, const RValue<T> &x)
-{
-	T result;
-	result = Insert(result, Call(func, Extract(x, 0)), 0);
-	result = Insert(result, Call(func, Extract(x, 1)), 1);
-	result = Insert(result, Call(func, Extract(x, 2)), 2);
-	result = Insert(result, Call(func, Extract(x, 3)), 3);
-	return result;
-}
-
-// Call two arg function on a vector type
-template<typename Func, typename T>
-RValue<T> call4(Func func, const RValue<T> &x, const RValue<T> &y)
-{
-	T result;
-	result = Insert(result, Call(func, Extract(x, 0), Extract(y, 0)), 0);
-	result = Insert(result, Call(func, Extract(x, 1), Extract(y, 1)), 1);
-	result = Insert(result, Call(func, Extract(x, 2), Extract(y, 2)), 2);
-	result = Insert(result, Call(func, Extract(x, 3), Extract(y, 3)), 3);
-	return result;
-}
-
-// Call three arg function on a vector type
-template<typename Func, typename T>
-RValue<T> call4(Func func, const RValue<T> &x, const RValue<T> &y, const RValue<T> &z)
-{
-	T result;
-	result = Insert(result, Call(func, Extract(x, 0), Extract(y, 0), Extract(z, 0)), 0);
-	result = Insert(result, Call(func, Extract(x, 1), Extract(y, 1), Extract(z, 1)), 1);
-	result = Insert(result, Call(func, Extract(x, 2), Extract(y, 2), Extract(z, 2)), 2);
-	result = Insert(result, Call(func, Extract(x, 3), Extract(y, 3), Extract(z, 3)), 3);
-	return result;
-}
-
-template<typename T, typename EL = UnderlyingTypeT<T>>
-void gather(T &out, RValue<Pointer<EL>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes)
-{
-	constexpr bool atomic = false;
-	constexpr std::memory_order order = std::memory_order_relaxed;
-
-	Pointer<Byte> baseBytePtr = base;
-
-	out = T(0);
-	for(int i = 0; i < 4; i++)
-	{
-		If(Extract(mask, i) != 0)
-		{
-			auto offset = Extract(offsets, i);
-			auto el = Load(Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
-			out = Insert(out, el, i);
-		}
-		Else If(zeroMaskedLanes)
-		{
-			out = Insert(out, EL(0), i);
-		}
-	}
-}
-
-template<typename T, typename EL = UnderlyingTypeT<T>>
-void scatter(RValue<Pointer<EL>> base, RValue<T> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-{
-	constexpr bool atomic = false;
-	constexpr std::memory_order order = std::memory_order_relaxed;
-
-	Pointer<Byte> baseBytePtr = base;
-
-	for(int i = 0; i < 4; i++)
-	{
-		If(Extract(mask, i) != 0)
-		{
-			auto offset = Extract(offsets, i);
-			Store(Extract(val, i), Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
-		}
-	}
-}
-
-// TODO(b/148276653): Both atomicMin and atomicMax use a static (global) mutex that makes all min
-// operations for a given T mutually exclusive, rather than only the ones on the value pointed to
-// by ptr. Use a CAS loop, as is done for LLVMReactor's min/max atomic for Android.
-// TODO(b/148207274): Or, move this down into Subzero as a CAS-based operation.
-template<typename T>
-static T atomicMin(T *ptr, T value)
-{
-	static std::mutex m;
-
-	std::lock_guard<std::mutex> lock(m);
-	T origValue = *ptr;
-	*ptr = std::min(origValue, value);
-	return origValue;
-}
-template<typename T>
-static T atomicMax(T *ptr, T value)
-{
-	static std::mutex m;
-
-	std::lock_guard<std::mutex> lock(m);
-	T origValue = *ptr;
-	*ptr = std::max(origValue, value);
-	return origValue;
-}
-
-}  // anonymous namespace
-
-namespace emulated {
-
-RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-{
-	Float4 result{};
-	gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
-	return result;
-}
-
-RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-{
-	Int4 result{};
-	gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
-	return result;
-}
-
-void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-{
-	scatter(base, val, offsets, mask, alignment);
-}
-
-void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-{
-	scatter<Int4>(base, val, offsets, mask, alignment);
-}
-
-RValue<Float> Exp2(RValue<Float> x)
-{
-	return Call(exp2f, x);
-}
-
-RValue<Float> Log2(RValue<Float> x)
-{
-	return Call(log2f, x);
-}
-
-RValue<Float4> Sin(RValue<Float4> x)
-{
-	return call4(sinf, x);
-}
-
-RValue<Float4> Cos(RValue<Float4> x)
-{
-	return call4(cosf, x);
-}
-
-RValue<Float4> Tan(RValue<Float4> x)
-{
-	return call4(tanf, x);
-}
-
-RValue<Float4> Asin(RValue<Float4> x)
-{
-	return call4(asinf, x);
-}
-
-RValue<Float4> Acos(RValue<Float4> x)
-{
-	return call4(acosf, x);
-}
-
-RValue<Float4> Atan(RValue<Float4> x)
-{
-	return call4(atanf, x);
-}
-
-RValue<Float4> Sinh(RValue<Float4> x)
-{
-	// TODO(b/149110874) Use coshf/sinhf when we've implemented SpirV versions at the SpirV level
-	return Float4(0.5f) * (emulated::Exp(x) - emulated::Exp(-x));
-}
-
-RValue<Float4> Cosh(RValue<Float4> x)
-{
-	// TODO(b/149110874) Use coshf/sinhf when we've implemented SpirV versions at the SpirV level
-	return Float4(0.5f) * (emulated::Exp(x) + emulated::Exp(-x));
-}
-
-RValue<Float4> Tanh(RValue<Float4> x)
-{
-	return call4(tanhf, x);
-}
-
-RValue<Float4> Asinh(RValue<Float4> x)
-{
-	return call4(asinhf, x);
-}
-
-RValue<Float4> Acosh(RValue<Float4> x)
-{
-	return call4(acoshf, x);
-}
-
-RValue<Float4> Atanh(RValue<Float4> x)
-{
-	return call4(atanhf, x);
-}
-
-RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
-{
-	return call4(atan2f, x, y);
-}
-
-RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
-{
-	return call4(powf, x, y);
-}
-
-RValue<Float4> Exp(RValue<Float4> x)
-{
-	return call4(expf, x);
-}
-
-RValue<Float4> Log(RValue<Float4> x)
-{
-	return call4(logf, x);
-}
-
-RValue<Float4> Exp2(RValue<Float4> x)
-{
-	return call4(exp2f, x);
-}
-
-RValue<Float4> Log2(RValue<Float4> x)
-{
-	return call4(log2f, x);
-}
-
-RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
-{
-	return Call(atomicMin<int32_t>, x, y);
-}
-
-RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
-{
-	return Call(atomicMin<uint32_t>, x, y);
-}
-
-RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
-{
-	return Call(atomicMax<int32_t>, x, y);
-}
-
-RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
-{
-	return Call(atomicMax<uint32_t>, x, y);
-}
-
-RValue<Float4> FRem(RValue<Float4> lhs, RValue<Float4> rhs)
-{
-	return call4(fmodf, lhs, rhs);
-}
-
-RValue<Float4> FMA(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
-{
-	return call4(fmaf, x, y, z);
-}
-
-}  // namespace emulated
-}  // namespace rr
diff --git a/src/Reactor/EmulatedIntrinsics.hpp b/src/Reactor/EmulatedIntrinsics.hpp
deleted file mode 100644
index 4ca6224..0000000
--- a/src/Reactor/EmulatedIntrinsics.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright 2019 The SwiftShader Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "Reactor.hpp"
-
-// Implementation of intrinsics that are "emulated" - that is,
-// implemented either in terms of Reactor code, or make use of
-// rr::Call to C functions. These are typically slower than implementing
-// in terms of direct calls to the JIT backend; however, provide a good
-// starting point for implementing a new backend, or for when adding
-// functionality to an existing backend is non-trivial.
-
-namespace rr {
-namespace emulated {
-
-RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
-void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
-RValue<Float> Exp2(RValue<Float> x);
-RValue<Float> Log2(RValue<Float> x);
-RValue<Float4> Sin(RValue<Float4> x);
-RValue<Float4> Cos(RValue<Float4> x);
-RValue<Float4> Tan(RValue<Float4> x);
-RValue<Float4> Asin(RValue<Float4> x);
-RValue<Float4> Acos(RValue<Float4> x);
-RValue<Float4> Atan(RValue<Float4> x);
-RValue<Float4> Sinh(RValue<Float4> x);
-RValue<Float4> Cosh(RValue<Float4> x);
-RValue<Float4> Tanh(RValue<Float4> x);
-RValue<Float4> Asinh(RValue<Float4> x);
-RValue<Float4> Acosh(RValue<Float4> x);
-RValue<Float4> Atanh(RValue<Float4> x);
-RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y);
-RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
-RValue<Float4> Exp(RValue<Float4> x);
-RValue<Float4> Log(RValue<Float4> x);
-RValue<Float4> Exp2(RValue<Float4> x);
-RValue<Float4> Log2(RValue<Float4> x);
-RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder);
-RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder);
-RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-RValue<Float4> FRem(RValue<Float4> lhs, RValue<Float4> rhs);
-RValue<Float4> FMA(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z);
-
-}  // namespace emulated
-}  // namespace rr
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index a2781b7..14db765 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -16,7 +16,6 @@
 
 #include "CPUID.hpp"
 #include "Debug.hpp"
-#include "EmulatedIntrinsics.hpp"
 #include "LLVMReactorDebugInfo.hpp"
 #include "Print.hpp"
 #include "Reactor.hpp"
@@ -3476,13 +3475,13 @@
 RValue<Float4> Sinh(RValue<Float4> v)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Sinh(v);
+	return TransformFloat4PerElement(v, "sinhf");
 }
 
 RValue<Float4> Cosh(RValue<Float4> v)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Cosh(v);
+	return TransformFloat4PerElement(v, "coshf");
 }
 
 RValue<Float4> Tanh(RValue<Float4> v)
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 09d3ff3..dc38d8e 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "Debug.hpp"
-#include "EmulatedIntrinsics.hpp"
 #include "Print.hpp"
 #include "Reactor.hpp"
 #include "ReactorDebugInfo.hpp"
@@ -51,6 +50,7 @@
 #endif
 
 #include <array>
+#include <cmath>
 #include <iostream>
 #include <limits>
 #include <mutex>
@@ -1307,11 +1307,6 @@
 	return nullptr;
 }
 
-RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
-{
-	return emulated::FRem(lhs, rhs);
-}
-
 Value *Nucleus::createShl(Value *lhs, Value *rhs)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
@@ -3940,6 +3935,47 @@
 	storeValue(replicate);
 }
 
+// Call single arg function on a vector type
+template<typename Func, typename T>
+static RValue<T> call4(Func func, const RValue<T> &x)
+{
+	T result;
+	result = Insert(result, Call(func, Extract(x, 0)), 0);
+	result = Insert(result, Call(func, Extract(x, 1)), 1);
+	result = Insert(result, Call(func, Extract(x, 2)), 2);
+	result = Insert(result, Call(func, Extract(x, 3)), 3);
+	return result;
+}
+
+// Call two arg function on a vector type
+template<typename Func, typename T>
+static RValue<T> call4(Func func, const RValue<T> &x, const RValue<T> &y)
+{
+	T result;
+	result = Insert(result, Call(func, Extract(x, 0), Extract(y, 0)), 0);
+	result = Insert(result, Call(func, Extract(x, 1), Extract(y, 1)), 1);
+	result = Insert(result, Call(func, Extract(x, 2), Extract(y, 2)), 2);
+	result = Insert(result, Call(func, Extract(x, 3), Extract(y, 3)), 3);
+	return result;
+}
+
+// Call three arg function on a vector type
+template<typename Func, typename T>
+static RValue<T> call4(Func func, const RValue<T> &x, const RValue<T> &y, const RValue<T> &z)
+{
+	T result;
+	result = Insert(result, Call(func, Extract(x, 0), Extract(y, 0), Extract(z, 0)), 0);
+	result = Insert(result, Call(func, Extract(x, 1), Extract(y, 1), Extract(z, 1)), 1);
+	result = Insert(result, Call(func, Extract(x, 2), Extract(y, 2), Extract(z, 2)), 2);
+	result = Insert(result, Call(func, Extract(x, 3), Extract(y, 3), Extract(z, 3)), 3);
+	return result;
+}
+
+RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
+{
+	return call4(fmodf, lhs, rhs);
+}
+
 RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
 {
 	// TODO(b/214591655): Use FMA when available.
@@ -3949,7 +3985,7 @@
 RValue<Float4> FMA(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
 {
 	// TODO(b/214591655): Use FMA instructions when available.
-	return emulated::FMA(x, y, z);
+	return call4(fmaf, x, y, z);
 }
 
 RValue<Float4> Abs(RValue<Float4> x)
@@ -4325,148 +4361,203 @@
 	UNIMPLEMENTED("b/155867273 Subzero createMaskedStore()");
 }
 
+template<typename T>
+struct UnderlyingType
+{
+	using Type = typename decltype(rr::Extract(std::declval<RValue<T>>(), 0))::rvalue_underlying_type;
+};
+
+template<typename T>
+using UnderlyingTypeT = typename UnderlyingType<T>::Type;
+
+template<typename T, typename EL = UnderlyingTypeT<T>>
+static void gather(T &out, RValue<Pointer<EL>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes)
+{
+	constexpr bool atomic = false;
+	constexpr std::memory_order order = std::memory_order_relaxed;
+
+	Pointer<Byte> baseBytePtr = base;
+
+	out = T(0);
+	for(int i = 0; i < 4; i++)
+	{
+		If(Extract(mask, i) != 0)
+		{
+			auto offset = Extract(offsets, i);
+			auto el = Load(Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
+			out = Insert(out, el, i);
+		}
+		Else If(zeroMaskedLanes)
+		{
+			out = Insert(out, EL(0), i);
+		}
+	}
+}
+
+template<typename T, typename EL = UnderlyingTypeT<T>>
+static void scatter(RValue<Pointer<EL>> base, RValue<T> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+{
+	constexpr bool atomic = false;
+	constexpr std::memory_order order = std::memory_order_relaxed;
+
+	Pointer<Byte> baseBytePtr = base;
+
+	for(int i = 0; i < 4; i++)
+	{
+		If(Extract(mask, i) != 0)
+		{
+			auto offset = Extract(offsets, i);
+			Store(Extract(val, i), Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
+		}
+	}
+}
+
 RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
+	Float4 result{};
+	gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
+	return result;
 }
 
 RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
+	Int4 result{};
+	gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
+	return result;
 }
 
 void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Scatter(base, val, offsets, mask, alignment);
+	scatter(base, val, offsets, mask, alignment);
 }
 
 void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Scatter(base, val, offsets, mask, alignment);
+	scatter<Int4>(base, val, offsets, mask, alignment);
 }
 
 RValue<Float> Exp2(RValue<Float> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Exp2(x);
+	return Call(exp2f, x);
 }
 
 RValue<Float> Log2(RValue<Float> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Log2(x);
+	return Call(log2f, x);
 }
 
 RValue<Float4> Sin(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Sin(x);
+	return call4(sinf, x);
 }
 
 RValue<Float4> Cos(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Cos(x);
+	return call4(cosf, x);
 }
 
 RValue<Float4> Tan(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Tan(x);
+	return call4(tanf, x);
 }
 
 RValue<Float4> Asin(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Asin(x);
+	return call4(asinf, x);
 }
 
 RValue<Float4> Acos(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Acos(x);
+	return call4(acosf, x);
 }
 
 RValue<Float4> Atan(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Atan(x);
+	return call4(atanf, x);
 }
 
 RValue<Float4> Sinh(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Sinh(x);
+	return call4(sinhf, x);
 }
 
 RValue<Float4> Cosh(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Cosh(x);
+	return call4(coshf, x);
 }
 
 RValue<Float4> Tanh(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Tanh(x);
+	return call4(tanhf, x);
 }
 
 RValue<Float4> Asinh(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Asinh(x);
+	return call4(asinhf, x);
 }
 
 RValue<Float4> Acosh(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Acosh(x);
+	return call4(acoshf, x);
 }
 
 RValue<Float4> Atanh(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Atanh(x);
+	return call4(atanhf, x);
 }
 
 RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Atan2(x, y);
+	return call4(atan2f, x, y);
 }
 
 RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Pow(x, y);
+	return call4(powf, x, y);
 }
 
 RValue<Float4> Exp(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Exp(x);
+	return call4(expf, x);
 }
 
 RValue<Float4> Log(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Log(x);
+	return call4(logf, x);
 }
 
 RValue<Float4> Exp2(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Exp2(x);
+	return call4(exp2f, x);
 }
 
 RValue<Float4> Log2(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Log2(x);
+	return call4(log2f, x);
 }
 
 RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef)
@@ -4549,28 +4640,54 @@
 	}
 }
 
+// TODO(b/148276653): Both atomicMin and atomicMax use a static (global) mutex that makes all min
+// operations for a given T mutually exclusive, rather than only the ones on the value pointed to
+// by ptr. Use a CAS loop, as is done for LLVMReactor's min/max atomic for Android.
+// TODO(b/148207274): Or, move this down into Subzero as a CAS-based operation.
+template<typename T>
+static T atomicMin(T *ptr, T value)
+{
+	static std::mutex m;
+
+	std::lock_guard<std::mutex> lock(m);
+	T origValue = *ptr;
+	*ptr = std::min(origValue, value);
+	return origValue;
+}
+
+template<typename T>
+static T atomicMax(T *ptr, T value)
+{
+	static std::mutex m;
+
+	std::lock_guard<std::mutex> lock(m);
+	T origValue = *ptr;
+	*ptr = std::max(origValue, value);
+	return origValue;
+}
+
 RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::MinAtomic(x, y, memoryOrder);
+	return Call(atomicMin<int32_t>, x, y);
 }
 
 RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::MinAtomic(x, y, memoryOrder);
+	return Call(atomicMin<uint32_t>, x, y);
 }
 
 RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::MaxAtomic(x, y, memoryOrder);
+	return Call(atomicMax<int32_t>, x, y);
 }
 
 RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::MaxAtomic(x, y, memoryOrder);
+	return Call(atomicMax<uint32_t>, x, y);
 }
 
 void EmitDebugLocation()