SubzeroReactor: implement most missing intrinsics

* Implement intrinsics for the Subzero backend required for
ANGLE / SwiftShader for GLES 2 and 3.
* Note that most intrinsics are implemented as "emulated". I've added an
rr::emulated namespace in EmulatedReactor.hpp/cpp that contains the set
of Reactor functions that are presently being emulated. These are
invoked from SubzeroReactor until we decide to implement proper
intrinsics for them in Subzero.

Bug: b/130459196
Change-Id: I01171cfa7cc45b078c3b98be6b61328eee4f35e5
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/38874
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Antonio Maiorano <amaiorano@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ae486e9..28ff2af 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1540,6 +1540,7 @@
     set(SUBZERO_REACTOR_LIST
         ${SOURCE_DIR}/Reactor/Debug.cpp
         ${SOURCE_DIR}/Reactor/Debug.hpp
+        ${SOURCE_DIR}/Reactor/EmulatedReactor.cpp
         ${SOURCE_DIR}/Reactor/ExecutableMemory.cpp
         ${SOURCE_DIR}/Reactor/ExecutableMemory.hpp
         ${SOURCE_DIR}/Reactor/Nucleus.hpp
@@ -1675,6 +1676,7 @@
     ${SOURCE_DIR}/Reactor/CPUID.hpp
     ${SOURCE_DIR}/Reactor/Debug.cpp
     ${SOURCE_DIR}/Reactor/Debug.hpp
+    ${SOURCE_DIR}/Reactor/EmulatedReactor.cpp
     ${SOURCE_DIR}/Reactor/ExecutableMemory.cpp
     ${SOURCE_DIR}/Reactor/ExecutableMemory.hpp
     ${SOURCE_DIR}/Reactor/LLVMReactor.cpp
diff --git a/src/Reactor/BUILD.gn b/src/Reactor/BUILD.gn
index b472d81..1d958e7 100644
--- a/src/Reactor/BUILD.gn
+++ b/src/Reactor/BUILD.gn
@@ -47,6 +47,7 @@
 swiftshader_source_set("swiftshader_reactor_base") {
   sources = [
     "Debug.cpp",
+    "EmulatedReactor.cpp",
     "ExecutableMemory.cpp",
     "Reactor.cpp",
   ]
diff --git a/src/Reactor/EmulatedReactor.cpp b/src/Reactor/EmulatedReactor.cpp
new file mode 100644
index 0000000..8a06d6f
--- /dev/null
+++ b/src/Reactor/EmulatedReactor.cpp
@@ -0,0 +1,213 @@
+#include "EmulatedReactor.hpp"
+
+#include <cmath>
+#include <functional>
+#include <utility>
+
+namespace rr
+{
+	namespace
+	{
+		template <typename T>
+		struct UnderlyingType
+		{
+			using Type = typename decltype(rr::Extract(std::declval<RValue<T>>(), 0))::rvalue_underlying_type;
+		};
+
+		template <typename T>
+		using UnderlyingTypeT = typename UnderlyingType<T>::Type;
+
+		// Applies a one-argument scalar function to lanes 0..3 of x and packs the four results
+		template <typename Func, typename T>
+		RValue<T> call4(Func func, const RValue<T>& x)
+		{
+			T lanes;
+			for (int lane = 0; lane < 4; lane++)
+			{
+				lanes = Insert(lanes, Call(func, Extract(x, lane)), lane);
+			}
+			return lanes;
+		}
+
+		// Applies a two-argument scalar function to matching lanes 0..3 of x and y and packs the four results
+		template <typename Func, typename T>
+		RValue<T> call4(Func func, const RValue<T>& x, const RValue<T>& y)
+		{
+			T lanes;
+			for (int lane = 0; lane < 4; lane++)
+			{
+				lanes = Insert(lanes, Call(func, Extract(x, lane), Extract(y, lane)), lane);
+			}
+			return lanes;
+		}
+
+		template <typename T, typename EL = UnderlyingTypeT<T>>  // EL: the type Extract() yields for one lane of T
+		void gather(T& out, RValue<Pointer<EL>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes)
+		{
+			constexpr bool atomic = false;
+			constexpr std::memory_order order = std::memory_order_relaxed;
+
+			Pointer<Byte> baseBytePtr = base;  // offsets below index this byte pointer, so they are in bytes
+
+			out = T(0);  // every lane starts at 0, independent of zeroMaskedLanes
+			for (int i = 0; i < 4; i++)
+			{
+				If(Extract(mask, i) != 0)  // lane enabled: load one element from base + offsets[i]
+				{
+					auto offset = Extract(offsets, i);
+					auto el = Load(Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
+					out = Insert(out, el, i);
+				}
+				Else If(zeroMaskedLanes)  // lane disabled: stores 0 again (the lane is already 0 from the initialisation above)
+				{
+					out = Insert(out, EL(0), i);
+				}
+			}
+		}
+
+		template <typename T, typename EL = UnderlyingTypeT<T>>  // EL: the type Extract() yields for one lane of T
+		void scatter(RValue<Pointer<EL>> base, RValue<T> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+		{
+			constexpr bool atomic = false;
+			constexpr std::memory_order order = std::memory_order_relaxed;
+
+			Pointer<Byte> baseBytePtr = base;  // offsets below index this byte pointer, so they are in bytes
+
+			for (int i = 0; i < 4; i++)
+			{
+				If(Extract(mask, i) != 0)  // lane enabled: store val[i] at base + offsets[i]; disabled lanes store nothing
+				{
+					auto offset = Extract(offsets, i);
+					Store(Extract(val, i), Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
+				}
+			}
+		}
+	}
+
+	namespace emulated
+	{
+		RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+		{
+			Float4 result{};
+			gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
+			return result;
+		}
+
+		RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+		{
+			Int4 result{};
+			gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
+			return result;
+		}
+
+		void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+		{
+			scatter(base, val, offsets, mask, alignment);
+		}
+
+		void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+		{
+			scatter<Int4>(base, val, offsets, mask, alignment);
+		}
+
+		RValue<Float> Exp2(RValue<Float> x)
+		{
+			return Call(exp2f, x);
+		}
+
+		RValue<Float> Log2(RValue<Float> x)
+		{
+			return Call(log2f, x);
+		}
+
+		RValue<Float4> Sin(RValue<Float4> x)
+		{
+			return call4(sinf, x);
+		}
+
+		RValue<Float4> Cos(RValue<Float4> x)
+		{
+			return call4(cosf, x);
+		}
+
+		RValue<Float4> Tan(RValue<Float4> x)
+		{
+			return call4(tanf, x);
+		}
+
+		RValue<Float4> Asin(RValue<Float4> x)
+		{
+			return call4(asinf, x);
+		}
+
+		RValue<Float4> Acos(RValue<Float4> x)
+		{
+			return call4(acosf, x);
+		}
+
+		RValue<Float4> Atan(RValue<Float4> x)
+		{
+			return call4(atanf, x);
+		}
+
+		RValue<Float4> Sinh(RValue<Float4> x)
+		{
+			return call4(sinhf, x);
+		}
+
+		RValue<Float4> Cosh(RValue<Float4> x)
+		{
+			return call4(coshf, x);
+		}
+
+		RValue<Float4> Tanh(RValue<Float4> x)
+		{
+			return call4(tanhf, x);
+		}
+
+		RValue<Float4> Asinh(RValue<Float4> x)
+		{
+			return call4(asinhf, x);
+		}
+
+		RValue<Float4> Acosh(RValue<Float4> x)
+		{
+			return call4(acoshf, x);
+		}
+
+		RValue<Float4> Atanh(RValue<Float4> x)
+		{
+			return call4(atanhf, x);
+		}
+
+		RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
+		{
+			return call4(atan2f, x, y);
+		}
+
+		RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
+		{
+			return call4(powf, x, y);
+		}
+
+		RValue<Float4> Exp(RValue<Float4> x)
+		{
+			return call4(expf, x);
+		}
+
+		RValue<Float4> Log(RValue<Float4> x)
+		{
+			return call4(logf, x);
+		}
+
+		RValue<Float4> Exp2(RValue<Float4> x)
+		{
+			return call4(exp2f, x);
+		}
+
+		RValue<Float4> Log2(RValue<Float4> x)
+		{
+			return call4(log2f, x);
+		}
+	}
+}
diff --git a/src/Reactor/EmulatedReactor.hpp b/src/Reactor/EmulatedReactor.hpp
new file mode 100644
index 0000000..dbdc198
--- /dev/null
+++ b/src/Reactor/EmulatedReactor.hpp
@@ -0,0 +1,58 @@
+// Copyright 2019 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef rr_EmulatedReactor_hpp
+#define rr_EmulatedReactor_hpp
+
+#include "Reactor.hpp"
+
+// Implementation of Reactor functions that are "emulated" - that is,
+// implemented either in terms of Reactor code, or by using rr::Call to
+// invoke C functions. These are typically slower than implementations
+// that call directly into the JIT backend; however, they provide a good
+// starting point for implementing a new backend, or for when adding
+// functionality to an existing backend is non-trivial.
+
+namespace rr
+{
+	namespace emulated
+	{
+		RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+		RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+		void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+		void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+		RValue<Float> Exp2(RValue<Float> x);
+		RValue<Float> Log2(RValue<Float> x);
+		RValue<Float4> Sin(RValue<Float4> x);
+		RValue<Float4> Cos(RValue<Float4> x);
+		RValue<Float4> Tan(RValue<Float4> x);
+		RValue<Float4> Asin(RValue<Float4> x);
+		RValue<Float4> Acos(RValue<Float4> x);
+		RValue<Float4> Atan(RValue<Float4> x);
+		RValue<Float4> Sinh(RValue<Float4> x);
+		RValue<Float4> Cosh(RValue<Float4> x);
+		RValue<Float4> Tanh(RValue<Float4> x);
+		RValue<Float4> Asinh(RValue<Float4> x);
+		RValue<Float4> Acosh(RValue<Float4> x);
+		RValue<Float4> Atanh(RValue<Float4> x);
+		RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y);
+		RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
+		RValue<Float4> Exp(RValue<Float4> x);
+		RValue<Float4> Log(RValue<Float4> x);
+		RValue<Float4> Exp2(RValue<Float4> x);
+		RValue<Float4> Log2(RValue<Float4> x);
+	}
+}
+
+#endif  // rr_EmulatedReactor_hpp
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 003716e..0360138 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -873,6 +873,55 @@
 		llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
 		return jit->builder->CreateTrunc(mulh, ty);
 	}
+
+	// Emits an llvm.masked.gather: one elTy element per lane, read from base plus that lane's byte offset.
+	llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes)
+	{
+		ASSERT(base->getType()->isPointerTy());
+		ASSERT(offsets->getType()->isVectorTy());
+		ASSERT(mask->getType()->isVectorTy());
+
+		auto laneCount = mask->getType()->getVectorNumElements();
+		auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
+		auto bytePtrTy = ::llvm::Type::getInt8Ty(jit->context)->getPointerTo();
+		auto boolVecTy = ::llvm::VectorType::get(::llvm::Type::getInt1Ty(jit->context), laneCount);
+		auto elPtrTy = elTy->getPointerTo();
+		auto elVecTy = ::llvm::VectorType::get(elTy, laneCount);
+		auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, laneCount);
+		auto byteBase = jit->builder->CreatePointerCast(base, bytePtrTy);
+		auto bytePtrs = jit->builder->CreateGEP(byteBase, offsets);
+		auto lanePtrs = jit->builder->CreatePointerCast(bytePtrs, elPtrVecTy);
+		auto laneMask = jit->builder->CreateIntCast(mask, boolVecTy, false); // vec<int, int, ...> -> vec<bool, bool, ...>
+		auto fallback = zeroMaskedLanes ? ::llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
+		auto alignValue = ::llvm::ConstantInt::get(i32Ty, alignment);
+		auto gatherFn = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy } );
+		return jit->builder->CreateCall(gatherFn, { lanePtrs, alignValue, laneMask, fallback });
+	}
+
+	// Emits an llvm.masked.scatter: each enabled lane of val is written to base plus that lane's byte offset.
+	void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
+	{
+		ASSERT(base->getType()->isPointerTy());
+		ASSERT(val->getType()->isVectorTy());
+		ASSERT(offsets->getType()->isVectorTy());
+		ASSERT(mask->getType()->isVectorTy());
+
+		auto laneCount = mask->getType()->getVectorNumElements();
+		auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
+		auto bytePtrTy = ::llvm::Type::getInt8Ty(jit->context)->getPointerTo();
+		auto boolVecTy = ::llvm::VectorType::get(::llvm::Type::getInt1Ty(jit->context), laneCount);
+		auto valVecTy = val->getType();
+		auto elTy = valVecTy->getVectorElementType();
+		auto elPtrTy = elTy->getPointerTo();
+		auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, laneCount);
+		auto byteBase = jit->builder->CreatePointerCast(base, bytePtrTy);
+		auto bytePtrs = jit->builder->CreateGEP(byteBase, offsets);
+		auto lanePtrs = jit->builder->CreatePointerCast(bytePtrs, elPtrVecTy);
+		auto laneMask = jit->builder->CreateIntCast(mask, boolVecTy, false); // vec<int, int, ...> -> vec<bool, bool, ...>
+		auto alignValue = ::llvm::ConstantInt::get(i32Ty, alignment);
+		auto scatterFn = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { valVecTy, elPtrVecTy } );
+		jit->builder->CreateCall(scatterFn, { val, lanePtrs, alignValue, laneMask });
+	}
 }
 
 namespace rr
@@ -1751,53 +1800,24 @@
 		jit->builder->CreateCall(func, { V(val), V(ptr), align, i8Mask });
 	}
 
-	Value *Nucleus::createGather(Value *base, Type *elTy, Value *offsets, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
+	RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
 	{
-		ASSERT(V(base)->getType()->isPointerTy());
-		ASSERT(V(offsets)->getType()->isVectorTy());
-		ASSERT(V(mask)->getType()->isVectorTy());
-
-		auto numEls = V(mask)->getType()->getVectorNumElements();
-		auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
-		auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
-		auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
-		auto i8PtrTy = i8Ty->getPointerTo();
-		auto elPtrTy = T(elTy)->getPointerTo();
-		auto elVecTy = ::llvm::VectorType::get(T(elTy), numEls);
-		auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
-		auto i8Base = jit->builder->CreatePointerCast(V(base), i8PtrTy);
-		auto i8Ptrs = jit->builder->CreateGEP(i8Base, V(offsets));
-		auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
-		auto i8Mask = jit->builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
-		auto passthrough = zeroMaskedLanes ? ::llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
-		auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
-		auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy } );
-		return V(jit->builder->CreateCall(func, { elPtrs, align, i8Mask, passthrough }));
+		return As<Float4>(V(createGather(V(base.value), T(Float::getType()), V(offsets.value), V(mask.value), alignment, zeroMaskedLanes)));
 	}
 
-	void Nucleus::createScatter(Value *base, Value *val, Value *offsets, Value *mask, unsigned int alignment)
+	RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
 	{
-		ASSERT(V(base)->getType()->isPointerTy());
-		ASSERT(V(val)->getType()->isVectorTy());
-		ASSERT(V(offsets)->getType()->isVectorTy());
-		ASSERT(V(mask)->getType()->isVectorTy());
+		return As<Int4>(V(createGather(V(base.value), T(Int::getType()), V(offsets.value), V(mask.value), alignment, zeroMaskedLanes))); // element type is Int, matching the Pointer<Int> base
+	}
 
-		auto numEls = V(mask)->getType()->getVectorNumElements();
-		auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
-		auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
-		auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
-		auto i8PtrTy = i8Ty->getPointerTo();
-		auto elVecTy = V(val)->getType();
-		auto elTy = elVecTy->getVectorElementType();
-		auto elPtrTy = elTy->getPointerTo();
-		auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
-		auto i8Base = jit->builder->CreatePointerCast(V(base), i8PtrTy);
-		auto i8Ptrs = jit->builder->CreateGEP(i8Base, V(offsets));
-		auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
-		auto i8Mask = jit->builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
-		auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
-		auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy } );
-		jit->builder->CreateCall(func, { V(val), elPtrs, align, i8Mask });
+	void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+	{
+		return createScatter(V(base.value), V(val.value), V(offsets.value), V(mask.value), alignment);
+	}
+
+	void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+	{
+		return createScatter(V(base.value), V(val.value), V(offsets.value), V(mask.value), alignment);
 	}
 
 	void Nucleus::createFence(std::memory_order memoryOrder)
diff --git a/src/Reactor/Nucleus.hpp b/src/Reactor/Nucleus.hpp
index 9f66115..50e27ad 100644
--- a/src/Reactor/Nucleus.hpp
+++ b/src/Reactor/Nucleus.hpp
@@ -199,10 +199,6 @@
 		static Value *createMaskedLoad(Value *base, Type *elementType, Value *mask, unsigned int alignment, bool zeroMaskedLanes);
 		static void createMaskedStore(Value *base, Value *value, Value *mask, unsigned int alignment);
 
-		// Scatter / Gather instructions
-		static Value *createGather(Value *base, Type *elementType, Value *offsets, Value *mask, unsigned int alignment, bool zeroMaskedLanes);
-		static void createScatter(Value *base, Value *value, Value *offsets, Value *mask, unsigned int alignment);
-
 		// Barrier instructions
 		static void createFence(std::memory_order memoryOrder);
 
diff --git a/src/Reactor/Reactor.cpp b/src/Reactor/Reactor.cpp
index faa8738..31ab59e 100644
--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp
@@ -4327,26 +4327,6 @@
 		Nucleus::createMaskedStore(base.value, val.value, mask.value, alignment);
 	}
 
-	RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-	{
-		return RValue<Float4>(Nucleus::createGather(base.value, Float::getType(), offsets.value, mask.value, alignment, zeroMaskedLanes));
-	}
-
-	RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-	{
-		return RValue<Int4>(Nucleus::createGather(base.value, Int::getType(), offsets.value, mask.value, alignment, zeroMaskedLanes));
-	}
-
-	void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-	{
-		Nucleus::createScatter(base.value, val.value, offsets.value, mask.value, alignment);
-	}
-
-	void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-	{
-		Nucleus::createScatter(base.value, val.value, offsets.value, mask.value, alignment);
-	}
-
 	void Fence(std::memory_order memoryOrder)
 	{
 		ASSERT_MSG(memoryOrder == std::memory_order_acquire ||
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index e4b6be0..81757a9 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -2316,14 +2316,14 @@
 	// TODO: Currently unimplemented for Subzero.
 
 	// Count leading zeros.
-	// Returns 32 when: isZeroUndef && x == 0.
-	// Returns an undefined value when: !isZeroUndef && x == 0.
+	// Returns 32 when: !isZeroUndef && x == 0.
+	// Returns an undefined value when: isZeroUndef && x == 0.
 	RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef);
 	RValue<UInt4> Ctlz(RValue<UInt4> x, bool isZeroUndef);
 
 	// Count trailing zeros.
-	// Returns 32 when: isZeroUndef && x == 0.
-	// Returns an undefined value when: !isZeroUndef && x == 0.
+	// Returns 32 when: !isZeroUndef && x == 0.
+	// Returns an undefined value when: isZeroUndef && x == 0.
 	RValue<UInt> Cttz(RValue<UInt> x, bool isZeroUndef);
 	RValue<UInt4> Cttz(RValue<UInt4> x, bool isZeroUndef);
 
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 60daccb..47b4b16 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -14,6 +14,7 @@
 
 #include "Reactor.hpp"
 #include "Debug.hpp"
+#include "EmulatedReactor.hpp"
 
 #include "Optimizer.hpp"
 #include "ExecutableMemory.hpp"
@@ -3560,7 +3561,6 @@
 
 	Value* Call(RValue<Pointer<Byte>> fptr, Type* retTy, std::initializer_list<Value*> args, std::initializer_list<Type*> argTys)
 	{
-		// FIXME: This does not currently work on Windows.
 		Ice::Variable *ret = nullptr;
 		if (retTy != nullptr)
 		{
@@ -3583,37 +3583,203 @@
 		::basicBlock->appendInst(trap);
 	}
 
-	// Below are functions currently unimplemented for the Subzero backend.
-	// They are stubbed to satisfy the linker.
 	void Nucleus::createFence(std::memory_order memoryOrder) { UNIMPLEMENTED("Subzero createFence()"); }
 	Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes) { UNIMPLEMENTED("Subzero createMaskedLoad()"); return nullptr; }
 	void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment) { UNIMPLEMENTED("Subzero createMaskedStore()"); }
-	Value *Nucleus::createGather(Value *base, Type *elTy, Value *offsets, Value *mask, unsigned int alignment, bool zeroMaskedLanes) { UNIMPLEMENTED("Subzero createGather()"); return nullptr; }
-	void Nucleus::createScatter(Value *base, Value *val, Value *offsets, Value *mask, unsigned int alignment) { UNIMPLEMENTED("Subzero createScatter()"); }
-	RValue<Float> Exp2(RValue<Float> x) { UNIMPLEMENTED("Subzero Exp2()"); return Float(0); }
-	RValue<Float> Log2(RValue<Float> x) { UNIMPLEMENTED("Subzero Log2()"); return Float(0); }
-	RValue<Float4> Sin(RValue<Float4> x) { UNIMPLEMENTED("Subzero Sin()"); return Float4(0); }
-	RValue<Float4> Cos(RValue<Float4> x) { UNIMPLEMENTED("Subzero Cos()"); return Float4(0); }
-	RValue<Float4> Tan(RValue<Float4> x) { UNIMPLEMENTED("Subzero Tan()"); return Float4(0); }
-	RValue<Float4> Asin(RValue<Float4> x) { UNIMPLEMENTED("Subzero Asin()"); return Float4(0); }
-	RValue<Float4> Acos(RValue<Float4> x) { UNIMPLEMENTED("Subzero Acos()"); return Float4(0); }
-	RValue<Float4> Atan(RValue<Float4> x) { UNIMPLEMENTED("Subzero Atan()"); return Float4(0); }
-	RValue<Float4> Sinh(RValue<Float4> x) { UNIMPLEMENTED("Subzero Sinh()"); return Float4(0); }
-	RValue<Float4> Cosh(RValue<Float4> x) { UNIMPLEMENTED("Subzero Cosh()"); return Float4(0); }
-	RValue<Float4> Tanh(RValue<Float4> x) { UNIMPLEMENTED("Subzero Tanh()"); return Float4(0); }
-	RValue<Float4> Asinh(RValue<Float4> x) { UNIMPLEMENTED("Subzero Asinh()"); return Float4(0); }
-	RValue<Float4> Acosh(RValue<Float4> x) { UNIMPLEMENTED("Subzero Acosh()"); return Float4(0); }
-	RValue<Float4> Atanh(RValue<Float4> x) { UNIMPLEMENTED("Subzero Atanh()"); return Float4(0); }
-	RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y) { UNIMPLEMENTED("Subzero Atan2()"); return Float4(0); }
-	RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y) { UNIMPLEMENTED("Subzero Pow()"); return Float4(0); }
-	RValue<Float4> Exp(RValue<Float4> x) { UNIMPLEMENTED("Subzero Exp()"); return Float4(0); }
-	RValue<Float4> Log(RValue<Float4> x) { UNIMPLEMENTED("Subzero Log()"); return Float4(0); }
-	RValue<Float4> Exp2(RValue<Float4> x) { UNIMPLEMENTED("Subzero Exp2()"); return Float4(0); }
-	RValue<Float4> Log2(RValue<Float4> x) { UNIMPLEMENTED("Subzero Log2()"); return Float4(0); }
-	RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef) { UNIMPLEMENTED("Subzero Ctlz()"); return UInt(0); }
-	RValue<UInt4> Ctlz(RValue<UInt4> x, bool isZeroUndef) { UNIMPLEMENTED("Subzero Ctlz()"); return UInt4(0); }
-	RValue<UInt> Cttz(RValue<UInt> x, bool isZeroUndef) { UNIMPLEMENTED("Subzero Cttz()"); return UInt(0); }
-	RValue<UInt4> Cttz(RValue<UInt4> x, bool isZeroUndef) { UNIMPLEMENTED("Subzero Cttz()"); return UInt4(0); }
+
+	RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+	{
+		return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
+	}
+
+	RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+	{
+		return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
+	}
+
+	void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+	{
+		return emulated::Scatter(base, val, offsets, mask, alignment);
+	}
+
+	void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+	{
+		return emulated::Scatter(base, val, offsets, mask, alignment);
+	}
+
+	RValue<Float> Exp2(RValue<Float> x)
+	{
+		return emulated::Exp2(x);
+	}
+
+	RValue<Float> Log2(RValue<Float> x)
+	{
+		return emulated::Log2(x);
+	}
+
+	RValue<Float4> Sin(RValue<Float4> x)
+	{
+		return emulated::Sin(x);
+	}
+
+	RValue<Float4> Cos(RValue<Float4> x)
+	{
+		return emulated::Cos(x);
+	}
+
+	RValue<Float4> Tan(RValue<Float4> x)
+	{
+		return emulated::Tan(x);
+	}
+
+	RValue<Float4> Asin(RValue<Float4> x)
+	{
+		return emulated::Asin(x);
+	}
+
+	RValue<Float4> Acos(RValue<Float4> x)
+	{
+		return emulated::Acos(x);
+	}
+
+	RValue<Float4> Atan(RValue<Float4> x)
+	{
+		return emulated::Atan(x);
+	}
+
+	RValue<Float4> Sinh(RValue<Float4> x)
+	{
+		return emulated::Sinh(x);
+	}
+
+	RValue<Float4> Cosh(RValue<Float4> x)
+	{
+		return emulated::Cosh(x);
+	}
+
+	RValue<Float4> Tanh(RValue<Float4> x)
+	{
+		return emulated::Tanh(x);
+	}
+
+	RValue<Float4> Asinh(RValue<Float4> x)
+	{
+		return emulated::Asinh(x);
+	}
+
+	RValue<Float4> Acosh(RValue<Float4> x)
+	{
+		return emulated::Acosh(x);
+	}
+
+	RValue<Float4> Atanh(RValue<Float4> x)
+	{
+		return emulated::Atanh(x);
+	}
+
+	RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
+	{
+		return emulated::Atan2(x, y);
+	}
+
+	RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
+	{
+		return emulated::Pow(x, y);
+	}
+
+	RValue<Float4> Exp(RValue<Float4> x)
+	{
+		return emulated::Exp(x);
+	}
+
+	RValue<Float4> Log(RValue<Float4> x)
+	{
+		return emulated::Log(x);
+	}
+
+	RValue<Float4> Exp2(RValue<Float4> x)
+	{
+		return emulated::Exp2(x);
+	}
+
+	RValue<Float4> Log2(RValue<Float4> x)
+	{
+		return emulated::Log2(x);
+	}
+
+	RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef)  // NOTE(review): isZeroUndef is never read in this body
+	{
+		if (emulateIntrinsics)
+		{
+			UNIMPLEMENTED("Subzero Ctlz()"); return UInt(0);
+		}
+		else
+		{
+			Ice::Variable* result = ::function->makeVariable(Ice::IceType_i32);  // i32 destination of the intrinsic call
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Ctlz, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto ctlz = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);  // 1: presumably the argument count (a single addArg follows) - confirm
+			ctlz->addArg(x.value);
+			::basicBlock->appendInst(ctlz);
+
+			return RValue<UInt>(V(result));
+		}
+	}
+
+	RValue<UInt4> Ctlz(RValue<UInt4> x, bool isZeroUndef)  // per-lane wrapper over the scalar Ctlz
+	{
+		if (!emulateIntrinsics)
+		{
+			// TODO: implement vectorized version in Subzero
+			UInt4 lanes;
+			for (int lane = 0; lane < 4; lane++)
+			{
+				lanes = Insert(lanes, Ctlz(Extract(x, lane), isZeroUndef), lane);
+			}
+			return lanes;
+		}
+		else
+		{
+			UNIMPLEMENTED("Subzero Ctlz()"); return UInt4(0);
+		}
+	}
+
+	RValue<UInt> Cttz(RValue<UInt> x, bool isZeroUndef)  // NOTE(review): isZeroUndef is never read in this body
+	{
+		if (emulateIntrinsics)
+		{
+			UNIMPLEMENTED("Subzero Cttz()"); return UInt(0);
+		}
+		else
+		{
+			Ice::Variable* result = ::function->makeVariable(Ice::IceType_i32);  // i32 destination of the intrinsic call
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Cttz, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto cttz = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);  // 1: presumably the argument count (a single addArg follows) - confirm
+			cttz->addArg(x.value);
+			::basicBlock->appendInst(cttz);
+
+			return RValue<UInt>(V(result));
+		}
+	}
+
+	RValue<UInt4> Cttz(RValue<UInt4> x, bool isZeroUndef)  // per-lane wrapper over the scalar Cttz
+	{
+		if (!emulateIntrinsics)
+		{
+			// TODO: implement vectorized version in Subzero
+			UInt4 lanes;
+			for (int lane = 0; lane < 4; lane++)
+			{
+				lanes = Insert(lanes, Cttz(Extract(x, lane), isZeroUndef), lane);
+			}
+			return lanes;
+		}
+		else
+		{
+			UNIMPLEMENTED("Subzero Cttz()"); return UInt4(0);
+		}
+	}
 
 	void EmitDebugLocation() {}
 	void EmitDebugVariable(Value* value) {}