Do not indent C++ namespace contents. This is a style change. Visual Studio defaults to indenting namespace contents, and this was adopted for a long time, but with the new Vulkan implementation this was abandoned. However, the legacy code borrowed from the OpenGL ES implementation still used indentation so it was inconsistent. The justification for not indenting namespace contents is that namespaces are merely a way to avoid name clashes with other projects we don't control directly (and in rare cases internal subprojects when we want to reuse the same names). Hence the vast majority of files have a single namespace, and unlike indentation used for ease of discerning control flow blocks, class contents, or function contents, which can become highly nested, there is no such readability advantage to indenting namespace contents. This is also the Google style recommendation (though no justification or discussion is provided): https://google.github.io/styleguide/cppguide.html#Namespace_Formatting One reasonable counter-argument is consistency with other blocks of curly brackets, but considering that most namespaces span almost the entire file, it's a substantial waste of line length. Because there is no indentation, there's also no need to have the open and closing brackets line up as a visual aid, like we prefer for other uses of curly brackets. So we place the open bracket on the same line as the namespace keyword. A comment is added to the closing bracket to discern it from other closing brackets. It also makes it easier to find the end of anonymous namespaces which typically go at the top of the source file. This change is made separately from applying clang-format because diff tools mark all these unindented lines as changes and this makes it hard to review the smaller style changes made by clang-format. The OpenGL ES and Direct3D code is left untouched because it is in maintenance mode and in case of regressions we want easy 'blame' tool usage.
Bug: b/144825072 Change-Id: Ie2925ebd697e1ffa7c4cbdc9a946531f11f4d934 Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/39348 Presubmit-Ready: Nicolas Capens <nicolascapens@google.com> Reviewed-by: Ben Clayton <bclayton@google.com> Tested-by: Nicolas Capens <nicolascapens@google.com>

diff --git a/src/Reactor/CPUID.cpp b/src/Reactor/CPUID.cpp
index 58ef009..f3b9024 100644
--- a/src/Reactor/CPUID.cpp
+++ b/src/Reactor/CPUID.cpp

@@ -27,201 +27,202 @@
 	#include <sys/types.h>
 #endif
 
-namespace rr
+namespace rr {
+
+bool CPUID::MMX = detectMMX();
+bool CPUID::CMOV = detectCMOV();
+bool CPUID::SSE = detectSSE();
+bool CPUID::SSE2 = detectSSE2();
+bool CPUID::SSE3 = detectSSE3();
+bool CPUID::SSSE3 = detectSSSE3();
+bool CPUID::SSE4_1 = detectSSE4_1();
+
+bool CPUID::enableMMX = true;
+bool CPUID::enableCMOV = true;
+bool CPUID::enableSSE = true;
+bool CPUID::enableSSE2 = true;
+bool CPUID::enableSSE3 = true;
+bool CPUID::enableSSSE3 = true;
+bool CPUID::enableSSE4_1 = true;
+
+void CPUID::setEnableMMX(bool enable)
 {
-	bool CPUID::MMX = detectMMX();
-	bool CPUID::CMOV = detectCMOV();
-	bool CPUID::SSE = detectSSE();
-	bool CPUID::SSE2 = detectSSE2();
-	bool CPUID::SSE3 = detectSSE3();
-	bool CPUID::SSSE3 = detectSSSE3();
-	bool CPUID::SSE4_1 = detectSSE4_1();
+	enableMMX = enable;
 
-	bool CPUID::enableMMX = true;
-	bool CPUID::enableCMOV = true;
-	bool CPUID::enableSSE = true;
-	bool CPUID::enableSSE2 = true;
-	bool CPUID::enableSSE3 = true;
-	bool CPUID::enableSSSE3 = true;
-	bool CPUID::enableSSE4_1 = true;
-
-	void CPUID::setEnableMMX(bool enable)
+	if(!enableMMX)
 	{
-		enableMMX = enable;
-
-		if(!enableMMX)
-		{
-			enableSSE = false;
-			enableSSE2 = false;
-			enableSSE3 = false;
-			enableSSSE3 = false;
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableCMOV(bool enable)
-	{
-		enableCMOV = enable;
-
-		if(!CMOV)
-		{
-			enableSSE = false;
-			enableSSE2 = false;
-			enableSSE3 = false;
-			enableSSSE3 = false;
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableSSE(bool enable)
-	{
-		enableSSE = enable;
-
-		if(enableSSE)
-		{
-			enableMMX = true;
-			enableCMOV = true;
-		}
-		else
-		{
-			enableSSE2 = false;
-			enableSSE3 = false;
-			enableSSSE3 = false;
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableSSE2(bool enable)
-	{
-		enableSSE2 = enable;
-
-		if(enableSSE2)
-		{
-			enableMMX = true;
-			enableCMOV = true;
-			enableSSE = true;
-		}
-		else
-		{
-			enableSSE3 = false;
-			enableSSSE3 = false;
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableSSE3(bool enable)
-	{
-		enableSSE3 = enable;
-
-		if(enableSSE3)
-		{
-			enableMMX = true;
-			enableCMOV = true;
-			enableSSE = true;
-			enableSSE2 = true;
-		}
-		else
-		{
-			enableSSSE3 = false;
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableSSSE3(bool enable)
-	{
-		enableSSSE3 = enable;
-
-		if(enableSSSE3)
-		{
-			enableMMX = true;
-			enableCMOV = true;
-			enableSSE = true;
-			enableSSE2 = true;
-			enableSSE3 = true;
-		}
-		else
-		{
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableSSE4_1(bool enable)
-	{
-		enableSSE4_1 = enable;
-
-		if(enableSSE4_1)
-		{
-			enableMMX = true;
-			enableCMOV = true;
-			enableSSE = true;
-			enableSSE2 = true;
-			enableSSE3 = true;
-			enableSSSE3 = true;
-		}
-	}
-
-	static void cpuid(int registers[4], int info)
-	{
-		#if defined(__i386__) || defined(__x86_64__)
-			#if defined(_WIN32)
-				__cpuid(registers, info);
-			#else
-				__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
-			#endif
-		#else
-			registers[0] = 0;
-			registers[1] = 0;
-			registers[2] = 0;
-			registers[3] = 0;
-		#endif
-	}
-
-	bool CPUID::detectMMX()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return MMX = (registers[3] & 0x00800000) != 0;
-	}
-
-	bool CPUID::detectCMOV()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return CMOV = (registers[3] & 0x00008000) != 0;
-	}
-
-	bool CPUID::detectSSE()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return SSE = (registers[3] & 0x02000000) != 0;
-	}
-
-	bool CPUID::detectSSE2()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return SSE2 = (registers[3] & 0x04000000) != 0;
-	}
-
-	bool CPUID::detectSSE3()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return SSE3 = (registers[2] & 0x00000001) != 0;
-	}
-
-	bool CPUID::detectSSSE3()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return SSSE3 = (registers[2] & 0x00000200) != 0;
-	}
-
-	bool CPUID::detectSSE4_1()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return SSE4_1 = (registers[2] & 0x00080000) != 0;
+		enableSSE = false;
+		enableSSE2 = false;
+		enableSSE3 = false;
+		enableSSSE3 = false;
+		enableSSE4_1 = false;
 	}
 }
+
+void CPUID::setEnableCMOV(bool enable)
+{
+	enableCMOV = enable;
+
+	if(!CMOV)
+	{
+		enableSSE = false;
+		enableSSE2 = false;
+		enableSSE3 = false;
+		enableSSSE3 = false;
+		enableSSE4_1 = false;
+	}
+}
+
+void CPUID::setEnableSSE(bool enable)
+{
+	enableSSE = enable;
+
+	if(enableSSE)
+	{
+		enableMMX = true;
+		enableCMOV = true;
+	}
+	else
+	{
+		enableSSE2 = false;
+		enableSSE3 = false;
+		enableSSSE3 = false;
+		enableSSE4_1 = false;
+	}
+}
+
+void CPUID::setEnableSSE2(bool enable)
+{
+	enableSSE2 = enable;
+
+	if(enableSSE2)
+	{
+		enableMMX = true;
+		enableCMOV = true;
+		enableSSE = true;
+	}
+	else
+	{
+		enableSSE3 = false;
+		enableSSSE3 = false;
+		enableSSE4_1 = false;
+	}
+}
+
+void CPUID::setEnableSSE3(bool enable)
+{
+	enableSSE3 = enable;
+
+	if(enableSSE3)
+	{
+		enableMMX = true;
+		enableCMOV = true;
+		enableSSE = true;
+		enableSSE2 = true;
+	}
+	else
+	{
+		enableSSSE3 = false;
+		enableSSE4_1 = false;
+	}
+}
+
+void CPUID::setEnableSSSE3(bool enable)
+{
+	enableSSSE3 = enable;
+
+	if(enableSSSE3)
+	{
+		enableMMX = true;
+		enableCMOV = true;
+		enableSSE = true;
+		enableSSE2 = true;
+		enableSSE3 = true;
+	}
+	else
+	{
+		enableSSE4_1 = false;
+	}
+}
+
+void CPUID::setEnableSSE4_1(bool enable)
+{
+	enableSSE4_1 = enable;
+
+	if(enableSSE4_1)
+	{
+		enableMMX = true;
+		enableCMOV = true;
+		enableSSE = true;
+		enableSSE2 = true;
+		enableSSE3 = true;
+		enableSSSE3 = true;
+	}
+}
+
+static void cpuid(int registers[4], int info)
+{
+	#if defined(__i386__) || defined(__x86_64__)
+		#if defined(_WIN32)
+			__cpuid(registers, info);
+		#else
+			__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
+		#endif
+	#else
+		registers[0] = 0;
+		registers[1] = 0;
+		registers[2] = 0;
+		registers[3] = 0;
+	#endif
+}
+
+bool CPUID::detectMMX()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return MMX = (registers[3] & 0x00800000) != 0;
+}
+
+bool CPUID::detectCMOV()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return CMOV = (registers[3] & 0x00008000) != 0;
+}
+
+bool CPUID::detectSSE()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return SSE = (registers[3] & 0x02000000) != 0;
+}
+
+bool CPUID::detectSSE2()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return SSE2 = (registers[3] & 0x04000000) != 0;
+}
+
+bool CPUID::detectSSE3()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return SSE3 = (registers[2] & 0x00000001) != 0;
+}
+
+bool CPUID::detectSSSE3()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return SSSE3 = (registers[2] & 0x00000200) != 0;
+}
+
+bool CPUID::detectSSE4_1()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return SSE4_1 = (registers[2] & 0x00080000) != 0;
+}
+
+}  // namespace rr

diff --git a/src/Reactor/CPUID.hpp b/src/Reactor/CPUID.hpp
index 108d4a7..577e237 100644
--- a/src/Reactor/CPUID.hpp
+++ b/src/Reactor/CPUID.hpp

@@ -15,104 +15,108 @@
 #ifndef rr_CPUID_hpp
 #define rr_CPUID_hpp
 
-namespace rr
+namespace rr {
+
+#if !defined(__i386__) && defined(_M_IX86)
+	#define __i386__ 1
+#endif
+
+#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
+	#define __x86_64__ 1
+#endif
+
+class CPUID
 {
-	#if !defined(__i386__) && defined(_M_IX86)
-		#define __i386__ 1
-	#endif
+public:
+	static bool supportsMMX();
+	static bool supportsCMOV();
+	static bool supportsMMX2();   // MMX instructions added by SSE: pshufw, pmulhuw, pmovmskb, pavgw/b, pextrw, pinsrw, pmaxsw/ub, etc.
+	static bool supportsSSE();
+	static bool supportsSSE2();
+	static bool supportsSSE3();
+	static bool supportsSSSE3();
+	static bool supportsSSE4_1();
 
-	#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
-		#define __x86_64__ 1
-	#endif
+	static void setEnableMMX(bool enable);
+	static void setEnableCMOV(bool enable);
+	static void setEnableSSE(bool enable);
+	static void setEnableSSE2(bool enable);
+	static void setEnableSSE3(bool enable);
+	static void setEnableSSSE3(bool enable);
+	static void setEnableSSE4_1(bool enable);
 
-	class CPUID
-	{
-	public:
-		static bool supportsMMX();
-		static bool supportsCMOV();
-		static bool supportsMMX2();   // MMX instructions added by SSE: pshufw, pmulhuw, pmovmskb, pavgw/b, pextrw, pinsrw, pmaxsw/ub, etc.
-		static bool supportsSSE();
-		static bool supportsSSE2();
-		static bool supportsSSE3();
-		static bool supportsSSSE3();
-		static bool supportsSSE4_1();
+private:
+	static bool MMX;
+	static bool CMOV;
+	static bool SSE;
+	static bool SSE2;
+	static bool SSE3;
+	static bool SSSE3;
+	static bool SSE4_1;
 
-		static void setEnableMMX(bool enable);
-		static void setEnableCMOV(bool enable);
-		static void setEnableSSE(bool enable);
-		static void setEnableSSE2(bool enable);
-		static void setEnableSSE3(bool enable);
-		static void setEnableSSSE3(bool enable);
-		static void setEnableSSE4_1(bool enable);
+	static bool enableMMX;
+	static bool enableCMOV;
+	static bool enableSSE;
+	static bool enableSSE2;
+	static bool enableSSE3;
+	static bool enableSSSE3;
+	static bool enableSSE4_1;
 
-	private:
-		static bool MMX;
-		static bool CMOV;
-		static bool SSE;
-		static bool SSE2;
-		static bool SSE3;
-		static bool SSSE3;
-		static bool SSE4_1;
+	static bool detectMMX();
+	static bool detectCMOV();
+	static bool detectSSE();
+	static bool detectSSE2();
+	static bool detectSSE3();
+	static bool detectSSSE3();
+	static bool detectSSE4_1();
+};
 
-		static bool enableMMX;
-		static bool enableCMOV;
-		static bool enableSSE;
-		static bool enableSSE2;
-		static bool enableSSE3;
-		static bool enableSSSE3;
-		static bool enableSSE4_1;
+}  // namespace rr
 
-		static bool detectMMX();
-		static bool detectCMOV();
-		static bool detectSSE();
-		static bool detectSSE2();
-		static bool detectSSE3();
-		static bool detectSSSE3();
-		static bool detectSSE4_1();
-	};
+/* Inline implementation */
+
+namespace rr {
+
+inline bool CPUID::supportsMMX()
+{
+	return MMX && enableMMX;
 }
 
-namespace rr
+inline bool CPUID::supportsCMOV()
 {
-	inline bool CPUID::supportsMMX()
-	{
-		return MMX && enableMMX;
-	}
-
-	inline bool CPUID::supportsCMOV()
-	{
-		return CMOV && enableCMOV;
-	}
-
-	inline bool CPUID::supportsMMX2()
-	{
-		return supportsSSE();   // Coincides with 64-bit integer vector instructions supported by SSE
-	}
-
-	inline bool CPUID::supportsSSE()
-	{
-		return SSE && enableSSE;
-	}
-
-	inline bool CPUID::supportsSSE2()
-	{
-		return SSE2 && enableSSE2;
-	}
-
-	inline bool CPUID::supportsSSE3()
-	{
-		return SSE3 && enableSSE3;
-	}
-
-	inline bool CPUID::supportsSSSE3()
-	{
-		return SSSE3 && enableSSSE3;
-	}
-
-	inline bool CPUID::supportsSSE4_1()
-	{
-		return SSE4_1 && enableSSE4_1;
-	}
+	return CMOV && enableCMOV;
 }
 
+inline bool CPUID::supportsMMX2()
+{
+	return supportsSSE();   // Coincides with 64-bit integer vector instructions supported by SSE
+}
+
+inline bool CPUID::supportsSSE()
+{
+	return SSE && enableSSE;
+}
+
+inline bool CPUID::supportsSSE2()
+{
+	return SSE2 && enableSSE2;
+}
+
+inline bool CPUID::supportsSSE3()
+{
+	return SSE3 && enableSSE3;
+}
+
+inline bool CPUID::supportsSSSE3()
+{
+	return SSSE3 && enableSSSE3;
+}
+
+inline bool CPUID::supportsSSE4_1()
+{
+	return SSE4_1 && enableSSE4_1;
+}
+
+}  // namespace rr
+
 #endif   // rr_CPUID_hpp

diff --git a/src/Reactor/Coroutine.hpp b/src/Reactor/Coroutine.hpp
index 6bf7089..211d68b 100644
--- a/src/Reactor/Coroutine.hpp
+++ b/src/Reactor/Coroutine.hpp

@@ -19,180 +19,180 @@
 #ifndef rr_ReactorCoroutine_hpp
 #define rr_ReactorCoroutine_hpp
 
-namespace rr
+namespace rr {
+
+// Base class for the template Stream<T>
+class StreamBase
 {
-	// Base class for the template Stream<T>
-	class StreamBase
+protected:
+	StreamBase(const std::shared_ptr<Routine> &routine, Nucleus::CoroutineHandle handle)
+		: routine(routine), handle(handle) {}
+
+	~StreamBase()
 	{
-	protected:
-		StreamBase(const std::shared_ptr<Routine> &routine, Nucleus::CoroutineHandle handle)
-			: routine(routine), handle(handle) {}
+		auto pfn = (Nucleus::CoroutineDestroy*)routine->getEntry(Nucleus::CoroutineEntryDestroy);
+		pfn(handle);
+	}
 
-		~StreamBase()
-		{
-			auto pfn = (Nucleus::CoroutineDestroy*)routine->getEntry(Nucleus::CoroutineEntryDestroy);
-			pfn(handle);
-		}
-
-		bool await(void* out)
-		{
-			auto pfn = (Nucleus::CoroutineAwait*)routine->getEntry(Nucleus::CoroutineEntryAwait);
-			return pfn(handle, out);
-		}
+	bool await(void* out)
+	{
+		auto pfn = (Nucleus::CoroutineAwait*)routine->getEntry(Nucleus::CoroutineEntryAwait);
+		return pfn(handle, out);
+	}
 
 private:
-		std::shared_ptr<Routine> routine;
-		Nucleus::CoroutineHandle handle;
-	};
+	std::shared_ptr<Routine> routine;
+	Nucleus::CoroutineHandle handle;
+};
 
-	// Stream is the interface to a running Coroutine instance.
-	// A Coroutine may Yield() values of type T, which can be retrieved with
-	// await().
-	template<typename T>
-	class Stream : public StreamBase
+// Stream is the interface to a running Coroutine instance.
+// A Coroutine may Yield() values of type T, which can be retrieved with
+// await().
+template<typename T>
+class Stream : public StreamBase
+{
+public:
+	inline Stream(const std::shared_ptr<Routine> &routine, Nucleus::CoroutineHandle handle)
+		: StreamBase(routine, handle) {}
+
+	// await() retrieves the next yielded value from the coroutine.
+	// Returns true if the coroutine yieled a value and out was assigned a
+	// new value. If await() returns false, the coroutine has finished
+	// execution and await() will return false for all future calls.
+	inline bool await(T& out) { return StreamBase::await(&out); }
+};
+
+template<typename FunctionType>
+class Coroutine;
+
+// Coroutine constructs a reactor Coroutine function.
+// rr::Coroutine is similar to rr::Function in that it builds a new
+// executable function, but Coroutines have the following differences:
+//  (1) Coroutines do not support Return() statements.
+//  (2) Coroutines support Yield() statements to suspend execution of the
+//      coroutine and pass a value up to the caller. Yield can be called
+//      multiple times in a single execution of a coroutine.
+//  (3) The template argument T to Coroutine<T> is a C-style function
+//      signature.
+//  (4) Coroutine::operator() returns a rr::Stream<T> instead of an
+//      rr::Routine.
+//  (5) operator() starts execution of the coroutine immediately.
+//  (6) operator() uses the Coroutine's template function signature to
+//      ensure the argument types match the generated function signature.
+//
+// Example usage:
+//
+//   // Build the coroutine function
+//   Coroutine<int()> coroutine;
+//   {
+//       Yield(Int(0));
+//       Yield(Int(1));
+//       Int current = 1;
+//       Int next = 1;
+//       While (true) {
+//           Yield(next);
+//           auto tmp = current + next;
+//           current = next;
+//           next = tmp;
+//       }
+//   }
+//
+//   // Start the execution of the coroutine.
+//   auto s = coroutine();
+//
+//   // Grab the first 20 yielded values and print them.
+//   for (int i = 0; i < 20; i++)
+//   {
+//       int val = 0;
+//       s->await(val);
+//       printf("Fibonacci(%d): %d", i, val);
+//   }
+//
+template<typename Return, typename... Arguments>
+class Coroutine<Return(Arguments...)>
+{
+public:
+	Coroutine();
+
+	template<int index>
+	using CArgumentType = typename std::tuple_element<index, std::tuple<Arguments...>>::type;
+
+	template<int index>
+	using RArgumentType = CToReactorT<CArgumentType<index>>;
+
+	// Return the argument value with the given index.
+	template<int index>
+	Argument<RArgumentType<index>> Arg() const
 	{
-	public:
-		inline Stream(const std::shared_ptr<Routine> &routine, Nucleus::CoroutineHandle handle)
-			: StreamBase(routine, handle) {}
-
-		// await() retrieves the next yielded value from the coroutine.
-		// Returns true if the coroutine yieled a value and out was assigned a
-		// new value. If await() returns false, the coroutine has finished
-		// execution and await() will return false for all future calls.
-		inline bool await(T& out) { return StreamBase::await(&out); }
-	};
-
-	template<typename FunctionType>
-	class Coroutine;
-
-	// Coroutine constructs a reactor Coroutine function.
-	// rr::Coroutine is similar to rr::Function in that it builds a new
-	// executable function, but Coroutines have the following differences:
-	//  (1) Coroutines do not support Return() statements.
-	//  (2) Coroutines support Yield() statements to suspend execution of the
-	//      coroutine and pass a value up to the caller. Yield can be called
-	//      multiple times in a single execution of a coroutine.
-	//  (3) The template argument T to Coroutine<T> is a C-style function
-	//      signature.
-	//  (4) Coroutine::operator() returns a rr::Stream<T> instead of an
-	//      rr::Routine.
-	//  (5) operator() starts execution of the coroutine immediately.
-	//  (6) operator() uses the Coroutine's template function signature to
-	//      ensure the argument types match the generated function signature.
-	//
-	// Example usage:
-	//
-	//   // Build the coroutine function
-	//   Coroutine<int()> coroutine;
-	//   {
-	//       Yield(Int(0));
-	//       Yield(Int(1));
-	//       Int current = 1;
-	//       Int next = 1;
-	//       While (true) {
-	//           Yield(next);
-	//           auto tmp = current + next;
-	//           current = next;
-	//           next = tmp;
-	//       }
-	//   }
-	//
-	//   // Start the execution of the coroutine.
-	//   auto s = coroutine();
-	//
-	//   // Grab the first 20 yielded values and print them.
-	//   for (int i = 0; i < 20; i++)
-	//   {
-	//       int val = 0;
-	//       s->await(val);
-	//       printf("Fibonacci(%d): %d", i, val);
-	//   }
-	//
-	template<typename Return, typename... Arguments>
-	class Coroutine<Return(Arguments...)>
-	{
-	public:
-		Coroutine();
-
-		template<int index>
-		using CArgumentType = typename std::tuple_element<index, std::tuple<Arguments...>>::type;
-
-		template<int index>
-		using RArgumentType = CToReactorT<CArgumentType<index>>;
-
-		// Return the argument value with the given index.
-		template<int index>
-		Argument<RArgumentType<index>> Arg() const
-		{
-			Value *arg = Nucleus::getArgument(index);
-			return Argument<RArgumentType<index>>(arg);
-		}
-
-		// Completes building of the coroutine and generates the coroutine's
-		// executable code. After calling, no more reactor functions may be
-		// called without building a new rr::Function or rr::Coroutine.
-		// While automatically called by operator(), finalize() should be called
-		// as early as possible to release the global Reactor mutex lock.
-		inline void finalize(const Config::Edit &cfg = Config::Edit::None);
-
-		// Starts execution of the coroutine and returns a unique_ptr to a
-		// Stream<> that exposes the await() function for obtaining yielded
-		// values.
-		std::unique_ptr<Stream<Return>> operator()(Arguments...);
-
-	protected:
-		std::unique_ptr<Nucleus> core;
-		std::shared_ptr<Routine> routine;
-		std::vector<Type*> arguments;
-	};
-
-	template<typename Return, typename... Arguments>
-	Coroutine<Return(Arguments...)>::Coroutine()
-	{
-		core.reset(new Nucleus());
-
-		std::vector<Type*> types = {CToReactorT<Arguments>::getType()...};
-		for(auto type : types)
-		{
-			if(type != Void::getType())
-			{
-				arguments.push_back(type);
-			}
-		}
-
-		Nucleus::createCoroutine(CToReactorT<Return>::getType(), arguments);
+		Value *arg = Nucleus::getArgument(index);
+		return Argument<RArgumentType<index>>(arg);
 	}
 
-	template<typename Return, typename... Arguments>
-	void Coroutine<Return(Arguments...)>::finalize(const Config::Edit &cfg /* = Config::Edit::None */)
+	// Completes building of the coroutine and generates the coroutine's
+	// executable code. After calling, no more reactor functions may be
+	// called without building a new rr::Function or rr::Coroutine.
+	// While automatically called by operator(), finalize() should be called
+	// as early as possible to release the global Reactor mutex lock.
+	inline void finalize(const Config::Edit &cfg = Config::Edit::None);
+
+	// Starts execution of the coroutine and returns a unique_ptr to a
+	// Stream<> that exposes the await() function for obtaining yielded
+	// values.
+	std::unique_ptr<Stream<Return>> operator()(Arguments...);
+
+protected:
+	std::unique_ptr<Nucleus> core;
+	std::shared_ptr<Routine> routine;
+	std::vector<Type*> arguments;
+};
+
+template<typename Return, typename... Arguments>
+Coroutine<Return(Arguments...)>::Coroutine()
+{
+	core.reset(new Nucleus());
+
+	std::vector<Type*> types = {CToReactorT<Arguments>::getType()...};
+	for(auto type : types)
 	{
-		if(core != nullptr)
+		if(type != Void::getType())
 		{
-			routine = core->acquireCoroutine("coroutine", cfg);
-			core.reset(nullptr);
+			arguments.push_back(type);
 		}
 	}
 
-	template<typename Return, typename... Arguments>
-	std::unique_ptr<Stream<Return>>
-	Coroutine<Return(Arguments...)>::operator()(Arguments... args)
-	{
-		finalize();
+	Nucleus::createCoroutine(CToReactorT<Return>::getType(), arguments);
+}
 
-		using Sig = Nucleus::CoroutineBegin<Arguments...>;
-		auto pfn = (Sig*)routine->getEntry(Nucleus::CoroutineEntryBegin);
-		auto handle = pfn(args...);
-		return std::unique_ptr<Stream<Return>>(new Stream<Return>(routine, handle));
+template<typename Return, typename... Arguments>
+void Coroutine<Return(Arguments...)>::finalize(const Config::Edit &cfg /* = Config::Edit::None */)
+{
+	if(core != nullptr)
+	{
+		routine = core->acquireCoroutine("coroutine", cfg);
+		core.reset(nullptr);
 	}
+}
+
+template<typename Return, typename... Arguments>
+std::unique_ptr<Stream<Return>>
+Coroutine<Return(Arguments...)>::operator()(Arguments... args)
+{
+	finalize();
+
+	using Sig = Nucleus::CoroutineBegin<Arguments...>;
+	auto pfn = (Sig*)routine->getEntry(Nucleus::CoroutineEntryBegin);
+	auto handle = pfn(args...);
+	return std::unique_ptr<Stream<Return>>(new Stream<Return>(routine, handle));
+}
 
 #ifdef Yield // Defined in WinBase.h
 #undef Yield
 #endif
 
-	// Suspends execution of the coroutine and yields val to the caller.
-	// Execution of the coroutine will resume after val is retrieved.
-	template<typename T>
-	inline void Yield(const T &val) { Nucleus::yield(ValueOf(val)); }
+// Suspends execution of the coroutine and yields val to the caller.
+// Execution of the coroutine will resume after val is retrieved.
+template<typename T>
+inline void Yield(const T &val) { Nucleus::yield(ValueOf(val)); }
 
 } // namespace rr
 

diff --git a/src/Reactor/Debug.cpp b/src/Reactor/Debug.cpp
index 7f0d2cd..df8e8fc 100644
--- a/src/Reactor/Debug.cpp
+++ b/src/Reactor/Debug.cpp

@@ -17,8 +17,7 @@
 #include <string>
 #include <stdarg.h>
 
-namespace rr
-{
+namespace rr {
 
 void tracev(const char *format, va_list args)
 {
@@ -71,4 +70,4 @@
 	::abort();
 }
 
-} // namespace rr
+}  // namespace rr

diff --git a/src/Reactor/Debug.hpp b/src/Reactor/Debug.hpp
index 929a927..da8a48d 100644
--- a/src/Reactor/Debug.hpp
+++ b/src/Reactor/Debug.hpp

@@ -31,19 +31,20 @@
 #define CHECK_PRINTF_ARGS
 #endif
 
-namespace rr
-{
-	// Outputs text to the debugging log
-	void trace(const char *format, ...) CHECK_PRINTF_ARGS;
-	inline void trace() {}
+namespace rr {
 
-	// Outputs text to the debugging log and prints to stderr.
-	void warn(const char *format, ...) CHECK_PRINTF_ARGS;
-	inline void warn() {}
+// Outputs text to the debugging log
+void trace(const char *format, ...) CHECK_PRINTF_ARGS;
+inline void trace() {}
 
-	// Outputs the message to the debugging log and stderr, and calls abort().
-	void abort(const char *format, ...) CHECK_PRINTF_ARGS;
-}
+// Outputs text to the debugging log and prints to stderr.
+void warn(const char *format, ...) CHECK_PRINTF_ARGS;
+inline void warn() {}
+
+// Outputs the message to the debugging log and stderr, and calls abort().
+void abort(const char *format, ...) CHECK_PRINTF_ARGS;
+
+}  // namespace rr
 
 // A macro to output a trace of a function call and its arguments to the
 // debugging log. Disabled if RR_DISABLE_TRACE is defined.

diff --git a/src/Reactor/EmulatedReactor.cpp b/src/Reactor/EmulatedReactor.cpp
index 8a06d6f..efdb5a3 100644
--- a/src/Reactor/EmulatedReactor.cpp
+++ b/src/Reactor/EmulatedReactor.cpp

@@ -4,210 +4,211 @@
 #include <functional>
 #include <utility>
 
-namespace rr
+namespace rr {
+namespace {
+
+template <typename T>
+struct UnderlyingType
 {
-	namespace
+	using Type = typename decltype(rr::Extract(std::declval<RValue<T>>(), 0))::rvalue_underlying_type;
+};
+
+template <typename T>
+using UnderlyingTypeT = typename UnderlyingType<T>::Type;
+
+// Calls a single-argument function on each of the four elements of a vector type.
+template <typename Func, typename T>
+RValue<T> call4(Func func, const RValue<T>& x)
+{
+	T result;
+	result = Insert(result, Call(func, Extract(x, 0)), 0);
+	result = Insert(result, Call(func, Extract(x, 1)), 1);
+	result = Insert(result, Call(func, Extract(x, 2)), 2);
+	result = Insert(result, Call(func, Extract(x, 3)), 3);
+	return result;
+}
+
+// Calls a two-argument function on each of the four pairs of corresponding elements of two vectors.
+template <typename Func, typename T>
+RValue<T> call4(Func func, const RValue<T>& x, const RValue<T>& y)
+{
+	T result;
+	result = Insert(result, Call(func, Extract(x, 0), Extract(y, 0)), 0);
+	result = Insert(result, Call(func, Extract(x, 1), Extract(y, 1)), 1);
+	result = Insert(result, Call(func, Extract(x, 2), Extract(y, 2)), 2);
+	result = Insert(result, Call(func, Extract(x, 3), Extract(y, 3)), 3);
+	return result;
+}
+
+template <typename T, typename EL = UnderlyingTypeT<T>>
+void gather(T& out, RValue<Pointer<EL>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes)
+{
+	constexpr bool atomic = false;
+	constexpr std::memory_order order = std::memory_order_relaxed;
+
+	Pointer<Byte> baseBytePtr = base;
+
+	out = T(0);
+	for (int i = 0; i < 4; i++)
 	{
-		template <typename T>
-		struct UnderlyingType
+		If(Extract(mask, i) != 0)
 		{
-			using Type = typename decltype(rr::Extract(std::declval<RValue<T>>(), 0))::rvalue_underlying_type;
-		};
-
-		template <typename T>
-		using UnderlyingTypeT = typename UnderlyingType<T>::Type;
-
-		// Call single arg function on a vector type
-		template <typename Func, typename T>
-		RValue<T> call4(Func func, const RValue<T>& x)
-		{
-			T result;
-			result = Insert(result, Call(func, Extract(x, 0)), 0);
-			result = Insert(result, Call(func, Extract(x, 1)), 1);
-			result = Insert(result, Call(func, Extract(x, 2)), 2);
-			result = Insert(result, Call(func, Extract(x, 3)), 3);
-			return result;
+			auto offset = Extract(offsets, i);
+			auto el = Load(Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
+			out = Insert(out, el, i);
 		}
-
-		// Call two arg function on a vector type
-		template <typename Func, typename T>
-		RValue<T> call4(Func func, const RValue<T>& x, const RValue<T>& y)
+		Else If(zeroMaskedLanes)
 		{
-			T result;
-			result = Insert(result, Call(func, Extract(x, 0), Extract(y, 0)), 0);
-			result = Insert(result, Call(func, Extract(x, 1), Extract(y, 1)), 1);
-			result = Insert(result, Call(func, Extract(x, 2), Extract(y, 2)), 2);
-			result = Insert(result, Call(func, Extract(x, 3), Extract(y, 3)), 3);
-			return result;
-		}
-
-		template <typename T, typename EL = UnderlyingTypeT<T>>
-		void gather(T& out, RValue<Pointer<EL>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes)
-		{
-			constexpr bool atomic = false;
-			constexpr std::memory_order order = std::memory_order_relaxed;
-
-			Pointer<Byte> baseBytePtr = base;
-
-			out = T(0);
-			for (int i = 0; i < 4; i++)
-			{
-				If(Extract(mask, i) != 0)
-				{
-					auto offset = Extract(offsets, i);
-					auto el = Load(Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
-					out = Insert(out, el, i);
-				}
-				Else If(zeroMaskedLanes)
-				{
-					out = Insert(out, EL(0), i);
-				}
-			}
-		}
-
-		template <typename T, typename EL = UnderlyingTypeT<T>>
-		void scatter(RValue<Pointer<EL>> base, RValue<T> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-		{
-			constexpr bool atomic = false;
-			constexpr std::memory_order order = std::memory_order_relaxed;
-
-			Pointer<Byte> baseBytePtr = base;
-
-			for (int i = 0; i < 4; i++)
-			{
-				If(Extract(mask, i) != 0)
-				{
-					auto offset = Extract(offsets, i);
-					Store(Extract(val, i), Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
-				}
-			}
-		}
-	}
-
-	namespace emulated
-	{
-		RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-		{
-			Float4 result{};
-			gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
-			return result;
-		}
-
-		RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-		{
-			Int4 result{};
-			gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
-			return result;
-		}
-
-		void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-		{
-			scatter(base, val, offsets, mask, alignment);
-		}
-
-		void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-		{
-			scatter<Int4>(base, val, offsets, mask, alignment);
-		}
-
-		RValue<Float> Exp2(RValue<Float> x)
-		{
-			return Call(exp2f, x);
-		}
-
-		RValue<Float> Log2(RValue<Float> x)
-		{
-			return Call(log2f, x);
-		}
-
-		RValue<Float4> Sin(RValue<Float4> x)
-		{
-			return call4(sinf, x);
-		}
-
-		RValue<Float4> Cos(RValue<Float4> x)
-		{
-			return call4(cosf, x);
-		}
-
-		RValue<Float4> Tan(RValue<Float4> x)
-		{
-			return call4(tanf, x);
-		}
-
-		RValue<Float4> Asin(RValue<Float4> x)
-		{
-			return call4(asinf, x);
-		}
-
-		RValue<Float4> Acos(RValue<Float4> x)
-		{
-			return call4(acosf, x);
-		}
-
-		RValue<Float4> Atan(RValue<Float4> x)
-		{
-			return call4(atanf, x);
-		}
-
-		RValue<Float4> Sinh(RValue<Float4> x)
-		{
-			return call4(sinhf, x);
-		}
-
-		RValue<Float4> Cosh(RValue<Float4> x)
-		{
-			return call4(coshf, x);
-		}
-
-		RValue<Float4> Tanh(RValue<Float4> x)
-		{
-			return call4(tanhf, x);
-		}
-
-		RValue<Float4> Asinh(RValue<Float4> x)
-		{
-			return call4(asinhf, x);
-		}
-
-		RValue<Float4> Acosh(RValue<Float4> x)
-		{
-			return call4(acoshf, x);
-		}
-
-		RValue<Float4> Atanh(RValue<Float4> x)
-		{
-			return call4(atanhf, x);
-		}
-
-		RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
-		{
-			return call4(atan2f, x, y);
-		}
-
-		RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
-		{
-			return call4(powf, x, y);
-		}
-
-		RValue<Float4> Exp(RValue<Float4> x)
-		{
-			return call4(expf, x);
-		}
-
-		RValue<Float4> Log(RValue<Float4> x)
-		{
-			return call4(logf, x);
-		}
-
-		RValue<Float4> Exp2(RValue<Float4> x)
-		{
-			return call4(exp2f, x);
-		}
-
-		RValue<Float4> Log2(RValue<Float4> x)
-		{
-			return call4(log2f, x);
+			out = Insert(out, EL(0), i);
 		}
 	}
 }
+
+template <typename T, typename EL = UnderlyingTypeT<T>>
+void scatter(RValue<Pointer<EL>> base, RValue<T> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+{
+	constexpr bool atomic = false;
+	constexpr std::memory_order order = std::memory_order_relaxed;
+
+	Pointer<Byte> baseBytePtr = base;
+
+	for (int i = 0; i < 4; i++)
+	{
+		If(Extract(mask, i) != 0)
+		{
+			auto offset = Extract(offsets, i);
+			Store(Extract(val, i), Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
+		}
+	}
+}
+
+}  // anonymous namespace
+
+namespace emulated {
+
+RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+{
+	Float4 result{};
+	gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
+	return result;
+}
+
+RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+{
+	Int4 result{};
+	gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
+	return result;
+}
+
+void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+{
+	scatter(base, val, offsets, mask, alignment);
+}
+
+void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+{
+	scatter<Int4>(base, val, offsets, mask, alignment);
+}
+
+RValue<Float> Exp2(RValue<Float> x)
+{
+	return Call(exp2f, x);
+}
+
+RValue<Float> Log2(RValue<Float> x)
+{
+	return Call(log2f, x);
+}
+
+RValue<Float4> Sin(RValue<Float4> x)
+{
+	return call4(sinf, x);
+}
+
+RValue<Float4> Cos(RValue<Float4> x)
+{
+	return call4(cosf, x);
+}
+
+RValue<Float4> Tan(RValue<Float4> x)
+{
+	return call4(tanf, x);
+}
+
+RValue<Float4> Asin(RValue<Float4> x)
+{
+	return call4(asinf, x);
+}
+
+RValue<Float4> Acos(RValue<Float4> x)
+{
+	return call4(acosf, x);
+}
+
+RValue<Float4> Atan(RValue<Float4> x)
+{
+	return call4(atanf, x);
+}
+
+RValue<Float4> Sinh(RValue<Float4> x)
+{
+	return call4(sinhf, x);
+}
+
+RValue<Float4> Cosh(RValue<Float4> x)
+{
+	return call4(coshf, x);
+}
+
+RValue<Float4> Tanh(RValue<Float4> x)
+{
+	return call4(tanhf, x);
+}
+
+RValue<Float4> Asinh(RValue<Float4> x)
+{
+	return call4(asinhf, x);
+}
+
+RValue<Float4> Acosh(RValue<Float4> x)
+{
+	return call4(acoshf, x);
+}
+
+RValue<Float4> Atanh(RValue<Float4> x)
+{
+	return call4(atanhf, x);
+}
+
+RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
+{
+	return call4(atan2f, x, y);
+}
+
+RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
+{
+	return call4(powf, x, y);
+}
+
+RValue<Float4> Exp(RValue<Float4> x)
+{
+	return call4(expf, x);
+}
+
+RValue<Float4> Log(RValue<Float4> x)
+{
+	return call4(logf, x);
+}
+
+RValue<Float4> Exp2(RValue<Float4> x)
+{
+	return call4(exp2f, x);
+}
+
+RValue<Float4> Log2(RValue<Float4> x)
+{
+	return call4(log2f, x);
+}
+
+}  // namespace emulated
+}  // namespace rr

diff --git a/src/Reactor/EmulatedReactor.hpp b/src/Reactor/EmulatedReactor.hpp
index dbdc198..10b9b2a 100644
--- a/src/Reactor/EmulatedReactor.hpp
+++ b/src/Reactor/EmulatedReactor.hpp

@@ -21,33 +21,33 @@
 // starting point for implementing a new backend, or for when adding
 // functionality to an existing backend is non-trivial.
 
-namespace rr
-{
-	namespace emulated
-	{
-		RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-		RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-		void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
-		void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
-		RValue<Float> Exp2(RValue<Float> x);
-		RValue<Float> Log2(RValue<Float> x);
-		RValue<Float4> Sin(RValue<Float4> x);
-		RValue<Float4> Cos(RValue<Float4> x);
-		RValue<Float4> Tan(RValue<Float4> x);
-		RValue<Float4> Asin(RValue<Float4> x);
-		RValue<Float4> Acos(RValue<Float4> x);
-		RValue<Float4> Atan(RValue<Float4> x);
-		RValue<Float4> Sinh(RValue<Float4> x);
-		RValue<Float4> Cosh(RValue<Float4> x);
-		RValue<Float4> Tanh(RValue<Float4> x);
-		RValue<Float4> Asinh(RValue<Float4> x);
-		RValue<Float4> Acosh(RValue<Float4> x);
-		RValue<Float4> Atanh(RValue<Float4> x);
-		RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y);
-		RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
-		RValue<Float4> Exp(RValue<Float4> x);
-		RValue<Float4> Log(RValue<Float4> x);
-		RValue<Float4> Exp2(RValue<Float4> x);
-		RValue<Float4> Log2(RValue<Float4> x);
-	}
-}
+namespace rr {
+namespace emulated {
+
+RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+RValue<Float> Exp2(RValue<Float> x);
+RValue<Float> Log2(RValue<Float> x);
+RValue<Float4> Sin(RValue<Float4> x);
+RValue<Float4> Cos(RValue<Float4> x);
+RValue<Float4> Tan(RValue<Float4> x);
+RValue<Float4> Asin(RValue<Float4> x);
+RValue<Float4> Acos(RValue<Float4> x);
+RValue<Float4> Atan(RValue<Float4> x);
+RValue<Float4> Sinh(RValue<Float4> x);
+RValue<Float4> Cosh(RValue<Float4> x);
+RValue<Float4> Tanh(RValue<Float4> x);
+RValue<Float4> Asinh(RValue<Float4> x);
+RValue<Float4> Acosh(RValue<Float4> x);
+RValue<Float4> Atanh(RValue<Float4> x);
+RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y);
+RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
+RValue<Float4> Exp(RValue<Float4> x);
+RValue<Float4> Log(RValue<Float4> x);
+RValue<Float4> Exp2(RValue<Float4> x);
+RValue<Float4> Log2(RValue<Float4> x);
+
+}  // namespace emulated
+}  // namespace rr

diff --git a/src/Reactor/ExecutableMemory.cpp b/src/Reactor/ExecutableMemory.cpp
index 15d5b39..40acb94 100644
--- a/src/Reactor/ExecutableMemory.cpp
+++ b/src/Reactor/ExecutableMemory.cpp

@@ -42,10 +42,9 @@
 #define __x86__
 #endif
 
-namespace rr
-{
-namespace
-{
+namespace rr {
+namespace {
+
 struct Allocation
 {
 //	size_t bytes;
@@ -301,4 +300,5 @@
 		deallocate(memory);
 	#endif
 }
-}
+
+}  // namespace rr

diff --git a/src/Reactor/ExecutableMemory.hpp b/src/Reactor/ExecutableMemory.hpp
index 95dac5e..4c1ef33 100644
--- a/src/Reactor/ExecutableMemory.hpp
+++ b/src/Reactor/ExecutableMemory.hpp

@@ -19,8 +19,8 @@
 #include <cstdint>
 #include <cstring>
 
-namespace rr
-{
+namespace rr {
+
 size_t memoryPageSize();
 
 void *allocateExecutable(size_t bytes);   // Allocates memory that can be made executable using markExecutable()
@@ -87,6 +87,7 @@
 private:
 	void *ptr;
 };
-}
+
+}  // namespace rr
 
 #endif   // rr_ExecutableMemory_hpp

diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 9654292..64ee668 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp

@@ -107,2288 +107,2289 @@
 	extern "C" void _chkstk();
 #endif
 
-namespace rr
+namespace rr {
+
+void* resolveExternalSymbol(const char*);
+
+}  // namespace rr
+
+namespace {
+
+// Default configuration settings. Must be accessed under mutex lock.
+std::mutex defaultConfigLock;
+rr::Config &defaultConfig()
 {
-	void* resolveExternalSymbol(const char*);
+	// This uses a static in a function to avoid the cost of a global static
+	// initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
+	static rr::Config config = rr::Config::Edit()
+		.add(rr::Optimization::Pass::ScalarReplAggregates)
+		.add(rr::Optimization::Pass::InstructionCombining)
+		.apply({});
+	return config;
 }
 
-namespace
+// Cache provides a simple, thread-safe key-value store.
+template <typename KEY, typename VALUE>
+class Cache
 {
-	// Default configuration settings. Must be accessed under mutex lock.
-	std::mutex defaultConfigLock;
-	rr::Config &defaultConfig()
+public:
+	Cache() = default;
+	Cache(const Cache& other);
+	VALUE getOrCreate(KEY key, std::function<VALUE()> create);
+private:
+	mutable std::mutex mutex; // mutable required for copy constructor.
+	std::unordered_map<KEY, VALUE> map;
+};
+
+template <typename KEY, typename VALUE>
+Cache<KEY, VALUE>::Cache(const Cache& other)
+{
+	std::unique_lock<std::mutex> lock(other.mutex);
+	map = other.map;
+}
+
+template <typename KEY, typename VALUE>
+VALUE Cache<KEY, VALUE>::getOrCreate(KEY key, std::function<VALUE()> create)
+{
+	std::unique_lock<std::mutex> lock(mutex);
+	auto it = map.find(key);
+	if (it != map.end())
 	{
-		// This uses a static in a function to avoid the cost of a global static
-		// initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
-		static rr::Config config = rr::Config::Edit()
-			.add(rr::Optimization::Pass::ScalarReplAggregates)
-			.add(rr::Optimization::Pass::InstructionCombining)
-			.apply({});
-		return config;
+		return it->second;
 	}
+	auto value = create();
+	map.emplace(key, value);
+	return value;
+}
 
-	// Cache provides a simple, thread-safe key-value store.
-	template <typename KEY, typename VALUE>
-	class Cache
-	{
-	public:
-		Cache() = default;
-		Cache(const Cache& other);
-		VALUE getOrCreate(KEY key, std::function<VALUE()> create);
-	private:
-		mutable std::mutex mutex; // mutable required for copy constructor.
-		std::unordered_map<KEY, VALUE> map;
-	};
+// JITGlobals is a singleton that holds all the immutable machine specific
+// information for the host device.
+class JITGlobals
+{
+public:
+	using TargetMachineSPtr = std::shared_ptr<llvm::TargetMachine>;
 
-	template <typename KEY, typename VALUE>
-	Cache<KEY, VALUE>::Cache(const Cache& other)
-	{
-		std::unique_lock<std::mutex> lock(other.mutex);
-		map = other.map;
-	}
+	static JITGlobals * get();
 
-	template <typename KEY, typename VALUE>
-	VALUE Cache<KEY, VALUE>::getOrCreate(KEY key, std::function<VALUE()> create)
-	{
-		std::unique_lock<std::mutex> lock(mutex);
-		auto it = map.find(key);
-		if (it != map.end())
-		{
-			return it->second;
-		}
-		auto value = create();
-		map.emplace(key, value);
-		return value;
-	}
+	const std::string mcpu;
+	const std::vector<std::string> mattrs;
+	const char* const march;
+	const llvm::TargetOptions targetOptions;
+	const llvm::DataLayout dataLayout;
 
-	// JITGlobals is a singleton that holds all the immutable machine specific
-	// information for the host device.
-	class JITGlobals
-	{
-	public:
-		using TargetMachineSPtr = std::shared_ptr<llvm::TargetMachine>;
+	TargetMachineSPtr getTargetMachine(rr::Optimization::Level optlevel);
 
-		static JITGlobals * get();
+private:
+	static JITGlobals create();
+	static llvm::CodeGenOpt::Level toLLVM(rr::Optimization::Level level);
+	JITGlobals(const char *mcpu,
+	           const std::vector<std::string> &mattrs,
+	           const char *march,
+	           const llvm::TargetOptions &targetOptions,
+	           const llvm::DataLayout &dataLayout);
+	JITGlobals(const JITGlobals&) = default;
 
-		const std::string mcpu;
-		const std::vector<std::string> mattrs;
-		const char* const march;
-		const llvm::TargetOptions targetOptions;
-		const llvm::DataLayout dataLayout;
+	// The cache key here is actually a rr::Optimization::Level. We use int
+	// as 'enum class' types do not provide builtin hash functions until
+	// C++14. See: https://stackoverflow.com/a/29618545.
+	Cache<int, TargetMachineSPtr> targetMachines;
+};
 
-		TargetMachineSPtr getTargetMachine(rr::Optimization::Level optlevel);
+JITGlobals * JITGlobals::get()
+{
+	static JITGlobals instance = create();
+	return &instance;
+}
 
-	private:
-		static JITGlobals create();
-		static llvm::CodeGenOpt::Level toLLVM(rr::Optimization::Level level);
-		JITGlobals(const char *mcpu,
-		           const std::vector<std::string> &mattrs,
-		           const char *march,
-		           const llvm::TargetOptions &targetOptions,
-		           const llvm::DataLayout &dataLayout);
-		JITGlobals(const JITGlobals&) = default;
-
-		// The cache key here is actually a rr::Optimization::Level. We use int
-		// as 'enum class' types do not provide builtin hash functions until
-		// C++14. See: https://stackoverflow.com/a/29618545.
-		Cache<int, TargetMachineSPtr> targetMachines;
-	};
-
-	JITGlobals * JITGlobals::get()
-	{
-		static JITGlobals instance = create();
-		return &instance;
-	}
-
-	JITGlobals::TargetMachineSPtr JITGlobals::getTargetMachine(rr::Optimization::Level optlevel)
-	{
-		return targetMachines.getOrCreate(static_cast<int>(optlevel), [&]() {
-			return TargetMachineSPtr(llvm::EngineBuilder()
+JITGlobals::TargetMachineSPtr JITGlobals::getTargetMachine(rr::Optimization::Level optlevel)
+{
+	return targetMachines.getOrCreate(static_cast<int>(optlevel), [&]() {
+		return TargetMachineSPtr(llvm::EngineBuilder()
 #ifdef ENABLE_RR_DEBUG_INFO
-				.setOptLevel(toLLVM(rr::Optimization::Level::None))
+			.setOptLevel(toLLVM(rr::Optimization::Level::None))
 #else
-				.setOptLevel(toLLVM(optlevel))
+			.setOptLevel(toLLVM(optlevel))
 #endif // ENABLE_RR_DEBUG_INFO
-				.setMCPU(mcpu)
-				.setMArch(march)
-				.setMAttrs(mattrs)
-				.setTargetOptions(targetOptions)
-				.selectTarget());
-		});
-	}
+			.setMCPU(mcpu)
+			.setMArch(march)
+			.setMAttrs(mattrs)
+			.setTargetOptions(targetOptions)
+			.selectTarget());
+	});
+}
 
-	JITGlobals JITGlobals::create()
+JITGlobals JITGlobals::create()
+{
+	struct LLVMInitializer
 	{
-		struct LLVMInitializer
+		LLVMInitializer()
 		{
-			LLVMInitializer()
-			{
-				llvm::InitializeNativeTarget();
-				llvm::InitializeNativeTargetAsmPrinter();
-				llvm::InitializeNativeTargetAsmParser();
-			}
-		};
-		static LLVMInitializer initializeLLVM;
+			llvm::InitializeNativeTarget();
+			llvm::InitializeNativeTargetAsmPrinter();
+			llvm::InitializeNativeTargetAsmParser();
+		}
+	};
+	static LLVMInitializer initializeLLVM;
 
-		auto mcpu = llvm::sys::getHostCPUName();
+	auto mcpu = llvm::sys::getHostCPUName();
 
-		llvm::StringMap<bool> features;
-		bool ok = llvm::sys::getHostCPUFeatures(features);
+	llvm::StringMap<bool> features;
+	bool ok = llvm::sys::getHostCPUFeatures(features);
 
 #if defined(__i386__) || defined(__x86_64__) || \
 (defined(__linux__) && (defined(__arm__) || defined(__aarch64__)))
-		ASSERT_MSG(ok, "llvm::sys::getHostCPUFeatures returned false");
+	ASSERT_MSG(ok, "llvm::sys::getHostCPUFeatures returned false");
 #else
-		(void) ok; // getHostCPUFeatures always returns false on other platforms
+	(void) ok; // getHostCPUFeatures always returns false on other platforms
 #endif
 
-		std::vector<std::string> mattrs;
-		for (auto &feature : features)
-		{
-			if (feature.second) { mattrs.push_back(feature.first()); }
-		}
+	std::vector<std::string> mattrs;
+	for (auto &feature : features)
+	{
+		if (feature.second) { mattrs.push_back(feature.first()); }
+	}
 
-		const char* march = nullptr;
+	const char* march = nullptr;
 #if defined(__x86_64__)
-		march = "x86-64";
+	march = "x86-64";
 #elif defined(__i386__)
-		march = "x86";
+	march = "x86";
 #elif defined(__aarch64__)
-		march = "arm64";
+	march = "arm64";
 #elif defined(__arm__)
-		march = "arm";
+	march = "arm";
 #elif defined(__mips__)
 #if defined(__mips64)
-		march = "mips64el";
+	march = "mips64el";
 #else
-		march = "mipsel";
+	march = "mipsel";
 #endif
 #elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-		march = "ppc64le";
+	march = "ppc64le";
 #else
-		#error "unknown architecture"
+	#error "unknown architecture"
 #endif
 
-		llvm::TargetOptions targetOptions;
-		targetOptions.UnsafeFPMath = false;
+	llvm::TargetOptions targetOptions;
+	targetOptions.UnsafeFPMath = false;
 
-		auto targetMachine = std::unique_ptr<llvm::TargetMachine>(
-			llvm::EngineBuilder()
-				.setOptLevel(llvm::CodeGenOpt::None)
-				.setMCPU(mcpu)
-				.setMArch(march)
-				.setMAttrs(mattrs)
-				.setTargetOptions(targetOptions)
-				.selectTarget());
+	auto targetMachine = std::unique_ptr<llvm::TargetMachine>(
+		llvm::EngineBuilder()
+			.setOptLevel(llvm::CodeGenOpt::None)
+			.setMCPU(mcpu)
+			.setMArch(march)
+			.setMAttrs(mattrs)
+			.setTargetOptions(targetOptions)
+			.selectTarget());
 
-		auto dataLayout = targetMachine->createDataLayout();
+	auto dataLayout = targetMachine->createDataLayout();
 
-		return JITGlobals(mcpu.data(), mattrs, march, targetOptions, dataLayout);
-	}
+	return JITGlobals(mcpu.data(), mattrs, march, targetOptions, dataLayout);
+}
 
-	llvm::CodeGenOpt::Level JITGlobals::toLLVM(rr::Optimization::Level level)
+llvm::CodeGenOpt::Level JITGlobals::toLLVM(rr::Optimization::Level level)
+{
+	switch (level)
 	{
-		switch (level)
-		{
-			case rr::Optimization::Level::None:       return ::llvm::CodeGenOpt::None;
-			case rr::Optimization::Level::Less:       return ::llvm::CodeGenOpt::Less;
-			case rr::Optimization::Level::Default:    return ::llvm::CodeGenOpt::Default;
-			case rr::Optimization::Level::Aggressive: return ::llvm::CodeGenOpt::Aggressive;
-			default: UNREACHABLE("Unknown Optimization Level %d", int(level));
-		}
-		return ::llvm::CodeGenOpt::Default;
+		case rr::Optimization::Level::None:       return ::llvm::CodeGenOpt::None;
+		case rr::Optimization::Level::Less:       return ::llvm::CodeGenOpt::Less;
+		case rr::Optimization::Level::Default:    return ::llvm::CodeGenOpt::Default;
+		case rr::Optimization::Level::Aggressive: return ::llvm::CodeGenOpt::Aggressive;
+		default: UNREACHABLE("Unknown Optimization Level %d", int(level));
 	}
+	return ::llvm::CodeGenOpt::Default;
+}
 
-	JITGlobals::JITGlobals(const char* mcpu,
-	                       const std::vector<std::string> &mattrs,
-	                       const char* march,
-	                       const llvm::TargetOptions &targetOptions,
-	                       const llvm::DataLayout &dataLayout) :
-			mcpu(mcpu),
-			mattrs(mattrs),
-			march(march),
-			targetOptions(targetOptions),
-			dataLayout(dataLayout)
-	{
-	}
+JITGlobals::JITGlobals(const char* mcpu,
+                       const std::vector<std::string> &mattrs,
+                       const char* march,
+                       const llvm::TargetOptions &targetOptions,
+                       const llvm::DataLayout &dataLayout) :
+		mcpu(mcpu),
+		mattrs(mattrs),
+		march(march),
+		targetOptions(targetOptions),
+		dataLayout(dataLayout)
+{
+}
 
-	// JITRoutine is a rr::Routine that holds a LLVM JIT session, compiler and
-	// object layer as each routine may require different target machine
-	// settings and no Reactor routine directly links against another.
-	class JITRoutine : public rr::Routine
-	{
+// JITRoutine is an rr::Routine that holds an LLVM JIT session, compiler and
+// object layer as each routine may require different target machine
+// settings and no Reactor routine directly links against another.
+class JITRoutine : public rr::Routine
+{
 #if LLVM_VERSION_MAJOR >= 8
-		using ObjLayer = llvm::orc::LegacyRTDyldObjectLinkingLayer;
-		using CompileLayer = llvm::orc::LegacyIRCompileLayer<ObjLayer, llvm::orc::SimpleCompiler>;
+	using ObjLayer = llvm::orc::LegacyRTDyldObjectLinkingLayer;
+	using CompileLayer = llvm::orc::LegacyIRCompileLayer<ObjLayer, llvm::orc::SimpleCompiler>;
 #else
-		using ObjLayer = llvm::orc::RTDyldObjectLinkingLayer;
-		using CompileLayer = llvm::orc::IRCompileLayer<ObjLayer, llvm::orc::SimpleCompiler>;
+	using ObjLayer = llvm::orc::RTDyldObjectLinkingLayer;
+	using CompileLayer = llvm::orc::IRCompileLayer<ObjLayer, llvm::orc::SimpleCompiler>;
 #endif
 
-	public:
-		JITRoutine(
-				std::unique_ptr<llvm::Module> module,
-				llvm::Function **funcs,
-				size_t count,
-				const rr::Config &config) :
-			resolver(createLegacyLookupResolver(
-				session,
-				[&](const std::string &name) {
-					void *func = rr::resolveExternalSymbol(name.c_str());
-					if (func != nullptr)
-					{
-						return llvm::JITSymbol(
-							reinterpret_cast<uintptr_t>(func), llvm::JITSymbolFlags::Absolute);
-					}
-					return objLayer.findSymbol(name, true);
-				},
-				[](llvm::Error err) {
-					if (err)
-					{
-						// TODO: Log the symbol resolution errors.
-						return;
-					}
-				})),
-			targetMachine(JITGlobals::get()->getTargetMachine(config.getOptimization().getLevel())),
-			compileLayer(objLayer, llvm::orc::SimpleCompiler(*targetMachine)),
-			objLayer(
-				session,
-				[this](llvm::orc::VModuleKey) {
-					return ObjLayer::Resources{std::make_shared<llvm::SectionMemoryManager>(), resolver};
-				},
-				ObjLayer::NotifyLoadedFtor(),
-				[](llvm::orc::VModuleKey, const llvm::object::ObjectFile &Obj, const llvm::RuntimeDyld::LoadedObjectInfo &L) {
-#ifdef ENABLE_RR_DEBUG_INFO
-					rr::DebugInfo::NotifyObjectEmitted(Obj, L);
-#endif // ENABLE_RR_DEBUG_INFO
-				},
-				[](llvm::orc::VModuleKey, const llvm::object::ObjectFile &Obj) {
-#ifdef ENABLE_RR_DEBUG_INFO
-					rr::DebugInfo::NotifyFreeingObject(Obj);
-#endif // ENABLE_RR_DEBUG_INFO
-				}
-			),
-			addresses(count)
-		{
-			std::vector<std::string> mangledNames(count);
-			for (size_t i = 0; i < count; i++)
-			{
-				auto func = funcs[i];
-				static size_t numEmittedFunctions = 0;
-				std::string name = "f" + llvm::Twine(numEmittedFunctions++).str();
-				func->setName(name);
-				func->setLinkage(llvm::GlobalValue::ExternalLinkage);
-				func->setDoesNotThrow();
-
-				llvm::raw_string_ostream mangledNameStream(mangledNames[i]);
-				llvm::Mangler::getNameWithPrefix(mangledNameStream, name, JITGlobals::get()->dataLayout);
-			}
-
-			auto moduleKey = session.allocateVModule();
-
-			// Once the module is passed to the compileLayer, the
-			// llvm::Functions are freed. Make sure funcs are not referenced
-			// after this point.
-			funcs = nullptr;
-
-			llvm::cantFail(compileLayer.addModule(moduleKey, std::move(module)));
-
-			// Resolve the function addresses.
-			for (size_t i = 0; i < count; i++)
-			{
-				auto symbol = compileLayer.findSymbolIn(moduleKey, mangledNames[i], false);
-				if(auto address = symbol.getAddress())
+public:
+	JITRoutine(
+			std::unique_ptr<llvm::Module> module,
+			llvm::Function **funcs,
+			size_t count,
+			const rr::Config &config) :
+		resolver(createLegacyLookupResolver(
+			session,
+			[&](const std::string &name) {
+				void *func = rr::resolveExternalSymbol(name.c_str());
+				if (func != nullptr)
 				{
-					addresses[i] = reinterpret_cast<void *>(static_cast<intptr_t>(address.get()));
+					return llvm::JITSymbol(
+						reinterpret_cast<uintptr_t>(func), llvm::JITSymbolFlags::Absolute);
 				}
+				return objLayer.findSymbol(name, true);
+			},
+			[](llvm::Error err) {
+				if (err)
+				{
+					// TODO: Log the symbol resolution errors.
+					return;
+				}
+			})),
+		targetMachine(JITGlobals::get()->getTargetMachine(config.getOptimization().getLevel())),
+		compileLayer(objLayer, llvm::orc::SimpleCompiler(*targetMachine)),
+		objLayer(
+			session,
+			[this](llvm::orc::VModuleKey) {
+				return ObjLayer::Resources{std::make_shared<llvm::SectionMemoryManager>(), resolver};
+			},
+			ObjLayer::NotifyLoadedFtor(),
+			[](llvm::orc::VModuleKey, const llvm::object::ObjectFile &Obj, const llvm::RuntimeDyld::LoadedObjectInfo &L) {
+#ifdef ENABLE_RR_DEBUG_INFO
+				rr::DebugInfo::NotifyObjectEmitted(Obj, L);
+#endif // ENABLE_RR_DEBUG_INFO
+			},
+			[](llvm::orc::VModuleKey, const llvm::object::ObjectFile &Obj) {
+#ifdef ENABLE_RR_DEBUG_INFO
+				rr::DebugInfo::NotifyFreeingObject(Obj);
+#endif // ENABLE_RR_DEBUG_INFO
 			}
-		}
-
-		const void *getEntry(int index) const override
-		{
-			return addresses[index];
-		}
-
-	private:
-		std::shared_ptr<llvm::orc::SymbolResolver> resolver;
-		std::shared_ptr<llvm::TargetMachine> targetMachine;
-		llvm::orc::ExecutionSession session;
-		CompileLayer compileLayer;
-		ObjLayer objLayer;
-		std::vector<const void *> addresses;
-	};
-
-	// JITBuilder holds all the LLVM state for building routines.
-	class JITBuilder
+		),
+		addresses(count)
 	{
-	public:
-		JITBuilder(const rr::Config &config) :
-			config(config),
-			module(new llvm::Module("", context)),
-			builder(new llvm::IRBuilder<>(context))
+		std::vector<std::string> mangledNames(count);
+		for (size_t i = 0; i < count; i++)
 		{
-			module->setDataLayout(JITGlobals::get()->dataLayout);
+			auto func = funcs[i];
+			static size_t numEmittedFunctions = 0;
+			std::string name = "f" + llvm::Twine(numEmittedFunctions++).str();
+			func->setName(name);
+			func->setLinkage(llvm::GlobalValue::ExternalLinkage);
+			func->setDoesNotThrow();
+
+			llvm::raw_string_ostream mangledNameStream(mangledNames[i]);
+			llvm::Mangler::getNameWithPrefix(mangledNameStream, name, JITGlobals::get()->dataLayout);
 		}
 
-		void optimize(const rr::Config &cfg)
+		auto moduleKey = session.allocateVModule();
+
+		// Once the module is passed to the compileLayer, the
+		// llvm::Functions are freed. Make sure funcs are not referenced
+		// after this point.
+		funcs = nullptr;
+
+		llvm::cantFail(compileLayer.addModule(moduleKey, std::move(module)));
+
+		// Resolve the function addresses.
+		for (size_t i = 0; i < count; i++)
 		{
+			auto symbol = compileLayer.findSymbolIn(moduleKey, mangledNames[i], false);
+			if(auto address = symbol.getAddress())
+			{
+				addresses[i] = reinterpret_cast<void *>(static_cast<intptr_t>(address.get()));
+			}
+		}
+	}
+
+	const void *getEntry(int index) const override
+	{
+		return addresses[index];
+	}
+
+private:
+	std::shared_ptr<llvm::orc::SymbolResolver> resolver;
+	std::shared_ptr<llvm::TargetMachine> targetMachine;
+	llvm::orc::ExecutionSession session;
+	CompileLayer compileLayer;
+	ObjLayer objLayer;
+	std::vector<const void *> addresses;
+};
+
+// JITBuilder holds all the LLVM state for building routines.
+class JITBuilder
+{
+public:
+	JITBuilder(const rr::Config &config) :
+		config(config),
+		module(new llvm::Module("", context)),
+		builder(new llvm::IRBuilder<>(context))
+	{
+		module->setDataLayout(JITGlobals::get()->dataLayout);
+	}
+
+	void optimize(const rr::Config &cfg)
+	{
 
 #ifdef ENABLE_RR_DEBUG_INFO
-			if (debugInfo != nullptr)
-			{
-				return; // Don't optimize if we're generating debug info.
-			}
+		if (debugInfo != nullptr)
+		{
+			return; // Don't optimize if we're generating debug info.
+		}
 #endif // ENABLE_RR_DEBUG_INFO
 
-			std::unique_ptr<llvm::legacy::PassManager> passManager(
-				new llvm::legacy::PassManager());
+		std::unique_ptr<llvm::legacy::PassManager> passManager(
+			new llvm::legacy::PassManager());
 
-			for(auto pass : cfg.getOptimization().getPasses())
+		for(auto pass : cfg.getOptimization().getPasses())
+		{
+			switch(pass)
 			{
-				switch(pass)
-				{
-				case rr::Optimization::Pass::Disabled:                                                                       break;
-				case rr::Optimization::Pass::CFGSimplification:    passManager->add(llvm::createCFGSimplificationPass());    break;
-				case rr::Optimization::Pass::LICM:                 passManager->add(llvm::createLICMPass());                 break;
-				case rr::Optimization::Pass::AggressiveDCE:        passManager->add(llvm::createAggressiveDCEPass());        break;
-				case rr::Optimization::Pass::GVN:                  passManager->add(llvm::createGVNPass());                  break;
-				case rr::Optimization::Pass::InstructionCombining: passManager->add(llvm::createInstructionCombiningPass()); break;
-				case rr::Optimization::Pass::Reassociate:          passManager->add(llvm::createReassociatePass());          break;
-				case rr::Optimization::Pass::DeadStoreElimination: passManager->add(llvm::createDeadStoreEliminationPass()); break;
-				case rr::Optimization::Pass::SCCP:                 passManager->add(llvm::createSCCPPass());                 break;
-				case rr::Optimization::Pass::ScalarReplAggregates: passManager->add(llvm::createSROAPass());                 break;
-				case rr::Optimization::Pass::EarlyCSEPass:         passManager->add(llvm::createEarlyCSEPass());             break;
-				default:
-					UNREACHABLE("pass: %d", int(pass));
-				}
+			case rr::Optimization::Pass::Disabled:                                                                       break;
+			case rr::Optimization::Pass::CFGSimplification:    passManager->add(llvm::createCFGSimplificationPass());    break;
+			case rr::Optimization::Pass::LICM:                 passManager->add(llvm::createLICMPass());                 break;
+			case rr::Optimization::Pass::AggressiveDCE:        passManager->add(llvm::createAggressiveDCEPass());        break;
+			case rr::Optimization::Pass::GVN:                  passManager->add(llvm::createGVNPass());                  break;
+			case rr::Optimization::Pass::InstructionCombining: passManager->add(llvm::createInstructionCombiningPass()); break;
+			case rr::Optimization::Pass::Reassociate:          passManager->add(llvm::createReassociatePass());          break;
+			case rr::Optimization::Pass::DeadStoreElimination: passManager->add(llvm::createDeadStoreEliminationPass()); break;
+			case rr::Optimization::Pass::SCCP:                 passManager->add(llvm::createSCCPPass());                 break;
+			case rr::Optimization::Pass::ScalarReplAggregates: passManager->add(llvm::createSROAPass());                 break;
+			case rr::Optimization::Pass::EarlyCSEPass:         passManager->add(llvm::createEarlyCSEPass());             break;
+			default:
+				UNREACHABLE("pass: %d", int(pass));
 			}
-
-			passManager->run(*module);
 		}
 
-		std::shared_ptr<rr::Routine> acquireRoutine(llvm::Function **funcs, size_t count, const rr::Config &cfg)
-		{
-			ASSERT(module);
-			return std::make_shared<JITRoutine>(std::move(module), funcs, count, cfg);
-		}
+		passManager->run(*module);
+	}
 
-		const rr::Config config;
-		llvm::LLVMContext context;
-		std::unique_ptr<llvm::Module> module;
-		std::unique_ptr<llvm::IRBuilder<>> builder;
-		llvm::Function *function = nullptr;
+	std::shared_ptr<rr::Routine> acquireRoutine(llvm::Function **funcs, size_t count, const rr::Config &cfg)
+	{
+		ASSERT(module);
+		return std::make_shared<JITRoutine>(std::move(module), funcs, count, cfg);
+	}
 
-		struct CoroutineState
-		{
-			llvm::Function *await = nullptr;
-			llvm::Function *destroy = nullptr;
-			llvm::Value *handle = nullptr;
-			llvm::Value *id = nullptr;
-			llvm::Value *promise = nullptr;
-			llvm::Type *yieldType = nullptr;
-			llvm::BasicBlock *entryBlock = nullptr;
-			llvm::BasicBlock *suspendBlock = nullptr;
-			llvm::BasicBlock *endBlock = nullptr;
-			llvm::BasicBlock *destroyBlock = nullptr;
-		};
-		CoroutineState coroutine;
+	const rr::Config config;
+	llvm::LLVMContext context;
+	std::unique_ptr<llvm::Module> module;
+	std::unique_ptr<llvm::IRBuilder<>> builder;
+	llvm::Function *function = nullptr;
+
+	struct CoroutineState
+	{
+		llvm::Function *await = nullptr;
+		llvm::Function *destroy = nullptr;
+		llvm::Value *handle = nullptr;
+		llvm::Value *id = nullptr;
+		llvm::Value *promise = nullptr;
+		llvm::Type *yieldType = nullptr;
+		llvm::BasicBlock *entryBlock = nullptr;
+		llvm::BasicBlock *suspendBlock = nullptr;
+		llvm::BasicBlock *endBlock = nullptr;
+		llvm::BasicBlock *destroyBlock = nullptr;
+	};
+	CoroutineState coroutine;
 
 #ifdef ENABLE_RR_DEBUG_INFO
-		std::unique_ptr<rr::DebugInfo> debugInfo;
+	std::unique_ptr<rr::DebugInfo> debugInfo;
 #endif
-	};
+};
 
-	std::unique_ptr<JITBuilder> jit;
-	std::mutex codegenMutex;
+std::unique_ptr<JITBuilder> jit; // Active JITBuilder; the lowering helpers in this file emit IR through jit->builder and jit->module.
+std::mutex codegenMutex; // NOTE(review): presumably guards code generation through jit - no use is visible in this part of the file; confirm.
 
 #ifdef ENABLE_RR_PRINT
-	std::string replace(std::string str, const std::string& substr, const std::string& replacement)
-	{
-		size_t pos = 0;
-		while((pos = str.find(substr, pos)) != std::string::npos) {
-			str.replace(pos, substr.length(), replacement);
-			pos += replacement.length();
-		}
-		return str;
+std::string replace(std::string str, const std::string& substr, const std::string& replacement) // Returns a copy of str with every occurrence of substr replaced by replacement.
+{
+	size_t pos = 0;
+	while((pos = str.find(substr, pos)) != std::string::npos) { // NOTE(review): an empty substr matches at every pos, so this loop would not terminate - confirm callers never pass one.
+		str.replace(pos, substr.length(), replacement);
+		pos += replacement.length(); // Resume after the inserted text so it is never rescanned.
 	}
+	return str;
+}
 #endif // ENABLE_RR_PRINT
 
-	template <typename T>
-	T alignUp(T val, T alignment)
-	{
-		return alignment * ((val + alignment - 1) / alignment);
-	}
+template <typename T>
+T alignUp(T val, T alignment) // Rounds val up to the nearest multiple of alignment.
+{
+	return alignment * ((val + alignment - 1) / alignment); // With integral T the division truncates, giving the smallest multiple >= val (barring overflow); a zero alignment would divide by zero.
+}
 
-	void* alignedAlloc(size_t size, size_t alignment)
-	{
-		ASSERT(alignment < 256);
-		auto allocation = new uint8_t[size + sizeof(uint8_t) + alignment];
-		auto aligned = allocation;
-		aligned += sizeof(uint8_t); // Make space for the base-address offset.
-		aligned = reinterpret_cast<uint8_t*>(alignUp(reinterpret_cast<uintptr_t>(aligned), alignment)); // align
-		auto offset = static_cast<uint8_t>(aligned - allocation);
-		aligned[-1] = offset;
-		return aligned;
-	}
+void* alignedAlloc(size_t size, size_t alignment) // Returns size usable bytes at an address that is a multiple of alignment, carved out of a larger new[] block.
+{
+	ASSERT(alignment < 256); // The distance back to the block start is stored in a single uint8_t (aligned[-1] below).
+	auto allocation = new uint8_t[size + sizeof(uint8_t) + alignment]; // Over-allocate: one offset byte plus room to shift forward to an aligned address.
+	auto aligned = allocation;
+	aligned += sizeof(uint8_t); // Make space for the base-address offset.
+	aligned = reinterpret_cast<uint8_t*>(alignUp(reinterpret_cast<uintptr_t>(aligned), alignment)); // align
+	auto offset = static_cast<uint8_t>(aligned - allocation); // At least 1, so aligned[-1] always lies inside the block.
+	aligned[-1] = offset; // alignedFree() reads this byte to recover the pointer to pass to delete[].
+	return aligned;
+}
 
-	void alignedFree(void* ptr)
-	{
-		auto aligned = reinterpret_cast<uint8_t*>(ptr);
-		auto offset = aligned[-1];
-		auto allocation = aligned - offset;
-		delete[] allocation;
-	}
+void alignedFree(void* ptr) // Releases a pointer whose preceding byte holds its offset from the start of a new[] block (the layout alignedAlloc() writes).
+{
+	auto aligned = reinterpret_cast<uint8_t*>(ptr);
+	auto offset = aligned[-1]; // Read unconditionally, so ptr must be non-null.
+	auto allocation = aligned - offset; // Start of the underlying uint8_t array.
+	delete[] allocation;
+}
 
-	llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
-	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
 
-		llvm::VectorType *extTy =
-			llvm::VectorType::getExtendedElementVectorType(ty);
-		x = jit->builder->CreateZExt(x, extTy);
-		y = jit->builder->CreateZExt(y, extTy);
+	llvm::VectorType *extTy =
+		llvm::VectorType::getExtendedElementVectorType(ty);
+	x = jit->builder->CreateZExt(x, extTy);
+	y = jit->builder->CreateZExt(y, extTy);
 
-		// (x + y + 1) >> 1
-		llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
-		llvm::Value *res = jit->builder->CreateAdd(x, y);
-		res = jit->builder->CreateAdd(res, one);
-		res = jit->builder->CreateLShr(res, one);
-		return jit->builder->CreateTrunc(res, ty);
-	}
+	// (x + y + 1) >> 1
+	llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
+	llvm::Value *res = jit->builder->CreateAdd(x, y);
+	res = jit->builder->CreateAdd(res, one);
+	res = jit->builder->CreateLShr(res, one);
+	return jit->builder->CreateTrunc(res, ty);
+}
 
-	llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
-	                          llvm::ICmpInst::Predicate pred)
-	{
-		return jit->builder->CreateSelect(jit->builder->CreateICmp(pred, x, y), x, y);
-	}
+llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y, // Emits a select that yields x where `x pred y` holds and y otherwise.
+                          llvm::ICmpInst::Predicate pred) // E.g. ICMP_SGT gives a signed max, ICMP_SLT a signed min.
+{
+	return jit->builder->CreateSelect(jit->builder->CreateICmp(pred, x, y), x, y);
+}
 
-	llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
-	                       llvm::Value *y, llvm::Type *dstTy)
-	{
-		return jit->builder->CreateSExt(jit->builder->CreateICmp(pred, x, y), dstTy, "");
-	}
+llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x, // Emits the integer comparison `x pred y` and sign-extends its result to dstTy.
+                       llvm::Value *y, llvm::Type *dstTy) // Sign extension of the i1 result turns true into all-ones bits and false into zero.
+{
+	return jit->builder->CreateSExt(jit->builder->CreateICmp(pred, x, y), dstTy, "");
+}
 
 #if defined(__i386__) || defined(__x86_64__)
-	llvm::Value *lowerPMOV(llvm::Value *op, llvm::Type *dstType, bool sext)
-	{
-		llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(op->getType());
-		llvm::VectorType *dstTy = llvm::cast<llvm::VectorType>(dstType);
+llvm::Value *lowerPMOV(llvm::Value *op, llvm::Type *dstType, bool sext)
+{
+	llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(op->getType());
+	llvm::VectorType *dstTy = llvm::cast<llvm::VectorType>(dstType);
 
-		llvm::Value *undef = llvm::UndefValue::get(srcTy);
-		llvm::SmallVector<uint32_t, 16> mask(dstTy->getNumElements());
-		std::iota(mask.begin(), mask.end(), 0);
-		llvm::Value *v = jit->builder->CreateShuffleVector(op, undef, mask);
+	llvm::Value *undef = llvm::UndefValue::get(srcTy);
+	llvm::SmallVector<uint32_t, 16> mask(dstTy->getNumElements());
+	std::iota(mask.begin(), mask.end(), 0);
+	llvm::Value *v = jit->builder->CreateShuffleVector(op, undef, mask);
 
-		return sext ? jit->builder->CreateSExt(v, dstTy)
-		            : jit->builder->CreateZExt(v, dstTy);
-	}
+	return sext ? jit->builder->CreateSExt(v, dstTy)
+	            : jit->builder->CreateZExt(v, dstTy);
+}
 
-	llvm::Value *lowerPABS(llvm::Value *v)
-	{
-		llvm::Value *zero = llvm::Constant::getNullValue(v->getType());
-		llvm::Value *cmp = jit->builder->CreateICmp(llvm::ICmpInst::ICMP_SGT, v, zero);
-		llvm::Value *neg = jit->builder->CreateNeg(v);
-		return jit->builder->CreateSelect(cmp, v, neg);
-	}
+llvm::Value *lowerPABS(llvm::Value *v) // Emits IR computing the absolute value of signed integer v.
+{
+	llvm::Value *zero = llvm::Constant::getNullValue(v->getType());
+	llvm::Value *cmp = jit->builder->CreateICmp(llvm::ICmpInst::ICMP_SGT, v, zero); // v > 0 (signed)
+	llvm::Value *neg = jit->builder->CreateNeg(v);
+	return jit->builder->CreateSelect(cmp, v, neg); // Keep v where it is positive, otherwise use -v.
+}
 #endif  // defined(__i386__) || defined(__x86_64__)
 
 #if !defined(__i386__) && !defined(__x86_64__)
-	llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
-	                           llvm::FCmpInst::Predicate pred)
+llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y, // Floating-point counterpart of lowerPMINMAX: yields x where `x pred y` holds and y otherwise.
+                           llvm::FCmpInst::Predicate pred) // The predicate decides whether this acts as a min or a max.
+{
+	return jit->builder->CreateSelect(jit->builder->CreateFCmp(pred, x, y), x, y);
+}
+
+llvm::Value *lowerRound(llvm::Value *x) // Emits a call to the llvm.nearbyint intrinsic on x.
+{
+	llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
+		jit->module.get(), llvm::Intrinsic::nearbyint, {x->getType()}); // Declaration overloaded on x's type, in the module being built.
+	return jit->builder->CreateCall(nearbyint, ARGS(x));
+}
+
+llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty) // Rounds x with lowerRound(), then converts the result to the signed integer type ty.
+{
+	return jit->builder->CreateFPToSI(lowerRound(x), ty);
+}
+
+llvm::Value *lowerFloor(llvm::Value *x) // Emits a call to the llvm.floor intrinsic on x.
+{
+	llvm::Function *floor = llvm::Intrinsic::getDeclaration(
+		jit->module.get(), llvm::Intrinsic::floor, {x->getType()}); // Declaration overloaded on x's type, in the module being built.
+	return jit->builder->CreateCall(floor, ARGS(x));
+}
+
+llvm::Value *lowerTrunc(llvm::Value *x) // Emits a call to the llvm.trunc intrinsic on x (floating-point round toward zero).
+{
+	llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
+		jit->module.get(), llvm::Intrinsic::trunc, {x->getType()}); // Declaration overloaded on x's type, in the module being built.
+	return jit->builder->CreateCall(trunc, ARGS(x));
+}
+
+// Packed add/sub with saturation
+llvm::Value *lowerPSAT(llvm::Value *x, llvm::Value *y, bool isAdd, bool isSigned)
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
+
+	unsigned numBits = ty->getScalarSizeInBits();
+
+	llvm::Value *max, *min, *extX, *extY;
+	if (isSigned)
 	{
-		return jit->builder->CreateSelect(jit->builder->CreateFCmp(pred, x, y), x, y);
+		max = llvm::ConstantInt::get(extTy, (1LL << (numBits - 1)) - 1, true);
+		min = llvm::ConstantInt::get(extTy, (-1LL << (numBits - 1)), true);
+		extX = jit->builder->CreateSExt(x, extTy);
+		extY = jit->builder->CreateSExt(y, extTy);
+	}
+	else
+	{
+		ASSERT_MSG(numBits <= 64, "numBits: %d", int(numBits));
+		uint64_t maxVal = (numBits == 64) ? ~0ULL : (1ULL << numBits) - 1;
+		max = llvm::ConstantInt::get(extTy, maxVal, false);
+		min = llvm::ConstantInt::get(extTy, 0, false);
+		extX = jit->builder->CreateZExt(x, extTy);
+		extY = jit->builder->CreateZExt(y, extTy);
 	}
 
-	llvm::Value *lowerRound(llvm::Value *x)
+	llvm::Value *res = isAdd ? jit->builder->CreateAdd(extX, extY)
+	                         : jit->builder->CreateSub(extX, extY);
+
+	res = lowerPMINMAX(res, min, llvm::ICmpInst::ICMP_SGT);
+	res = lowerPMINMAX(res, max, llvm::ICmpInst::ICMP_SLT);
+
+	return jit->builder->CreateTrunc(res, ty);
+}
+
+llvm::Value *lowerSQRT(llvm::Value *x) // Emits a call to the llvm.sqrt intrinsic on x.
+{
+	llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
+		jit->module.get(), llvm::Intrinsic::sqrt, {x->getType()}); // Declaration overloaded on x's type, in the module being built.
+	return jit->builder->CreateCall(sqrt, ARGS(x));
+}
+
+llvm::Value *lowerRCP(llvm::Value *x)
+{
+	llvm::Type *ty = x->getType();
+	llvm::Constant *one;
+	if (llvm::VectorType *vectorTy = llvm::dyn_cast<llvm::VectorType>(ty))
 	{
-		llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
-			jit->module.get(), llvm::Intrinsic::nearbyint, {x->getType()});
-		return jit->builder->CreateCall(nearbyint, ARGS(x));
+		one = llvm::ConstantVector::getSplat(
+			vectorTy->getNumElements(),
+			llvm::ConstantFP::get(vectorTy->getElementType(), 1));
+	}
+	else
+	{
+		one = llvm::ConstantFP::get(ty, 1);
+	}
+	return jit->builder->CreateFDiv(one, x);
+}
+
+llvm::Value *lowerRSQRT(llvm::Value *x) // Reciprocal square root, composed from the two helpers rather than a dedicated intrinsic.
+{
+	return lowerRCP(lowerSQRT(x)); // 1 / sqrt(x)
+}
+
+llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY) // Shifts every element of vector x left by scalarY bits.
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType()); // x is expected to be a vector; llvm::cast does not tolerate any other type.
+	llvm::Value *y = llvm::ConstantVector::getSplat( // Broadcast the shift amount into a constant vector shaped like x.
+		ty->getNumElements(),
+		llvm::ConstantInt::get(ty->getElementType(), scalarY));
+	return jit->builder->CreateShl(x, y);
+}
+
+llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY) // Arithmetic (sign-propagating) right shift of every element of vector x by scalarY bits.
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType()); // x is expected to be a vector; llvm::cast does not tolerate any other type.
+	llvm::Value *y = llvm::ConstantVector::getSplat( // Broadcast the shift amount into a constant vector shaped like x.
+		ty->getNumElements(),
+		llvm::ConstantInt::get(ty->getElementType(), scalarY));
+	return jit->builder->CreateAShr(x, y);
+}
+
+llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY) // Logical (zero-filling) right shift of every element of vector x by scalarY bits.
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType()); // x is expected to be a vector; llvm::cast does not tolerate any other type.
+	llvm::Value *y = llvm::ConstantVector::getSplat( // Broadcast the shift amount into a constant vector shaped like x.
+		ty->getNumElements(),
+		llvm::ConstantInt::get(ty->getElementType(), scalarY));
+	return jit->builder->CreateLShr(x, y);
+}
+
+llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
+
+	llvm::Value *extX = jit->builder->CreateSExt(x, extTy);
+	llvm::Value *extY = jit->builder->CreateSExt(y, extTy);
+	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
+
+	llvm::Value *undef = llvm::UndefValue::get(extTy);
+
+	llvm::SmallVector<uint32_t, 16> evenIdx;
+	llvm::SmallVector<uint32_t, 16> oddIdx;
+	for (uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
+	{
+		evenIdx.push_back(i);
+		oddIdx.push_back(i + 1);
 	}
 
-	llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
+	llvm::Value *lhs = jit->builder->CreateShuffleVector(mult, undef, evenIdx);
+	llvm::Value *rhs = jit->builder->CreateShuffleVector(mult, undef, oddIdx);
+	return jit->builder->CreateAdd(lhs, rhs);
+}
+
+llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
+{
+	llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(x->getType());
+	llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
+
+	llvm::IntegerType *dstElemTy =
+		llvm::cast<llvm::IntegerType>(dstTy->getElementType());
+
+	uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
+	ASSERT_MSG(truncNumBits < 64, "shift 64 must be handled separately. truncNumBits: %d", int(truncNumBits));
+	llvm::Constant *max, *min;
+	if (isSigned)
 	{
-		return jit->builder->CreateFPToSI(lowerRound(x), ty);
+		max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
+		min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
+	}
+	else
+	{
+		max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
+		min = llvm::ConstantInt::get(srcTy, 0, false);
 	}
 
-	llvm::Value *lowerFloor(llvm::Value *x)
+	x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
+	x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
+	y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
+	y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
+
+	x = jit->builder->CreateTrunc(x, dstTy);
+	y = jit->builder->CreateTrunc(y, dstTy);
+
+	llvm::SmallVector<uint32_t, 16> index(srcTy->getNumElements() * 2);
+	std::iota(index.begin(), index.end(), 0);
+
+	return jit->builder->CreateShuffleVector(x, y, index);
+}
+
+llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+	llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
+	llvm::Value *cmp = jit->builder->CreateICmpSLT(x, zero);
+
+	llvm::Value *ret = jit->builder->CreateZExt(
+		jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
+	for (uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
 	{
-		llvm::Function *floor = llvm::Intrinsic::getDeclaration(
-			jit->module.get(), llvm::Intrinsic::floor, {x->getType()});
-		return jit->builder->CreateCall(floor, ARGS(x));
+		llvm::Value *elem = jit->builder->CreateZExt(
+			jit->builder->CreateExtractElement(cmp, i), retTy);
+		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
 	}
+	return ret;
+}
 
-	llvm::Value *lowerTrunc(llvm::Value *x)
+llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+	llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
+	llvm::Value *cmp = jit->builder->CreateFCmpULT(x, zero);
+
+	llvm::Value *ret = jit->builder->CreateZExt(
+		jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
+	for (uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
 	{
-		llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
-			jit->module.get(), llvm::Intrinsic::trunc, {x->getType()});
-		return jit->builder->CreateCall(trunc, ARGS(x));
+		llvm::Value *elem = jit->builder->CreateZExt(
+			jit->builder->CreateExtractElement(cmp, i), retTy);
+		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
 	}
-
-	// Packed add/sub with saturation
-	llvm::Value *lowerPSAT(llvm::Value *x, llvm::Value *y, bool isAdd, bool isSigned)
-	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
-
-		unsigned numBits = ty->getScalarSizeInBits();
-
-		llvm::Value *max, *min, *extX, *extY;
-		if (isSigned)
-		{
-			max = llvm::ConstantInt::get(extTy, (1LL << (numBits - 1)) - 1, true);
-			min = llvm::ConstantInt::get(extTy, (-1LL << (numBits - 1)), true);
-			extX = jit->builder->CreateSExt(x, extTy);
-			extY = jit->builder->CreateSExt(y, extTy);
-		}
-		else
-		{
-			ASSERT_MSG(numBits <= 64, "numBits: %d", int(numBits));
-			uint64_t maxVal = (numBits == 64) ? ~0ULL : (1ULL << numBits) - 1;
-			max = llvm::ConstantInt::get(extTy, maxVal, false);
-			min = llvm::ConstantInt::get(extTy, 0, false);
-			extX = jit->builder->CreateZExt(x, extTy);
-			extY = jit->builder->CreateZExt(y, extTy);
-		}
-
-		llvm::Value *res = isAdd ? jit->builder->CreateAdd(extX, extY)
-		                         : jit->builder->CreateSub(extX, extY);
-
-		res = lowerPMINMAX(res, min, llvm::ICmpInst::ICMP_SGT);
-		res = lowerPMINMAX(res, max, llvm::ICmpInst::ICMP_SLT);
-
-		return jit->builder->CreateTrunc(res, ty);
-	}
-
-	llvm::Value *lowerSQRT(llvm::Value *x)
-	{
-		llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
-			jit->module.get(), llvm::Intrinsic::sqrt, {x->getType()});
-		return jit->builder->CreateCall(sqrt, ARGS(x));
-	}
-
-	llvm::Value *lowerRCP(llvm::Value *x)
-	{
-		llvm::Type *ty = x->getType();
-		llvm::Constant *one;
-		if (llvm::VectorType *vectorTy = llvm::dyn_cast<llvm::VectorType>(ty))
-		{
-			one = llvm::ConstantVector::getSplat(
-				vectorTy->getNumElements(),
-				llvm::ConstantFP::get(vectorTy->getElementType(), 1));
-		}
-		else
-		{
-			one = llvm::ConstantFP::get(ty, 1);
-		}
-		return jit->builder->CreateFDiv(one, x);
-	}
-
-	llvm::Value *lowerRSQRT(llvm::Value *x)
-	{
-		return lowerRCP(lowerSQRT(x));
-	}
-
-	llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
-	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::Value *y = llvm::ConstantVector::getSplat(
-			ty->getNumElements(),
-			llvm::ConstantInt::get(ty->getElementType(), scalarY));
-		return jit->builder->CreateShl(x, y);
-	}
-
-	llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
-	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::Value *y = llvm::ConstantVector::getSplat(
-			ty->getNumElements(),
-			llvm::ConstantInt::get(ty->getElementType(), scalarY));
-		return jit->builder->CreateAShr(x, y);
-	}
-
-	llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
-	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::Value *y = llvm::ConstantVector::getSplat(
-			ty->getNumElements(),
-			llvm::ConstantInt::get(ty->getElementType(), scalarY));
-		return jit->builder->CreateLShr(x, y);
-	}
-
-	llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
-	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
-
-		llvm::Value *extX = jit->builder->CreateSExt(x, extTy);
-		llvm::Value *extY = jit->builder->CreateSExt(y, extTy);
-		llvm::Value *mult = jit->builder->CreateMul(extX, extY);
-
-		llvm::Value *undef = llvm::UndefValue::get(extTy);
-
-		llvm::SmallVector<uint32_t, 16> evenIdx;
-		llvm::SmallVector<uint32_t, 16> oddIdx;
-		for (uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
-		{
-			evenIdx.push_back(i);
-			oddIdx.push_back(i + 1);
-		}
-
-		llvm::Value *lhs = jit->builder->CreateShuffleVector(mult, undef, evenIdx);
-		llvm::Value *rhs = jit->builder->CreateShuffleVector(mult, undef, oddIdx);
-		return jit->builder->CreateAdd(lhs, rhs);
-	}
-
-	llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
-	{
-		llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
-
-		llvm::IntegerType *dstElemTy =
-			llvm::cast<llvm::IntegerType>(dstTy->getElementType());
-
-		uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
-		ASSERT_MSG(truncNumBits < 64, "shift 64 must be handled separately. truncNumBits: %d", int(truncNumBits));
-		llvm::Constant *max, *min;
-		if (isSigned)
-		{
-			max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
-			min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
-		}
-		else
-		{
-			max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
-			min = llvm::ConstantInt::get(srcTy, 0, false);
-		}
-
-		x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
-		x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
-		y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
-		y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
-
-		x = jit->builder->CreateTrunc(x, dstTy);
-		y = jit->builder->CreateTrunc(y, dstTy);
-
-		llvm::SmallVector<uint32_t, 16> index(srcTy->getNumElements() * 2);
-		std::iota(index.begin(), index.end(), 0);
-
-		return jit->builder->CreateShuffleVector(x, y, index);
-	}
-
-	llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
-	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
-		llvm::Value *cmp = jit->builder->CreateICmpSLT(x, zero);
-
-		llvm::Value *ret = jit->builder->CreateZExt(
-			jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
-		for (uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
-		{
-			llvm::Value *elem = jit->builder->CreateZExt(
-				jit->builder->CreateExtractElement(cmp, i), retTy);
-			ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
-		}
-		return ret;
-	}
-
-	llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
-	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
-		llvm::Value *cmp = jit->builder->CreateFCmpULT(x, zero);
-
-		llvm::Value *ret = jit->builder->CreateZExt(
-			jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
-		for (uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
-		{
-			llvm::Value *elem = jit->builder->CreateZExt(
-				jit->builder->CreateExtractElement(cmp, i), retTy);
-			ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
-		}
-		return ret;
-	}
+	return ret;
+}
 #endif  // !defined(__i386__) && !defined(__x86_64__)
 
 #if (LLVM_VERSION_MAJOR >= 8) || (!defined(__i386__) && !defined(__x86_64__))
-	llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
-	{
-		#if LLVM_VERSION_MAJOR >= 8
-			return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, x, y);
-		#else
-			return lowerPSAT(x, y, true, false);
-		#endif
-	}
+llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y) // Unsigned saturating add of x and y.
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, x, y); // LLVM 8+: use the uadd_sat intrinsic.
+	#else
+		return lowerPSAT(x, y, true, false); // Older LLVM: manual lowering with isAdd = true, isSigned = false.
+	#endif
+}
 
-	llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
-	{
-		#if LLVM_VERSION_MAJOR >= 8
-			return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::sadd_sat, x, y);
-		#else
-			return lowerPSAT(x, y, true, true);
-		#endif
-	}
+llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y) // Signed saturating add of x and y.
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::sadd_sat, x, y); // LLVM 8+: use the sadd_sat intrinsic.
+	#else
+		return lowerPSAT(x, y, true, true); // Older LLVM: manual lowering with isAdd = true, isSigned = true.
+	#endif
+}
 
-	llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
-	{
-		#if LLVM_VERSION_MAJOR >= 8
-			return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::usub_sat, x, y);
-		#else
-			return lowerPSAT(x, y, false, false);
-		#endif
-	}
+llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y) // Unsigned saturating subtract, x - y.
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::usub_sat, x, y); // LLVM 8+: use the usub_sat intrinsic.
+	#else
+		return lowerPSAT(x, y, false, false); // Older LLVM: manual lowering with isAdd = false, isSigned = false.
+	#endif
+}
 
-	llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
-	{
-		#if LLVM_VERSION_MAJOR >= 8
-			return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::ssub_sat, x, y);
-		#else
-			return lowerPSAT(x, y, false, true);
-		#endif
-	}
+llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y) // Signed saturating subtract, x - y.
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::ssub_sat, x, y); // LLVM 8+: use the ssub_sat intrinsic.
+	#else
+		return lowerPSAT(x, y, false, true); // Older LLVM: manual lowering with isAdd = false, isSigned = true.
+	#endif
+}
 #endif  // (LLVM_VERSION_MAJOR >= 8) || (!defined(__i386__) && !defined(__x86_64__))
 
-	llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
+llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
+
+	llvm::Value *extX, *extY;
+	if (sext)
 	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
-
-		llvm::Value *extX, *extY;
-		if (sext)
-		{
-			extX = jit->builder->CreateSExt(x, extTy);
-			extY = jit->builder->CreateSExt(y, extTy);
-		}
-		else
-		{
-			extX = jit->builder->CreateZExt(x, extTy);
-			extY = jit->builder->CreateZExt(y, extTy);
-		}
-
-		llvm::Value *mult = jit->builder->CreateMul(extX, extY);
-
-		llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
-		llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
-		return jit->builder->CreateTrunc(mulh, ty);
+		extX = jit->builder->CreateSExt(x, extTy);
+		extY = jit->builder->CreateSExt(y, extTy);
+	}
+	else
+	{
+		extX = jit->builder->CreateZExt(x, extTy);
+		extY = jit->builder->CreateZExt(y, extTy);
 	}
 
-	llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes)
+	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
+
+	llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
+	llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
+	return jit->builder->CreateTrunc(mulh, ty);
+}
+
+llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes) // Emits an llvm.masked.gather call loading one elTy element per lane from base plus that lane's byte offset.
+{
+	ASSERT(base->getType()->isPointerTy());
+	ASSERT(offsets->getType()->isVectorTy());
+	ASSERT(mask->getType()->isVectorTy());
+
+	auto numEls = mask->getType()->getVectorNumElements(); // The lane count is taken from the mask.
+	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
+	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
+	auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
+	auto i8PtrTy = i8Ty->getPointerTo();
+	auto elPtrTy = elTy->getPointerTo();
+	auto elVecTy = ::llvm::VectorType::get(elTy, numEls); // Type of the gathered result.
+	auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
+	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy); // Index through an i8 pointer so the offsets count bytes.
+	auto i8Ptrs = jit->builder->CreateGEP(i8Base, offsets); // One byte pointer per lane.
+	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy); // Reinterpret them as pointers to elTy.
+	auto i8Mask = jit->builder->CreateIntCast(mask, ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
+	auto passthrough = zeroMaskedLanes ? ::llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy); // Value for masked-off lanes: zero when requested, otherwise undef.
+	auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
+	auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy } ); // Intrinsic overloaded on result and pointer-vector types.
+	return jit->builder->CreateCall(func, { elPtrs, align, i8Mask, passthrough }); // Arguments passed: pointers, alignment, mask, passthrough.
+}
+
+void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
+{
+	ASSERT(base->getType()->isPointerTy());
+	ASSERT(val->getType()->isVectorTy());
+	ASSERT(offsets->getType()->isVectorTy());
+	ASSERT(mask->getType()->isVectorTy());
+
+	auto numEls = mask->getType()->getVectorNumElements();
+	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
+	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
+	auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
+	auto i8PtrTy = i8Ty->getPointerTo();
+	auto elVecTy = val->getType();
+	auto elTy = elVecTy->getVectorElementType();
+	auto elPtrTy = elTy->getPointerTo();
+	auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
+	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
+	auto i8Ptrs = jit->builder->CreateGEP(i8Base, offsets);
+	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
+	auto i8Mask = jit->builder->CreateIntCast(mask, ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
+	auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
+	auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy } );
+	jit->builder->CreateCall(func, { val, elPtrs, align, i8Mask });
+}
+}
+
+namespace rr {
+
+const Capabilities Caps =
+{
+	true, // CoroutinesSupported
+};
+
+static std::memory_order atomicOrdering(llvm::AtomicOrdering memoryOrder)
+{
+	switch(memoryOrder)
 	{
-		ASSERT(base->getType()->isPointerTy());
-		ASSERT(offsets->getType()->isVectorTy());
-		ASSERT(mask->getType()->isVectorTy());
-
-		auto numEls = mask->getType()->getVectorNumElements();
-		auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
-		auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
-		auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
-		auto i8PtrTy = i8Ty->getPointerTo();
-		auto elPtrTy = elTy->getPointerTo();
-		auto elVecTy = ::llvm::VectorType::get(elTy, numEls);
-		auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
-		auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
-		auto i8Ptrs = jit->builder->CreateGEP(i8Base, offsets);
-		auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
-		auto i8Mask = jit->builder->CreateIntCast(mask, ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
-		auto passthrough = zeroMaskedLanes ? ::llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
-		auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
-		auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy } );
-		return jit->builder->CreateCall(func, { elPtrs, align, i8Mask, passthrough });
-	}
-
-	void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
-	{
-		ASSERT(base->getType()->isPointerTy());
-		ASSERT(val->getType()->isVectorTy());
-		ASSERT(offsets->getType()->isVectorTy());
-		ASSERT(mask->getType()->isVectorTy());
-
-		auto numEls = mask->getType()->getVectorNumElements();
-		auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
-		auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
-		auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
-		auto i8PtrTy = i8Ty->getPointerTo();
-		auto elVecTy = val->getType();
-		auto elTy = elVecTy->getVectorElementType();
-		auto elPtrTy = elTy->getPointerTo();
-		auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
-		auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
-		auto i8Ptrs = jit->builder->CreateGEP(i8Base, offsets);
-		auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
-		auto i8Mask = jit->builder->CreateIntCast(mask, ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
-		auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
-		auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy } );
-		jit->builder->CreateCall(func, { val, elPtrs, align, i8Mask });
+	case llvm::AtomicOrdering::Monotonic: return std::memory_order_relaxed;  // https://llvm.org/docs/Atomics.html#monotonic
+	case llvm::AtomicOrdering::Acquire: return std::memory_order_acquire;
+	case llvm::AtomicOrdering::Release: return std::memory_order_release;
+	case llvm::AtomicOrdering::AcquireRelease: return std::memory_order_acq_rel;
+	case llvm::AtomicOrdering::SequentiallyConsistent: return std::memory_order_seq_cst;
+	default:
+		UNREACHABLE("memoryOrder: %d", int(memoryOrder));
+		return std::memory_order_acq_rel;
 	}
 }
 
-namespace rr
+static llvm::AtomicOrdering atomicOrdering(bool atomic, std::memory_order memoryOrder)
 {
-	const Capabilities Caps =
+	if(!atomic)
 	{
-		true, // CoroutinesSupported
+		return llvm::AtomicOrdering::NotAtomic;
+	}
+
+	switch(memoryOrder)
+	{
+	case std::memory_order_relaxed: return llvm::AtomicOrdering::Monotonic;  // https://llvm.org/docs/Atomics.html#monotonic
+	case std::memory_order_consume: return llvm::AtomicOrdering::Acquire;    // https://llvm.org/docs/Atomics.html#acquire: "It should also be used for C++11/C11 memory_order_consume."
+	case std::memory_order_acquire: return llvm::AtomicOrdering::Acquire;
+	case std::memory_order_release: return llvm::AtomicOrdering::Release;
+	case std::memory_order_acq_rel: return llvm::AtomicOrdering::AcquireRelease;
+	case std::memory_order_seq_cst: return llvm::AtomicOrdering::SequentiallyConsistent;
+	default:
+		UNREACHABLE("memoryOrder: %d", int(memoryOrder));
+		return llvm::AtomicOrdering::AcquireRelease;
+	}
+}
+
+template <typename T>
+static void atomicLoad(void *ptr, void *ret, llvm::AtomicOrdering ordering)
+{
+	*reinterpret_cast<T*>(ret) = std::atomic_load_explicit<T>(reinterpret_cast<std::atomic<T>*>(ptr), atomicOrdering(ordering));
+}
+
+template <typename T>
+static void atomicStore(void *ptr, void *val, llvm::AtomicOrdering ordering)
+{
+	std::atomic_store_explicit<T>(reinterpret_cast<std::atomic<T>*>(ptr), *reinterpret_cast<T*>(val), atomicOrdering(ordering));
+}
+
+#ifdef __ANDROID__
+template<typename F>
+static uint32_t sync_fetch_and_op(uint32_t volatile *ptr, uint32_t val, F f)
+{
+	// Build an arbitrary op out of looped CAS
+	for (;;)
+	{
+		uint32_t expected = *ptr;
+		uint32_t desired = f(expected, val);
+
+		if (expected == __sync_val_compare_and_swap_4(ptr, expected, desired))
+			return expected;
+	}
+}
+#endif
+
+void* resolveExternalSymbol(const char* name)
+{
+	struct Atomic
+	{
+		static void load(size_t size, void *ptr, void *ret, llvm::AtomicOrdering ordering)
+		{
+			switch (size)
+			{
+				case 1: atomicLoad<uint8_t>(ptr, ret, ordering); break;
+				case 2: atomicLoad<uint16_t>(ptr, ret, ordering); break;
+				case 4: atomicLoad<uint32_t>(ptr, ret, ordering); break;
+				case 8: atomicLoad<uint64_t>(ptr, ret, ordering); break;
+				default:
+					UNIMPLEMENTED("Atomic::load(size: %d)", int(size));
+			}
+		}
+		static void store(size_t size, void *ptr, void *ret, llvm::AtomicOrdering ordering)
+		{
+			switch (size)
+			{
+				case 1: atomicStore<uint8_t>(ptr, ret, ordering); break;
+				case 2: atomicStore<uint16_t>(ptr, ret, ordering); break;
+				case 4: atomicStore<uint32_t>(ptr, ret, ordering); break;
+				case 8: atomicStore<uint64_t>(ptr, ret, ordering); break;
+				default:
+					UNIMPLEMENTED("Atomic::store(size: %d)", int(size));
+			}
+		}
 	};
 
-	static std::memory_order atomicOrdering(llvm::AtomicOrdering memoryOrder)
+	struct F
 	{
-		switch(memoryOrder)
-		{
-		case llvm::AtomicOrdering::Monotonic: return std::memory_order_relaxed;  // https://llvm.org/docs/Atomics.html#monotonic
-		case llvm::AtomicOrdering::Acquire: return std::memory_order_acquire;
-		case llvm::AtomicOrdering::Release: return std::memory_order_release;
-		case llvm::AtomicOrdering::AcquireRelease: return std::memory_order_acq_rel;
-		case llvm::AtomicOrdering::SequentiallyConsistent: return std::memory_order_seq_cst;
-		default:
-			UNREACHABLE("memoryOrder: %d", int(memoryOrder));
-			return std::memory_order_acq_rel;
-		}
-	}
+		static void nop() {}
+		static void neverCalled() { UNREACHABLE("Should never be called"); }
 
-	static llvm::AtomicOrdering atomicOrdering(bool atomic, std::memory_order memoryOrder)
-	{
-		if(!atomic)
-		{
-			return llvm::AtomicOrdering::NotAtomic;
-		}
-
-		switch(memoryOrder)
-		{
-		case std::memory_order_relaxed: return llvm::AtomicOrdering::Monotonic;  // https://llvm.org/docs/Atomics.html#monotonic
-		case std::memory_order_consume: return llvm::AtomicOrdering::Acquire;    // https://llvm.org/docs/Atomics.html#acquire: "It should also be used for C++11/C11 memory_order_consume."
-		case std::memory_order_acquire: return llvm::AtomicOrdering::Acquire;
-		case std::memory_order_release: return llvm::AtomicOrdering::Release;
-		case std::memory_order_acq_rel: return llvm::AtomicOrdering::AcquireRelease;
-		case std::memory_order_seq_cst: return llvm::AtomicOrdering::SequentiallyConsistent;
-		default:
-			UNREACHABLE("memoryOrder: %d", int(memoryOrder));
-			return llvm::AtomicOrdering::AcquireRelease;
-		}
-	}
-
-	template <typename T>
-	static void atomicLoad(void *ptr, void *ret, llvm::AtomicOrdering ordering)
-	{
-		*reinterpret_cast<T*>(ret) = std::atomic_load_explicit<T>(reinterpret_cast<std::atomic<T>*>(ptr), atomicOrdering(ordering));
-	}
-
-	template <typename T>
-	static void atomicStore(void *ptr, void *val, llvm::AtomicOrdering ordering)
-	{
-		std::atomic_store_explicit<T>(reinterpret_cast<std::atomic<T>*>(ptr), *reinterpret_cast<T*>(val), atomicOrdering(ordering));
-	}
+		static void* coroutine_alloc_frame(size_t size) { return alignedAlloc(size, 16); }
+		static void coroutine_free_frame(void* ptr) { alignedFree(ptr); }
 
 #ifdef __ANDROID__
-	template<typename F>
-	static uint32_t sync_fetch_and_op(uint32_t volatile *ptr, uint32_t val, F f)
-	{
-		// Build an arbitrary op out of looped CAS
-		for (;;)
-		{
-			uint32_t expected = *ptr;
-			uint32_t desired = f(expected, val);
+		// forwarders since we can't take address of builtins
+		static void sync_synchronize() { __sync_synchronize(); }
+		static uint32_t sync_fetch_and_add_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_add_4(ptr, val); }
+		static uint32_t sync_fetch_and_and_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_and_4(ptr, val); }
+		static uint32_t sync_fetch_and_or_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_or_4(ptr, val); }
+		static uint32_t sync_fetch_and_xor_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_xor_4(ptr, val); }
+		static uint32_t sync_fetch_and_sub_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_sub_4(ptr, val); }
+		static uint32_t sync_lock_test_and_set_4(uint32_t *ptr, uint32_t val) { return __sync_lock_test_and_set_4(ptr, val); }
+		static uint32_t sync_val_compare_and_swap_4(uint32_t *ptr, uint32_t expected, uint32_t desired) { return __sync_val_compare_and_swap_4(ptr, expected, desired); }
 
-			if (expected == __sync_val_compare_and_swap_4(ptr, expected, desired))
-				return expected;
-		}
-	}
+		static uint32_t sync_fetch_and_max_4(uint32_t *ptr, uint32_t val) { return sync_fetch_and_op(ptr, val, [](int32_t a, int32_t b) { return std::max(a,b);}); }
+		static uint32_t sync_fetch_and_min_4(uint32_t *ptr, uint32_t val) { return sync_fetch_and_op(ptr, val, [](int32_t a, int32_t b) { return std::min(a,b);}); }
+		static uint32_t sync_fetch_and_umax_4(uint32_t *ptr, uint32_t val) { return sync_fetch_and_op(ptr, val, [](uint32_t a, uint32_t b) { return std::max(a,b);}); }
+		static uint32_t sync_fetch_and_umin_4(uint32_t *ptr, uint32_t val) { return sync_fetch_and_op(ptr, val, [](uint32_t a, uint32_t b) { return std::min(a,b);}); }
 #endif
+	};
 
-	void* resolveExternalSymbol(const char* name)
+	class Resolver
 	{
-		struct Atomic
+	public:
+		using FunctionMap = std::unordered_map<std::string, void *>;
+
+		FunctionMap functions;
+
+		Resolver()
 		{
-			static void load(size_t size, void *ptr, void *ret, llvm::AtomicOrdering ordering)
-			{
-				switch (size)
-				{
-					case 1: atomicLoad<uint8_t>(ptr, ret, ordering); break;
-					case 2: atomicLoad<uint16_t>(ptr, ret, ordering); break;
-					case 4: atomicLoad<uint32_t>(ptr, ret, ordering); break;
-					case 8: atomicLoad<uint64_t>(ptr, ret, ordering); break;
-					default:
-						UNIMPLEMENTED("Atomic::load(size: %d)", int(size));
-				}
-			}
-			static void store(size_t size, void *ptr, void *ret, llvm::AtomicOrdering ordering)
-			{
-				switch (size)
-				{
-					case 1: atomicStore<uint8_t>(ptr, ret, ordering); break;
-					case 2: atomicStore<uint16_t>(ptr, ret, ordering); break;
-					case 4: atomicStore<uint32_t>(ptr, ret, ordering); break;
-					case 8: atomicStore<uint64_t>(ptr, ret, ordering); break;
-					default:
-						UNIMPLEMENTED("Atomic::store(size: %d)", int(size));
-				}
-			}
-		};
+			functions.emplace("nop", reinterpret_cast<void*>(F::nop));
+			functions.emplace("floorf", reinterpret_cast<void*>(floorf));
+			functions.emplace("nearbyintf", reinterpret_cast<void*>(nearbyintf));
+			functions.emplace("truncf", reinterpret_cast<void*>(truncf));
+			functions.emplace("printf", reinterpret_cast<void*>(printf));
+			functions.emplace("puts", reinterpret_cast<void*>(puts));
+			functions.emplace("fmodf", reinterpret_cast<void*>(fmodf));
 
-		struct F
-		{
-			static void nop() {}
-			static void neverCalled() { UNREACHABLE("Should never be called"); }
+			functions.emplace("sinf", reinterpret_cast<void*>(sinf));
+			functions.emplace("cosf", reinterpret_cast<void*>(cosf));
+			functions.emplace("asinf", reinterpret_cast<void*>(asinf));
+			functions.emplace("acosf", reinterpret_cast<void*>(acosf));
+			functions.emplace("atanf", reinterpret_cast<void*>(atanf));
+			functions.emplace("sinhf", reinterpret_cast<void*>(sinhf));
+			functions.emplace("coshf", reinterpret_cast<void*>(coshf));
+			functions.emplace("tanhf", reinterpret_cast<void*>(tanhf));
+			functions.emplace("asinhf", reinterpret_cast<void*>(asinhf));
+			functions.emplace("acoshf", reinterpret_cast<void*>(acoshf));
+			functions.emplace("atanhf", reinterpret_cast<void*>(atanhf));
+			functions.emplace("atan2f", reinterpret_cast<void*>(atan2f));
+			functions.emplace("powf", reinterpret_cast<void*>(powf));
+			functions.emplace("expf", reinterpret_cast<void*>(expf));
+			functions.emplace("logf", reinterpret_cast<void*>(logf));
+			functions.emplace("exp2f", reinterpret_cast<void*>(exp2f));
+			functions.emplace("log2f", reinterpret_cast<void*>(log2f));
 
-			static void* coroutine_alloc_frame(size_t size) { return alignedAlloc(size, 16); }
-			static void coroutine_free_frame(void* ptr) { alignedFree(ptr); }
+			functions.emplace("sin", reinterpret_cast<void*>(static_cast<double(*)(double)>(sin)));
+			functions.emplace("cos", reinterpret_cast<void*>(static_cast<double(*)(double)>(cos)));
+			functions.emplace("asin", reinterpret_cast<void*>(static_cast<double(*)(double)>(asin)));
+			functions.emplace("acos", reinterpret_cast<void*>(static_cast<double(*)(double)>(acos)));
+			functions.emplace("atan", reinterpret_cast<void*>(static_cast<double(*)(double)>(atan)));
+			functions.emplace("sinh", reinterpret_cast<void*>(static_cast<double(*)(double)>(sinh)));
+			functions.emplace("cosh", reinterpret_cast<void*>(static_cast<double(*)(double)>(cosh)));
+			functions.emplace("tanh", reinterpret_cast<void*>(static_cast<double(*)(double)>(tanh)));
+			functions.emplace("asinh", reinterpret_cast<void*>(static_cast<double(*)(double)>(asinh)));
+			functions.emplace("acosh", reinterpret_cast<void*>(static_cast<double(*)(double)>(acosh)));
+			functions.emplace("atanh", reinterpret_cast<void*>(static_cast<double(*)(double)>(atanh)));
+			functions.emplace("atan2", reinterpret_cast<void*>(static_cast<double(*)(double,double)>(atan2)));
+			functions.emplace("pow", reinterpret_cast<void*>(static_cast<double(*)(double,double)>(pow)));
+			functions.emplace("exp", reinterpret_cast<void*>(static_cast<double(*)(double)>(exp)));
+			functions.emplace("log", reinterpret_cast<void*>(static_cast<double(*)(double)>(log)));
+			functions.emplace("exp2", reinterpret_cast<void*>(static_cast<double(*)(double)>(exp2)));
+			functions.emplace("log2", reinterpret_cast<void*>(static_cast<double(*)(double)>(log2)));
 
-#ifdef __ANDROID__
-			// forwarders since we can't take address of builtins
-			static void sync_synchronize() { __sync_synchronize(); }
-			static uint32_t sync_fetch_and_add_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_add_4(ptr, val); }
-			static uint32_t sync_fetch_and_and_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_and_4(ptr, val); }
-			static uint32_t sync_fetch_and_or_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_or_4(ptr, val); }
-			static uint32_t sync_fetch_and_xor_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_xor_4(ptr, val); }
-			static uint32_t sync_fetch_and_sub_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_sub_4(ptr, val); }
-			static uint32_t sync_lock_test_and_set_4(uint32_t *ptr, uint32_t val) { return __sync_lock_test_and_set_4(ptr, val); }
-			static uint32_t sync_val_compare_and_swap_4(uint32_t *ptr, uint32_t expected, uint32_t desired) { return __sync_val_compare_and_swap_4(ptr, expected, desired); }
+			functions.emplace("atomic_load", reinterpret_cast<void*>(Atomic::load));
+			functions.emplace("atomic_store", reinterpret_cast<void*>(Atomic::store));
 
-			static uint32_t sync_fetch_and_max_4(uint32_t *ptr, uint32_t val) { return sync_fetch_and_op(ptr, val, [](int32_t a, int32_t b) { return std::max(a,b);}); }
-			static uint32_t sync_fetch_and_min_4(uint32_t *ptr, uint32_t val) { return sync_fetch_and_op(ptr, val, [](int32_t a, int32_t b) { return std::min(a,b);}); }
-			static uint32_t sync_fetch_and_umax_4(uint32_t *ptr, uint32_t val) { return sync_fetch_and_op(ptr, val, [](uint32_t a, uint32_t b) { return std::max(a,b);}); }
-			static uint32_t sync_fetch_and_umin_4(uint32_t *ptr, uint32_t val) { return sync_fetch_and_op(ptr, val, [](uint32_t a, uint32_t b) { return std::min(a,b);}); }
-#endif
-		};
-
-		class Resolver
-		{
-		public:
-			using FunctionMap = std::unordered_map<std::string, void *>;
-
-			FunctionMap functions;
-
-			Resolver()
-			{
-				functions.emplace("nop", reinterpret_cast<void*>(F::nop));
-				functions.emplace("floorf", reinterpret_cast<void*>(floorf));
-				functions.emplace("nearbyintf", reinterpret_cast<void*>(nearbyintf));
-				functions.emplace("truncf", reinterpret_cast<void*>(truncf));
-				functions.emplace("printf", reinterpret_cast<void*>(printf));
-				functions.emplace("puts", reinterpret_cast<void*>(puts));
-				functions.emplace("fmodf", reinterpret_cast<void*>(fmodf));
-
-				functions.emplace("sinf", reinterpret_cast<void*>(sinf));
-				functions.emplace("cosf", reinterpret_cast<void*>(cosf));
-				functions.emplace("asinf", reinterpret_cast<void*>(asinf));
-				functions.emplace("acosf", reinterpret_cast<void*>(acosf));
-				functions.emplace("atanf", reinterpret_cast<void*>(atanf));
-				functions.emplace("sinhf", reinterpret_cast<void*>(sinhf));
-				functions.emplace("coshf", reinterpret_cast<void*>(coshf));
-				functions.emplace("tanhf", reinterpret_cast<void*>(tanhf));
-				functions.emplace("asinhf", reinterpret_cast<void*>(asinhf));
-				functions.emplace("acoshf", reinterpret_cast<void*>(acoshf));
-				functions.emplace("atanhf", reinterpret_cast<void*>(atanhf));
-				functions.emplace("atan2f", reinterpret_cast<void*>(atan2f));
-				functions.emplace("powf", reinterpret_cast<void*>(powf));
-				functions.emplace("expf", reinterpret_cast<void*>(expf));
-				functions.emplace("logf", reinterpret_cast<void*>(logf));
-				functions.emplace("exp2f", reinterpret_cast<void*>(exp2f));
-				functions.emplace("log2f", reinterpret_cast<void*>(log2f));
-
-				functions.emplace("sin", reinterpret_cast<void*>(static_cast<double(*)(double)>(sin)));
-				functions.emplace("cos", reinterpret_cast<void*>(static_cast<double(*)(double)>(cos)));
-				functions.emplace("asin", reinterpret_cast<void*>(static_cast<double(*)(double)>(asin)));
-				functions.emplace("acos", reinterpret_cast<void*>(static_cast<double(*)(double)>(acos)));
-				functions.emplace("atan", reinterpret_cast<void*>(static_cast<double(*)(double)>(atan)));
-				functions.emplace("sinh", reinterpret_cast<void*>(static_cast<double(*)(double)>(sinh)));
-				functions.emplace("cosh", reinterpret_cast<void*>(static_cast<double(*)(double)>(cosh)));
-				functions.emplace("tanh", reinterpret_cast<void*>(static_cast<double(*)(double)>(tanh)));
-				functions.emplace("asinh", reinterpret_cast<void*>(static_cast<double(*)(double)>(asinh)));
-				functions.emplace("acosh", reinterpret_cast<void*>(static_cast<double(*)(double)>(acosh)));
-				functions.emplace("atanh", reinterpret_cast<void*>(static_cast<double(*)(double)>(atanh)));
-				functions.emplace("atan2", reinterpret_cast<void*>(static_cast<double(*)(double,double)>(atan2)));
-				functions.emplace("pow", reinterpret_cast<void*>(static_cast<double(*)(double,double)>(pow)));
-				functions.emplace("exp", reinterpret_cast<void*>(static_cast<double(*)(double)>(exp)));
-				functions.emplace("log", reinterpret_cast<void*>(static_cast<double(*)(double)>(log)));
-				functions.emplace("exp2", reinterpret_cast<void*>(static_cast<double(*)(double)>(exp2)));
-				functions.emplace("log2", reinterpret_cast<void*>(static_cast<double(*)(double)>(log2)));
-
-				functions.emplace("atomic_load", reinterpret_cast<void*>(Atomic::load));
-				functions.emplace("atomic_store", reinterpret_cast<void*>(Atomic::store));
-
-				// FIXME (b/119409619): use an allocator here so we can control all memory allocations
-				functions.emplace("coroutine_alloc_frame", reinterpret_cast<void*>(F::coroutine_alloc_frame));
-				functions.emplace("coroutine_free_frame", reinterpret_cast<void*>(F::coroutine_free_frame));
+			// FIXME (b/119409619): use an allocator here so we can control all memory allocations
+			functions.emplace("coroutine_alloc_frame", reinterpret_cast<void*>(F::coroutine_alloc_frame));
+			functions.emplace("coroutine_free_frame", reinterpret_cast<void*>(F::coroutine_free_frame));
 
 #ifdef __APPLE__
-				functions.emplace("sincosf_stret", reinterpret_cast<void*>(__sincosf_stret));
+			functions.emplace("sincosf_stret", reinterpret_cast<void*>(__sincosf_stret));
 #elif defined(__linux__)
-				functions.emplace("sincosf", reinterpret_cast<void*>(sincosf));
+			functions.emplace("sincosf", reinterpret_cast<void*>(sincosf));
 #elif defined(_WIN64)
-				functions.emplace("chkstk", reinterpret_cast<void*>(__chkstk));
+			functions.emplace("chkstk", reinterpret_cast<void*>(__chkstk));
 #elif defined(_WIN32)
-				functions.emplace("chkstk", reinterpret_cast<void*>(_chkstk));
+			functions.emplace("chkstk", reinterpret_cast<void*>(_chkstk));
 #endif
 
 #ifdef __ANDROID__
-				functions.emplace("aeabi_unwind_cpp_pr0", reinterpret_cast<void*>(F::neverCalled));
-				functions.emplace("sync_synchronize", reinterpret_cast<void*>(F::sync_synchronize));
-				functions.emplace("sync_fetch_and_add_4", reinterpret_cast<void*>(F::sync_fetch_and_add_4));
-				functions.emplace("sync_fetch_and_and_4", reinterpret_cast<void*>(F::sync_fetch_and_and_4));
-				functions.emplace("sync_fetch_and_or_4", reinterpret_cast<void*>(F::sync_fetch_and_or_4));
-				functions.emplace("sync_fetch_and_xor_4", reinterpret_cast<void*>(F::sync_fetch_and_xor_4));
-				functions.emplace("sync_fetch_and_sub_4", reinterpret_cast<void*>(F::sync_fetch_and_sub_4));
-				functions.emplace("sync_lock_test_and_set_4", reinterpret_cast<void*>(F::sync_lock_test_and_set_4));
-				functions.emplace("sync_val_compare_and_swap_4", reinterpret_cast<void*>(F::sync_val_compare_and_swap_4));
-				functions.emplace("sync_fetch_and_max_4", reinterpret_cast<void*>(F::sync_fetch_and_max_4));
-				functions.emplace("sync_fetch_and_min_4", reinterpret_cast<void*>(F::sync_fetch_and_min_4));
-				functions.emplace("sync_fetch_and_umax_4", reinterpret_cast<void*>(F::sync_fetch_and_umax_4));
-				functions.emplace("sync_fetch_and_umin_4", reinterpret_cast<void*>(F::sync_fetch_and_umin_4));
-	#endif
-			}
-		};
-
-		static Resolver resolver;
-
-		// Trim off any underscores from the start of the symbol. LLVM likes
-		// to append these on macOS.
-		const char* trimmed = name;
-		while (trimmed[0] == '_') { trimmed++; }
-
-		auto it = resolver.functions.find(trimmed);
-		// Missing functions will likely make the module fail in exciting non-obvious ways.
-		ASSERT_MSG(it != resolver.functions.end(), "Missing external function: '%s'", name);
-		return it->second;
-	}
-
-	// The abstract Type* types are implemented as LLVM types, except that
-	// 64-bit vectors are emulated using 128-bit ones to avoid use of MMX in x86
-	// and VFP in ARM, and eliminate the overhead of converting them to explicit
-	// 128-bit ones. LLVM types are pointers, so we can represent emulated types
-	// as abstract pointers with small enum values.
-	enum InternalType : uintptr_t
-	{
-		// Emulated types:
-		Type_v2i32,
-		Type_v4i16,
-		Type_v2i16,
-		Type_v8i8,
-		Type_v4i8,
-		Type_v2f32,
-		EmulatedTypeCount,
-		// Returned by asInternalType() to indicate that the abstract Type*
-		// should be interpreted as LLVM type pointer:
-		Type_LLVM
+			functions.emplace("aeabi_unwind_cpp_pr0", reinterpret_cast<void*>(F::neverCalled));
+			functions.emplace("sync_synchronize", reinterpret_cast<void*>(F::sync_synchronize));
+			functions.emplace("sync_fetch_and_add_4", reinterpret_cast<void*>(F::sync_fetch_and_add_4));
+			functions.emplace("sync_fetch_and_and_4", reinterpret_cast<void*>(F::sync_fetch_and_and_4));
+			functions.emplace("sync_fetch_and_or_4", reinterpret_cast<void*>(F::sync_fetch_and_or_4));
+			functions.emplace("sync_fetch_and_xor_4", reinterpret_cast<void*>(F::sync_fetch_and_xor_4));
+			functions.emplace("sync_fetch_and_sub_4", reinterpret_cast<void*>(F::sync_fetch_and_sub_4));
+			functions.emplace("sync_lock_test_and_set_4", reinterpret_cast<void*>(F::sync_lock_test_and_set_4));
+			functions.emplace("sync_val_compare_and_swap_4", reinterpret_cast<void*>(F::sync_val_compare_and_swap_4));
+			functions.emplace("sync_fetch_and_max_4", reinterpret_cast<void*>(F::sync_fetch_and_max_4));
+			functions.emplace("sync_fetch_and_min_4", reinterpret_cast<void*>(F::sync_fetch_and_min_4));
+			functions.emplace("sync_fetch_and_umax_4", reinterpret_cast<void*>(F::sync_fetch_and_umax_4));
+			functions.emplace("sync_fetch_and_umin_4", reinterpret_cast<void*>(F::sync_fetch_and_umin_4));
+#endif
+		}
 	};
 
-	inline InternalType asInternalType(Type *type)
-	{
-		InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
-		return (t < EmulatedTypeCount) ? t : Type_LLVM;
-	}
+	static Resolver resolver;
 
-	llvm::Type *T(Type *t)
+	// Trim off any underscores from the start of the symbol. LLVM likes
+	// to prepend these on macOS.
+	const char* trimmed = name;
+	while (trimmed[0] == '_') { trimmed++; }
+
+	auto it = resolver.functions.find(trimmed);
+	// Missing functions will likely make the module fail in exciting non-obvious ways.
+	ASSERT_MSG(it != resolver.functions.end(), "Missing external function: '%s'", name);
+	return it->second;
+}
+
+// The abstract Type* types are implemented as LLVM types, except that
+// 64-bit vectors are emulated using 128-bit ones to avoid use of MMX in x86
+// and VFP in ARM, and eliminate the overhead of converting them to explicit
+// 128-bit ones. LLVM types are pointers, so we can represent emulated types
+// as abstract pointers with small enum values.
+enum InternalType : uintptr_t
+{
+	// Emulated types:
+	Type_v2i32,
+	Type_v4i16,
+	Type_v2i16,
+	Type_v8i8,
+	Type_v4i8,
+	Type_v2f32,
+	EmulatedTypeCount,
+	// Returned by asInternalType() to indicate that the abstract Type*
+	// should be interpreted as LLVM type pointer:
+	Type_LLVM
+};
+
+inline InternalType asInternalType(Type *type)
+{
+	InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
+	return (t < EmulatedTypeCount) ? t : Type_LLVM;
+}
+
+llvm::Type *T(Type *t)
+{
+	// Use 128-bit vectors to implement logically shorter ones.
+	switch(asInternalType(t))
 	{
-		// Use 128-bit vectors to implement logically shorter ones.
-		switch(asInternalType(t))
+	case Type_v2i32: return T(Int4::getType());
+	case Type_v4i16: return T(Short8::getType());
+	case Type_v2i16: return T(Short8::getType());
+	case Type_v8i8:  return T(Byte16::getType());
+	case Type_v4i8:  return T(Byte16::getType());
+	case Type_v2f32: return T(Float4::getType());
+	case Type_LLVM:  return reinterpret_cast<llvm::Type*>(t);
+	default:
+		UNREACHABLE("asInternalType(t): %d", int(asInternalType(t)));
+		return nullptr;
+	}
+}
+
+Type *T(InternalType t)
+{
+	return reinterpret_cast<Type*>(t);
+}
+
+inline std::vector<llvm::Type*> &T(std::vector<Type*> &t)
+{
+	return reinterpret_cast<std::vector<llvm::Type*>&>(t);
+}
+
+inline llvm::BasicBlock *B(BasicBlock *t)
+{
+	return reinterpret_cast<llvm::BasicBlock*>(t);
+}
+
+inline BasicBlock *B(llvm::BasicBlock *t)
+{
+	return reinterpret_cast<BasicBlock*>(t);
+}
+
+static size_t typeSize(Type *type)
+{
+	switch(asInternalType(type))
+	{
+	case Type_v2i32: return 8;
+	case Type_v4i16: return 8;
+	case Type_v2i16: return 4;
+	case Type_v8i8:  return 8;
+	case Type_v4i8:  return 4;
+	case Type_v2f32: return 8;
+	case Type_LLVM:
 		{
-		case Type_v2i32: return T(Int4::getType());
-		case Type_v4i16: return T(Short8::getType());
-		case Type_v2i16: return T(Short8::getType());
-		case Type_v8i8:  return T(Byte16::getType());
-		case Type_v4i8:  return T(Byte16::getType());
-		case Type_v2f32: return T(Float4::getType());
-		case Type_LLVM:  return reinterpret_cast<llvm::Type*>(t);
-		default:
-			UNREACHABLE("asInternalType(t): %d", int(asInternalType(t)));
-			return nullptr;
-		}
-	}
+			llvm::Type *t = T(type);
 
-	Type *T(InternalType t)
-	{
-		return reinterpret_cast<Type*>(t);
-	}
-
-	inline std::vector<llvm::Type*> &T(std::vector<Type*> &t)
-	{
-		return reinterpret_cast<std::vector<llvm::Type*>&>(t);
-	}
-
-	inline llvm::BasicBlock *B(BasicBlock *t)
-	{
-		return reinterpret_cast<llvm::BasicBlock*>(t);
-	}
-
-	inline BasicBlock *B(llvm::BasicBlock *t)
-	{
-		return reinterpret_cast<BasicBlock*>(t);
-	}
-
-	static size_t typeSize(Type *type)
-	{
-		switch(asInternalType(type))
-		{
-		case Type_v2i32: return 8;
-		case Type_v4i16: return 8;
-		case Type_v2i16: return 4;
-		case Type_v8i8:  return 8;
-		case Type_v4i8:  return 4;
-		case Type_v2f32: return 8;
-		case Type_LLVM:
+			if(t->isPointerTy())
 			{
-				llvm::Type *t = T(type);
-
-				if(t->isPointerTy())
-				{
-					return sizeof(void*);
-				}
-
-				// At this point we should only have LLVM 'primitive' types.
-				unsigned int bits = t->getPrimitiveSizeInBits();
-				ASSERT_MSG(bits != 0, "bits: %d", int(bits));
-
-				// TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
-				// but are typically stored as one byte. The DataLayout structure should
-				// be used here and many other places if this assumption fails.
-				return (bits + 7) / 8;
+				return sizeof(void*);
 			}
-			break;
-		default:
-			UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
-			return 0;
+
+			// At this point we should only have LLVM 'primitive' types.
+			unsigned int bits = t->getPrimitiveSizeInBits();
+			ASSERT_MSG(bits != 0, "bits: %d", int(bits));
+
+			// TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
+			// but are typically stored as one byte. The DataLayout structure should
+			// be used here and many other places if this assumption fails.
+			return (bits + 7) / 8;
 		}
+		break;
+	default:
+		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
+		return 0;
 	}
+}
 
-	static unsigned int elementCount(Type *type)
+static unsigned int elementCount(Type *type)
+{
+	switch(asInternalType(type))
 	{
-		switch(asInternalType(type))
+	case Type_v2i32: return 2;
+	case Type_v4i16: return 4;
+	case Type_v2i16: return 2;
+	case Type_v8i8:  return 8;
+	case Type_v4i8:  return 4;
+	case Type_v2f32: return 2;
+	case Type_LLVM:  return llvm::cast<llvm::VectorType>(T(type))->getNumElements();
+	default:
+		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
+		return 0;
+	}
+}
+
+static ::llvm::Function* createFunction(const char *name, ::llvm::Type *retTy, const std::vector<::llvm::Type*> &params)
+{   // Creates an llvm::Function called `name` with the given return/parameter types inside jit->module.
+	llvm::FunctionType *functionType = llvm::FunctionType::get(retTy, params, false);   // false: the signature is not variadic
+	auto func = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, name, jit->module.get());   // internal linkage; owned by the current JIT module
+	func->setDoesNotThrow();   // mark the function as non-throwing
+	func->setCallingConv(llvm::CallingConv::C);   // use the C calling convention
+	return func;
+}
+
+Nucleus::Nucleus()
+{   // Locks ::codegenMutex and installs a fresh JITBuilder in `jit`; ~Nucleus undoes both.
+	::codegenMutex.lock();   // Reactor and LLVM are currently not thread safe
+
+	ASSERT(jit == nullptr);   // a builder must not already be installed
+	jit.reset(new JITBuilder(Nucleus::getDefaultConfig()));   // the builder is constructed from the current default config
+}
+
+Nucleus::~Nucleus()
+{   // Releases what the constructor acquired: the JIT builder, then ::codegenMutex.
+	jit.reset();   // the builder is dropped while the mutex is still held
+	::codegenMutex.unlock();
+}
+
+void Nucleus::setDefaultConfig(const Config &cfg)
+{   // Overwrites the config returned by ::defaultConfig() with cfg, under ::defaultConfigLock.
+	std::lock_guard<std::mutex> guard(::defaultConfigLock);   // held until the function returns
+	::defaultConfig() = cfg;
+}
+
+void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
+{   // Applies cfgEdit to the default config and stores the result back, under ::defaultConfigLock.
+	std::lock_guard<std::mutex> guard(::defaultConfigLock);   // covers both the read and the write below
+	auto &current = ::defaultConfig();
+	current = cfgEdit.apply(current);
+}
+
+Config Nucleus::getDefaultConfig()
+{   // Returns the default config by value, read under ::defaultConfigLock.
+	std::lock_guard<std::mutex> guard(::defaultConfigLock);   // still held while the return value is copied
+	return ::defaultConfig();
+}
+
+std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
+{
+	auto cfg = cfgEdit.apply(jit->config);
+
+	if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
+	{
+		llvm::Type *type = jit->function->getReturnType();
+
+		if(type->isVoidTy())
 		{
-		case Type_v2i32: return 2;
-		case Type_v4i16: return 4;
-		case Type_v2i16: return 2;
-		case Type_v8i8:  return 8;
-		case Type_v4i8:  return 4;
-		case Type_v2f32: return 2;
-		case Type_LLVM:  return llvm::cast<llvm::VectorType>(T(type))->getNumElements();
-		default:
-			UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
-			return 0;
-		}
-	}
-
-	static ::llvm::Function* createFunction(const char *name, ::llvm::Type *retTy, const std::vector<::llvm::Type*> &params)
-	{
-		llvm::FunctionType *functionType = llvm::FunctionType::get(retTy, params, false);
-		auto func = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, name, jit->module.get());
-		func->setDoesNotThrow();
-		func->setCallingConv(llvm::CallingConv::C);
-		return func;
-	}
-
-	Nucleus::Nucleus()
-	{
-		::codegenMutex.lock();   // Reactor and LLVM are currently not thread safe
-
-		ASSERT(jit == nullptr);
-		jit.reset(new JITBuilder(Nucleus::getDefaultConfig()));
-	}
-
-	Nucleus::~Nucleus()
-	{
-		jit.reset();
-		::codegenMutex.unlock();
-	}
-
-	void Nucleus::setDefaultConfig(const Config &cfg)
-	{
-		std::unique_lock<std::mutex> lock(::defaultConfigLock);
-		::defaultConfig() = cfg;
-	}
-
-	void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
-	{
-		std::unique_lock<std::mutex> lock(::defaultConfigLock);
-		auto &config = ::defaultConfig();
-		config = cfgEdit.apply(config);
-	}
-
-	Config Nucleus::getDefaultConfig()
-	{
-		std::unique_lock<std::mutex> lock(::defaultConfigLock);
-		return ::defaultConfig();
-	}
-
-	std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
-	{
-		auto cfg = cfgEdit.apply(jit->config);
-
-		if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
-		{
-			llvm::Type *type = jit->function->getReturnType();
-
-			if(type->isVoidTy())
-			{
-				createRetVoid();
-			}
-			else
-			{
-				createRet(V(llvm::UndefValue::get(type)));
-			}
-		}
-
-#ifdef ENABLE_RR_DEBUG_INFO
-		if (jit->debugInfo != nullptr)
-		{
-			jit->debugInfo->Finalize();
-		}
-#endif // ENABLE_RR_DEBUG_INFO
-
-		if(false)
-		{
-			std::error_code error;
-			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
-			jit->module->print(file, 0);
-		}
-
-#if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
-		{
-			llvm::legacy::PassManager pm;
-			pm.add(llvm::createVerifierPass());
-			pm.run(*jit->module);
-		}
-#endif // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
-
-		jit->optimize(cfg);
-
-		if(false)
-		{
-			std::error_code error;
-			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
-			jit->module->print(file, 0);
-		}
-
-		auto routine = jit->acquireRoutine(&jit->function, 1, cfg);
-		jit.reset();
-
-		return routine;
-	}
-
-	Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
-	{
-		// Need to allocate it in the entry block for mem2reg to work
-		llvm::BasicBlock &entryBlock = jit->function->getEntryBlock();
-
-		llvm::Instruction *declaration;
-
-		if(arraySize)
-		{
-			declaration = new llvm::AllocaInst(T(type), 0, V(Nucleus::createConstantInt(arraySize)));
+			createRetVoid();
 		}
 		else
 		{
-			declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value*)nullptr);
+			createRet(V(llvm::UndefValue::get(type)));
 		}
-
-		entryBlock.getInstList().push_front(declaration);
-
-		return V(declaration);
 	}
 
-	BasicBlock *Nucleus::createBasicBlock()
-	{
-		return B(llvm::BasicBlock::Create(jit->context, "", jit->function));
-	}
-
-	BasicBlock *Nucleus::getInsertBlock()
-	{
-		return B(jit->builder->GetInsertBlock());
-	}
-
-	void Nucleus::setInsertBlock(BasicBlock *basicBlock)
-	{
-	//	assert(jit->builder->GetInsertBlock()->back().isTerminator());
-
-		Variable::materializeAll();
-
-		jit->builder->SetInsertPoint(B(basicBlock));
-	}
-
-	void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
-	{
-		jit->function = rr::createFunction("", T(ReturnType), T(Params));
-
 #ifdef ENABLE_RR_DEBUG_INFO
-		jit->debugInfo = std::unique_ptr<DebugInfo>(new DebugInfo(jit->builder.get(), &jit->context, jit->module.get(), jit->function));
+	if (jit->debugInfo != nullptr)
+	{
+		jit->debugInfo->Finalize();
+	}
 #endif // ENABLE_RR_DEBUG_INFO
 
-		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "", jit->function));
+	if(false)
+	{
+		std::error_code error;
+		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
+		jit->module->print(file, 0);
 	}
 
-	Value *Nucleus::getArgument(unsigned int index)
+#if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
 	{
-		llvm::Function::arg_iterator args = jit->function->arg_begin();
+		llvm::legacy::PassManager pm;
+		pm.add(llvm::createVerifierPass());
+		pm.run(*jit->module);
+	}
+#endif // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
 
-		while(index)
-		{
-			args++;
-			index--;
-		}
+	jit->optimize(cfg);
 
-		return V(&*args);
+	if(false)
+	{
+		std::error_code error;
+		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
+		jit->module->print(file, 0);
 	}
 
-	void Nucleus::createRetVoid()
+	auto routine = jit->acquireRoutine(&jit->function, 1, cfg);
+	jit.reset();
+
+	return routine;
+}
+
+Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
+{
+	// Need to allocate it in the entry block for mem2reg to work
+	llvm::BasicBlock &entryBlock = jit->function->getEntryBlock();
+
+	llvm::Instruction *declaration;
+
+	if(arraySize)
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-
-		ASSERT_MSG(jit->function->getReturnType() == T(Void::getType()), "Return type mismatch");
-
-		// Code generated after this point is unreachable, so any variables
-		// being read can safely return an undefined value. We have to avoid
-		// materializing variables after the terminator ret instruction.
-		Variable::killUnmaterialized();
-
-		jit->builder->CreateRetVoid();
+		declaration = new llvm::AllocaInst(T(type), 0, V(Nucleus::createConstantInt(arraySize)));
+	}
+	else
+	{
+		declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value*)nullptr);
 	}
 
-	void Nucleus::createRet(Value *v)
+	entryBlock.getInstList().push_front(declaration);
+
+	return V(declaration);
+}
+
+BasicBlock *Nucleus::createBasicBlock()
+{   // Creates a new unnamed basic block belonging to jit->function.
+	return B(llvm::BasicBlock::Create(jit->context, "", jit->function));   // "" = no name; B() converts the pointer type for the caller
+}
+
+BasicBlock *Nucleus::getInsertBlock()
+{   // Returns the basic block the IR builder is currently inserting into.
+	return B(jit->builder->GetInsertBlock());
+}
+
+void Nucleus::setInsertBlock(BasicBlock *basicBlock)
+{   // Makes basicBlock the block that subsequently built instructions go into.
+//	assert(jit->builder->GetInsertBlock()->back().isTerminator());
+
+	Variable::materializeAll();   // called before the insert point moves, i.e. while the previous block is still current
+
+	jit->builder->SetInsertPoint(B(basicBlock));   // new instructions are appended to the end of basicBlock
+}
+
+void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
+{   // Creates the function to be built (stored in jit->function) and points the builder at a new block inside it.
+	jit->function = rr::createFunction("", T(ReturnType), T(Params));   // unnamed function; T() converts the types for the LLVM-level helper
+
+#ifdef ENABLE_RR_DEBUG_INFO
+	jit->debugInfo = std::unique_ptr<DebugInfo>(new DebugInfo(jit->builder.get(), &jit->context, jit->module.get(), jit->function));   // replaces any previous DebugInfo object
+#endif // ENABLE_RR_DEBUG_INFO
+
+	jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "", jit->function));   // a fresh unnamed block in the new function becomes the insert point
+}
+
+Value *Nucleus::getArgument(unsigned int index)
+{
+	llvm::Function::arg_iterator args = jit->function->arg_begin();
+
+	while(index)
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-
-		ASSERT_MSG(jit->function->getReturnType() == V(v)->getType(), "Return type mismatch");
-
-		// Code generated after this point is unreachable, so any variables
-		// being read can safely return an undefined value. We have to avoid
-		// materializing variables after the terminator ret instruction.
-		Variable::killUnmaterialized();
-
-		jit->builder->CreateRet(V(v));
+		args++;
+		index--;
 	}
 
-	void Nucleus::createBr(BasicBlock *dest)
+	return V(&*args);
+}
+
+void Nucleus::createRetVoid()
+{   // Builds a `ret void` terminator at the current insert point.
+	RR_DEBUG_INFO_UPDATE_LOC();
+
+	ASSERT_MSG(jit->function->getReturnType() == T(Void::getType()), "Return type mismatch");   // the function being built must return void
+
+	// Code generated after this point is unreachable, so any variables
+	// being read can safely return an undefined value. We have to avoid
+	// materializing variables after the terminator ret instruction.
+	Variable::killUnmaterialized();
+
+	jit->builder->CreateRetVoid();   // built last, after the variables above have been dealt with
+}
+
+void Nucleus::createRet(Value *v)
+{   // Builds a `ret` terminator returning v at the current insert point.
+	RR_DEBUG_INFO_UPDATE_LOC();
+
+	ASSERT_MSG(jit->function->getReturnType() == V(v)->getType(), "Return type mismatch");   // v's LLVM type must equal the function's return type
+
+	// Code generated after this point is unreachable, so any variables
+	// being read can safely return an undefined value. We have to avoid
+	// materializing variables after the terminator ret instruction.
+	Variable::killUnmaterialized();
+
+	jit->builder->CreateRet(V(v));   // built last, after the variables above have been dealt with
+}
+
+void Nucleus::createBr(BasicBlock *dest)
+{   // Builds an unconditional branch to dest at the current insert point.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Variable::materializeAll();   // called before the branch is built, so it precedes the terminator
+
+	jit->builder->CreateBr(B(dest));
+}
+
+void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
+{   // Builds a conditional branch: to ifTrue when cond is true, otherwise to ifFalse.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Variable::materializeAll();   // called before the branch is built, so it precedes the terminator
+	jit->builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
+}
+
+Value *Nucleus::createAdd(Value *lhs, Value *rhs)
+{   // Builds the LLVM integer addition lhs + rhs and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAdd(V(lhs), V(rhs)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createSub(Value *lhs, Value *rhs)
+{   // Builds the LLVM integer subtraction lhs - rhs and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateSub(V(lhs), V(rhs)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createMul(Value *lhs, Value *rhs)
+{   // Builds the LLVM integer multiplication lhs * rhs and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateMul(V(lhs), V(rhs)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
+{   // Builds the LLVM unsigned integer division lhs / rhs and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateUDiv(V(lhs), V(rhs)));   // operands are interpreted as unsigned
+}
+
+Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
+{   // Builds the LLVM signed integer division lhs / rhs and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateSDiv(V(lhs), V(rhs)));   // operands are interpreted as signed
+}
+
+Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
+{   // Builds the LLVM floating-point addition lhs + rhs and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFAdd(V(lhs), V(rhs)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createFSub(Value *lhs, Value *rhs)
+{   // Builds the LLVM floating-point subtraction lhs - rhs and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFSub(V(lhs), V(rhs)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createFMul(Value *lhs, Value *rhs)
+{   // Builds the LLVM floating-point multiplication lhs * rhs and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFMul(V(lhs), V(rhs)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
+{   // Builds the LLVM floating-point division lhs / rhs and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFDiv(V(lhs), V(rhs)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createURem(Value *lhs, Value *rhs)
+{   // Builds the LLVM unsigned integer remainder of lhs / rhs and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateURem(V(lhs), V(rhs)));   // operands are interpreted as unsigned
+}
+
+Value *Nucleus::createSRem(Value *lhs, Value *rhs)
+{   // Builds the LLVM signed integer remainder of lhs / rhs and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateSRem(V(lhs), V(rhs)));   // operands are interpreted as signed
+}
+
+Value *Nucleus::createFRem(Value *lhs, Value *rhs)
+{   // Builds the LLVM floating-point remainder of lhs / rhs and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFRem(V(lhs), V(rhs)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createShl(Value *lhs, Value *rhs)
+{   // Builds the LLVM left shift of lhs by rhs bits and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateShl(V(lhs), V(rhs)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createLShr(Value *lhs, Value *rhs)
+{   // Builds the LLVM logical (zero-filling) right shift of lhs by rhs bits and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateLShr(V(lhs), V(rhs)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createAShr(Value *lhs, Value *rhs)
+{   // Builds the LLVM arithmetic (sign-extending) right shift of lhs by rhs bits and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAShr(V(lhs), V(rhs)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createAnd(Value *lhs, Value *rhs)
+{   // Builds the LLVM bitwise AND of lhs and rhs and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAnd(V(lhs), V(rhs)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createOr(Value *lhs, Value *rhs)
+{   // Builds the LLVM bitwise OR of lhs and rhs and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateOr(V(lhs), V(rhs)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createXor(Value *lhs, Value *rhs)
+{   // Builds the LLVM bitwise XOR of lhs and rhs and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateXor(V(lhs), V(rhs)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createNeg(Value *v)
+{   // Builds the LLVM integer negation of v and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateNeg(V(v)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createFNeg(Value *v)
+{   // Builds the LLVM floating-point negation of v and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFNeg(V(v)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createNot(Value *v)
+{   // Builds the LLVM bitwise complement of v and returns the resulting value.
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateNot(V(v)));   // inner V(): to llvm::Value*; outer V(): back to Value*
+}
+
+Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	switch(asInternalType(type))
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Variable::materializeAll();
-
-		jit->builder->CreateBr(B(dest));
-	}
-
-	void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Variable::materializeAll();
-		jit->builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
-	}
-
-	Value *Nucleus::createAdd(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAdd(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createSub(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateSub(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createMul(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateMul(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateUDiv(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateSDiv(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFAdd(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFSub(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFSub(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFMul(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFMul(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFDiv(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createURem(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateURem(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createSRem(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateSRem(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFRem(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFRem(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createShl(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateShl(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createLShr(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateLShr(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createAShr(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAShr(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createAnd(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAnd(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createOr(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateOr(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createXor(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateXor(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createNeg(Value *v)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateNeg(V(v)));
-	}
-
-	Value *Nucleus::createFNeg(Value *v)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFNeg(V(v)));
-	}
-
-	Value *Nucleus::createNot(Value *v)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateNot(V(v)));
-	}
-
-	Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		switch(asInternalType(type))
-		{
-		case Type_v2i32:
-		case Type_v4i16:
-		case Type_v8i8:
-		case Type_v2f32:
-			return createBitCast(
-				createInsertElement(
-					V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2))),
-					createLoad(createBitCast(ptr, Pointer<Long>::getType()), Long::getType(), isVolatile, alignment, atomic, memoryOrder),
-					0),
-				type);
-		case Type_v2i16:
-		case Type_v4i8:
-			if(alignment != 0)   // Not a local variable (all vectors are 128-bit).
-			{
-				Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2)));
-				Value *i = createLoad(createBitCast(ptr, Pointer<Int>::getType()), Int::getType(), isVolatile, alignment, atomic, memoryOrder);
-				i = createZExt(i, Long::getType());
-				Value *v = createInsertElement(u, i, 0);
-				return createBitCast(v, type);
-			}
-			// Fallthrough to non-emulated case.
-		case Type_LLVM:
-			{
-				auto elTy = T(type);
-				ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
-
-				if (!atomic)
-				{
-					return V(jit->builder->CreateAlignedLoad(V(ptr), alignment, isVolatile));
-				}
-				else if (elTy->isIntegerTy() || elTy->isPointerTy())
-				{
-					// Integers and pointers can be atomically loaded by setting
-					// the ordering constraint on the load instruction.
-					auto load = jit->builder->CreateAlignedLoad(V(ptr), alignment, isVolatile);
-					load->setAtomic(atomicOrdering(atomic, memoryOrder));
-					return V(load);
-				}
-				else if (elTy->isFloatTy() || elTy->isDoubleTy())
-				{
-					// LLVM claims to support atomic loads of float types as
-					// above, but certain backends cannot deal with this.
-					// Load as an integer and bitcast. See b/136037244.
-  					auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
-					auto elAsIntTy = ::llvm::IntegerType::get(jit->context, size * 8);
-					auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
-					auto load = jit->builder->CreateAlignedLoad(ptrCast, alignment, isVolatile);
-					load->setAtomic(atomicOrdering(atomic, memoryOrder));
-					auto loadCast = jit->builder->CreateBitCast(load, elTy);
-					return V(loadCast);
-				}
-				else
-				{
-					// More exotic types require falling back to the extern:
-					// void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
-					auto sizetTy = ::llvm::IntegerType::get(jit->context, sizeof(size_t) * 8);
-					auto intTy = ::llvm::IntegerType::get(jit->context, sizeof(int) * 8);
-					auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
-					auto i8PtrTy = i8Ty->getPointerTo();
-					auto voidTy = ::llvm::Type::getVoidTy(jit->context);
-					auto funcTy = ::llvm::FunctionType::get(voidTy, {sizetTy, i8PtrTy, i8PtrTy, intTy}, false);
-					auto func = jit->module->getOrInsertFunction("__atomic_load", funcTy);
-  					auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
-					auto out = allocateStackVariable(type);
-					jit->builder->CreateCall(func, {
-						::llvm::ConstantInt::get(sizetTy, size),
-						jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
-						jit->builder->CreatePointerCast(V(out), i8PtrTy),
-						::llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
-					 });
-					 return V(jit->builder->CreateLoad(V(out)));
-				}
-			}
-		default:
-			UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
-			return nullptr;
-		}
-	}
-
-	Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		switch(asInternalType(type))
-		{
-		case Type_v2i32:
-		case Type_v4i16:
-		case Type_v8i8:
-		case Type_v2f32:
-			createStore(
-				createExtractElement(
-					createBitCast(value, T(llvm::VectorType::get(T(Long::getType()), 2))), Long::getType(), 0),
-				createBitCast(ptr, Pointer<Long>::getType()),
-				Long::getType(), isVolatile, alignment, atomic, memoryOrder);
-			return value;
-		case Type_v2i16:
-		case Type_v4i8:
-			if(alignment != 0)   // Not a local variable (all vectors are 128-bit).
-			{
-				createStore(
-					createExtractElement(createBitCast(value, Int4::getType()), Int::getType(), 0),
-					createBitCast(ptr, Pointer<Int>::getType()),
-					Int::getType(), isVolatile, alignment, atomic, memoryOrder);
-				return value;
-			}
-			// Fallthrough to non-emulated case.
-		case Type_LLVM:
-			{
-				auto elTy = T(type);
-				ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
-
-				if (!atomic)
-				{
-					jit->builder->CreateAlignedStore(V(value), V(ptr), alignment, isVolatile);
-				}
-				else if (elTy->isIntegerTy() || elTy->isPointerTy())
-				{
-					// Integers and pointers can be atomically stored by setting
-					// the ordering constraint on the store instruction.
-					auto store = jit->builder->CreateAlignedStore(V(value), V(ptr), alignment, isVolatile);
-					store->setAtomic(atomicOrdering(atomic, memoryOrder));
-				}
-				else if (elTy->isFloatTy() || elTy->isDoubleTy())
-				{
-					// LLVM claims to support atomic stores of float types as
-					// above, but certain backends cannot deal with this.
-					// Store as an bitcast integer. See b/136037244.
-  					auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
-					auto elAsIntTy = ::llvm::IntegerType::get(jit->context, size * 8);
-					auto valCast = jit->builder->CreateBitCast(V(value), elAsIntTy);
-					auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
-					auto store = jit->builder->CreateAlignedStore(valCast, ptrCast, alignment, isVolatile);
-					store->setAtomic(atomicOrdering(atomic, memoryOrder));
-				}
-				else
-				{
-					// More exotic types require falling back to the extern:
-					// void __atomic_store(size_t size, void *ptr, void *val, int ordering)
-					auto sizetTy = ::llvm::IntegerType::get(jit->context, sizeof(size_t) * 8);
-					auto intTy = ::llvm::IntegerType::get(jit->context, sizeof(int) * 8);
-					auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
-					auto i8PtrTy = i8Ty->getPointerTo();
-					auto voidTy = ::llvm::Type::getVoidTy(jit->context);
-					auto funcTy = ::llvm::FunctionType::get(voidTy, {sizetTy, i8PtrTy, i8PtrTy, intTy}, false);
-					auto func = jit->module->getOrInsertFunction("__atomic_store", funcTy);
-  					auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
-					auto copy = allocateStackVariable(type);
-					jit->builder->CreateStore(V(value), V(copy));
-					jit->builder->CreateCall(func, {
-						::llvm::ConstantInt::get(sizetTy, size),
-						jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
-						jit->builder->CreatePointerCast(V(copy), i8PtrTy),
-						::llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
-					 });
-				}
-
-				return value;
-			}
-		default:
-			UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
-			return nullptr;
-		}
-	}
-
-	Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
-	{
-		ASSERT(V(ptr)->getType()->isPointerTy());
-		ASSERT(V(mask)->getType()->isVectorTy());
-
-		auto numEls = V(mask)->getType()->getVectorNumElements();
-		auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
-		auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
-		auto elVecTy = ::llvm::VectorType::get(T(elTy), numEls);
-		auto elVecPtrTy = elVecTy->getPointerTo();
-		auto i8Mask = jit->builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
-		auto passthrough = zeroMaskedLanes ? ::llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
-		auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
-		auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy } );
-		return V(jit->builder->CreateCall(func, { V(ptr), align, i8Mask, passthrough }));
-	}
-
-	void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
-	{
-		ASSERT(V(ptr)->getType()->isPointerTy());
-		ASSERT(V(val)->getType()->isVectorTy());
-		ASSERT(V(mask)->getType()->isVectorTy());
-
-		auto numEls = V(mask)->getType()->getVectorNumElements();
-		auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
-		auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
-		auto elVecTy = V(val)->getType();
-		auto elVecPtrTy = elVecTy->getPointerTo();
-		auto i8Mask = jit->builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
-		auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
-		auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy } );
-		jit->builder->CreateCall(func, { V(val), V(ptr), align, i8Mask });
-	}
-
-	RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-	{
-		return As<Float4>(V(createGather(V(base.value), T(Float::getType()), V(offsets.value), V(mask.value), alignment, zeroMaskedLanes)));
-	}
-
-	RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-	{
-		return As<Int4>(V(createGather(V(base.value), T(Float::getType()), V(offsets.value), V(mask.value), alignment, zeroMaskedLanes)));
-	}
-
-	void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-	{
-		return createScatter(V(base.value), V(val.value), V(offsets.value), V(mask.value), alignment);
-	}
-
-	void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-	{
-		return createScatter(V(base.value), V(val.value), V(offsets.value), V(mask.value), alignment);
-	}
-
-	void Nucleus::createFence(std::memory_order memoryOrder)
-	{
-		jit->builder->CreateFence(atomicOrdering(true, memoryOrder));
-	}
-
-	Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		ASSERT(V(ptr)->getType()->getContainedType(0) == T(type));
-		if(sizeof(void*) == 8)
-		{
-			// LLVM manual: "When indexing into an array, pointer or vector,
-			// integers of any width are allowed, and they are not required to
-			// be constant. These integers are treated as signed values where
-			// relevant."
-			//
-			// Thus if we want indexes to be treated as unsigned we have to
-			// zero-extend them ourselves.
-			//
-			// Note that this is not because we want to address anywhere near
-			// 4 GB of data. Instead this is important for performance because
-			// x86 supports automatic zero-extending of 32-bit registers to
-			// 64-bit. Thus when indexing into an array using a uint32 is
-			// actually faster than an int32.
-			index = unsignedIndex ?
-				createZExt(index, Long::getType()) :
-				createSExt(index, Long::getType());
-		}
-
-		// For non-emulated types we can rely on LLVM's GEP to calculate the
-		// effective address correctly.
-		if(asInternalType(type) == Type_LLVM)
-		{
-			return V(jit->builder->CreateGEP(V(ptr), V(index)));
-		}
-
-		// For emulated types we have to multiply the index by the intended
-		// type size ourselves to obain the byte offset.
-		index = (sizeof(void*) == 8) ?
-			createMul(index, createConstantLong((int64_t)typeSize(type))) :
-			createMul(index, createConstantInt((int)typeSize(type)));
-
-		// Cast to a byte pointer, apply the byte offset, and cast back to the
-		// original pointer type.
+	case Type_v2i32:
+	case Type_v4i16:
+	case Type_v8i8:
+	case Type_v2f32:
 		return createBitCast(
-			V(jit->builder->CreateGEP(V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::getType()), 0)))), V(index))),
-			T(llvm::PointerType::get(T(type), 0)));
-	}
-
-	Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Sub, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::And, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Or, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xor, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Min, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Max, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMin, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMax, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-
-	Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		// Note: AtomicCmpXchgInstruction returns a 2-member struct containing {result, success-flag}, not the result directly.
-		return V(jit->builder->CreateExtractValue(
-				jit->builder->CreateAtomicCmpXchg(V(ptr), V(compare), V(value), atomicOrdering(true, memoryOrderEqual), atomicOrdering(true, memoryOrderUnequal)),
-				llvm::ArrayRef<unsigned>(0u)));
-	}
-
-	Value *Nucleus::createTrunc(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateTrunc(V(v), T(destType)));
-	}
-
-	Value *Nucleus::createZExt(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateZExt(V(v), T(destType)));
-	}
-
-	Value *Nucleus::createSExt(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateSExt(V(v), T(destType)));
-	}
-
-	Value *Nucleus::createFPToUI(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFPToUI(V(v), T(destType)));
-	}
-
-	Value *Nucleus::createFPToSI(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFPToSI(V(v), T(destType)));
-	}
-
-	Value *Nucleus::createSIToFP(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateSIToFP(V(v), T(destType)));
-	}
-
-	Value *Nucleus::createFPTrunc(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFPTrunc(V(v), T(destType)));
-	}
-
-	Value *Nucleus::createFPExt(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFPExt(V(v), T(destType)));
-	}
-
-	Value *Nucleus::createBitCast(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		// Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
-		// support for casting between scalars and wide vectors. Emulate them by writing to the stack and
-		// reading back as the destination type.
-		if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
+			createInsertElement(
+				V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2))),
+				createLoad(createBitCast(ptr, Pointer<Long>::getType()), Long::getType(), isVolatile, alignment, atomic, memoryOrder),
+				0),
+			type);
+	case Type_v2i16:
+	case Type_v4i8:
+		if(alignment != 0)   // Not a local variable (all vectors are 128-bit).
 		{
-			Value *readAddress = allocateStackVariable(destType);
-			Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
-			createStore(v, writeAddress, T(V(v)->getType()));
-			return createLoad(readAddress, destType);
+			Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2)));
+			Value *i = createLoad(createBitCast(ptr, Pointer<Int>::getType()), Int::getType(), isVolatile, alignment, atomic, memoryOrder);
+			i = createZExt(i, Long::getType());
+			Value *v = createInsertElement(u, i, 0);
+			return createBitCast(v, type);
 		}
-		else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
+		// Fallthrough to non-emulated case.
+	case Type_LLVM:
 		{
-			Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
-			createStore(v, writeAddress, T(V(v)->getType()));
-			Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
-			return createLoad(readAddress, destType);
+			auto elTy = T(type);
+			ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
+
+			if (!atomic)
+			{
+				return V(jit->builder->CreateAlignedLoad(V(ptr), alignment, isVolatile));
+			}
+			else if (elTy->isIntegerTy() || elTy->isPointerTy())
+			{
+				// Integers and pointers can be atomically loaded by setting
+				// the ordering constraint on the load instruction.
+				auto load = jit->builder->CreateAlignedLoad(V(ptr), alignment, isVolatile);
+				load->setAtomic(atomicOrdering(atomic, memoryOrder));
+				return V(load);
+			}
+			else if (elTy->isFloatTy() || elTy->isDoubleTy())
+			{
+				// LLVM claims to support atomic loads of float types as
+				// above, but certain backends cannot deal with this.
+				// Load as an integer and bitcast. See b/136037244.
+				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
+				auto elAsIntTy = ::llvm::IntegerType::get(jit->context, size * 8);
+				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
+				auto load = jit->builder->CreateAlignedLoad(ptrCast, alignment, isVolatile);
+				load->setAtomic(atomicOrdering(atomic, memoryOrder));
+				auto loadCast = jit->builder->CreateBitCast(load, elTy);
+				return V(loadCast);
+			}
+			else
+			{
+				// More exotic types require falling back to the extern:
+				// void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
+				auto sizetTy = ::llvm::IntegerType::get(jit->context, sizeof(size_t) * 8);
+				auto intTy = ::llvm::IntegerType::get(jit->context, sizeof(int) * 8);
+				auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
+				auto i8PtrTy = i8Ty->getPointerTo();
+				auto voidTy = ::llvm::Type::getVoidTy(jit->context);
+				auto funcTy = ::llvm::FunctionType::get(voidTy, {sizetTy, i8PtrTy, i8PtrTy, intTy}, false);
+				auto func = jit->module->getOrInsertFunction("__atomic_load", funcTy);
+				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
+				auto out = allocateStackVariable(type);
+				jit->builder->CreateCall(func, {
+					::llvm::ConstantInt::get(sizetTy, size),
+					jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
+					jit->builder->CreatePointerCast(V(out), i8PtrTy),
+					::llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
+				 });
+				 return V(jit->builder->CreateLoad(V(out)));
+			}
 		}
-
-		return V(jit->builder->CreateBitCast(V(v), T(destType)));
+	default:
+		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
+		return nullptr;
 	}
+}
 
-	Value *Nucleus::createPtrEQ(Value *lhs, Value *rhs)
+Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	switch(asInternalType(type))
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpNE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpUGT(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpUGE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpULT(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpULE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpSGT(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpSGE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpSLT(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpSLE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpOEQ(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpOGT(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpOGE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpOLT(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpOLE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpONE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpORD(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpUNO(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpUEQ(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpUGT(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpUGE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpULT(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpULE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpUNE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		ASSERT(V(vector)->getType()->getContainedType(0) == T(type));
-		return V(jit->builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
-	}
-
-	Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
-	}
-
-	Value *Nucleus::createShuffleVector(Value *v1, Value *v2, const int *select)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-
-		int size = llvm::cast<llvm::VectorType>(V(v1)->getType())->getNumElements();
-		const int maxSize = 16;
-		llvm::Constant *swizzle[maxSize];
-		ASSERT(size <= maxSize);
-
-		for(int i = 0; i < size; i++)
+	case Type_v2i32:
+	case Type_v4i16:
+	case Type_v8i8:
+	case Type_v2f32:
+		createStore(
+			createExtractElement(
+				createBitCast(value, T(llvm::VectorType::get(T(Long::getType()), 2))), Long::getType(), 0),
+			createBitCast(ptr, Pointer<Long>::getType()),
+			Long::getType(), isVolatile, alignment, atomic, memoryOrder);
+		return value;
+	case Type_v2i16:
+	case Type_v4i8:
+		if(alignment != 0)   // Not a local variable (all vectors are 128-bit).
 		{
-			swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), select[i]);
+			createStore(
+				createExtractElement(createBitCast(value, Int4::getType()), Int::getType(), 0),
+				createBitCast(ptr, Pointer<Int>::getType()),
+				Int::getType(), isVolatile, alignment, atomic, memoryOrder);
+			return value;
 		}
-
-		llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(swizzle, size));
-
-		return V(jit->builder->CreateShuffleVector(V(v1), V(v2), shuffle));
-	}
-
-	Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
-	}
-
-	SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return reinterpret_cast<SwitchCases*>(jit->builder->CreateSwitch(V(control), B(defaultBranch), numCases));
-	}
-
-	void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
-		sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), label, true), B(branch));
-	}
-
-	void Nucleus::createUnreachable()
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		jit->builder->CreateUnreachable();
-	}
-
-	Type *Nucleus::getPointerType(Type *ElementType)
-	{
-		return T(llvm::PointerType::get(T(ElementType), 0));
-	}
-
-	Value *Nucleus::createNullValue(Type *Ty)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::Constant::getNullValue(T(Ty)));
-	}
-
-	Value *Nucleus::createConstantLong(int64_t i)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(jit->context), i, true));
-	}
-
-	Value *Nucleus::createConstantInt(int i)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), i, true));
-	}
-
-	Value *Nucleus::createConstantInt(unsigned int i)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), i, false));
-	}
-
-	Value *Nucleus::createConstantBool(bool b)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(jit->context), b));
-	}
-
-	Value *Nucleus::createConstantByte(signed char i)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(jit->context), i, true));
-	}
-
-	Value *Nucleus::createConstantByte(unsigned char i)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(jit->context), i, false));
-	}
-
-	Value *Nucleus::createConstantShort(short i)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(jit->context), i, true));
-	}
-
-	Value *Nucleus::createConstantShort(unsigned short i)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(jit->context), i, false));
-	}
-
-	Value *Nucleus::createConstantFloat(float x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantFP::get(T(Float::getType()), x));
-	}
-
-	Value *Nucleus::createNullPointer(Type *Ty)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
-	}
-
-	Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
-	{
-		ASSERT(llvm::isa<llvm::VectorType>(T(type)));
-		const int numConstants = elementCount(type);                                       // Number of provided constants for the (emulated) type.
-		const int numElements = llvm::cast<llvm::VectorType>(T(type))->getNumElements();   // Number of elements of the underlying vector type.
-		ASSERT(numElements <= 16 && numConstants <= numElements);
-		llvm::Constant *constantVector[16];
-
-		for(int i = 0; i < numElements; i++)
+		// Fallthrough to non-emulated case.
+	case Type_LLVM:
 		{
-			constantVector[i] = llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]);
+			auto elTy = T(type);
+			ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
+
+			if (!atomic)
+			{
+				jit->builder->CreateAlignedStore(V(value), V(ptr), alignment, isVolatile);
+			}
+			else if (elTy->isIntegerTy() || elTy->isPointerTy())
+			{
+				// Integers and pointers can be atomically stored by setting
+				// the ordering constraint on the store instruction.
+				auto store = jit->builder->CreateAlignedStore(V(value), V(ptr), alignment, isVolatile);
+				store->setAtomic(atomicOrdering(atomic, memoryOrder));
+			}
+			else if (elTy->isFloatTy() || elTy->isDoubleTy())
+			{
+				// LLVM claims to support atomic stores of float types as
+				// above, but certain backends cannot deal with this.
+				// Store as a bitcast integer. See b/136037244.
+				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
+				auto elAsIntTy = ::llvm::IntegerType::get(jit->context, size * 8);
+				auto valCast = jit->builder->CreateBitCast(V(value), elAsIntTy);
+				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
+				auto store = jit->builder->CreateAlignedStore(valCast, ptrCast, alignment, isVolatile);
+				store->setAtomic(atomicOrdering(atomic, memoryOrder));
+			}
+			else
+			{
+				// More exotic types require falling back to the extern:
+				// void __atomic_store(size_t size, void *ptr, void *val, int ordering)
+				auto sizetTy = ::llvm::IntegerType::get(jit->context, sizeof(size_t) * 8);
+				auto intTy = ::llvm::IntegerType::get(jit->context, sizeof(int) * 8);
+				auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
+				auto i8PtrTy = i8Ty->getPointerTo();
+				auto voidTy = ::llvm::Type::getVoidTy(jit->context);
+				auto funcTy = ::llvm::FunctionType::get(voidTy, {sizetTy, i8PtrTy, i8PtrTy, intTy}, false);
+				auto func = jit->module->getOrInsertFunction("__atomic_store", funcTy);
+				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
+				auto copy = allocateStackVariable(type);
+				jit->builder->CreateStore(V(value), V(copy));
+				jit->builder->CreateCall(func, {
+					::llvm::ConstantInt::get(sizetTy, size),
+					jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
+					jit->builder->CreatePointerCast(V(copy), i8PtrTy),
+					::llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
+				 });
+			}
+
+			return value;
 		}
+	default:
+		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
+		return nullptr;
+	}
+}
 
-		return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numElements)));
+Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
+{
+	ASSERT(V(ptr)->getType()->isPointerTy());
+	ASSERT(V(mask)->getType()->isVectorTy());
+
+	auto numEls = V(mask)->getType()->getVectorNumElements();
+	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
+	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
+	auto elVecTy = ::llvm::VectorType::get(T(elTy), numEls);
+	auto elVecPtrTy = elVecTy->getPointerTo();
+	auto i8Mask = jit->builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
+	auto passthrough = zeroMaskedLanes ? ::llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
+	auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
+	auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy } );
+	return V(jit->builder->CreateCall(func, { V(ptr), align, i8Mask, passthrough }));
+}
+
+void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
+{
+	ASSERT(V(ptr)->getType()->isPointerTy());
+	ASSERT(V(val)->getType()->isVectorTy());
+	ASSERT(V(mask)->getType()->isVectorTy());
+
+	auto numEls = V(mask)->getType()->getVectorNumElements();
+	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
+	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
+	auto elVecTy = V(val)->getType();
+	auto elVecPtrTy = elVecTy->getPointerTo();
+	auto i8Mask = jit->builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
+	auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
+	auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy } );
+	jit->builder->CreateCall(func, { V(val), V(ptr), align, i8Mask });
+}
+
+RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+{
+	return As<Float4>(V(createGather(V(base.value), T(Float::getType()), V(offsets.value), V(mask.value), alignment, zeroMaskedLanes)));
+}
+
+RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+{
+	return As<Int4>(V(createGather(V(base.value), T(Int::getType()), V(offsets.value), V(mask.value), alignment, zeroMaskedLanes)));   // Element type is Int, matching the Pointer<Int> base and the Int4 result.
+}
+
+void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+{
+	return createScatter(V(base.value), V(val.value), V(offsets.value), V(mask.value), alignment);
+}
+
+void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+{
+	return createScatter(V(base.value), V(val.value), V(offsets.value), V(mask.value), alignment);
+}
+
+void Nucleus::createFence(std::memory_order memoryOrder)
+{
+	jit->builder->CreateFence(atomicOrdering(true, memoryOrder));
+}
+
+Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	ASSERT(V(ptr)->getType()->getContainedType(0) == T(type));
+	if(sizeof(void*) == 8)
+	{
+		// LLVM manual: "When indexing into an array, pointer or vector,
+		// integers of any width are allowed, and they are not required to
+		// be constant. These integers are treated as signed values where
+		// relevant."
+		//
+		// Thus if we want indexes to be treated as unsigned we have to
+		// zero-extend them ourselves.
+		//
+		// Note that this is not because we want to address anywhere near
+		// 4 GB of data. Instead this is important for performance because
+		// x86 supports automatic zero-extending of 32-bit registers to
+		// 64-bit. Thus indexing into an array using a uint32 is
+		// actually faster than using an int32.
+		index = unsignedIndex ?
+			createZExt(index, Long::getType()) :
+			createSExt(index, Long::getType());
 	}
 
-	Value *Nucleus::createConstantVector(const double *constants, Type *type)
+	// For non-emulated types we can rely on LLVM's GEP to calculate the
+	// effective address correctly.
+	if(asInternalType(type) == Type_LLVM)
 	{
-		ASSERT(llvm::isa<llvm::VectorType>(T(type)));
-		const int numConstants = elementCount(type);                                       // Number of provided constants for the (emulated) type.
-		const int numElements = llvm::cast<llvm::VectorType>(T(type))->getNumElements();   // Number of elements of the underlying vector type.
-		ASSERT(numElements <= 8 && numConstants <= numElements);
-		llvm::Constant *constantVector[8];
-
-		for(int i = 0; i < numElements; i++)
-		{
-			constantVector[i] = llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]);
-		}
-
-		return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numElements)));
+		return V(jit->builder->CreateGEP(V(ptr), V(index)));
 	}
 
-	Type *Void::getType()
+	// For emulated types we have to multiply the index by the intended
+	// type size ourselves to obtain the byte offset.
+	index = (sizeof(void*) == 8) ?
+		createMul(index, createConstantLong((int64_t)typeSize(type))) :
+		createMul(index, createConstantInt((int)typeSize(type)));
+
+	// Cast to a byte pointer, apply the byte offset, and cast back to the
+	// original pointer type.
+	return createBitCast(
+		V(jit->builder->CreateGEP(V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::getType()), 0)))), V(index))),
+		T(llvm::PointerType::get(T(type), 0)));
+}
+
+Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Sub, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::And, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Or, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xor, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Min, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Max, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMin, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMax, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+
+Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	// Note: AtomicCmpXchgInstruction returns a 2-member struct containing {result, success-flag}, not the result directly.
+	return V(jit->builder->CreateExtractValue(
+			jit->builder->CreateAtomicCmpXchg(V(ptr), V(compare), V(value), atomicOrdering(true, memoryOrderEqual), atomicOrdering(true, memoryOrderUnequal)),
+			llvm::ArrayRef<unsigned>(0u)));
+}
+
+Value *Nucleus::createTrunc(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateTrunc(V(v), T(destType)));
+}
+
+Value *Nucleus::createZExt(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateZExt(V(v), T(destType)));
+}
+
+Value *Nucleus::createSExt(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateSExt(V(v), T(destType)));
+}
+
+Value *Nucleus::createFPToUI(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFPToUI(V(v), T(destType)));
+}
+
+Value *Nucleus::createFPToSI(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFPToSI(V(v), T(destType)));
+}
+
+Value *Nucleus::createSIToFP(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateSIToFP(V(v), T(destType)));
+}
+
+Value *Nucleus::createFPTrunc(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFPTrunc(V(v), T(destType)));
+}
+
+Value *Nucleus::createFPExt(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFPExt(V(v), T(destType)));
+}
+
+Value *Nucleus::createBitCast(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	// Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
+	// support for casting between scalars and wide vectors. Emulate them by writing to the stack and
+	// reading back as the destination type.
+	if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
 	{
-		return T(llvm::Type::getVoidTy(jit->context));
+		Value *readAddress = allocateStackVariable(destType);
+		Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
+		createStore(v, writeAddress, T(V(v)->getType()));
+		return createLoad(readAddress, destType);
+	}
+	else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
+	{
+		Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
+		createStore(v, writeAddress, T(V(v)->getType()));
+		Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
+		return createLoad(readAddress, destType);
 	}
 
-	Type *Bool::getType()
+	return V(jit->builder->CreateBitCast(V(v), T(destType)));
+}
+
+Value *Nucleus::createPtrEQ(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpNE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpUGT(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpUGE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpULT(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpULE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpSGT(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpSGE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpSLT(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpSLE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)   // Ordered float comparison: true when neither operand is NaN and lhs == rhs.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpOEQ(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)   // Ordered float comparison: true when neither operand is NaN and lhs > rhs.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpOGT(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)   // Ordered float comparison: true when neither operand is NaN and lhs >= rhs.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpOGE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)   // Ordered float comparison: true when neither operand is NaN and lhs < rhs.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpOLT(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)   // Ordered float comparison: true when neither operand is NaN and lhs <= rhs.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpOLE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)   // Ordered float comparison: true when neither operand is NaN and lhs != rhs.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpONE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)   // "Ordered" test: true when neither operand is NaN.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpORD(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)   // "Unordered" test: true when at least one operand is NaN.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpUNO(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)   // Unordered float comparison: true when either operand is NaN or lhs == rhs.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpUEQ(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)   // Unordered float comparison: true when either operand is NaN or lhs > rhs.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpUGT(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)   // Unordered float comparison: true when either operand is NaN or lhs >= rhs.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpUGE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)   // Unordered float comparison: true when either operand is NaN or lhs < rhs.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpULT(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)   // Unordered float comparison: true when either operand is NaN or lhs <= rhs.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpULE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)   // Unordered float comparison: true when either operand is NaN or lhs != rhs.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpUNE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)   // Reads element `index` out of `vector`.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	ASSERT(V(vector)->getType()->getContainedType(0) == T(type));   // `type` is only used here, to check it matches the vector's element type.
+	return V(jit->builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
+}
+
+Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)   // Produces `vector` with `element` placed at position `index`.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
+}
+
+Value *Nucleus::createShuffleVector(Value *v1, Value *v2, const int *select)   // Shuffles v1/v2; `select` must hold one index per element of v1.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+
+	int size = llvm::cast<llvm::VectorType>(V(v1)->getType())->getNumElements();
+	const int maxSize = 16;   // Capacity of the on-stack swizzle array below.
+	llvm::Constant *swizzle[maxSize];
+	ASSERT(size <= maxSize);
+
+	for(int i = 0; i < size; i++)
 	{
-		return T(llvm::Type::getInt1Ty(jit->context));
+		swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), select[i]);   // Each selector becomes a 32-bit integer constant.
 	}
 
-	Type *Byte::getType()
+	llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(swizzle, size));
+
+	return V(jit->builder->CreateShuffleVector(V(v1), V(v2), shuffle));
+}
+
+Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)   // Emits a select: yields ifTrue where condition c holds, ifFalse otherwise.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
+}
+
+SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)   // Emits a switch on `control`; cases are attached afterwards with addSwitchCase().
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return reinterpret_cast<SwitchCases*>(jit->builder->CreateSwitch(V(control), B(defaultBranch), numCases));   // The builder's result is handed back as an opaque SwitchCases pointer.
+}
+
+void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)   // Adds a `label` -> `branch` case to a switch returned by createSwitch().
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);   // Reverses the cast performed in createSwitch().
+	sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), label, true), B(branch));   // Label is a signed 32-bit constant.
+}
+
+void Nucleus::createUnreachable()   // Emits an `unreachable` instruction at the builder's current position.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	jit->builder->CreateUnreachable();
+}
+
+Type *Nucleus::getPointerType(Type *ElementType)   // Returns the pointer-to-ElementType type (address space 0).
+{
+	return T(llvm::PointerType::get(T(ElementType), 0));
+}
+
+Value *Nucleus::createNullValue(Type *Ty)   // Returns LLVM's null (all-zero) constant of type Ty.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::Constant::getNullValue(T(Ty)));
+}
+
+Value *Nucleus::createConstantLong(int64_t i)   // 64-bit integer constant, created as signed.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(jit->context), i, true));
+}
+
+Value *Nucleus::createConstantInt(int i)   // 32-bit integer constant, created as signed.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), i, true));
+}
+
+Value *Nucleus::createConstantInt(unsigned int i)   // 32-bit integer constant, created as unsigned.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), i, false));
+}
+
+Value *Nucleus::createConstantBool(bool b)   // 1-bit integer constant holding b.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(jit->context), b));
+}
+
+Value *Nucleus::createConstantByte(signed char i)   // 8-bit integer constant, created as signed.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(jit->context), i, true));
+}
+
+Value *Nucleus::createConstantByte(unsigned char i)   // 8-bit integer constant, created as unsigned.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(jit->context), i, false));
+}
+
+Value *Nucleus::createConstantShort(short i)   // 16-bit integer constant, created as signed.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(jit->context), i, true));
+}
+
+Value *Nucleus::createConstantShort(unsigned short i)   // 16-bit integer constant, created as unsigned.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(jit->context), i, false));
+}
+
+Value *Nucleus::createConstantFloat(float x)   // Floating-point constant x of the type returned by Float::getType().
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantFP::get(T(Float::getType()), x));
+}
+
+Value *Nucleus::createNullPointer(Type *Ty)   // Null pointer constant of type pointer-to-Ty (address space 0).
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
+}
+
+Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)   // Builds an integer constant vector of `type` from `constants`.
+{
+	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
+	const int numConstants = elementCount(type);                                       // Number of provided constants for the (emulated) type.
+	const int numElements = llvm::cast<llvm::VectorType>(T(type))->getNumElements();   // Number of elements of the underlying vector type.
+	ASSERT(numElements <= 16 && numConstants <= numElements);
+	llvm::Constant *constantVector[16];
+
+	for(int i = 0; i < numElements; i++)
 	{
-		return T(llvm::Type::getInt8Ty(jit->context));
+		constantVector[i] = llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]);   // Index wraps, so the provided constants repeat to fill all elements.
 	}
 
-	Type *SByte::getType()
+	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numElements)));
+}
+
+Value *Nucleus::createConstantVector(const double *constants, Type *type)   // Builds a floating-point constant vector of `type` from `constants`.
+{
+	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
+	const int numConstants = elementCount(type);                                       // Number of provided constants for the (emulated) type.
+	const int numElements = llvm::cast<llvm::VectorType>(T(type))->getNumElements();   // Number of elements of the underlying vector type.
+	ASSERT(numElements <= 8 && numConstants <= numElements);
+	llvm::Constant *constantVector[8];
+
+	for(int i = 0; i < numElements; i++)
 	{
-		return T(llvm::Type::getInt8Ty(jit->context));
+		constantVector[i] = llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]);   // Index wraps, so the provided constants repeat to fill all elements.
 	}
 
-	Type *Short::getType()
-	{
-		return T(llvm::Type::getInt16Ty(jit->context));
-	}
+	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numElements)));
+}
 
-	Type *UShort::getType()
-	{
-		return T(llvm::Type::getInt16Ty(jit->context));
-	}
+Type *Void::getType()   // Void is represented by LLVM's void type.
+{
+	return T(llvm::Type::getVoidTy(jit->context));
+}
 
-	Type *Byte4::getType()
-	{
-		return T(Type_v4i8);
-	}
+Type *Bool::getType()   // Bool is represented by LLVM's 1-bit integer type.
+{
+	return T(llvm::Type::getInt1Ty(jit->context));
+}
 
-	Type *SByte4::getType()
-	{
-		return T(Type_v4i8);
-	}
+Type *Byte::getType()   // Byte is represented by LLVM's 8-bit integer type (same as SByte).
+{
+	return T(llvm::Type::getInt8Ty(jit->context));
+}
 
-	RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+Type *SByte::getType()   // SByte is represented by LLVM's 8-bit integer type (same as Byte).
+{
+	return T(llvm::Type::getInt8Ty(jit->context));
+}
+
+Type *Short::getType()   // Short is represented by LLVM's 16-bit integer type (same as UShort).
+{
+	return T(llvm::Type::getInt16Ty(jit->context));
+}
+
+Type *UShort::getType()   // UShort is represented by LLVM's 16-bit integer type (same as Short).
+{
+	return T(llvm::Type::getInt16Ty(jit->context));
+}
+
+Type *Byte4::getType()   // Byte4 is represented by Type_v4i8 (same as SByte4).
+{
+	return T(Type_v4i8);
+}
+
+Type *SByte4::getType()   // SByte4 is represented by Type_v4i8 (same as Byte4).
+{
+	return T(Type_v4i8);
+}
+
+RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)   // Unsigned saturating add of bytes: x86 paddusb, or the lowerPUADDSAT fallback on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::paddusb(x, y);
+	return x86::paddusb(x, y);
 #else
-		return As<Byte8>(V(lowerPUADDSAT(V(x.value), V(y.value))));
+	return As<Byte8>(V(lowerPUADDSAT(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)   // Unsigned saturating subtract of bytes: x86 psubusb, or the lowerPUSUBSAT fallback on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::psubusb(x, y);
+	return x86::psubusb(x, y);
 #else
-		return As<Byte8>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
+	return As<Byte8>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<Int> SignMask(RValue<Byte8> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Int> SignMask(RValue<Byte8> x)   // Collects the per-byte sign bits into an Int: x86 pmovmskb, or the lowerSignMask fallback on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmovmskb(x);
+	return x86::pmovmskb(x);
 #else
-		return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
+	return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
 #endif
-	}
+}
 
 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
 //	{
@@ -2399,575 +2400,575 @@
 //#endif
 //	}
 
-	RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)   // Per-byte equality mask: x86 pcmpeqb, or lowerPCMP with ICMP_EQ on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pcmpeqb(x, y);
+	return x86::pcmpeqb(x, y);
 #else
-		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
+	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
 #endif
-	}
+}
 
-	Type *Byte8::getType()
-	{
-		return T(Type_v8i8);
-	}
+Type *Byte8::getType()   // Byte8 is represented by Type_v8i8 (same as SByte8).
+{
+	return T(Type_v8i8);
+}
 
-	RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)   // Signed saturating add of bytes: x86 paddsb, or the lowerPSADDSAT fallback on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::paddsb(x, y);
+	return x86::paddsb(x, y);
 #else
-		return As<SByte8>(V(lowerPSADDSAT(V(x.value), V(y.value))));
+	return As<SByte8>(V(lowerPSADDSAT(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)   // Signed saturating subtract of bytes: x86 psubsb, or the lowerPSSUBSAT fallback on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::psubsb(x, y);
+	return x86::psubsb(x, y);
 #else
-		return As<SByte8>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
+	return As<SByte8>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<Int> SignMask(RValue<SByte8> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Int> SignMask(RValue<SByte8> x)   // Same as the Byte8 overload; on x86 the operand is reinterpreted as Byte8 for pmovmskb.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmovmskb(As<Byte8>(x));
+	return x86::pmovmskb(As<Byte8>(x));
 #else
-		return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
+	return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
 #endif
-	}
+}
 
-	RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)   // Per-byte signed x > y mask, returned as Byte8: x86 pcmpgtb, or lowerPCMP with ICMP_SGT.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pcmpgtb(x, y);
+	return x86::pcmpgtb(x, y);
 #else
-		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
+	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
 #endif
-	}
+}
 
-	RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)   // Per-byte equality mask, returned as Byte8; on x86 both operands are reinterpreted as Byte8 for pcmpeqb.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
+	return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
 #else
-		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
+	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
 #endif
-	}
+}
 
-	Type *SByte8::getType()
-	{
-		return T(Type_v8i8);
-	}
+Type *SByte8::getType()   // SByte8 is represented by Type_v8i8 (same as Byte8).
+{
+	return T(Type_v8i8);
+}
 
-	Type *Byte16::getType()
-	{
-		return T(llvm::VectorType::get(T(Byte::getType()), 16));
-	}
+Type *Byte16::getType()   // Byte16 is an LLVM vector of 16 elements of Byte's type.
+{
+	return T(llvm::VectorType::get(T(Byte::getType()), 16));
+}
 
-	Type *SByte16::getType()
-	{
-		return T(llvm::VectorType::get(T(SByte::getType()), 16));
-	}
+Type *SByte16::getType()   // SByte16 is an LLVM vector of 16 elements of SByte's type.
+{
+	return T(llvm::VectorType::get(T(SByte::getType()), 16));
+}
 
-	Type *Short2::getType()
-	{
-		return T(Type_v2i16);
-	}
+Type *Short2::getType()   // Short2 is represented by Type_v2i16 (same as UShort2).
+{
+	return T(Type_v2i16);
+}
 
-	Type *UShort2::getType()
-	{
-		return T(Type_v2i16);
-	}
+Type *UShort2::getType()   // UShort2 is represented by Type_v2i16 (same as Short2).
+{
+	return T(Type_v2i16);
+}
 
-	Short4::Short4(RValue<Int4> cast)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		int select[8] = {0, 2, 4, 6, 0, 2, 4, 6};
-		Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
+Short4::Short4(RValue<Int4> cast)   // Narrows an Int4 to Short4 by keeping the even-indexed 16-bit lanes (no saturation).
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	int select[8] = {0, 2, 4, 6, 0, 2, 4, 6};   // Lanes 0, 2, 4, 6 of the Short8 view, repeated to fill all 8 slots.
+	Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
 
-		Value *packed = Nucleus::createShuffleVector(short8, short8, select);
-		Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value;
+	Value *packed = Nucleus::createShuffleVector(short8, short8, select);
+	Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value;
 
-		storeValue(short4);
-	}
+	storeValue(short4);
+}
 
 //	Short4::Short4(RValue<Float> cast)
 //	{
 //	}
 
-	Short4::Short4(RValue<Float4> cast)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Int4 v4i32 = Int4(cast);
+Short4::Short4(RValue<Float4> cast)   // Converts Float4 to Int4, then packs to 16-bit lanes: x86 packssdw, or lowerPack on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Int4 v4i32 = Int4(cast);
 #if defined(__i386__) || defined(__x86_64__)
-		v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
+	v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
 #else
-		Value *v = v4i32.loadValue();
-		v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
+	Value *v = v4i32.loadValue();
+	v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
 #endif
 
-		storeValue(As<Short4>(Int2(v4i32)).value);
-	}
+	storeValue(As<Short4>(Int2(v4i32)).value);
+}
 
-	RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)   // Shifts each 16-bit lane left by the immediate rhs: x86 psllw, or lowerVectorShl on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-	//	return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
+//	return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
 
-		return x86::psllw(lhs, rhs);
+	return x86::psllw(lhs, rhs);
 #else
-		return As<Short4>(V(lowerVectorShl(V(lhs.value), rhs)));
+	return As<Short4>(V(lowerVectorShl(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)   // Arithmetic right shift of each 16-bit lane by rhs: x86 psraw, or lowerVectorAShr on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::psraw(lhs, rhs);
+	return x86::psraw(lhs, rhs);
 #else
-		return As<Short4>(V(lowerVectorAShr(V(lhs.value), rhs)));
+	return As<Short4>(V(lowerVectorAShr(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)   // Per-lane signed maximum: x86 pmaxsw, or lowerPMINMAX with ICMP_SGT on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmaxsw(x, y);
+	return x86::pmaxsw(x, y);
 #else
-		return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
+	return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
 #endif
-	}
+}
 
-	RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)   // Per-lane signed minimum: x86 pminsw, or lowerPMINMAX with ICMP_SLT on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pminsw(x, y);
+	return x86::pminsw(x, y);
 #else
-		return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
+	return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
 #endif
-	}
+}
 
-	RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)   // Signed saturating add of 16-bit lanes: x86 paddsw, or the lowerPSADDSAT fallback on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::paddsw(x, y);
+	return x86::paddsw(x, y);
 #else
-		return As<Short4>(V(lowerPSADDSAT(V(x.value), V(y.value))));
+	return As<Short4>(V(lowerPSADDSAT(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)   // Signed saturating subtract of 16-bit lanes: x86 psubsw, or the lowerPSSUBSAT fallback on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::psubsw(x, y);
+	return x86::psubsw(x, y);
 #else
-		return As<Short4>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
+	return As<Short4>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)   // High half of the signed 16x16-bit products: x86 pmulhw, or lowerMulHigh(..., true) on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmulhw(x, y);
+	return x86::pmulhw(x, y);
 #else
-		return As<Short4>(V(lowerMulHigh(V(x.value), V(y.value), true)));
+	return As<Short4>(V(lowerMulHigh(V(x.value), V(y.value), true)));
 #endif
-	}
+}
 
-	RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)   // Multiply-add of 16-bit lanes producing Int2: x86 pmaddwd, or the lowerMulAdd fallback on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmaddwd(x, y);
+	return x86::pmaddwd(x, y);
 #else
-		return As<Int2>(V(lowerMulAdd(V(x.value), V(y.value))));
+	return As<Int2>(V(lowerMulAdd(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)   // Packs x and y into signed bytes: x86 packsswb, or lowerPack(..., true) on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		auto result = x86::packsswb(x, y);
+	auto result = x86::packsswb(x, y);
 #else
-		auto result = V(lowerPack(V(x.value), V(y.value), true));
+	auto result = V(lowerPack(V(x.value), V(y.value), true));
 #endif
-		return As<SByte8>(Swizzle(As<Int4>(result), 0x0202));
-	}
+	return As<SByte8>(Swizzle(As<Int4>(result), 0x0202));   // NOTE(review): the 0x0202 swizzle presumably gathers the two packed halves into the low 64 bits - confirm.
+}
 
-	RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)   // Packs x and y into unsigned bytes: x86 packuswb, or lowerPack(..., false) on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		auto result = x86::packuswb(x, y);
+	auto result = x86::packuswb(x, y);
 #else
-		auto result = V(lowerPack(V(x.value), V(y.value), false));
+	auto result = V(lowerPack(V(x.value), V(y.value), false));
 #endif
-		return As<Byte8>(Swizzle(As<Int4>(result), 0x0202));
-	}
+	return As<Byte8>(Swizzle(As<Int4>(result), 0x0202));   // NOTE(review): the 0x0202 swizzle presumably gathers the two packed halves into the low 64 bits - confirm.
+}
 
-	RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)   // Per-lane signed x > y mask: x86 pcmpgtw, or lowerPCMP with ICMP_SGT on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pcmpgtw(x, y);
+	return x86::pcmpgtw(x, y);
 #else
-		return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
+	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
 #endif
-	}
+}
 
-	RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)   // Per-lane equality mask: x86 pcmpeqw, or lowerPCMP with ICMP_EQ on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pcmpeqw(x, y);
+	return x86::pcmpeqw(x, y);
 #else
-		return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
+	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
 #endif
-	}
+}
 
-	Type *Short4::getType()
-	{
-		return T(Type_v4i16);
-	}
+Type *Short4::getType()   // Short4 is represented by Type_v4i16 (same as UShort4).
+{
+	return T(Type_v4i16);
+}
 
-	UShort4::UShort4(RValue<Float4> cast, bool saturate)
+UShort4::UShort4(RValue<Float4> cast, bool saturate)   // Float4 -> UShort4 conversion; with `saturate`, values are clamped to [0x0000, 0xFFFF] first.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	if(saturate)
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		if(saturate)
+#if defined(__i386__) || defined(__x86_64__)
+		if(CPUID::supportsSSE4_1())
 		{
-#if defined(__i386__) || defined(__x86_64__)
-			if(CPUID::supportsSSE4_1())
-			{
-				Int4 int4(Min(cast, Float4(0xFFFF)));   // packusdw takes care of 0x0000 saturation
-				*this = As<Short4>(PackUnsigned(int4, int4));
-			}
-			else
-#endif
-			{
-				*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
-			}
+			Int4 int4(Min(cast, Float4(0xFFFF)));   // packusdw takes care of 0x0000 saturation
+			*this = As<Short4>(PackUnsigned(int4, int4));
 		}
 		else
+#endif
 		{
-			*this = Short4(Int4(cast));
+			*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));   // Generic path: clamp both ends explicitly, then narrow.
 		}
 	}
-
-	RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
+	else
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+		*this = Short4(Int4(cast));   // No clamping: plain narrowing through the Short4(Int4) constructor.
+	}
+}
+
+RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)   // Shifts each 16-bit lane left by rhs: x86 psllw (through a Short4 view), or lowerVectorShl on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-	//	return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
+//	return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
 
-		return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
+	return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
 #else
-		return As<UShort4>(V(lowerVectorShl(V(lhs.value), rhs)));
+	return As<UShort4>(V(lowerVectorShl(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)   // Logical right shift of each 16-bit lane by rhs: x86 psrlw, or lowerVectorLShr on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-	//	return RValue<Short4>(Nucleus::createLShr(lhs.value, rhs.value));
+//	return RValue<Short4>(Nucleus::createLShr(lhs.value, rhs.value));
 
-		return x86::psrlw(lhs, rhs);
+	return x86::psrlw(lhs, rhs);
 #else
-		return As<UShort4>(V(lowerVectorLShr(V(lhs.value), rhs)));
+	return As<UShort4>(V(lowerVectorLShr(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
-	}
+RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)   // Unsigned maximum built on the signed Short4 Max.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));   // Bias both operands by 0x8000, take the signed max, then undo the bias.
+}
 
-	RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
-	}
+RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)   // Unsigned minimum built on the signed Short4 Min.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));   // Bias both operands by 0x8000, take the signed min, then undo the bias.
+}
 
-	RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)   // Unsigned saturating add of 16-bit lanes: x86 paddusw, or the lowerPUADDSAT fallback on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::paddusw(x, y);
+	return x86::paddusw(x, y);
 #else
-		return As<UShort4>(V(lowerPUADDSAT(V(x.value), V(y.value))));
+	return As<UShort4>(V(lowerPUADDSAT(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)   // Unsigned saturating subtract of 16-bit lanes: x86 psubusw, or the lowerPUSUBSAT fallback on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::psubusw(x, y);
+	return x86::psubusw(x, y);
 #else
-		return As<UShort4>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
+	return As<UShort4>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)   // High half of the unsigned 16x16-bit products: x86 pmulhuw, or lowerMulHigh(..., false) on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmulhuw(x, y);
+	return x86::pmulhuw(x, y);
 #else
-		return As<UShort4>(V(lowerMulHigh(V(x.value), V(y.value), false)));
+	return As<UShort4>(V(lowerMulHigh(V(x.value), V(y.value), false)));
 #endif
-	}
+}
 
-	RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)   // Per-lane unsigned average: x86 pavgw, or the lowerPAVG fallback on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pavgw(x, y);
+	return x86::pavgw(x, y);
 #else
-		return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
+	return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	Type *UShort4::getType()
-	{
-		return T(Type_v4i16);
-	}
+Type *UShort4::getType()   // UShort4 is represented by Type_v4i16 (same as Short4).
+{
+	return T(Type_v4i16);
+}
 
-	RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)   // Shifts each 16-bit lane left by rhs: x86 psllw, or lowerVectorShl on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::psllw(lhs, rhs);
+	return x86::psllw(lhs, rhs);
 #else
-		return As<Short8>(V(lowerVectorShl(V(lhs.value), rhs)));
+	return As<Short8>(V(lowerVectorShl(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)   // Arithmetic right shift of each 16-bit lane by rhs: x86 psraw, or lowerVectorAShr on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::psraw(lhs, rhs);
+	return x86::psraw(lhs, rhs);
 #else
-		return As<Short8>(V(lowerVectorAShr(V(lhs.value), rhs)));
+	return As<Short8>(V(lowerVectorAShr(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)   // Multiply-add of 16-bit lanes producing Int4: x86 pmaddwd, or the lowerMulAdd fallback on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmaddwd(x, y);
+	return x86::pmaddwd(x, y);
 #else
-		return As<Int4>(V(lowerMulAdd(V(x.value), V(y.value))));
+	return As<Int4>(V(lowerMulAdd(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)   // High half of the signed 16x16-bit products: x86 pmulhw, or lowerMulHigh(..., true) on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmulhw(x, y);
+	return x86::pmulhw(x, y);
 #else
-		return As<Short8>(V(lowerMulHigh(V(x.value), V(y.value), true)));
+	return As<Short8>(V(lowerMulHigh(V(x.value), V(y.value), true)));
 #endif
-	}
+}
 
-	Type *Short8::getType()
-	{
-		return T(llvm::VectorType::get(T(Short::getType()), 8));
-	}
+Type *Short8::getType()   // Short8 is an LLVM vector of 8 elements of Short's type.
+{
+	return T(llvm::VectorType::get(T(Short::getType()), 8));
+}
 
-	RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)   // Shifts each 16-bit lane left by rhs: x86 psllw (through a Short8 view), or lowerVectorShl on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
+	return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
 #else
-		return As<UShort8>(V(lowerVectorShl(V(lhs.value), rhs)));
+	return As<UShort8>(V(lowerVectorShl(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)   // Logical right shift of each 16-bit lane by rhs: x86 psrlw, or lowerVectorLShr on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::psrlw(lhs, rhs);   // FIXME: Fallback required
+	return x86::psrlw(lhs, rhs);   // FIXME: Fallback required
 #else
-		return As<UShort8>(V(lowerVectorLShr(V(lhs.value), rhs)));
+	return As<UShort8>(V(lowerVectorLShr(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)
+RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)   // Rearranges x via a 16-entry byte shuffle built from the eight selectors.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	int pshufb[16] =   // Each selector contributes two consecutive byte indices. NOTE(review): selectors look like byte offsets, not 16-bit lane indices - confirm against callers.
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		int pshufb[16] =
-		{
-			select0 + 0,
-			select0 + 1,
-			select1 + 0,
-			select1 + 1,
-			select2 + 0,
-			select2 + 1,
-			select3 + 0,
-			select3 + 1,
-			select4 + 0,
-			select4 + 1,
-			select5 + 0,
-			select5 + 1,
-			select6 + 0,
-			select6 + 1,
-			select7 + 0,
-			select7 + 1,
-		};
+		select0 + 0,
+		select0 + 1,
+		select1 + 0,
+		select1 + 1,
+		select2 + 0,
+		select2 + 1,
+		select3 + 0,
+		select3 + 1,
+		select4 + 0,
+		select4 + 1,
+		select5 + 0,
+		select5 + 1,
+		select6 + 0,
+		select6 + 1,
+		select7 + 0,
+		select7 + 1,
+	};
 
-		Value *byte16 = Nucleus::createBitCast(x.value, Byte16::getType());
-		Value *shuffle = Nucleus::createShuffleVector(byte16, byte16, pshufb);
-		Value *short8 = Nucleus::createBitCast(shuffle, UShort8::getType());
+	Value *byte16 = Nucleus::createBitCast(x.value, Byte16::getType());   // View the input as 16 bytes so the shuffle can address individual bytes.
+	Value *shuffle = Nucleus::createShuffleVector(byte16, byte16, pshufb);
+	Value *short8 = Nucleus::createBitCast(shuffle, UShort8::getType());
 
-		return RValue<UShort8>(short8);
-	}
+	return RValue<UShort8>(short8);
+}
 
-	RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)   // High half of the unsigned 16x16-bit products: x86 pmulhuw, or lowerMulHigh(..., false) on other targets.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmulhuw(x, y);
+	return x86::pmulhuw(x, y);
 #else
-		return As<UShort8>(V(lowerMulHigh(V(x.value), V(y.value), false)));
+	return As<UShort8>(V(lowerMulHigh(V(x.value), V(y.value), false)));
 #endif
-	}
+}
 
-	Type *UShort8::getType()
-	{
-		return T(llvm::VectorType::get(T(UShort::getType()), 8));
-	}
+Type *UShort8::getType()   // UShort8 is an LLVM vector of 8 elements of UShort's type.
+{
+	return T(llvm::VectorType::get(T(UShort::getType()), 8));
+}
 
-	RValue<Int> operator++(Int &val, int)   // Post-increment
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		RValue<Int> res = val;
+RValue<Int> operator++(Int &val, int)   // Post-increment: stores val + 1 into val and returns the value from before the increment.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	RValue<Int> res = val;
 
-		Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantInt(1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantInt(1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const Int &operator++(Int &val)   // Pre-increment
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
-		val.storeValue(inc);
+const Int &operator++(Int &val)   // Pre-increment
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<Int> operator--(Int &val, int)   // Post-decrement
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		RValue<Int> res = val;
+RValue<Int> operator--(Int &val, int)   // Post-decrement
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	RValue<Int> res = val;
 
-		Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantInt(1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantInt(1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const Int &operator--(Int &val)   // Pre-decrement
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
-		val.storeValue(inc);
+const Int &operator--(Int &val)   // Pre-decrement
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<Int> RoundInt(RValue<Float> cast)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Int> RoundInt(RValue<Float> cast)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::cvtss2si(cast);
+	return x86::cvtss2si(cast);
 #else
-		return RValue<Int>(V(lowerRoundInt(V(cast.value), T(Int::getType()))));
+	return RValue<Int>(V(lowerRoundInt(V(cast.value), T(Int::getType()))));
 #endif
-	}
+}
 
-	Type *Int::getType()
-	{
-		return T(llvm::Type::getInt32Ty(jit->context));
-	}
+Type *Int::getType()
+{
+	return T(llvm::Type::getInt32Ty(jit->context));
+}
 
-	Type *Long::getType()
-	{
-		return T(llvm::Type::getInt64Ty(jit->context));
-	}
+Type *Long::getType()
+{
+	return T(llvm::Type::getInt64Ty(jit->context));
+}
 
-	UInt::UInt(RValue<Float> cast)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *integer = Nucleus::createFPToUI(cast.value, UInt::getType());
-		storeValue(integer);
-	}
+UInt::UInt(RValue<Float> cast)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *integer = Nucleus::createFPToUI(cast.value, UInt::getType());
+	storeValue(integer);
+}
 
-	RValue<UInt> operator++(UInt &val, int)   // Post-increment
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		RValue<UInt> res = val;
+RValue<UInt> operator++(UInt &val, int)   // Post-increment
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	RValue<UInt> res = val;
 
-		Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantInt(1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantInt(1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const UInt &operator++(UInt &val)   // Pre-increment
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
-		val.storeValue(inc);
+const UInt &operator++(UInt &val)   // Pre-increment
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<UInt> operator--(UInt &val, int)   // Post-decrement
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		RValue<UInt> res = val;
+RValue<UInt> operator--(UInt &val, int)   // Post-decrement
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	RValue<UInt> res = val;
 
-		Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantInt(1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantInt(1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const UInt &operator--(UInt &val)   // Pre-decrement
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
-		val.storeValue(inc);
+const UInt &operator--(UInt &val)   // Pre-decrement
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
 //	RValue<UInt> RoundUInt(RValue<Float> cast)
 //	{
@@ -2978,10 +2979,10 @@
 //#endif
 //	}
 
-	Type *UInt::getType()
-	{
-		return T(llvm::Type::getInt32Ty(jit->context));
-	}
+Type *UInt::getType()
+{
+	return T(llvm::Type::getInt32Ty(jit->context));
+}
 
 //	Int2::Int2(RValue<Int> cast)
 //	{
@@ -2994,1666 +2995,1668 @@
 //		storeValue(replicate);
 //	}
 
-	RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-	//	return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
+//	return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
 
-		return x86::pslld(lhs, rhs);
+	return x86::pslld(lhs, rhs);
 #else
-		return As<Int2>(V(lowerVectorShl(V(lhs.value), rhs)));
+	return As<Int2>(V(lowerVectorShl(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-	//	return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
+//	return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
 
-		return x86::psrad(lhs, rhs);
+	return x86::psrad(lhs, rhs);
 #else
-		return As<Int2>(V(lowerVectorAShr(V(lhs.value), rhs)));
+	return As<Int2>(V(lowerVectorAShr(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	Type *Int2::getType()
-	{
-		return T(Type_v2i32);
-	}
+Type *Int2::getType()
+{
+	return T(Type_v2i32);
+}
 
-	RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-	//	return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));
+//	return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));
 
-		return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
+	return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
 #else
-		return As<UInt2>(V(lowerVectorShl(V(lhs.value), rhs)));
+	return As<UInt2>(V(lowerVectorShl(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-	//	return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));
+//	return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));
 
-		return x86::psrld(lhs, rhs);
+	return x86::psrld(lhs, rhs);
 #else
-		return As<UInt2>(V(lowerVectorLShr(V(lhs.value), rhs)));
+	return As<UInt2>(V(lowerVectorLShr(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	Type *UInt2::getType()
-	{
-		return T(Type_v2i32);
-	}
+Type *UInt2::getType()
+{
+	return T(Type_v2i32);
+}
 
-	Int4::Int4(RValue<Byte4> cast) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+Int4::Int4(RValue<Byte4> cast) : XYZW(this)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			*this = x86::pmovzxbd(As<Byte16>(cast));
-		}
-		else
+	if(CPUID::supportsSSE4_1())
+	{
+		*this = x86::pmovzxbd(As<Byte16>(cast));
+	}
+	else
 #endif
-		{
-			int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
-			Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
-			Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::getType()), swizzle);
-
-			int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};
-			Value *c = Nucleus::createBitCast(b, Short8::getType());
-			Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::getType()), swizzle2);
-
-			*this = As<Int4>(d);
-		}
-	}
-
-	Int4::Int4(RValue<SByte4> cast) : XYZW(this)
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			*this = x86::pmovsxbd(As<SByte16>(cast));
-		}
-		else
-#endif
-		{
-			int swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
-			Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
-			Value *b = Nucleus::createShuffleVector(a, a, swizzle);
+		int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
+		Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
+		Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::getType()), swizzle);
 
-			int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
-			Value *c = Nucleus::createBitCast(b, Short8::getType());
-			Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
+		int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+		Value *c = Nucleus::createBitCast(b, Short8::getType());
+		Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::getType()), swizzle2);
 
-			*this = As<Int4>(d) >> 24;
-		}
-	}
-
-	Int4::Int4(RValue<Short4> cast) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			*this = x86::pmovsxwd(As<Short8>(cast));
-		}
-		else
-#endif
-		{
-			int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
-			Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);
-			*this = As<Int4>(c) >> 16;
-		}
-	}
-
-	Int4::Int4(RValue<UShort4> cast) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			*this = x86::pmovzxwd(As<UShort8>(cast));
-		}
-		else
-#endif
-		{
-			int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
-			Value *c = Nucleus::createShuffleVector(cast.value, Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
-			*this = As<Int4>(c);
-		}
-	}
-
-	Int4::Int4(RValue<Int> rhs) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *vector = loadValue();
-		Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
-
-		int swizzle[4] = {0, 0, 0, 0};
-		Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
-
-		storeValue(replicate);
-	}
-
-	RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::pslld(lhs, rhs);
-#else
-		return As<Int4>(V(lowerVectorShl(V(lhs.value), rhs)));
-#endif
-	}
-
-	RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::psrad(lhs, rhs);
-#else
-		return As<Int4>(V(lowerVectorAShr(V(lhs.value), rhs)));
-#endif
-	}
-
-	RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::pmaxsd(x, y);
-		}
-		else
-#endif
-		{
-			RValue<Int4> greater = CmpNLE(x, y);
-			return (x & greater) | (y & ~greater);
-		}
-	}
-
-	RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::pminsd(x, y);
-		}
-		else
-#endif
-		{
-			RValue<Int4> less = CmpLT(x, y);
-			return (x & less) | (y & ~less);
-		}
-	}
-
-	RValue<Int4> RoundInt(RValue<Float4> cast)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::cvtps2dq(cast);
-#else
-		return As<Int4>(V(lowerRoundInt(V(cast.value), T(Int4::getType()))));
-#endif
-	}
-
-	RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
-		return As<Int4>(V(lowerMulHigh(V(x.value), V(y.value), true)));
-	}
-
-	RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
-		return As<UInt4>(V(lowerMulHigh(V(x.value), V(y.value), false)));
-	}
-
-	RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::packssdw(x, y);
-#else
-		return As<Short8>(V(lowerPack(V(x.value), V(y.value), true)));
-#endif
-	}
-
-	RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::packusdw(x, y);
-#else
-		return As<UShort8>(V(lowerPack(V(x.value), V(y.value), false)));
-#endif
-	}
-
-	RValue<Int> SignMask(RValue<Int4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::movmskps(As<Float4>(x));
-#else
-		return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
-#endif
-	}
-
-	Type *Int4::getType()
-	{
-		return T(llvm::VectorType::get(T(Int::getType()), 4));
-	}
-
-	UInt4::UInt4(RValue<Float4> cast) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *xyzw = Nucleus::createFPToUI(cast.value, UInt4::getType());
-		storeValue(xyzw);
-	}
-
-	UInt4::UInt4(RValue<UInt> rhs) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *vector = loadValue();
-		Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
-
-		int swizzle[4] = {0, 0, 0, 0};
-		Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
-
-		storeValue(replicate);
-	}
-
-	RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
-#else
-		return As<UInt4>(V(lowerVectorShl(V(lhs.value), rhs)));
-#endif
-	}
-
-	RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::psrld(lhs, rhs);
-#else
-		return As<UInt4>(V(lowerVectorLShr(V(lhs.value), rhs)));
-#endif
-	}
-
-	RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::pmaxud(x, y);
-		}
-		else
-#endif
-		{
-			RValue<UInt4> greater = CmpNLE(x, y);
-			return (x & greater) | (y & ~greater);
-		}
-	}
-
-	RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::pminud(x, y);
-		}
-		else
-#endif
-		{
-			RValue<UInt4> less = CmpLT(x, y);
-			return (x & less) | (y & ~less);
-		}
-	}
-
-	Type *UInt4::getType()
-	{
-		return T(llvm::VectorType::get(T(UInt::getType()), 4));
-	}
-
-	Type *Half::getType()
-	{
-		return T(llvm::Type::getInt16Ty(jit->context));
-	}
-
-	RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(exactAtPow2)
-		{
-			// rcpss uses a piecewise-linear approximation which minimizes the relative error
-			// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
-			return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
-		}
-		return x86::rcpss(x);
-#else
-		return As<Float>(V(lowerRCP(V(x.value))));
-#endif
-	}
-
-	RValue<Float> RcpSqrt_pp(RValue<Float> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::rsqrtss(x);
-#else
-		return As<Float>(V(lowerRSQRT(V(x.value))));
-#endif
-	}
-
-	RValue<Float> Sqrt(RValue<Float> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::sqrtss(x);
-#else
-		return As<Float>(V(lowerSQRT(V(x.value))));
-#endif
-	}
-
-	RValue<Float> Round(RValue<Float> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::roundss(x, 0);
-		}
-		else
-		{
-			return Float4(Round(Float4(x))).x;
-		}
-#else
-		return RValue<Float>(V(lowerRound(V(x.value))));
-#endif
-	}
-
-	RValue<Float> Trunc(RValue<Float> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::roundss(x, 3);
-		}
-		else
-		{
-			return Float(Int(x));   // Rounded toward zero
-		}
-#else
-		return RValue<Float>(V(lowerTrunc(V(x.value))));
-#endif
-	}
-
-	RValue<Float> Frac(RValue<Float> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x - x86::floorss(x);
-		}
-		else
-		{
-			return Float4(Frac(Float4(x))).x;
-		}
-#else
-		// x - floor(x) can be 1.0 for very small negative x.
-		// Clamp against the value just below 1.0.
-		return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
-#endif
-	}
-
-	RValue<Float> Floor(RValue<Float> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::floorss(x);
-		}
-		else
-		{
-			return Float4(Floor(Float4(x))).x;
-		}
-#else
-		return RValue<Float>(V(lowerFloor(V(x.value))));
-#endif
-	}
-
-	RValue<Float> Ceil(RValue<Float> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::ceilss(x);
-		}
-		else
-#endif
-		{
-			return Float4(Ceil(Float4(x))).x;
-		}
-	}
-
-	Type *Float::getType()
-	{
-		return T(llvm::Type::getFloatTy(jit->context));
-	}
-
-	Type *Float2::getType()
-	{
-		return T(Type_v2f32);
-	}
-
-	RValue<Float> Exp2(RValue<Float> v)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float::getType()) } );
-		return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value))));
-	}
-
-	RValue<Float> Log2(RValue<Float> v)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float::getType()) } );
-		return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value))));
-	}
-
-	Float4::Float4(RValue<Float> rhs) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *vector = loadValue();
-		Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
-
-		int swizzle[4] = {0, 0, 0, 0};
-		Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
-
-		storeValue(replicate);
-	}
-
-	RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::maxps(x, y);
-#else
-		return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OGT)));
-#endif
-	}
-
-	RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::minps(x, y);
-#else
-		return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OLT)));
-#endif
-	}
-
-	RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(exactAtPow2)
-		{
-			// rcpps uses a piecewise-linear approximation which minimizes the relative error
-			// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
-			return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
-		}
-		return x86::rcpps(x);
-#else
-		return As<Float4>(V(lowerRCP(V(x.value))));
-#endif
-	}
-
-	RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::rsqrtps(x);
-#else
-		return As<Float4>(V(lowerRSQRT(V(x.value))));
-#endif
-	}
-
-	RValue<Float4> Sqrt(RValue<Float4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::sqrtps(x);
-#else
-		return As<Float4>(V(lowerSQRT(V(x.value))));
-#endif
-	}
-
-	RValue<Int> SignMask(RValue<Float4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::movmskps(x);
-#else
-		return As<Int>(V(lowerFPSignMask(V(x.value), T(Int::getType()))));
-#endif
-	}
-
-	RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-	//	return As<Int4>(x86::cmpeqps(x, y));
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-	//	return As<Int4>(x86::cmpltps(x, y));
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-	//	return As<Int4>(x86::cmpleps(x, y));
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-	//	return As<Int4>(x86::cmpneqps(x, y));
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-	//	return As<Int4>(x86::cmpnltps(x, y));
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-	//	return As<Int4>(x86::cmpnleps(x, y));
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Float4> Round(RValue<Float4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::roundps(x, 0);
-		}
-		else
-		{
-			return Float4(RoundInt(x));
-		}
-#else
-		return RValue<Float4>(V(lowerRound(V(x.value))));
-#endif
-	}
-
-	RValue<Float4> Trunc(RValue<Float4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::roundps(x, 3);
-		}
-		else
-		{
-			return Float4(Int4(x));
-		}
-#else
-		return RValue<Float4>(V(lowerTrunc(V(x.value))));
-#endif
-	}
-
-	RValue<Float4> Frac(RValue<Float4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Float4 frc;
-
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			frc = x - Floor(x);
-		}
-		else
-		{
-			frc = x - Float4(Int4(x));   // Signed fractional part.
-
-			frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f)));   // Add 1.0 if negative.
-		}
-#else
-		frc = x - Floor(x);
-#endif
-
-		// x - floor(x) can be 1.0 for very small negative x.
-		// Clamp against the value just below 1.0.
-		return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
-	}
-
-	RValue<Float4> Floor(RValue<Float4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::floorps(x);
-		}
-		else
-		{
-			return x - Frac(x);
-		}
-#else
-		return RValue<Float4>(V(lowerFloor(V(x.value))));
-#endif
-	}
-
-	RValue<Float4> Ceil(RValue<Float4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::ceilps(x);
-		}
-		else
-#endif
-		{
-			return -Floor(-x);
-		}
-	}
-
-	RValue<Float4> Sin(RValue<Float4> v)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sin, { V(v.value)->getType() } );
-		return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
-	}
-
-	RValue<Float4> Cos(RValue<Float4> v)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cos, { V(v.value)->getType() } );
-		return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
-	}
-
-	RValue<Float4> Tan(RValue<Float4> v)
-	{
-		return Sin(v) / Cos(v);
-	}
-
-	static RValue<Float4> TransformFloat4PerElement(RValue<Float4> v, const char* name)
-	{
-		auto funcTy = ::llvm::FunctionType::get(T(Float::getType()), ::llvm::ArrayRef<llvm::Type*>(T(Float::getType())), false);
-		auto func = jit->module->getOrInsertFunction(name, funcTy);
-		llvm::Value *out = ::llvm::UndefValue::get(T(Float4::getType()));
-		for (uint64_t i = 0; i < 4; i++)
-		{
-			auto el = jit->builder->CreateCall(func, V(Nucleus::createExtractElement(v.value, Float::getType(), i)));
-			out = V(Nucleus::createInsertElement(V(out), V(el), i));
-		}
-		return RValue<Float4>(V(out));
-	}
-
-	RValue<Float4> Asin(RValue<Float4> v)
-	{
-		return TransformFloat4PerElement(v, "asinf");
-	}
-
-	RValue<Float4> Acos(RValue<Float4> v)
-	{
-		return TransformFloat4PerElement(v, "acosf");
-	}
-
-	RValue<Float4> Atan(RValue<Float4> v)
-	{
-		return TransformFloat4PerElement(v, "atanf");
-	}
-
-	RValue<Float4> Sinh(RValue<Float4> v)
-	{
-		return Float4(0.5f) * (Exp(v) - Exp(-v));
-	}
-
-	RValue<Float4> Cosh(RValue<Float4> v)
-	{
-		return Float4(0.5f) * (Exp(v) + Exp(-v));
-	}
-
-	RValue<Float4> Tanh(RValue<Float4> v)
-	{
-		return TransformFloat4PerElement(v, "tanhf");
-	}
-
-	RValue<Float4> Asinh(RValue<Float4> v)
-	{
-		return TransformFloat4PerElement(v, "asinhf");
-	}
-
-	RValue<Float4> Acosh(RValue<Float4> v)
-	{
-		return TransformFloat4PerElement(v, "acoshf");
-	}
-
-	RValue<Float4> Atanh(RValue<Float4> v)
-	{
-		return TransformFloat4PerElement(v, "atanhf");
-	}
-
-	RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
-	{
-		::llvm::SmallVector<::llvm::Type*, 2> paramTys;
-		paramTys.push_back(T(Float::getType()));
-		paramTys.push_back(T(Float::getType()));
-		auto funcTy = ::llvm::FunctionType::get(T(Float::getType()), paramTys, false);
-		auto func = jit->module->getOrInsertFunction("atan2f", funcTy);
-		llvm::Value *out = ::llvm::UndefValue::get(T(Float4::getType()));
-		for (uint64_t i = 0; i < 4; i++)
-		{
-			auto el = jit->builder->CreateCall2(func, ARGS(
-					V(Nucleus::createExtractElement(x.value, Float::getType(), i)),
-					V(Nucleus::createExtractElement(y.value, Float::getType(), i))
-				));
-			out = V(Nucleus::createInsertElement(V(out), V(el), i));
-		}
-		return RValue<Float4>(V(out));
-	}
-
-	RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::pow, { T(Float4::getType()) });
-		return RValue<Float4>(V(jit->builder->CreateCall2(func, ARGS(V(x.value), V(y.value)))));
-	}
-
-	RValue<Float4> Exp(RValue<Float4> v)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp, { T(Float4::getType()) } );
-		return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
-	}
-
-	RValue<Float4> Log(RValue<Float4> v)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log, { T(Float4::getType()) } );
-		return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
-	}
-
-	RValue<Float4> Exp2(RValue<Float4> v)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float4::getType()) } );
-		return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
-	}
-
-	RValue<Float4> Log2(RValue<Float4> v)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float4::getType()) } );
-		return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
-	}
-
-	RValue<UInt> Ctlz(RValue<UInt> v, bool isZeroUndef)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt::getType()) } );
-		return RValue<UInt>(V(jit->builder->CreateCall2(func, ARGS(
-			V(v.value),
-			isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)
-		))));
-	}
-
-	RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt4::getType()) } );
-		return RValue<UInt4>(V(jit->builder->CreateCall2(func, ARGS(
-			V(v.value),
-			isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)
-		))));
-	}
-
-	RValue<UInt> Cttz(RValue<UInt> v, bool isZeroUndef)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt::getType()) } );
-		return RValue<UInt>(V(jit->builder->CreateCall2(func, ARGS(
-			V(v.value),
-			isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)
-		))));
-	}
-
-	RValue<UInt4> Cttz(RValue<UInt4> v, bool isZeroUndef)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt4::getType()) } );
-		return RValue<UInt4>(V(jit->builder->CreateCall2(func, ARGS(
-			V(v.value),
-			isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)
-		))));
-	}
-
-	Type *Float4::getType()
-	{
-		return T(llvm::VectorType::get(T(Float::getType()), 4));
-	}
-
-	RValue<Long> Ticks()
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::readcyclecounter);
-
-		return RValue<Long>(V(jit->builder->CreateCall(rdtsc)));
-	}
-
-	RValue<Pointer<Byte>> ConstantPointer(void const * ptr)
-	{
-		// Note: this should work for 32-bit pointers as well because 'inttoptr'
-		// is defined to truncate (and zero extend) if necessary.
-		auto ptrAsInt = ::llvm::ConstantInt::get(::llvm::Type::getInt64Ty(jit->context), reinterpret_cast<uintptr_t>(ptr));
-		return RValue<Pointer<Byte>>(V(jit->builder->CreateIntToPtr(ptrAsInt, T(Pointer<Byte>::getType()))));
-	}
-
-	RValue<Pointer<Byte>> ConstantData(void const * data, size_t size)
-	{
-		auto str = ::llvm::StringRef(reinterpret_cast<const char*>(data), size);
-		auto ptr = jit->builder->CreateGlobalStringPtr(str);
-		return RValue<Pointer<Byte>>(V(ptr));
-	}
-
-	Value* Call(RValue<Pointer<Byte>> fptr, Type* retTy, std::initializer_list<Value*> args, std::initializer_list<Type*> argTys)
-	{
-		::llvm::SmallVector<::llvm::Type*, 8> paramTys;
-		for (auto ty : argTys) { paramTys.push_back(T(ty)); }
-		auto funcTy = ::llvm::FunctionType::get(T(retTy), paramTys, false);
-
-		auto funcPtrTy = funcTy->getPointerTo();
-		auto funcPtr = jit->builder->CreatePointerCast(V(fptr.value), funcPtrTy);
-
-		::llvm::SmallVector<::llvm::Value*, 8> arguments;
-		for (auto arg : args) { arguments.push_back(V(arg)); }
-		return V(jit->builder->CreateCall(funcPtr, arguments));
-	}
-
-	void Breakpoint()
-	{
-		llvm::Function *debugtrap = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::debugtrap);
-
-		jit->builder->CreateCall(debugtrap);
+		*this = As<Int4>(d);
 	}
 }
 
-namespace rr
+Int4::Int4(RValue<SByte4> cast) : XYZW(this)
 {
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-	namespace x86
+	if(CPUID::supportsSSE4_1())
 	{
-		RValue<Int> cvtss2si(RValue<Float> val)
-		{
-			llvm::Function *cvtss2si = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_cvtss2si);
-
-			Float4 vector;
-			vector.x = val;
-
-			return RValue<Int>(V(jit->builder->CreateCall(cvtss2si, ARGS(V(RValue<Float4>(vector).value)))));
-		}
-
-		RValue<Int4> cvtps2dq(RValue<Float4> val)
-		{
-			llvm::Function *cvtps2dq = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_cvtps2dq);
-
-			return RValue<Int4>(V(jit->builder->CreateCall(cvtps2dq, ARGS(V(val.value)))));
-		}
-
-		RValue<Float> rcpss(RValue<Float> val)
-		{
-			llvm::Function *rcpss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rcp_ss);
-
-			Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
-
-			return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(rcpss, ARGS(V(vector)))), Float::getType(), 0));
-		}
-
-		RValue<Float> sqrtss(RValue<Float> val)
-		{
-			llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sqrt, {V(val.value)->getType()});
-			return RValue<Float>(V(jit->builder->CreateCall(sqrt, ARGS(V(val.value)))));
-		}
-
-		RValue<Float> rsqrtss(RValue<Float> val)
-		{
-			llvm::Function *rsqrtss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rsqrt_ss);
-
-			Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
-
-			return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(rsqrtss, ARGS(V(vector)))), Float::getType(), 0));
-		}
-
-		RValue<Float4> rcpps(RValue<Float4> val)
-		{
-			llvm::Function *rcpps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rcp_ps);
-
-			return RValue<Float4>(V(jit->builder->CreateCall(rcpps, ARGS(V(val.value)))));
-		}
-
-		RValue<Float4> sqrtps(RValue<Float4> val)
-		{
-			llvm::Function *sqrtps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sqrt, {V(val.value)->getType()});
-
-			return RValue<Float4>(V(jit->builder->CreateCall(sqrtps, ARGS(V(val.value)))));
-		}
-
-		RValue<Float4> rsqrtps(RValue<Float4> val)
-		{
-			llvm::Function *rsqrtps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rsqrt_ps);
-
-			return RValue<Float4>(V(jit->builder->CreateCall(rsqrtps, ARGS(V(val.value)))));
-		}
-
-		RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
-		{
-			llvm::Function *maxps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_max_ps);
-
-			return RValue<Float4>(V(jit->builder->CreateCall2(maxps, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
-		{
-			llvm::Function *minps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_min_ps);
-
-			return RValue<Float4>(V(jit->builder->CreateCall2(minps, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<Float> roundss(RValue<Float> val, unsigned char imm)
-		{
-			llvm::Function *roundss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ss);
-
-			Value *undef = V(llvm::UndefValue::get(T(Float4::getType())));
-			Value *vector = Nucleus::createInsertElement(undef, val.value, 0);
-
-			return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall3(roundss, ARGS(V(undef), V(vector), V(Nucleus::createConstantInt(imm))))), Float::getType(), 0));
-		}
-
-		RValue<Float> floorss(RValue<Float> val)
-		{
-			return roundss(val, 1);
-		}
-
-		RValue<Float> ceilss(RValue<Float> val)
-		{
-			return roundss(val, 2);
-		}
-
-		RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
-		{
-			llvm::Function *roundps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ps);
-
-			return RValue<Float4>(V(jit->builder->CreateCall2(roundps, ARGS(V(val.value), V(Nucleus::createConstantInt(imm))))));
-		}
-
-		RValue<Float4> floorps(RValue<Float4> val)
-		{
-			return roundps(val, 1);
-		}
-
-		RValue<Float4> ceilps(RValue<Float4> val)
-		{
-			return roundps(val, 2);
-		}
-
-		RValue<Int4> pabsd(RValue<Int4> x)
-		{
-			return RValue<Int4>(V(lowerPABS(V(x.value))));
-		}
-
-		RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
-		{
-			#if LLVM_VERSION_MAJOR >= 8
-				return As<Short4>(V(lowerPSADDSAT(V(x.value), V(y.value))));
-			#else
-				llvm::Function *paddsw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_padds_w);
-
-				return As<Short4>(V(jit->builder->CreateCall2(paddsw, ARGS(V(x.value), V(y.value)))));
-			#endif
-		}
-
-		RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
-		{
-			#if LLVM_VERSION_MAJOR >= 8
-				return As<Short4>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
-			#else
-				llvm::Function *psubsw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubs_w);
-
-				return As<Short4>(V(jit->builder->CreateCall2(psubsw, ARGS(V(x.value), V(y.value)))));
-			#endif
-		}
-
-		RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
-		{
-			#if LLVM_VERSION_MAJOR >= 8
-				return As<UShort4>(V(lowerPUADDSAT(V(x.value), V(y.value))));
-			#else
-				llvm::Function *paddusw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_paddus_w);
-
-				return As<UShort4>(V(jit->builder->CreateCall2(paddusw, ARGS(V(x.value), V(y.value)))));
-			#endif
-		}
-
-		RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
-		{
-			#if LLVM_VERSION_MAJOR >= 8
-				return As<UShort4>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
-			#else
-				llvm::Function *psubusw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubus_w);
-
-				return As<UShort4>(V(jit->builder->CreateCall2(psubusw, ARGS(V(x.value), V(y.value)))));
-			#endif
-		}
-
-		RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
-		{
-			#if LLVM_VERSION_MAJOR >= 8
-				return As<SByte8>(V(lowerPSADDSAT(V(x.value), V(y.value))));
-			#else
-				llvm::Function *paddsb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_padds_b);
-
-				return As<SByte8>(V(jit->builder->CreateCall2(paddsb, ARGS(V(x.value), V(y.value)))));
-			#endif
-		}
-
-		RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
-		{
-			#if LLVM_VERSION_MAJOR >= 8
-				return As<SByte8>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
-			#else
-				llvm::Function *psubsb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubs_b);
-
-				return As<SByte8>(V(jit->builder->CreateCall2(psubsb, ARGS(V(x.value), V(y.value)))));
-			#endif
-		}
-
-		RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
-		{
-			#if LLVM_VERSION_MAJOR >= 8
-				return As<Byte8>(V(lowerPUADDSAT(V(x.value), V(y.value))));
-			#else
-				llvm::Function *paddusb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_paddus_b);
-
-				return As<Byte8>(V(jit->builder->CreateCall2(paddusb, ARGS(V(x.value), V(y.value)))));
-			#endif
-		}
-
-		RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
-		{
-			#if LLVM_VERSION_MAJOR >= 8
-				return As<Byte8>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
-			#else
-				llvm::Function *psubusb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubus_b);
-
-				return As<Byte8>(V(jit->builder->CreateCall2(psubusb, ARGS(V(x.value), V(y.value)))));
-			#endif
-		}
-
-		RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
-		{
-			return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
-		}
-
-		RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
-		{
-			return As<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
-		}
-
-		RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
-		{
-			return As<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
-		}
-
-		RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
-		{
-			return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
-		}
-
-		RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
-		{
-			return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
-		}
-
-		RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
-		{
-			return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
-		}
-
-		RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
-		{
-			return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
-		}
-
-		RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
-		{
-			llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packssdw_128);
-
-			return As<Short4>(V(jit->builder->CreateCall2(packssdw, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
-		{
-			llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packssdw_128);
-
-			return RValue<Short8>(V(jit->builder->CreateCall2(packssdw, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
-		{
-			llvm::Function *packsswb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packsswb_128);
-
-			return As<SByte8>(V(jit->builder->CreateCall2(packsswb, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
-		{
-			llvm::Function *packuswb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packuswb_128);
-
-			return As<Byte8>(V(jit->builder->CreateCall2(packuswb, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
-		{
-			if(CPUID::supportsSSE4_1())
-			{
-				llvm::Function *packusdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_packusdw);
-
-				return RValue<UShort8>(V(jit->builder->CreateCall2(packusdw, ARGS(V(x.value), V(y.value)))));
-			}
-			else
-			{
-				RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
-				RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
-
-				return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
-			}
-		}
-
-		RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
-		{
-			llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_w);
-
-			return As<UShort4>(V(jit->builder->CreateCall2(psrlw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
-		{
-			llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_w);
-
-			return RValue<UShort8>(V(jit->builder->CreateCall2(psrlw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
-		{
-			llvm::Function *psraw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_w);
-
-			return As<Short4>(V(jit->builder->CreateCall2(psraw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
-		{
-			llvm::Function *psraw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_w);
-
-			return RValue<Short8>(V(jit->builder->CreateCall2(psraw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
-		{
-			llvm::Function *psllw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_w);
-
-			return As<Short4>(V(jit->builder->CreateCall2(psllw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
-		{
-			llvm::Function *psllw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_w);
-
-			return RValue<Short8>(V(jit->builder->CreateCall2(psllw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
-		{
-			llvm::Function *pslld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_d);
-
-			return As<Int2>(V(jit->builder->CreateCall2(pslld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
-		{
-			llvm::Function *pslld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_d);
-
-			return RValue<Int4>(V(jit->builder->CreateCall2(pslld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
-		{
-			llvm::Function *psrad = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_d);
-
-			return As<Int2>(V(jit->builder->CreateCall2(psrad, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
-		{
-			llvm::Function *psrad = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_d);
-
-			return RValue<Int4>(V(jit->builder->CreateCall2(psrad, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
-		{
-			llvm::Function *psrld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_d);
-
-			return As<UInt2>(V(jit->builder->CreateCall2(psrld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
-		{
-			llvm::Function *psrld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_d);
-
-			return RValue<UInt4>(V(jit->builder->CreateCall2(psrld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
-		{
-			return RValue<Int4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
-		}
-
-		RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
-		{
-			return RValue<Int4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
-		}
-
-		RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
-		{
-			return RValue<UInt4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_UGT)));
-		}
-
-		RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
-		{
-			return RValue<UInt4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_ULT)));
-		}
-
-		RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
-		{
-			llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulh_w);
-
-			return As<Short4>(V(jit->builder->CreateCall2(pmulhw, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
-		{
-			llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulhu_w);
-
-			return As<UShort4>(V(jit->builder->CreateCall2(pmulhuw, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
-		{
-			llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmadd_wd);
-
-			return As<Int2>(V(jit->builder->CreateCall2(pmaddwd, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
-		{
-			llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulh_w);
-
-			return RValue<Short8>(V(jit->builder->CreateCall2(pmulhw, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
-		{
-			llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulhu_w);
-
-			return RValue<UShort8>(V(jit->builder->CreateCall2(pmulhuw, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
-		{
-			llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmadd_wd);
-
-			return RValue<Int4>(V(jit->builder->CreateCall2(pmaddwd, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<Int> movmskps(RValue<Float4> x)
-		{
-			llvm::Function *movmskps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_movmsk_ps);
-
-			return RValue<Int>(V(jit->builder->CreateCall(movmskps, ARGS(V(x.value)))));
-		}
-
-		RValue<Int> pmovmskb(RValue<Byte8> x)
-		{
-			llvm::Function *pmovmskb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmovmskb_128);
-
-			return RValue<Int>(V(jit->builder->CreateCall(pmovmskb, ARGS(V(x.value))))) & 0xFF;
-		}
-
-		RValue<Int4> pmovzxbd(RValue<Byte16> x)
-		{
-			return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), false)));
-		}
-
-		RValue<Int4> pmovsxbd(RValue<SByte16> x)
-		{
-			return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), true)));
-		}
-
-		RValue<Int4> pmovzxwd(RValue<UShort8> x)
-		{
-			return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), false)));
-		}
-
-		RValue<Int4> pmovsxwd(RValue<Short8> x)
-		{
-			return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), true)));
-		}
+		*this = x86::pmovsxbd(As<SByte16>(cast));
 	}
+	else
+#endif
+	{
+		int swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
+		Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
+		Value *b = Nucleus::createShuffleVector(a, a, swizzle);
+
+		int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
+		Value *c = Nucleus::createBitCast(b, Short8::getType());
+		Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
+
+		*this = As<Int4>(d) >> 24;
+	}
+}
+
+Int4::Int4(RValue<Short4> cast) : XYZW(this)  // Builds an Int4 from the four 16-bit lanes of a Short4.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())  // Runtime CPU check; this branch only exists in x86 builds.
+	{
+		*this = x86::pmovsxwd(As<Short8>(cast));
+	}
+	else  // When the #if region is compiled out, the block below runs unconditionally.
+#endif
+	{
+		int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};  // Duplicate each of the first four 16-bit lanes.
+		Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);
+		*this = As<Int4>(c) >> 16;  // Keep the upper copy of each pair; Int4 >> maps to psrad / lowerVectorAShr, so this presumably sign-extends.
+	}
+}
+
+Int4::Int4(RValue<UShort4> cast) : XYZW(this)  // Builds an Int4 from the four 16-bit lanes of a UShort4.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())  // Runtime CPU check; this branch only exists in x86 builds.
+	{
+		*this = x86::pmovzxwd(As<UShort8>(cast));
+	}
+	else  // When the #if region is compiled out, the block below runs unconditionally.
+#endif
+	{
+		int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};  // Alternate lanes of 'cast' with lanes of the second operand (assuming indices >= 8 select from it, as in LLVM shufflevector).
+		Value *c = Nucleus::createShuffleVector(cast.value, Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);  // Second operand is an all-zero Short8.
+		*this = As<Int4>(c);  // Reinterpret the interleaved 16-bit pairs as four 32-bit lanes.
+	}
+}
+
+Int4::Int4(RValue<Int> rhs) : XYZW(this)  // Broadcasts the scalar rhs into all four lanes.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *vector = loadValue();  // Current value of this variable, used only as the insertion target.
+	Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);  // Put rhs into lane 0.
+
+	int swizzle[4] = {0, 0, 0, 0};  // Every output lane reads lane 0.
+	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
+
+	storeValue(replicate);
+}
+
+RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)  // Shifts all lanes of lhs left by the same immediate count rhs.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::pslld(lhs, rhs);  // x86 builds use the pslld wrapper.
+#else
+	return As<Int4>(V(lowerVectorShl(V(lhs.value), rhs)));  // Other targets use the generic lowering helper.
+#endif
+}
+
+RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)  // Shifts all lanes of lhs right by the same immediate count rhs.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::psrad(lhs, rhs);  // x86 builds use the psrad wrapper.
+#else
+	return As<Int4>(V(lowerVectorAShr(V(lhs.value), rhs)));  // Other targets use the AShr (arithmetic, per its name) lowering helper.
+#endif
+}
+
+RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)  // Per-lane x == y, returned as an Int4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));  // Comparison result is sign-extended to Int4's type.
+}
+
+RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)  // Per-lane signed x < y, returned as an Int4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType()));  // Comparison result is sign-extended to Int4's type.
+}
+
+RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)  // Per-lane signed x <= y, returned as an Int4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value, y.value), Int4::getType()));  // Comparison result is sign-extended to Int4's type.
+}
+
+RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)  // Per-lane x != y, returned as an Int4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));  // Comparison result is sign-extended to Int4's type.
+}
+
+RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)  // Per-lane "not less than", implemented as signed x >= y; returned as an Int4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value, y.value), Int4::getType()));  // Comparison result is sign-extended to Int4's type.
+}
+
+RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)  // Per-lane "not less or equal", implemented as signed x > y; returned as an Int4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType()));  // Comparison result is sign-extended to Int4's type.
+}
+
+RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)  // Per-lane signed maximum of x and y.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())  // Runtime CPU check; this branch only exists in x86 builds.
+	{
+		return x86::pmaxsd(x, y);
+	}
+	else  // When the #if region is compiled out, the block below runs unconditionally.
+#endif
+	{
+		RValue<Int4> greater = CmpNLE(x, y);  // Mask of lanes where x > y.
+		return (x & greater) | (y & ~greater);  // Take x where the mask is set and y elsewhere.
+	}
+}
+
+RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)  // Per-lane signed minimum of x and y.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())  // Runtime CPU check; this branch only exists in x86 builds.
+	{
+		return x86::pminsd(x, y);
+	}
+	else  // When the #if region is compiled out, the block below runs unconditionally.
+#endif
+	{
+		RValue<Int4> less = CmpLT(x, y);  // Mask of lanes where x < y.
+		return (x & less) | (y & ~less);  // Take x where the mask is set and y elsewhere.
+	}
+}
+
+RValue<Int4> RoundInt(RValue<Float4> cast)  // Converts a Float4 to an Int4; the rounding behaviour is whatever the helpers below implement.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::cvtps2dq(cast);  // x86 builds use the cvtps2dq wrapper.
+#else
+	return As<Int4>(V(lowerRoundInt(V(cast.value), T(Int4::getType()))));  // Other targets use the generic lowering helper.
+#endif
+}
+
+RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)  // Int4 overload of MulHigh; all work is delegated to lowerMulHigh.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
+	return As<Int4>(V(lowerMulHigh(V(x.value), V(y.value), true)));  // 'true' presumably selects the signed variant (the UInt4 overload passes false) — confirm in lowerMulHigh.
+}
+
+RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)  // UInt4 overload of MulHigh; all work is delegated to lowerMulHigh.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
+	return As<UInt4>(V(lowerMulHigh(V(x.value), V(y.value), false)));  // 'false' presumably selects the unsigned variant (the Int4 overload passes true) — confirm in lowerMulHigh.
+}
+
+RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)  // Packs the eight lanes of x and y into a single Short8.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::packssdw(x, y);  // x86 builds use the packssdw wrapper.
+#else
+	return As<Short8>(V(lowerPack(V(x.value), V(y.value), true)));  // Other targets use lowerPack; 'true' presumably means signed — confirm.
+#endif
+}
+
+RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)  // Packs the eight lanes of x and y into a single UShort8.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::packusdw(x, y);  // x86 builds use the packusdw wrapper.
+#else
+	return As<UShort8>(V(lowerPack(V(x.value), V(y.value), false)));  // Other targets use lowerPack; 'false' presumably means unsigned — confirm.
+#endif
+}
+
+RValue<Int> SignMask(RValue<Int4> x)  // Reduces an Int4 to a scalar Int; going by the helper names, it collects the lanes' sign bits.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::movmskps(As<Float4>(x));  // x86: reinterpret as Float4 so the movmskps wrapper can be reused.
+#else
+	return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));  // Other targets use the generic lowering helper.
+#endif
+}
+
+Type *Int4::getType()  // Returns the type used for Int4 values.
+{
+	return T(llvm::VectorType::get(T(Int::getType()), 4));  // LLVM vector of 4 elements of Int's type.
+}
+
+UInt4::UInt4(RValue<Float4> cast) : XYZW(this)  // Builds a UInt4 by converting each float lane to an unsigned integer.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *xyzw = Nucleus::createFPToUI(cast.value, UInt4::getType());  // Float-to-unsigned conversion of the whole vector.
+	storeValue(xyzw);
+}
+
+UInt4::UInt4(RValue<UInt> rhs) : XYZW(this)  // Broadcasts the scalar rhs into all four lanes.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *vector = loadValue();  // Current value of this variable, used only as the insertion target.
+	Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);  // Put rhs into lane 0.
+
+	int swizzle[4] = {0, 0, 0, 0};  // Every output lane reads lane 0.
+	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
+
+	storeValue(replicate);
+}
+
+RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)  // Shifts all lanes of lhs left by the same immediate count rhs.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));  // x86: reuse the Int4 pslld wrapper through As<> casts.
+#else
+	return As<UInt4>(V(lowerVectorShl(V(lhs.value), rhs)));  // Other targets use the generic lowering helper.
+#endif
+}
+
+RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)  // Shifts all lanes of lhs right by the same immediate count rhs.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::psrld(lhs, rhs);  // x86 builds use the psrld wrapper.
+#else
+	return As<UInt4>(V(lowerVectorLShr(V(lhs.value), rhs)));  // Other targets use the LShr (logical, per its name) lowering helper.
+#endif
+}
+
+RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)  // Per-lane x == y, returned as a UInt4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));  // NOTE(review): extends to Int4::getType(), not UInt4::getType() — presumably the same LLVM type; confirm.
+}
+
+RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)  // Per-lane unsigned x < y, returned as a UInt4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType()));  // NOTE(review): extends to Int4::getType(), not UInt4::getType() — presumably the same LLVM type; confirm.
+}
+
+RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)  // Per-lane unsigned x <= y, returned as a UInt4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value, y.value), Int4::getType()));  // NOTE(review): extends to Int4::getType(), not UInt4::getType() — presumably the same LLVM type; confirm.
+}
+
+RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)  // Per-lane x != y, returned as a UInt4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));  // NOTE(review): extends to Int4::getType(), not UInt4::getType() — presumably the same LLVM type; confirm.
+}
+
+RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)  // Per-lane "not less than", implemented as unsigned x >= y; returned as a UInt4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value, y.value), Int4::getType()));  // NOTE(review): extends to Int4::getType(), not UInt4::getType() — presumably the same LLVM type; confirm.
+}
+
+RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)  // Per-lane "not less or equal", implemented as unsigned x > y; returned as a UInt4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType()));  // NOTE(review): extends to Int4::getType(), not UInt4::getType() — presumably the same LLVM type; confirm.
+}
+
+RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)  // Per-lane unsigned maximum of x and y.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())  // Runtime CPU check; this branch only exists in x86 builds.
+	{
+		return x86::pmaxud(x, y);
+	}
+	else  // When the #if region is compiled out, the block below runs unconditionally.
+#endif
+	{
+		RValue<UInt4> greater = CmpNLE(x, y);  // Mask of lanes where x > y (unsigned).
+		return (x & greater) | (y & ~greater);  // Take x where the mask is set and y elsewhere.
+	}
+}
+
+RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)  // Per-lane unsigned minimum of x and y.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())  // Runtime CPU check; this branch only exists in x86 builds.
+	{
+		return x86::pminud(x, y);
+	}
+	else  // When the #if region is compiled out, the block below runs unconditionally.
+#endif
+	{
+		RValue<UInt4> less = CmpLT(x, y);  // Mask of lanes where x < y (unsigned).
+		return (x & less) | (y & ~less);  // Take x where the mask is set and y elsewhere.
+	}
+}
+
+Type *UInt4::getType()  // Returns the type used for UInt4 values.
+{
+	return T(llvm::VectorType::get(T(UInt::getType()), 4));  // LLVM vector of 4 elements of UInt's type.
+}
+
+Type *Half::getType()  // Returns the type used for Half values.
+{
+	return T(llvm::Type::getInt16Ty(jit->context));  // Represented as LLVM's 16-bit integer type, not an LLVM floating-point type.
+}
+
+RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)  // Approximate reciprocal of x; exactAtPow2 requests the correction described below (only consulted on x86).
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(exactAtPow2)
+	{
+		// rcpss uses a piecewise-linear approximation which minimizes the relative error
+		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+		return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));  // The scale factor 1 / rcp(1.0f) is evaluated on the host when this code is generated.
+	}
+	return x86::rcpss(x);  // Uncorrected approximation.
+#else
+	return As<Float>(V(lowerRCP(V(x.value))));  // Other targets use the generic lowering helper; exactAtPow2 is unused on this path.
+#endif
+}
+
+RValue<Float> RcpSqrt_pp(RValue<Float> x)  // Reciprocal square root of x, per the helper names; precision is whatever those helpers provide.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::rsqrtss(x);  // x86 builds use the rsqrtss wrapper.
+#else
+	return As<Float>(V(lowerRSQRT(V(x.value))));  // Other targets use the generic lowering helper.
+#endif
+}
+
+RValue<Float> Sqrt(RValue<Float> x)  // Square root of a scalar Float.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::sqrtss(x);  // x86 builds use the sqrtss wrapper.
+#else
+	return As<Float>(V(lowerSQRT(V(x.value))));  // Other targets use the generic lowering helper.
+#endif
+}
+
+RValue<Float> Round(RValue<Float> x)  // Rounds a scalar Float to an integral value.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())  // Runtime CPU check.
+	{
+		return x86::roundss(x, 0);  // Immediate 0 is the rounding-control value passed to roundss (Trunc passes 3).
+	}
+	else
+	{
+		return Float4(Round(Float4(x))).x;  // No SSE4.1: broadcast x, call the Float4 overload, take lane x.
+	}
+#else
+	return RValue<Float>(V(lowerRound(V(x.value))));  // Other targets use the generic lowering helper.
+#endif
+}
+
+RValue<Float> Trunc(RValue<Float> x)  // Truncates a scalar Float toward zero.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())  // Runtime CPU check.
+	{
+		return x86::roundss(x, 3);  // Immediate 3 is the rounding-control value passed to roundss (Round passes 0).
+	}
+	else
+	{
+		return Float(Int(x));   // Rounded toward zero
+	}
+#else
+	return RValue<Float>(V(lowerTrunc(V(x.value))));  // Other targets use the generic lowering helper.
+#endif
+}
+
+RValue<Float> Frac(RValue<Float> x)  // Fractional part of x, computed as x minus its floor.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())  // Runtime CPU check.
+	{
+		return x - x86::floorss(x);  // NOTE(review): unlike the non-x86 path below, this result is not clamped below 1.0 — confirm that is intended.
+	}
+	else
+	{
+		return Float4(Frac(Float4(x))).x;  // No SSE4.1: broadcast x, call the Float4 overload, take lane x.
+	}
+#else
+	// x - floor(x) can be 1.0 for very small negative x.
+	// Clamp against the value just below 1.0.
+	return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));  // 0x3F7FFFFF is that clamp value's bit pattern, reinterpreted as a Float.
+#endif
+}
+
+RValue<Float> Floor(RValue<Float> x)  // Floor of a scalar Float.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())  // Runtime CPU check.
+	{
+		return x86::floorss(x);
+	}
+	else
+	{
+		return Float4(Floor(Float4(x))).x;  // No SSE4.1: broadcast x, call the Float4 overload, take lane x.
+	}
+#else
+	return RValue<Float>(V(lowerFloor(V(x.value))));  // Other targets use the generic lowering helper.
+#endif
+}
+
+RValue<Float> Ceil(RValue<Float> x)  // Ceiling of a scalar Float.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())  // Runtime CPU check; this branch only exists in x86 builds.
+	{
+		return x86::ceilss(x);
+	}
+	else  // Unlike Floor/Round/Trunc there is no separate non-x86 lowering: those builds also take the block below.
+#endif
+	{
+		return Float4(Ceil(Float4(x))).x;  // Broadcast x, call the Float4 overload, take lane x.
+	}
+}
+
+Type *Float::getType()  // Returns the type used for Float values.
+{
+	return T(llvm::Type::getFloatTy(jit->context));  // LLVM's float type in the current JIT context.
+}
+
+Type *Float2::getType()  // Returns the type used for Float2 values.
+{
+	return T(Type_v2f32);  // Type_v2f32 is defined elsewhere; presumably a 2 x float vector tag — confirm.
+}
+
+RValue<Float> Exp2(RValue<Float> v)  // Emits a call to LLVM's exp2 intrinsic for a scalar Float.
+{  // NOTE(review): no RR_DEBUG_INFO_UPDATE_LOC() here, unlike most neighbouring functions.
+	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float::getType()) } );  // Intrinsic declaration specialised on Float's type.
+	return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value))));
+}
+
+RValue<Float> Log2(RValue<Float> v)  // Emits a call to LLVM's log2 intrinsic for a scalar Float.
+{  // NOTE(review): no RR_DEBUG_INFO_UPDATE_LOC() here, unlike most neighbouring functions.
+	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float::getType()) } );  // Intrinsic declaration specialised on Float's type.
+	return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value))));
+}
+
+Float4::Float4(RValue<Float> rhs) : XYZW(this)  // Broadcasts the scalar rhs into all four lanes.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *vector = loadValue();  // Current value of this variable, used only as the insertion target.
+	Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);  // Put rhs into lane 0.
+
+	int swizzle[4] = {0, 0, 0, 0};  // Every output lane reads lane 0.
+	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
+
+	storeValue(replicate);
+}
+
+RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)   // Per-lane maximum of x and y.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::maxps(x, y);   // x86 builds: SSE max_ps intrinsic.
+#else
+	return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OGT)));   // Other targets: generic lowering driven by an ordered greater-than predicate.
+#endif
+}
+
+RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)   // Per-lane minimum of x and y.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::minps(x, y);   // x86 builds: SSE min_ps intrinsic.
+#else
+	return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OLT)));   // Other targets: generic lowering driven by an ordered less-than predicate.
+#endif
+}
+
+RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)   // Per-lane reciprocal; on x86 this is the rcpps approximation.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(!exactAtPow2)
+	{
+		return x86::rcpps(x);
+	}
+	// rcpps uses a piecewise-linear approximation which minimizes the relative error
+	// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+	return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+#else
+	return As<Float4>(V(lowerRCP(V(x.value))));
+#endif
+}
+
+RValue<Float4> RcpSqrt_pp(RValue<Float4> x)   // Per-lane reciprocal square root.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::rsqrtps(x);   // x86 builds: SSE rsqrt_ps intrinsic.
+#else
+	return As<Float4>(V(lowerRSQRT(V(x.value))));   // Other targets: generic lowering helper.
+#endif
+}
+
+RValue<Float4> Sqrt(RValue<Float4> x)   // Per-lane square root.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::sqrtps(x);   // x86 builds: x86::sqrtps, which itself emits the generic llvm.sqrt intrinsic.
+#else
+	return As<Float4>(V(lowerSQRT(V(x.value))));   // Other targets: generic lowering helper.
+#endif
+}
+
+RValue<Int> SignMask(RValue<Float4> x)   // Packs per-lane sign information of x into an Int.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::movmskps(x);   // x86 builds: SSE movmsk_ps intrinsic.
+#else
+	return As<Int>(V(lowerFPSignMask(V(x.value), T(Int::getType()))));   // Other targets: generic lowering producing an Int-typed result.
+#endif
+}
+
+RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)   // Per-lane ordered x == y; the compare result is sign-extended, so true lanes become all ones.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+//	return As<Int4>(x86::cmpeqps(x, y));
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)   // Per-lane ordered x < y; the compare result is sign-extended, so true lanes become all ones.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+//	return As<Int4>(x86::cmpltps(x, y));
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)   // Per-lane ordered x <= y; the compare result is sign-extended, so true lanes become all ones.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+//	return As<Int4>(x86::cmpleps(x, y));
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)   // Per-lane ordered x != y (FCmpONE), sign-extended to an Int4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+//	return As<Int4>(x86::cmpneqps(x, y));   // NOTE(review): the live code below is an ordered compare; confirm NaN handling matches this instruction.
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)   // "Not less than", implemented as per-lane ordered x >= y (FCmpOGE), sign-extended to an Int4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+//	return As<Int4>(x86::cmpnltps(x, y));   // NOTE(review): the live code below is an ordered compare; confirm NaN handling matches this instruction.
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)   // "Not less or equal", implemented as per-lane ordered x > y (FCmpOGT), sign-extended to an Int4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+//	return As<Int4>(x86::cmpnleps(x, y));   // NOTE(review): the live code below is an ordered compare; confirm NaN handling matches this instruction.
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)   // Per-lane unordered-or-equal compare (FCmpUEQ), sign-extended to an Int4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)   // Per-lane unordered-or-less-than compare (FCmpULT), sign-extended to an Int4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)   // Per-lane unordered-or-less-equal compare (FCmpULE), sign-extended to an Int4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)   // Per-lane unordered-or-not-equal compare (FCmpUNE), sign-extended to an Int4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)   // "Not less than", unordered form: per-lane FCmpUGE, sign-extended to an Int4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)   // "Not less or equal", unordered form: per-lane FCmpUGT, sign-extended to an Int4 mask.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value, y.value), Int4::getType()));
+}
+
+RValue<Float4> Round(RValue<Float4> x)   // Rounds each lane of x to an integral value.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	// Pre-SSE4.1 fallback: convert to integers with RoundInt() and back to float.
+	if(!CPUID::supportsSSE4_1())
+	{
+		return Float4(RoundInt(x));
+	}
+
+	// SSE4.1: roundps with immediate 0.
+	return x86::roundps(x, 0);
+#else
+	return RValue<Float4>(V(lowerRound(V(x.value))));
+#endif
+}
+
+RValue<Float4> Trunc(RValue<Float4> x)   // Truncates each lane of x to an integral value.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	// Pre-SSE4.1 fallback: round-trip through Int4.
+	if(!CPUID::supportsSSE4_1())
+	{
+		return Float4(Int4(x));
+	}
+
+	// SSE4.1: roundps with immediate 3.
+	return x86::roundps(x, 3);
+#else
+	return RValue<Float4>(V(lowerTrunc(V(x.value))));
+#endif
+}
+
+RValue<Float4> Frac(RValue<Float4> x)   // Per-lane fractional part of x, clamped to just below 1.0.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Float4 fraction;
+
+#if defined(__i386__) || defined(__x86_64__)
+	if(!CPUID::supportsSSE4_1())
+	{
+		fraction = x - Float4(Int4(x));   // Signed fractional part.
+
+		fraction += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), fraction)) & As<Int4>(Float4(1.0f)));   // Add 1.0 if negative.
+	}
+	else
+	{
+		fraction = x - Floor(x);
+	}
+#else
+	fraction = x - Floor(x);
+#endif
+
+	// x - floor(x) can be 1.0 for very small negative x.
+	// Clamp against the value just below 1.0.
+	return Min(fraction, As<Float4>(Int4(0x3F7FFFFF)));
+}
+
+RValue<Float4> Floor(RValue<Float4> x)   // Per-lane floor of x.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	// Pre-SSE4.1 fallback: subtract the fractional part.
+	if(!CPUID::supportsSSE4_1())
+	{
+		return x - Frac(x);
+	}
+
+	// SSE4.1: floorps is roundps with immediate 1.
+	return x86::floorps(x);
+#else
+	return RValue<Float4>(V(lowerFloor(V(x.value))));
+#endif
+}
+
+RValue<Float4> Ceil(RValue<Float4> x)   // Per-lane ceiling of x.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		return x86::ceilps(x);   // roundps with immediate 2.
+	}
+#endif
+
+	// Reached on non-x86 targets and on x86 CPUs lacking SSE4.1:
+	// ceil(x) is computed as -floor(-x).
+	return -Floor(-x);
+}
+
+RValue<Float4> Sin(RValue<Float4> v)   // Emits the llvm.sin intrinsic, specialized on the LLVM type of v.
+{
+	llvm::Function *sinIntrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sin, { V(v.value)->getType() });
+	return RValue<Float4>(V(jit->builder->CreateCall(sinIntrinsic, V(v.value))));
+}
+
+RValue<Float4> Cos(RValue<Float4> v)   // Emits the llvm.cos intrinsic, specialized on the LLVM type of v.
+{
+	llvm::Function *cosIntrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cos, { V(v.value)->getType() });
+	return RValue<Float4>(V(jit->builder->CreateCall(cosIntrinsic, V(v.value))));
+}
+
+RValue<Float4> Tan(RValue<Float4> v)   // Per-lane tangent, computed as Sin(v) / Cos(v).
+{
+	return Sin(v) / Cos(v);
+}
+
+static RValue<Float4> TransformFloat4PerElement(RValue<Float4> v, const char* name)   // Calls the float(float) function called `name` on each of the four lanes of v.
+{
+	llvm::Type *floatTy = T(Float::getType());
+	auto scalarFn = jit->module->getOrInsertFunction(name, ::llvm::FunctionType::get(floatTy, ::llvm::ArrayRef<llvm::Type*>(floatTy), false));
+	llvm::Value *result = ::llvm::UndefValue::get(T(Float4::getType()));
+	for (int lane = 0; lane < 4; lane++)
+	{
+		auto transformed = jit->builder->CreateCall(scalarFn, V(Nucleus::createExtractElement(v.value, Float::getType(), lane)));
+		result = V(Nucleus::createInsertElement(V(result), V(transformed), lane));
+	}
+	return RValue<Float4>(V(result));
+}
+
+RValue<Float4> Asin(RValue<Float4> v)   // Per-lane arcsine: calls the external "asinf" on each element.
+{
+	return TransformFloat4PerElement(v, "asinf");
+}
+
+RValue<Float4> Acos(RValue<Float4> v)   // Per-lane arccosine: calls the external "acosf" on each element.
+{
+	return TransformFloat4PerElement(v, "acosf");
+}
+
+RValue<Float4> Atan(RValue<Float4> v)   // Per-lane arctangent: calls the external "atanf" on each element.
+{
+	return TransformFloat4PerElement(v, "atanf");
+}
+
+RValue<Float4> Sinh(RValue<Float4> v)   // Per-lane hyperbolic sine, built from Exp(): 0.5 * (e^v - e^-v).
+{
+	return Float4(0.5f) * (Exp(v) - Exp(-v));
+}
+
+RValue<Float4> Cosh(RValue<Float4> v)   // Per-lane hyperbolic cosine, built from Exp(): 0.5 * (e^v + e^-v).
+{
+	return Float4(0.5f) * (Exp(v) + Exp(-v));
+}
+
+RValue<Float4> Tanh(RValue<Float4> v)   // Per-lane hyperbolic tangent: calls the external "tanhf" on each element.
+{
+	return TransformFloat4PerElement(v, "tanhf");
+}
+
+RValue<Float4> Asinh(RValue<Float4> v)   // Per-lane inverse hyperbolic sine: calls the external "asinhf" on each element.
+{
+	return TransformFloat4PerElement(v, "asinhf");
+}
+
+RValue<Float4> Acosh(RValue<Float4> v)   // Per-lane inverse hyperbolic cosine: calls the external "acoshf" on each element.
+{
+	return TransformFloat4PerElement(v, "acoshf");
+}
+
+RValue<Float4> Atanh(RValue<Float4> v)   // Per-lane inverse hyperbolic tangent: calls the external "atanhf" on each element.
+{
+	return TransformFloat4PerElement(v, "atanhf");
+}
+
+RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)   // For each lane i, calls atan2f(x[i], y[i]) - arguments are passed in this order.
+{
+	llvm::Type *floatTy = T(Float::getType());
+	::llvm::SmallVector<::llvm::Type*, 2> argTypes;
+	argTypes.push_back(floatTy);
+	argTypes.push_back(floatTy);
+	auto atan2Fn = jit->module->getOrInsertFunction("atan2f", ::llvm::FunctionType::get(floatTy, argTypes, false));
+	llvm::Value *result = ::llvm::UndefValue::get(T(Float4::getType()));
+	for (int lane = 0; lane < 4; lane++)
+	{
+		auto xLane = V(Nucleus::createExtractElement(x.value, Float::getType(), lane));
+		auto yLane = V(Nucleus::createExtractElement(y.value, Float::getType(), lane));
+		auto angle = jit->builder->CreateCall2(atan2Fn, ARGS(xLane, yLane));
+
+		result = V(Nucleus::createInsertElement(V(result), V(angle), lane));
+	}
+	return RValue<Float4>(V(result));
+}
+
+RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)   // Emits llvm.pow(x, y) specialized for the Float4 type.
+{
+	llvm::Function *powIntrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::pow, { T(Float4::getType()) });
+	return RValue<Float4>(V(jit->builder->CreateCall2(powIntrinsic, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Float4> Exp(RValue<Float4> v)   // Emits llvm.exp specialized for the Float4 type.
+{
+	llvm::Function *expIntrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp, { T(Float4::getType()) });
+	return RValue<Float4>(V(jit->builder->CreateCall(expIntrinsic, V(v.value))));
+}
+
+RValue<Float4> Log(RValue<Float4> v)   // Emits llvm.log specialized for the Float4 type.
+{
+	llvm::Function *logIntrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log, { T(Float4::getType()) });
+	return RValue<Float4>(V(jit->builder->CreateCall(logIntrinsic, V(v.value))));
+}
+
+RValue<Float4> Exp2(RValue<Float4> v)   // Emits llvm.exp2 specialized for the Float4 type.
+{
+	llvm::Function *exp2Intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float4::getType()) });
+	return RValue<Float4>(V(jit->builder->CreateCall(exp2Intrinsic, V(v.value))));
+}
+
+RValue<Float4> Log2(RValue<Float4> v)   // Emits llvm.log2 specialized for the Float4 type.
+{
+	llvm::Function *log2Intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float4::getType()) });
+	return RValue<Float4>(V(jit->builder->CreateCall(log2Intrinsic, V(v.value))));
+}
+
+RValue<UInt> Ctlz(RValue<UInt> v, bool isZeroUndef)   // Emits llvm.ctlz on a scalar UInt.
+{
+	llvm::Function *ctlzIntrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt::getType()) });
+	// isZeroUndef is forwarded as the intrinsic's second operand, an i1 constant.
+	llvm::ConstantInt *zeroUndefFlag = isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context);
+	llvm::Value *count = jit->builder->CreateCall2(ctlzIntrinsic, ARGS(V(v.value), zeroUndefFlag));
+	return RValue<UInt>(V(count));
+}
+
+RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef)   // Emits llvm.ctlz on a UInt4 vector.
+{
+	llvm::Function *ctlzIntrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt4::getType()) });
+	// isZeroUndef is forwarded as the intrinsic's second operand, an i1 constant.
+	llvm::ConstantInt *zeroUndefFlag = isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context);
+	llvm::Value *counts = jit->builder->CreateCall2(ctlzIntrinsic, ARGS(V(v.value), zeroUndefFlag));
+	return RValue<UInt4>(V(counts));
+}
+
+RValue<UInt> Cttz(RValue<UInt> v, bool isZeroUndef)   // Emits llvm.cttz on a scalar UInt.
+{
+	llvm::Function *cttzIntrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt::getType()) });
+	// isZeroUndef is forwarded as the intrinsic's second operand, an i1 constant.
+	llvm::ConstantInt *zeroUndefFlag = isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context);
+	llvm::Value *count = jit->builder->CreateCall2(cttzIntrinsic, ARGS(V(v.value), zeroUndefFlag));
+	return RValue<UInt>(V(count));
+}
+
+RValue<UInt4> Cttz(RValue<UInt4> v, bool isZeroUndef)   // Emits llvm.cttz on a UInt4 vector.
+{
+	llvm::Function *cttzIntrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt4::getType()) });
+	// isZeroUndef is forwarded as the intrinsic's second operand, an i1 constant.
+	llvm::ConstantInt *zeroUndefFlag = isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context);
+	llvm::Value *counts = jit->builder->CreateCall2(cttzIntrinsic, ARGS(V(v.value), zeroUndefFlag));
+	return RValue<UInt4>(V(counts));
+}
+
+Type *Float4::getType()   // Reactor type handle for Float4.
+{
+	return T(llvm::VectorType::get(T(Float::getType()), 4));   // LLVM vector of four elements of the Float type.
+}
+
+RValue<Long> Ticks()   // Emits a call to llvm.readcyclecounter and returns its result as a Long.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+
+	llvm::Function *cycleCounter = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::readcyclecounter);
+	return RValue<Long>(V(jit->builder->CreateCall(cycleCounter)));
+}
+
+RValue<Pointer<Byte>> ConstantPointer(void const * ptr)   // Embeds the host address ptr in the generated code as a Pointer<Byte> constant.
+{
+	// Note: this should work for 32-bit pointers as well because 'inttoptr'
+	// is defined to truncate (and zero extend) if necessary.
+	auto *i64Ty = ::llvm::Type::getInt64Ty(jit->context);
+	return RValue<Pointer<Byte>>(V(jit->builder->CreateIntToPtr(::llvm::ConstantInt::get(i64Ty, reinterpret_cast<uintptr_t>(ptr)), T(Pointer<Byte>::getType()))));
+}
+
+RValue<Pointer<Byte>> ConstantData(void const * data, size_t size)   // Emits the size bytes at data as a global string and returns a pointer to it.
+{
+	const char *bytes = reinterpret_cast<const char*>(data);
+	auto global = jit->builder->CreateGlobalStringPtr(::llvm::StringRef(bytes, size));
+	return RValue<Pointer<Byte>>(V(global));
+}
+
+Value* Call(RValue<Pointer<Byte>> fptr, Type* retTy, std::initializer_list<Value*> args, std::initializer_list<Type*> argTys)   // Emits an indirect call through fptr and returns the call's value.
+{
+	// Build the (non-variadic) function type from the return and argument types.
+	::llvm::SmallVector<::llvm::Type*, 8> llvmArgTys;
+	for (Type *argTy : argTys) { llvmArgTys.push_back(T(argTy)); }
+	auto signature = ::llvm::FunctionType::get(T(retTy), llvmArgTys, false);
+
+	// Cast the byte pointer to a pointer to that function type, then call it.
+	auto callee = jit->builder->CreatePointerCast(V(fptr.value), signature->getPointerTo());
+	::llvm::SmallVector<::llvm::Value*, 8> llvmArgs;
+	for (Value *arg : args) { llvmArgs.push_back(V(arg)); }
+	return V(jit->builder->CreateCall(callee, llvmArgs));
+}
+
+void Breakpoint()   // Emits a call to the llvm.debugtrap intrinsic.
+{
+	// The call takes no arguments and its result is not used.
+	llvm::Function *trap = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::debugtrap);
+
+	jit->builder->CreateCall(trap);
+}
+
+}  // namespace rr
+
+namespace rr {
+
+#if defined(__i386__) || defined(__x86_64__)
+namespace x86 {
+
+RValue<Int> cvtss2si(RValue<Float> val)   // Calls x86_sse_cvtss2si on a Float4 whose x lane holds val.
+{
+	llvm::Function *cvtss2si = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_cvtss2si);
+
+	Float4 vector;
+	vector.x = val;
+
+	return RValue<Int>(V(jit->builder->CreateCall(cvtss2si, ARGS(V(RValue<Float4>(vector).value)))));
+}
+
+RValue<Int4> cvtps2dq(RValue<Float4> val)   // Calls x86_sse2_cvtps2dq on val.
+{
+	llvm::Function *cvtps2dq = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_cvtps2dq);
+
+	return RValue<Int4>(V(jit->builder->CreateCall(cvtps2dq, ARGS(V(val.value)))));
+}
+
+RValue<Float> rcpss(RValue<Float> val)   // Calls x86_sse_rcp_ss with val in lane 0 of an undef vector; returns lane 0.
+{
+	llvm::Function *rcpss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rcp_ss);
+
+	Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
+
+	return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(rcpss, ARGS(V(vector)))), Float::getType(), 0));
+}
+
+RValue<Float> sqrtss(RValue<Float> val)   // Uses the generic llvm.sqrt intrinsic specialized on val's type.
+{
+	llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sqrt, {V(val.value)->getType()});
+	return RValue<Float>(V(jit->builder->CreateCall(sqrt, ARGS(V(val.value)))));
+}
+
+RValue<Float> rsqrtss(RValue<Float> val)   // Calls x86_sse_rsqrt_ss with val in lane 0 of an undef vector; returns lane 0.
+{
+	llvm::Function *rsqrtss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rsqrt_ss);
+
+	Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
+
+	return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(rsqrtss, ARGS(V(vector)))), Float::getType(), 0));
+}
+
+RValue<Float4> rcpps(RValue<Float4> val)   // Calls x86_sse_rcp_ps on val.
+{
+	llvm::Function *rcpps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rcp_ps);
+
+	return RValue<Float4>(V(jit->builder->CreateCall(rcpps, ARGS(V(val.value)))));
+}
+
+RValue<Float4> sqrtps(RValue<Float4> val)   // Uses the generic llvm.sqrt intrinsic specialized on val's vector type.
+{
+	llvm::Function *sqrtps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sqrt, {V(val.value)->getType()});
+
+	return RValue<Float4>(V(jit->builder->CreateCall(sqrtps, ARGS(V(val.value)))));
+}
+
+RValue<Float4> rsqrtps(RValue<Float4> val)   // Calls x86_sse_rsqrt_ps on val.
+{
+	llvm::Function *rsqrtps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rsqrt_ps);
+
+	return RValue<Float4>(V(jit->builder->CreateCall(rsqrtps, ARGS(V(val.value)))));
+}
+
+RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)   // Calls x86_sse_max_ps(x, y).
+{
+	llvm::Function *maxps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_max_ps);
+
+	return RValue<Float4>(V(jit->builder->CreateCall2(maxps, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)   // Calls x86_sse_min_ps(x, y).
+{
+	llvm::Function *minps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_min_ps);
+
+	return RValue<Float4>(V(jit->builder->CreateCall2(minps, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Float> roundss(RValue<Float> val, unsigned char imm)   // Calls x86_sse41_round_ss; imm is forwarded as the intrinsic's constant third operand.
+{
+	llvm::Function *roundss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ss);
+
+	Value *undef = V(llvm::UndefValue::get(T(Float4::getType())));
+	Value *vector = Nucleus::createInsertElement(undef, val.value, 0);   // val goes in lane 0; the other lanes stay undef.
+
+	return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall3(roundss, ARGS(V(undef), V(vector), V(Nucleus::createConstantInt(imm))))), Float::getType(), 0));
+}
+
+RValue<Float> floorss(RValue<Float> val)   // roundss with immediate 1.
+{
+	return roundss(val, 1);
+}
+
+RValue<Float> ceilss(RValue<Float> val)   // roundss with immediate 2.
+{
+	return roundss(val, 2);
+}
+
+RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)   // Calls x86_sse41_round_ps; imm is forwarded as the intrinsic's constant second operand.
+{
+	llvm::Function *roundps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ps);
+
+	return RValue<Float4>(V(jit->builder->CreateCall2(roundps, ARGS(V(val.value), V(Nucleus::createConstantInt(imm))))));
+}
+
+RValue<Float4> floorps(RValue<Float4> val)   // roundps with immediate 1.
+{
+	return roundps(val, 1);
+}
+
+RValue<Float4> ceilps(RValue<Float4> val)   // roundps with immediate 2.
+{
+	return roundps(val, 2);
+}
+
+RValue<Int4> pabsd(RValue<Int4> x)   // Implemented with the generic lowerPABS() helper rather than an x86 intrinsic.
+{
+	return RValue<Int4>(V(lowerPABS(V(x.value))));
+}
+
+RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)   // LLVM 8+: generic lowerPSADDSAT(); older LLVM: x86_sse2_padds_w intrinsic.
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return As<Short4>(V(lowerPSADDSAT(V(x.value), V(y.value))));
+	#else
+		llvm::Function *paddsw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_padds_w);
+
+		return As<Short4>(V(jit->builder->CreateCall2(paddsw, ARGS(V(x.value), V(y.value)))));
+	#endif
+}
+
+RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)   // LLVM 8+: generic lowerPSSUBSAT(); older LLVM: x86_sse2_psubs_w intrinsic.
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return As<Short4>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
+	#else
+		llvm::Function *psubsw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubs_w);
+
+		return As<Short4>(V(jit->builder->CreateCall2(psubsw, ARGS(V(x.value), V(y.value)))));
+	#endif
+}
+
+RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)   // LLVM 8+: generic lowerPUADDSAT(); older LLVM: x86_sse2_paddus_w intrinsic.
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return As<UShort4>(V(lowerPUADDSAT(V(x.value), V(y.value))));
+	#else
+		llvm::Function *paddusw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_paddus_w);
+
+		return As<UShort4>(V(jit->builder->CreateCall2(paddusw, ARGS(V(x.value), V(y.value)))));
+	#endif
+}
+
+RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)   // LLVM 8+: generic lowerPUSUBSAT(); older LLVM: x86_sse2_psubus_w intrinsic.
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return As<UShort4>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
+	#else
+		llvm::Function *psubusw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubus_w);
+
+		return As<UShort4>(V(jit->builder->CreateCall2(psubusw, ARGS(V(x.value), V(y.value)))));
+	#endif
+}
+
+RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)   // LLVM 8+: generic lowerPSADDSAT(); older LLVM: x86_sse2_padds_b intrinsic.
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return As<SByte8>(V(lowerPSADDSAT(V(x.value), V(y.value))));
+	#else
+		llvm::Function *paddsb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_padds_b);
+
+		return As<SByte8>(V(jit->builder->CreateCall2(paddsb, ARGS(V(x.value), V(y.value)))));
+	#endif
+}
+
+RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)   // LLVM 8+: generic lowerPSSUBSAT(); older LLVM: x86_sse2_psubs_b intrinsic.
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return As<SByte8>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
+	#else
+		llvm::Function *psubsb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubs_b);
+
+		return As<SByte8>(V(jit->builder->CreateCall2(psubsb, ARGS(V(x.value), V(y.value)))));
+	#endif
+}
+
+RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)   // LLVM 8+: generic lowerPUADDSAT(); older LLVM: x86_sse2_paddus_b intrinsic.
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return As<Byte8>(V(lowerPUADDSAT(V(x.value), V(y.value))));
+	#else
+		llvm::Function *paddusb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_paddus_b);
+
+		return As<Byte8>(V(jit->builder->CreateCall2(paddusb, ARGS(V(x.value), V(y.value)))));
+	#endif
+}
+
+RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)   // LLVM 8+: generic lowerPUSUBSAT(); older LLVM: x86_sse2_psubus_b intrinsic.
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return As<Byte8>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
+	#else
+		llvm::Function *psubusb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubus_b);
+
+		return As<Byte8>(V(jit->builder->CreateCall2(psubusb, ARGS(V(x.value), V(y.value)))));
+	#endif
+}
+
+RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)   // Implemented with the generic lowerPAVG() helper.
+{
+	return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
+}
+
+RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)   // Generic lowerPMINMAX() with a signed greater-than predicate.
+{
+	return As<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
+}
+
+RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)   // Generic lowerPMINMAX() with a signed less-than predicate.
+{
+	return As<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
+}
+
+RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)   // Generic lowerPCMP() with signed greater-than, producing a Short4-typed result.
+{
+	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
+}
+
+RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)   // Generic lowerPCMP() with equality, producing a Short4-typed result.
+{
+	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
+}
+
+RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)   // Generic lowerPCMP() with signed greater-than; note signed inputs, Byte8-typed result.
+{
+	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
+}
+
+RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)   // Generic lowerPCMP() with equality, producing a Byte8-typed result.
+{
+	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
+}
+
+RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)   // Calls x86_sse2_packssdw_128 on the Int2 operands; result reinterpreted as Short4.
+{
+	llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packssdw_128);
+
+	return As<Short4>(V(jit->builder->CreateCall2(packssdw, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)   // Calls x86_sse2_packssdw_128(x, y).
+{
+	llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packssdw_128);
+
+	return RValue<Short8>(V(jit->builder->CreateCall2(packssdw, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)   // Calls x86_sse2_packsswb_128(x, y); result reinterpreted as SByte8.
+{
+	llvm::Function *packsswb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packsswb_128);
+
+	return As<SByte8>(V(jit->builder->CreateCall2(packsswb, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)   // Calls x86_sse2_packuswb_128(x, y); result reinterpreted as Byte8.
+{
+	llvm::Function *packuswb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packuswb_128);
+
+	return As<Byte8>(V(jit->builder->CreateCall2(packuswb, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)   // x86_sse41_packusdw when SSE4.1 is available, otherwise emulated with packssdw.
+{
+	if(CPUID::supportsSSE4_1())
+	{
+		llvm::Function *packusdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_packusdw);
+
+		return RValue<UShort8>(V(jit->builder->CreateCall2(packusdw, ARGS(V(x.value), V(y.value)))));
+	}
+	else
+	{
+		RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);   // Zero out negative lanes, then bias by -0x8000.
+		RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
+
+		return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));   // Signed pack, then add the 0x8000 bias back.
+	}
+}
+
+RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)   // Calls x86_sse2_psrli_w with y as a constant operand; result reinterpreted as UShort4.
+{
+	llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_w);
+
+	return As<UShort4>(V(jit->builder->CreateCall2(psrlw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)   // Calls x86_sse2_psrli_w with y as a constant operand.
+{
+	llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_w);
+
+	return RValue<UShort8>(V(jit->builder->CreateCall2(psrlw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Short4> psraw(RValue<Short4> x, unsigned char y)   // Calls x86_sse2_psrai_w with y as a constant operand; result reinterpreted as Short4.
+{
+	llvm::Function *psraw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_w);
+
+	return As<Short4>(V(jit->builder->CreateCall2(psraw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Short8> psraw(RValue<Short8> x, unsigned char y)   // Calls x86_sse2_psrai_w with y as a constant operand.
+{
+	llvm::Function *psraw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_w);
+
+	return RValue<Short8>(V(jit->builder->CreateCall2(psraw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Short4> psllw(RValue<Short4> x, unsigned char y)   // Calls x86_sse2_pslli_w with y as a constant operand; result reinterpreted as Short4.
+{
+	llvm::Function *psllw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_w);
+
+	return As<Short4>(V(jit->builder->CreateCall2(psllw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Short8> psllw(RValue<Short8> x, unsigned char y)   // Calls x86_sse2_pslli_w with y as a constant operand.
+{
+	llvm::Function *psllw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_w);
+
+	return RValue<Short8>(V(jit->builder->CreateCall2(psllw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Int2> pslld(RValue<Int2> x, unsigned char y)   // Calls x86_sse2_pslli_d with y as a constant operand; result reinterpreted as Int2.
+{
+	llvm::Function *pslld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_d);
+
+	return As<Int2>(V(jit->builder->CreateCall2(pslld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Int4> pslld(RValue<Int4> x, unsigned char y)   // Calls x86_sse2_pslli_d with y as a constant operand.
+{
+	llvm::Function *pslld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_d);
+
+	return RValue<Int4>(V(jit->builder->CreateCall2(pslld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Int2> psrad(RValue<Int2> x, unsigned char y)   // Calls x86_sse2_psrai_d with y as a constant operand; result reinterpreted as Int2.
+{
+	llvm::Function *psrad = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_d);
+
+	return As<Int2>(V(jit->builder->CreateCall2(psrad, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Int4> psrad(RValue<Int4> x, unsigned char y)   // Calls x86_sse2_psrai_d with y as a constant operand.
+{
+	llvm::Function *psrad = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_d);
+
+	return RValue<Int4>(V(jit->builder->CreateCall2(psrad, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)   // Calls x86_sse2_psrli_d with y as a constant operand; result reinterpreted as UInt2.
+{
+	llvm::Function *psrld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_d);
+
+	return As<UInt2>(V(jit->builder->CreateCall2(psrld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)   // Calls x86_sse2_psrli_d with y as a constant operand.
+{
+	llvm::Function *psrld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_d);
+
+	return RValue<UInt4>(V(jit->builder->CreateCall2(psrld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)   // Generic lowerPMINMAX() with a signed greater-than predicate.
+{
+	return RValue<Int4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
+}
+
+RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)   // Generic lowerPMINMAX() with a signed less-than predicate.
+{
+	return RValue<Int4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
+}
+
+RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)   // Generic lowerPMINMAX() with an unsigned greater-than predicate.
+{
+	return RValue<UInt4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_UGT)));
+}
+
+RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)   // Generic lowerPMINMAX() with an unsigned less-than predicate.
+{
+	return RValue<UInt4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_ULT)));
+}
+
+RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)   // Calls x86_sse2_pmulh_w(x, y); result reinterpreted as Short4.
+{
+	llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulh_w);
+
+	return As<Short4>(V(jit->builder->CreateCall2(pmulhw, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)   // Calls x86_sse2_pmulhu_w(x, y); result reinterpreted as UShort4.
+{
+	llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulhu_w);
+
+	return As<UShort4>(V(jit->builder->CreateCall2(pmulhuw, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)   // Calls x86_sse2_pmadd_wd(x, y); result reinterpreted as Int2.
+{
+	llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmadd_wd);
+
+	return As<Int2>(V(jit->builder->CreateCall2(pmaddwd, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)   // Calls x86_sse2_pmulh_w(x, y).
+{
+	llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulh_w);
+
+	return RValue<Short8>(V(jit->builder->CreateCall2(pmulhw, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)   // Calls x86_sse2_pmulhu_w(x, y).
+{
+	llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulhu_w);
+
+	return RValue<UShort8>(V(jit->builder->CreateCall2(pmulhuw, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)   // Calls x86_sse2_pmadd_wd(x, y).
+{
+	llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmadd_wd);
+
+	return RValue<Int4>(V(jit->builder->CreateCall2(pmaddwd, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Int> movmskps(RValue<Float4> x)   // Calls x86_sse_movmsk_ps on x.
+{
+	llvm::Function *movmskps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_movmsk_ps);
+
+	return RValue<Int>(V(jit->builder->CreateCall(movmskps, ARGS(V(x.value)))));
+}
+
+RValue<Int> pmovmskb(RValue<Byte8> x)   // Calls x86_sse2_pmovmskb_128 and keeps only the low 8 bits of the mask.
+{
+	llvm::Function *pmovmskb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmovmskb_128);
+
+	return RValue<Int>(V(jit->builder->CreateCall(pmovmskb, ARGS(V(x.value))))) & 0xFF;   // Presumably because only 8 of the 16 byte lanes carry Byte8 data - confirm.
+}
+
+RValue<Int4> pmovzxbd(RValue<Byte16> x)   // Generic lowerPMOV() to Int4 with the sign flag false.
+{
+	return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), false)));
+}
+
+RValue<Int4> pmovsxbd(RValue<SByte16> x)   // Generic lowerPMOV() to Int4 with the sign flag true.
+{
+	return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), true)));
+}
+
+RValue<Int4> pmovzxwd(RValue<UShort8> x)   // Generic lowerPMOV() to Int4 with the sign flag false.
+{
+	return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), false)));
+}
+
+RValue<Int4> pmovsxwd(RValue<Short8> x)   // Generic lowerPMOV() to Int4 with the sign flag true.
+{
+	return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), true)));
+}
+
+}  // namespace x86
 #endif  // defined(__i386__) || defined(__x86_64__)
 
 #ifdef ENABLE_RR_PRINT
-	// extractAll returns a vector containing the extracted n scalar value of
-	// the vector vec.
-	static std::vector<Value*> extractAll(Value* vec, int n)
+// extractAll returns a vector containing the first n scalar elements
+// extracted from the vector vec, in index order.
+static std::vector<Value*> extractAll(Value* vec, int n)
+{
+	std::vector<Value*> elements;
+	elements.reserve(n);
+	for (int i = 0; i < n; i++)
 	{
-		std::vector<Value*> elements;
-		elements.reserve(n);
-		for (int i = 0; i < n; i++)
+		auto el = V(jit->builder->CreateExtractElement(V(vec), i));   // Emits one extractelement for lane i.
+		elements.push_back(el);
+	}
+	return elements;
+}
+
+// toInt returns all the integer values in vals extended to a native width
+// integer: sign-extended when isSigned is true, zero-extended otherwise.
+static std::vector<Value*> toInt(const std::vector<Value*>& vals, bool isSigned)
+{
+	auto intTy = ::llvm::Type::getIntNTy(jit->context, sizeof(int) * 8); // Natural integer width.
+	std::vector<Value*> elements;
+	elements.reserve(vals.size());
+	for (auto v : vals)
+	{
+		if (isSigned)
 		{
-			auto el = V(jit->builder->CreateExtractElement(V(vec), i));
-			elements.push_back(el);
+			elements.push_back(V(jit->builder->CreateSExt(V(v), intTy)));   // Signed source: sign-extend.
 		}
-		return elements;
+		else
+		{
+			elements.push_back(V(jit->builder->CreateZExt(V(v), intTy)));   // Unsigned source: zero-extend.
+		}
+	}
+	return elements;
+}
+
+// toDouble widens every float value in vals to a double, preserving order.
+static std::vector<Value*> toDouble(const std::vector<Value*>& vals)
+{
+	std::vector<Value*> widened;
+	widened.reserve(vals.size());
+	auto f64Ty = ::llvm::Type::getDoubleTy(jit->context);
+	for (Value *scalar : vals)
+	{
+		widened.push_back(V(jit->builder->CreateFPExt(V(scalar), f64Ty)));
+	}
+	return widened;
+}
+
+std::vector<Value*> PrintValue::Ty<Byte>::val(const RValue<Byte>& v) { return toInt({v.value}, false); }   // 1 value, zero-extended.
+std::vector<Value*> PrintValue::Ty<Byte4>::val(const RValue<Byte4>& v) { return toInt(extractAll(v.value, 4), false); }   // 4 lanes, zero-extended.
+std::vector<Value*> PrintValue::Ty<Int>::val(const RValue<Int>& v) { return toInt({v.value}, true); }   // 1 value, sign-extended.
+std::vector<Value*> PrintValue::Ty<Int2>::val(const RValue<Int2>& v) { return toInt(extractAll(v.value, 2), true); }   // 2 lanes, sign-extended.
+std::vector<Value*> PrintValue::Ty<Int4>::val(const RValue<Int4>& v) { return toInt(extractAll(v.value, 4), true); }   // 4 lanes, sign-extended.
+std::vector<Value*> PrintValue::Ty<UInt>::val(const RValue<UInt>& v) { return toInt({v.value}, false); }   // 1 value, zero-extended.
+std::vector<Value*> PrintValue::Ty<UInt2>::val(const RValue<UInt2>& v) { return toInt(extractAll(v.value, 2), false); }   // 2 lanes, zero-extended.
+std::vector<Value*> PrintValue::Ty<UInt4>::val(const RValue<UInt4>& v) { return toInt(extractAll(v.value, 4), false); }   // 4 lanes, zero-extended.
+std::vector<Value*> PrintValue::Ty<Short>::val(const RValue<Short>& v) { return toInt({v.value}, true); }   // 1 value, sign-extended.
+std::vector<Value*> PrintValue::Ty<Short4>::val(const RValue<Short4>& v) { return toInt(extractAll(v.value, 4), true); }   // 4 lanes, sign-extended.
+std::vector<Value*> PrintValue::Ty<UShort>::val(const RValue<UShort>& v) { return toInt({v.value}, false); }   // 1 value, zero-extended.
+std::vector<Value*> PrintValue::Ty<UShort4>::val(const RValue<UShort4>& v) { return toInt(extractAll(v.value, 4), false); }   // 4 lanes, zero-extended.
+std::vector<Value*> PrintValue::Ty<Float>::val(const RValue<Float>& v) { return toDouble({v.value}); }   // 1 value, widened to double.
+std::vector<Value*> PrintValue::Ty<Float4>::val(const RValue<Float4>& v) { return toDouble(extractAll(v.value, 4)); }   // 4 lanes, widened to double.
+std::vector<Value*> PrintValue::Ty<const char*>::val(const char* v) { return {V(jit->builder->CreateGlobalStringPtr(v))}; }   // Host string emitted as a global; yields a pointer to it.
+
+void Printv(const char* function, const char* file, int line, const char* fmt, std::initializer_list<PrintValue> args)
+{
+	// LLVM types used below.
+	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
+	auto intTy = ::llvm::Type::getIntNTy(jit->context, sizeof(int) * 8); // Natural integer width.
+	auto i8PtrTy = ::llvm::Type::getInt8PtrTy(jit->context);
+	auto funcTy = ::llvm::FunctionType::get(i32Ty, {i8PtrTy}, true);
+
+	auto func = jit->module->getOrInsertFunction("printf", funcTy);
+
+	// Build the printf format message string.
+	std::string str;
+	if (file != nullptr) { str += (line > 0) ? "%s:%d " : "%s "; }
+	if (function != nullptr) { str += "%s "; }
+	str += fmt;
+
+	// Perform substitution on all '{n}' bracketed indices in the format
+	// message.
+	int i = 0;
+	for (const PrintValue& arg : args)
+	{
+		str = replace(str, "{" + std::to_string(i++) + "}", arg.format);
 	}
 
-	// toInt returns all the integer values in vals extended to a native width
-	// integer.
-	static std::vector<Value*> toInt(const std::vector<Value*>& vals, bool isSigned)
+	::llvm::SmallVector<::llvm::Value*, 8> vals;
+
+	// The format message is always the first argument.
+	vals.push_back(jit->builder->CreateGlobalStringPtr(str));
+
+	// Add optional file, line and function info if provided.
+	if (file != nullptr)
 	{
-		auto intTy = ::llvm::Type::getIntNTy(jit->context, sizeof(int) * 8); // Natural integer width.
-		std::vector<Value*> elements;
-		elements.reserve(vals.size());
-		for (auto v : vals)
+		vals.push_back(jit->builder->CreateGlobalStringPtr(file));
+		if (line > 0)
 		{
-			if (isSigned)
-			{
-				elements.push_back(V(jit->builder->CreateSExt(V(v), intTy)));
-			}
-			else
-			{
-				elements.push_back(V(jit->builder->CreateZExt(V(v), intTy)));
-			}
+			vals.push_back(::llvm::ConstantInt::get(intTy, line));
 		}
-		return elements;
+	}
+	if (function != nullptr)
+	{
+		vals.push_back(jit->builder->CreateGlobalStringPtr(function));
 	}
 
-	// toDouble returns all the float values in vals extended to doubles.
-	static std::vector<Value*> toDouble(const std::vector<Value*>& vals)
+	// Add all format arguments.
+	for (const PrintValue& arg : args)
 	{
-		auto doubleTy = ::llvm::Type::getDoubleTy(jit->context);
-		std::vector<Value*> elements;
-		elements.reserve(vals.size());
-		for (auto v : vals)
+		for (auto val : arg.values)
 		{
-			elements.push_back(V(jit->builder->CreateFPExt(V(v), doubleTy)));
+			vals.push_back(V(val));
 		}
-		return elements;
 	}
 
-	std::vector<Value*> PrintValue::Ty<Byte>::val(const RValue<Byte>& v) { return toInt({v.value}, false); }
-	std::vector<Value*> PrintValue::Ty<Byte4>::val(const RValue<Byte4>& v) { return toInt(extractAll(v.value, 4), false); }
-	std::vector<Value*> PrintValue::Ty<Int>::val(const RValue<Int>& v) { return toInt({v.value}, true); }
-	std::vector<Value*> PrintValue::Ty<Int2>::val(const RValue<Int2>& v) { return toInt(extractAll(v.value, 2), true); }
-	std::vector<Value*> PrintValue::Ty<Int4>::val(const RValue<Int4>& v) { return toInt(extractAll(v.value, 4), true); }
-	std::vector<Value*> PrintValue::Ty<UInt>::val(const RValue<UInt>& v) { return toInt({v.value}, false); }
-	std::vector<Value*> PrintValue::Ty<UInt2>::val(const RValue<UInt2>& v) { return toInt(extractAll(v.value, 2), false); }
-	std::vector<Value*> PrintValue::Ty<UInt4>::val(const RValue<UInt4>& v) { return toInt(extractAll(v.value, 4), false); }
-	std::vector<Value*> PrintValue::Ty<Short>::val(const RValue<Short>& v) { return toInt({v.value}, true); }
-	std::vector<Value*> PrintValue::Ty<Short4>::val(const RValue<Short4>& v) { return toInt(extractAll(v.value, 4), true); }
-	std::vector<Value*> PrintValue::Ty<UShort>::val(const RValue<UShort>& v) { return toInt({v.value}, false); }
-	std::vector<Value*> PrintValue::Ty<UShort4>::val(const RValue<UShort4>& v) { return toInt(extractAll(v.value, 4), false); }
-	std::vector<Value*> PrintValue::Ty<Float>::val(const RValue<Float>& v) { return toDouble({v.value}); }
-	std::vector<Value*> PrintValue::Ty<Float4>::val(const RValue<Float4>& v) { return toDouble(extractAll(v.value, 4)); }
-	std::vector<Value*> PrintValue::Ty<const char*>::val(const char* v) { return {V(jit->builder->CreateGlobalStringPtr(v))}; }
-
-	void Printv(const char* function, const char* file, int line, const char* fmt, std::initializer_list<PrintValue> args)
-	{
-		// LLVM types used below.
-		auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
-		auto intTy = ::llvm::Type::getIntNTy(jit->context, sizeof(int) * 8); // Natural integer width.
-		auto i8PtrTy = ::llvm::Type::getInt8PtrTy(jit->context);
-		auto funcTy = ::llvm::FunctionType::get(i32Ty, {i8PtrTy}, true);
-
-		auto func = jit->module->getOrInsertFunction("printf", funcTy);
-
-		// Build the printf format message string.
-		std::string str;
-		if (file != nullptr) { str += (line > 0) ? "%s:%d " : "%s "; }
-		if (function != nullptr) { str += "%s "; }
-		str += fmt;
-
-		// Perform subsitution on all '{n}' bracketed indices in the format
-		// message.
-		int i = 0;
-		for (const PrintValue& arg : args)
-		{
-			str = replace(str, "{" + std::to_string(i++) + "}", arg.format);
-		}
-
-		::llvm::SmallVector<::llvm::Value*, 8> vals;
-
-		// The format message is always the first argument.
-		vals.push_back(jit->builder->CreateGlobalStringPtr(str));
-
-		// Add optional file, line and function info if provided.
-		if (file != nullptr)
-		{
-			vals.push_back(jit->builder->CreateGlobalStringPtr(file));
-			if (line > 0)
-			{
-				vals.push_back(::llvm::ConstantInt::get(intTy, line));
-			}
-		}
-		if (function != nullptr)
-		{
-			vals.push_back(jit->builder->CreateGlobalStringPtr(function));
-		}
-
-		// Add all format arguments.
-		for (const PrintValue& arg : args)
-		{
-			for (auto val : arg.values)
-			{
-				vals.push_back(V(val));
-			}
-		}
-
-		jit->builder->CreateCall(func, vals);
-	}
+	jit->builder->CreateCall(func, vals);
+}
 #endif // ENABLE_RR_PRINT
 
-	void Nop()
-	{
-		auto voidTy = ::llvm::Type::getVoidTy(jit->context);
-		auto funcTy = ::llvm::FunctionType::get(voidTy, {}, false);
-		auto func = jit->module->getOrInsertFunction("nop", funcTy);
-		jit->builder->CreateCall(func);
-	}
+void Nop()
+{
+	auto voidTy = ::llvm::Type::getVoidTy(jit->context);
+	auto funcTy = ::llvm::FunctionType::get(voidTy, {}, false);
+	auto func = jit->module->getOrInsertFunction("nop", funcTy);
+	jit->builder->CreateCall(func);
+}
 
-	void EmitDebugLocation()
-	{
+void EmitDebugLocation()
+{
 #ifdef ENABLE_RR_DEBUG_INFO
-		if (jit->debugInfo != nullptr)
-		{
-			jit->debugInfo->EmitLocation();
-		}
-#endif // ENABLE_RR_DEBUG_INFO
-	}
-
-	void EmitDebugVariable(Value* value)
+	if (jit->debugInfo != nullptr)
 	{
-#ifdef ENABLE_RR_DEBUG_INFO
-		if (jit->debugInfo != nullptr)
-		{
-			jit->debugInfo->EmitVariable(value);
-		}
-#endif // ENABLE_RR_DEBUG_INFO
+		jit->debugInfo->EmitLocation();
 	}
+#endif // ENABLE_RR_DEBUG_INFO
+}
 
-	void FlushDebug()
+void EmitDebugVariable(Value* value)
+{
+#ifdef ENABLE_RR_DEBUG_INFO
+	if (jit->debugInfo != nullptr)
 	{
-#ifdef ENABLE_RR_DEBUG_INFO
-		if (jit->debugInfo != nullptr)
-		{
-			jit->debugInfo->Flush();
-		}
-#endif // ENABLE_RR_DEBUG_INFO
+		jit->debugInfo->EmitVariable(value);
 	}
+#endif // ENABLE_RR_DEBUG_INFO
+}
 
-} // namespace rr
+void FlushDebug()
+{
+#ifdef ENABLE_RR_DEBUG_INFO
+	if (jit->debugInfo != nullptr)
+	{
+		jit->debugInfo->Flush();
+	}
+#endif // ENABLE_RR_DEBUG_INFO
+}
+
+}  // namespace rr
 
 // ------------------------------  Coroutines ------------------------------
 
 namespace {
-	// Magic values retuned by llvm.coro.suspend.
-	// See: https://llvm.org/docs/Coroutines.html#llvm-coro-suspend-intrinsic
-	enum SuspendAction
-	{
-		SuspendActionSuspend = -1,
-		SuspendActionResume = 0,
-		SuspendActionDestroy = 1
-	};
 
+// Magic values returned by llvm.coro.suspend.
+// See: https://llvm.org/docs/Coroutines.html#llvm-coro-suspend-intrinsic
+enum SuspendAction
+{
+	SuspendActionSuspend = -1,
+	SuspendActionResume = 0,
+	SuspendActionDestroy = 1
+};
 
 void promoteFunctionToCoroutine()
 {

diff --git a/src/Reactor/LLVMReactor.hpp b/src/Reactor/LLVMReactor.hpp
index 4ff5274..bbf3332 100644
--- a/src/Reactor/LLVMReactor.hpp
+++ b/src/Reactor/LLVMReactor.hpp

@@ -15,38 +15,40 @@
 #ifndef rr_LLVMReactor_hpp
 #define rr_LLVMReactor_hpp
 
-namespace llvm
+namespace llvm {
+
+class Type;
+class Value;
+
+}  // namespace llvm
+
+namespace rr {
+
+class Type;
+class Value;
+
+llvm::Type *T(Type *t);
+
+inline Type *T(llvm::Type *t)
 {
-	class Type;
-	class Value;
+	return reinterpret_cast<Type*>(t);
 }
 
-namespace rr
+inline llvm::Value *V(Value *t)
 {
-	class Type;
-	class Value;
-
-	llvm::Type *T(Type *t);
-
-	inline Type *T(llvm::Type *t)
-	{
-		return reinterpret_cast<Type*>(t);
-	}
-
-	inline llvm::Value *V(Value *t)
-	{
-		return reinterpret_cast<llvm::Value*>(t);
-	}
-
-	inline Value *V(llvm::Value *t)
-	{
-		return reinterpret_cast<Value*>(t);
-	}
-
-	// Emits a no-op instruction that will not be optimized away.
-	// Useful for emitting something that can have a source location without
-	// effect.
-	void Nop();
+	return reinterpret_cast<llvm::Value*>(t);
 }
 
+inline Value *V(llvm::Value *t)
+{
+	return reinterpret_cast<Value*>(t);
+}
+
+// Emits a no-op instruction that will not be optimized away.
+// Useful for emitting something that can have a source location without
+// effect.
+void Nop();
+
+}  // namespace rr
+
 #endif // rr_LLVMReactor_hpp

diff --git a/src/Reactor/LLVMReactorDebugInfo.cpp b/src/Reactor/LLVMReactorDebugInfo.cpp
index 744ae42..ee090cc 100644
--- a/src/Reactor/LLVMReactorDebugInfo.cpp
+++ b/src/Reactor/LLVMReactorDebugInfo.cpp

@@ -40,518 +40,518 @@
 #define LOG(msg, ...)
 #endif
 
-namespace
+namespace {
+
+std::pair<llvm::StringRef, llvm::StringRef> splitPath(const char* path)
 {
-	std::pair<llvm::StringRef, llvm::StringRef> splitPath(const char* path)
-	{
-		return llvm::StringRef(path).rsplit('/');
-	}
+	return llvm::StringRef(path).rsplit('/');
+}
 
-	// Note: createGDBRegistrationListener() returns a pointer to a singleton.
-	// Nothing is actually created.
-	auto jitEventListener = llvm::JITEventListener::createGDBRegistrationListener(); // guarded by jitEventListenerMutex
-	std::mutex jitEventListenerMutex;
+// Note: createGDBRegistrationListener() returns a pointer to a singleton.
+// Nothing is actually created.
+auto jitEventListener = llvm::JITEventListener::createGDBRegistrationListener(); // guarded by jitEventListenerMutex
+std::mutex jitEventListenerMutex;
 
-} // anonymous namespaces
+}  // anonymous namespace
 
-namespace rr
+namespace rr {
+
+DebugInfo::DebugInfo(
+		llvm::IRBuilder<> *builder,
+		llvm::LLVMContext *context,
+		llvm::Module *module,
+		llvm::Function *function)
+	: builder(builder), context(context), module(module), function(function)
 {
-	DebugInfo::DebugInfo(
-			llvm::IRBuilder<> *builder,
-			llvm::LLVMContext *context,
-			llvm::Module *module,
-			llvm::Function *function)
-		: builder(builder), context(context), module(module), function(function)
-	{
-		using namespace ::llvm;
+	using namespace ::llvm;
 
-		auto location = getCallerLocation();
+	auto location = getCallerLocation();
 
-		auto fileAndDir = splitPath(location.function.file.c_str());
-		diBuilder.reset(new llvm::DIBuilder(*module));
-		diCU = diBuilder->createCompileUnit(
-			llvm::dwarf::DW_LANG_C,
-			diBuilder->createFile(fileAndDir.first, fileAndDir.second),
-			"Reactor",
-			0, "", 0);
+	auto fileAndDir = splitPath(location.function.file.c_str());
+	diBuilder.reset(new llvm::DIBuilder(*module));
+	diCU = diBuilder->createCompileUnit(
+		llvm::dwarf::DW_LANG_C,
+		diBuilder->createFile(fileAndDir.first, fileAndDir.second),
+		"Reactor",
+		0, "", 0);
 
-		registerBasicTypes();
+	registerBasicTypes();
 
-		SmallVector<Metadata *, 8> EltTys;
-		auto funcTy = diBuilder->createSubroutineType(diBuilder->getOrCreateTypeArray(EltTys));
+	SmallVector<Metadata *, 8> EltTys;
+	auto funcTy = diBuilder->createSubroutineType(diBuilder->getOrCreateTypeArray(EltTys));
 
-		auto file = getOrCreateFile(location.function.file.c_str());
-		auto sp = diBuilder->createFunction(
-			file,                   // scope
-			"ReactorFunction",      // function name
-			"ReactorFunction",      // linkage
-			file,                   // file
-			location.line,          // line
-			funcTy,                 // type
-			false,                  // internal linkage
-			true,                   // definition
-			location.line,          // scope line
-			DINode::FlagPrototyped, // flags
-			false                   // is optimized
-		);
-		diSubprogram = sp;
-		function->setSubprogram(sp);
-		diRootLocation = DILocation::get(*context, location.line, 0, sp);
-		builder->SetCurrentDebugLocation(diRootLocation);
-	}
+	auto file = getOrCreateFile(location.function.file.c_str());
+	auto sp = diBuilder->createFunction(
+		file,                   // scope
+		"ReactorFunction",      // function name
+		"ReactorFunction",      // linkage
+		file,                   // file
+		location.line,          // line
+		funcTy,                 // type
+		false,                  // internal linkage
+		true,                   // definition
+		location.line,          // scope line
+		DINode::FlagPrototyped, // flags
+		false                   // is optimized
+	);
+	diSubprogram = sp;
+	function->setSubprogram(sp);
+	diRootLocation = DILocation::get(*context, location.line, 0, sp);
+	builder->SetCurrentDebugLocation(diRootLocation);
+}
 
-	DebugInfo::~DebugInfo() = default;
+DebugInfo::~DebugInfo() = default;
 
-	void DebugInfo::Finalize()
-	{
-		while (diScope.size() > 0)
-		{
-			emitPending(diScope.back(), builder);
-			diScope.pop_back();
-		}
-		diBuilder->finalize();
-	}
-
-	void DebugInfo::EmitLocation()
-	{
-		auto const& backtrace = getCallerBacktrace();
-		syncScope(backtrace);
-		builder->SetCurrentDebugLocation(getLocation(backtrace, backtrace.size() - 1));
-
-#ifdef ENABLE_RR_EMIT_PRINT_LOCATION
-		static Location lastLocation;
-		if (backtrace.size() == 0)
-		{
-			return;
-		}
-		Location currLocation = backtrace[backtrace.size() - 1];
-		if (currLocation != lastLocation)
-		{
-			rr::Print("rr> {0} [{1}:{2}]\n", currLocation.function.name.c_str(), currLocation.function.file.c_str(), currLocation.line);
-			lastLocation = std::move(currLocation);
-		}
-#endif // ENABLE_RR_EMIT_PRINT_LOCATION
-	}
-
-	void DebugInfo::Flush()
+void DebugInfo::Finalize()
+{
+	while (diScope.size() > 0)
 	{
 		emitPending(diScope.back(), builder);
+		diScope.pop_back();
+	}
+	diBuilder->finalize();
+}
+
+void DebugInfo::EmitLocation()
+{
+	auto const& backtrace = getCallerBacktrace();
+	syncScope(backtrace);
+	builder->SetCurrentDebugLocation(getLocation(backtrace, backtrace.size() - 1));
+
+#ifdef ENABLE_RR_EMIT_PRINT_LOCATION
+	static Location lastLocation;
+	if (backtrace.size() == 0)
+	{
+		return;
+	}
+	Location currLocation = backtrace[backtrace.size() - 1];
+	if (currLocation != lastLocation)
+	{
+		rr::Print("rr> {0} [{1}:{2}]\n", currLocation.function.name.c_str(), currLocation.function.file.c_str(), currLocation.line);
+		lastLocation = std::move(currLocation);
+	}
+#endif // ENABLE_RR_EMIT_PRINT_LOCATION
+}
+
+void DebugInfo::Flush()
+{
+	emitPending(diScope.back(), builder);
+}
+
+void DebugInfo::syncScope(Backtrace const& backtrace)
+{
+	auto shrink = [this](size_t newsize)
+	{
+		while (diScope.size() > newsize)
+		{
+			auto &scope = diScope.back();
+			LOG("- STACK(%d): di: %p, location: %s:%d",
+				int(diScope.size() - 1), scope.di,
+				scope.location.function.file.c_str(),
+				int(scope.location.line));
+			emitPending(scope, builder);
+			diScope.pop_back();
+		}
+	};
+
+	if (backtrace.size() < diScope.size())
+	{
+		shrink(backtrace.size());
 	}
 
-	void DebugInfo::syncScope(Backtrace const& backtrace)
+	for (size_t i = 0; i < diScope.size(); i++)
 	{
-		auto shrink = [this](size_t newsize)
-		{
-			while (diScope.size() > newsize)
-			{
-				auto &scope = diScope.back();
-				LOG("- STACK(%d): di: %p, location: %s:%d",
-					int(diScope.size() - 1), scope.di,
-					scope.location.function.file.c_str(),
-					int(scope.location.line));
-				emitPending(scope, builder);
-				diScope.pop_back();
-			}
-		};
+		auto &scope = diScope[i];
+		auto const &oldLocation = scope.location;
+		auto const &newLocation = backtrace[i];
 
-		if (backtrace.size() < diScope.size())
+		if (oldLocation.function != newLocation.function)
 		{
-			shrink(backtrace.size());
+			LOG("  STACK(%d): Changed function %s -> %s", int(i),
+				oldLocation.function.name.c_str(), newLocation.function.name.c_str());
+			shrink(i);
+			break;
 		}
 
-		for (size_t i = 0; i < diScope.size(); i++)
+		if (oldLocation.line > newLocation.line)
 		{
-			auto &scope = diScope[i];
-			auto const &oldLocation = scope.location;
-			auto const &newLocation = backtrace[i];
-
-			if (oldLocation.function != newLocation.function)
-			{
-				LOG("  STACK(%d): Changed function %s -> %s", int(i),
-					oldLocation.function.name.c_str(), newLocation.function.name.c_str());
-				shrink(i);
-				break;
-			}
-
-			if (oldLocation.line > newLocation.line)
-			{
-				// Create a new di block to shadow all the variables in the loop.
-				auto file = getOrCreateFile(newLocation.function.file.c_str());
-				auto di = diBuilder->createLexicalBlock(scope.di, file, newLocation.line, 0);
-				LOG("  STACK(%d): Jumped backwards %d -> %d. di: %p -> %p", int(i),
-					oldLocation.line, newLocation.line, scope.di, di);
-				emitPending(scope, builder);
-				scope = {newLocation, di};
-				shrink(i+1);
-				break;
-			}
-
-			scope.location = newLocation;
+			// Create a new di block to shadow all the variables in the loop.
+			auto file = getOrCreateFile(newLocation.function.file.c_str());
+			auto di = diBuilder->createLexicalBlock(scope.di, file, newLocation.line, 0);
+			LOG("  STACK(%d): Jumped backwards %d -> %d. di: %p -> %p", int(i),
+				oldLocation.line, newLocation.line, scope.di, di);
+			emitPending(scope, builder);
+			scope = {newLocation, di};
+			shrink(i+1);
+			break;
 		}
 
-		while (backtrace.size() > diScope.size())
-		{
-			auto i = diScope.size();
-			auto location = backtrace[i];
-			auto file = getOrCreateFile(location.function.file.c_str());
-			auto funcTy = diBuilder->createSubroutineType(diBuilder->getOrCreateTypeArray({}));
-
-			char buf[1024];
-			size_t size = sizeof(buf);
-			int status = 0;
-			llvm::itaniumDemangle(location.function.name.c_str(), buf, &size, &status);
-			auto name = "jit!" + (status == 0 ? std::string(buf) : location.function.name);
-
-			auto func = diBuilder->createFunction(
-				file,                           // scope
-				name,                           // function name
-				"",                             // linkage
-				file,                           // file
-				location.line,                  // line
-				funcTy,                         // type
-				false,                          // internal linkage
-				true,                           // definition
-				location.line,                  // scope line
-				llvm::DINode::FlagPrototyped,   // flags
-				false                           // is optimized
-			);
-			diScope.push_back({location, func});
-			LOG("+ STACK(%d): di: %p, location: %s:%d", int(i), di,
-				location.function.file.c_str(), int(location.line));
-		}
+		scope.location = newLocation;
 	}
 
-	llvm::DILocation* DebugInfo::getLocation(const Backtrace &backtrace, size_t i)
+	while (backtrace.size() > diScope.size())
 	{
-		if (backtrace.size() == 0) { return nullptr; }
-		assert(backtrace.size() == diScope.size());
-		return llvm::DILocation::get(
-			*context,
-			backtrace[i].line,
-			0,
-			diScope[i].di,
-			i > 0 ? getLocation(backtrace, i - 1) : diRootLocation
+		auto i = diScope.size();
+		auto location = backtrace[i];
+		auto file = getOrCreateFile(location.function.file.c_str());
+		auto funcTy = diBuilder->createSubroutineType(diBuilder->getOrCreateTypeArray({}));
+
+		char buf[1024];
+		size_t size = sizeof(buf);
+		int status = 0;
+		llvm::itaniumDemangle(location.function.name.c_str(), buf, &size, &status);
+		auto name = "jit!" + (status == 0 ? std::string(buf) : location.function.name);
+
+		auto func = diBuilder->createFunction(
+			file,                           // scope
+			name,                           // function name
+			"",                             // linkage
+			file,                           // file
+			location.line,                  // line
+			funcTy,                         // type
+			false,                          // internal linkage
+			true,                           // definition
+			location.line,                  // scope line
+			llvm::DINode::FlagPrototyped,   // flags
+			false                           // is optimized
 		);
+		diScope.push_back({location, func});
+		LOG("+ STACK(%d): di: %p, location: %s:%d", int(i), di,
+			location.function.file.c_str(), int(location.line));
 	}
+}
 
-	void DebugInfo::EmitVariable(Value *variable)
+llvm::DILocation* DebugInfo::getLocation(const Backtrace &backtrace, size_t i)
+{
+	if (backtrace.size() == 0) { return nullptr; }
+	assert(backtrace.size() == diScope.size());
+	return llvm::DILocation::get(
+		*context,
+		backtrace[i].line,
+		0,
+		diScope[i].di,
+		i > 0 ? getLocation(backtrace, i - 1) : diRootLocation
+	);
+}
+
+void DebugInfo::EmitVariable(Value *variable)
+{
+	auto const& backtrace = getCallerBacktrace();
+	syncScope(backtrace);
+
+	for (int i = backtrace.size() - 1; i >= 0; i--)
 	{
-		auto const& backtrace = getCallerBacktrace();
-		syncScope(backtrace);
-
-		for (int i = backtrace.size() - 1; i >= 0; i--)
+		auto const &location = backtrace[i];
+		auto tokens = getOrParseFileTokens(location.function.file.c_str());
+		auto tokIt = tokens->find(location.line);
+		if (tokIt == tokens->end())
 		{
-			auto const &location = backtrace[i];
-			auto tokens = getOrParseFileTokens(location.function.file.c_str());
-			auto tokIt = tokens->find(location.line);
-			if (tokIt == tokens->end())
-			{
-				break;
-			}
-			auto token = tokIt->second;
-			auto name = token.identifier;
-			if (token.kind == Token::Return)
-			{
-				// This is a:
-				//
-				//   return <expr>;
-				//
-				// Emit this expression as two variables -
-				// Once as a synthetic 'return_value' variable at this scope.
-				// Again by bubbling the expression value up the callstack as
-				// Return Value Optimizations (RVOs) are likely to carry across
-				// the value to a local without calling a constructor in
-				// statements like:
-				//
-				//   auto val = foo();
-				//
-				name = "return_value";
-			}
-
-			auto &scope = diScope[i];
-			if (scope.pending.location != location)
-			{
-				emitPending(scope, builder);
-			}
-
-			auto value = V(variable);
-			auto block = builder->GetInsertBlock();
-
-			auto insertAfter = block->size() > 0 ? &block->back() : nullptr;
-			while (insertAfter != nullptr && insertAfter->isTerminator())
-			{
-				insertAfter = insertAfter->getPrevNode();
-			}
-
-			scope.pending = Pending{};
-			scope.pending.name = name;
-			scope.pending.location = location;
-			scope.pending.diLocation = getLocation(backtrace, i);
-			scope.pending.value = value;
-			scope.pending.block = block;
-			scope.pending.insertAfter = insertAfter;
-			scope.pending.scope = scope.di;
-
-			if (token.kind == Token::Return)
-			{
-				// Insert a noop instruction so the debugger can inspect the
-				// return value before the function scope closes.
-				scope.pending.addNopOnNextLine = true;
-			}
-			else
-			{
-				break;
-			}
+			break;
 		}
-	}
-
-	void DebugInfo::emitPending(Scope &scope, IRBuilder *builder)
-	{
-		auto const &pending = scope.pending;
-		if (pending.value == nullptr)
+		auto token = tokIt->second;
+		auto name = token.identifier;
+		if (token.kind == Token::Return)
 		{
-			return;
-		}
-
-		if (!scope.symbols.emplace(pending.name).second)
-		{
-			return;
-		}
-
-		bool isAlloca = llvm::isa<llvm::AllocaInst>(pending.value);
-
-		LOG("  EMIT(%s): di: %p, location: %s:%d, isAlloca: %s", pending.name.c_str(), scope.di,
-			pending.location.function.file.c_str(), pending.location.line, isAlloca ? "true" : "false");
-
-		auto value = pending.value;
-
-		IRBuilder::InsertPointGuard guard(*builder);
-		if (pending.insertAfter != nullptr)
-		{
-			builder->SetInsertPoint(pending.block, ++pending.insertAfter->getIterator());
-		}
-		else
-		{
-			builder->SetInsertPoint(pending.block);
-		}
-		builder->SetCurrentDebugLocation(pending.diLocation);
-
-		if (!isAlloca)
-		{
-			// While insertDbgValueIntrinsic should be enough to declare a
-			// variable with no storage, variables of RValues can share the same
-			// llvm::Value, and only one can be named. Take for example:
+			// This is a:
 			//
-			//   Int a = 42;
-			//   RValue<Int> b = a;
-			//   RValue<Int> c = b;
+			//   return <expr>;
 			//
-			// To handle this, always promote named RValues to an alloca.
-
-			llvm::BasicBlock &entryBlock = function->getEntryBlock();
-			auto alloca = new llvm::AllocaInst(value->getType(), 0, pending.name);
-			entryBlock.getInstList().push_front(alloca);
-			builder->CreateStore(value, alloca);
-			value = alloca;
+			// Emit this expression as two variables -
+			// Once as a synthetic 'return_value' variable at this scope.
+			// Again by bubbling the expression value up the callstack as
+			// Return Value Optimizations (RVOs) are likely to carry across
+			// the value to a local without calling a constructor in
+			// statements like:
+			//
+			//   auto val = foo();
+			//
+			name = "return_value";
 		}
 
-		value->setName(pending.name);
-
-		auto diFile = getOrCreateFile(pending.location.function.file.c_str());
-		auto diType = getOrCreateType(value->getType()->getPointerElementType());
-		auto diVar = diBuilder->createAutoVariable(scope.di, pending.name, diFile, pending.location.line, diType);
-
-		auto di = diBuilder->insertDeclare(value, diVar, diBuilder->createExpression(), pending.diLocation, pending.block);
-		if (pending.insertAfter != nullptr) { di->moveAfter(pending.insertAfter); }
-
-		if (pending.addNopOnNextLine)
+		auto &scope = diScope[i];
+		if (scope.pending.location != location)
 		{
-			builder->SetCurrentDebugLocation(llvm::DILocation::get(
-				*context,
-				pending.diLocation->getLine() + 1,
-				0,
-				pending.diLocation->getScope(),
-				pending.diLocation->getInlinedAt()
-			));
-			Nop();
+			emitPending(scope, builder);
+		}
+
+		auto value = V(variable);
+		auto block = builder->GetInsertBlock();
+
+		auto insertAfter = block->size() > 0 ? &block->back() : nullptr;
+		while (insertAfter != nullptr && insertAfter->isTerminator())
+		{
+			insertAfter = insertAfter->getPrevNode();
 		}
 
 		scope.pending = Pending{};
-	}
+		scope.pending.name = name;
+		scope.pending.location = location;
+		scope.pending.diLocation = getLocation(backtrace, i);
+		scope.pending.value = value;
+		scope.pending.block = block;
+		scope.pending.insertAfter = insertAfter;
+		scope.pending.scope = scope.di;
 
-	void DebugInfo::NotifyObjectEmitted(const llvm::object::ObjectFile &Obj, const llvm::LoadedObjectInfo &L)
-	{
-		std::unique_lock<std::mutex> lock(jitEventListenerMutex);
-		jitEventListener->NotifyObjectEmitted(Obj, static_cast<const llvm::RuntimeDyld::LoadedObjectInfo&>(L));
-	}
-
-	void DebugInfo::NotifyFreeingObject(const llvm::object::ObjectFile &Obj)
-	{
-		std::unique_lock<std::mutex> lock(jitEventListenerMutex);
-		jitEventListener->NotifyFreeingObject(Obj);
-	}
-
-	void DebugInfo::registerBasicTypes()
-	{
-		using namespace rr;
-		using namespace llvm;
-
-		auto vec4 = diBuilder->getOrCreateArray(diBuilder->getOrCreateSubrange(0, 4));
-		auto vec8 = diBuilder->getOrCreateArray(diBuilder->getOrCreateSubrange(0, 8));
-		auto vec16 = diBuilder->getOrCreateArray(diBuilder->getOrCreateSubrange(0, 16));
-
-		diTypes.emplace(T(Bool::getType()), diBuilder->createBasicType("Bool", sizeof(bool), dwarf::DW_ATE_boolean));
-		diTypes.emplace(T(Byte::getType()), diBuilder->createBasicType("Byte", 8, dwarf::DW_ATE_unsigned_char));
-		diTypes.emplace(T(SByte::getType()), diBuilder->createBasicType("SByte", 8, dwarf::DW_ATE_signed_char));
-		diTypes.emplace(T(Short::getType()), diBuilder->createBasicType("Short", 16, dwarf::DW_ATE_signed));
-		diTypes.emplace(T(UShort::getType()), diBuilder->createBasicType("UShort", 16, dwarf::DW_ATE_unsigned));
-		diTypes.emplace(T(Int::getType()), diBuilder->createBasicType("Int", 32, dwarf::DW_ATE_signed));
-		diTypes.emplace(T(UInt::getType()), diBuilder->createBasicType("UInt", 32, dwarf::DW_ATE_unsigned));
-		diTypes.emplace(T(Long::getType()), diBuilder->createBasicType("Long", 64, dwarf::DW_ATE_signed));
-		diTypes.emplace(T(Half::getType()), diBuilder->createBasicType("Half", 16, dwarf::DW_ATE_float));
-		diTypes.emplace(T(Float::getType()), diBuilder->createBasicType("Float", 32, dwarf::DW_ATE_float));
-
-		diTypes.emplace(T(Byte4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Byte::getType())], {vec16}));
-		diTypes.emplace(T(SByte4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(SByte::getType())], {vec16}));
-		diTypes.emplace(T(Byte8::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Byte::getType())], {vec16}));
-		diTypes.emplace(T(SByte8::getType()), diBuilder->createVectorType(128, 128, diTypes[T(SByte::getType())], {vec16}));
-		diTypes.emplace(T(Byte16::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Byte::getType())], {vec16}));
-		diTypes.emplace(T(SByte16::getType()), diBuilder->createVectorType(128, 128, diTypes[T(SByte::getType())], {vec16}));
-		diTypes.emplace(T(Short2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Short::getType())], {vec8}));
-		diTypes.emplace(T(UShort2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UShort::getType())], {vec8}));
-		diTypes.emplace(T(Short4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Short::getType())], {vec8}));
-		diTypes.emplace(T(UShort4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UShort::getType())], {vec8}));
-		diTypes.emplace(T(Short8::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Short::getType())], {vec8}));
-		diTypes.emplace(T(UShort8::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UShort::getType())], {vec8}));
-		diTypes.emplace(T(Int2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Int::getType())], {vec4}));
-		diTypes.emplace(T(UInt2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UInt::getType())], {vec4}));
-		diTypes.emplace(T(Int4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Int::getType())], {vec4}));
-		diTypes.emplace(T(UInt4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UInt::getType())], {vec4}));
-		diTypes.emplace(T(Float2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Float::getType())], {vec4}));
-		diTypes.emplace(T(Float4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Float::getType())], {vec4}));
-	}
-
-	DebugInfo::Location DebugInfo::getCallerLocation() const
-	{
-		return getCallerBacktrace(1)[0];
-	}
-
-	DebugInfo::Backtrace DebugInfo::getCallerBacktrace(size_t limit /* = 0 */) const
-	{
-		auto shouldSkipFile = [](llvm::StringRef fileSR) {
-				return fileSR.empty() ||
-					fileSR.endswith_lower("ReactorDebugInfo.cpp") ||
-					fileSR.endswith_lower("Reactor.cpp") ||
-					fileSR.endswith_lower("Reactor.hpp") ||
-					fileSR.endswith_lower("stacktrace.hpp");
-		};
-
-		std::vector<DebugInfo::Location> locations;
-
-		// Note that bs::stacktrace() effectively returns a vector of addresses; bs::frame construction is where
-		// the heavy lifting is done: resolving the function name, file and line number.
-		namespace bs = boost::stacktrace;
-		for (bs::frame frame : bs::stacktrace())
+		if (token.kind == Token::Return)
 		{
-			if (shouldSkipFile(frame.source_file()))
-			{
-				continue;
-			}
+			// Insert a noop instruction so the debugger can inspect the
+			// return value before the function scope closes.
+			scope.pending.addNopOnNextLine = true;
+		}
+		else
+		{
+			break;
+		}
+	}
+}
 
-			DebugInfo::Location location;
-			location.function.file = frame.source_file();
-			location.function.name = frame.name();
-			location.line = frame.source_line();
-			locations.push_back(location);
+void DebugInfo::emitPending(Scope &scope, IRBuilder *builder)  // Emits the debug variable declaration queued in scope.pending, if any.
+{
+	auto const &pending = scope.pending;
+	if (pending.value == nullptr)  // Nothing is queued for this scope.
+	{
+		return;
+	}
 
-			if (limit > 0 && locations.size() >= limit)
-			{
-				break;
-			}
+	if (!scope.symbols.emplace(pending.name).second)  // Name already recorded for this scope: do not declare it again.
+	{
+		return;  // NOTE(review): scope.pending is left set on this path - confirm that is intended.
+	}
+
+	bool isAlloca = llvm::isa<llvm::AllocaInst>(pending.value);
+
+	LOG("  EMIT(%s): di: %p, location: %s:%d, isAlloca: %s", pending.name.c_str(), scope.di,
+		pending.location.function.file.c_str(), pending.location.line, isAlloca ? "true" : "false");
+
+	auto value = pending.value;
+
+	IRBuilder::InsertPointGuard guard(*builder);  // RAII guard: LLVM restores the builder's previous insert point when it goes out of scope.
+	if (pending.insertAfter != nullptr)
+	{
+		builder->SetInsertPoint(pending.block, ++pending.insertAfter->getIterator());  // Position just past the anchor instruction.
+	}
+	else
+	{
+		builder->SetInsertPoint(pending.block);  // No anchor instruction: only the block is given.
+	}
+	builder->SetCurrentDebugLocation(pending.diLocation);
+
+	if (!isAlloca)
+	{
+		// While insertDbgValueIntrinsic should be enough to declare a
+		// variable with no storage, variables of RValues can share the same
+		// llvm::Value, and only one can be named. Take for example:
+		//
+		//   Int a = 42;
+		//   RValue<Int> b = a;
+		//   RValue<Int> c = b;
+		//
+		// To handle this, always promote named RValues to an alloca.
+
+		llvm::BasicBlock &entryBlock = function->getEntryBlock();
+		auto alloca = new llvm::AllocaInst(value->getType(), 0, pending.name);
+		entryBlock.getInstList().push_front(alloca);  // The new alloca becomes the first instruction of the entry block.
+		builder->CreateStore(value, alloca);  // The store is created at the insert point chosen above.
+		value = alloca;
+	}
+
+	value->setName(pending.name);
+
+	auto diFile = getOrCreateFile(pending.location.function.file.c_str());
+	auto diType = getOrCreateType(value->getType()->getPointerElementType());  // value is an alloca here (original or promoted), so describe its pointee type.
+	auto diVar = diBuilder->createAutoVariable(scope.di, pending.name, diFile, pending.location.line, diType);
+
+	auto di = diBuilder->insertDeclare(value, diVar, diBuilder->createExpression(), pending.diLocation, pending.block);
+	if (pending.insertAfter != nullptr) { di->moveAfter(pending.insertAfter); }  // Move the declare directly behind the anchor instruction.
+
+	if (pending.addNopOnNextLine)
+	{
+		builder->SetCurrentDebugLocation(llvm::DILocation::get(  // Same scope and inlinedAt as the pending location, one line further down, column 0.
+			*context,
+			pending.diLocation->getLine() + 1,
+			0,
+			pending.diLocation->getScope(),
+			pending.diLocation->getInlinedAt()
+		));
+		Nop();
+	}
+
+	scope.pending = Pending{};  // Reset the slot: the next call returns early on value == nullptr.
+}
+
+void DebugInfo::NotifyObjectEmitted(const llvm::object::ObjectFile &Obj, const llvm::LoadedObjectInfo &L)
+{
+	std::unique_lock<std::mutex> lock(jitEventListenerMutex);  // Held until return, so the listener call below runs under the mutex.
+	jitEventListener->NotifyObjectEmitted(Obj, static_cast<const llvm::RuntimeDyld::LoadedObjectInfo&>(L));  // L is forwarded as the RuntimeDyld-specific LoadedObjectInfo type.
+}
+
+void DebugInfo::NotifyFreeingObject(const llvm::object::ObjectFile &Obj)
+{
+	std::unique_lock<std::mutex> lock(jitEventListenerMutex);  // Held until return, so the listener call below runs under the mutex.
+	jitEventListener->NotifyFreeingObject(Obj);  // Forwards the notification to the listener unchanged.
+}
+
+void DebugInfo::registerBasicTypes()  // Fills diTypes with debug type descriptions for Reactor's scalar and vector types.
+{
+	using namespace rr;
+	using namespace llvm;
+
+	auto vec4 = diBuilder->getOrCreateArray(diBuilder->getOrCreateSubrange(0, 4));  // Subscript arrays used by the vector types below; presumably (lower bound, count).
+	auto vec8 = diBuilder->getOrCreateArray(diBuilder->getOrCreateSubrange(0, 8));
+	auto vec16 = diBuilder->getOrCreateArray(diBuilder->getOrCreateSubrange(0, 16));
+
+	diTypes.emplace(T(Bool::getType()), diBuilder->createBasicType("Bool", sizeof(bool), dwarf::DW_ATE_boolean));  // NOTE(review): sizeof(bool) is a byte count, while the sibling entries pass bit counts - confirm.
+	diTypes.emplace(T(Byte::getType()), diBuilder->createBasicType("Byte", 8, dwarf::DW_ATE_unsigned_char));
+	diTypes.emplace(T(SByte::getType()), diBuilder->createBasicType("SByte", 8, dwarf::DW_ATE_signed_char));
+	diTypes.emplace(T(Short::getType()), diBuilder->createBasicType("Short", 16, dwarf::DW_ATE_signed));
+	diTypes.emplace(T(UShort::getType()), diBuilder->createBasicType("UShort", 16, dwarf::DW_ATE_unsigned));
+	diTypes.emplace(T(Int::getType()), diBuilder->createBasicType("Int", 32, dwarf::DW_ATE_signed));
+	diTypes.emplace(T(UInt::getType()), diBuilder->createBasicType("UInt", 32, dwarf::DW_ATE_unsigned));
+	diTypes.emplace(T(Long::getType()), diBuilder->createBasicType("Long", 64, dwarf::DW_ATE_signed));
+	diTypes.emplace(T(Half::getType()), diBuilder->createBasicType("Half", 16, dwarf::DW_ATE_float));
+	diTypes.emplace(T(Float::getType()), diBuilder->createBasicType("Float", 32, dwarf::DW_ATE_float));
+
+	diTypes.emplace(T(Byte4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Byte::getType())], {vec16}));  // Every vector entry passes the same 128, 128 arguments; byte vectors all use the 16-element subscript.
+	diTypes.emplace(T(SByte4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(SByte::getType())], {vec16}));
+	diTypes.emplace(T(Byte8::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Byte::getType())], {vec16}));
+	diTypes.emplace(T(SByte8::getType()), diBuilder->createVectorType(128, 128, diTypes[T(SByte::getType())], {vec16}));
+	diTypes.emplace(T(Byte16::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Byte::getType())], {vec16}));
+	diTypes.emplace(T(SByte16::getType()), diBuilder->createVectorType(128, 128, diTypes[T(SByte::getType())], {vec16}));
+	diTypes.emplace(T(Short2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Short::getType())], {vec8}));  // 16-bit element vectors all use the 8-element subscript.
+	diTypes.emplace(T(UShort2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UShort::getType())], {vec8}));
+	diTypes.emplace(T(Short4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Short::getType())], {vec8}));
+	diTypes.emplace(T(UShort4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UShort::getType())], {vec8}));
+	diTypes.emplace(T(Short8::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Short::getType())], {vec8}));
+	diTypes.emplace(T(UShort8::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UShort::getType())], {vec8}));
+	diTypes.emplace(T(Int2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Int::getType())], {vec4}));  // 32-bit element vectors all use the 4-element subscript.
+	diTypes.emplace(T(UInt2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UInt::getType())], {vec4}));
+	diTypes.emplace(T(Int4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Int::getType())], {vec4}));
+	diTypes.emplace(T(UInt4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UInt::getType())], {vec4}));
+	diTypes.emplace(T(Float2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Float::getType())], {vec4}));
+	diTypes.emplace(T(Float4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Float::getType())], {vec4}));
+}
+
+DebugInfo::Location DebugInfo::getCallerLocation() const
+{
+	return getCallerBacktrace(1)[0];  // Requests a single frame. NOTE(review): [0] assumes at least one frame passed the file filter - confirm.
+}
+
+DebugInfo::Backtrace DebugInfo::getCallerBacktrace(size_t limit /* = 0 */) const
+{
+	auto shouldSkipFile = [](llvm::StringRef fileSR) {  // True for frames with no file name, or from the Reactor / stacktrace sources listed below (case-insensitive suffix match).
+			return fileSR.empty() ||
+				fileSR.endswith_lower("ReactorDebugInfo.cpp") ||
+				fileSR.endswith_lower("Reactor.cpp") ||
+				fileSR.endswith_lower("Reactor.hpp") ||
+				fileSR.endswith_lower("stacktrace.hpp");
+	};
+
+	std::vector<DebugInfo::Location> locations;
+
+	// Note that bs::stacktrace() effectively returns a vector of addresses; bs::frame construction is where
+	// the heavy lifting is done: resolving the function name, file and line number.
+	namespace bs = boost::stacktrace;
+	for (bs::frame frame : bs::stacktrace())
+	{
+		if (shouldSkipFile(frame.source_file()))
+		{
+			continue;
 		}
 
-		std::reverse(locations.begin(), locations.end());
+		DebugInfo::Location location;
+		location.function.file = frame.source_file();
+		location.function.name = frame.name();
+		location.line = frame.source_line();
+		locations.push_back(location);
 
-		return locations;
-	}
-
-	llvm::DIType *DebugInfo::getOrCreateType(llvm::Type* type)
-	{
-		auto it = diTypes.find(type);
-		if (it != diTypes.end()) { return it->second; }
-
-		if(type->isPointerTy())
+		if (limit > 0 && locations.size() >= limit)  // limit == 0 means no cap on the number of frames.
 		{
-			auto dbgTy = diBuilder->createPointerType(
-				getOrCreateType(type->getPointerElementType()),
-				sizeof(void*)*8, alignof(void*)*8);
-			diTypes.emplace(type, dbgTy);
-			return dbgTy;
+			break;
 		}
-		llvm::errs() << "Unimplemented debug type: " << type << "\n";
-		assert(false);
-		return nullptr;
 	}
 
-	llvm::DIFile *DebugInfo::getOrCreateFile(const char* path)
+	std::reverse(locations.begin(), locations.end());  // NOTE(review): frames are presumably collected innermost-first, so this puts the outermost caller first - confirm.
+
+	return locations;
+}
+
+llvm::DIType *DebugInfo::getOrCreateType(llvm::Type* type)
+{
+	auto it = diTypes.find(type);
+	if (it != diTypes.end()) { return it->second; }  // Already known: return the cached debug type.
+
+	if(type->isPointerTy())  // Pointer types are built on demand from their recursively resolved pointee type.
 	{
-		auto it = diFiles.find(path);
-		if (it != diFiles.end()) { return it->second; }
-		auto dirAndName = splitPath(path);
-		auto file = diBuilder->createFile(dirAndName.second, dirAndName.first);
-		diFiles.emplace(path, file);
-		return file;
+		auto dbgTy = diBuilder->createPointerType(
+			getOrCreateType(type->getPointerElementType()),
+			sizeof(void*)*8, alignof(void*)*8);  // Size and alignment of the host's void*, converted from bytes to bits.
+		diTypes.emplace(type, dbgTy);  // Cache it for later lookups.
+		return dbgTy;
+	}
+	llvm::errs() << "Unimplemented debug type: " << type << "\n";  // Neither cached nor a pointer: report it.
+	assert(false);
+	return nullptr;  // Reached only when assertions are compiled out.
+}
+
+llvm::DIFile *DebugInfo::getOrCreateFile(const char* path)  // Returns the DIFile for path, creating and caching it on first use.
+{
+	auto it = diFiles.find(path);
+	if (it != diFiles.end()) { return it->second; }  // Already known: return the cached DIFile.
+	auto dirAndName = splitPath(path);  // Presumably (directory, file name), going by the variable name - confirm against splitPath.
+	auto file = diBuilder->createFile(dirAndName.second, dirAndName.first);
+	diFiles.emplace(path, file);  // Cached under the full path string.
+	return file;
+}
+
+DebugInfo::LineTokens const *DebugInfo::getOrParseFileTokens(const char* path)  // Returns the per-line token table for path, parsing the file on first use.
+{
+	static std::regex reLocalDecl(  // Matches a line that begins with "<type> <identifier>", optionally preceded by "For (".
+		"^" // line start
+		"\\s*" // initial whitespace
+		"(?:For\\s*\\(\\s*)?" // optional 'For ('
+		"((?:\\w+(?:<[^>]+>)?)(?:::\\w+(?:<[^>]+>)?)*)" // type (match group 1)
+		"\\s+" // whitespace between type and name
+		"(\\w+)" // identifier (match group 2)
+		"\\s*" // whitespace after identifier
+		"(\\[.*\\])?"); // optional array suffix (match group 3)
+
+	auto it = fileTokens.find(path);
+	if (it != fileTokens.end())
+	{
+		return it->second.get();  // Already parsed: reuse the cached table.
 	}
 
-	DebugInfo::LineTokens const *DebugInfo::getOrParseFileTokens(const char* path)
+	auto tokens = std::unique_ptr<LineTokens>(new LineTokens());
+
+	std::ifstream file(path);  // If the file cannot be opened, the loop below reads nothing and an empty table is cached.
+	std::string line;
+	int lineCount = 0;
+	while (std::getline(file, line))
 	{
-		static std::regex reLocalDecl(
-			"^" // line start
-			"\\s*" // initial whitespace
-			"(?:For\\s*\\(\\s*)?" // optional 'For ('
-			"((?:\\w+(?:<[^>]+>)?)(?:::\\w+(?:<[^>]+>)?)*)" // type (match group 1)
-			"\\s+" // whitespace between type and name
-			"(\\w+)" // identifier (match group 2)
-			"\\s*" // whitespace after identifier
-			"(\\[.*\\])?"); // optional array suffix (match group 3)
-
-		auto it = fileTokens.find(path);
-		if (it != fileTokens.end())
+		lineCount++;  // 1-based number of the line just read; used as the table key.
+		std::smatch match;
+		if (std::regex_search(line, match, reLocalDecl) && match.size() > 3)
 		{
-			return it->second.get();
-		}
-
-		auto tokens = std::unique_ptr<LineTokens>(new LineTokens());
-
-		std::ifstream file(path);
-		std::string line;
-		int lineCount = 0;
-		while (std::getline(file, line))
-		{
-			lineCount++;
-			std::smatch match;
-			if (std::regex_search(line, match, reLocalDecl) && match.size() > 3)
+			bool isArray = match.str(3) != "";
+			if (!isArray) // Cannot deal with C-arrays of values.
 			{
-				bool isArray = match.str(3) != "";
-				if (!isArray) // Cannot deal with C-arrays of values.
+				if (match.str(1) == "return")  // "return <word>" also fits the pattern, with "return" captured as the type.
 				{
-					if (match.str(1) == "return")
-					{
-						(*tokens)[lineCount] = Token{Token::Return};
-					}
-					else
-					{
-						(*tokens)[lineCount] = Token{Token::Identifier, match.str(2)};
-					}
+					(*tokens)[lineCount] = Token{Token::Return};
+				}
+				else
+				{
+					(*tokens)[lineCount] = Token{Token::Identifier, match.str(2)};  // Record the declared identifier for this line.
 				}
 			}
 		}
-
-		auto out = tokens.get();
-		fileTokens.emplace(path, std::move(tokens));
-		return out;
 	}
 
-} // namespace rr
+	auto out = tokens.get();  // Keep a raw pointer: ownership moves into the fileTokens cache on the next line.
+	fileTokens.emplace(path, std::move(tokens));
+	return out;
+}
+
+}  // namespace rr
 
 #endif // ENABLE_RR_DEBUG_INFO

diff --git a/src/Reactor/LLVMReactorDebugInfo.hpp b/src/Reactor/LLVMReactorDebugInfo.hpp
index db743d7..f97e3d0 100644
--- a/src/Reactor/LLVMReactorDebugInfo.hpp
+++ b/src/Reactor/LLVMReactorDebugInfo.hpp

@@ -25,187 +25,188 @@
 #include <memory>
 
 // Forward declarations
-namespace llvm
+namespace llvm {
+
+class BasicBlock;
+class ConstantFolder;
+class DIBuilder;
+class DICompileUnit;
+class DIFile;
+class DILocation;
+class DIScope;
+class DISubprogram;
+class DIType;
+class Function;
+class Instruction;
+class IRBuilderDefaultInserter;
+class JITEventListener;
+class LLVMContext;
+class LoadedObjectInfo;
+class Module;
+class Type;
+class Value;
+
+namespace object
 {
-	class BasicBlock;
-	class ConstantFolder;
-	class DIBuilder;
-	class DICompileUnit;
-	class DIFile;
-	class DILocation;
-	class DIScope;
-	class DISubprogram;
-	class DIType;
-	class Function;
-	class Instruction;
-	class IRBuilderDefaultInserter;
-	class JITEventListener;
-	class LLVMContext;
-	class LoadedObjectInfo;
-	class Module;
-	class Type;
-	class Value;
+	class ObjectFile;
+}  // namespace object
 
-	namespace object
-	{
-		class ObjectFile;
-	}
+template <typename T, typename Inserter> class IRBuilder;
 
-	template <typename T, typename Inserter> class IRBuilder;
-} // namespace llvm
+}  // namespace llvm
 
-namespace rr
+namespace rr {
+
+class Type;
+class Value;
+
+// DebugInfo generates LLVM DebugInfo IR from the C++ source that calls
+// into Reactor functions. See docs/ReactorDebugInfo.md for more information.
+class DebugInfo
 {
-	class Type;
-	class Value;
+public:
+	using IRBuilder = llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>;
 
-	// DebugInfo generates LLVM DebugInfo IR from the C++ source that calls
-	// into Reactor functions. See docs/ReactorDebugInfo.mk for more information.
-	class DebugInfo
+	DebugInfo(IRBuilder *builder,
+			llvm::LLVMContext *context,
+			llvm::Module *module,
+			llvm::Function *function);
+
+	~DebugInfo();
+
+	// Finalize debug info generation. Must be called before the LLVM module
+	// is built.
+	void Finalize();
+
+	// Updates the current source location.
+	void EmitLocation();
+
+	// Binds the value to its symbol in the source file.
+	// See docs/ReactorDebugInfo.md for more information.
+	void EmitVariable(Value *value);
+
+	// Forcefully flush the binding of the last variable name.
+	// Used for binding the initializer of `For` loops.
+	void Flush();
+
+	// NotifyObjectEmitted informs any attached debuggers of the JIT'd
+	// object.
+	static void NotifyObjectEmitted(const llvm::object::ObjectFile &Obj, const llvm::LoadedObjectInfo &L);
+
+	// NotifyFreeingObject informs any attached debuggers that the JIT'd
+	// object is now invalid.
+	static void NotifyFreeingObject(const llvm::object::ObjectFile &Obj);
+
+private:
+	struct Token
 	{
-	public:
-		using IRBuilder = llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>;
-
-		DebugInfo(IRBuilder *builder,
-				llvm::LLVMContext *context,
-				llvm::Module *module,
-				llvm::Function *function);
-
-		~DebugInfo();
-
-		// Finalize debug info generation. Must be called before the LLVM module
-		// is built.
-		void Finalize();
-
-		// Updates the current source location.
-		void EmitLocation();
-
-		// Binds the value to its symbol in the source file.
-		// See docs/ReactorDebugInfo.mk for more information.
-		void EmitVariable(Value *value);
-
-		// Forcefully flush the binding of the last variable name.
-		// Used for binding the initializer of `For` loops.
-		void Flush();
-
-		// NotifyObjectEmitted informs any attached debuggers of the JIT'd
-		// object.
-		static void NotifyObjectEmitted(const llvm::object::ObjectFile &Obj, const llvm::LoadedObjectInfo &L);
-
-		// NotifyFreeingObject informs any attached debuggers that the JIT'd
-		// object is now invalid.
-		static void NotifyFreeingObject(const llvm::object::ObjectFile &Obj);
-
-	private:
-		struct Token
+		enum Kind
 		{
-			enum Kind
-			{
-				Identifier,
-				Return
-			};
-			Kind kind;
-			std::string identifier;
+			Identifier,
+			Return
 		};
-
-		using LineTokens = std::unordered_map<unsigned int, Token>;
-
-		struct FunctionLocation
-		{
-			std::string name;
-			std::string file;
-
-			bool operator == (const FunctionLocation &rhs) const { return name == rhs.name && file == rhs.file; }
-			bool operator != (const FunctionLocation &rhs) const { return !(*this == rhs); }
-
-			struct Hash
-			{
-				std::size_t operator()(const FunctionLocation &l) const noexcept
-				{
-					return std::hash<std::string>()(l.file) * 31 +
-							std::hash<std::string>()(l.name);
-				}
-			};
-		};
-
-		struct Location
-		{
-			FunctionLocation function;
-			unsigned int line = 0;
-
-			bool operator == (const Location &rhs) const { return function == rhs.function && line == rhs.line; }
-			bool operator != (const Location &rhs) const { return !(*this == rhs); }
-
-			struct Hash
-			{
-				std::size_t operator()(const Location &l) const noexcept
-				{
-					return FunctionLocation::Hash()(l.function) * 31 +
-							std::hash<unsigned int>()(l.line);
-				}
-			};
-		};
-
-		using Backtrace = std::vector<Location>;
-
-		struct Pending
-		{
-			std::string name;
-			Location location;
-			llvm::DILocation *diLocation = nullptr;
-			llvm::Value *value = nullptr;
-			llvm::Instruction *insertAfter = nullptr;
-			llvm::BasicBlock *block = nullptr;
-			llvm::DIScope *scope = nullptr;
-			bool addNopOnNextLine = false;
-		};
-
-		struct Scope
-		{
-			Location location;
-			llvm::DIScope *di;
-			std::unordered_set<std::string> symbols;
-			Pending pending;
-		};
-
-		void registerBasicTypes();
-
-		void emitPending(Scope &scope, IRBuilder *builder);
-
-		// Returns the source location of the non-Reactor calling function.
-		Location getCallerLocation() const;
-
-		// Returns the backtrace for the callstack, starting at the first
-		// non-Reactor file. If limit is non-zero, then a maximum of limit
-		// frames will be returned.
-		Backtrace getCallerBacktrace(size_t limit = 0) const;
-
-		llvm::DILocation* getLocation(const Backtrace &backtrace, size_t i);
-
-		llvm::DIType *getOrCreateType(llvm::Type* type);
-		llvm::DIFile *getOrCreateFile(const char* path);
-		LineTokens const *getOrParseFileTokens(const char* path);
-
-		// Synchronizes diScope with the current backtrace.
-		void syncScope(Backtrace const& backtrace);
-
-		IRBuilder *builder;
-		llvm::LLVMContext *context;
-		llvm::Module *module;
-		llvm::Function *function;
-
-		std::unique_ptr<llvm::DIBuilder> diBuilder;
-		llvm::DICompileUnit *diCU;
-		llvm::DISubprogram *diSubprogram;
-		llvm::DILocation *diRootLocation;
-		std::vector<Scope> diScope;
-		std::unordered_map<std::string, llvm::DIFile*> diFiles;
-		std::unordered_map<llvm::Type*, llvm::DIType*> diTypes;
-		std::unordered_map<std::string, std::unique_ptr<LineTokens>> fileTokens;
-		std::vector<void const*> pushed;
+		Kind kind;
+		std::string identifier;
 	};
 
-} // namespace rr
+	using LineTokens = std::unordered_map<unsigned int, Token>;
+
+	struct FunctionLocation  // A function identified by its name and the file it is in.
+	{
+		std::string name;
+		std::string file;
+
+		bool operator == (const FunctionLocation &rhs) const { return name == rhs.name && file == rhs.file; }
+		bool operator != (const FunctionLocation &rhs) const { return !(*this == rhs); }
+
+		struct Hash  // Hash functor that combines the hashes of file and name.
+		{
+			std::size_t operator()(const FunctionLocation &l) const noexcept
+			{
+				return std::hash<std::string>()(l.file) * 31 +
+						std::hash<std::string>()(l.name);
+			}
+		};
+	};
+
+	struct Location  // A FunctionLocation plus a line number.
+	{
+		FunctionLocation function;
+		unsigned int line = 0;
+
+		bool operator == (const Location &rhs) const { return function == rhs.function && line == rhs.line; }
+		bool operator != (const Location &rhs) const { return !(*this == rhs); }
+
+		struct Hash  // Hash functor that combines the function hash with the line hash.
+		{
+			std::size_t operator()(const Location &l) const noexcept
+			{
+				return FunctionLocation::Hash()(l.function) * 31 +
+						std::hash<unsigned int>()(l.line);
+			}
+		};
+	};
+
+	using Backtrace = std::vector<Location>;
+
+	struct Pending  // A variable declaration that has been captured but not yet emitted; consumed by emitPending().
+	{
+		std::string name;  // Symbol name given to the emitted variable.
+		Location location;  // Source file and line recorded for the variable.
+		llvm::DILocation *diLocation = nullptr;  // Debug location used when emitting the declaration.
+		llvm::Value *value = nullptr;  // The value to declare; nullptr means nothing is pending.
+		llvm::Instruction *insertAfter = nullptr;  // If set, emission happens directly after this instruction.
+		llvm::BasicBlock *block = nullptr;  // Block the declaration is emitted into.
+		llvm::DIScope *scope = nullptr;
+		bool addNopOnNextLine = false;  // If true, emitPending() also emits a Nop located on the following source line.
+	};
+
+	struct Scope  // One entry of the diScope stack.
+	{
+		Location location;
+		llvm::DIScope *di;  // LLVM debug scope that variables emitted for this entry are created in.
+		std::unordered_set<std::string> symbols;  // Names already declared in this scope; emitPending() skips repeats.
+		Pending pending;  // At most one not-yet-emitted variable declaration.
+	};
+
+	void registerBasicTypes();
+
+	void emitPending(Scope &scope, IRBuilder *builder);
+
+	// Returns the source location of the non-Reactor calling function.
+	Location getCallerLocation() const;
+
+	// Returns the backtrace for the callstack, starting at the first
+	// non-Reactor file. If limit is non-zero, then a maximum of limit
+	// frames will be returned.
+	Backtrace getCallerBacktrace(size_t limit = 0) const;
+
+	llvm::DILocation* getLocation(const Backtrace &backtrace, size_t i);
+
+	llvm::DIType *getOrCreateType(llvm::Type* type);
+	llvm::DIFile *getOrCreateFile(const char* path);
+	LineTokens const *getOrParseFileTokens(const char* path);
+
+	// Synchronizes diScope with the current backtrace.
+	void syncScope(Backtrace const& backtrace);
+
+	IRBuilder *builder;
+	llvm::LLVMContext *context;
+	llvm::Module *module;
+	llvm::Function *function;
+
+	std::unique_ptr<llvm::DIBuilder> diBuilder;
+	llvm::DICompileUnit *diCU;
+	llvm::DISubprogram *diSubprogram;
+	llvm::DILocation *diRootLocation;
+	std::vector<Scope> diScope;
+	std::unordered_map<std::string, llvm::DIFile*> diFiles;
+	std::unordered_map<llvm::Type*, llvm::DIType*> diTypes;
+	std::unordered_map<std::string, std::unique_ptr<LineTokens>> fileTokens;
+	std::vector<void const*> pushed;
+};
+
+}  // namespace rr
 
 #endif // ENABLE_RR_DEBUG_INFO
 

diff --git a/src/Reactor/MutexLock.hpp b/src/Reactor/MutexLock.hpp
index 759e5d5..000819a 100644
--- a/src/Reactor/MutexLock.hpp
+++ b/src/Reactor/MutexLock.hpp

@@ -22,155 +22,157 @@
 // at the same time it's best to just have the scheduler overhead.
 #include <pthread.h>
 
-namespace rr
+namespace rr {
+
+class MutexLock  // Wrapper around a pthread mutex; this is the variant on the __linux__ side of the #else below.
 {
-	class MutexLock
+public:
+	MutexLock()
 	{
-	public:
-		MutexLock()
-		{
-			pthread_mutex_init(&mutex, NULL);
-		}
+		pthread_mutex_init(&mutex, NULL);  // NULL attributes select the default mutex; the return code is not checked.
+	}
 
-		~MutexLock()
-		{
-			pthread_mutex_destroy(&mutex);
-		}
+	~MutexLock()
+	{
+		pthread_mutex_destroy(&mutex);  // Releases the mutex created by the constructor.
+	}
 
-		bool attemptLock()
-		{
-			return pthread_mutex_trylock(&mutex) == 0;
-		}
+	bool attemptLock()
+	{
+		return pthread_mutex_trylock(&mutex) == 0;  // Does not block: true only if the lock was acquired by this call.
+	}
 
-		void lock()
-		{
-			pthread_mutex_lock(&mutex);
-		}
+	void lock()
+	{
+		pthread_mutex_lock(&mutex);  // Blocks until the mutex is acquired; the return code is not checked.
+	}
 
-		void unlock()
-		{
-			pthread_mutex_unlock(&mutex);
-		}
+	void unlock()
+	{
+		pthread_mutex_unlock(&mutex);  // The return code is not checked.
+	}
 
-	private:
-		pthread_mutex_t mutex;
-	};
-}
+private:
+	pthread_mutex_t mutex;  // The underlying pthread mutex.
+};
+
+}  // namespace rr
 
 #else   // !__linux__
 
 #include <atomic>
 
-namespace rr
+namespace rr {
+
+class BackoffLock  // Spin lock on a std::atomic<bool> with exponential backoff; this is the variant on the !__linux__ side of the #else above.
 {
-	class BackoffLock
+public:
+	BackoffLock()
 	{
-	public:
-		BackoffLock()
-		{
-			mutex = 0;
-		}
+		mutex = 0;  // Start unlocked.
+	}
 
-		bool attemptLock()
+	bool attemptLock()  // Single non-blocking attempt; returns true if this caller took the lock.
+	{
+		if(!isLocked())  // Plain load first; the exchange is only attempted when the lock looks free.
 		{
-			if(!isLocked())
+			if(mutex.exchange(true) == false)  // exchange returns the previous value: false means this caller set the flag.
 			{
-				if(mutex.exchange(true) == false)
-				{
-					return true;
-				}
+				return true;
 			}
-
-			return false;
 		}
 
-		void lock()
-		{
-			int backoff = 1;
+		return false;
+	}
 
-			while(!attemptLock())
+	void lock()  // Retries attemptLock() until it succeeds, spinning or yielding in between.
+	{
+		int backoff = 1;  // Number of spin rounds before the next attempt.
+
+		while(!attemptLock())
+		{
+			if(backoff <= 64)  // Small backoff: spin, then double it.
 			{
-				if(backoff <= 64)
+				for(int i = 0; i < backoff; i++)  // Each round issues 35 nop() calls.
 				{
-					for(int i = 0; i < backoff; i++)
-					{
-						nop();
-						nop();
-						nop();
-						nop();
-						nop();
+					nop();
+					nop();
+					nop();
+					nop();
+					nop();
 
-						nop();
-						nop();
-						nop();
-						nop();
-						nop();
+					nop();
+					nop();
+					nop();
+					nop();
+					nop();
 
-						nop();
-						nop();
-						nop();
-						nop();
-						nop();
+					nop();
+					nop();
+					nop();
+					nop();
+					nop();
 
-						nop();
-						nop();
-						nop();
-						nop();
-						nop();
+					nop();
+					nop();
+					nop();
+					nop();
+					nop();
 
-						nop();
-						nop();
-						nop();
-						nop();
-						nop();
+					nop();
+					nop();
+					nop();
+					nop();
+					nop();
 
-						nop();
-						nop();
-						nop();
-						nop();
-						nop();
+					nop();
+					nop();
+					nop();
+					nop();
+					nop();
 
-						nop();
-						nop();
-						nop();
-						nop();
-						nop();
-					}
-
-					backoff *= 2;
+					nop();
+					nop();
+					nop();
+					nop();
+					nop();
 				}
-				else
-				{
-					Thread::yield();
 
-					backoff = 1;
-				}
-			};
-		}
+				backoff *= 2;
+			}
+			else
+			{
+				Thread::yield();  // Backoff has grown past 64: yield instead of spinning.
 
-		void unlock()
-		{
-			mutex.store(false, std::memory_order_release);
-		}
-
-		bool isLocked()
-		{
-			return mutex.load(std::memory_order_acquire);
-		}
-
-	private:
-		struct
-		{
-			// Ensure that the mutex variable is on its own 64-byte cache line to avoid false sharing
-			// Padding must be public to avoid compiler warnings
-			volatile int padding1[16];
-			std::atomic<bool> mutex;
-			volatile int padding2[15];
+				backoff = 1;  // Start the backoff sequence over.
+			}
 		};
-	};
+	}
 
-	using MutexLock = BackoffLock;
-}
+	void unlock()
+	{
+		mutex.store(false, std::memory_order_release);  // Release store; isLocked() reads the flag with an acquire load.
+	}
+
+	bool isLocked()
+	{
+		return mutex.load(std::memory_order_acquire);  // Acquire load of the flag; true while the lock is held.
+	}
+
+private:
+	struct
+	{
+		// Ensure that the mutex variable is on its own 64-byte cache line to avoid false sharing
+		// Padding must be public to avoid compiler warnings
+		volatile int padding1[16];
+		std::atomic<bool> mutex;
+		volatile int padding2[15];
+	};
+};
+
+using MutexLock = BackoffLock;
+
+}  // namespace rr
 
 #endif   // !__linux__
 

diff --git a/src/Reactor/Nucleus.hpp b/src/Reactor/Nucleus.hpp
index 6414780..67e990f 100644
--- a/src/Reactor/Nucleus.hpp
+++ b/src/Reactor/Nucleus.hpp

@@ -29,268 +29,269 @@
 static_assert(sizeof(short) == 2, "Reactor's 'Short' type is 16-bit, and requires the C++ 'short' to match that.");
 static_assert(sizeof(int) == 4, "Reactor's 'Int' type is 32-bit, and requires the C++ 'int' to match that.");
 
-namespace rr
+namespace rr {
+
+class Type;
+class Value;
+class SwitchCases;
+class BasicBlock;
+class Routine;
+
+// Optimization holds the optimization settings for code generation.
+class Optimization
 {
-	class Type;
-	class Value;
-	class SwitchCases;
-	class BasicBlock;
-	class Routine;
-
-	// Optimization holds the optimization settings for code generation.
-	class Optimization
+public:
+	enum class Level
 	{
-	public:
-		enum class Level
+		None,
+		Less,
+		Default,
+		Aggressive,
+	};
+
+	enum class Pass
+	{
+		Disabled,
+		InstructionCombining,
+		CFGSimplification,
+		LICM,
+		AggressiveDCE,
+		GVN,
+		Reassociate,
+		DeadStoreElimination,
+		SCCP,
+		ScalarReplAggregates,
+		EarlyCSEPass,
+
+		Count,
+	};
+
+	using Passes = std::vector<Pass>;
+
+	Optimization(Level level = Level::Default, const Passes& passes = {})
+		: level(level), passes(passes)
+	{
+		#if defined(REACTOR_DEFAULT_OPT_LEVEL)
 		{
-			None,
-			Less,
-			Default,
-			Aggressive,
-		};
-
-		enum class Pass
-		{
-			Disabled,
-			InstructionCombining,
-			CFGSimplification,
-			LICM,
-			AggressiveDCE,
-			GVN,
-			Reassociate,
-			DeadStoreElimination,
-			SCCP,
-			ScalarReplAggregates,
-			EarlyCSEPass,
-
-			Count,
-		};
-
-		using Passes = std::vector<Pass>;
-
-		Optimization(Level level = Level::Default, const Passes& passes = {})
-			: level(level), passes(passes)
-		{
-			#if defined(REACTOR_DEFAULT_OPT_LEVEL)
-			{
-				this->level = Level::REACTOR_DEFAULT_OPT_LEVEL;
-			}
-			#endif
+			this->level = Level::REACTOR_DEFAULT_OPT_LEVEL;
 		}
+		#endif
+	}
 
-		Level getLevel() const { return level; }
-		const Passes & getPasses() const { return passes; }
+	Level getLevel() const { return level; }
+	const Passes & getPasses() const { return passes; }
 
-	private:
-		Level level = Level::Default;
-		Passes passes;
-	};
+private:
+	Level level = Level::Default;
+	Passes passes;
+};
 
-	// Config holds the Reactor configuration settings.
-	class Config
+// Config holds the Reactor configuration settings.
+class Config
+{
+public:
+	// Edit holds a number of modifications to a config, that can be applied
+	// on an existing Config to produce a new Config with the specified
+	// changes.
+	class Edit
 	{
 	public:
-		// Edit holds a number of modifications to a config, that can be applied
-		// on an existing Config to produce a new Config with the specified
-		// changes.
-		class Edit
-		{
-		public:
-			static const Edit None;
+		static const Edit None;
 
-			Edit & set(Optimization::Level level) { optLevel = level; optLevelChanged = true; return *this; }
-			Edit & add(Optimization::Pass pass) { optPassEdits.push_back({ListEdit::Add, pass}); return *this; }
-			Edit & remove(Optimization::Pass pass) { optPassEdits.push_back({ListEdit::Remove, pass}); return *this; }
-			Edit & clearOptimizationPasses() { optPassEdits.push_back({ListEdit::Clear, Optimization::Pass::Disabled}); return *this; }
+		Edit & set(Optimization::Level level) { optLevel = level; optLevelChanged = true; return *this; }
+		Edit & add(Optimization::Pass pass) { optPassEdits.push_back({ListEdit::Add, pass}); return *this; }
+		Edit & remove(Optimization::Pass pass) { optPassEdits.push_back({ListEdit::Remove, pass}); return *this; }
+		Edit & clearOptimizationPasses() { optPassEdits.push_back({ListEdit::Clear, Optimization::Pass::Disabled}); return *this; }
 
-			Config apply(const Config &cfg) const;
-
-		private:
-			enum class ListEdit { Add, Remove, Clear };
-			using OptPassesEdit = std::pair<ListEdit, Optimization::Pass>;
-
-			template <typename T>
-			void apply(const std::vector<std::pair<ListEdit, T>> & edits, std::vector<T>& list) const;
-
-			Optimization::Level optLevel;
-			bool optLevelChanged = false;
-			std::vector<OptPassesEdit> optPassEdits;
-		};
-
-		Config() = default;
-		Config(const Optimization & optimization) : optimization(optimization) {}
-
-		const Optimization & getOptimization() const { return optimization; }
+		Config apply(const Config &cfg) const;
 
 	private:
-		Optimization optimization;
+		enum class ListEdit { Add, Remove, Clear };
+		using OptPassesEdit = std::pair<ListEdit, Optimization::Pass>;
+
+		template <typename T>
+		void apply(const std::vector<std::pair<ListEdit, T>> & edits, std::vector<T>& list) const;
+
+		Optimization::Level optLevel;
+		bool optLevelChanged = false;
+		std::vector<OptPassesEdit> optPassEdits;
 	};
 
-	class Nucleus
+	Config() = default;
+	Config(const Optimization & optimization) : optimization(optimization) {}
+
+	const Optimization & getOptimization() const { return optimization; }
+
+private:
+	Optimization optimization;
+};
+
+class Nucleus
+{
+public:
+	Nucleus();
+
+	virtual ~Nucleus();
+
+	// Default configuration to use when no other configuration is specified.
+	// The new configuration will be applied to subsequent reactor calls.
+	static void setDefaultConfig(const Config &cfg);
+	static void adjustDefaultConfig(const Config::Edit &cfgEdit);
+	static Config getDefaultConfig();
+
+	std::shared_ptr<Routine> acquireRoutine(const char *name, const Config::Edit &cfgEdit = Config::Edit::None);
+
+	static Value *allocateStackVariable(Type *type, int arraySize = 0);
+	static BasicBlock *createBasicBlock();
+	static BasicBlock *getInsertBlock();
+	static void setInsertBlock(BasicBlock *basicBlock);
+
+	static void createFunction(Type *ReturnType, std::vector<Type*> &Params);
+	static Value *getArgument(unsigned int index);
+
+	// Coroutines
+	using CoroutineHandle = void*;
+
+	template <typename... ARGS>
+	using CoroutineBegin = CoroutineHandle(ARGS...);
+	using CoroutineAwait = bool(CoroutineHandle, void* yieldValue);
+	using CoroutineDestroy = void(CoroutineHandle);
+
+	enum CoroutineEntries
 	{
-	public:
-		Nucleus();
-
-		virtual ~Nucleus();
-
-		// Default configuration to use when no other configuration is specified.
-		// The new configuration will be applied to subsequent reactor calls.
-		static void setDefaultConfig(const Config &cfg);
-		static void adjustDefaultConfig(const Config::Edit &cfgEdit);
-		static Config getDefaultConfig();
-
-		std::shared_ptr<Routine> acquireRoutine(const char *name, const Config::Edit &cfgEdit = Config::Edit::None);
-
-		static Value *allocateStackVariable(Type *type, int arraySize = 0);
-		static BasicBlock *createBasicBlock();
-		static BasicBlock *getInsertBlock();
-		static void setInsertBlock(BasicBlock *basicBlock);
-
-		static void createFunction(Type *ReturnType, std::vector<Type*> &Params);
-		static Value *getArgument(unsigned int index);
-
-		// Coroutines
-		using CoroutineHandle = void*;
-
-		template <typename... ARGS>
-		using CoroutineBegin = CoroutineHandle(ARGS...);
-		using CoroutineAwait = bool(CoroutineHandle, void* yieldValue);
-		using CoroutineDestroy = void(CoroutineHandle);
-
-		enum CoroutineEntries
-		{
-			CoroutineEntryBegin = 0,
-			CoroutineEntryAwait,
-			CoroutineEntryDestroy,
-			CoroutineEntryCount
-		};
-
-		static void createCoroutine(Type *ReturnType, std::vector<Type*> &Params);
-		std::shared_ptr<Routine> acquireCoroutine(const char *name, const Config::Edit &cfg = Config::Edit::None);
-		static void yield(Value*);
-
-		// Terminators
-		static void createRetVoid();
-		static void createRet(Value *V);
-		static void createBr(BasicBlock *dest);
-		static void createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse);
-
-		// Binary operators
-		static Value *createAdd(Value *lhs, Value *rhs);
-		static Value *createSub(Value *lhs, Value *rhs);
-		static Value *createMul(Value *lhs, Value *rhs);
-		static Value *createUDiv(Value *lhs, Value *rhs);
-		static Value *createSDiv(Value *lhs, Value *rhs);
-		static Value *createFAdd(Value *lhs, Value *rhs);
-		static Value *createFSub(Value *lhs, Value *rhs);
-		static Value *createFMul(Value *lhs, Value *rhs);
-		static Value *createFDiv(Value *lhs, Value *rhs);
-		static Value *createURem(Value *lhs, Value *rhs);
-		static Value *createSRem(Value *lhs, Value *rhs);
-		static Value *createFRem(Value *lhs, Value *rhs);
-		static Value *createShl(Value *lhs, Value *rhs);
-		static Value *createLShr(Value *lhs, Value *rhs);
-		static Value *createAShr(Value *lhs, Value *rhs);
-		static Value *createAnd(Value *lhs, Value *rhs);
-		static Value *createOr(Value *lhs, Value *rhs);
-		static Value *createXor(Value *lhs, Value *rhs);
-
-		// Unary operators
-		static Value *createNeg(Value *V);
-		static Value *createFNeg(Value *V);
-		static Value *createNot(Value *V);
-
-		// Memory instructions
-		static Value *createLoad(Value *ptr, Type *type, bool isVolatile = false, unsigned int alignment = 0, bool atomic = false , std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createStore(Value *value, Value *ptr, Type *type, bool isVolatile = false, unsigned int aligment = 0, bool atomic = false, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex);
-
-		// Masked Load / Store instructions
-		static Value *createMaskedLoad(Value *base, Type *elementType, Value *mask, unsigned int alignment, bool zeroMaskedLanes);
-		static void createMaskedStore(Value *base, Value *value, Value *mask, unsigned int alignment);
-
-		// Barrier instructions
-		static void createFence(std::memory_order memoryOrder);
-
-		// Atomic instructions
-		static Value *createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal);
-
-		// Cast/Conversion Operators
-		static Value *createTrunc(Value *V, Type *destType);
-		static Value *createZExt(Value *V, Type *destType);
-		static Value *createSExt(Value *V, Type *destType);
-		static Value *createFPToUI(Value *V, Type *destType);
-		static Value *createFPToSI(Value *V, Type *destType);
-		static Value *createSIToFP(Value *V, Type *destType);
-		static Value *createFPTrunc(Value *V, Type *destType);
-		static Value *createFPExt(Value *V, Type *destType);
-		static Value *createBitCast(Value *V, Type *destType);
-
-		// Compare instructions
-		static Value *createPtrEQ(Value *lhs, Value *rhs);
-		static Value *createICmpEQ(Value *lhs, Value *rhs);
-		static Value *createICmpNE(Value *lhs, Value *rhs);
-		static Value *createICmpUGT(Value *lhs, Value *rhs);
-		static Value *createICmpUGE(Value *lhs, Value *rhs);
-		static Value *createICmpULT(Value *lhs, Value *rhs);
-		static Value *createICmpULE(Value *lhs, Value *rhs);
-		static Value *createICmpSGT(Value *lhs, Value *rhs);
-		static Value *createICmpSGE(Value *lhs, Value *rhs);
-		static Value *createICmpSLT(Value *lhs, Value *rhs);
-		static Value *createICmpSLE(Value *lhs, Value *rhs);
-		static Value *createFCmpOEQ(Value *lhs, Value *rhs);
-		static Value *createFCmpOGT(Value *lhs, Value *rhs);
-		static Value *createFCmpOGE(Value *lhs, Value *rhs);
-		static Value *createFCmpOLT(Value *lhs, Value *rhs);
-		static Value *createFCmpOLE(Value *lhs, Value *rhs);
-		static Value *createFCmpONE(Value *lhs, Value *rhs);
-		static Value *createFCmpORD(Value *lhs, Value *rhs);
-		static Value *createFCmpUNO(Value *lhs, Value *rhs);
-		static Value *createFCmpUEQ(Value *lhs, Value *rhs);
-		static Value *createFCmpUGT(Value *lhs, Value *rhs);
-		static Value *createFCmpUGE(Value *lhs, Value *rhs);
-		static Value *createFCmpULT(Value *lhs, Value *rhs);
-		static Value *createFCmpULE(Value *lhs, Value *rhs);
-		static Value *createFCmpUNE(Value *lhs, Value *rhs);
-
-		// Vector instructions
-		static Value *createExtractElement(Value *vector, Type *type, int index);
-		static Value *createInsertElement(Value *vector, Value *element, int index);
-		static Value *createShuffleVector(Value *V1, Value *V2, const int *select);
-
-		// Other instructions
-		static Value *createSelect(Value *C, Value *ifTrue, Value *ifFalse);
-		static SwitchCases *createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases);
-		static void addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch);
-		static void createUnreachable();
-
-		// Constant values
-		static Value *createNullValue(Type *type);
-		static Value *createConstantLong(int64_t i);
-		static Value *createConstantInt(int i);
-		static Value *createConstantInt(unsigned int i);
-		static Value *createConstantBool(bool b);
-		static Value *createConstantByte(signed char i);
-		static Value *createConstantByte(unsigned char i);
-		static Value *createConstantShort(short i);
-		static Value *createConstantShort(unsigned short i);
-		static Value *createConstantFloat(float x);
-		static Value *createNullPointer(Type *type);
-		static Value *createConstantVector(const int64_t *constants, Type *type);
-		static Value *createConstantVector(const double *constants, Type *type);
-
-		static Type *getPointerType(Type *elementType);
+		CoroutineEntryBegin = 0,
+		CoroutineEntryAwait,
+		CoroutineEntryDestroy,
+		CoroutineEntryCount
 	};
-}
+
+	static void createCoroutine(Type *ReturnType, std::vector<Type*> &Params);
+	std::shared_ptr<Routine> acquireCoroutine(const char *name, const Config::Edit &cfg = Config::Edit::None);
+	static void yield(Value*);
+
+	// Terminators
+	static void createRetVoid();
+	static void createRet(Value *V);
+	static void createBr(BasicBlock *dest);
+	static void createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse);
+
+	// Binary operators
+	static Value *createAdd(Value *lhs, Value *rhs);
+	static Value *createSub(Value *lhs, Value *rhs);
+	static Value *createMul(Value *lhs, Value *rhs);
+	static Value *createUDiv(Value *lhs, Value *rhs);
+	static Value *createSDiv(Value *lhs, Value *rhs);
+	static Value *createFAdd(Value *lhs, Value *rhs);
+	static Value *createFSub(Value *lhs, Value *rhs);
+	static Value *createFMul(Value *lhs, Value *rhs);
+	static Value *createFDiv(Value *lhs, Value *rhs);
+	static Value *createURem(Value *lhs, Value *rhs);
+	static Value *createSRem(Value *lhs, Value *rhs);
+	static Value *createFRem(Value *lhs, Value *rhs);
+	static Value *createShl(Value *lhs, Value *rhs);
+	static Value *createLShr(Value *lhs, Value *rhs);
+	static Value *createAShr(Value *lhs, Value *rhs);
+	static Value *createAnd(Value *lhs, Value *rhs);
+	static Value *createOr(Value *lhs, Value *rhs);
+	static Value *createXor(Value *lhs, Value *rhs);
+
+	// Unary operators
+	static Value *createNeg(Value *V);
+	static Value *createFNeg(Value *V);
+	static Value *createNot(Value *V);
+
+	// Memory instructions
+	static Value *createLoad(Value *ptr, Type *type, bool isVolatile = false, unsigned int alignment = 0, bool atomic = false , std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createStore(Value *value, Value *ptr, Type *type, bool isVolatile = false, unsigned int aligment = 0, bool atomic = false, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex);
+
+	// Masked Load / Store instructions
+	static Value *createMaskedLoad(Value *base, Type *elementType, Value *mask, unsigned int alignment, bool zeroMaskedLanes);
+	static void createMaskedStore(Value *base, Value *value, Value *mask, unsigned int alignment);
+
+	// Barrier instructions
+	static void createFence(std::memory_order memoryOrder);
+
+	// Atomic instructions
+	static Value *createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal);
+
+	// Cast/Conversion Operators
+	static Value *createTrunc(Value *V, Type *destType);
+	static Value *createZExt(Value *V, Type *destType);
+	static Value *createSExt(Value *V, Type *destType);
+	static Value *createFPToUI(Value *V, Type *destType);
+	static Value *createFPToSI(Value *V, Type *destType);
+	static Value *createSIToFP(Value *V, Type *destType);
+	static Value *createFPTrunc(Value *V, Type *destType);
+	static Value *createFPExt(Value *V, Type *destType);
+	static Value *createBitCast(Value *V, Type *destType);
+
+	// Compare instructions
+	static Value *createPtrEQ(Value *lhs, Value *rhs);
+	static Value *createICmpEQ(Value *lhs, Value *rhs);
+	static Value *createICmpNE(Value *lhs, Value *rhs);
+	static Value *createICmpUGT(Value *lhs, Value *rhs);
+	static Value *createICmpUGE(Value *lhs, Value *rhs);
+	static Value *createICmpULT(Value *lhs, Value *rhs);
+	static Value *createICmpULE(Value *lhs, Value *rhs);
+	static Value *createICmpSGT(Value *lhs, Value *rhs);
+	static Value *createICmpSGE(Value *lhs, Value *rhs);
+	static Value *createICmpSLT(Value *lhs, Value *rhs);
+	static Value *createICmpSLE(Value *lhs, Value *rhs);
+	static Value *createFCmpOEQ(Value *lhs, Value *rhs);
+	static Value *createFCmpOGT(Value *lhs, Value *rhs);
+	static Value *createFCmpOGE(Value *lhs, Value *rhs);
+	static Value *createFCmpOLT(Value *lhs, Value *rhs);
+	static Value *createFCmpOLE(Value *lhs, Value *rhs);
+	static Value *createFCmpONE(Value *lhs, Value *rhs);
+	static Value *createFCmpORD(Value *lhs, Value *rhs);
+	static Value *createFCmpUNO(Value *lhs, Value *rhs);
+	static Value *createFCmpUEQ(Value *lhs, Value *rhs);
+	static Value *createFCmpUGT(Value *lhs, Value *rhs);
+	static Value *createFCmpUGE(Value *lhs, Value *rhs);
+	static Value *createFCmpULT(Value *lhs, Value *rhs);
+	static Value *createFCmpULE(Value *lhs, Value *rhs);
+	static Value *createFCmpUNE(Value *lhs, Value *rhs);
+
+	// Vector instructions
+	static Value *createExtractElement(Value *vector, Type *type, int index);
+	static Value *createInsertElement(Value *vector, Value *element, int index);
+	static Value *createShuffleVector(Value *V1, Value *V2, const int *select);
+
+	// Other instructions
+	static Value *createSelect(Value *C, Value *ifTrue, Value *ifFalse);
+	static SwitchCases *createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases);
+	static void addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch);
+	static void createUnreachable();
+
+	// Constant values
+	static Value *createNullValue(Type *type);
+	static Value *createConstantLong(int64_t i);
+	static Value *createConstantInt(int i);
+	static Value *createConstantInt(unsigned int i);
+	static Value *createConstantBool(bool b);
+	static Value *createConstantByte(signed char i);
+	static Value *createConstantByte(unsigned char i);
+	static Value *createConstantShort(short i);
+	static Value *createConstantShort(unsigned short i);
+	static Value *createConstantFloat(float x);
+	static Value *createNullPointer(Type *type);
+	static Value *createConstantVector(const int64_t *constants, Type *type);
+	static Value *createConstantVector(const double *constants, Type *type);
+
+	static Type *getPointerType(Type *elementType);
+};
+
+}  // namespace rr
 
 #endif   // rr_Nucleus_hpp

diff --git a/src/Reactor/Optimizer.cpp b/src/Reactor/Optimizer.cpp
index 5d89878..7cc3540 100644
--- a/src/Reactor/Optimizer.cpp
+++ b/src/Reactor/Optimizer.cpp

@@ -19,819 +19,821 @@
 
 #include <vector>
 
-namespace
+namespace {
+
+class Optimizer
 {
-	class Optimizer
+public:
+	void run(Ice::Cfg *function);
+
+private:
+	void analyzeUses(Ice::Cfg *function);
+	void eliminateDeadCode();
+	void eliminateUnitializedLoads();
+	void eliminateLoadsFollowingSingleStore();
+	void optimizeStoresInSingleBasicBlock();
+
+	void replace(Ice::Inst *instruction, Ice::Operand *newValue);
+	void deleteInstruction(Ice::Inst *instruction);
+	bool isDead(Ice::Inst *instruction);
+
+	static const Ice::InstIntrinsicCall *asLoadSubVector(const Ice::Inst *instruction);
+	static const Ice::InstIntrinsicCall *asStoreSubVector(const Ice::Inst *instruction);
+	static bool isLoad(const Ice::Inst &instruction);
+	static bool isStore(const Ice::Inst &instruction);
+	static Ice::Operand *storeAddress(const Ice::Inst *instruction);
+	static Ice::Operand *loadAddress(const Ice::Inst *instruction);
+	static Ice::Operand *storeData(const Ice::Inst *instruction);
+	static std::size_t storeSize(const Ice::Inst *instruction);
+	static bool loadTypeMatchesStore(const Ice::Inst *load, const Ice::Inst *store);
+
+	Ice::Cfg *function;
+	Ice::GlobalContext *context;
+
+	struct Uses : std::vector<Ice::Inst*>
 	{
-	public:
-		void run(Ice::Cfg *function);
+		bool areOnlyLoadStore() const;
+		void insert(Ice::Operand *value, Ice::Inst *instruction);
+		void erase(Ice::Inst *instruction);
 
-	private:
-		void analyzeUses(Ice::Cfg *function);
-		void eliminateDeadCode();
-		void eliminateUnitializedLoads();
-		void eliminateLoadsFollowingSingleStore();
-		void optimizeStoresInSingleBasicBlock();
-
-		void replace(Ice::Inst *instruction, Ice::Operand *newValue);
-		void deleteInstruction(Ice::Inst *instruction);
-		bool isDead(Ice::Inst *instruction);
-
-		static const Ice::InstIntrinsicCall *asLoadSubVector(const Ice::Inst *instruction);
-		static const Ice::InstIntrinsicCall *asStoreSubVector(const Ice::Inst *instruction);
-		static bool isLoad(const Ice::Inst &instruction);
-		static bool isStore(const Ice::Inst &instruction);
-		static Ice::Operand *storeAddress(const Ice::Inst *instruction);
-		static Ice::Operand *loadAddress(const Ice::Inst *instruction);
-		static Ice::Operand *storeData(const Ice::Inst *instruction);
-		static std::size_t storeSize(const Ice::Inst *instruction);
-		static bool loadTypeMatchesStore(const Ice::Inst *load, const Ice::Inst *store);
-
-		Ice::Cfg *function;
-		Ice::GlobalContext *context;
-
-		struct Uses : std::vector<Ice::Inst*>
-		{
-			bool areOnlyLoadStore() const;
-			void insert(Ice::Operand *value, Ice::Inst *instruction);
-			void erase(Ice::Inst *instruction);
-
-			std::vector<Ice::Inst*> loads;
-			std::vector<Ice::Inst*> stores;
-		};
-
-		struct LoadStoreInst
-		{
-			LoadStoreInst(Ice::Inst* inst, bool isStore)
-			  : inst(inst),
-			    address(isStore ? storeAddress(inst) : loadAddress(inst)),
-			    isStore(isStore)
-			{
-			}
-
-			Ice::Inst* inst;
-			Ice::Operand *address;
-			bool isStore;
-		};
-
-		Optimizer::Uses* getUses(Ice::Operand*);
-		void setUses(Ice::Operand*, Optimizer::Uses*);
-		bool hasUses(Ice::Operand*) const;
-
-		Ice::CfgNode* getNode(Ice::Inst*);
-		void setNode(Ice::Inst*, Ice::CfgNode*);
-
-		Ice::Inst* getDefinition(Ice::Variable*);
-		void setDefinition(Ice::Variable*, Ice::Inst*);
-
-		const std::vector<LoadStoreInst>& getLoadStoreInsts(Ice::CfgNode*);
-		void setLoadStoreInsts(Ice::CfgNode*, std::vector<LoadStoreInst>*);
-		bool hasLoadStoreInsts(Ice::CfgNode* node) const;
-
-		std::vector<Optimizer::Uses*> allocatedUses;
+		std::vector<Ice::Inst*> loads;
+		std::vector<Ice::Inst*> stores;
 	};
 
-	void Optimizer::run(Ice::Cfg *function)
+	struct LoadStoreInst
 	{
-		this->function = function;
-		this->context = function->getContext();
-
-		analyzeUses(function);
-
-		eliminateDeadCode();
-		eliminateUnitializedLoads();
-		eliminateLoadsFollowingSingleStore();
-		optimizeStoresInSingleBasicBlock();
-		eliminateDeadCode();
-
-		for(auto uses : allocatedUses)
+		LoadStoreInst(Ice::Inst* inst, bool isStore)
+		  : inst(inst),
+		    address(isStore ? storeAddress(inst) : loadAddress(inst)),
+		    isStore(isStore)
 		{
-			delete uses;
 		}
-		allocatedUses.clear();
-	}
 
-	void Optimizer::eliminateDeadCode()
+		Ice::Inst* inst;
+		Ice::Operand *address;
+		bool isStore;
+	};
+
+	Optimizer::Uses* getUses(Ice::Operand*);
+	void setUses(Ice::Operand*, Optimizer::Uses*);
+	bool hasUses(Ice::Operand*) const;
+
+	Ice::CfgNode* getNode(Ice::Inst*);
+	void setNode(Ice::Inst*, Ice::CfgNode*);
+
+	Ice::Inst* getDefinition(Ice::Variable*);
+	void setDefinition(Ice::Variable*, Ice::Inst*);
+
+	const std::vector<LoadStoreInst>& getLoadStoreInsts(Ice::CfgNode*);
+	void setLoadStoreInsts(Ice::CfgNode*, std::vector<LoadStoreInst>*);
+	bool hasLoadStoreInsts(Ice::CfgNode* node) const;
+
+	std::vector<Optimizer::Uses*> allocatedUses;
+};
+
+void Optimizer::run(Ice::Cfg *function)
+{
+	this->function = function;
+	this->context = function->getContext();
+
+	analyzeUses(function);
+
+	eliminateDeadCode();
+	eliminateUnitializedLoads();
+	eliminateLoadsFollowingSingleStore();
+	optimizeStoresInSingleBasicBlock();
+	eliminateDeadCode();
+
+	for(auto uses : allocatedUses)
 	{
-		bool modified;
-		do
+		delete uses;
+	}
+	allocatedUses.clear();
+}
+
+void Optimizer::eliminateDeadCode()
+{
+	bool modified;
+	do
+	{
+		modified = false;
+		for(Ice::CfgNode *basicBlock : function->getNodes())
 		{
-			modified = false;
-			for(Ice::CfgNode *basicBlock : function->getNodes())
+			for(Ice::Inst &inst : Ice::reverse_range(basicBlock->getInsts()))
 			{
-				for(Ice::Inst &inst : Ice::reverse_range(basicBlock->getInsts()))
+				if(inst.isDeleted())
+				{
+					continue;
+				}
+
+				if(isDead(&inst))
+				{
+					deleteInstruction(&inst);
+					modified = true;
+				}
+			}
+		}
+	}
+	while(modified);
+}
+
+void Optimizer::eliminateUnitializedLoads()
+{
+	Ice::CfgNode *entryBlock = function->getEntryNode();
+
+	for(Ice::Inst &alloca : entryBlock->getInsts())
+	{
+		if(alloca.isDeleted())
+		{
+			continue;
+		}
+
+		if(!llvm::isa<Ice::InstAlloca>(alloca))
+		{
+			break;   // Allocas are all at the top
+		}
+
+		Ice::Operand *address = alloca.getDest();
+
+		if(!hasUses(address))
+		{
+			continue;
+		}
+
+		const auto &addressUses = *getUses(address);
+
+		if(!addressUses.areOnlyLoadStore())
+		{
+			continue;
+		}
+
+		if(addressUses.stores.empty())
+		{
+			for(Ice::Inst *load : addressUses.loads)
+			{
+				Ice::Variable *loadData = load->getDest();
+
+				if(hasUses(loadData))
+				{
+					for(Ice::Inst *use : *getUses(loadData))
+					{
+						for(Ice::SizeT i = 0; i < use->getSrcSize(); i++)
+						{
+							if(use->getSrc(i) == loadData)
+							{
+								auto *undef = context->getConstantUndef(loadData->getType());
+
+								use->replaceSource(i, undef);
+							}
+						}
+					}
+
+					setUses(loadData, nullptr);
+				}
+
+				load->setDeleted();
+			}
+
+			alloca.setDeleted();
+			setUses(address, nullptr);
+		}
+	}
+}
+
+void Optimizer::eliminateLoadsFollowingSingleStore()
+{
+	Ice::CfgNode *entryBlock = function->getEntryNode();
+
+	for(Ice::Inst &alloca : entryBlock->getInsts())
+	{
+		if(alloca.isDeleted())
+		{
+			continue;
+		}
+
+		if(!llvm::isa<Ice::InstAlloca>(alloca))
+		{
+			break;   // Allocas are all at the top
+		}
+
+		Ice::Operand *address = alloca.getDest();
+
+		if(!hasUses(address))
+		{
+			continue;
+		}
+
+		auto &addressUses = *getUses(address);
+
+		if(!addressUses.areOnlyLoadStore())
+		{
+			continue;
+		}
+
+		if(addressUses.stores.size() == 1)
+		{
+			Ice::Inst *store = addressUses.stores[0];
+			Ice::Operand *storeValue = storeData(store);
+
+			for(Ice::Inst *load = &*++store->getIterator(), *next = nullptr; load != next; next = load, load = &*++store->getIterator())
+			{
+				if(load->isDeleted() || !isLoad(*load))
+				{
+					continue;
+				}
+
+				if(loadAddress(load) != address)
+				{
+					continue;
+				}
+
+				if(!loadTypeMatchesStore(load, store))
+				{
+					continue;
+				}
+
+				replace(load, storeValue);
+
+				for(size_t i = 0; i < addressUses.loads.size(); i++)
+				{
+					if(addressUses.loads[i] == load)
+					{
+						addressUses.loads[i] = addressUses.loads.back();
+						addressUses.loads.pop_back();
+						break;
+					}
+				}
+
+				for(size_t i = 0; i < addressUses.size(); i++)
+				{
+					if(addressUses[i] == load)
+					{
+						addressUses[i] = addressUses.back();
+						addressUses.pop_back();
+						break;
+					}
+				}
+
+				if(addressUses.size() == 1)
+				{
+					assert(addressUses[0] == store);
+
+					alloca.setDeleted();
+					store->setDeleted();
+					setUses(address, nullptr);
+
+					if(hasUses(storeValue))
+					{
+						auto &valueUses = *getUses(storeValue);
+
+						for(size_t i = 0; i < valueUses.size(); i++)
+						{
+							if(valueUses[i] == store)
+							{
+								valueUses[i] = valueUses.back();
+								valueUses.pop_back();
+								break;
+							}
+						}
+
+						if(valueUses.empty())
+						{
+							setUses(storeValue, nullptr);
+						}
+					}
+
+					break;
+				}
+			}
+		}
+	}
+}
+
+void Optimizer::optimizeStoresInSingleBasicBlock()
+{
+	Ice::CfgNode *entryBlock = function->getEntryNode();
+
+	std::vector<std::vector<LoadStoreInst>* > allocatedVectors;
+
+	for(Ice::Inst &alloca : entryBlock->getInsts())
+	{
+		if(alloca.isDeleted())
+		{
+			continue;
+		}
+
+		if(!llvm::isa<Ice::InstAlloca>(alloca))
+		{
+			break;   // Allocas are all at the top
+		}
+
+		Ice::Operand *address = alloca.getDest();
+
+		if(!hasUses(address))
+		{
+			continue;
+		}
+
+		const auto &addressUses = *getUses(address);
+
+		if(!addressUses.areOnlyLoadStore())
+		{
+			continue;
+		}
+
+		Ice::CfgNode *singleBasicBlock = getNode(addressUses.stores[0]);
+
+		for(size_t i = 1; i < addressUses.stores.size(); i++)
+		{
+			Ice::Inst *store = addressUses.stores[i];
+			if(getNode(store) != singleBasicBlock)
+			{
+				singleBasicBlock = nullptr;
+				break;
+			}
+		}
+
+		if(singleBasicBlock)
+		{
+			if(!hasLoadStoreInsts(singleBasicBlock))
+			{
+				std::vector<LoadStoreInst>* loadStoreInstVector = new std::vector<LoadStoreInst>();
+				setLoadStoreInsts(singleBasicBlock, loadStoreInstVector);
+				allocatedVectors.push_back(loadStoreInstVector);
+				for(Ice::Inst &inst : singleBasicBlock->getInsts())
 				{
 					if(inst.isDeleted())
 					{
 						continue;
 					}
 
-					if(isDead(&inst))
+					bool isStoreInst = isStore(inst);
+					bool isLoadInst = isLoad(inst);
+
+					if(isStoreInst || isLoadInst)
 					{
-						deleteInstruction(&inst);
-						modified = true;
+						loadStoreInstVector->push_back(LoadStoreInst(&inst, isStoreInst));
 					}
 				}
 			}
-		}
-		while(modified);
-	}
 
-	void Optimizer::eliminateUnitializedLoads()
-	{
-		Ice::CfgNode *entryBlock = function->getEntryNode();
+			Ice::Inst *store = nullptr;
+			Ice::Operand *storeValue = nullptr;
+			bool unmatchedLoads = false;
 
-		for(Ice::Inst &alloca : entryBlock->getInsts())
-		{
-			if(alloca.isDeleted())
+			for (auto& loadStoreInst : getLoadStoreInsts(singleBasicBlock))
 			{
-				continue;
-			}
+				Ice::Inst* inst = loadStoreInst.inst;
 
-			if(!llvm::isa<Ice::InstAlloca>(alloca))
-			{
-				break;   // Allocas are all at the top
-			}
-
-			Ice::Operand *address = alloca.getDest();
-
-			if(!hasUses(address))
-			{
-				continue;
-			}
-
-			const auto &addressUses = *getUses(address);
-
-			if(!addressUses.areOnlyLoadStore())
-			{
-				continue;
-			}
-
-			if(addressUses.stores.empty())
-			{
-				for(Ice::Inst *load : addressUses.loads)
-				{
-					Ice::Variable *loadData = load->getDest();
-
-					if(hasUses(loadData))
-					{
-						for(Ice::Inst *use : *getUses(loadData))
-						{
-							for(Ice::SizeT i = 0; i < use->getSrcSize(); i++)
-							{
-								if(use->getSrc(i) == loadData)
-								{
-									auto *undef = context->getConstantUndef(loadData->getType());
-
-									use->replaceSource(i, undef);
-								}
-							}
-						}
-
-						setUses(loadData, nullptr);
-					}
-
-					load->setDeleted();
-				}
-
-				alloca.setDeleted();
-				setUses(address, nullptr);
-			}
-		}
-	}
-
-	void Optimizer::eliminateLoadsFollowingSingleStore()
-	{
-		Ice::CfgNode *entryBlock = function->getEntryNode();
-
-		for(Ice::Inst &alloca : entryBlock->getInsts())
-		{
-			if(alloca.isDeleted())
-			{
-				continue;
-			}
-
-			if(!llvm::isa<Ice::InstAlloca>(alloca))
-			{
-				break;   // Allocas are all at the top
-			}
-
-			Ice::Operand *address = alloca.getDest();
-
-			if(!hasUses(address))
-			{
-				continue;
-			}
-
-			auto &addressUses = *getUses(address);
-
-			if(!addressUses.areOnlyLoadStore())
-			{
-				continue;
-			}
-
-			if(addressUses.stores.size() == 1)
-			{
-				Ice::Inst *store = addressUses.stores[0];
-				Ice::Operand *storeValue = storeData(store);
-
-				for(Ice::Inst *load = &*++store->getIterator(), *next = nullptr; load != next; next = load, load = &*++store->getIterator())
-				{
-					if(load->isDeleted() || !isLoad(*load))
-					{
-						continue;
-					}
-
-					if(loadAddress(load) != address)
-					{
-						continue;
-					}
-
-					if(!loadTypeMatchesStore(load, store))
-					{
-						continue;
-					}
-
-					replace(load, storeValue);
-
-					for(size_t i = 0; i < addressUses.loads.size(); i++)
-					{
-						if(addressUses.loads[i] == load)
-						{
-							addressUses.loads[i] = addressUses.loads.back();
-							addressUses.loads.pop_back();
-							break;
-						}
-					}
-
-					for(size_t i = 0; i < addressUses.size(); i++)
-					{
-						if(addressUses[i] == load)
-						{
-							addressUses[i] = addressUses.back();
-							addressUses.pop_back();
-							break;
-						}
-					}
-
-					if(addressUses.size() == 1)
-					{
-						assert(addressUses[0] == store);
-
-						alloca.setDeleted();
-						store->setDeleted();
-						setUses(address, nullptr);
-
-						if(hasUses(storeValue))
-						{
-							auto &valueUses = *getUses(storeValue);
-
-							for(size_t i = 0; i < valueUses.size(); i++)
-							{
-								if(valueUses[i] == store)
-								{
-									valueUses[i] = valueUses.back();
-									valueUses.pop_back();
-									break;
-								}
-							}
-
-							if(valueUses.empty())
-							{
-								setUses(storeValue, nullptr);
-							}
-						}
-
-						break;
-					}
-				}
-			}
-		}
-	}
-
-	void Optimizer::optimizeStoresInSingleBasicBlock()
-	{
-		Ice::CfgNode *entryBlock = function->getEntryNode();
-
-		std::vector<std::vector<LoadStoreInst>* > allocatedVectors;
-
-		for(Ice::Inst &alloca : entryBlock->getInsts())
-		{
-			if(alloca.isDeleted())
-			{
-				continue;
-			}
-
-			if(!llvm::isa<Ice::InstAlloca>(alloca))
-			{
-				break;   // Allocas are all at the top
-			}
-
-			Ice::Operand *address = alloca.getDest();
-
-			if(!hasUses(address))
-			{
-				continue;
-			}
-
-			const auto &addressUses = *getUses(address);
-
-			if(!addressUses.areOnlyLoadStore())
-			{
-				continue;
-			}
-
-			Ice::CfgNode *singleBasicBlock = getNode(addressUses.stores[0]);
-
-			for(size_t i = 1; i < addressUses.stores.size(); i++)
-			{
-				Ice::Inst *store = addressUses.stores[i];
-				if(getNode(store) != singleBasicBlock)
-				{
-					singleBasicBlock = nullptr;
-					break;
-				}
-			}
-
-			if(singleBasicBlock)
-			{
-				if(!hasLoadStoreInsts(singleBasicBlock))
-				{
-					std::vector<LoadStoreInst>* loadStoreInstVector = new std::vector<LoadStoreInst>();
-					setLoadStoreInsts(singleBasicBlock, loadStoreInstVector);
-					allocatedVectors.push_back(loadStoreInstVector);
-					for(Ice::Inst &inst : singleBasicBlock->getInsts())
-					{
-						if(inst.isDeleted())
-						{
-							continue;
-						}
-
-						bool isStoreInst = isStore(inst);
-						bool isLoadInst = isLoad(inst);
-
-						if(isStoreInst || isLoadInst)
-						{
-							loadStoreInstVector->push_back(LoadStoreInst(&inst, isStoreInst));
-						}
-					}
-				}
-
-				Ice::Inst *store = nullptr;
-				Ice::Operand *storeValue = nullptr;
-				bool unmatchedLoads = false;
-
-				for (auto& loadStoreInst : getLoadStoreInsts(singleBasicBlock))
-				{
-					Ice::Inst* inst = loadStoreInst.inst;
-
-					if((loadStoreInst.address != address) || inst->isDeleted())
-					{
-						continue;
-					}
-
-					if(loadStoreInst.isStore)
-					{
-						// New store found. If we had a previous one, try to eliminate it.
-						if(store && !unmatchedLoads)
-						{
-							// If the previous store is wider than the new one, we can't eliminate it
-							// because there could be a wide load reading its non-overwritten data.
-							if(storeSize(inst) >= storeSize(store))
-							{
-								deleteInstruction(store);
-							}
-						}
-
-						store = inst;
-						storeValue = storeData(store);
-						unmatchedLoads = false;
-					}
-					else
-					{
-						if(!loadTypeMatchesStore(inst, store))
-						{
-							unmatchedLoads = true;
-							continue;
-						}
-
-						replace(inst, storeValue);
-					}
-				}
-			}
-		}
-
-		for(auto loadStoreInstVector : allocatedVectors)
-		{
-			delete loadStoreInstVector;
-		}
-	}
-
-	void Optimizer::analyzeUses(Ice::Cfg *function)
-	{
-		for(Ice::CfgNode *basicBlock : function->getNodes())
-		{
-			for(Ice::Inst &instruction : basicBlock->getInsts())
-			{
-				if(instruction.isDeleted())
+				if((loadStoreInst.address != address) || inst->isDeleted())
 				{
 					continue;
 				}
 
-				setNode(&instruction, basicBlock);
-				if(instruction.getDest())
+				if(loadStoreInst.isStore)
 				{
-					setDefinition(instruction.getDest(), &instruction);
-				}
-
-				for(Ice::SizeT i = 0; i < instruction.getSrcSize(); i++)
-				{
-					Ice::SizeT unique = 0;
-					for(; unique < i; unique++)
+					// New store found. If we had a previous one, try to eliminate it.
+					if(store && !unmatchedLoads)
 					{
-						if(instruction.getSrc(i) == instruction.getSrc(unique))
+						// If the previous store is wider than the new one, we can't eliminate it
+						// because there could be a wide load reading its non-overwritten data.
+						if(storeSize(inst) >= storeSize(store))
 						{
-							break;
+							deleteInstruction(store);
 						}
 					}
 
-					if(i == unique)
+					store = inst;
+					storeValue = storeData(store);
+					unmatchedLoads = false;
+				}
+				else
+				{
+					if(!loadTypeMatchesStore(inst, store))
 					{
-						Ice::Operand *src = instruction.getSrc(i);
-						getUses(src)->insert(src, &instruction);
+						unmatchedLoads = true;
+						continue;
 					}
+
+					replace(inst, storeValue);
 				}
 			}
 		}
 	}
 
-	void Optimizer::replace(Ice::Inst *instruction, Ice::Operand *newValue)
+	for(auto loadStoreInstVector : allocatedVectors)
 	{
-		Ice::Variable *oldValue = instruction->getDest();
+		delete loadStoreInstVector;
+	}
+}
 
-		if(!newValue)
+void Optimizer::analyzeUses(Ice::Cfg *function)
+{
+	for(Ice::CfgNode *basicBlock : function->getNodes())
+	{
+		for(Ice::Inst &instruction : basicBlock->getInsts())
 		{
-			newValue = context->getConstantUndef(oldValue->getType());
-		}
-
-		if(hasUses(oldValue))
-		{
-			for(Ice::Inst *use : *getUses(oldValue))
+			if(instruction.isDeleted())
 			{
-				assert(!use->isDeleted());   // Should have been removed from uses already
+				continue;
+			}
 
-				for(Ice::SizeT i = 0; i < use->getSrcSize(); i++)
+			setNode(&instruction, basicBlock);
+			if(instruction.getDest())
+			{
+				setDefinition(instruction.getDest(), &instruction);
+			}
+
+			for(Ice::SizeT i = 0; i < instruction.getSrcSize(); i++)
+			{
+				Ice::SizeT unique = 0;
+				for(; unique < i; unique++)
 				{
-					if(use->getSrc(i) == oldValue)
+					if(instruction.getSrc(i) == instruction.getSrc(unique))
 					{
-						use->replaceSource(i, newValue);
-					}
-				}
-
-				getUses(newValue)->insert(newValue, use);
-			}
-
-			setUses(oldValue, nullptr);
-		}
-
-		deleteInstruction(instruction);
-	}
-
-	void Optimizer::deleteInstruction(Ice::Inst *instruction)
-	{
-		if(!instruction || instruction->isDeleted())
-		{
-			return;
-		}
-
-		instruction->setDeleted();
-
-		for(Ice::SizeT i = 0; i < instruction->getSrcSize(); i++)
-		{
-			Ice::Operand *src = instruction->getSrc(i);
-
-			if(hasUses(src))
-			{
-				auto &srcUses = *getUses(src);
-
-				srcUses.erase(instruction);
-
-				if(srcUses.empty())
-				{
-					setUses(src, nullptr);
-
-					if(Ice::Variable *var = llvm::dyn_cast<Ice::Variable>(src))
-					{
-						deleteInstruction(getDefinition(var));
-					}
-				}
-			}
-		}
-	}
-
-	bool Optimizer::isDead(Ice::Inst *instruction)
-	{
-		Ice::Variable *dest = instruction->getDest();
-
-		if(dest)
-		{
-			return (!hasUses(dest) || getUses(dest)->empty()) && !instruction->hasSideEffects();
-		}
-		else if(isStore(*instruction))
-		{
-			if(Ice::Variable *address = llvm::dyn_cast<Ice::Variable>(storeAddress(instruction)))
-			{
-				Ice::Inst *def = getDefinition(address);
-
-				if(def && llvm::isa<Ice::InstAlloca>(def))
-				{
-					if(hasUses(address))
-					{
-						Optimizer::Uses* uses = getUses(address);
-						return uses->size() == uses->stores.size();   // Dead if all uses are stores
-					}
-					else
-					{
-						return true; // No uses
-					}
-				}
-			}
-		}
-
-		return false;
-	}
-
-	const Ice::InstIntrinsicCall *Optimizer::asLoadSubVector(const Ice::Inst *instruction)
-	{
-		if(auto *instrinsic = llvm::dyn_cast<Ice::InstIntrinsicCall>(instruction))
-		{
-			if(instrinsic->getIntrinsicInfo().ID == Ice::Intrinsics::LoadSubVector)
-			{
-				return instrinsic;
-			}
-		}
-
-		return nullptr;
-	}
-
-	const Ice::InstIntrinsicCall *Optimizer::asStoreSubVector(const Ice::Inst *instruction)
-	{
-		if(auto *instrinsic = llvm::dyn_cast<Ice::InstIntrinsicCall>(instruction))
-		{
-			if(instrinsic->getIntrinsicInfo().ID == Ice::Intrinsics::StoreSubVector)
-			{
-				return instrinsic;
-			}
-		}
-
-		return nullptr;
-	}
-
-	bool Optimizer::isLoad(const Ice::Inst &instruction)
-	{
-		if(llvm::isa<Ice::InstLoad>(&instruction))
-		{
-			return true;
-		}
-
-		return asLoadSubVector(&instruction) != nullptr;
-	}
-
-	bool Optimizer::isStore(const Ice::Inst &instruction)
-	{
-		if(llvm::isa<Ice::InstStore>(&instruction))
-		{
-			return true;
-		}
-
-		return asStoreSubVector(&instruction) != nullptr;
-	}
-
-	Ice::Operand *Optimizer::storeAddress(const Ice::Inst *instruction)
-	{
-		assert(isStore(*instruction));
-
-		if(auto *store = llvm::dyn_cast<Ice::InstStore>(instruction))
-		{
-			return store->getAddr();
-		}
-
-		if(auto *storeSubVector = asStoreSubVector(instruction))
-		{
-			return storeSubVector->getSrc(2);
-		}
-
-		return nullptr;
-	}
-
-	Ice::Operand *Optimizer::loadAddress(const Ice::Inst *instruction)
-	{
-		assert(isLoad(*instruction));
-
-		if(auto *load = llvm::dyn_cast<Ice::InstLoad>(instruction))
-		{
-			return load->getSourceAddress();
-		}
-
-		if(auto *loadSubVector = asLoadSubVector(instruction))
-		{
-			return loadSubVector->getSrc(1);
-		}
-
-		return nullptr;
-	}
-
-	Ice::Operand *Optimizer::storeData(const Ice::Inst *instruction)
-	{
-		assert(isStore(*instruction));
-
-		if(auto *store = llvm::dyn_cast<Ice::InstStore>(instruction))
-		{
-			return store->getData();
-		}
-
-		if(auto *storeSubVector = asStoreSubVector(instruction))
-		{
-			return storeSubVector->getSrc(1);
-		}
-
-		return nullptr;
-	}
-
-	std::size_t Optimizer::storeSize(const Ice::Inst *store)
-	{
-		assert(isStore(*store));
-
-		if(auto *instStore = llvm::dyn_cast<Ice::InstStore>(store))
-		{
-			return Ice::typeWidthInBytes(instStore->getData()->getType());
-		}
-
-		if(auto *storeSubVector = asStoreSubVector(store))
-		{
-			return llvm::cast<Ice::ConstantInteger32>(storeSubVector->getSrc(3))->getValue();
-		}
-
-		return 0;
-	}
-
-	bool Optimizer::loadTypeMatchesStore(const Ice::Inst *load, const Ice::Inst *store)
-	{
-		if(!load || !store)
-		{
-			return false;
-		}
-
-		assert(isLoad(*load) && isStore(*store));
-		assert(loadAddress(load) == storeAddress(store));
-
-		if(auto *instStore = llvm::dyn_cast<Ice::InstStore>(store))
-		{
-			if(auto *instLoad = llvm::dyn_cast<Ice::InstLoad>(load))
-			{
-				return instStore->getData()->getType() == instLoad->getDest()->getType();
-			}
-		}
-
-		if(auto *storeSubVector = asStoreSubVector(store))
-		{
-			if(auto *loadSubVector = asLoadSubVector(load))
-			{
-				// Check for matching type and sub-vector width.
-				return storeSubVector->getSrc(1)->getType() == loadSubVector->getDest()->getType() &&
-				       llvm::cast<Ice::ConstantInteger32>(storeSubVector->getSrc(3))->getValue() ==
-				       llvm::cast<Ice::ConstantInteger32>(loadSubVector->getSrc(2))->getValue();
-			}
-		}
-
-		return false;
-	}
-
-	Optimizer::Uses* Optimizer::getUses(Ice::Operand* operand)
-	{
-		Optimizer::Uses* uses = (Optimizer::Uses*)operand->Ice::Operand::getExternalData();
-		if(!uses)
-		{
-			uses = new Optimizer::Uses;
-			setUses(operand, uses);
-			allocatedUses.push_back(uses);
-		}
-		return uses;
-	}
-
-	void Optimizer::setUses(Ice::Operand* operand, Optimizer::Uses* uses)
-	{
-		operand->Ice::Operand::setExternalData(uses);
-	}
-
-	bool Optimizer::hasUses(Ice::Operand* operand) const
-	{
-		return operand->Ice::Operand::getExternalData() != nullptr;
-	}
-
-	Ice::CfgNode* Optimizer::getNode(Ice::Inst* inst)
-	{
-		return (Ice::CfgNode*)inst->Ice::Inst::getExternalData();
-	}
-
-	void Optimizer::setNode(Ice::Inst* inst, Ice::CfgNode* node)
-	{
-		inst->Ice::Inst::setExternalData(node);
-	}
-
-	Ice::Inst* Optimizer::getDefinition(Ice::Variable* var)
-	{
-		return (Ice::Inst*)var->Ice::Variable::getExternalData();
-	}
-
-	void Optimizer::setDefinition(Ice::Variable* var, Ice::Inst* inst)
-	{
-		var->Ice::Variable::setExternalData(inst);
-	}
-
-	const std::vector<Optimizer::LoadStoreInst>& Optimizer::getLoadStoreInsts(Ice::CfgNode* node)
-	{
-		return *((const std::vector<LoadStoreInst>*)node->Ice::CfgNode::getExternalData());
-	}
-
-	void Optimizer::setLoadStoreInsts(Ice::CfgNode* node, std::vector<LoadStoreInst>* insts)
-	{
-		node->Ice::CfgNode::setExternalData(insts);
-	}
-
-	bool Optimizer::hasLoadStoreInsts(Ice::CfgNode* node) const
-	{
-		return node->Ice::CfgNode::getExternalData() != nullptr;
-	}
-
-	bool Optimizer::Uses::areOnlyLoadStore() const
-	{
-		return size() == (loads.size() + stores.size());
-	}
-
-	void Optimizer::Uses::insert(Ice::Operand *value, Ice::Inst *instruction)
-	{
-		push_back(instruction);
-
-		if(isLoad(*instruction))
-		{
-			if(value == loadAddress(instruction))
-			{
-				loads.push_back(instruction);
-			}
-		}
-		else if(isStore(*instruction))
-		{
-			if(value == storeAddress(instruction))
-			{
-				stores.push_back(instruction);
-			}
-		}
-	}
-
-	void Optimizer::Uses::erase(Ice::Inst *instruction)
-	{
-		auto &uses = *this;
-
-		for(size_t i = 0; i < uses.size(); i++)
-		{
-			if(uses[i] == instruction)
-			{
-				uses[i] = back();
-				pop_back();
-
-				for(size_t i = 0; i < loads.size(); i++)
-				{
-					if(loads[i] == instruction)
-					{
-						loads[i] = loads.back();
-						loads.pop_back();
 						break;
 					}
 				}
 
-				for(size_t i = 0; i < stores.size(); i++)
+				if(i == unique)
 				{
-					if(stores[i] == instruction)
-					{
-						stores[i] = stores.back();
-						stores.pop_back();
-						break;
-					}
+					Ice::Operand *src = instruction.getSrc(i);
+					getUses(src)->insert(src, &instruction);
 				}
-
-				break;
 			}
 		}
 	}
 }
 
-namespace rr
+void Optimizer::replace(Ice::Inst *instruction, Ice::Operand *newValue)
 {
-	void optimize(Ice::Cfg *function)
-	{
-		Optimizer optimizer;
+	Ice::Variable *oldValue = instruction->getDest();
 
-		optimizer.run(function);
+	if(!newValue)
+	{
+		newValue = context->getConstantUndef(oldValue->getType());
 	}
-}
\ No newline at end of file
+
+	if(hasUses(oldValue))
+	{
+		for(Ice::Inst *use : *getUses(oldValue))
+		{
+			assert(!use->isDeleted());   // Should have been removed from uses already
+
+			for(Ice::SizeT i = 0; i < use->getSrcSize(); i++)
+			{
+				if(use->getSrc(i) == oldValue)
+				{
+					use->replaceSource(i, newValue);
+				}
+			}
+
+			getUses(newValue)->insert(newValue, use);
+		}
+
+		setUses(oldValue, nullptr);
+	}
+
+	deleteInstruction(instruction);
+}
+
+void Optimizer::deleteInstruction(Ice::Inst *instruction)
+{
+	if(!instruction || instruction->isDeleted())
+	{
+		return;
+	}
+
+	instruction->setDeleted();
+
+	for(Ice::SizeT i = 0; i < instruction->getSrcSize(); i++)
+	{
+		Ice::Operand *src = instruction->getSrc(i);
+
+		if(hasUses(src))
+		{
+			auto &srcUses = *getUses(src);
+
+			srcUses.erase(instruction);
+
+			if(srcUses.empty())
+			{
+				setUses(src, nullptr);
+
+				if(Ice::Variable *var = llvm::dyn_cast<Ice::Variable>(src))
+				{
+					deleteInstruction(getDefinition(var));
+				}
+			}
+		}
+	}
+}
+
+bool Optimizer::isDead(Ice::Inst *instruction)
+{
+	Ice::Variable *dest = instruction->getDest();
+
+	if(dest)
+	{
+		return (!hasUses(dest) || getUses(dest)->empty()) && !instruction->hasSideEffects();
+	}
+	else if(isStore(*instruction))
+	{
+		if(Ice::Variable *address = llvm::dyn_cast<Ice::Variable>(storeAddress(instruction)))
+		{
+			Ice::Inst *def = getDefinition(address);
+
+			if(def && llvm::isa<Ice::InstAlloca>(def))
+			{
+				if(hasUses(address))
+				{
+					Optimizer::Uses* uses = getUses(address);
+					return uses->size() == uses->stores.size();   // Dead if all uses are stores
+				}
+				else
+				{
+					return true; // No uses
+				}
+			}
+		}
+	}
+
+	return false;
+}
+
+const Ice::InstIntrinsicCall *Optimizer::asLoadSubVector(const Ice::Inst *instruction)
+{
+	if(auto *instrinsic = llvm::dyn_cast<Ice::InstIntrinsicCall>(instruction))
+	{
+		if(instrinsic->getIntrinsicInfo().ID == Ice::Intrinsics::LoadSubVector)
+		{
+			return instrinsic;
+		}
+	}
+
+	return nullptr;
+}
+
+const Ice::InstIntrinsicCall *Optimizer::asStoreSubVector(const Ice::Inst *instruction)
+{
+	if(auto *instrinsic = llvm::dyn_cast<Ice::InstIntrinsicCall>(instruction))
+	{
+		if(instrinsic->getIntrinsicInfo().ID == Ice::Intrinsics::StoreSubVector)
+		{
+			return instrinsic;
+		}
+	}
+
+	return nullptr;
+}
+
+bool Optimizer::isLoad(const Ice::Inst &instruction)
+{
+	if(llvm::isa<Ice::InstLoad>(&instruction))
+	{
+		return true;
+	}
+
+	return asLoadSubVector(&instruction) != nullptr;
+}
+
+bool Optimizer::isStore(const Ice::Inst &instruction)
+{
+	if(llvm::isa<Ice::InstStore>(&instruction))
+	{
+		return true;
+	}
+
+	return asStoreSubVector(&instruction) != nullptr;
+}
+
+Ice::Operand *Optimizer::storeAddress(const Ice::Inst *instruction)
+{
+	assert(isStore(*instruction));
+
+	if(auto *store = llvm::dyn_cast<Ice::InstStore>(instruction))
+	{
+		return store->getAddr();
+	}
+
+	if(auto *storeSubVector = asStoreSubVector(instruction))
+	{
+		return storeSubVector->getSrc(2);
+	}
+
+	return nullptr;
+}
+
+Ice::Operand *Optimizer::loadAddress(const Ice::Inst *instruction)
+{
+	assert(isLoad(*instruction));
+
+	if(auto *load = llvm::dyn_cast<Ice::InstLoad>(instruction))
+	{
+		return load->getSourceAddress();
+	}
+
+	if(auto *loadSubVector = asLoadSubVector(instruction))
+	{
+		return loadSubVector->getSrc(1);
+	}
+
+	return nullptr;
+}
+
+Ice::Operand *Optimizer::storeData(const Ice::Inst *instruction)
+{
+	assert(isStore(*instruction));
+
+	if(auto *store = llvm::dyn_cast<Ice::InstStore>(instruction))
+	{
+		return store->getData();
+	}
+
+	if(auto *storeSubVector = asStoreSubVector(instruction))
+	{
+		return storeSubVector->getSrc(1);
+	}
+
+	return nullptr;
+}
+
+std::size_t Optimizer::storeSize(const Ice::Inst *store)
+{
+	assert(isStore(*store));
+
+	if(auto *instStore = llvm::dyn_cast<Ice::InstStore>(store))
+	{
+		return Ice::typeWidthInBytes(instStore->getData()->getType());
+	}
+
+	if(auto *storeSubVector = asStoreSubVector(store))
+	{
+		return llvm::cast<Ice::ConstantInteger32>(storeSubVector->getSrc(3))->getValue();
+	}
+
+	return 0;
+}
+
+bool Optimizer::loadTypeMatchesStore(const Ice::Inst *load, const Ice::Inst *store)
+{
+	if(!load || !store)
+	{
+		return false;
+	}
+
+	assert(isLoad(*load) && isStore(*store));
+	assert(loadAddress(load) == storeAddress(store));
+
+	if(auto *instStore = llvm::dyn_cast<Ice::InstStore>(store))
+	{
+		if(auto *instLoad = llvm::dyn_cast<Ice::InstLoad>(load))
+		{
+			return instStore->getData()->getType() == instLoad->getDest()->getType();
+		}
+	}
+
+	if(auto *storeSubVector = asStoreSubVector(store))
+	{
+		if(auto *loadSubVector = asLoadSubVector(load))
+		{
+			// Check for matching type and sub-vector width.
+			return storeSubVector->getSrc(1)->getType() == loadSubVector->getDest()->getType() &&
+			       llvm::cast<Ice::ConstantInteger32>(storeSubVector->getSrc(3))->getValue() ==
+			       llvm::cast<Ice::ConstantInteger32>(loadSubVector->getSrc(2))->getValue();
+		}
+	}
+
+	return false;
+}
+
+Optimizer::Uses* Optimizer::getUses(Ice::Operand* operand)
+{
+	Optimizer::Uses* uses = (Optimizer::Uses*)operand->Ice::Operand::getExternalData();
+	if(!uses)
+	{
+		uses = new Optimizer::Uses;
+		setUses(operand, uses);
+		allocatedUses.push_back(uses);
+	}
+	return uses;
+}
+
+void Optimizer::setUses(Ice::Operand* operand, Optimizer::Uses* uses)
+{
+	operand->Ice::Operand::setExternalData(uses);
+}
+
+bool Optimizer::hasUses(Ice::Operand* operand) const
+{
+	return operand->Ice::Operand::getExternalData() != nullptr;
+}
+
+Ice::CfgNode* Optimizer::getNode(Ice::Inst* inst)
+{
+	return (Ice::CfgNode*)inst->Ice::Inst::getExternalData();
+}
+
+void Optimizer::setNode(Ice::Inst* inst, Ice::CfgNode* node)
+{
+	inst->Ice::Inst::setExternalData(node);
+}
+
+Ice::Inst* Optimizer::getDefinition(Ice::Variable* var)
+{
+	return (Ice::Inst*)var->Ice::Variable::getExternalData();
+}
+
+void Optimizer::setDefinition(Ice::Variable* var, Ice::Inst* inst)
+{
+	var->Ice::Variable::setExternalData(inst);
+}
+
+const std::vector<Optimizer::LoadStoreInst>& Optimizer::getLoadStoreInsts(Ice::CfgNode* node)
+{
+	return *((const std::vector<LoadStoreInst>*)node->Ice::CfgNode::getExternalData());
+}
+
+void Optimizer::setLoadStoreInsts(Ice::CfgNode* node, std::vector<LoadStoreInst>* insts)
+{
+	node->Ice::CfgNode::setExternalData(insts);
+}
+
+bool Optimizer::hasLoadStoreInsts(Ice::CfgNode* node) const
+{
+	return node->Ice::CfgNode::getExternalData() != nullptr;
+}
+
+bool Optimizer::Uses::areOnlyLoadStore() const
+{
+	return size() == (loads.size() + stores.size());
+}
+
+void Optimizer::Uses::insert(Ice::Operand *value, Ice::Inst *instruction)
+{
+	push_back(instruction);
+
+	if(isLoad(*instruction))
+	{
+		if(value == loadAddress(instruction))
+		{
+			loads.push_back(instruction);
+		}
+	}
+	else if(isStore(*instruction))
+	{
+		if(value == storeAddress(instruction))
+		{
+			stores.push_back(instruction);
+		}
+	}
+}
+
+void Optimizer::Uses::erase(Ice::Inst *instruction)
+{
+	auto &uses = *this;
+
+	for(size_t i = 0; i < uses.size(); i++)
+	{
+		if(uses[i] == instruction)
+		{
+			uses[i] = back();
+			pop_back();
+
+			for(size_t i = 0; i < loads.size(); i++)
+			{
+				if(loads[i] == instruction)
+				{
+					loads[i] = loads.back();
+					loads.pop_back();
+					break;
+				}
+			}
+
+			for(size_t i = 0; i < stores.size(); i++)
+			{
+				if(stores[i] == instruction)
+				{
+					stores[i] = stores.back();
+					stores.pop_back();
+					break;
+				}
+			}
+
+			break;
+		}
+	}
+}
+
+}  // anonymous namespace 
+
+namespace rr {
+
+void optimize(Ice::Cfg *function)
+{
+	Optimizer optimizer;
+
+	optimizer.run(function);
+}
+
+}  // namespace rr
\ No newline at end of file

diff --git a/src/Reactor/Optimizer.hpp b/src/Reactor/Optimizer.hpp
index e6027e9..8aa2019 100644
--- a/src/Reactor/Optimizer.hpp
+++ b/src/Reactor/Optimizer.hpp

@@ -17,9 +17,10 @@
 
 #include "src/IceCfg.h"
 
-namespace rr
-{
-	void optimize(Ice::Cfg *function);
-}
+namespace rr {
+
+void optimize(Ice::Cfg *function);
+
+}  // namespace rr
 
 #endif   // rr_Optimizer_hpp

diff --git a/src/Reactor/Print.hpp b/src/Reactor/Print.hpp
index 252e621..ca06f4e 100644
--- a/src/Reactor/Print.hpp
+++ b/src/Reactor/Print.hpp

@@ -28,341 +28,341 @@
 
 namespace rr {
 
-	// PrintValue holds the printf format and value(s) for a single argument
-	// to Print(). A single argument can be expanded into multiple printf
-	// values - for example a Float4 will expand to "%f %f %f %f" and four
-	// scalar values.
-	// The PrintValue constructor accepts the following:
-	//   * Reactor LValues, RValues, Pointers.
-	//   * Standard Plain-Old-Value types (int, float, bool, etc)
-	//   * Custom types that specialize the PrintValue::Ty template struct.
-	//   * Static arrays in the form T[N] where T can be any of the above.
-	class PrintValue
+// PrintValue holds the printf format and value(s) for a single argument
+// to Print(). A single argument can be expanded into multiple printf
+// values - for example a Float4 will expand to "%f %f %f %f" and four
+// scalar values.
+// The PrintValue constructor accepts the following:
+//   * Reactor LValues, RValues, Pointers.
+//   * Standard Plain-Old-Value types (int, float, bool, etc)
+//   * Custom types that specialize the PrintValue::Ty template struct.
+//   * Static arrays in the form T[N] where T can be any of the above.
+class PrintValue
+{
+	// Ty is a template that can be specialized for printing type T.
+	// Each specialization must expose:
+	//  * A 'static std::string fmt(const T& v)' method that provides the
+	//    printf format specifier.
+	//  * A 'static std::vector<rr::Value*> val(const T& v)' method that
+	//    returns all the printf format values.
+	template <typename T> struct Ty
 	{
-		// Ty is a template that can be specialized for printing type T.
-		// Each specialization must expose:
-		//  * A 'static std::string fmt(const T& v)' method that provides the
-		//    printf format specifier.
-		//  * A 'static std::vector<rr::Value*> val(const T& v)' method that
-		//    returns all the printf format values.
-		template <typename T> struct Ty
+		// static std::string fmt(const T& v);
+		// static std::vector<rr::Value*> val(const T& v);
+	};
+
+	// returns the printf values for all the values in the given array.
+	template <typename T>
+	static std::vector<Value*> val(const T* list, int count) {
+		std::vector<Value*> values;
+		values.reserve(count);
+		for (int i = 0; i < count; i++)
 		{
-			// static std::string fmt(const T& v);
-			// static std::vector<rr::Value*> val(const T& v);
-		};
-
-		// returns the printf values for all the values in the given array.
-		template <typename T>
-		static std::vector<Value*> val(const T* list, int count) {
-			std::vector<Value*> values;
-			values.reserve(count);
-			for (int i = 0; i < count; i++)
-			{
-				auto v = val(list[i]);
-				values.insert(values.end(), v.begin(), v.end());
-			}
-			return values;
+			auto v = val(list[i]);
+			values.insert(values.end(), v.begin(), v.end());
 		}
-
-		// fmt returns the comma-delimited list of printf format strings for
-		// every element in the provided list, all enclosed in square brackets.
-		template <typename T>
-		static std::string fmt(const T* list, int count)
-		{
-			std::string out = "[";
-			for (int i = 0; i < count; i++)
-			{
-				if (i > 0) { out += ", "; }
-				out += fmt(list[i]);
-			}
-			return out + "]";
-		}
-
-		static std::string addr(const void* ptr)
-		{
-			char buf[32];
-			snprintf(buf, sizeof(buf), "%p", ptr);
-			return buf;
-		}
-
-	public:
-		const std::string format;
-		const std::vector<Value*> values;
-
-		// Constructs a PrintValue for the given value.
-		template <typename T>
-		PrintValue(const T& v) : format(fmt(v)), values(val(v)) {}
-
-		// Constructs a PrintValue for the given static array.
-		template <typename T, int N>
-		PrintValue(const T (&v)[N]) : format(fmt(&v[0], N)), values(val(&v[0], N)) {}
-
-		// Constructs a PrintValue for the given array starting at arr of length
-		// len.
-		template <typename T>
-		PrintValue(const T* arr, int len) : format(fmt(arr, len)), values(val(arr, len)) {}
-
-		// PrintValue constructors for plain-old-data values.
-		PrintValue(bool v) : format(v ? "true" : "false") {}
-		PrintValue(int8_t v) : format(std::to_string(v)) {}
-		PrintValue(uint8_t v) : format(std::to_string(v)) {}
-		PrintValue(int16_t v) : format(std::to_string(v)) {}
-		PrintValue(uint16_t v) : format(std::to_string(v)) {}
-		PrintValue(int32_t v) : format(std::to_string(v)) {}
-		PrintValue(uint32_t v) : format(std::to_string(v)) {}
-		PrintValue(int64_t v) : format(std::to_string(v)) {}
-		PrintValue(uint64_t v) : format(std::to_string(v)) {}
-		PrintValue(float v) : format(std::to_string(v)) {}
-		PrintValue(double v) : format(std::to_string(v)) {}
-
-		template <typename T>
-		PrintValue(const T* v) : format(addr(v)) {}
-
-		// vals is a helper to build composite value lists.
-		// vals returns the full, sequential list of printf argument values used
-		// to print all the provided variadic values.
-		// vals() is intended to be used by implementations of
-		// PrintValue::Ty<>::vals() to help declare aggregate types.
-		// For example, if you were declaring a PrintValue::Ty<> specialization
-		// for a custom Mat4x4 matrix formed from four Vector4 values, you'd
-		// write:
-		//
-		// namespace rr
-		// {
-		//		template <> struct PrintValue::Ty<Mat4x4>
-		//		{
-		//			static std::string fmt(const Mat4x4& v)
-		//			{
-		//				return	"[a: <%f, %f, %f, %f>,"
-		//				        " b: <%f, %f, %f, %f>,"
-		//				        " c: <%f, %f, %f, %f>,"
-		//				        " d: <%f, %f, %f, %f>]";
-		//			}
-		//			static std::vector<rr::Value*> val(const Mat4x4& v)
-		//			{
-		//				return PrintValue::vals(v.a, v.b, v.c, v.d);
-		//			}
-		//		};
-		//	}
-		template<typename ... ARGS>
-		static std::vector<Value*> vals(ARGS... v)
-		{
-			std::vector< std::vector<Value*> > lists = {val(v)...};
-			std::vector<Value*> joined;
-			for (const auto& list : lists)
-			{
-				joined.insert(joined.end(), list.begin(), list.end());
-			}
-			return joined;
-		}
-
-		// returns the printf format specifier for the given type via the
-		// PrintValue::Ty<T> specialization.
-		template <typename T>
-		static std::string fmt(const T& v) { return Ty<T>::fmt(v); }
-
-		// returns the printf value for the given type with a
-		// PrintValue::Ty<T> specialization.
-		template <typename T>
-		static std::vector<Value*> val(const T& v) { return Ty<T>::val(v); }
-	};
-
-	// PrintValue::Ty<T> specializations for basic types.
-	template <> struct PrintValue::Ty<const char*>
-	{
-		static std::string fmt(const char* v) { return "%s"; }
-		static std::vector<Value*> val(const char* v);
-	};
-	template <> struct PrintValue::Ty<std::string>
-	{
-		static std::string fmt(const std::string& v) { return PrintValue::Ty<const char*>::fmt(v.c_str()); }
-		static std::vector<Value*> val(const std::string& v) { return PrintValue::Ty<const char*>::val(v.c_str()); }
-	};
-
-	// PrintValue::Ty<T> specializations for standard Reactor types.
-	template <> struct PrintValue::Ty<Bool>
-	{
-		static std::string fmt(const RValue<Bool>& v) { return "%d"; }
-		static std::vector<Value*> val(const RValue<Bool>& v) { return {v.value}; }
-	};
-	template <> struct PrintValue::Ty<Byte>
-	{
-		static std::string fmt(const RValue<Byte>& v) { return "%d"; }
-		static std::vector<Value*> val(const RValue<Byte>& v);
-	};
-	template <> struct PrintValue::Ty<Byte4>
-	{
-		static std::string fmt(const RValue<Byte4>& v) { return "[%d, %d, %d, %d]"; }
-		static std::vector<Value*> val(const RValue<Byte4>& v);
-	};
-	template <> struct PrintValue::Ty<Int>
-	{
-		static std::string fmt(const RValue<Int>& v) { return "%d"; }
-		static std::vector<Value*> val(const RValue<Int>& v);
-	};
-	template <> struct PrintValue::Ty<Int2>
-	{
-		static std::string fmt(const RValue<Int>& v) { return "[%d, %d]"; }
-		static std::vector<Value*> val(const RValue<Int2>& v);
-	};
-	template <> struct PrintValue::Ty<Int4>
-	{
-		static std::string fmt(const RValue<Int4>& v) { return "[%d, %d, %d, %d]"; }
-		static std::vector<Value*> val(const RValue<Int4>& v);
-	};
-	template <> struct PrintValue::Ty<UInt>
-	{
-		static std::string fmt(const RValue<UInt>& v) { return "%u"; }
-		static std::vector<Value*> val(const RValue<UInt>& v);
-	};
-	template <> struct PrintValue::Ty<UInt2>
-	{
-		static std::string fmt(const RValue<UInt>& v) { return "[%u, %u]"; }
-		static std::vector<Value*> val(const RValue<UInt2>& v);
-	};
-	template <> struct PrintValue::Ty<UInt4>
-	{
-		static std::string fmt(const RValue<UInt4>& v) { return "[%u, %u, %u, %u]"; }
-		static std::vector<Value*> val(const RValue<UInt4>& v);
-	};
-	template <> struct PrintValue::Ty<Short>
-	{
-		static std::string fmt(const RValue<Short>& v) { return "%d"; }
-		static std::vector<Value*> val(const RValue<Short>& v);
-	};
-	template <> struct PrintValue::Ty<Short4>
-	{
-		static std::string fmt(const RValue<Short4>& v) { return "[%d, %d, %d, %d]"; }
-		static std::vector<Value*> val(const RValue<Short4>& v);
-	};
-	template <> struct PrintValue::Ty<UShort>
-	{
-		static std::string fmt(const RValue<UShort>& v) { return "%u"; }
-		static std::vector<Value*> val(const RValue<UShort>& v);
-	};
-	template <> struct PrintValue::Ty<UShort4>
-	{
-		static std::string fmt(const RValue<UShort4>& v) { return "[%u, %u, %u, %u]"; }
-		static std::vector<Value*> val(const RValue<UShort4>& v);
-	};
-	template <> struct PrintValue::Ty<Float>
-	{
-		static std::string fmt(const RValue<Float>& v) { return "[%f]"; }
-		static std::vector<Value*> val(const RValue<Float>& v);
-	};
-	template <> struct PrintValue::Ty<Float4>
-	{
-		static std::string fmt(const RValue<Float4>& v) { return "[%f, %f, %f, %f]"; }
-		static std::vector<Value*> val(const RValue<Float4>& v);
-	};
-	template <> struct PrintValue::Ty<Long>
-	{
-		static std::string fmt(const RValue<Long>& v) { return "%lld"; }
-		static std::vector<Value*> val(const RValue<Long>& v) { return {v.value}; }
-	};
-	template <typename T> struct PrintValue::Ty< Pointer<T> >
-	{
-		static std::string fmt(const RValue<Pointer<T>>& v) { return "%p"; }
-		static std::vector<Value*> val(const RValue<Pointer<T>>& v) { return {v.value}; }
-	};
-	template <typename T> struct PrintValue::Ty< Reference<T> >
-	{
-		static std::string fmt(const Reference<T>& v) { return PrintValue::Ty<T>::fmt(v); }
-		static std::vector<Value*> val(const Reference<T>& v) { return PrintValue::Ty<T>::val(v); }
-	};
-	template <typename T> struct PrintValue::Ty< RValue<T> >
-	{
-		static std::string fmt(const RValue<T>& v) { return PrintValue::Ty<T>::fmt(v); }
-		static std::vector<Value*> val(const RValue<T>& v) { return PrintValue::Ty<T>::val(v); }
-	};
-
-	// Printv emits a call to printf() using the function, file and line,
-	// message and optional values.
-	// See Printv below.
-	void Printv(const char* function, const char* file, int line, const char* msg, std::initializer_list<PrintValue> vals);
-
-	// Printv emits a call to printf() using the provided message and optional
-	// values.
-	// Printf replaces any bracketed indices in the message with string
-	// representations of the corresponding value in vals.
-	// For example:
-	//   Printv("{0} and {1}", "red", "green");
-	// Would print the string:
-	//   "red and green"
-	// Arguments can be indexed in any order.
-	// Invalid indices are not substituted.
-	inline void Printv(const char* msg, std::initializer_list<PrintValue> vals)
-	{
-		Printv(nullptr, nullptr, 0, msg, vals);
+		return values;
 	}
 
-	// Print is a wrapper over Printv that wraps the variadic arguments into an
-	// initializer_list before calling Printv.
-	template <typename ... ARGS>
-	void Print(const char* msg, const ARGS& ... vals) { Printv(msg, {vals...}); }
-
-	// Print is a wrapper over Printv that wraps the variadic arguments into an
-	// initializer_list before calling Printv.
-	template <typename ... ARGS>
-	void Print(const char* function, const char* file, int line, const char* msg, const ARGS& ... vals)
+	// fmt returns the comma-delimited list of printf format strings for
+	// every element in the provided list, all enclosed in square brackets.
+	template <typename T>
+	static std::string fmt(const T* list, int count)
 	{
-		Printv(function, file, line, msg, {vals...});
+		std::string out = "[";
+		for (int i = 0; i < count; i++)
+		{
+			if (i > 0) { out += ", "; }
+			out += fmt(list[i]);
+		}
+		return out + "]";
 	}
 
-	// RR_LOG is a macro that calls Print(), automatically populating the
-	// function, file and line parameters and appending a newline to the string.
-	//
-	// RR_LOG() is intended to be used for debugging JIT compiled code, and is
-	// not intended for production use.
-	#if defined(_WIN32)
-		#define RR_LOG(msg, ...) Print(__FUNCSIG__, __FILE__, static_cast<int>(__LINE__), msg "\n", ##__VA_ARGS__)
-	#else
-		#define RR_LOG(msg, ...) Print(__PRETTY_FUNCTION__, __FILE__, static_cast<int>(__LINE__), msg "\n", ##__VA_ARGS__)
-	#endif
+	static std::string addr(const void* ptr)
+	{
+		char text[32];  // scratch space for the "%p" rendering of ptr
+		snprintf(text, sizeof(text), "%p", ptr);
+		return std::string(text);
+	}
 
-	// Macro magic to perform variadic dispatch.
-	// See: https://renenyffenegger.ch/notes/development/languages/C-C-plus-plus/preprocessor/macros/__VA_ARGS__/count-arguments
-	// Note, this doesn't attempt to use the ##__VA_ARGS__ trick to handle 0
-	#define RR_MSVC_EXPAND_BUG(X) X // Helper macro to force expanding __VA_ARGS__ to satisfy MSVC compiler.
-	#define RR_GET_NTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, N, ...) N
-	#define RR_COUNT_ARGUMENTS(...) RR_MSVC_EXPAND_BUG(RR_GET_NTH_ARG(__VA_ARGS__, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
-	static_assert(1 == RR_COUNT_ARGUMENTS(a), "RR_COUNT_ARGUMENTS broken"); // Sanity checks.
-	static_assert(2 == RR_COUNT_ARGUMENTS(a, b), "RR_COUNT_ARGUMENTS broken");
-	static_assert(3 == RR_COUNT_ARGUMENTS(a, b, c), "RR_COUNT_ARGUMENTS broken");
+public:
+	const std::string format;
+	const std::vector<Value*> values;
 
-	// RR_WATCH_FMT(...) resolves to a string literal that lists all the
-	// arguments by name. This string can be passed to LOG() to print each of
-	// the arguments with their name and value.
-	//
-	// RR_WATCH_FMT(...) uses the RR_COUNT_ARGUMENTS helper macro to delegate to a
-	// corresponding RR_WATCH_FMT_n specialization macro below.
-	#define RR_WATCH_CONCAT(a, b) a ## b
-	#define RR_WATCH_CONCAT2(a, b) RR_WATCH_CONCAT(a, b)
-	#define RR_WATCH_FMT(...) RR_MSVC_EXPAND_BUG(RR_WATCH_CONCAT2(RR_WATCH_FMT_, RR_COUNT_ARGUMENTS(__VA_ARGS__))(__VA_ARGS__))
-	#define RR_WATCH_FMT_1(_1) "\n  " #_1 ": {0}"
-	#define RR_WATCH_FMT_2(_1, _2)                                             RR_WATCH_FMT_1(_1) "\n  " #_2 ": {1}"
-	#define RR_WATCH_FMT_3(_1, _2, _3)                                         RR_WATCH_FMT_2(_1, _2) "\n  " #_3 ": {2}"
-	#define RR_WATCH_FMT_4(_1, _2, _3, _4)                                     RR_WATCH_FMT_3(_1, _2, _3) "\n  " #_4 ": {3}"
-	#define RR_WATCH_FMT_5(_1, _2, _3, _4, _5)                                 RR_WATCH_FMT_4(_1, _2, _3, _4) "\n  " #_5 ": {4}"
-	#define RR_WATCH_FMT_6(_1, _2, _3, _4, _5, _6)                             RR_WATCH_FMT_5(_1, _2, _3, _4, _5) "\n  " #_6 ": {5}"
-	#define RR_WATCH_FMT_7(_1, _2, _3, _4, _5, _6, _7)                         RR_WATCH_FMT_6(_1, _2, _3, _4, _5, _6) "\n  " #_7 ": {6}"
-	#define RR_WATCH_FMT_8(_1, _2, _3, _4, _5, _6, _7, _8)                     RR_WATCH_FMT_7(_1, _2, _3, _4, _5, _6, _7) "\n  " #_8 ": {7}"
-	#define RR_WATCH_FMT_9(_1, _2, _3, _4, _5, _6, _7, _8, _9)                 RR_WATCH_FMT_8(_1, _2, _3, _4, _5, _6, _7, _8) "\n  " #_9 ": {8}"
-	#define RR_WATCH_FMT_10(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10)           RR_WATCH_FMT_9(_1, _2, _3, _4, _5, _6, _7, _8, _9) "\n  " #_10 ": {9}"
-	#define RR_WATCH_FMT_11(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11)      RR_WATCH_FMT_10(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10) "\n  " #_11 ": {10}"
-	#define RR_WATCH_FMT_12(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12) RR_WATCH_FMT_11(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11) "\n  " #_12 ": {11}"
+	// Constructs a PrintValue for the given value.
+	template <typename T>
+	PrintValue(const T& v) : format(fmt(v)), values(val(v)) {}
 
-	// RR_WATCH() is a helper that prints the name and value of all the supplied
-	// arguments.
-	// For example, if you had the Int and bool variables 'foo' and 'bar' that
-	// you want to print, you can simply write:
-	//    RR_WATCH(foo, bar)
-	// When this JIT compiled code is executed, it will print the string
-	// "foo: 1, bar: true" to stdout.
+	// Constructs a PrintValue for the given static array.
+	template <typename T, int N>
+	PrintValue(const T (&v)[N]) : format(fmt(&v[0], N)), values(val(&v[0], N)) {}
+
+	// Constructs a PrintValue for the given array starting at arr of length
+	// len.
+	template <typename T>
+	PrintValue(const T* arr, int len) : format(fmt(arr, len)), values(val(arr, len)) {}
+
+	// PrintValue constructors for plain-old-data values.
+	PrintValue(bool v) : format(v ? "true" : "false") {}
+	PrintValue(int8_t v) : format(std::to_string(v)) {}
+	PrintValue(uint8_t v) : format(std::to_string(v)) {}
+	PrintValue(int16_t v) : format(std::to_string(v)) {}
+	PrintValue(uint16_t v) : format(std::to_string(v)) {}
+	PrintValue(int32_t v) : format(std::to_string(v)) {}
+	PrintValue(uint32_t v) : format(std::to_string(v)) {}
+	PrintValue(int64_t v) : format(std::to_string(v)) {}
+	PrintValue(uint64_t v) : format(std::to_string(v)) {}
+	PrintValue(float v) : format(std::to_string(v)) {}
+	PrintValue(double v) : format(std::to_string(v)) {}
+
+	template <typename T>
+	PrintValue(const T* v) : format(addr(v)) {}
+
+	// vals is a helper to build composite value lists.
+	// vals returns the full, sequential list of printf argument values used
+	// to print all the provided variadic values.
+	// vals() is intended to be used by implementations of
+	// PrintValue::Ty<>::val() to help declare aggregate types.
+	// For example, if you were declaring a PrintValue::Ty<> specialization
+	// for a custom Mat4x4 matrix formed from four Vector4 values, you'd
+	// write:
 	//
-	// RR_WATCH() is intended to be used for debugging JIT compiled code, and
-	// is not intended for production use.
-	#define RR_WATCH(...) RR_LOG(RR_WATCH_FMT(__VA_ARGS__), __VA_ARGS__)
+	// namespace rr
+	// {
+	//		template <> struct PrintValue::Ty<Mat4x4>
+	//		{
+	//			static std::string fmt(const Mat4x4& v)
+	//			{
+	//				return	"[a: <%f, %f, %f, %f>,"
+	//				        " b: <%f, %f, %f, %f>,"
+	//				        " c: <%f, %f, %f, %f>,"
+	//				        " d: <%f, %f, %f, %f>]";
+	//			}
+	//			static std::vector<rr::Value*> val(const Mat4x4& v)
+	//			{
+	//				return PrintValue::vals(v.a, v.b, v.c, v.d);
+	//			}
+	//		};
+	//	}
+	template<typename ... ARGS>
+	static std::vector<Value*> vals(ARGS... v)
+	{
+		const std::vector< std::vector<Value*> > perArg = {val(v)...};  // one list per argument, in argument order
+		std::vector<Value*> flat;
+		for (auto it = perArg.begin(); it != perArg.end(); ++it)
+		{
+			flat.insert(flat.end(), it->begin(), it->end());
+		}
+		return flat;
+	}
+
+	// Returns the printf format specifier(s) for v by delegating to the
+	// PrintValue::Ty<T> specialization selected by v's type.
+	template <typename T>
+	static std::string fmt(const T& v) { return Ty<T>::fmt(v); }
+
+	// Returns the printf argument value(s) for v by delegating to the
+	// PrintValue::Ty<T> specialization selected by v's type.
+	template <typename T>
+	static std::vector<Value*> val(const T& v) { return Ty<T>::val(v); }
+};
+
+// PrintValue::Ty<T> specializations for basic types.
+template <> struct PrintValue::Ty<const char*>
+{
+	static std::string fmt(const char* v) { return "%s"; }  // v is unused; the format is always %s
+	static std::vector<Value*> val(const char* v);  // declaration only; no definition is visible here
+};
+template <> struct PrintValue::Ty<std::string>  // forwards both calls to Ty<const char*> via c_str()
+{
+	static std::string fmt(const std::string& v) { return PrintValue::Ty<const char*>::fmt(v.c_str()); }
+	static std::vector<Value*> val(const std::string& v) { return PrintValue::Ty<const char*>::val(v.c_str()); }
+};
+
+// PrintValue::Ty<T> specializations for standard Reactor types.
+template <> struct PrintValue::Ty<Bool>
+{
+	static std::string fmt(const RValue<Bool>& v) { return "%d"; }
+	static std::vector<Value*> val(const RValue<Bool>& v) { return {v.value}; }
+};
+template <> struct PrintValue::Ty<Byte>
+{
+	static std::string fmt(const RValue<Byte>& v) { return "%d"; }
+	static std::vector<Value*> val(const RValue<Byte>& v);
+};
+template <> struct PrintValue::Ty<Byte4>
+{
+	static std::string fmt(const RValue<Byte4>& v) { return "[%d, %d, %d, %d]"; }
+	static std::vector<Value*> val(const RValue<Byte4>& v);
+};
+template <> struct PrintValue::Ty<Int>
+{
+	static std::string fmt(const RValue<Int>& v) { return "%d"; }
+	static std::vector<Value*> val(const RValue<Int>& v);
+};
+template <> struct PrintValue::Ty<Int2>  // printf support for Int2 values
+{
+	static std::string fmt(const RValue<Int2>& v) { return "[%d, %d]"; }  // takes RValue<Int2>, matching val() below
+	static std::vector<Value*> val(const RValue<Int2>& v);  // declaration only; no definition is visible here
+};
+template <> struct PrintValue::Ty<Int4>
+{
+	static std::string fmt(const RValue<Int4>& v) { return "[%d, %d, %d, %d]"; }
+	static std::vector<Value*> val(const RValue<Int4>& v);
+};
+template <> struct PrintValue::Ty<UInt>
+{
+	static std::string fmt(const RValue<UInt>& v) { return "%u"; }
+	static std::vector<Value*> val(const RValue<UInt>& v);
+};
+template <> struct PrintValue::Ty<UInt2>  // printf support for UInt2 values
+{
+	static std::string fmt(const RValue<UInt2>& v) { return "[%u, %u]"; }  // takes RValue<UInt2>, matching val() below
+	static std::vector<Value*> val(const RValue<UInt2>& v);  // declaration only; no definition is visible here
+};
+template <> struct PrintValue::Ty<UInt4>
+{
+	static std::string fmt(const RValue<UInt4>& v) { return "[%u, %u, %u, %u]"; }
+	static std::vector<Value*> val(const RValue<UInt4>& v);
+};
+template <> struct PrintValue::Ty<Short>
+{
+	static std::string fmt(const RValue<Short>& v) { return "%d"; }
+	static std::vector<Value*> val(const RValue<Short>& v);
+};
+template <> struct PrintValue::Ty<Short4>
+{
+	static std::string fmt(const RValue<Short4>& v) { return "[%d, %d, %d, %d]"; }
+	static std::vector<Value*> val(const RValue<Short4>& v);
+};
+template <> struct PrintValue::Ty<UShort>
+{
+	static std::string fmt(const RValue<UShort>& v) { return "%u"; }
+	static std::vector<Value*> val(const RValue<UShort>& v);
+};
+template <> struct PrintValue::Ty<UShort4>
+{
+	static std::string fmt(const RValue<UShort4>& v) { return "[%u, %u, %u, %u]"; }
+	static std::vector<Value*> val(const RValue<UShort4>& v);
+};
+template <> struct PrintValue::Ty<Float>
+{
+	static std::string fmt(const RValue<Float>& v) { return "[%f]"; }  // NOTE(review): bracketed, unlike the other scalar formats here - confirm intended
+	static std::vector<Value*> val(const RValue<Float>& v);
+};
+template <> struct PrintValue::Ty<Float4>
+{
+	static std::string fmt(const RValue<Float4>& v) { return "[%f, %f, %f, %f]"; }
+	static std::vector<Value*> val(const RValue<Float4>& v);
+};
+template <> struct PrintValue::Ty<Long>
+{
+	static std::string fmt(const RValue<Long>& v) { return "%lld"; }
+	static std::vector<Value*> val(const RValue<Long>& v) { return {v.value}; }
+};
+template <typename T> struct PrintValue::Ty< Pointer<T> >
+{
+	static std::string fmt(const RValue<Pointer<T>>& v) { return "%p"; }
+	static std::vector<Value*> val(const RValue<Pointer<T>>& v) { return {v.value}; }
+};
+template <typename T> struct PrintValue::Ty< Reference<T> >  // a Reference<T> prints using the Ty<T> specialization
+{
+	static std::string fmt(const Reference<T>& v) { return PrintValue::Ty<T>::fmt(v); }
+	static std::vector<Value*> val(const Reference<T>& v) { return PrintValue::Ty<T>::val(v); }
+};
+template <typename T> struct PrintValue::Ty< RValue<T> >  // an RValue<T> prints using the Ty<T> specialization
+{
+	static std::string fmt(const RValue<T>& v) { return PrintValue::Ty<T>::fmt(v); }
+	static std::vector<Value*> val(const RValue<T>& v) { return PrintValue::Ty<T>::val(v); }
+};
+
+// Printv emits a call to printf() using the function, file and line,
+// message and optional values.
+// See Printv below.
+void Printv(const char* function, const char* file, int line, const char* msg, std::initializer_list<PrintValue> vals);
+
+// Printv emits a call to printf() using the provided message and optional
+// values.
+// Printv replaces any bracketed indices in the message with string
+// representations of the corresponding value in vals.
+// For example:
+//   Printv("{0} and {1}", "red", "green");
+// Would print the string:
+//   "red and green"
+// Arguments can be indexed in any order.
+// Invalid indices are not substituted.
+inline void Printv(const char* msg, std::initializer_list<PrintValue> vals)
+{
+	Printv(nullptr, nullptr, 0, msg, vals);  // no function, file or line is supplied by this overload
+}
+
+// Print is a wrapper over Printv that wraps the variadic arguments into an
+// initializer_list before calling Printv.
+template <typename ... ARGS>
+void Print(const char* msg, const ARGS& ... vals) { Printv(msg, {vals...}); }
+
+// This overload of Print additionally takes the function, file and line to
+// pass through to Printv, alongside the message and variadic values.
+template <typename ... ARGS>
+void Print(const char* function, const char* file, int line, const char* msg, const ARGS& ... vals)
+{
+	Printv(function, file, line, msg, {vals...});
+}
+
+// RR_LOG is a macro that calls Print(), automatically populating the
+// function, file and line parameters and appending a newline to the string.
+//
+// RR_LOG() is intended to be used for debugging JIT compiled code, and is
+// not intended for production use.
+#if defined(_WIN32)
+	#define RR_LOG(msg, ...) Print(__FUNCSIG__, __FILE__, static_cast<int>(__LINE__), msg "\n", ##__VA_ARGS__)
+#else
+	#define RR_LOG(msg, ...) Print(__PRETTY_FUNCTION__, __FILE__, static_cast<int>(__LINE__), msg "\n", ##__VA_ARGS__)
+#endif
+
+// Macro magic to perform variadic dispatch.
+// See: https://renenyffenegger.ch/notes/development/languages/C-C-plus-plus/preprocessor/macros/__VA_ARGS__/count-arguments
+// Note, this doesn't attempt to use the ##__VA_ARGS__ trick to handle 0 arguments.
+#define RR_MSVC_EXPAND_BUG(X) X // Helper macro to force expanding __VA_ARGS__ to satisfy MSVC compiler.
+#define RR_GET_NTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, N, ...) N
+#define RR_COUNT_ARGUMENTS(...) RR_MSVC_EXPAND_BUG(RR_GET_NTH_ARG(__VA_ARGS__, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
+static_assert(1 == RR_COUNT_ARGUMENTS(a), "RR_COUNT_ARGUMENTS broken"); // Sanity checks.
+static_assert(2 == RR_COUNT_ARGUMENTS(a, b), "RR_COUNT_ARGUMENTS broken");
+static_assert(3 == RR_COUNT_ARGUMENTS(a, b, c), "RR_COUNT_ARGUMENTS broken");
+
+// RR_WATCH_FMT(...) resolves to a string literal that lists all the
+// arguments by name. This string can be passed to RR_LOG() to print each of
+// the arguments with their name and value.
+//
+// RR_WATCH_FMT(...) uses the RR_COUNT_ARGUMENTS helper macro to delegate to a
+// corresponding RR_WATCH_FMT_n specialization macro below.
+#define RR_WATCH_CONCAT(a, b) a ## b
+#define RR_WATCH_CONCAT2(a, b) RR_WATCH_CONCAT(a, b)
+#define RR_WATCH_FMT(...) RR_MSVC_EXPAND_BUG(RR_WATCH_CONCAT2(RR_WATCH_FMT_, RR_COUNT_ARGUMENTS(__VA_ARGS__))(__VA_ARGS__))
+#define RR_WATCH_FMT_1(_1) "\n  " #_1 ": {0}"
+#define RR_WATCH_FMT_2(_1, _2)                                             RR_WATCH_FMT_1(_1) "\n  " #_2 ": {1}"
+#define RR_WATCH_FMT_3(_1, _2, _3)                                         RR_WATCH_FMT_2(_1, _2) "\n  " #_3 ": {2}"
+#define RR_WATCH_FMT_4(_1, _2, _3, _4)                                     RR_WATCH_FMT_3(_1, _2, _3) "\n  " #_4 ": {3}"
+#define RR_WATCH_FMT_5(_1, _2, _3, _4, _5)                                 RR_WATCH_FMT_4(_1, _2, _3, _4) "\n  " #_5 ": {4}"
+#define RR_WATCH_FMT_6(_1, _2, _3, _4, _5, _6)                             RR_WATCH_FMT_5(_1, _2, _3, _4, _5) "\n  " #_6 ": {5}"
+#define RR_WATCH_FMT_7(_1, _2, _3, _4, _5, _6, _7)                         RR_WATCH_FMT_6(_1, _2, _3, _4, _5, _6) "\n  " #_7 ": {6}"
+#define RR_WATCH_FMT_8(_1, _2, _3, _4, _5, _6, _7, _8)                     RR_WATCH_FMT_7(_1, _2, _3, _4, _5, _6, _7) "\n  " #_8 ": {7}"
+#define RR_WATCH_FMT_9(_1, _2, _3, _4, _5, _6, _7, _8, _9)                 RR_WATCH_FMT_8(_1, _2, _3, _4, _5, _6, _7, _8) "\n  " #_9 ": {8}"
+#define RR_WATCH_FMT_10(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10)           RR_WATCH_FMT_9(_1, _2, _3, _4, _5, _6, _7, _8, _9) "\n  " #_10 ": {9}"
+#define RR_WATCH_FMT_11(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11)      RR_WATCH_FMT_10(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10) "\n  " #_11 ": {10}"
+#define RR_WATCH_FMT_12(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12) RR_WATCH_FMT_11(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11) "\n  " #_12 ": {11}"
+
+// RR_WATCH() is a helper that prints the name and value of all the supplied
+// arguments.
+// For example, if you had the Int and bool variables 'foo' and 'bar' that
+// you want to print, you can simply write:
+//    RR_WATCH(foo, bar)
+// When this JIT compiled code is executed, it will print each argument on its
+// own line, e.g. "foo: 1" followed by "bar: true", to stdout.
+//
+// RR_WATCH() is intended to be used for debugging JIT compiled code, and
+// is not intended for production use.
+#define RR_WATCH(...) RR_LOG(RR_WATCH_FMT(__VA_ARGS__), __VA_ARGS__)
 
 }  // namespace rr
 

diff --git a/src/Reactor/Reactor.cpp b/src/Reactor/Reactor.cpp
index 79c0891..07e83c5 100644
--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp

@@ -23,1292 +23,1293 @@
 #define REACTOR_MATERIALIZE_LVALUES_ON_DEFINITION 0
 #endif
 
-namespace
+namespace {
+
+// Hand-rolled equivalent of std::remove_if (standard since C++98; C++20 only made it constexpr).
+template <class ForwardIterator, class UnaryPredicate>
+ForwardIterator remove_if(ForwardIterator first, ForwardIterator last,
+							UnaryPredicate pred)
 {
-	// Introduced in C++20.
-	template <class ForwardIterator, class UnaryPredicate>
-	ForwardIterator remove_if(ForwardIterator first, ForwardIterator last,
-								UnaryPredicate pred)
-	{
-		ForwardIterator result = first;
-		while (first!=last) {
-			if (!pred(*first)) {
-				*result = std::move(*first);
-				++result;
-			}
-			++first;
+	ForwardIterator result = first;
+	while (first!=last) {
+		if (!pred(*first)) {
+			*result = std::move(*first);
+			++result;
 		}
-		return result;
+		++first;
+	}
+	return result;
+}
+
+}  // anonymous namespace
+
+namespace rr {
+
+const Config::Edit Config::Edit::None = {};
+
+Config Config::Edit::apply(const Config &cfg) const
+{
+	if (this == &None) { return cfg; }
+
+	auto level = optLevelChanged ? optLevel : cfg.optimization.getLevel();
+	auto passes = cfg.optimization.getPasses();
+	apply(optPassEdits, passes);
+	return Config{ Optimization{level, passes} };
+}
+
+template <typename T>  // Applies |edits| to |list| in order: Add appends, Remove erases every equal item, Clear empties the list.
+void rr::Config::Edit::apply(const std::vector<std::pair<ListEdit, T>> & edits, std::vector<T>& list) const
+{
+	for (auto & edit : edits)
+	{
+		switch (edit.first)
+		{
+		case ListEdit::Add:
+			list.push_back(edit.second);
+			break;
+		case ListEdit::Remove:
+			list.erase(::remove_if(list.begin(), list.end(), [&](T item) { return item == edit.second; }), list.end());  // remove_if only compacts the kept items; erase() drops the stale tail.
+			break;
+		case ListEdit::Clear:
+			list.clear();
+			break;
+		}
 	}
 }
 
-namespace rr
+// Set of variables that do not have a stack location yet.
+std::unordered_set<Variable*> Variable::unmaterializedVariables;
+
+Variable::Variable(Type *type, int arraySize) : arraySize(arraySize), type(type)
 {
-	const Config::Edit Config::Edit::None = {};
+	#if REACTOR_MATERIALIZE_LVALUES_ON_DEFINITION
+		materialize();
+	#else
+		unmaterializedVariables.emplace(this);
+	#endif
+}
 
-	Config Config::Edit::apply(const Config &cfg) const
-	{
-		if (this == &None) { return cfg; }
+Variable::~Variable()
+{
+	unmaterializedVariables.erase(this);
+}
 
-		auto level = optLevelChanged ? optLevel : cfg.optimization.getLevel();
-		auto passes = cfg.optimization.getPasses();
-		apply(optPassEdits, passes);
-		return Config{ Optimization{level, passes} };
-	}
-
-	template <typename T>
-	void rr::Config::Edit::apply(const std::vector<std::pair<ListEdit, T>> & edits, std::vector<T>& list) const
+void Variable::materializeAll()
+{
+	for(auto *var : unmaterializedVariables)
 	{
-		for (auto & edit : edits)
-		{
-			switch (edit.first)
-			{
-			case ListEdit::Add:
-				list.push_back(edit.second);
-				break;
-			case ListEdit::Remove:
-				::remove_if(list.begin(), list.end(), [&](T item) { return item == edit.second; });
-				break;
-			case ListEdit::Clear:
-				list.clear();
-				break;
-			}
-		}
+		var->materialize();
 	}
 
-	// Set of variables that do not have a stack location yet.
-	std::unordered_set<Variable*> Variable::unmaterializedVariables;
+	unmaterializedVariables.clear();
+}
 
-	Variable::Variable(Type *type, int arraySize) : arraySize(arraySize), type(type)
-	{
-		#if REACTOR_MATERIALIZE_LVALUES_ON_DEFINITION
-			materialize();
-		#else
-			unmaterializedVariables.emplace(this);
-		#endif
-	}
-
-	Variable::~Variable()
-	{
-		unmaterializedVariables.erase(this);
-	}
+void Variable::killUnmaterialized()
+{
+	unmaterializedVariables.clear();
+}
 
-	void Variable::materializeAll()
+// NOTE: Only 12 bits out of 16 of the |select| value are used.
+// More specifically, the value should look like:
+//
+//    msb               lsb
+//     v                 v
+//    [.xxx|.yyy|.zzz|.www]    where '.' means an ignored bit
+//
+// This format makes it easy to write calls with hexadecimal select values,
+// since each hex digit is a separate swizzle index.
+//
+// For example:
+//      createBlend4( [a,b,c,d], [e,f,g,h], 0x0123 ) -> [a,b,c,d]
+//      createBlend4( [a,b,c,d], [e,f,g,h], 0x4567 ) -> [e,f,g,h]
+//      createBlend4( [a,b,c,d], [e,f,g,h], 0x4012 ) -> [e,a,b,c]
+//
+static Value *createBlend4(Value *lhs, Value *rhs, uint16_t select)
+{
+	int swizzle[4] =
 	{
-		for(auto *var : unmaterializedVariables)
-		{
-			var->materialize();
-		}
+		(select >> 12) & 0x07,
+		(select >> 8)  & 0x07,
+		(select >> 4)  & 0x07,
+		(select >> 0)  & 0x07,
+	};
 
-		unmaterializedVariables.clear();
-	}
+	return Nucleus::createShuffleVector(lhs, rhs, swizzle);
+}
 
-	void Variable::killUnmaterialized()
+// NOTE: Only 8 bits out of 16 of the |select| value are used.
+// More specifically, the value should look like:
+//
+//    msb               lsb
+//     v                 v
+//    [..xx|..yy|..zz|..ww]    where '.' means an ignored bit
+//
+// This format makes it easy to write calls with hexadecimal select values,
+// since each hex digit is a separate swizzle index.
+//
+// For example:
+//      createSwizzle4( [a,b,c,d], 0x0123 ) -> [a,b,c,d]
+//      createSwizzle4( [a,b,c,d], 0x0033 ) -> [a,a,d,d]
+//
+static Value *createSwizzle4(Value *val, uint16_t select)
+{
+	int swizzle[4] =
 	{
-		unmaterializedVariables.clear();
-	}
-
-	// NOTE: Only 12 bits out of 16 of the |select| value are used.
-	// More specifically, the value should look like:
-	//
-	//    msb               lsb
-	//     v                 v
-	//    [.xxx|.yyy|.zzz|.www]    where '.' means an ignored bit
-	//
-	// This format makes it easy to write calls with hexadecimal select values,
-	// since each hex digit is a separate swizzle index.
-	//
-	// For example:
-	//      createBlend4( [a,b,c,d], [e,f,g,h], 0x0123 ) -> [a,b,c,d]
-	//      createBlend4( [a,b,c,d], [e,f,g,h], 0x4567 ) -> [e,f,g,h]
-	//      createBlend4( [a,b,c,d], [e,f,g,h], 0x4012 ) -> [e,a,b,c]
-	//
-	static Value *createBlend4(Value *lhs, Value *rhs, uint16_t select)
-	{
-		int swizzle[4] =
-		{
-			(select >> 12) & 0x07,
-			(select >> 8)  & 0x07,
-			(select >> 4)  & 0x07,
-			(select >> 0)  & 0x07,
-		};
+		(select >> 12) & 0x03,
+		(select >> 8)  & 0x03,
+		(select >> 4)  & 0x03,
+		(select >> 0)  & 0x03,
+	};
 
-		return Nucleus::createShuffleVector(lhs, rhs, swizzle);
-	}
+	return Nucleus::createShuffleVector(val, val, swizzle);
+}
 
-	// NOTE: Only 8 bits out of 16 of the |select| value are used.
-	// More specifically, the value should look like:
-	//
-	//    msb               lsb
-	//     v                 v
-	//    [..xx|..yy|..zz|..ww]    where '.' means an ignored bit
-	//
-	// This format makes it easy to write calls with hexadecimal select values,
-	// since each hex digit is a separate swizzle index.
-	//
-	// For example:
-	//      createSwizzle4( [a,b,c,d], 0x0123 ) -> [a,b,c,d]
-	//      createSwizzle4( [a,b,c,d], 0x0033 ) -> [a,a,d,d]
-	//
-	static Value *createSwizzle4(Value *val, uint16_t select)
-	{
-		int swizzle[4] =
-		{
-			(select >> 12) & 0x03,
-			(select >> 8)  & 0x03,
-			(select >> 4)  & 0x03,
-			(select >> 0)  & 0x03,
-		};
+static Value *createMask4(Value *lhs, Value *rhs, uint16_t select)
+{
+	bool mask[4] = {false, false, false, false};
 
-		return Nucleus::createShuffleVector(val, val, swizzle);
-	}
+	mask[(select >> 12) & 0x03] = true;
+	mask[(select >> 8)  & 0x03] = true;
+	mask[(select >> 4)  & 0x03] = true;
+	mask[(select >> 0)  & 0x03] = true;
 
-	static Value *createMask4(Value *lhs, Value *rhs, uint16_t select)
+	int swizzle[4] =
 	{
-		bool mask[4] = {false, false, false, false};
+		mask[0] ? 4 : 0,
+		mask[1] ? 5 : 1,
+		mask[2] ? 6 : 2,
+		mask[3] ? 7 : 3,
+	};
 
-		mask[(select >> 12) & 0x03] = true;
-		mask[(select >> 8)  & 0x03] = true;
-		mask[(select >> 4)  & 0x03] = true;
-		mask[(select >> 0)  & 0x03] = true;
+	return Nucleus::createShuffleVector(lhs, rhs, swizzle);
+}
 
-		int swizzle[4] =
-		{
-			mask[0] ? 4 : 0,
-			mask[1] ? 5 : 1,
-			mask[2] ? 6 : 2,
-			mask[3] ? 7 : 3,
-		};
+Bool::Bool(Argument<Bool> argument)
+{
+	storeValue(argument.value);
+}
 
-		return Nucleus::createShuffleVector(lhs, rhs, swizzle);
-	}
+Bool::Bool(bool x)
+{
+	storeValue(Nucleus::createConstantBool(x));
+}
 
-	Bool::Bool(Argument<Bool> argument)
-	{
-		storeValue(argument.value);
-	}
+Bool::Bool(RValue<Bool> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Bool::Bool(bool x)
-	{
-		storeValue(Nucleus::createConstantBool(x));
-	}
+Bool::Bool(const Bool &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Bool::Bool(RValue<Bool> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Bool::Bool(const Reference<Bool> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Bool::Bool(const Bool &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+RValue<Bool> Bool::operator=(RValue<Bool> rhs)
+{
+	storeValue(rhs.value);
 
-	Bool::Bool(const Reference<Bool> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+	return rhs;
+}
 
-	RValue<Bool> Bool::operator=(RValue<Bool> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Bool> Bool::operator=(const Bool &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return rhs;
-	}
+	return RValue<Bool>(value);
+}
 
-	RValue<Bool> Bool::operator=(const Bool &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Bool> Bool::operator=(const Reference<Bool> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Bool>(value);
-	}
+	return RValue<Bool>(value);
+}
 
-	RValue<Bool> Bool::operator=(const Reference<Bool> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Bool> operator!(RValue<Bool> val)
+{
+	return RValue<Bool>(Nucleus::createNot(val.value));
+}
 
-		return RValue<Bool>(value);
-	}
+RValue<Bool> operator&&(RValue<Bool> lhs, RValue<Bool> rhs)
+{
+	return RValue<Bool>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator!(RValue<Bool> val)
-	{
-		return RValue<Bool>(Nucleus::createNot(val.value));
-	}
+RValue<Bool> operator||(RValue<Bool> lhs, RValue<Bool> rhs)
+{
+	return RValue<Bool>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator&&(RValue<Bool> lhs, RValue<Bool> rhs)
-	{
-		return RValue<Bool>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Bool> operator!=(RValue<Bool> lhs, RValue<Bool> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator||(RValue<Bool> lhs, RValue<Bool> rhs)
-	{
-		return RValue<Bool>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<Bool> operator==(RValue<Bool> lhs, RValue<Bool> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator!=(RValue<Bool> lhs, RValue<Bool> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
-	}
+Byte::Byte(Argument<Byte> argument)
+{
+	storeValue(argument.value);
+}
 
-	RValue<Bool> operator==(RValue<Bool> lhs, RValue<Bool> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
-	}
+Byte::Byte(RValue<Int> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
 
-	Byte::Byte(Argument<Byte> argument)
-	{
-		storeValue(argument.value);
-	}
+	storeValue(integer);
+}
 
-	Byte::Byte(RValue<Int> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
+Byte::Byte(RValue<UInt> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Byte::Byte(RValue<UInt> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
+Byte::Byte(RValue<UShort> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Byte::Byte(RValue<UShort> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
+Byte::Byte(int x)
+{
+	storeValue(Nucleus::createConstantByte((unsigned char)x));
+}
 
-		storeValue(integer);
-	}
+Byte::Byte(unsigned char x)
+{
+	storeValue(Nucleus::createConstantByte(x));
+}
 
-	Byte::Byte(int x)
-	{
-		storeValue(Nucleus::createConstantByte((unsigned char)x));
-	}
+Byte::Byte(RValue<Byte> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Byte::Byte(unsigned char x)
-	{
-		storeValue(Nucleus::createConstantByte(x));
-	}
+Byte::Byte(const Byte &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Byte::Byte(RValue<Byte> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Byte::Byte(const Reference<Byte> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Byte::Byte(const Byte &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+RValue<Byte> Byte::operator=(RValue<Byte> rhs)
+{
+	storeValue(rhs.value);
 
-	Byte::Byte(const Reference<Byte> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+	return rhs;
+}
 
-	RValue<Byte> Byte::operator=(RValue<Byte> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Byte> Byte::operator=(const Byte &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return rhs;
-	}
+	return RValue<Byte>(value);
+}
 
-	RValue<Byte> Byte::operator=(const Byte &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Byte> Byte::operator=(const Reference<Byte> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Byte>(value);
-	}
+	return RValue<Byte>(value);
+}
 
-	RValue<Byte> Byte::operator=(const Reference<Byte> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Byte> operator+(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-		return RValue<Byte>(value);
-	}
+RValue<Byte> operator-(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator+(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Byte> operator*(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator-(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<Byte> operator/(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createUDiv(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator*(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<Byte> operator%(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createURem(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator/(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createUDiv(lhs.value, rhs.value));
-	}
+RValue<Byte> operator&(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator%(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createURem(lhs.value, rhs.value));
-	}
+RValue<Byte> operator|(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator&(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Byte> operator^(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator|(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<Byte> operator<<(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createShl(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator^(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<Byte> operator>>(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createLShr(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator<<(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createShl(lhs.value, rhs.value));
-	}
+RValue<Byte> operator+=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<Byte> operator>>(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createLShr(lhs.value, rhs.value));
-	}
+RValue<Byte> operator-=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<Byte> operator+=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<Byte> operator*=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
-	RValue<Byte> operator-=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<Byte> operator/=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs / rhs;
+}
 
-	RValue<Byte> operator*=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<Byte> operator%=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs % rhs;
+}
 
-	RValue<Byte> operator/=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs / rhs;
-	}
+RValue<Byte> operator&=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<Byte> operator%=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs % rhs;
-	}
+RValue<Byte> operator|=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<Byte> operator&=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<Byte> operator^=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<Byte> operator|=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<Byte> operator<<=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<Byte> operator^=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<Byte> operator>>=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
-	RValue<Byte> operator<<=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<Byte> operator+(RValue<Byte> val)
+{
+	return val;
+}
 
-	RValue<Byte> operator>>=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<Byte> operator-(RValue<Byte> val)
+{
+	return RValue<Byte>(Nucleus::createNeg(val.value));
+}
 
-	RValue<Byte> operator+(RValue<Byte> val)
-	{
-		return val;
-	}
+RValue<Byte> operator~(RValue<Byte> val)
+{
+	return RValue<Byte>(Nucleus::createNot(val.value));
+}
 
-	RValue<Byte> operator-(RValue<Byte> val)
-	{
-		return RValue<Byte>(Nucleus::createNeg(val.value));
-	}
+RValue<Byte> operator++(Byte &val, int)   // Post-increment
+{
+	RValue<Byte> res = val;
 
-	RValue<Byte> operator~(RValue<Byte> val)
-	{
-		return RValue<Byte>(Nucleus::createNot(val.value));
-	}
+	Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantByte((unsigned char)1));
+	val.storeValue(inc);
 
-	RValue<Byte> operator++(Byte &val, int)   // Post-increment
-	{
-		RValue<Byte> res = val;
+	return res;
+}
 
-		Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantByte((unsigned char)1));
-		val.storeValue(inc);
+const Byte &operator++(Byte &val)   // Pre-increment
+{
+	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantByte((unsigned char)1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return val;
+}
 
-	const Byte &operator++(Byte &val)   // Pre-increment
-	{
-		Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantByte((unsigned char)1));
-		val.storeValue(inc);
+RValue<Byte> operator--(Byte &val, int)   // Post-decrement
+{
+	RValue<Byte> res = val;
 
-		return val;
-	}
+	Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantByte((unsigned char)1));
+	val.storeValue(inc);
 
-	RValue<Byte> operator--(Byte &val, int)   // Post-decrement
-	{
-		RValue<Byte> res = val;
+	return res;
+}
 
-		Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantByte((unsigned char)1));
-		val.storeValue(inc);
+const Byte &operator--(Byte &val)   // Pre-decrement
+{
+	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantByte((unsigned char)1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return val;
+}
 
-	const Byte &operator--(Byte &val)   // Pre-decrement
-	{
-		Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantByte((unsigned char)1));
-		val.storeValue(inc);
+RValue<Bool> operator<(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
+}
 
-		return val;
-	}
+RValue<Bool> operator<=(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator<(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator<=(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>=(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator!=(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>=(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator==(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator!=(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
-	}
+SByte::SByte(Argument<SByte> argument)
+{
+	storeValue(argument.value);
+}
 
-	RValue<Bool> operator==(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
-	}
+SByte::SByte(RValue<Int> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
 
-	SByte::SByte(Argument<SByte> argument)
-	{
-		storeValue(argument.value);
-	}
+	storeValue(integer);
+}
 
-	SByte::SByte(RValue<Int> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
+SByte::SByte(RValue<Short> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	SByte::SByte(RValue<Short> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
+SByte::SByte(signed char x)
+{
+	storeValue(Nucleus::createConstantByte(x));
+}
 
-		storeValue(integer);
-	}
+SByte::SByte(RValue<SByte> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	SByte::SByte(signed char x)
-	{
-		storeValue(Nucleus::createConstantByte(x));
-	}
+SByte::SByte(const SByte &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	SByte::SByte(RValue<SByte> rhs)
-	{
-		storeValue(rhs.value);
-	}
+SByte::SByte(const Reference<SByte> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	SByte::SByte(const SByte &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+RValue<SByte> SByte::operator=(RValue<SByte> rhs)
+{
+	storeValue(rhs.value);
 
-	SByte::SByte(const Reference<SByte> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+	return rhs;
+}
 
-	RValue<SByte> SByte::operator=(RValue<SByte> rhs)
-	{
-		storeValue(rhs.value);
+RValue<SByte> SByte::operator=(const SByte &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return rhs;
-	}
+	return RValue<SByte>(value);
+}
 
-	RValue<SByte> SByte::operator=(const SByte &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<SByte> SByte::operator=(const Reference<SByte> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<SByte>(value);
-	}
+	return RValue<SByte>(value);
+}
 
-	RValue<SByte> SByte::operator=(const Reference<SByte> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<SByte> operator+(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-		return RValue<SByte>(value);
-	}
+RValue<SByte> operator-(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator+(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<SByte> operator*(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator-(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<SByte> operator/(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createSDiv(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator*(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<SByte> operator%(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createSRem(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator/(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createSDiv(lhs.value, rhs.value));
-	}
+RValue<SByte> operator&(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator%(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createSRem(lhs.value, rhs.value));
-	}
+RValue<SByte> operator|(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator&(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<SByte> operator^(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator|(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<SByte> operator<<(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createShl(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator^(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<SByte> operator>>(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createAShr(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator<<(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createShl(lhs.value, rhs.value));
-	}
+RValue<SByte> operator+=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<SByte> operator>>(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createAShr(lhs.value, rhs.value));
-	}
+RValue<SByte> operator-=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<SByte> operator+=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<SByte> operator*=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
-	RValue<SByte> operator-=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<SByte> operator/=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs / rhs;
+}
 
-	RValue<SByte> operator*=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<SByte> operator%=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs % rhs;
+}
 
-	RValue<SByte> operator/=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs / rhs;
-	}
+RValue<SByte> operator&=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<SByte> operator%=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs % rhs;
-	}
+RValue<SByte> operator|=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<SByte> operator&=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<SByte> operator^=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<SByte> operator|=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<SByte> operator<<=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<SByte> operator^=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<SByte> operator>>=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
-	RValue<SByte> operator<<=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<SByte> operator+(RValue<SByte> val)
+{
+	return val;
+}
 
-	RValue<SByte> operator>>=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
-
-	RValue<SByte> operator+(RValue<SByte> val)
-	{
-		return val;
-	}
-
-	RValue<SByte> operator-(RValue<SByte> val)
-	{
-		return RValue<SByte>(Nucleus::createNeg(val.value));
-	}
+RValue<SByte> operator-(RValue<SByte> val)
+{
+	return RValue<SByte>(Nucleus::createNeg(val.value));
+}
 
-	RValue<SByte> operator~(RValue<SByte> val)
-	{
-		return RValue<SByte>(Nucleus::createNot(val.value));
-	}
+RValue<SByte> operator~(RValue<SByte> val)
+{
+	return RValue<SByte>(Nucleus::createNot(val.value));
+}
 
-	RValue<SByte> operator++(SByte &val, int)   // Post-increment
-	{
-		RValue<SByte> res = val;
+RValue<SByte> operator++(SByte &val, int)   // Post-increment
+{
+	RValue<SByte> res = val;
 
-		Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantByte((signed char)1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantByte((signed char)1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const SByte &operator++(SByte &val)   // Pre-increment
-	{
-		Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantByte((signed char)1));
-		val.storeValue(inc);
+const SByte &operator++(SByte &val)   // Pre-increment
+{
+	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantByte((signed char)1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<SByte> operator--(SByte &val, int)   // Post-decrement
-	{
-		RValue<SByte> res = val;
+RValue<SByte> operator--(SByte &val, int)   // Post-decrement
+{
+	RValue<SByte> res = val;
 
-		Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantByte((signed char)1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantByte((signed char)1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const SByte &operator--(SByte &val)   // Pre-decrement
-	{
-		Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantByte((signed char)1));
-		val.storeValue(inc);
+const SByte &operator--(SByte &val)   // Pre-decrement
+{
+	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantByte((signed char)1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<Bool> operator<(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator<=(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<=(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>=(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>=(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator!=(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator!=(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator==(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
-	}
+RValue<Bool> operator==(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
+}
 
-	Short::Short(Argument<Short> argument)
-	{
-		storeValue(argument.value);
-	}
+Short::Short(Argument<Short> argument)
+{
+	storeValue(argument.value);
+}
 
-	Short::Short(RValue<Int> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, Short::getType());
+Short::Short(RValue<Int> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, Short::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Short::Short(short x)
-	{
-		storeValue(Nucleus::createConstantShort(x));
-	}
+Short::Short(short x)
+{
+	storeValue(Nucleus::createConstantShort(x));
+}
 
-	Short::Short(RValue<Short> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Short::Short(RValue<Short> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Short::Short(const Short &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Short::Short(const Short &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Short::Short(const Reference<Short> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Short::Short(const Reference<Short> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<Short> Short::operator=(RValue<Short> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Short> Short::operator=(RValue<Short> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Short> Short::operator=(const Short &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Short> Short::operator=(const Short &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Short>(value);
-	}
+	return RValue<Short>(value);
+}
 
-	RValue<Short> Short::operator=(const Reference<Short> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Short> Short::operator=(const Reference<Short> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Short>(value);
-	}
+	return RValue<Short>(value);
+}
 
-	RValue<Short> operator+(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Short> operator+(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator-(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<Short> operator-(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator*(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<Short> operator*(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator/(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createSDiv(lhs.value, rhs.value));
-	}
+RValue<Short> operator/(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createSDiv(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator%(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createSRem(lhs.value, rhs.value));
-	}
+RValue<Short> operator%(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createSRem(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator&(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Short> operator&(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator|(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<Short> operator|(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator^(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<Short> operator^(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator<<(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createShl(lhs.value, rhs.value));
-	}
+RValue<Short> operator<<(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createShl(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator>>(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createAShr(lhs.value, rhs.value));
-	}
+RValue<Short> operator>>(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createAShr(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator+=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<Short> operator+=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<Short> operator-=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<Short> operator-=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<Short> operator*=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<Short> operator*=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
-	RValue<Short> operator/=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs / rhs;
-	}
+RValue<Short> operator/=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs / rhs;
+}
 
-	RValue<Short> operator%=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs % rhs;
-	}
+RValue<Short> operator%=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs % rhs;
+}
 
-	RValue<Short> operator&=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<Short> operator&=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<Short> operator|=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<Short> operator|=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<Short> operator^=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<Short> operator^=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<Short> operator<<=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<Short> operator<<=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<Short> operator>>=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<Short> operator>>=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
-	RValue<Short> operator+(RValue<Short> val)
-	{
-		return val;
-	}
+RValue<Short> operator+(RValue<Short> val)
+{
+	return val;
+}
 
-	RValue<Short> operator-(RValue<Short> val)
-	{
-		return RValue<Short>(Nucleus::createNeg(val.value));
-	}
+RValue<Short> operator-(RValue<Short> val)
+{
+	return RValue<Short>(Nucleus::createNeg(val.value));
+}
 
-	RValue<Short> operator~(RValue<Short> val)
-	{
-		return RValue<Short>(Nucleus::createNot(val.value));
-	}
+RValue<Short> operator~(RValue<Short> val)
+{
+	return RValue<Short>(Nucleus::createNot(val.value));
+}
 
-	RValue<Short> operator++(Short &val, int)   // Post-increment
-	{
-		RValue<Short> res = val;
+RValue<Short> operator++(Short &val, int)   // Post-increment
+{
+	RValue<Short> res = val;
 
-		Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantShort((short)1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantShort((short)1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const Short &operator++(Short &val)   // Pre-increment
-	{
-		Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantShort((short)1));
-		val.storeValue(inc);
+const Short &operator++(Short &val)   // Pre-increment
+{
+	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantShort((short)1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<Short> operator--(Short &val, int)   // Post-decrement
-	{
-		RValue<Short> res = val;
+RValue<Short> operator--(Short &val, int)   // Post-decrement
+{
+	RValue<Short> res = val;
 
-		Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantShort((short)1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantShort((short)1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const Short &operator--(Short &val)   // Pre-decrement
-	{
-		Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantShort((short)1));
-		val.storeValue(inc);
+const Short &operator--(Short &val)   // Pre-decrement
+{
+	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantShort((short)1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<Bool> operator<(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator<=(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<=(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>=(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>=(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator!=(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator!=(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator==(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
-	}
+RValue<Bool> operator==(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
+}
 
-	UShort::UShort(Argument<UShort> argument)
-	{
-		storeValue(argument.value);
-	}
+UShort::UShort(Argument<UShort> argument)
+{
+	storeValue(argument.value);
+}
 
-	UShort::UShort(RValue<UInt> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
+UShort::UShort(RValue<UInt> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	UShort::UShort(RValue<Int> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
+UShort::UShort(RValue<Int> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	UShort::UShort(unsigned short x)
-	{
-		storeValue(Nucleus::createConstantShort(x));
-	}
+UShort::UShort(unsigned short x)
+{
+	storeValue(Nucleus::createConstantShort(x));
+}
 
-	UShort::UShort(RValue<UShort> rhs)
-	{
-		storeValue(rhs.value);
-	}
+UShort::UShort(RValue<UShort> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	UShort::UShort(const UShort &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UShort::UShort(const UShort &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UShort::UShort(const Reference<UShort> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UShort::UShort(const Reference<UShort> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<UShort> UShort::operator=(RValue<UShort> rhs)
-	{
-		storeValue(rhs.value);
+RValue<UShort> UShort::operator=(RValue<UShort> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<UShort> UShort::operator=(const UShort &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UShort> UShort::operator=(const UShort &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UShort>(value);
-	}
+	return RValue<UShort>(value);
+}
 
-	RValue<UShort> UShort::operator=(const Reference<UShort> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UShort> UShort::operator=(const Reference<UShort> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UShort>(value);
-	}
+	return RValue<UShort>(value);
+}
 
-	RValue<UShort> operator+(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<UShort> operator+(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator-(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<UShort> operator-(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator*(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<UShort> operator*(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator/(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createUDiv(lhs.value, rhs.value));
-	}
+RValue<UShort> operator/(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createUDiv(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator%(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createURem(lhs.value, rhs.value));
-	}
+RValue<UShort> operator%(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createURem(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator&(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<UShort> operator&(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator|(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<UShort> operator|(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator^(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<UShort> operator^(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator<<(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createShl(lhs.value, rhs.value));
-	}
+RValue<UShort> operator<<(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createShl(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator>>(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createLShr(lhs.value, rhs.value));
-	}
+RValue<UShort> operator>>(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createLShr(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator+=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<UShort> operator+=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<UShort> operator-=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<UShort> operator-=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<UShort> operator*=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<UShort> operator*=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
-	RValue<UShort> operator/=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs / rhs;
-	}
+RValue<UShort> operator/=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs / rhs;
+}
 
-	RValue<UShort> operator%=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs % rhs;
-	}
+RValue<UShort> operator%=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs % rhs;
+}
 
-	RValue<UShort> operator&=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<UShort> operator&=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<UShort> operator|=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<UShort> operator|=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<UShort> operator^=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<UShort> operator^=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<UShort> operator<<=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<UShort> operator<<=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<UShort> operator>>=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<UShort> operator>>=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
-	RValue<UShort> operator+(RValue<UShort> val)
-	{
-		return val;
-	}
+RValue<UShort> operator+(RValue<UShort> val)
+{
+	return val;
+}
 
-	RValue<UShort> operator-(RValue<UShort> val)
-	{
-		return RValue<UShort>(Nucleus::createNeg(val.value));
-	}
+RValue<UShort> operator-(RValue<UShort> val)
+{
+	return RValue<UShort>(Nucleus::createNeg(val.value));
+}
 
-	RValue<UShort> operator~(RValue<UShort> val)
-	{
-		return RValue<UShort>(Nucleus::createNot(val.value));
-	}
+RValue<UShort> operator~(RValue<UShort> val)
+{
+	return RValue<UShort>(Nucleus::createNot(val.value));
+}
 
-	RValue<UShort> operator++(UShort &val, int)   // Post-increment
-	{
-		RValue<UShort> res = val;
+RValue<UShort> operator++(UShort &val, int)   // Post-increment
+{
+	RValue<UShort> res = val;
 
-		Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantShort((unsigned short)1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantShort((unsigned short)1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const UShort &operator++(UShort &val)   // Pre-increment
-	{
-		Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantShort((unsigned short)1));
-		val.storeValue(inc);
+const UShort &operator++(UShort &val)   // Pre-increment
+{
+	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantShort((unsigned short)1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<UShort> operator--(UShort &val, int)   // Post-decrement
-	{
-		RValue<UShort> res = val;
+RValue<UShort> operator--(UShort &val, int)   // Post-decrement
+{
+	RValue<UShort> res = val;
 
-		Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantShort((unsigned short)1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantShort((unsigned short)1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const UShort &operator--(UShort &val)   // Pre-decrement
-	{
-		Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantShort((unsigned short)1));
-		val.storeValue(inc);
+const UShort &operator--(UShort &val)   // Pre-decrement
+{
+	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantShort((unsigned short)1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<Bool> operator<(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator<=(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<=(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>=(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>=(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator!=(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator!=(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator==(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
-	}
+RValue<Bool> operator==(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
+}
 
-	Byte4::Byte4(RValue<Byte8> cast)
-	{
-		storeValue(Nucleus::createBitCast(cast.value, getType()));
-	}
+Byte4::Byte4(RValue<Byte8> cast)
+{
+	storeValue(Nucleus::createBitCast(cast.value, getType()));
+}
 
-	Byte4::Byte4(const Reference<Byte4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Byte4::Byte4(const Reference<Byte4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Byte8::Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
-	{
-		int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+Byte8::Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
+{
+	int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	Byte8::Byte8(RValue<Byte8> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Byte8::Byte8(RValue<Byte8> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Byte8::Byte8(const Byte8 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Byte8::Byte8(const Byte8 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Byte8::Byte8(const Reference<Byte8> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Byte8::Byte8(const Reference<Byte8> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<Byte8> Byte8::operator=(RValue<Byte8> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Byte8> Byte8::operator=(RValue<Byte8> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Byte8> Byte8::operator=(const Byte8 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Byte8> Byte8::operator=(const Byte8 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Byte8>(value);
-	}
+	return RValue<Byte8>(value);
+}
 
-	RValue<Byte8> Byte8::operator=(const Reference<Byte8> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Byte8> Byte8::operator=(const Reference<Byte8> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Byte8>(value);
-	}
+	return RValue<Byte8>(value);
+}
 
-	RValue<Byte8> operator+(RValue<Byte8> lhs, RValue<Byte8> rhs)
-	{
-		return RValue<Byte8>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Byte8> operator+(RValue<Byte8> lhs, RValue<Byte8> rhs)
+{
+	return RValue<Byte8>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<Byte8> operator-(RValue<Byte8> lhs, RValue<Byte8> rhs)
-	{
-		return RValue<Byte8>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<Byte8> operator-(RValue<Byte8> lhs, RValue<Byte8> rhs)
+{
+	return RValue<Byte8>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
 //	RValue<Byte8> operator*(RValue<Byte8> lhs, RValue<Byte8> rhs)
 //	{
@@ -1325,20 +1326,20 @@
 //		return RValue<Byte8>(Nucleus::createURem(lhs.value, rhs.value));
 //	}
 
-	RValue<Byte8> operator&(RValue<Byte8> lhs, RValue<Byte8> rhs)
-	{
-		return RValue<Byte8>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Byte8> operator&(RValue<Byte8> lhs, RValue<Byte8> rhs)
+{
+	return RValue<Byte8>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Byte8> operator|(RValue<Byte8> lhs, RValue<Byte8> rhs)
-	{
-		return RValue<Byte8>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<Byte8> operator|(RValue<Byte8> lhs, RValue<Byte8> rhs)
+{
+	return RValue<Byte8>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<Byte8> operator^(RValue<Byte8> lhs, RValue<Byte8> rhs)
-	{
-		return RValue<Byte8>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<Byte8> operator^(RValue<Byte8> lhs, RValue<Byte8> rhs)
+{
+	return RValue<Byte8>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
 //	RValue<Byte8> operator<<(RValue<Byte8> lhs, unsigned char rhs)
 //	{
@@ -1350,15 +1351,15 @@
 //		return RValue<Byte8>(Nucleus::createLShr(lhs.value, rhs.value));
 //	}
 
-	RValue<Byte8> operator+=(Byte8 &lhs, RValue<Byte8> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<Byte8> operator+=(Byte8 &lhs, RValue<Byte8> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<Byte8> operator-=(Byte8 &lhs, RValue<Byte8> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<Byte8> operator-=(Byte8 &lhs, RValue<Byte8> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
 //	RValue<Byte8> operator*=(Byte8 &lhs, RValue<Byte8> rhs)
 //	{
@@ -1375,20 +1376,20 @@
 //		return lhs = lhs % rhs;
 //	}
 
-	RValue<Byte8> operator&=(Byte8 &lhs, RValue<Byte8> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<Byte8> operator&=(Byte8 &lhs, RValue<Byte8> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<Byte8> operator|=(Byte8 &lhs, RValue<Byte8> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<Byte8> operator|=(Byte8 &lhs, RValue<Byte8> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<Byte8> operator^=(Byte8 &lhs, RValue<Byte8> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<Byte8> operator^=(Byte8 &lhs, RValue<Byte8> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
 //	RValue<Byte8> operator<<=(Byte8 &lhs, RValue<Byte8> rhs)
 //	{
@@ -1410,92 +1411,92 @@
 //		return RValue<Byte8>(Nucleus::createNeg(val.value));
 //	}
 
-	RValue<Byte8> operator~(RValue<Byte8> val)
-	{
-		return RValue<Byte8>(Nucleus::createNot(val.value));
-	}
+RValue<Byte8> operator~(RValue<Byte8> val)
+{
+	return RValue<Byte8>(Nucleus::createNot(val.value));
+}
 
-	RValue<Short4> Unpack(RValue<Byte4> x)
-	{
-		int shuffle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};   // Real type is v16i8
-		return As<Short4>(Nucleus::createShuffleVector(x.value, x.value, shuffle));
-	}
+RValue<Short4> Unpack(RValue<Byte4> x)
+{
+	int shuffle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};   // Real type is v16i8
+	return As<Short4>(Nucleus::createShuffleVector(x.value, x.value, shuffle));
+}
 
-	RValue<Short4> Unpack(RValue<Byte4> x, RValue<Byte4> y)
-	{
-		return UnpackLow(As<Byte8>(x), As<Byte8>(y));
-	}
+RValue<Short4> Unpack(RValue<Byte4> x, RValue<Byte4> y)
+{
+	return UnpackLow(As<Byte8>(x), As<Byte8>(y));
+}
 
-	RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y)
-	{
-		int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
-		return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-	}
+RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y)
+{
+	int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
+	return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+}
 
-	RValue<Short4> UnpackHigh(RValue<Byte8> x, RValue<Byte8> y)
-	{
-		int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
-		auto lowHigh = RValue<Byte16>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-		return As<Short4>(Swizzle(As<Int4>(lowHigh), 0x2323));
-	}
+RValue<Short4> UnpackHigh(RValue<Byte8> x, RValue<Byte8> y)
+{
+	int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
+	auto lowHigh = RValue<Byte16>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+	return As<Short4>(Swizzle(As<Int4>(lowHigh), 0x2323));
+}
 
-	SByte8::SByte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
-	{
-		int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
-		Value *vector = Nucleus::createConstantVector(constantVector, getType());
+SByte8::SByte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
+{
+	int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
+	Value *vector = Nucleus::createConstantVector(constantVector, getType());
 
-		storeValue(Nucleus::createBitCast(vector, getType()));
-	}
+	storeValue(Nucleus::createBitCast(vector, getType()));
+}
 
-	SByte8::SByte8(RValue<SByte8> rhs)
-	{
-		storeValue(rhs.value);
-	}
+SByte8::SByte8(RValue<SByte8> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	SByte8::SByte8(const SByte8 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+SByte8::SByte8(const SByte8 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	SByte8::SByte8(const Reference<SByte8> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+SByte8::SByte8(const Reference<SByte8> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<SByte8> SByte8::operator=(RValue<SByte8> rhs)
-	{
-		storeValue(rhs.value);
+RValue<SByte8> SByte8::operator=(RValue<SByte8> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<SByte8> SByte8::operator=(const SByte8 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<SByte8> SByte8::operator=(const SByte8 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<SByte8>(value);
-	}
+	return RValue<SByte8>(value);
+}
 
-	RValue<SByte8> SByte8::operator=(const Reference<SByte8> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<SByte8> SByte8::operator=(const Reference<SByte8> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<SByte8>(value);
-	}
+	return RValue<SByte8>(value);
+}
 
-	RValue<SByte8> operator+(RValue<SByte8> lhs, RValue<SByte8> rhs)
-	{
-		return RValue<SByte8>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<SByte8> operator+(RValue<SByte8> lhs, RValue<SByte8> rhs)
+{
+	return RValue<SByte8>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<SByte8> operator-(RValue<SByte8> lhs, RValue<SByte8> rhs)
-	{
-		return RValue<SByte8>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<SByte8> operator-(RValue<SByte8> lhs, RValue<SByte8> rhs)
+{
+	return RValue<SByte8>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
 //	RValue<SByte8> operator*(RValue<SByte8> lhs, RValue<SByte8> rhs)
 //	{
@@ -1512,20 +1513,20 @@
 //		return RValue<SByte8>(Nucleus::createSRem(lhs.value, rhs.value));
 //	}
 
-	RValue<SByte8> operator&(RValue<SByte8> lhs, RValue<SByte8> rhs)
-	{
-		return RValue<SByte8>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<SByte8> operator&(RValue<SByte8> lhs, RValue<SByte8> rhs)
+{
+	return RValue<SByte8>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<SByte8> operator|(RValue<SByte8> lhs, RValue<SByte8> rhs)
-	{
-		return RValue<SByte8>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<SByte8> operator|(RValue<SByte8> lhs, RValue<SByte8> rhs)
+{
+	return RValue<SByte8>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<SByte8> operator^(RValue<SByte8> lhs, RValue<SByte8> rhs)
-	{
-		return RValue<SByte8>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<SByte8> operator^(RValue<SByte8> lhs, RValue<SByte8> rhs)
+{
+	return RValue<SByte8>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
 //	RValue<SByte8> operator<<(RValue<SByte8> lhs, unsigned char rhs)
 //	{
@@ -1537,15 +1538,15 @@
 //		return RValue<SByte8>(Nucleus::createAShr(lhs.value, rhs.value));
 //	}
 
-	RValue<SByte8> operator+=(SByte8 &lhs, RValue<SByte8> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<SByte8> operator+=(SByte8 &lhs, RValue<SByte8> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<SByte8> operator-=(SByte8 &lhs, RValue<SByte8> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<SByte8> operator-=(SByte8 &lhs, RValue<SByte8> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
 //	RValue<SByte8> operator*=(SByte8 &lhs, RValue<SByte8> rhs)
 //	{
@@ -1562,20 +1563,20 @@
 //		return lhs = lhs % rhs;
 //	}
 
-	RValue<SByte8> operator&=(SByte8 &lhs, RValue<SByte8> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<SByte8> operator&=(SByte8 &lhs, RValue<SByte8> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<SByte8> operator|=(SByte8 &lhs, RValue<SByte8> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<SByte8> operator|=(SByte8 &lhs, RValue<SByte8> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<SByte8> operator^=(SByte8 &lhs, RValue<SByte8> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<SByte8> operator^=(SByte8 &lhs, RValue<SByte8> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
 //	RValue<SByte8> operator<<=(SByte8 &lhs, RValue<SByte8> rhs)
 //	{
@@ -1597,192 +1598,192 @@
 //		return RValue<SByte8>(Nucleus::createNeg(val.value));
 //	}
 
-	RValue<SByte8> operator~(RValue<SByte8> val)
-	{
-		return RValue<SByte8>(Nucleus::createNot(val.value));
-	}
+RValue<SByte8> operator~(RValue<SByte8> val)
+{
+	return RValue<SByte8>(Nucleus::createNot(val.value));
+}
 
-	RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y)
-	{
-		int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
-		return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-	}
+RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y)
+{
+	int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
+	return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+}
 
-	RValue<Short4> UnpackHigh(RValue<SByte8> x, RValue<SByte8> y)
-	{
-		int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
-		auto lowHigh = RValue<Byte16>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-		return As<Short4>(Swizzle(As<Int4>(lowHigh), 0x2323));
-	}
+RValue<Short4> UnpackHigh(RValue<SByte8> x, RValue<SByte8> y)
+{
+	int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
+	auto lowHigh = RValue<Byte16>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+	return As<Short4>(Swizzle(As<Int4>(lowHigh), 0x2323));
+}
 
-	Byte16::Byte16(RValue<Byte16> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Byte16::Byte16(RValue<Byte16> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Byte16::Byte16(const Byte16 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Byte16::Byte16(const Byte16 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Byte16::Byte16(const Reference<Byte16> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Byte16::Byte16(const Reference<Byte16> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<Byte16> Byte16::operator=(RValue<Byte16> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Byte16> Byte16::operator=(RValue<Byte16> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Byte16> Byte16::operator=(const Byte16 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Byte16> Byte16::operator=(const Byte16 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Byte16>(value);
-	}
+	return RValue<Byte16>(value);
+}
 
-	RValue<Byte16> Byte16::operator=(const Reference<Byte16> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Byte16> Byte16::operator=(const Reference<Byte16> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Byte16>(value);
-	}
+	return RValue<Byte16>(value);
+}
 
-	Short2::Short2(RValue<Short4> cast)
-	{
-		storeValue(Nucleus::createBitCast(cast.value, getType()));
-	}
+Short2::Short2(RValue<Short4> cast)
+{
+	storeValue(Nucleus::createBitCast(cast.value, getType()));
+}
 
-	UShort2::UShort2(RValue<UShort4> cast)
-	{
-		storeValue(Nucleus::createBitCast(cast.value, getType()));
-	}
+UShort2::UShort2(RValue<UShort4> cast)
+{
+	storeValue(Nucleus::createBitCast(cast.value, getType()));
+}
 
-	Short4::Short4(RValue<Int> cast)
-	{
-		Value *vector = loadValue();
-		Value *element = Nucleus::createTrunc(cast.value, Short::getType());
-		Value *insert = Nucleus::createInsertElement(vector, element, 0);
-		Value *swizzle = Swizzle(RValue<Short4>(insert), 0x0000).value;
+Short4::Short4(RValue<Int> cast)
+{
+	Value *vector = loadValue();
+	Value *element = Nucleus::createTrunc(cast.value, Short::getType());
+	Value *insert = Nucleus::createInsertElement(vector, element, 0);
+	Value *swizzle = Swizzle(RValue<Short4>(insert), 0x0000).value;
 
-		storeValue(swizzle);
-	}
+	storeValue(swizzle);
+}
 
 //	Short4::Short4(RValue<Float> cast)
 //	{
 //	}
 
-	Short4::Short4(short xyzw)
-	{
-		int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+Short4::Short4(short xyzw)
+{
+	int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	Short4::Short4(short x, short y, short z, short w)
-	{
-		int64_t constantVector[4] = {x, y, z, w};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+Short4::Short4(short x, short y, short z, short w)
+{
+	int64_t constantVector[4] = {x, y, z, w};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	Short4::Short4(RValue<Short4> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Short4::Short4(RValue<Short4> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Short4::Short4(const Short4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Short4::Short4(const Short4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Short4::Short4(const Reference<Short4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Short4::Short4(const Reference<Short4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Short4::Short4(RValue<UShort4> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Short4::Short4(RValue<UShort4> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Short4::Short4(const UShort4 &rhs)
-	{
-		storeValue(rhs.loadValue());
-	}
+Short4::Short4(const UShort4 &rhs)
+{
+	storeValue(rhs.loadValue());
+}
 
-	Short4::Short4(const Reference<UShort4> &rhs)
-	{
-		storeValue(rhs.loadValue());
-	}
+Short4::Short4(const Reference<UShort4> &rhs)
+{
+	storeValue(rhs.loadValue());
+}
 
-	RValue<Short4> Short4::operator=(RValue<Short4> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Short4> Short4::operator=(RValue<Short4> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Short4> Short4::operator=(const Short4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Short4> Short4::operator=(const Short4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Short4>(value);
-	}
+	return RValue<Short4>(value);
+}
 
-	RValue<Short4> Short4::operator=(const Reference<Short4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Short4> Short4::operator=(const Reference<Short4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Short4>(value);
-	}
+	return RValue<Short4>(value);
+}
 
-	RValue<Short4> Short4::operator=(RValue<UShort4> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Short4> Short4::operator=(RValue<UShort4> rhs)
+{
+	storeValue(rhs.value);
 
-		return RValue<Short4>(rhs);
-	}
+	return RValue<Short4>(rhs);
+}
 
-	RValue<Short4> Short4::operator=(const UShort4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Short4> Short4::operator=(const UShort4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Short4>(value);
-	}
+	return RValue<Short4>(value);
+}
 
-	RValue<Short4> Short4::operator=(const Reference<UShort4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Short4> Short4::operator=(const Reference<UShort4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Short4>(value);
-	}
+	return RValue<Short4>(value);
+}
 
-	RValue<Short4> operator+(RValue<Short4> lhs, RValue<Short4> rhs)
-	{
-		return RValue<Short4>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Short4> operator+(RValue<Short4> lhs, RValue<Short4> rhs)
+{
+	return RValue<Short4>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<Short4> operator-(RValue<Short4> lhs, RValue<Short4> rhs)
-	{
-		return RValue<Short4>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<Short4> operator-(RValue<Short4> lhs, RValue<Short4> rhs)
+{
+	return RValue<Short4>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<Short4> operator*(RValue<Short4> lhs, RValue<Short4> rhs)
-	{
-		return RValue<Short4>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<Short4> operator*(RValue<Short4> lhs, RValue<Short4> rhs)
+{
+	return RValue<Short4>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
 //	RValue<Short4> operator/(RValue<Short4> lhs, RValue<Short4> rhs)
 //	{
@@ -1794,35 +1795,35 @@
 //		return RValue<Short4>(Nucleus::createSRem(lhs.value, rhs.value));
 //	}
 
-	RValue<Short4> operator&(RValue<Short4> lhs, RValue<Short4> rhs)
-	{
-		return RValue<Short4>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Short4> operator&(RValue<Short4> lhs, RValue<Short4> rhs)
+{
+	return RValue<Short4>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Short4> operator|(RValue<Short4> lhs, RValue<Short4> rhs)
-	{
-		return RValue<Short4>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<Short4> operator|(RValue<Short4> lhs, RValue<Short4> rhs)
+{
+	return RValue<Short4>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<Short4> operator^(RValue<Short4> lhs, RValue<Short4> rhs)
-	{
-		return RValue<Short4>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<Short4> operator^(RValue<Short4> lhs, RValue<Short4> rhs)
+{
+	return RValue<Short4>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<Short4> operator+=(Short4 &lhs, RValue<Short4> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<Short4> operator+=(Short4 &lhs, RValue<Short4> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<Short4> operator-=(Short4 &lhs, RValue<Short4> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<Short4> operator-=(Short4 &lhs, RValue<Short4> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<Short4> operator*=(Short4 &lhs, RValue<Short4> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<Short4> operator*=(Short4 &lhs, RValue<Short4> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
 //	RValue<Short4> operator/=(Short4 &lhs, RValue<Short4> rhs)
 //	{
@@ -1834,1166 +1835,1166 @@
 //		return lhs = lhs % rhs;
 //	}
 
-	RValue<Short4> operator&=(Short4 &lhs, RValue<Short4> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<Short4> operator&=(Short4 &lhs, RValue<Short4> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<Short4> operator|=(Short4 &lhs, RValue<Short4> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<Short4> operator|=(Short4 &lhs, RValue<Short4> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<Short4> operator^=(Short4 &lhs, RValue<Short4> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<Short4> operator^=(Short4 &lhs, RValue<Short4> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<Short4> operator<<=(Short4 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<Short4> operator<<=(Short4 &lhs, unsigned char rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<Short4> operator>>=(Short4 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<Short4> operator>>=(Short4 &lhs, unsigned char rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
 //	RValue<Short4> operator+(RValue<Short4> val)
 //	{
 //		return val;
 //	}
 
-	RValue<Short4> operator-(RValue<Short4> val)
-	{
-		return RValue<Short4>(Nucleus::createNeg(val.value));
-	}
+RValue<Short4> operator-(RValue<Short4> val)
+{
+	return RValue<Short4>(Nucleus::createNeg(val.value));
+}
 
-	RValue<Short4> operator~(RValue<Short4> val)
-	{
-		return RValue<Short4>(Nucleus::createNot(val.value));
-	}
+RValue<Short4> operator~(RValue<Short4> val)
+{
+	return RValue<Short4>(Nucleus::createNot(val.value));
+}
 
-	RValue<Short4> RoundShort4(RValue<Float4> cast)
-	{
-		RValue<Int4> int4 = RoundInt(cast);
-		return As<Short4>(PackSigned(int4, int4));
-	}
+RValue<Short4> RoundShort4(RValue<Float4> cast)
+{
+	RValue<Int4> int4 = RoundInt(cast);
+	return As<Short4>(PackSigned(int4, int4));
+}
 
-	RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y)
-	{
-		int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};   // Real type is v8i16
-		return As<Int2>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-	}
+RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y)
+{
+	int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};   // Real type is v8i16
+	return As<Int2>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+}
 
-	RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y)
-	{
-		int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};   // Real type is v8i16
-		auto lowHigh = RValue<Short8>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-		return As<Int2>(Swizzle(As<Int4>(lowHigh), 0x2323));
-	}
+RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y)
+{
+	int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};   // Real type is v8i16
+	auto lowHigh = RValue<Short8>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+	return As<Int2>(Swizzle(As<Int4>(lowHigh), 0x2323));
+}
 
-	RValue<Short4> Swizzle(RValue<Short4> x, uint16_t select)
+RValue<Short4> Swizzle(RValue<Short4> x, uint16_t select)
+{
+	// Real type is v8i16
+	int shuffle[8] =
 	{
-		// Real type is v8i16
-		int shuffle[8] =
-		{
-			(select >> 12) & 0x03,
-			(select >>  8) & 0x03,
-			(select >>  4) & 0x03,
-			(select >>  0) & 0x03,
-			(select >> 12) & 0x03,
-			(select >>  8) & 0x03,
-			(select >>  4) & 0x03,
-			(select >>  0) & 0x03,
-		};
+		(select >> 12) & 0x03,
+		(select >>  8) & 0x03,
+		(select >>  4) & 0x03,
+		(select >>  0) & 0x03,
+		(select >> 12) & 0x03,
+		(select >>  8) & 0x03,
+		(select >>  4) & 0x03,
+		(select >>  0) & 0x03,
+	};
 
-		return As<Short4>(Nucleus::createShuffleVector(x.value, x.value, shuffle));
-	}
+	return As<Short4>(Nucleus::createShuffleVector(x.value, x.value, shuffle));
+}
 
-	RValue<Short4> Insert(RValue<Short4> val, RValue<Short> element, int i)
-	{
-		return RValue<Short4>(Nucleus::createInsertElement(val.value, element.value, i));
-	}
+RValue<Short4> Insert(RValue<Short4> val, RValue<Short> element, int i)
+{
+	return RValue<Short4>(Nucleus::createInsertElement(val.value, element.value, i));
+}
 
-	RValue<Short> Extract(RValue<Short4> val, int i)
-	{
-		return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));
-	}
+RValue<Short> Extract(RValue<Short4> val, int i)
+{
+	return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));
+}
 
-	UShort4::UShort4(RValue<Int4> cast)
-	{
-		*this = Short4(cast);
-	}
+UShort4::UShort4(RValue<Int4> cast)
+{
+	*this = Short4(cast);
+}
 
-	UShort4::UShort4(unsigned short xyzw)
-	{
-		int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+UShort4::UShort4(unsigned short xyzw)
+{
+	int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	UShort4::UShort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
-	{
-		int64_t constantVector[4] = {x, y, z, w};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+UShort4::UShort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
+{
+	int64_t constantVector[4] = {x, y, z, w};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	UShort4::UShort4(RValue<UShort4> rhs)
-	{
-		storeValue(rhs.value);
-	}
+UShort4::UShort4(RValue<UShort4> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	UShort4::UShort4(const UShort4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UShort4::UShort4(const UShort4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UShort4::UShort4(const Reference<UShort4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UShort4::UShort4(const Reference<UShort4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UShort4::UShort4(RValue<Short4> rhs)
-	{
-		storeValue(rhs.value);
-	}
+UShort4::UShort4(RValue<Short4> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	UShort4::UShort4(const Short4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UShort4::UShort4(const Short4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UShort4::UShort4(const Reference<Short4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UShort4::UShort4(const Reference<Short4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<UShort4> UShort4::operator=(RValue<UShort4> rhs)
-	{
-		storeValue(rhs.value);
+RValue<UShort4> UShort4::operator=(RValue<UShort4> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<UShort4> UShort4::operator=(const UShort4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UShort4> UShort4::operator=(const UShort4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UShort4>(value);
-	}
+	return RValue<UShort4>(value);
+}
 
-	RValue<UShort4> UShort4::operator=(const Reference<UShort4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UShort4> UShort4::operator=(const Reference<UShort4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UShort4>(value);
-	}
+	return RValue<UShort4>(value);
+}
 
-	RValue<UShort4> UShort4::operator=(RValue<Short4> rhs)
-	{
-		storeValue(rhs.value);
+RValue<UShort4> UShort4::operator=(RValue<Short4> rhs)
+{
+	storeValue(rhs.value);
 
-		return RValue<UShort4>(rhs);
-	}
+	return RValue<UShort4>(rhs);
+}
 
-	RValue<UShort4> UShort4::operator=(const Short4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UShort4> UShort4::operator=(const Short4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UShort4>(value);
-	}
+	return RValue<UShort4>(value);
+}
 
-	RValue<UShort4> UShort4::operator=(const Reference<Short4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UShort4> UShort4::operator=(const Reference<Short4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UShort4>(value);
-	}
+	return RValue<UShort4>(value);
+}
 
-	RValue<UShort4> operator+(RValue<UShort4> lhs, RValue<UShort4> rhs)
-	{
-		return RValue<UShort4>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<UShort4> operator+(RValue<UShort4> lhs, RValue<UShort4> rhs)
+{
+	return RValue<UShort4>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<UShort4> operator-(RValue<UShort4> lhs, RValue<UShort4> rhs)
-	{
-		return RValue<UShort4>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<UShort4> operator-(RValue<UShort4> lhs, RValue<UShort4> rhs)
+{
+	return RValue<UShort4>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<UShort4> operator*(RValue<UShort4> lhs, RValue<UShort4> rhs)
-	{
-		return RValue<UShort4>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<UShort4> operator*(RValue<UShort4> lhs, RValue<UShort4> rhs)
+{
+	return RValue<UShort4>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<UShort4> operator&(RValue<UShort4> lhs, RValue<UShort4> rhs)
-	{
-		return RValue<UShort4>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<UShort4> operator&(RValue<UShort4> lhs, RValue<UShort4> rhs)
+{
+	return RValue<UShort4>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<UShort4> operator|(RValue<UShort4> lhs, RValue<UShort4> rhs)
-	{
-		return RValue<UShort4>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<UShort4> operator|(RValue<UShort4> lhs, RValue<UShort4> rhs)
+{
+	return RValue<UShort4>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<UShort4> operator^(RValue<UShort4> lhs, RValue<UShort4> rhs)
-	{
-		return RValue<UShort4>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<UShort4> operator^(RValue<UShort4> lhs, RValue<UShort4> rhs)
+{
+	return RValue<UShort4>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<UShort4> operator<<=(UShort4 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<UShort4> operator<<=(UShort4 &lhs, unsigned char rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<UShort4> operator>>=(UShort4 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<UShort4> operator>>=(UShort4 &lhs, unsigned char rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
-	RValue<UShort4> operator~(RValue<UShort4> val)
-	{
-		return RValue<UShort4>(Nucleus::createNot(val.value));
-	}
+RValue<UShort4> operator~(RValue<UShort4> val)
+{
+	return RValue<UShort4>(Nucleus::createNot(val.value));
+}
 
-	Short8::Short8(short c)
-	{
-		int64_t constantVector[8] = {c, c, c, c, c, c, c, c};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+Short8::Short8(short c)
+{
+	int64_t constantVector[8] = {c, c, c, c, c, c, c, c};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	Short8::Short8(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7)
-	{
-		int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+Short8::Short8(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7)
+{
+	int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	Short8::Short8(RValue<Short8> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Short8::Short8(RValue<Short8> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Short8::Short8(const Reference<Short8> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Short8::Short8(const Reference<Short8> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Short8::Short8(RValue<Short4> lo, RValue<Short4> hi)
-	{
-		int shuffle[8] = {0, 1, 2, 3, 8, 9, 10, 11};   // Real type is v8i16
-		Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
+Short8::Short8(RValue<Short4> lo, RValue<Short4> hi)
+{
+	int shuffle[8] = {0, 1, 2, 3, 8, 9, 10, 11};   // Real type is v8i16
+	Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
 
-		storeValue(packed);
-	}
+	storeValue(packed);
+}
 
-	RValue<Short8> Short8::operator=(RValue<Short8> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Short8> Short8::operator=(RValue<Short8> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Short8> Short8::operator=(const Short8 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Short8> Short8::operator=(const Short8 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Short8>(value);
-	}
+	return RValue<Short8>(value);
+}
 
-	RValue<Short8> Short8::operator=(const Reference<Short8> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Short8> Short8::operator=(const Reference<Short8> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Short8>(value);
-	}
+	return RValue<Short8>(value);
+}
 
-	RValue<Short8> operator+(RValue<Short8> lhs, RValue<Short8> rhs)
-	{
-		return RValue<Short8>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Short8> operator+(RValue<Short8> lhs, RValue<Short8> rhs)
+{
+	return RValue<Short8>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<Short8> operator&(RValue<Short8> lhs, RValue<Short8> rhs)
-	{
-		return RValue<Short8>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Short8> operator&(RValue<Short8> lhs, RValue<Short8> rhs)
+{
+	return RValue<Short8>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Int4> Abs(RValue<Int4> x)
-	{
-		// TODO: Optimize.
-		auto negative = x >> 31;
-		return (x ^ negative) - negative;
-	}
+RValue<Int4> Abs(RValue<Int4> x)
+{
+	// TODO: Optimize.
+	auto negative = x >> 31;
+	return (x ^ negative) - negative;
+}
 
-	UShort8::UShort8(unsigned short c)
-	{
-		int64_t constantVector[8] = {c, c, c, c, c, c, c, c};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+UShort8::UShort8(unsigned short c)
+{
+	int64_t constantVector[8] = {c, c, c, c, c, c, c, c};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	UShort8::UShort8(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7)
-	{
-		int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+UShort8::UShort8(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7)
+{
+	int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	UShort8::UShort8(RValue<UShort8> rhs)
-	{
-		storeValue(rhs.value);
-	}
+UShort8::UShort8(RValue<UShort8> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	UShort8::UShort8(const Reference<UShort8> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UShort8::UShort8(const Reference<UShort8> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UShort8::UShort8(RValue<UShort4> lo, RValue<UShort4> hi)
-	{
-		int shuffle[8] = {0, 1, 2, 3, 8, 9, 10, 11};   // Real type is v8i16
-		Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
+UShort8::UShort8(RValue<UShort4> lo, RValue<UShort4> hi)
+{
+	int shuffle[8] = {0, 1, 2, 3, 8, 9, 10, 11};   // Real type is v8i16
+	Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
 
-		storeValue(packed);
-	}
+	storeValue(packed);
+}
 
-	RValue<UShort8> UShort8::operator=(RValue<UShort8> rhs)
-	{
-		storeValue(rhs.value);
+RValue<UShort8> UShort8::operator=(RValue<UShort8> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<UShort8> UShort8::operator=(const UShort8 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UShort8> UShort8::operator=(const UShort8 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UShort8>(value);
-	}
+	return RValue<UShort8>(value);
+}
 
-	RValue<UShort8> UShort8::operator=(const Reference<UShort8> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UShort8> UShort8::operator=(const Reference<UShort8> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UShort8>(value);
-	}
+	return RValue<UShort8>(value);
+}
 
-	RValue<UShort8> operator&(RValue<UShort8> lhs, RValue<UShort8> rhs)
-	{
-		return RValue<UShort8>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<UShort8> operator&(RValue<UShort8> lhs, RValue<UShort8> rhs)
+{
+	return RValue<UShort8>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<UShort8> operator+(RValue<UShort8> lhs, RValue<UShort8> rhs)
-	{
-		return RValue<UShort8>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<UShort8> operator+(RValue<UShort8> lhs, RValue<UShort8> rhs)
+{
+	return RValue<UShort8>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<UShort8> operator*(RValue<UShort8> lhs, RValue<UShort8> rhs)
-	{
-		return RValue<UShort8>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<UShort8> operator*(RValue<UShort8> lhs, RValue<UShort8> rhs)
+{
+	return RValue<UShort8>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<UShort8> operator+=(UShort8 &lhs, RValue<UShort8> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<UShort8> operator+=(UShort8 &lhs, RValue<UShort8> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<UShort8> operator~(RValue<UShort8> val)
-	{
-		return RValue<UShort8>(Nucleus::createNot(val.value));
-	}
+RValue<UShort8> operator~(RValue<UShort8> val)
+{
+	return RValue<UShort8>(Nucleus::createNot(val.value));
+}
 
-	Int::Int(Argument<Int> argument)
-	{
-		storeValue(argument.value);
-	}
+Int::Int(Argument<Int> argument)
+{
+	storeValue(argument.value);
+}
 
-	Int::Int(RValue<Byte> cast)
-	{
-		Value *integer = Nucleus::createZExt(cast.value, Int::getType());
+Int::Int(RValue<Byte> cast)
+{
+	Value *integer = Nucleus::createZExt(cast.value, Int::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Int::Int(RValue<SByte> cast)
-	{
-		Value *integer = Nucleus::createSExt(cast.value, Int::getType());
+Int::Int(RValue<SByte> cast)
+{
+	Value *integer = Nucleus::createSExt(cast.value, Int::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Int::Int(RValue<Short> cast)
-	{
-		Value *integer = Nucleus::createSExt(cast.value, Int::getType());
+Int::Int(RValue<Short> cast)
+{
+	Value *integer = Nucleus::createSExt(cast.value, Int::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Int::Int(RValue<UShort> cast)
-	{
-		Value *integer = Nucleus::createZExt(cast.value, Int::getType());
+Int::Int(RValue<UShort> cast)
+{
+	Value *integer = Nucleus::createZExt(cast.value, Int::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Int::Int(RValue<Int2> cast)
-	{
-		*this = Extract(cast, 0);
-	}
+Int::Int(RValue<Int2> cast)
+{
+	*this = Extract(cast, 0);
+}
 
-	Int::Int(RValue<Long> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, Int::getType());
+Int::Int(RValue<Long> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, Int::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Int::Int(RValue<Float> cast)
-	{
-		Value *integer = Nucleus::createFPToSI(cast.value, Int::getType());
+Int::Int(RValue<Float> cast)
+{
+	Value *integer = Nucleus::createFPToSI(cast.value, Int::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Int::Int(int x)
-	{
-		storeValue(Nucleus::createConstantInt(x));
-	}
+Int::Int(int x)
+{
+	storeValue(Nucleus::createConstantInt(x));
+}
 
-	Int::Int(RValue<Int> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Int::Int(RValue<Int> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Int::Int(RValue<UInt> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Int::Int(RValue<UInt> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Int::Int(const Int &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int::Int(const Int &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int::Int(const Reference<Int> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int::Int(const Reference<Int> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int::Int(const UInt &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int::Int(const UInt &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int::Int(const Reference<UInt> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int::Int(const Reference<UInt> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<Int> Int::operator=(int rhs)
-	{
-		return RValue<Int>(storeValue(Nucleus::createConstantInt(rhs)));
-	}
+RValue<Int> Int::operator=(int rhs)
+{
+	return RValue<Int>(storeValue(Nucleus::createConstantInt(rhs)));
+}
 
-	RValue<Int> Int::operator=(RValue<Int> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Int> Int::operator=(RValue<Int> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Int> Int::operator=(RValue<UInt> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Int> Int::operator=(RValue<UInt> rhs)
+{
+	storeValue(rhs.value);
 
-		return RValue<Int>(rhs);
-	}
+	return RValue<Int>(rhs);
+}
 
-	RValue<Int> Int::operator=(const Int &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Int> Int::operator=(const Int &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Int>(value);
-	}
+	return RValue<Int>(value);
+}
 
-	RValue<Int> Int::operator=(const Reference<Int> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Int> Int::operator=(const Reference<Int> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Int>(value);
-	}
+	return RValue<Int>(value);
+}
 
-	RValue<Int> Int::operator=(const UInt &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Int> Int::operator=(const UInt &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Int>(value);
-	}
+	return RValue<Int>(value);
+}
 
-	RValue<Int> Int::operator=(const Reference<UInt> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Int> Int::operator=(const Reference<UInt> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Int>(value);
-	}
+	return RValue<Int>(value);
+}
 
-	RValue<Int> operator+(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Int> operator+(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator-(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<Int> operator-(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator*(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<Int> operator*(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator/(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createSDiv(lhs.value, rhs.value));
-	}
+RValue<Int> operator/(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createSDiv(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator%(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createSRem(lhs.value, rhs.value));
-	}
+RValue<Int> operator%(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createSRem(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator&(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Int> operator&(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator|(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<Int> operator|(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator^(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<Int> operator^(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator<<(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createShl(lhs.value, rhs.value));
-	}
+RValue<Int> operator<<(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createShl(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator>>(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createAShr(lhs.value, rhs.value));
-	}
+RValue<Int> operator>>(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createAShr(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator+=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<Int> operator+=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<Int> operator-=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<Int> operator-=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<Int> operator*=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<Int> operator*=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
-	RValue<Int> operator/=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs / rhs;
-	}
+RValue<Int> operator/=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs / rhs;
+}
 
-	RValue<Int> operator%=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs % rhs;
-	}
+RValue<Int> operator%=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs % rhs;
+}
 
-	RValue<Int> operator&=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<Int> operator&=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<Int> operator|=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<Int> operator|=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<Int> operator^=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<Int> operator^=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<Int> operator<<=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<Int> operator<<=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<Int> operator>>=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<Int> operator>>=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
-	RValue<Int> operator+(RValue<Int> val)
-	{
-		return val;
-	}
+RValue<Int> operator+(RValue<Int> val)
+{
+	return val;
+}
 
-	RValue<Int> operator-(RValue<Int> val)
-	{
-		return RValue<Int>(Nucleus::createNeg(val.value));
-	}
+RValue<Int> operator-(RValue<Int> val)
+{
+	return RValue<Int>(Nucleus::createNeg(val.value));
+}
 
-	RValue<Int> operator~(RValue<Int> val)
-	{
-		return RValue<Int>(Nucleus::createNot(val.value));
-	}
+RValue<Int> operator~(RValue<Int> val)
+{
+	return RValue<Int>(Nucleus::createNot(val.value));
+}
 
-	RValue<Bool> operator<(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator<=(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<=(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>=(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>=(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator!=(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator!=(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator==(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
-	}
+RValue<Bool> operator==(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
+}
 
-	RValue<Int> Max(RValue<Int> x, RValue<Int> y)
-	{
-		return IfThenElse(x > y, x, y);
-	}
+RValue<Int> Max(RValue<Int> x, RValue<Int> y)
+{
+	return IfThenElse(x > y, x, y);
+}
 
-	RValue<Int> Min(RValue<Int> x, RValue<Int> y)
-	{
-		return IfThenElse(x < y, x, y);
-	}
+RValue<Int> Min(RValue<Int> x, RValue<Int> y)
+{
+	return IfThenElse(x < y, x, y);
+}
 
-	RValue<Int> Clamp(RValue<Int> x, RValue<Int> min, RValue<Int> max)
-	{
-		return Min(Max(x, min), max);
-	}
+RValue<Int> Clamp(RValue<Int> x, RValue<Int> min, RValue<Int> max)
+{
+	return Min(Max(x, min), max);
+}
 
-	Long::Long(RValue<Int> cast)
-	{
-		Value *integer = Nucleus::createSExt(cast.value, Long::getType());
+Long::Long(RValue<Int> cast)
+{
+	Value *integer = Nucleus::createSExt(cast.value, Long::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Long::Long(RValue<UInt> cast)
-	{
-		Value *integer = Nucleus::createZExt(cast.value, Long::getType());
+Long::Long(RValue<UInt> cast)
+{
+	Value *integer = Nucleus::createZExt(cast.value, Long::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Long::Long(RValue<Long> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Long::Long(RValue<Long> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	RValue<Long> Long::operator=(int64_t rhs)
-	{
-		return RValue<Long>(storeValue(Nucleus::createConstantLong(rhs)));
-	}
+RValue<Long> Long::operator=(int64_t rhs)
+{
+	return RValue<Long>(storeValue(Nucleus::createConstantLong(rhs)));
+}
 
-	RValue<Long> Long::operator=(RValue<Long> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Long> Long::operator=(RValue<Long> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Long> Long::operator=(const Long &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Long> Long::operator=(const Long &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Long>(value);
-	}
+	return RValue<Long>(value);
+}
 
-	RValue<Long> Long::operator=(const Reference<Long> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Long> Long::operator=(const Reference<Long> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Long>(value);
-	}
+	return RValue<Long>(value);
+}
 
-	RValue<Long> operator+(RValue<Long> lhs, RValue<Long> rhs)
-	{
-		return RValue<Long>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Long> operator+(RValue<Long> lhs, RValue<Long> rhs)
+{
+	return RValue<Long>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<Long> operator-(RValue<Long> lhs, RValue<Long> rhs)
-	{
-		return RValue<Long>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<Long> operator-(RValue<Long> lhs, RValue<Long> rhs)
+{
+	return RValue<Long>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<Long> operator*(RValue<Long> lhs, RValue<Long> rhs)
-	{
-		return RValue<Long>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<Long> operator*(RValue<Long> lhs, RValue<Long> rhs)
+{
+	return RValue<Long>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<Long> operator>>(RValue<Long> lhs, RValue<Long> rhs)
-	{
-		return RValue<Long>(Nucleus::createAShr(lhs.value, rhs.value));
-	}
+RValue<Long> operator>>(RValue<Long> lhs, RValue<Long> rhs)
+{
+	return RValue<Long>(Nucleus::createAShr(lhs.value, rhs.value));
+}
 
-	RValue<Long> operator+=(Long &lhs, RValue<Long> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<Long> operator+=(Long &lhs, RValue<Long> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<Long> operator-=(Long &lhs, RValue<Long> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<Long> operator-=(Long &lhs, RValue<Long> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<Long> AddAtomic(RValue<Pointer<Long> > x, RValue<Long> y)
-	{
-		return RValue<Long>(Nucleus::createAtomicAdd(x.value, y.value));
-	}
+RValue<Long> AddAtomic(RValue<Pointer<Long> > x, RValue<Long> y)
+{
+	return RValue<Long>(Nucleus::createAtomicAdd(x.value, y.value));
+}
 
-	RValue<UInt> AddAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
-	{
-		return RValue<UInt>(Nucleus::createAtomicAdd(x.value, y.value, memoryOrder));
-	}
+RValue<UInt> AddAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
+{
+	return RValue<UInt>(Nucleus::createAtomicAdd(x.value, y.value, memoryOrder));
+}
 
-	RValue<UInt> SubAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
-	{
-		return RValue<UInt>(Nucleus::createAtomicSub(x.value, y.value, memoryOrder));
-	}
+RValue<UInt> SubAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
+{
+	return RValue<UInt>(Nucleus::createAtomicSub(x.value, y.value, memoryOrder));
+}
 
-	RValue<UInt> AndAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
-	{
-		return RValue<UInt>(Nucleus::createAtomicAnd(x.value, y.value, memoryOrder));
-	}
+RValue<UInt> AndAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
+{
+	return RValue<UInt>(Nucleus::createAtomicAnd(x.value, y.value, memoryOrder));
+}
 
-	RValue<UInt> OrAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
-	{
-		return RValue<UInt>(Nucleus::createAtomicOr(x.value, y.value, memoryOrder));
-	}
+RValue<UInt> OrAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
+{
+	return RValue<UInt>(Nucleus::createAtomicOr(x.value, y.value, memoryOrder));
+}
 
-	RValue<UInt> XorAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
-	{
-		return RValue<UInt>(Nucleus::createAtomicXor(x.value, y.value, memoryOrder));
-	}
+RValue<UInt> XorAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
+{
+	return RValue<UInt>(Nucleus::createAtomicXor(x.value, y.value, memoryOrder));
+}
 
-	RValue<Int> MinAtomic(RValue<Pointer<Int> > x, RValue<Int> y, std::memory_order memoryOrder)
-	{
-		return RValue<Int>(Nucleus::createAtomicMin(x.value, y.value, memoryOrder));
-	}
+RValue<Int> MinAtomic(RValue<Pointer<Int> > x, RValue<Int> y, std::memory_order memoryOrder)
+{
+	return RValue<Int>(Nucleus::createAtomicMin(x.value, y.value, memoryOrder));
+}
 
-	RValue<UInt> MinAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
-	{
-		return RValue<UInt>(Nucleus::createAtomicUMin(x.value, y.value, memoryOrder));
-	}
+RValue<UInt> MinAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
+{
+	return RValue<UInt>(Nucleus::createAtomicUMin(x.value, y.value, memoryOrder));
+}
 
-	RValue<Int> MaxAtomic(RValue<Pointer<Int> > x, RValue<Int> y, std::memory_order memoryOrder)
-	{
-		return RValue<Int>(Nucleus::createAtomicMax(x.value, y.value, memoryOrder));
-	}
+RValue<Int> MaxAtomic(RValue<Pointer<Int> > x, RValue<Int> y, std::memory_order memoryOrder)
+{
+	return RValue<Int>(Nucleus::createAtomicMax(x.value, y.value, memoryOrder));
+}
 
-	RValue<UInt> MaxAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
-	{
-		return RValue<UInt>(Nucleus::createAtomicUMax(x.value, y.value, memoryOrder));
-	}
+RValue<UInt> MaxAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
+{
+	return RValue<UInt>(Nucleus::createAtomicUMax(x.value, y.value, memoryOrder));
+}
 
-	RValue<UInt> ExchangeAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
-	{
-		return RValue<UInt>(Nucleus::createAtomicExchange(x.value, y.value, memoryOrder));
-	}
+RValue<UInt> ExchangeAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
+{
+	return RValue<UInt>(Nucleus::createAtomicExchange(x.value, y.value, memoryOrder));
+}
 
-	RValue<UInt> CompareExchangeAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, RValue<UInt> compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
-	{
-		return RValue<UInt>(Nucleus::createAtomicCompareExchange(x.value, y.value, compare.value, memoryOrderEqual, memoryOrderUnequal));
-	}
+RValue<UInt> CompareExchangeAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, RValue<UInt> compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
+{
+	return RValue<UInt>(Nucleus::createAtomicCompareExchange(x.value, y.value, compare.value, memoryOrderEqual, memoryOrderUnequal));
+}
 
-	UInt::UInt(Argument<UInt> argument)
-	{
-		storeValue(argument.value);
-	}
+UInt::UInt(Argument<UInt> argument)
+{
+	storeValue(argument.value);
+}
 
-	UInt::UInt(RValue<UShort> cast)
-	{
-		Value *integer = Nucleus::createZExt(cast.value, UInt::getType());
+UInt::UInt(RValue<UShort> cast)
+{
+	Value *integer = Nucleus::createZExt(cast.value, UInt::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	UInt::UInt(RValue<Long> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, UInt::getType());
+UInt::UInt(RValue<Long> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, UInt::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	UInt::UInt(int x)
-	{
-		storeValue(Nucleus::createConstantInt(x));
-	}
+UInt::UInt(int x)
+{
+	storeValue(Nucleus::createConstantInt(x));
+}
 
-	UInt::UInt(unsigned int x)
-	{
-		storeValue(Nucleus::createConstantInt(x));
-	}
+UInt::UInt(unsigned int x)
+{
+	storeValue(Nucleus::createConstantInt(x));
+}
 
-	UInt::UInt(RValue<UInt> rhs)
-	{
-		storeValue(rhs.value);
-	}
+UInt::UInt(RValue<UInt> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	UInt::UInt(RValue<Int> rhs)
-	{
-		storeValue(rhs.value);
-	}
+UInt::UInt(RValue<Int> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	UInt::UInt(const UInt &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt::UInt(const UInt &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UInt::UInt(const Reference<UInt> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt::UInt(const Reference<UInt> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UInt::UInt(const Int &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt::UInt(const Int &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UInt::UInt(const Reference<Int> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt::UInt(const Reference<Int> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<UInt> UInt::operator=(unsigned int rhs)
-	{
-		return RValue<UInt>(storeValue(Nucleus::createConstantInt(rhs)));
-	}
+RValue<UInt> UInt::operator=(unsigned int rhs)
+{
+	return RValue<UInt>(storeValue(Nucleus::createConstantInt(rhs)));
+}
 
-	RValue<UInt> UInt::operator=(RValue<UInt> rhs)
-	{
-		storeValue(rhs.value);
+RValue<UInt> UInt::operator=(RValue<UInt> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<UInt> UInt::operator=(RValue<Int> rhs)
-	{
-		storeValue(rhs.value);
+RValue<UInt> UInt::operator=(RValue<Int> rhs)
+{
+	storeValue(rhs.value);
 
-		return RValue<UInt>(rhs);
-	}
+	return RValue<UInt>(rhs);
+}
 
-	RValue<UInt> UInt::operator=(const UInt &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UInt> UInt::operator=(const UInt &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UInt>(value);
-	}
+	return RValue<UInt>(value);
+}
 
-	RValue<UInt> UInt::operator=(const Reference<UInt> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UInt> UInt::operator=(const Reference<UInt> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UInt>(value);
-	}
+	return RValue<UInt>(value);
+}
 
-	RValue<UInt> UInt::operator=(const Int &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UInt> UInt::operator=(const Int &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UInt>(value);
-	}
+	return RValue<UInt>(value);
+}
 
-	RValue<UInt> UInt::operator=(const Reference<Int> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UInt> UInt::operator=(const Reference<Int> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UInt>(value);
-	}
+	return RValue<UInt>(value);
+}
 
-	RValue<UInt> operator+(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<UInt> operator+(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator-(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<UInt> operator-(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator*(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<UInt> operator*(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator/(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createUDiv(lhs.value, rhs.value));
-	}
+RValue<UInt> operator/(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createUDiv(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator%(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createURem(lhs.value, rhs.value));
-	}
+RValue<UInt> operator%(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createURem(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator&(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<UInt> operator&(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator|(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<UInt> operator|(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator^(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<UInt> operator^(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator<<(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createShl(lhs.value, rhs.value));
-	}
+RValue<UInt> operator<<(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createShl(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator>>(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createLShr(lhs.value, rhs.value));
-	}
+RValue<UInt> operator>>(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createLShr(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator+=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<UInt> operator+=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<UInt> operator-=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<UInt> operator-=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<UInt> operator*=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<UInt> operator*=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
-	RValue<UInt> operator/=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs / rhs;
-	}
+RValue<UInt> operator/=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs / rhs;
+}
 
-	RValue<UInt> operator%=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs % rhs;
-	}
+RValue<UInt> operator%=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs % rhs;
+}
 
-	RValue<UInt> operator&=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<UInt> operator&=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<UInt> operator|=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<UInt> operator|=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<UInt> operator^=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<UInt> operator^=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<UInt> operator<<=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<UInt> operator<<=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<UInt> operator>>=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<UInt> operator>>=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
-	RValue<UInt> operator+(RValue<UInt> val)
-	{
-		return val;
-	}
+RValue<UInt> operator+(RValue<UInt> val)
+{
+	return val;
+}
 
-	RValue<UInt> operator-(RValue<UInt> val)
-	{
-		return RValue<UInt>(Nucleus::createNeg(val.value));
-	}
+RValue<UInt> operator-(RValue<UInt> val)
+{
+	return RValue<UInt>(Nucleus::createNeg(val.value));
+}
 
-	RValue<UInt> operator~(RValue<UInt> val)
-	{
-		return RValue<UInt>(Nucleus::createNot(val.value));
-	}
+RValue<UInt> operator~(RValue<UInt> val)
+{
+	return RValue<UInt>(Nucleus::createNot(val.value));
+}
 
-	RValue<UInt> Max(RValue<UInt> x, RValue<UInt> y)
-	{
-		return IfThenElse(x > y, x, y);
-	}
+RValue<UInt> Max(RValue<UInt> x, RValue<UInt> y)
+{
+	return IfThenElse(x > y, x, y);
+}
 
-	RValue<UInt> Min(RValue<UInt> x, RValue<UInt> y)
-	{
-		return IfThenElse(x < y, x, y);
-	}
+RValue<UInt> Min(RValue<UInt> x, RValue<UInt> y)
+{
+	return IfThenElse(x < y, x, y);
+}
 
-	RValue<UInt> Clamp(RValue<UInt> x, RValue<UInt> min, RValue<UInt> max)
-	{
-		return Min(Max(x, min), max);
-	}
+RValue<UInt> Clamp(RValue<UInt> x, RValue<UInt> min, RValue<UInt> max)
+{
+	return Min(Max(x, min), max);
+}
 
-	RValue<Bool> operator<(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator<=(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<=(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>=(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>=(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator!=(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator!=(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator==(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
-	}
+RValue<Bool> operator==(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
+}
 
-	Int2::Int2(RValue<Int4> cast)
-	{
-		storeValue(Nucleus::createBitCast(cast.value, getType()));
-	}
+Int2::Int2(RValue<Int4> cast)
+{
+	storeValue(Nucleus::createBitCast(cast.value, getType()));
+}
 
-	Int2::Int2(int x, int y)
-	{
-		int64_t constantVector[2] = {x, y};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+Int2::Int2(int x, int y)
+{
+	int64_t constantVector[2] = {x, y};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	Int2::Int2(RValue<Int2> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Int2::Int2(RValue<Int2> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Int2::Int2(const Int2 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int2::Int2(const Int2 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int2::Int2(const Reference<Int2> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int2::Int2(const Reference<Int2> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int2::Int2(RValue<Int> lo, RValue<Int> hi)
-	{
-		int shuffle[4] = {0, 4, 1, 5};
-		Value *packed = Nucleus::createShuffleVector(Int4(lo).loadValue(), Int4(hi).loadValue(), shuffle);
+Int2::Int2(RValue<Int> lo, RValue<Int> hi)
+{
+	int shuffle[4] = {0, 4, 1, 5};
+	Value *packed = Nucleus::createShuffleVector(Int4(lo).loadValue(), Int4(hi).loadValue(), shuffle);
 
-		storeValue(Nucleus::createBitCast(packed, Int2::getType()));
-	}
+	storeValue(Nucleus::createBitCast(packed, Int2::getType()));
+}
 
-	RValue<Int2> Int2::operator=(RValue<Int2> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Int2> Int2::operator=(RValue<Int2> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Int2> Int2::operator=(const Int2 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Int2> Int2::operator=(const Int2 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Int2>(value);
-	}
+	return RValue<Int2>(value);
+}
 
-	RValue<Int2> Int2::operator=(const Reference<Int2> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Int2> Int2::operator=(const Reference<Int2> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Int2>(value);
-	}
+	return RValue<Int2>(value);
+}
 
-	RValue<Int2> operator+(RValue<Int2> lhs, RValue<Int2> rhs)
-	{
-		return RValue<Int2>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Int2> operator+(RValue<Int2> lhs, RValue<Int2> rhs)
+{
+	return RValue<Int2>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<Int2> operator-(RValue<Int2> lhs, RValue<Int2> rhs)
-	{
-		return RValue<Int2>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<Int2> operator-(RValue<Int2> lhs, RValue<Int2> rhs)
+{
+	return RValue<Int2>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
 //	RValue<Int2> operator*(RValue<Int2> lhs, RValue<Int2> rhs)
 //	{
@@ -3010,30 +3011,30 @@
 //		return RValue<Int2>(Nucleus::createSRem(lhs.value, rhs.value));
 //	}
 
-	RValue<Int2> operator&(RValue<Int2> lhs, RValue<Int2> rhs)
-	{
-		return RValue<Int2>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Int2> operator&(RValue<Int2> lhs, RValue<Int2> rhs)
+{
+	return RValue<Int2>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Int2> operator|(RValue<Int2> lhs, RValue<Int2> rhs)
-	{
-		return RValue<Int2>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<Int2> operator|(RValue<Int2> lhs, RValue<Int2> rhs)
+{
+	return RValue<Int2>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<Int2> operator^(RValue<Int2> lhs, RValue<Int2> rhs)
-	{
-		return RValue<Int2>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<Int2> operator^(RValue<Int2> lhs, RValue<Int2> rhs)
+{
+	return RValue<Int2>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<Int2> operator+=(Int2 &lhs, RValue<Int2> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<Int2> operator+=(Int2 &lhs, RValue<Int2> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<Int2> operator-=(Int2 &lhs, RValue<Int2> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<Int2> operator-=(Int2 &lhs, RValue<Int2> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
 //	RValue<Int2> operator*=(Int2 &lhs, RValue<Int2> rhs)
 //	{
@@ -3050,30 +3051,30 @@
 //		return lhs = lhs % rhs;
 //	}
 
-	RValue<Int2> operator&=(Int2 &lhs, RValue<Int2> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<Int2> operator&=(Int2 &lhs, RValue<Int2> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<Int2> operator|=(Int2 &lhs, RValue<Int2> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<Int2> operator|=(Int2 &lhs, RValue<Int2> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<Int2> operator^=(Int2 &lhs, RValue<Int2> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<Int2> operator^=(Int2 &lhs, RValue<Int2> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<Int2> operator<<=(Int2 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<Int2> operator<<=(Int2 &lhs, unsigned char rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<Int2> operator>>=(Int2 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<Int2> operator>>=(Int2 &lhs, unsigned char rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
 //	RValue<Int2> operator+(RValue<Int2> val)
 //	{
@@ -3085,89 +3086,89 @@
 //		return RValue<Int2>(Nucleus::createNeg(val.value));
 //	}
 
-	RValue<Int2> operator~(RValue<Int2> val)
-	{
-		return RValue<Int2>(Nucleus::createNot(val.value));
-	}
+RValue<Int2> operator~(RValue<Int2> val)
+{
+	return RValue<Int2>(Nucleus::createNot(val.value));
+}
 
-	RValue<Short4> UnpackLow(RValue<Int2> x, RValue<Int2> y)
-	{
-		int shuffle[4] = {0, 4, 1, 5};   // Real type is v4i32
-		return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-	}
+RValue<Short4> UnpackLow(RValue<Int2> x, RValue<Int2> y)
+{
+	int shuffle[4] = {0, 4, 1, 5};   // Real type is v4i32
+	return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+}
 
-	RValue<Short4> UnpackHigh(RValue<Int2> x, RValue<Int2> y)
-	{
-		int shuffle[4] = {0, 4, 1, 5};   // Real type is v4i32
-		auto lowHigh = RValue<Int4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-		return As<Short4>(Swizzle(lowHigh, 0x2323));
-	}
+RValue<Short4> UnpackHigh(RValue<Int2> x, RValue<Int2> y)
+{
+	int shuffle[4] = {0, 4, 1, 5};   // Real type is v4i32
+	auto lowHigh = RValue<Int4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+	return As<Short4>(Swizzle(lowHigh, 0x2323));
+}
 
-	RValue<Int> Extract(RValue<Int2> val, int i)
-	{
-		return RValue<Int>(Nucleus::createExtractElement(val.value, Int::getType(), i));
-	}
+RValue<Int> Extract(RValue<Int2> val, int i)
+{
+	return RValue<Int>(Nucleus::createExtractElement(val.value, Int::getType(), i));
+}
 
-	RValue<Int2> Insert(RValue<Int2> val, RValue<Int> element, int i)
-	{
-		return RValue<Int2>(Nucleus::createInsertElement(val.value, element.value, i));
-	}
+RValue<Int2> Insert(RValue<Int2> val, RValue<Int> element, int i)
+{
+	return RValue<Int2>(Nucleus::createInsertElement(val.value, element.value, i));
+}
 
-	UInt2::UInt2(unsigned int x, unsigned int y)
-	{
-		int64_t constantVector[2] = {x, y};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+UInt2::UInt2(unsigned int x, unsigned int y)
+{
+	int64_t constantVector[2] = {x, y};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	UInt2::UInt2(RValue<UInt2> rhs)
-	{
-		storeValue(rhs.value);
-	}
+UInt2::UInt2(RValue<UInt2> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	UInt2::UInt2(const UInt2 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt2::UInt2(const UInt2 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UInt2::UInt2(const Reference<UInt2> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt2::UInt2(const Reference<UInt2> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<UInt2> UInt2::operator=(RValue<UInt2> rhs)
-	{
-		storeValue(rhs.value);
+RValue<UInt2> UInt2::operator=(RValue<UInt2> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<UInt2> UInt2::operator=(const UInt2 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UInt2> UInt2::operator=(const UInt2 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UInt2>(value);
-	}
+	return RValue<UInt2>(value);
+}
 
-	RValue<UInt2> UInt2::operator=(const Reference<UInt2> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UInt2> UInt2::operator=(const Reference<UInt2> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UInt2>(value);
-	}
+	return RValue<UInt2>(value);
+}
 
-	RValue<UInt2> operator+(RValue<UInt2> lhs, RValue<UInt2> rhs)
-	{
-		return RValue<UInt2>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<UInt2> operator+(RValue<UInt2> lhs, RValue<UInt2> rhs)
+{
+	return RValue<UInt2>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<UInt2> operator-(RValue<UInt2> lhs, RValue<UInt2> rhs)
-	{
-		return RValue<UInt2>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<UInt2> operator-(RValue<UInt2> lhs, RValue<UInt2> rhs)
+{
+	return RValue<UInt2>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
 //	RValue<UInt2> operator*(RValue<UInt2> lhs, RValue<UInt2> rhs)
 //	{
@@ -3184,30 +3185,30 @@
 //		return RValue<UInt2>(Nucleus::createURem(lhs.value, rhs.value));
 //	}
 
-	RValue<UInt2> operator&(RValue<UInt2> lhs, RValue<UInt2> rhs)
-	{
-		return RValue<UInt2>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<UInt2> operator&(RValue<UInt2> lhs, RValue<UInt2> rhs)
+{
+	return RValue<UInt2>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<UInt2> operator|(RValue<UInt2> lhs, RValue<UInt2> rhs)
-	{
-		return RValue<UInt2>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<UInt2> operator|(RValue<UInt2> lhs, RValue<UInt2> rhs)
+{
+	return RValue<UInt2>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<UInt2> operator^(RValue<UInt2> lhs, RValue<UInt2> rhs)
-	{
-		return RValue<UInt2>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<UInt2> operator^(RValue<UInt2> lhs, RValue<UInt2> rhs)
+{
+	return RValue<UInt2>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<UInt2> operator+=(UInt2 &lhs, RValue<UInt2> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<UInt2> operator+=(UInt2 &lhs, RValue<UInt2> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<UInt2> operator-=(UInt2 &lhs, RValue<UInt2> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<UInt2> operator-=(UInt2 &lhs, RValue<UInt2> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
 //	RValue<UInt2> operator*=(UInt2 &lhs, RValue<UInt2> rhs)
 //	{
@@ -3224,30 +3225,30 @@
 //		return lhs = lhs % rhs;
 //	}
 
-	RValue<UInt2> operator&=(UInt2 &lhs, RValue<UInt2> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<UInt2> operator&=(UInt2 &lhs, RValue<UInt2> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<UInt2> operator|=(UInt2 &lhs, RValue<UInt2> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<UInt2> operator|=(UInt2 &lhs, RValue<UInt2> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<UInt2> operator^=(UInt2 &lhs, RValue<UInt2> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<UInt2> operator^=(UInt2 &lhs, RValue<UInt2> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<UInt2> operator<<=(UInt2 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<UInt2> operator<<=(UInt2 &lhs, unsigned char rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<UInt2> operator>>=(UInt2 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<UInt2> operator>>=(UInt2 &lhs, unsigned char rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
 //	RValue<UInt2> operator+(RValue<UInt2> val)
 //	{
@@ -3259,197 +3260,197 @@
 //		return RValue<UInt2>(Nucleus::createNeg(val.value));
 //	}
 
-	RValue<UInt2> operator~(RValue<UInt2> val)
-	{
-		return RValue<UInt2>(Nucleus::createNot(val.value));
-	}
+RValue<UInt2> operator~(RValue<UInt2> val)
+{
+	return RValue<UInt2>(Nucleus::createNot(val.value));
+}
 
-	RValue<UInt> Extract(RValue<UInt2> val, int i)
-	{
-		return RValue<UInt>(Nucleus::createExtractElement(val.value, UInt::getType(), i));
-	}
+RValue<UInt> Extract(RValue<UInt2> val, int i)
+{
+	return RValue<UInt>(Nucleus::createExtractElement(val.value, UInt::getType(), i));
+}
 
-	RValue<UInt2> Insert(RValue<UInt2> val, RValue<UInt> element, int i)
-	{
-		return RValue<UInt2>(Nucleus::createInsertElement(val.value, element.value, i));
-	}
+RValue<UInt2> Insert(RValue<UInt2> val, RValue<UInt> element, int i)
+{
+	return RValue<UInt2>(Nucleus::createInsertElement(val.value, element.value, i));
+}
 
-	Int4::Int4() : XYZW(this)
-	{
-	}
+Int4::Int4() : XYZW(this)
+{
+}
 
-	Int4::Int4(RValue<Float4> cast) : XYZW(this)
-	{
-		Value *xyzw = Nucleus::createFPToSI(cast.value, Int4::getType());
+Int4::Int4(RValue<Float4> cast) : XYZW(this)
+{
+	Value *xyzw = Nucleus::createFPToSI(cast.value, Int4::getType());
 
-		storeValue(xyzw);
-	}
+	storeValue(xyzw);
+}
 
-	Int4::Int4(int xyzw) : XYZW(this)
-	{
-		constant(xyzw, xyzw, xyzw, xyzw);
-	}
+Int4::Int4(int xyzw) : XYZW(this)
+{
+	constant(xyzw, xyzw, xyzw, xyzw);
+}
 
-	Int4::Int4(int x, int yzw) : XYZW(this)
-	{
-		constant(x, yzw, yzw, yzw);
-	}
+Int4::Int4(int x, int yzw) : XYZW(this)
+{
+	constant(x, yzw, yzw, yzw);
+}
 
-	Int4::Int4(int x, int y, int zw) : XYZW(this)
-	{
-		constant(x, y, zw, zw);
-	}
+Int4::Int4(int x, int y, int zw) : XYZW(this)
+{
+	constant(x, y, zw, zw);
+}
 
-	Int4::Int4(int x, int y, int z, int w) : XYZW(this)
-	{
-		constant(x, y, z, w);
-	}
+Int4::Int4(int x, int y, int z, int w) : XYZW(this)
+{
+	constant(x, y, z, w);
+}
 
-	void Int4::constant(int x, int y, int z, int w)
-	{
-		int64_t constantVector[4] = {x, y, z, w};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+void Int4::constant(int x, int y, int z, int w)
+{
+	int64_t constantVector[4] = {x, y, z, w};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	Int4::Int4(RValue<Int4> rhs) : XYZW(this)
-	{
-		storeValue(rhs.value);
-	}
+Int4::Int4(RValue<Int4> rhs) : XYZW(this)
+{
+	storeValue(rhs.value);
+}
 
-	Int4::Int4(const Int4 &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int4::Int4(const Int4 &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int4::Int4(const Reference<Int4> &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int4::Int4(const Reference<Int4> &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int4::Int4(RValue<UInt4> rhs) : XYZW(this)
-	{
-		storeValue(rhs.value);
-	}
+Int4::Int4(RValue<UInt4> rhs) : XYZW(this)
+{
+	storeValue(rhs.value);
+}
 
-	Int4::Int4(const UInt4 &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int4::Int4(const UInt4 &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int4::Int4(const Reference<UInt4> &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int4::Int4(const Reference<UInt4> &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int4::Int4(RValue<Int2> lo, RValue<Int2> hi) : XYZW(this)
-	{
-		int shuffle[4] = {0, 1, 4, 5};   // Real type is v4i32
-		Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
+Int4::Int4(RValue<Int2> lo, RValue<Int2> hi) : XYZW(this)
+{
+	int shuffle[4] = {0, 1, 4, 5};   // Real type is v4i32
+	Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
 
-		storeValue(packed);
-	}
+	storeValue(packed);
+}
 
-	Int4::Int4(const Int &rhs) : XYZW(this)
-	{
-		*this = RValue<Int>(rhs.loadValue());
-	}
+Int4::Int4(const Int &rhs) : XYZW(this)
+{
+	*this = RValue<Int>(rhs.loadValue());
+}
 
-	Int4::Int4(const Reference<Int> &rhs) : XYZW(this)
-	{
-		*this = RValue<Int>(rhs.loadValue());
-	}
+Int4::Int4(const Reference<Int> &rhs) : XYZW(this)
+{
+	*this = RValue<Int>(rhs.loadValue());
+}
 
-	RValue<Int4> Int4::operator=(RValue<Int4> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Int4> Int4::operator=(RValue<Int4> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Int4> Int4::operator=(const Int4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Int4> Int4::operator=(const Int4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Int4>(value);
-	}
+	return RValue<Int4>(value);
+}
 
-	RValue<Int4> Int4::operator=(const Reference<Int4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Int4> Int4::operator=(const Reference<Int4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Int4>(value);
-	}
+	return RValue<Int4>(value);
+}
 
-	RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator-(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<Int4> operator-(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator*(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<Int4> operator*(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator/(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createSDiv(lhs.value, rhs.value));
-	}
+RValue<Int4> operator/(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createSDiv(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator%(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createSRem(lhs.value, rhs.value));
-	}
+RValue<Int4> operator%(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createSRem(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator&(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Int4> operator&(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator|(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<Int4> operator|(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator^(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<Int4> operator^(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator<<(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createShl(lhs.value, rhs.value));
-	}
+RValue<Int4> operator<<(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createShl(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator>>(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createAShr(lhs.value, rhs.value));
-	}
+RValue<Int4> operator>>(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createAShr(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator+=(Int4 &lhs, RValue<Int4> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<Int4> operator+=(Int4 &lhs, RValue<Int4> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<Int4> operator-=(Int4 &lhs, RValue<Int4> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<Int4> operator-=(Int4 &lhs, RValue<Int4> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<Int4> operator*=(Int4 &lhs, RValue<Int4> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<Int4> operator*=(Int4 &lhs, RValue<Int4> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
 //	RValue<Int4> operator/=(Int4 &lhs, RValue<Int4> rhs)
 //	{
@@ -3461,235 +3462,235 @@
 //		return lhs = lhs % rhs;
 //	}
 
-	RValue<Int4> operator&=(Int4 &lhs, RValue<Int4> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<Int4> operator&=(Int4 &lhs, RValue<Int4> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<Int4> operator|=(Int4 &lhs, RValue<Int4> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<Int4> operator|=(Int4 &lhs, RValue<Int4> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<Int4> operator^=(Int4 &lhs, RValue<Int4> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<Int4> operator^=(Int4 &lhs, RValue<Int4> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<Int4> operator<<=(Int4 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<Int4> operator<<=(Int4 &lhs, unsigned char rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<Int4> operator>>=(Int4 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<Int4> operator>>=(Int4 &lhs, unsigned char rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
-	RValue<Int4> operator+(RValue<Int4> val)
-	{
-		return val;
-	}
+RValue<Int4> operator+(RValue<Int4> val)
+{
+	return val;
+}
 
-	RValue<Int4> operator-(RValue<Int4> val)
-	{
-		return RValue<Int4>(Nucleus::createNeg(val.value));
-	}
+RValue<Int4> operator-(RValue<Int4> val)
+{
+	return RValue<Int4>(Nucleus::createNeg(val.value));
+}
 
-	RValue<Int4> operator~(RValue<Int4> val)
-	{
-		return RValue<Int4>(Nucleus::createNot(val.value));
-	}
+RValue<Int4> operator~(RValue<Int4> val)
+{
+	return RValue<Int4>(Nucleus::createNot(val.value));
+}
 
-	RValue<Int> Extract(RValue<Int4> x, int i)
-	{
-		return RValue<Int>(Nucleus::createExtractElement(x.value, Int::getType(), i));
-	}
+RValue<Int> Extract(RValue<Int4> x, int i)
+{
+	return RValue<Int>(Nucleus::createExtractElement(x.value, Int::getType(), i));
+}
 
-	RValue<Int4> Insert(RValue<Int4> x, RValue<Int> element, int i)
-	{
-		return RValue<Int4>(Nucleus::createInsertElement(x.value, element.value, i));
-	}
+RValue<Int4> Insert(RValue<Int4> x, RValue<Int> element, int i)
+{
+	return RValue<Int4>(Nucleus::createInsertElement(x.value, element.value, i));
+}
 
-	RValue<Int4> Swizzle(RValue<Int4> x, uint16_t select)
-	{
-		return RValue<Int4>(createSwizzle4(x.value, select));
-	}
+RValue<Int4> Swizzle(RValue<Int4> x, uint16_t select)
+{
+	return RValue<Int4>(createSwizzle4(x.value, select));
+}
 
-	RValue<Int4> Shuffle(RValue<Int4> x, RValue<Int4> y, unsigned short select)
-	{
-		return RValue<Int4>(createBlend4(x.value, y.value, select));
-	}
+RValue<Int4> Shuffle(RValue<Int4> x, RValue<Int4> y, unsigned short select)
+{
+	return RValue<Int4>(createBlend4(x.value, y.value, select));
+}
 
-	UInt4::UInt4() : XYZW(this)
-	{
-	}
+UInt4::UInt4() : XYZW(this)
+{
+}
 
-	UInt4::UInt4(int xyzw) : XYZW(this)
-	{
-		constant(xyzw, xyzw, xyzw, xyzw);
-	}
+UInt4::UInt4(int xyzw) : XYZW(this)
+{
+	constant(xyzw, xyzw, xyzw, xyzw);
+}
 
-	UInt4::UInt4(int x, int yzw) : XYZW(this)
-	{
-		constant(x, yzw, yzw, yzw);
-	}
+UInt4::UInt4(int x, int yzw) : XYZW(this)
+{
+	constant(x, yzw, yzw, yzw);
+}
 
-	UInt4::UInt4(int x, int y, int zw) : XYZW(this)
-	{
-		constant(x, y, zw, zw);
-	}
+UInt4::UInt4(int x, int y, int zw) : XYZW(this)
+{
+	constant(x, y, zw, zw);
+}
 
-	UInt4::UInt4(int x, int y, int z, int w) : XYZW(this)
-	{
-		constant(x, y, z, w);
-	}
+UInt4::UInt4(int x, int y, int z, int w) : XYZW(this)
+{
+	constant(x, y, z, w);
+}
 
-	void UInt4::constant(int x, int y, int z, int w)
-	{
-		int64_t constantVector[4] = {x, y, z, w};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+void UInt4::constant(int x, int y, int z, int w)
+{
+	int64_t constantVector[4] = {x, y, z, w};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	UInt4::UInt4(RValue<UInt4> rhs) : XYZW(this)
-	{
-		storeValue(rhs.value);
-	}
+UInt4::UInt4(RValue<UInt4> rhs) : XYZW(this)
+{
+	storeValue(rhs.value);
+}
 
-	UInt4::UInt4(const UInt4 &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt4::UInt4(const UInt4 &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UInt4::UInt4(const Reference<UInt4> &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt4::UInt4(const Reference<UInt4> &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UInt4::UInt4(RValue<Int4> rhs) : XYZW(this)
-	{
-		storeValue(rhs.value);
-	}
+UInt4::UInt4(RValue<Int4> rhs) : XYZW(this)
+{
+	storeValue(rhs.value);
+}
 
-	UInt4::UInt4(const Int4 &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt4::UInt4(const Int4 &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UInt4::UInt4(const Reference<Int4> &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt4::UInt4(const Reference<Int4> &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UInt4::UInt4(RValue<UInt2> lo, RValue<UInt2> hi) : XYZW(this)
-	{
-		int shuffle[4] = {0, 1, 4, 5};   // Real type is v4i32
-		Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
+UInt4::UInt4(RValue<UInt2> lo, RValue<UInt2> hi) : XYZW(this)
+{
+	int shuffle[4] = {0, 1, 4, 5};   // Real type is v4i32
+	Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
 
-		storeValue(packed);
-	}
+	storeValue(packed);
+}
 
-	UInt4::UInt4(const UInt &rhs) : XYZW(this)
-	{
-		*this = RValue<UInt>(rhs.loadValue());
-	}
+UInt4::UInt4(const UInt &rhs) : XYZW(this)
+{
+	*this = RValue<UInt>(rhs.loadValue());
+}
 
-	UInt4::UInt4(const Reference<UInt> &rhs) : XYZW(this)
-	{
-		*this = RValue<UInt>(rhs.loadValue());
-	}
+UInt4::UInt4(const Reference<UInt> &rhs) : XYZW(this)
+{
+	*this = RValue<UInt>(rhs.loadValue());
+}
 
-	RValue<UInt4> UInt4::operator=(RValue<UInt4> rhs)
-	{
-		storeValue(rhs.value);
+RValue<UInt4> UInt4::operator=(RValue<UInt4> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<UInt4> UInt4::operator=(const UInt4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UInt4> UInt4::operator=(const UInt4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UInt4>(value);
-	}
+	return RValue<UInt4>(value);
+}
 
-	RValue<UInt4> UInt4::operator=(const Reference<UInt4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UInt4> UInt4::operator=(const Reference<UInt4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UInt4>(value);
-	}
+	return RValue<UInt4>(value);
+}
 
-	RValue<UInt4> operator+(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator+(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator-(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator-(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator*(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator*(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator/(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createUDiv(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator/(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createUDiv(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator%(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createURem(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator%(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createURem(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator&(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator&(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator|(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator|(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator^(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator^(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator<<(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createShl(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator<<(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createShl(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator>>(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createLShr(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator>>(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createLShr(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator+=(UInt4 &lhs, RValue<UInt4> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<UInt4> operator+=(UInt4 &lhs, RValue<UInt4> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<UInt4> operator-=(UInt4 &lhs, RValue<UInt4> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<UInt4> operator-=(UInt4 &lhs, RValue<UInt4> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<UInt4> operator*=(UInt4 &lhs, RValue<UInt4> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<UInt4> operator*=(UInt4 &lhs, RValue<UInt4> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
 //	RValue<UInt4> operator/=(UInt4 &lhs, RValue<UInt4> rhs)
 //	{
@@ -3701,722 +3702,723 @@
 //		return lhs = lhs % rhs;
 //	}
 
-	RValue<UInt4> operator&=(UInt4 &lhs, RValue<UInt4> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
-
-	RValue<UInt4> operator|=(UInt4 &lhs, RValue<UInt4> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
-
-	RValue<UInt4> operator^=(UInt4 &lhs, RValue<UInt4> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
-
-	RValue<UInt4> operator<<=(UInt4 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs << rhs;
-	}
-
-	RValue<UInt4> operator>>=(UInt4 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
-
-	RValue<UInt4> operator+(RValue<UInt4> val)
-	{
-		return val;
-	}
-
-	RValue<UInt4> operator-(RValue<UInt4> val)
-	{
-		return RValue<UInt4>(Nucleus::createNeg(val.value));
-	}
-
-	RValue<UInt4> operator~(RValue<UInt4> val)
-	{
-		return RValue<UInt4>(Nucleus::createNot(val.value));
-	}
-
-	RValue<UInt> Extract(RValue<UInt4> x, int i)
-	{
-		return RValue<UInt>(Nucleus::createExtractElement(x.value, Int::getType(), i));
-	}
-
-	RValue<UInt4> Insert(RValue<UInt4> x, RValue<UInt> element, int i)
-	{
-		return RValue<UInt4>(Nucleus::createInsertElement(x.value, element.value, i));
-	}
-
-	RValue<UInt4> Swizzle(RValue<UInt4> x, uint16_t select)
-	{
-		return RValue<UInt4>(createSwizzle4(x.value, select));
-	}
-
-	RValue<UInt4> Shuffle(RValue<UInt4> x, RValue<UInt4> y, unsigned short select)
-	{
-		return RValue<UInt4>(createBlend4(x.value, y.value, select));
-	}
-
-	Half::Half(RValue<Float> cast)
-	{
-		UInt fp32i = As<UInt>(cast);
-		UInt abs = fp32i & 0x7FFFFFFF;
-		UShort fp16i((fp32i & 0x80000000) >> 16); // sign
-
-		If(abs > 0x47FFEFFF) // Infinity
-		{
-			fp16i |= UShort(0x7FFF);
-		}
-		Else
-		{
-			If(abs < 0x38800000) // Denormal
-			{
-				Int mantissa = (abs & 0x007FFFFF) | 0x00800000;
-				Int e = 113 - (abs >> 23);
-				abs = IfThenElse(e < 24, mantissa >> e, Int(0));
-				fp16i |= UShort((abs + 0x00000FFF + ((abs >> 13) & 1)) >> 13);
-			}
-			Else
-			{
-				fp16i |= UShort((abs + 0xC8000000 + 0x00000FFF + ((abs >> 13) & 1)) >> 13);
-			}
-		}
-
-		storeValue(fp16i.loadValue());
-	}
-
-	Float::Float(RValue<Int> cast)
-	{
-		Value *integer = Nucleus::createSIToFP(cast.value, Float::getType());
-
-		storeValue(integer);
-	}
-
-	Float::Float(RValue<UInt> cast)
-	{
-		RValue<Float> result = Float(Int(cast & UInt(0x7FFFFFFF))) +
-		                       As<Float>((As<Int>(cast) >> 31) & As<Int>(Float(0x80000000u)));
-
-		storeValue(result.value);
-	}
-
-	Float::Float(RValue<Half> cast)
-	{
-		Int fp16i(As<UShort>(cast));
-
-		Int s = (fp16i >> 15) & 0x00000001;
-		Int e = (fp16i >> 10) & 0x0000001F;
-		Int m = fp16i & 0x000003FF;
-
-		UInt fp32i(s << 31);
-		If(e == 0)
-		{
-			If(m != 0)
-			{
-				While((m & 0x00000400) == 0)
-				{
-					m <<= 1;
-					e -= 1;
-				}
-
-				fp32i |= As<UInt>(((e + (127 - 15) + 1) << 23) | ((m & ~0x00000400) << 13));
-			}
-		}
-		Else
-		{
-			fp32i |= As<UInt>(((e + (127 - 15)) << 23) | (m << 13));
-		}
-
-		storeValue(As<Float>(fp32i).value);
-	}
-
-	Float::Float(float x)
-	{
-		// C++ does not have a way to write an infinite or NaN literal,
-		// nor does it allow division by zero as a constant expression.
-		// Thus we should not accept inf or NaN as a Reactor Float constant,
-		// as this would typically idicate a bug, and avoids undefined
-		// behavior.
-		//
-		// This also prevents the issue of the LLVM JIT only taking double
-		// values for constructing floating-point constants. During the
-		// conversion from single-precision to double, a signaling NaN can
-		// become a quiet NaN, thus altering its bit pattern. Hence this
-		// assert is also helpful for detecting cases where integers are
-		// being reinterpreted as float and then bitcast to integer again,
-		// which does not guarantee preserving the integer value.
-		//
-		// Should infinity and NaN constants be required, methods like
-		// infinity(), quiet_NaN(), and signaling_NaN() should be added
-		// to the Float class.
-		ASSERT(std::isfinite(x));
-
-		storeValue(Nucleus::createConstantFloat(x));
-	}
-
-	Float::Float(RValue<Float> rhs)
-	{
-		storeValue(rhs.value);
-	}
-
-	Float::Float(const Float &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
-
-	Float::Float(const Reference<Float> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
-
-	Float::Float(Argument<Float> argument)
-	{
-		storeValue(argument.value);
-	}
-
-	RValue<Float> Float::operator=(RValue<Float> rhs)
-	{
-		storeValue(rhs.value);
-
-		return rhs;
-	}
-
-	RValue<Float> Float::operator=(const Float &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-
-		return RValue<Float>(value);
-	}
-
-	RValue<Float> Float::operator=(const Reference<Float> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-
-		return RValue<Float>(value);
-	}
-
-	RValue<Float> operator+(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Float>(Nucleus::createFAdd(lhs.value, rhs.value));
-	}
-
-	RValue<Float> operator-(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Float>(Nucleus::createFSub(lhs.value, rhs.value));
-	}
-
-	RValue<Float> operator*(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Float>(Nucleus::createFMul(lhs.value, rhs.value));
-	}
-
-	RValue<Float> operator/(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Float>(Nucleus::createFDiv(lhs.value, rhs.value));
-	}
-
-	RValue<Float> operator+=(Float &lhs, RValue<Float> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
-
-	RValue<Float> operator-=(Float &lhs, RValue<Float> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
-
-	RValue<Float> operator*=(Float &lhs, RValue<Float> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
-
-	RValue<Float> operator/=(Float &lhs, RValue<Float> rhs)
-	{
-		return lhs = lhs / rhs;
-	}
-
-	RValue<Float> operator+(RValue<Float> val)
-	{
-		return val;
-	}
-
-	RValue<Float> operator-(RValue<Float> val)
-	{
-		return RValue<Float>(Nucleus::createFNeg(val.value));
-	}
-
-	RValue<Bool> operator<(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Bool>(Nucleus::createFCmpOLT(lhs.value, rhs.value));
-	}
-
-	RValue<Bool> operator<=(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Bool>(Nucleus::createFCmpOLE(lhs.value, rhs.value));
-	}
-
-	RValue<Bool> operator>(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Bool>(Nucleus::createFCmpOGT(lhs.value, rhs.value));
-	}
-
-	RValue<Bool> operator>=(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Bool>(Nucleus::createFCmpOGE(lhs.value, rhs.value));
-	}
-
-	RValue<Bool> operator!=(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Bool>(Nucleus::createFCmpONE(lhs.value, rhs.value));
-	}
-
-	RValue<Bool> operator==(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Bool>(Nucleus::createFCmpOEQ(lhs.value, rhs.value));
-	}
-
-	RValue<Float> Abs(RValue<Float> x)
-	{
-		return IfThenElse(x > 0.0f, x, -x);
-	}
-
-	RValue<Float> Max(RValue<Float> x, RValue<Float> y)
-	{
-		return IfThenElse(x > y, x, y);
-	}
-
-	RValue<Float> Min(RValue<Float> x, RValue<Float> y)
-	{
-		return IfThenElse(x < y, x, y);
-	}
-
-	Float2::Float2(RValue<Float4> cast)
-	{
-		storeValue(Nucleus::createBitCast(cast.value, getType()));
-	}
-
-	Float4::Float4(RValue<Byte4> cast) : XYZW(this)
-	{
-		Value *a = Int4(cast).loadValue();
-		Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
-
-		storeValue(xyzw);
-	}
-
-	Float4::Float4(RValue<SByte4> cast) : XYZW(this)
-	{
-		Value *a = Int4(cast).loadValue();
-		Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
-
-		storeValue(xyzw);
-	}
-
-	Float4::Float4(RValue<Short4> cast) : XYZW(this)
-	{
-		Int4 c(cast);
-		storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
-	}
-
-	Float4::Float4(RValue<UShort4> cast) : XYZW(this)
-	{
-		Int4 c(cast);
-		storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
-	}
-
-	Float4::Float4(RValue<Int4> cast) : XYZW(this)
-	{
-		Value *xyzw = Nucleus::createSIToFP(cast.value, Float4::getType());
-
-		storeValue(xyzw);
-	}
-
-	Float4::Float4(RValue<UInt4> cast) : XYZW(this)
-	{
-		RValue<Float4> result = Float4(Int4(cast & UInt4(0x7FFFFFFF))) +
-		                        As<Float4>((As<Int4>(cast) >> 31) & As<Int4>(Float4(0x80000000u)));
-
-		storeValue(result.value);
-	}
-
-	Float4::Float4() : XYZW(this)
-	{
-	}
-
-	Float4::Float4(float xyzw) : XYZW(this)
-	{
-		constant(xyzw, xyzw, xyzw, xyzw);
-	}
-
-	Float4::Float4(float x, float yzw) : XYZW(this)
-	{
-		constant(x, yzw, yzw, yzw);
-	}
-
-	Float4::Float4(float x, float y, float zw) : XYZW(this)
-	{
-		constant(x, y, zw, zw);
-	}
-
-	Float4::Float4(float x, float y, float z, float w) : XYZW(this)
-	{
-		constant(x, y, z, w);
-	}
-
-	Float4 Float4::positive_inf()
-	{
-		Float4 result;
-		result.infinity_constant(false);
-		return result;
-	}
-
-	Float4 Float4::negative_inf()
-	{
-		Float4 result;
-		result.infinity_constant(true);
-		return result;
-	}
-
-	void Float4::infinity_constant(bool negative)
-	{
-		double inf = negative ? -INFINITY : INFINITY;
-		double constantVector[4] = {inf, inf, inf, inf};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
-
-	void Float4::constant(float x, float y, float z, float w)
-	{
-		// See Float(float) constructor for the rationale behind this assert.
-		ASSERT(std::isfinite(x) && std::isfinite(y) && std::isfinite(z) && std::isfinite(w));
-
-		double constantVector[4] = {x, y, z, w};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
-
-	Float4::Float4(RValue<Float4> rhs) : XYZW(this)
-	{
-		storeValue(rhs.value);
-	}
-
-	Float4::Float4(const Float4 &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
-
-	Float4::Float4(const Reference<Float4> &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
-
-	Float4::Float4(const Float &rhs) : XYZW(this)
-	{
-		*this = RValue<Float>(rhs.loadValue());
-	}
-
-	Float4::Float4(const Reference<Float> &rhs) : XYZW(this)
-	{
-		*this = RValue<Float>(rhs.loadValue());
-	}
-
-	RValue<Float4> Float4::operator=(float x)
-	{
-		return *this = Float4(x, x, x, x);
-	}
-
-	RValue<Float4> Float4::operator=(RValue<Float4> rhs)
-	{
-		storeValue(rhs.value);
-
-		return rhs;
-	}
-
-	RValue<Float4> Float4::operator=(const Float4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-
-		return RValue<Float4>(value);
-	}
-
-	RValue<Float4> Float4::operator=(const Reference<Float4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-
-		return RValue<Float4>(value);
-	}
-
-	RValue<Float4> Float4::operator=(RValue<Float> rhs)
-	{
-		return *this = Float4(rhs);
-	}
-
-	RValue<Float4> Float4::operator=(const Float &rhs)
-	{
-		return *this = Float4(rhs);
-	}
-
-	RValue<Float4> Float4::operator=(const Reference<Float> &rhs)
-	{
-		return *this = Float4(rhs);
-	}
-
-	RValue<Float4> operator+(RValue<Float4> lhs, RValue<Float4> rhs)
-	{
-		return RValue<Float4>(Nucleus::createFAdd(lhs.value, rhs.value));
-	}
-
-	RValue<Float4> operator-(RValue<Float4> lhs, RValue<Float4> rhs)
-	{
-		return RValue<Float4>(Nucleus::createFSub(lhs.value, rhs.value));
-	}
-
-	RValue<Float4> operator*(RValue<Float4> lhs, RValue<Float4> rhs)
-	{
-		return RValue<Float4>(Nucleus::createFMul(lhs.value, rhs.value));
-	}
-
-	RValue<Float4> operator/(RValue<Float4> lhs, RValue<Float4> rhs)
-	{
-		return RValue<Float4>(Nucleus::createFDiv(lhs.value, rhs.value));
-	}
-
-	RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
-	{
-		return RValue<Float4>(Nucleus::createFRem(lhs.value, rhs.value));
-	}
-
-	RValue<Float4> operator+=(Float4 &lhs, RValue<Float4> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
-
-	RValue<Float4> operator-=(Float4 &lhs, RValue<Float4> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
-
-	RValue<Float4> operator*=(Float4 &lhs, RValue<Float4> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
-
-	RValue<Float4> operator/=(Float4 &lhs, RValue<Float4> rhs)
-	{
-		return lhs = lhs / rhs;
-	}
-
-	RValue<Float4> operator%=(Float4 &lhs, RValue<Float4> rhs)
-	{
-		return lhs = lhs % rhs;
-	}
-
-	RValue<Float4> operator+(RValue<Float4> val)
-	{
-		return val;
-	}
-
-	RValue<Float4> operator-(RValue<Float4> val)
-	{
-		return RValue<Float4>(Nucleus::createFNeg(val.value));
-	}
-
-	RValue<Float4> Abs(RValue<Float4> x)
-	{
-		// TODO: Optimize.
-		Value *vector = Nucleus::createBitCast(x.value, Int4::getType());
-		int64_t constantVector[4] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
-		Value *result = Nucleus::createAnd(vector, Nucleus::createConstantVector(constantVector, Int4::getType()));
-
-		return As<Float4>(result);
-	}
-
-	RValue<Float4> Insert(RValue<Float4> x, RValue<Float> element, int i)
-	{
-		return RValue<Float4>(Nucleus::createInsertElement(x.value, element.value, i));
-	}
-
-	RValue<Float> Extract(RValue<Float4> x, int i)
-	{
-		return RValue<Float>(Nucleus::createExtractElement(x.value, Float::getType(), i));
-	}
-
-	RValue<Float4> Swizzle(RValue<Float4> x, uint16_t select)
-	{
-		return RValue<Float4>(createSwizzle4(x.value, select));
-	}
-
-	RValue<Float4> Shuffle(RValue<Float4> x, RValue<Float4> y, uint16_t select)
-	{
-		return RValue<Float4>(createBlend4(x.value, y.value, select));
-	}
-
-	RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, uint16_t imm)
-	{
-		int shuffle[4] =
-		{
-			((imm >> 12) & 0x03) + 0,
-			((imm >>  8) & 0x03) + 0,
-			((imm >>  4) & 0x03) + 4,
-			((imm >>  0) & 0x03) + 4,
-		};
-
-		return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-	}
-
-	RValue<Float4> UnpackLow(RValue<Float4> x, RValue<Float4> y)
-	{
-		int shuffle[4] = {0, 4, 1, 5};
-		return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-	}
-
-	RValue<Float4> UnpackHigh(RValue<Float4> x, RValue<Float4> y)
-	{
-		int shuffle[4] = {2, 6, 3, 7};
-		return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-	}
-
-	RValue<Float4> Mask(Float4 &lhs, RValue<Float4> rhs, uint16_t select)
-	{
-		Value *vector = lhs.loadValue();
-		Value *result = createMask4(vector, rhs.value, select);
-		lhs.storeValue(result);
-
-		return RValue<Float4>(result);
-	}
-
-	RValue<Int4> IsInf(RValue<Float4> x)
-	{
-		return CmpEQ(As<Int4>(x) & Int4(0x7FFFFFFF), Int4(0x7F800000));
-	}
-
-	RValue<Int4> IsNan(RValue<Float4> x)
-	{
-		return ~CmpEQ(x, x);
-	}
-
-	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, int offset)
-	{
-		return lhs + RValue<Int>(Nucleus::createConstantInt(offset));
-	}
-
-	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
-	{
-		return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value, false));
-	}
-
-	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
-	{
-		return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value, true));
-	}
-
-	RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, int offset)
-	{
-		return lhs = lhs + offset;
-	}
-
-	RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<Int> offset)
-	{
-		return lhs = lhs + offset;
-	}
-
-	RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<UInt> offset)
-	{
-		return lhs = lhs + offset;
-	}
-
-	RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, int offset)
-	{
-		return lhs + -offset;
-	}
-
-	RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
-	{
-		return lhs + -offset;
-	}
-
-	RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
-	{
-		return lhs + -offset;
-	}
-
-	RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, int offset)
-	{
-		return lhs = lhs - offset;
-	}
-
-	RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<Int> offset)
-	{
-		return lhs = lhs - offset;
-	}
-
-	RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<UInt> offset)
-	{
-		return lhs = lhs - offset;
-	}
-
-	void Return()
-	{
-		Nucleus::createRetVoid();
-		// Place any unreachable instructions in an unreferenced block.
-		Nucleus::setInsertBlock(Nucleus::createBasicBlock());
-	}
-
-	void branch(RValue<Bool> cmp, BasicBlock *bodyBB, BasicBlock *endBB)
-	{
-		Nucleus::createCondBr(cmp.value, bodyBB, endBB);
-		Nucleus::setInsertBlock(bodyBB);
-	}
-
-	RValue<Float4> MaskedLoad(RValue<Pointer<Float4>> base, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-	{
-		return RValue<Float4>(Nucleus::createMaskedLoad(base.value, Float::getType(), mask.value, alignment, zeroMaskedLanes));
-	}
-
-	RValue<Int4> MaskedLoad(RValue<Pointer<Int4>> base, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-	{
-		return RValue<Int4>(Nucleus::createMaskedLoad(base.value, Int::getType(), mask.value, alignment, zeroMaskedLanes));
-	}
-
-	void MaskedStore(RValue<Pointer<Float4>> base, RValue<Float4> val, RValue<Int4> mask, unsigned int alignment)
-	{
-		Nucleus::createMaskedStore(base.value, val.value, mask.value, alignment);
-	}
-
-	void MaskedStore(RValue<Pointer<Int4>> base, RValue<Int4> val, RValue<Int4> mask, unsigned int alignment)
-	{
-		Nucleus::createMaskedStore(base.value, val.value, mask.value, alignment);
-	}
-
-	void Fence(std::memory_order memoryOrder)
-	{
-		ASSERT_MSG(memoryOrder == std::memory_order_acquire ||
-			memoryOrder == std::memory_order_release ||
-			memoryOrder == std::memory_order_acq_rel ||
-			memoryOrder == std::memory_order_seq_cst,
-			"Unsupported memoryOrder: %d", int(memoryOrder));
-		Nucleus::createFence(memoryOrder);
-	}
-
-	Bool          CToReactor<bool>::cast(bool v)               { return type(v); }
-	Byte          CToReactor<uint8_t>::cast(uint8_t v)         { return type(v); }
-	SByte         CToReactor<int8_t>::cast(int8_t v)           { return type(v); }
-	Short         CToReactor<int16_t>::cast(int16_t v)         { return type(v); }
-	UShort        CToReactor<uint16_t>::cast(uint16_t v)       { return type(v); }
-	Int           CToReactor<int32_t>::cast(int32_t v)         { return type(v); }
-	UInt          CToReactor<uint32_t>::cast(uint32_t v)       { return type(v); }
-	Float         CToReactor<float>::cast(float v)             { return type(v); }
-	Float4        CToReactor<float[4]>::cast(float v[4])       { return type(v[0], v[1], v[2], v[3]); }
-
-	// TODO: Long has no constructor that takes a uint64_t
-	// Long          CToReactor<uint64_t>::cast(uint64_t v)       { return type(v); }
+RValue<UInt4> operator&=(UInt4 &lhs, RValue<UInt4> rhs)
+{
+	return lhs = lhs & rhs;
 }
+
+RValue<UInt4> operator|=(UInt4 &lhs, RValue<UInt4> rhs)
+{
+	return lhs = lhs | rhs;
+}
+
+RValue<UInt4> operator^=(UInt4 &lhs, RValue<UInt4> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
+
+RValue<UInt4> operator<<=(UInt4 &lhs, unsigned char rhs)
+{
+	return lhs = lhs << rhs;
+}
+
+RValue<UInt4> operator>>=(UInt4 &lhs, unsigned char rhs)
+{
+	return lhs = lhs >> rhs;
+}
+
+RValue<UInt4> operator+(RValue<UInt4> val)
+{
+	return val;
+}
+
+RValue<UInt4> operator-(RValue<UInt4> val)
+{
+	return RValue<UInt4>(Nucleus::createNeg(val.value));
+}
+
+RValue<UInt4> operator~(RValue<UInt4> val)
+{
+	return RValue<UInt4>(Nucleus::createNot(val.value));
+}
+
+RValue<UInt> Extract(RValue<UInt4> x, int i)
+{
+	return RValue<UInt>(Nucleus::createExtractElement(x.value, Int::getType(), i));
+}
+
+RValue<UInt4> Insert(RValue<UInt4> x, RValue<UInt> element, int i)
+{
+	return RValue<UInt4>(Nucleus::createInsertElement(x.value, element.value, i));
+}
+
+RValue<UInt4> Swizzle(RValue<UInt4> x, uint16_t select)
+{
+	return RValue<UInt4>(createSwizzle4(x.value, select));
+}
+
+RValue<UInt4> Shuffle(RValue<UInt4> x, RValue<UInt4> y, unsigned short select)
+{
+	return RValue<UInt4>(createBlend4(x.value, y.value, select));
+}
+
+Half::Half(RValue<Float> cast)
+{
+	UInt fp32i = As<UInt>(cast);
+	UInt abs = fp32i & 0x7FFFFFFF;
+	UShort fp16i((fp32i & 0x80000000) >> 16); // sign
+
+	If(abs > 0x47FFEFFF) // Infinity
+	{
+		fp16i |= UShort(0x7FFF);
+	}
+	Else
+	{
+		If(abs < 0x38800000) // Denormal
+		{
+			Int mantissa = (abs & 0x007FFFFF) | 0x00800000;
+			Int e = 113 - (abs >> 23);
+			abs = IfThenElse(e < 24, mantissa >> e, Int(0));
+			fp16i |= UShort((abs + 0x00000FFF + ((abs >> 13) & 1)) >> 13);
+		}
+		Else
+		{
+			fp16i |= UShort((abs + 0xC8000000 + 0x00000FFF + ((abs >> 13) & 1)) >> 13);
+		}
+	}
+
+	storeValue(fp16i.loadValue());
+}
+
+Float::Float(RValue<Int> cast)
+{
+	Value *integer = Nucleus::createSIToFP(cast.value, Float::getType());
+
+	storeValue(integer);
+}
+
+Float::Float(RValue<UInt> cast)
+{
+	RValue<Float> result = Float(Int(cast & UInt(0x7FFFFFFF))) +
+	                       As<Float>((As<Int>(cast) >> 31) & As<Int>(Float(0x80000000u)));
+
+	storeValue(result.value);
+}
+
+Float::Float(RValue<Half> cast)
+{
+	Int fp16i(As<UShort>(cast));
+
+	Int s = (fp16i >> 15) & 0x00000001;
+	Int e = (fp16i >> 10) & 0x0000001F;
+	Int m = fp16i & 0x000003FF;
+
+	UInt fp32i(s << 31);
+	If(e == 0)
+	{
+		If(m != 0)
+		{
+			While((m & 0x00000400) == 0)
+			{
+				m <<= 1;
+				e -= 1;
+			}
+
+			fp32i |= As<UInt>(((e + (127 - 15) + 1) << 23) | ((m & ~0x00000400) << 13));
+		}
+	}
+	Else
+	{
+		fp32i |= As<UInt>(((e + (127 - 15)) << 23) | (m << 13));
+	}
+
+	storeValue(As<Float>(fp32i).value);
+}
+
+Float::Float(float x)
+{
+	// C++ does not have a way to write an infinite or NaN literal,
+	// nor does it allow division by zero as a constant expression.
+	// Thus we should not accept inf or NaN as a Reactor Float constant,
+	// as this would typically indicate a bug, and avoids undefined
+	// behavior.
+	//
+	// This also prevents the issue of the LLVM JIT only taking double
+	// values for constructing floating-point constants. During the
+	// conversion from single-precision to double, a signaling NaN can
+	// become a quiet NaN, thus altering its bit pattern. Hence this
+	// assert is also helpful for detecting cases where integers are
+	// being reinterpreted as float and then bitcast to integer again,
+	// which does not guarantee preserving the integer value.
+	//
+	// Should infinity and NaN constants be required, methods like
+	// infinity(), quiet_NaN(), and signaling_NaN() should be added
+	// to the Float class.
+	ASSERT(std::isfinite(x));
+
+	storeValue(Nucleus::createConstantFloat(x));
+}
+
+Float::Float(RValue<Float> rhs)
+{
+	storeValue(rhs.value);
+}
+
+Float::Float(const Float &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
+
+Float::Float(const Reference<Float> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
+
+Float::Float(Argument<Float> argument)
+{
+	storeValue(argument.value);
+}
+
+RValue<Float> Float::operator=(RValue<Float> rhs)
+{
+	storeValue(rhs.value);
+
+	return rhs;
+}
+
+RValue<Float> Float::operator=(const Float &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+
+	return RValue<Float>(value);
+}
+
+RValue<Float> Float::operator=(const Reference<Float> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+
+	return RValue<Float>(value);
+}
+
+RValue<Float> operator+(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Float>(Nucleus::createFAdd(lhs.value, rhs.value));
+}
+
+RValue<Float> operator-(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Float>(Nucleus::createFSub(lhs.value, rhs.value));
+}
+
+RValue<Float> operator*(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Float>(Nucleus::createFMul(lhs.value, rhs.value));
+}
+
+RValue<Float> operator/(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Float>(Nucleus::createFDiv(lhs.value, rhs.value));
+}
+
+RValue<Float> operator+=(Float &lhs, RValue<Float> rhs)
+{
+	return lhs = lhs + rhs;
+}
+
+RValue<Float> operator-=(Float &lhs, RValue<Float> rhs)
+{
+	return lhs = lhs - rhs;
+}
+
+RValue<Float> operator*=(Float &lhs, RValue<Float> rhs)
+{
+	return lhs = lhs * rhs;
+}
+
+RValue<Float> operator/=(Float &lhs, RValue<Float> rhs)
+{
+	return lhs = lhs / rhs;
+}
+
+RValue<Float> operator+(RValue<Float> val)
+{
+	return val;
+}
+
+RValue<Float> operator-(RValue<Float> val)
+{
+	return RValue<Float>(Nucleus::createFNeg(val.value));
+}
+
+RValue<Bool> operator<(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Bool>(Nucleus::createFCmpOLT(lhs.value, rhs.value));
+}
+
+RValue<Bool> operator<=(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Bool>(Nucleus::createFCmpOLE(lhs.value, rhs.value));
+}
+
+RValue<Bool> operator>(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Bool>(Nucleus::createFCmpOGT(lhs.value, rhs.value));
+}
+
+RValue<Bool> operator>=(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Bool>(Nucleus::createFCmpOGE(lhs.value, rhs.value));
+}
+
+RValue<Bool> operator!=(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Bool>(Nucleus::createFCmpONE(lhs.value, rhs.value));
+}
+
+RValue<Bool> operator==(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Bool>(Nucleus::createFCmpOEQ(lhs.value, rhs.value));
+}
+
+RValue<Float> Abs(RValue<Float> x)
+{
+	return IfThenElse(x > 0.0f, x, -x);
+}
+
+RValue<Float> Max(RValue<Float> x, RValue<Float> y)
+{
+	return IfThenElse(x > y, x, y);
+}
+
+RValue<Float> Min(RValue<Float> x, RValue<Float> y)
+{
+	return IfThenElse(x < y, x, y);
+}
+
+Float2::Float2(RValue<Float4> cast)
+{
+	storeValue(Nucleus::createBitCast(cast.value, getType()));
+}
+
+Float4::Float4(RValue<Byte4> cast) : XYZW(this)
+{
+	Value *a = Int4(cast).loadValue();
+	Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
+
+	storeValue(xyzw);
+}
+
+Float4::Float4(RValue<SByte4> cast) : XYZW(this)
+{
+	Value *a = Int4(cast).loadValue();
+	Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
+
+	storeValue(xyzw);
+}
+
+Float4::Float4(RValue<Short4> cast) : XYZW(this)
+{
+	Int4 c(cast);
+	storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
+}
+
+Float4::Float4(RValue<UShort4> cast) : XYZW(this)
+{
+	Int4 c(cast);
+	storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
+}
+
+Float4::Float4(RValue<Int4> cast) : XYZW(this)
+{
+	Value *xyzw = Nucleus::createSIToFP(cast.value, Float4::getType());
+
+	storeValue(xyzw);
+}
+
+Float4::Float4(RValue<UInt4> cast) : XYZW(this)
+{
+	RValue<Float4> result = Float4(Int4(cast & UInt4(0x7FFFFFFF))) +
+	                        As<Float4>((As<Int4>(cast) >> 31) & As<Int4>(Float4(0x80000000u)));
+
+	storeValue(result.value);
+}
+
+Float4::Float4() : XYZW(this)
+{
+}
+
+Float4::Float4(float xyzw) : XYZW(this)
+{
+	constant(xyzw, xyzw, xyzw, xyzw);
+}
+
+Float4::Float4(float x, float yzw) : XYZW(this)
+{
+	constant(x, yzw, yzw, yzw);
+}
+
+Float4::Float4(float x, float y, float zw) : XYZW(this)
+{
+	constant(x, y, zw, zw);
+}
+
+Float4::Float4(float x, float y, float z, float w) : XYZW(this)
+{
+	constant(x, y, z, w);
+}
+
+Float4 Float4::positive_inf()
+{
+	Float4 result;
+	result.infinity_constant(false);
+	return result;
+}
+
+Float4 Float4::negative_inf()
+{
+	Float4 result;
+	result.infinity_constant(true);
+	return result;
+}
+
+void Float4::infinity_constant(bool negative)
+{
+	double inf = negative ? -INFINITY : INFINITY;
+	double constantVector[4] = {inf, inf, inf, inf};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
+
+void Float4::constant(float x, float y, float z, float w)
+{
+	// See Float(float) constructor for the rationale behind this assert.
+	ASSERT(std::isfinite(x) && std::isfinite(y) && std::isfinite(z) && std::isfinite(w));
+
+	double constantVector[4] = {x, y, z, w};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
+
+Float4::Float4(RValue<Float4> rhs) : XYZW(this)
+{
+	storeValue(rhs.value);
+}
+
+Float4::Float4(const Float4 &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
+
+Float4::Float4(const Reference<Float4> &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
+
+Float4::Float4(const Float &rhs) : XYZW(this)
+{
+	*this = RValue<Float>(rhs.loadValue());
+}
+
+Float4::Float4(const Reference<Float> &rhs) : XYZW(this)
+{
+	*this = RValue<Float>(rhs.loadValue());
+}
+
+RValue<Float4> Float4::operator=(float x)
+{
+	return *this = Float4(x, x, x, x);
+}
+
+RValue<Float4> Float4::operator=(RValue<Float4> rhs)
+{
+	storeValue(rhs.value);
+
+	return rhs;
+}
+
+RValue<Float4> Float4::operator=(const Float4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+
+	return RValue<Float4>(value);
+}
+
+RValue<Float4> Float4::operator=(const Reference<Float4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+
+	return RValue<Float4>(value);
+}
+
+RValue<Float4> Float4::operator=(RValue<Float> rhs)
+{
+	return *this = Float4(rhs);
+}
+
+RValue<Float4> Float4::operator=(const Float &rhs)
+{
+	return *this = Float4(rhs);
+}
+
+RValue<Float4> Float4::operator=(const Reference<Float> &rhs)
+{
+	return *this = Float4(rhs);
+}
+
+RValue<Float4> operator+(RValue<Float4> lhs, RValue<Float4> rhs)
+{
+	return RValue<Float4>(Nucleus::createFAdd(lhs.value, rhs.value));
+}
+
+RValue<Float4> operator-(RValue<Float4> lhs, RValue<Float4> rhs)
+{
+	return RValue<Float4>(Nucleus::createFSub(lhs.value, rhs.value));
+}
+
+RValue<Float4> operator*(RValue<Float4> lhs, RValue<Float4> rhs)
+{
+	return RValue<Float4>(Nucleus::createFMul(lhs.value, rhs.value));
+}
+
+RValue<Float4> operator/(RValue<Float4> lhs, RValue<Float4> rhs)
+{
+	return RValue<Float4>(Nucleus::createFDiv(lhs.value, rhs.value));
+}
+
+RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
+{
+	return RValue<Float4>(Nucleus::createFRem(lhs.value, rhs.value));
+}
+
+RValue<Float4> operator+=(Float4 &lhs, RValue<Float4> rhs)
+{
+	return lhs = lhs + rhs;
+}
+
+RValue<Float4> operator-=(Float4 &lhs, RValue<Float4> rhs)
+{
+	return lhs = lhs - rhs;
+}
+
+RValue<Float4> operator*=(Float4 &lhs, RValue<Float4> rhs)
+{
+	return lhs = lhs * rhs;
+}
+
+RValue<Float4> operator/=(Float4 &lhs, RValue<Float4> rhs)
+{
+	return lhs = lhs / rhs;
+}
+
+RValue<Float4> operator%=(Float4 &lhs, RValue<Float4> rhs)
+{
+	return lhs = lhs % rhs;
+}
+
+RValue<Float4> operator+(RValue<Float4> val)
+{
+	return val;
+}
+
+RValue<Float4> operator-(RValue<Float4> val)
+{
+	return RValue<Float4>(Nucleus::createFNeg(val.value));
+}
+
+RValue<Float4> Abs(RValue<Float4> x)
+{
+	// TODO: Optimize.
+	Value *vector = Nucleus::createBitCast(x.value, Int4::getType());
+	int64_t constantVector[4] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+	Value *result = Nucleus::createAnd(vector, Nucleus::createConstantVector(constantVector, Int4::getType()));
+
+	return As<Float4>(result);
+}
+
+RValue<Float4> Insert(RValue<Float4> x, RValue<Float> element, int i)
+{
+	return RValue<Float4>(Nucleus::createInsertElement(x.value, element.value, i));
+}
+
+RValue<Float> Extract(RValue<Float4> x, int i)
+{
+	return RValue<Float>(Nucleus::createExtractElement(x.value, Float::getType(), i));
+}
+
+RValue<Float4> Swizzle(RValue<Float4> x, uint16_t select)
+{
+	return RValue<Float4>(createSwizzle4(x.value, select));
+}
+
+RValue<Float4> Shuffle(RValue<Float4> x, RValue<Float4> y, uint16_t select)
+{
+	return RValue<Float4>(createBlend4(x.value, y.value, select));
+}
+
+RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, uint16_t imm)
+{
+	int shuffle[4] =
+	{
+		((imm >> 12) & 0x03) + 0,
+		((imm >>  8) & 0x03) + 0,
+		((imm >>  4) & 0x03) + 4,
+		((imm >>  0) & 0x03) + 4,
+	};
+
+	return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+}
+
+RValue<Float4> UnpackLow(RValue<Float4> x, RValue<Float4> y)
+{
+	int shuffle[4] = {0, 4, 1, 5};
+	return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+}
+
+RValue<Float4> UnpackHigh(RValue<Float4> x, RValue<Float4> y)
+{
+	int shuffle[4] = {2, 6, 3, 7};
+	return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+}
+
+RValue<Float4> Mask(Float4 &lhs, RValue<Float4> rhs, uint16_t select)
+{
+	Value *vector = lhs.loadValue();
+	Value *result = createMask4(vector, rhs.value, select);
+	lhs.storeValue(result);
+
+	return RValue<Float4>(result);
+}
+
+RValue<Int4> IsInf(RValue<Float4> x)
+{
+	return CmpEQ(As<Int4>(x) & Int4(0x7FFFFFFF), Int4(0x7F800000));
+}
+
+RValue<Int4> IsNan(RValue<Float4> x)
+{
+	return ~CmpEQ(x, x);
+}
+
+RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, int offset)
+{
+	return lhs + RValue<Int>(Nucleus::createConstantInt(offset));
+}
+
+RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
+{
+	return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value, false));
+}
+
+RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
+{
+	return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value, true));
+}
+
+RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, int offset)
+{
+	return lhs = lhs + offset;
+}
+
+RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<Int> offset)
+{
+	return lhs = lhs + offset;
+}
+
+RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<UInt> offset)
+{
+	return lhs = lhs + offset;
+}
+
+RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, int offset)
+{
+	return lhs + -offset;
+}
+
+RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
+{
+	return lhs + -offset;
+}
+
+RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
+{
+	return lhs + -offset;
+}
+
+RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, int offset)
+{
+	return lhs = lhs - offset;
+}
+
+RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<Int> offset)
+{
+	return lhs = lhs - offset;
+}
+
+RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<UInt> offset)
+{
+	return lhs = lhs - offset;
+}
+
+void Return()
+{
+	Nucleus::createRetVoid();
+	// Place any unreachable instructions in an unreferenced block.
+	Nucleus::setInsertBlock(Nucleus::createBasicBlock());
+}
+
+void branch(RValue<Bool> cmp, BasicBlock *bodyBB, BasicBlock *endBB)
+{
+	Nucleus::createCondBr(cmp.value, bodyBB, endBB);
+	Nucleus::setInsertBlock(bodyBB);
+}
+
+RValue<Float4> MaskedLoad(RValue<Pointer<Float4>> base, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+{
+	return RValue<Float4>(Nucleus::createMaskedLoad(base.value, Float::getType(), mask.value, alignment, zeroMaskedLanes));
+}
+
+RValue<Int4> MaskedLoad(RValue<Pointer<Int4>> base, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+{
+	return RValue<Int4>(Nucleus::createMaskedLoad(base.value, Int::getType(), mask.value, alignment, zeroMaskedLanes));
+}
+
+void MaskedStore(RValue<Pointer<Float4>> base, RValue<Float4> val, RValue<Int4> mask, unsigned int alignment)
+{
+	Nucleus::createMaskedStore(base.value, val.value, mask.value, alignment);
+}
+
+void MaskedStore(RValue<Pointer<Int4>> base, RValue<Int4> val, RValue<Int4> mask, unsigned int alignment)
+{
+	Nucleus::createMaskedStore(base.value, val.value, mask.value, alignment);
+}
+
+void Fence(std::memory_order memoryOrder)
+{
+	ASSERT_MSG(memoryOrder == std::memory_order_acquire ||
+		memoryOrder == std::memory_order_release ||
+		memoryOrder == std::memory_order_acq_rel ||
+		memoryOrder == std::memory_order_seq_cst,
+		"Unsupported memoryOrder: %d", int(memoryOrder));
+	Nucleus::createFence(memoryOrder);
+}
+
+Bool          CToReactor<bool>::cast(bool v)               { return type(v); }
+Byte          CToReactor<uint8_t>::cast(uint8_t v)         { return type(v); }
+SByte         CToReactor<int8_t>::cast(int8_t v)           { return type(v); }
+Short         CToReactor<int16_t>::cast(int16_t v)         { return type(v); }
+UShort        CToReactor<uint16_t>::cast(uint16_t v)       { return type(v); }
+Int           CToReactor<int32_t>::cast(int32_t v)         { return type(v); }
+UInt          CToReactor<uint32_t>::cast(uint32_t v)       { return type(v); }
+Float         CToReactor<float>::cast(float v)             { return type(v); }
+Float4        CToReactor<float[4]>::cast(float v[4])       { return type(v[0], v[1], v[2], v[3]); }
+
+// TODO: Long has no constructor that takes a uint64_t
+// Long          CToReactor<uint64_t>::cast(uint64_t v)       { return type(v); }
+
+}  // namespace rr

diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index eec950b..f0b18b5 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp

@@ -49,484 +49,484 @@
 	#define RR_DEBUG_INFO_FLUSH()
 #endif // ENABLE_RR_DEBUG_INFO
 
-namespace rr
+namespace rr {
+
+struct Capabilities
 {
-	struct Capabilities
+	bool CoroutinesSupported; // Support for rr::Coroutine<F>
+};
+extern const Capabilities Caps;
+
+class Bool;
+class Byte;
+class SByte;
+class Byte4;
+class SByte4;
+class Byte8;
+class SByte8;
+class Byte16;
+class SByte16;
+class Short;
+class UShort;
+class Short2;
+class UShort2;
+class Short4;
+class UShort4;
+class Short8;
+class UShort8;
+class Int;
+class UInt;
+class Int2;
+class UInt2;
+class Int4;
+class UInt4;
+class Long;
+class Half;
+class Float;
+class Float2;
+class Float4;
+
+class Void
+{
+public:
+	static Type *getType();
+
+	static bool isVoid()
 	{
-		bool CoroutinesSupported; // Support for rr::Coroutine<F>
-	};
-	extern const Capabilities Caps;
+		return true;
+	}
+};
 
-	class Bool;
-	class Byte;
-	class SByte;
-	class Byte4;
-	class SByte4;
-	class Byte8;
-	class SByte8;
-	class Byte16;
-	class SByte16;
-	class Short;
-	class UShort;
-	class Short2;
-	class UShort2;
-	class Short4;
-	class UShort4;
-	class Short8;
-	class UShort8;
-	class Int;
-	class UInt;
-	class Int2;
-	class UInt2;
-	class Int4;
-	class UInt4;
-	class Long;
-	class Half;
-	class Float;
-	class Float2;
-	class Float4;
+template<class T>
+class RValue;
 
-	class Void
+template<class T>
+class Pointer;
+
+class Variable
+{
+	friend class Nucleus;
+	friend class PrintValue;
+
+	Variable() = delete;
+	Variable &operator=(const Variable&) = delete;
+
+public:
+	void materialize() const;
+
+	Value *loadValue() const;
+	Value *storeValue(Value *value) const;
+
+	Value *getBaseAddress() const;
+	Value *getElementPointer(Value *index, bool unsignedIndex) const;
+
+protected:
+	Variable(Type *type, int arraySize);
+	Variable(const Variable&) = default;
+
+	~Variable();
+
+	const int arraySize;
+
+private:
+	static void materializeAll();
+	static void killUnmaterialized();
+
+	static std::unordered_set<Variable*> unmaterializedVariables;
+
+	Type *const type;
+	mutable Value *rvalue = nullptr;
+	mutable Value *address = nullptr;
+};
+
+template<class T>
+class LValue : public Variable
+{
+public:
+	LValue(int arraySize = 0);
+
+	RValue<Pointer<T>> operator&();
+
+	static bool isVoid()
 	{
-	public:
-		static Type *getType();
+		return false;
+	}
 
-		static bool isVoid()
-		{
-			return true;
-		}
-	};
+	// self() returns the this pointer to this LValue<T> object.
+	// This function exists because operator&() is overloaded.
+	inline LValue<T>* self() { return this; }
+};
 
-	template<class T>
-	class RValue;
+template<class T>
+class Reference
+{
+public:
+	using reference_underlying_type = T;
 
-	template<class T>
-	class Pointer;
+	explicit Reference(Value *pointer, int alignment = 1);
 
-	class Variable
-	{
-		friend class Nucleus;
-		friend class PrintValue;
+	RValue<T> operator=(RValue<T> rhs) const;
+	RValue<T> operator=(const Reference<T> &ref) const;
 
-		Variable() = delete;
-		Variable &operator=(const Variable&) = delete;
+	RValue<T> operator+=(RValue<T> rhs) const;
 
-	public:
-		void materialize() const;
+	RValue<Pointer<T>> operator&() const { return RValue<Pointer<T>>(address); }
 
-		Value *loadValue() const;
-		Value *storeValue(Value *value) const;
+	Value *loadValue() const;
+	int getAlignment() const;
 
-		Value *getBaseAddress() const;
-		Value *getElementPointer(Value *index, bool unsignedIndex) const;
+private:
+	Value *address;
 
-	protected:
-		Variable(Type *type, int arraySize);
-		Variable(const Variable&) = default;
+	const int alignment;
+};
 
-		~Variable();
+template<class T>
+struct BoolLiteral
+{
+	struct type;
+};
 
-		const int arraySize;
+template<>
+struct BoolLiteral<Bool>
+{
+	typedef bool type;
+};
 
-	private:
-		static void materializeAll();
-		static void killUnmaterialized();
+template<class T>
+struct IntLiteral
+{
+	struct type;
+};
 
-		static std::unordered_set<Variable*> unmaterializedVariables;
+template<>
+struct IntLiteral<Int>
+{
+	typedef int type;
+};
 
-		Type *const type;
-		mutable Value *rvalue = nullptr;
-		mutable Value *address = nullptr;
-	};
+template<>
+struct IntLiteral<UInt>
+{
+	typedef unsigned int type;
+};
 
-	template<class T>
-	class LValue : public Variable
-	{
-	public:
-		LValue(int arraySize = 0);
+template<>
+struct IntLiteral<Long>
+{
+	typedef int64_t type;
+};
 
-		RValue<Pointer<T>> operator&();
+template<class T>
+struct FloatLiteral
+{
+	struct type;
+};
 
-		static bool isVoid()
-		{
-			return false;
-		}
+template<>
+struct FloatLiteral<Float>
+{
+	typedef float type;
+};
 
-		// self() returns the this pointer to this LValue<T> object.
-		// This function exists because operator&() is overloaded.
-		inline LValue<T>* self() { return this; }
-	};
+template<class T>
+class RValue
+{
+public:
+	using rvalue_underlying_type = T;
 
-	template<class T>
-	class Reference
-	{
-	public:
-		using reference_underlying_type = T;
-
-		explicit Reference(Value *pointer, int alignment = 1);
-
-		RValue<T> operator=(RValue<T> rhs) const;
-		RValue<T> operator=(const Reference<T> &ref) const;
-
-		RValue<T> operator+=(RValue<T> rhs) const;
-
-		RValue<Pointer<T>> operator&() const { return RValue<Pointer<T>>(address); }
-
-		Value *loadValue() const;
-		int getAlignment() const;
-
-	private:
-		Value *address;
-
-		const int alignment;
-	};
-
-	template<class T>
-	struct BoolLiteral
-	{
-		struct type;
-	};
-
-	template<>
-	struct BoolLiteral<Bool>
-	{
-		typedef bool type;
-	};
-
-	template<class T>
-	struct IntLiteral
-	{
-		struct type;
-	};
-
-	template<>
-	struct IntLiteral<Int>
-	{
-		typedef int type;
-	};
-
-	template<>
-	struct IntLiteral<UInt>
-	{
-		typedef unsigned int type;
-	};
-
-	template<>
-	struct IntLiteral<Long>
-	{
-		typedef int64_t type;
-	};
-
-	template<class T>
-	struct FloatLiteral
-	{
-		struct type;
-	};
-
-	template<>
-	struct FloatLiteral<Float>
-	{
-		typedef float type;
-	};
-
-	template<class T>
-	class RValue
-	{
-	public:
-		using rvalue_underlying_type = T;
-
-		explicit RValue(Value *rvalue);
+	explicit RValue(Value *rvalue);
 
 #ifdef ENABLE_RR_DEBUG_INFO
-		RValue(const RValue<T> &rvalue);
+	RValue(const RValue<T> &rvalue);
 #endif // ENABLE_RR_DEBUG_INFO
 
-		RValue(const T &lvalue);
-		RValue(typename BoolLiteral<T>::type i);
-		RValue(typename IntLiteral<T>::type i);
-		RValue(typename FloatLiteral<T>::type f);
-		RValue(const Reference<T> &rhs);
+	RValue(const T &lvalue);
+	RValue(typename BoolLiteral<T>::type i);
+	RValue(typename IntLiteral<T>::type i);
+	RValue(typename FloatLiteral<T>::type f);
+	RValue(const Reference<T> &rhs);
 
-		RValue<T> &operator=(const RValue<T>&) = delete;
+	RValue<T> &operator=(const RValue<T>&) = delete;
 
-		Value *value;   // FIXME: Make private
-	};
+	Value *value;   // FIXME: Make private
+};
 
-	template<typename T>
-	struct Argument
-	{
-		explicit Argument(Value *value) : value(value) {}
+template<typename T>
+struct Argument
+{
+	explicit Argument(Value *value) : value(value) {}
 
-		Value *value;
-	};
+	Value *value;
+};
 
-	class Bool : public LValue<Bool>
-	{
-	public:
-		Bool(Argument<Bool> argument);
+class Bool : public LValue<Bool>
+{
+public:
+	Bool(Argument<Bool> argument);
 
-		Bool() = default;
-		Bool(bool x);
-		Bool(RValue<Bool> rhs);
-		Bool(const Bool &rhs);
-		Bool(const Reference<Bool> &rhs);
+	Bool() = default;
+	Bool(bool x);
+	Bool(RValue<Bool> rhs);
+	Bool(const Bool &rhs);
+	Bool(const Reference<Bool> &rhs);
 
-	//	RValue<Bool> operator=(bool rhs);   // FIXME: Implement
-		RValue<Bool> operator=(RValue<Bool> rhs);
-		RValue<Bool> operator=(const Bool &rhs);
-		RValue<Bool> operator=(const Reference<Bool> &rhs);
+//	RValue<Bool> operator=(bool rhs);   // FIXME: Implement
+	RValue<Bool> operator=(RValue<Bool> rhs);
+	RValue<Bool> operator=(const Bool &rhs);
+	RValue<Bool> operator=(const Reference<Bool> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Bool> operator!(RValue<Bool> val);
-	RValue<Bool> operator&&(RValue<Bool> lhs, RValue<Bool> rhs);
-	RValue<Bool> operator||(RValue<Bool> lhs, RValue<Bool> rhs);
-	RValue<Bool> operator!=(RValue<Bool> lhs, RValue<Bool> rhs);
-	RValue<Bool> operator==(RValue<Bool> lhs, RValue<Bool> rhs);
+RValue<Bool> operator!(RValue<Bool> val);
+RValue<Bool> operator&&(RValue<Bool> lhs, RValue<Bool> rhs);
+RValue<Bool> operator||(RValue<Bool> lhs, RValue<Bool> rhs);
+RValue<Bool> operator!=(RValue<Bool> lhs, RValue<Bool> rhs);
+RValue<Bool> operator==(RValue<Bool> lhs, RValue<Bool> rhs);
 
-	class Byte : public LValue<Byte>
-	{
-	public:
-		Byte(Argument<Byte> argument);
+class Byte : public LValue<Byte>
+{
+public:
+	Byte(Argument<Byte> argument);
 
-		explicit Byte(RValue<Int> cast);
-		explicit Byte(RValue<UInt> cast);
-		explicit Byte(RValue<UShort> cast);
+	explicit Byte(RValue<Int> cast);
+	explicit Byte(RValue<UInt> cast);
+	explicit Byte(RValue<UShort> cast);
 
-		Byte() = default;
-		Byte(int x);
-		Byte(unsigned char x);
-		Byte(RValue<Byte> rhs);
-		Byte(const Byte &rhs);
-		Byte(const Reference<Byte> &rhs);
+	Byte() = default;
+	Byte(int x);
+	Byte(unsigned char x);
+	Byte(RValue<Byte> rhs);
+	Byte(const Byte &rhs);
+	Byte(const Reference<Byte> &rhs);
 
-	//	RValue<Byte> operator=(unsigned char rhs);   // FIXME: Implement
-		RValue<Byte> operator=(RValue<Byte> rhs);
-		RValue<Byte> operator=(const Byte &rhs);
-		RValue<Byte> operator=(const Reference<Byte> &rhs);
+//	RValue<Byte> operator=(unsigned char rhs);   // FIXME: Implement
+	RValue<Byte> operator=(RValue<Byte> rhs);
+	RValue<Byte> operator=(const Byte &rhs);
+	RValue<Byte> operator=(const Reference<Byte> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Byte> operator+(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator-(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator*(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator/(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator%(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator&(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator|(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator^(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator<<(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator>>(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator+=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator-=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator*=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator/=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator%=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator&=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator|=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator^=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator<<=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator>>=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator+(RValue<Byte> val);
-	RValue<Byte> operator-(RValue<Byte> val);
-	RValue<Byte> operator~(RValue<Byte> val);
-	RValue<Byte> operator++(Byte &val, int);   // Post-increment
-	const Byte &operator++(Byte &val);   // Pre-increment
-	RValue<Byte> operator--(Byte &val, int);   // Post-decrement
-	const Byte &operator--(Byte &val);   // Pre-decrement
-	RValue<Bool> operator<(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Bool> operator<=(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Bool> operator>(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Bool> operator>=(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Bool> operator!=(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Bool> operator==(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator+(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator-(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator*(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator/(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator%(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator&(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator|(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator^(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator<<(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator>>(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator+=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator-=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator*=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator/=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator%=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator&=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator|=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator^=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator<<=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator>>=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator+(RValue<Byte> val);
+RValue<Byte> operator-(RValue<Byte> val);
+RValue<Byte> operator~(RValue<Byte> val);
+RValue<Byte> operator++(Byte &val, int);   // Post-increment
+const Byte &operator++(Byte &val);   // Pre-increment
+RValue<Byte> operator--(Byte &val, int);   // Post-decrement
+const Byte &operator--(Byte &val);   // Pre-decrement
+RValue<Bool> operator<(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Bool> operator<=(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Bool> operator>(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Bool> operator>=(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Bool> operator!=(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Bool> operator==(RValue<Byte> lhs, RValue<Byte> rhs);
 
-	class SByte : public LValue<SByte>
-	{
-	public:
-		SByte(Argument<SByte> argument);
+class SByte : public LValue<SByte>
+{
+public:
+	SByte(Argument<SByte> argument);
 
-		explicit SByte(RValue<Int> cast);
-		explicit SByte(RValue<Short> cast);
+	explicit SByte(RValue<Int> cast);
+	explicit SByte(RValue<Short> cast);
 
-		SByte() = default;
-		SByte(signed char x);
-		SByte(RValue<SByte> rhs);
-		SByte(const SByte &rhs);
-		SByte(const Reference<SByte> &rhs);
+	SByte() = default;
+	SByte(signed char x);
+	SByte(RValue<SByte> rhs);
+	SByte(const SByte &rhs);
+	SByte(const Reference<SByte> &rhs);
 
-	//	RValue<SByte> operator=(signed char rhs);   // FIXME: Implement
-		RValue<SByte> operator=(RValue<SByte> rhs);
-		RValue<SByte> operator=(const SByte &rhs);
-		RValue<SByte> operator=(const Reference<SByte> &rhs);
+//	RValue<SByte> operator=(signed char rhs);   // FIXME: Implement
+	RValue<SByte> operator=(RValue<SByte> rhs);
+	RValue<SByte> operator=(const SByte &rhs);
+	RValue<SByte> operator=(const Reference<SByte> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<SByte> operator+(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator-(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator*(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator/(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator%(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator&(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator|(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator^(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator<<(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator>>(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator+=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator-=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator*=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator/=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator%=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator&=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator|=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator^=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator<<=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator>>=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator+(RValue<SByte> val);
-	RValue<SByte> operator-(RValue<SByte> val);
-	RValue<SByte> operator~(RValue<SByte> val);
-	RValue<SByte> operator++(SByte &val, int);   // Post-increment
-	const SByte &operator++(SByte &val);   // Pre-increment
-	RValue<SByte> operator--(SByte &val, int);   // Post-decrement
-	const SByte &operator--(SByte &val);   // Pre-decrement
-	RValue<Bool> operator<(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<Bool> operator<=(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<Bool> operator>(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<Bool> operator>=(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<Bool> operator!=(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<Bool> operator==(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator+(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator-(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator*(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator/(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator%(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator&(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator|(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator^(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator<<(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator>>(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator+=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator-=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator*=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator/=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator%=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator&=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator|=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator^=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator<<=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator>>=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator+(RValue<SByte> val);
+RValue<SByte> operator-(RValue<SByte> val);
+RValue<SByte> operator~(RValue<SByte> val);
+RValue<SByte> operator++(SByte &val, int);   // Post-increment
+const SByte &operator++(SByte &val);   // Pre-increment
+RValue<SByte> operator--(SByte &val, int);   // Post-decrement
+const SByte &operator--(SByte &val);   // Pre-decrement
+RValue<Bool> operator<(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<Bool> operator<=(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<Bool> operator>(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<Bool> operator>=(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<Bool> operator!=(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<Bool> operator==(RValue<SByte> lhs, RValue<SByte> rhs);
 
-	class Short : public LValue<Short>
-	{
-	public:
-		Short(Argument<Short> argument);
+class Short : public LValue<Short>
+{
+public:
+	Short(Argument<Short> argument);
 
-		explicit Short(RValue<Int> cast);
+	explicit Short(RValue<Int> cast);
 
-		Short() = default;
-		Short(short x);
-		Short(RValue<Short> rhs);
-		Short(const Short &rhs);
-		Short(const Reference<Short> &rhs);
+	Short() = default;
+	Short(short x);
+	Short(RValue<Short> rhs);
+	Short(const Short &rhs);
+	Short(const Reference<Short> &rhs);
 
-	//	RValue<Short> operator=(short rhs);   // FIXME: Implement
-		RValue<Short> operator=(RValue<Short> rhs);
-		RValue<Short> operator=(const Short &rhs);
-		RValue<Short> operator=(const Reference<Short> &rhs);
+//	RValue<Short> operator=(short rhs);   // FIXME: Implement
+	RValue<Short> operator=(RValue<Short> rhs);
+	RValue<Short> operator=(const Short &rhs);
+	RValue<Short> operator=(const Reference<Short> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Short> operator+(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator-(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator*(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator/(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator%(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator&(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator|(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator^(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator<<(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator>>(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator+=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator-=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator*=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator/=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator%=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator&=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator|=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator^=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator<<=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator>>=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator+(RValue<Short> val);
-	RValue<Short> operator-(RValue<Short> val);
-	RValue<Short> operator~(RValue<Short> val);
-	RValue<Short> operator++(Short &val, int);   // Post-increment
-	const Short &operator++(Short &val);   // Pre-increment
-	RValue<Short> operator--(Short &val, int);   // Post-decrement
-	const Short &operator--(Short &val);   // Pre-decrement
-	RValue<Bool> operator<(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Bool> operator<=(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Bool> operator>(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Bool> operator>=(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Bool> operator!=(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Bool> operator==(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator+(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator-(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator*(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator/(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator%(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator&(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator|(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator^(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator<<(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator>>(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator+=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator-=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator*=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator/=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator%=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator&=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator|=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator^=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator<<=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator>>=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator+(RValue<Short> val);
+RValue<Short> operator-(RValue<Short> val);
+RValue<Short> operator~(RValue<Short> val);
+RValue<Short> operator++(Short &val, int);   // Post-increment
+const Short &operator++(Short &val);   // Pre-increment
+RValue<Short> operator--(Short &val, int);   // Post-decrement
+const Short &operator--(Short &val);   // Pre-decrement
+RValue<Bool> operator<(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Bool> operator<=(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Bool> operator>(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Bool> operator>=(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Bool> operator!=(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Bool> operator==(RValue<Short> lhs, RValue<Short> rhs);
 
-	class UShort : public LValue<UShort>
-	{
-	public:
-		UShort(Argument<UShort> argument);
+class UShort : public LValue<UShort>
+{
+public:
+	UShort(Argument<UShort> argument);
 
-		explicit UShort(RValue<UInt> cast);
-		explicit UShort(RValue<Int> cast);
+	explicit UShort(RValue<UInt> cast);
+	explicit UShort(RValue<Int> cast);
 
-		UShort() = default;
-		UShort(unsigned short x);
-		UShort(RValue<UShort> rhs);
-		UShort(const UShort &rhs);
-		UShort(const Reference<UShort> &rhs);
+	UShort() = default;
+	UShort(unsigned short x);
+	UShort(RValue<UShort> rhs);
+	UShort(const UShort &rhs);
+	UShort(const Reference<UShort> &rhs);
 
-	//	RValue<UShort> operator=(unsigned short rhs);   // FIXME: Implement
-		RValue<UShort> operator=(RValue<UShort> rhs);
-		RValue<UShort> operator=(const UShort &rhs);
-		RValue<UShort> operator=(const Reference<UShort> &rhs);
+//	RValue<UShort> operator=(unsigned short rhs);   // FIXME: Implement
+	RValue<UShort> operator=(RValue<UShort> rhs);
+	RValue<UShort> operator=(const UShort &rhs);
+	RValue<UShort> operator=(const Reference<UShort> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<UShort> operator+(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator-(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator*(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator/(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator%(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator&(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator|(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator^(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator<<(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator>>(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator+=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator-=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator*=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator/=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator%=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator&=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator|=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator^=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator<<=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator>>=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator+(RValue<UShort> val);
-	RValue<UShort> operator-(RValue<UShort> val);
-	RValue<UShort> operator~(RValue<UShort> val);
-	RValue<UShort> operator++(UShort &val, int);   // Post-increment
-	const UShort &operator++(UShort &val);   // Pre-increment
-	RValue<UShort> operator--(UShort &val, int);   // Post-decrement
-	const UShort &operator--(UShort &val);   // Pre-decrement
-	RValue<Bool> operator<(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<Bool> operator<=(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<Bool> operator>(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<Bool> operator>=(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<Bool> operator!=(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<Bool> operator==(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator+(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator-(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator*(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator/(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator%(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator&(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator|(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator^(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator<<(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator>>(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator+=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator-=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator*=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator/=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator%=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator&=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator|=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator^=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator<<=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator>>=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator+(RValue<UShort> val);
+RValue<UShort> operator-(RValue<UShort> val);
+RValue<UShort> operator~(RValue<UShort> val);
+RValue<UShort> operator++(UShort &val, int);   // Post-increment
+const UShort &operator++(UShort &val);   // Pre-increment
+RValue<UShort> operator--(UShort &val, int);   // Post-decrement
+const UShort &operator--(UShort &val);   // Pre-decrement
+RValue<Bool> operator<(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<Bool> operator<=(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<Bool> operator>(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<Bool> operator>=(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<Bool> operator!=(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<Bool> operator==(RValue<UShort> lhs, RValue<UShort> rhs);
 
-	class Byte4 : public LValue<Byte4>
-	{
-	public:
-		explicit Byte4(RValue<Byte8> cast);
+class Byte4 : public LValue<Byte4>
+{
+public:
+	explicit Byte4(RValue<Byte8> cast);
 
-		Byte4() = default;
-	//	Byte4(int x, int y, int z, int w);
-	//	Byte4(RValue<Byte4> rhs);
-	//	Byte4(const Byte4 &rhs);
-		Byte4(const Reference<Byte4> &rhs);
+	Byte4() = default;
+//	Byte4(int x, int y, int z, int w);
+//	Byte4(RValue<Byte4> rhs);
+//	Byte4(const Byte4 &rhs);
+	Byte4(const Reference<Byte4> &rhs);
 
-	//	RValue<Byte4> operator=(RValue<Byte4> rhs);
-	//	RValue<Byte4> operator=(const Byte4 &rhs);
-	//	RValue<Byte4> operator=(const Reference<Byte4> &rhs);
+//	RValue<Byte4> operator=(RValue<Byte4> rhs);
+//	RValue<Byte4> operator=(const Byte4 &rhs);
+//	RValue<Byte4> operator=(const Reference<Byte4> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
 //	RValue<Byte4> operator+(RValue<Byte4> lhs, RValue<Byte4> rhs);
 //	RValue<Byte4> operator-(RValue<Byte4> lhs, RValue<Byte4> rhs);
@@ -556,21 +556,21 @@
 //	RValue<Byte4> operator--(Byte4 &val, int);   // Post-decrement
 //	const Byte4 &operator--(Byte4 &val);   // Pre-decrement
 
-	class SByte4 : public LValue<SByte4>
-	{
-	public:
-		SByte4() = default;
-	//	SByte4(int x, int y, int z, int w);
-	//	SByte4(RValue<SByte4> rhs);
-	//	SByte4(const SByte4 &rhs);
-	//	SByte4(const Reference<SByte4> &rhs);
+class SByte4 : public LValue<SByte4>
+{
+public:
+	SByte4() = default;
+//	SByte4(int x, int y, int z, int w);
+//	SByte4(RValue<SByte4> rhs);
+//	SByte4(const SByte4 &rhs);
+//	SByte4(const Reference<SByte4> &rhs);
 
-	//	RValue<SByte4> operator=(RValue<SByte4> rhs);
-	//	RValue<SByte4> operator=(const SByte4 &rhs);
-	//	RValue<SByte4> operator=(const Reference<SByte4> &rhs);
+//	RValue<SByte4> operator=(RValue<SByte4> rhs);
+//	RValue<SByte4> operator=(const SByte4 &rhs);
+//	RValue<SByte4> operator=(const Reference<SByte4> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
 //	RValue<SByte4> operator+(RValue<SByte4> lhs, RValue<SByte4> rhs);
 //	RValue<SByte4> operator-(RValue<SByte4> lhs, RValue<SByte4> rhs);
@@ -600,127 +600,127 @@
 //	RValue<SByte4> operator--(SByte4 &val, int);   // Post-decrement
 //	const SByte4 &operator--(SByte4 &val);   // Pre-decrement
 
-	class Byte8 : public LValue<Byte8>
-	{
-	public:
-		Byte8() = default;
-		Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7);
-		Byte8(RValue<Byte8> rhs);
-		Byte8(const Byte8 &rhs);
-		Byte8(const Reference<Byte8> &rhs);
+class Byte8 : public LValue<Byte8>
+{
+public:
+	Byte8() = default;
+	Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7);
+	Byte8(RValue<Byte8> rhs);
+	Byte8(const Byte8 &rhs);
+	Byte8(const Reference<Byte8> &rhs);
 
-		RValue<Byte8> operator=(RValue<Byte8> rhs);
-		RValue<Byte8> operator=(const Byte8 &rhs);
-		RValue<Byte8> operator=(const Reference<Byte8> &rhs);
+	RValue<Byte8> operator=(RValue<Byte8> rhs);
+	RValue<Byte8> operator=(const Byte8 &rhs);
+	RValue<Byte8> operator=(const Reference<Byte8> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Byte8> operator+(RValue<Byte8> lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator-(RValue<Byte8> lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator+(RValue<Byte8> lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator-(RValue<Byte8> lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator*(RValue<Byte8> lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator/(RValue<Byte8> lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator%(RValue<Byte8> lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator&(RValue<Byte8> lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator|(RValue<Byte8> lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator^(RValue<Byte8> lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator&(RValue<Byte8> lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator|(RValue<Byte8> lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator^(RValue<Byte8> lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator<<(RValue<Byte8> lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator>>(RValue<Byte8> lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator+=(Byte8 &lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator-=(Byte8 &lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator+=(Byte8 &lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator-=(Byte8 &lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator*=(Byte8 &lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator/=(Byte8 &lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator%=(Byte8 &lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator&=(Byte8 &lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator|=(Byte8 &lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator^=(Byte8 &lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator&=(Byte8 &lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator|=(Byte8 &lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator^=(Byte8 &lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator<<=(Byte8 &lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator>>=(Byte8 &lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator+(RValue<Byte8> val);
 //	RValue<Byte8> operator-(RValue<Byte8> val);
-	RValue<Byte8> operator~(RValue<Byte8> val);
+RValue<Byte8> operator~(RValue<Byte8> val);
 //	RValue<Byte8> operator++(Byte8 &val, int);   // Post-increment
 //	const Byte8 &operator++(Byte8 &val);   // Pre-increment
 //	RValue<Byte8> operator--(Byte8 &val, int);   // Post-decrement
 //	const Byte8 &operator--(Byte8 &val);   // Pre-decrement
 
-	RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y);
-	RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y);
-	RValue<Short4> Unpack(RValue<Byte4> x);
-	RValue<Short4> Unpack(RValue<Byte4> x, RValue<Byte4> y);
-	RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y);
-	RValue<Short4> UnpackHigh(RValue<Byte8> x, RValue<Byte8> y);
-	RValue<Int> SignMask(RValue<Byte8> x);
+RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Short4> Unpack(RValue<Byte4> x);
+RValue<Short4> Unpack(RValue<Byte4> x, RValue<Byte4> y);
+RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Short4> UnpackHigh(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Int> SignMask(RValue<Byte8> x);
 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y);
-	RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y);
 
-	class SByte8 : public LValue<SByte8>
-	{
-	public:
-		SByte8() = default;
-		SByte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7);
-		SByte8(RValue<SByte8> rhs);
-		SByte8(const SByte8 &rhs);
-		SByte8(const Reference<SByte8> &rhs);
+class SByte8 : public LValue<SByte8>
+{
+public:
+	SByte8() = default;
+	SByte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7);
+	SByte8(RValue<SByte8> rhs);
+	SByte8(const SByte8 &rhs);
+	SByte8(const Reference<SByte8> &rhs);
 
-		RValue<SByte8> operator=(RValue<SByte8> rhs);
-		RValue<SByte8> operator=(const SByte8 &rhs);
-		RValue<SByte8> operator=(const Reference<SByte8> &rhs);
+	RValue<SByte8> operator=(RValue<SByte8> rhs);
+	RValue<SByte8> operator=(const SByte8 &rhs);
+	RValue<SByte8> operator=(const Reference<SByte8> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<SByte8> operator+(RValue<SByte8> lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator-(RValue<SByte8> lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator+(RValue<SByte8> lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator-(RValue<SByte8> lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator*(RValue<SByte8> lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator/(RValue<SByte8> lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator%(RValue<SByte8> lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator&(RValue<SByte8> lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator|(RValue<SByte8> lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator^(RValue<SByte8> lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator&(RValue<SByte8> lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator|(RValue<SByte8> lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator^(RValue<SByte8> lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator<<(RValue<SByte8> lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator>>(RValue<SByte8> lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator+=(SByte8 &lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator-=(SByte8 &lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator+=(SByte8 &lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator-=(SByte8 &lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator*=(SByte8 &lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator/=(SByte8 &lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator%=(SByte8 &lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator&=(SByte8 &lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator|=(SByte8 &lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator^=(SByte8 &lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator&=(SByte8 &lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator|=(SByte8 &lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator^=(SByte8 &lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator<<=(SByte8 &lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator>>=(SByte8 &lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator+(RValue<SByte8> val);
 //	RValue<SByte8> operator-(RValue<SByte8> val);
-	RValue<SByte8> operator~(RValue<SByte8> val);
+RValue<SByte8> operator~(RValue<SByte8> val);
 //	RValue<SByte8> operator++(SByte8 &val, int);   // Post-increment
 //	const SByte8 &operator++(SByte8 &val);   // Pre-increment
 //	RValue<SByte8> operator--(SByte8 &val, int);   // Post-decrement
 //	const SByte8 &operator--(SByte8 &val);   // Pre-decrement
 
-	RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y);
-	RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y);
-	RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y);
-	RValue<Short4> UnpackHigh(RValue<SByte8> x, RValue<SByte8> y);
-	RValue<Int> SignMask(RValue<SByte8> x);
-	RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y);
-	RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y);
+RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y);
+RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y);
+RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y);
+RValue<Short4> UnpackHigh(RValue<SByte8> x, RValue<SByte8> y);
+RValue<Int> SignMask(RValue<SByte8> x);
+RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y);
+RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y);
 
-	class Byte16 : public LValue<Byte16>
-	{
-	public:
-		Byte16() = default;
-	//	Byte16(int x, int y, int z, int w);
-		Byte16(RValue<Byte16> rhs);
-		Byte16(const Byte16 &rhs);
-		Byte16(const Reference<Byte16> &rhs);
+class Byte16 : public LValue<Byte16>
+{
+public:
+	Byte16() = default;
+//	Byte16(int x, int y, int z, int w);
+	Byte16(RValue<Byte16> rhs);
+	Byte16(const Byte16 &rhs);
+	Byte16(const Reference<Byte16> &rhs);
 
-		RValue<Byte16> operator=(RValue<Byte16> rhs);
-		RValue<Byte16> operator=(const Byte16 &rhs);
-		RValue<Byte16> operator=(const Reference<Byte16> &rhs);
+	RValue<Byte16> operator=(RValue<Byte16> rhs);
+	RValue<Byte16> operator=(const Byte16 &rhs);
+	RValue<Byte16> operator=(const Reference<Byte16> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
 //	RValue<Byte16> operator+(RValue<Byte16> lhs, RValue<Byte16> rhs);
 //	RValue<Byte16> operator-(RValue<Byte16> lhs, RValue<Byte16> rhs);
@@ -750,21 +750,21 @@
 //	RValue<Byte16> operator--(Byte16 &val, int);   // Post-decrement
 //	const Byte16 &operator--(Byte16 &val);   // Pre-decrement
 
-	class SByte16 : public LValue<SByte16>
-	{
-	public:
-		SByte16() = default;
-	//	SByte16(int x, int y, int z, int w);
-	//	SByte16(RValue<SByte16> rhs);
-	//	SByte16(const SByte16 &rhs);
-	//	SByte16(const Reference<SByte16> &rhs);
+class SByte16 : public LValue<SByte16>
+{
+public:
+	SByte16() = default;
+//	SByte16(int x, int y, int z, int w);
+//	SByte16(RValue<SByte16> rhs);
+//	SByte16(const SByte16 &rhs);
+//	SByte16(const Reference<SByte16> &rhs);
 
-	//	RValue<SByte16> operator=(RValue<SByte16> rhs);
-	//	RValue<SByte16> operator=(const SByte16 &rhs);
-	//	RValue<SByte16> operator=(const Reference<SByte16> &rhs);
+//	RValue<SByte16> operator=(RValue<SByte16> rhs);
+//	RValue<SByte16> operator=(const SByte16 &rhs);
+//	RValue<SByte16> operator=(const Reference<SByte16> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
 //	RValue<SByte16> operator+(RValue<SByte16> lhs, RValue<SByte16> rhs);
 //	RValue<SByte16> operator-(RValue<SByte16> lhs, RValue<SByte16> rhs);
@@ -794,73 +794,73 @@
 //	RValue<SByte16> operator--(SByte16 &val, int);   // Post-decrement
 //	const SByte16 &operator--(SByte16 &val);   // Pre-decrement
 
-	class Short2 : public LValue<Short2>
-	{
-	public:
-		explicit Short2(RValue<Short4> cast);
+class Short2 : public LValue<Short2>
+{
+public:
+	explicit Short2(RValue<Short4> cast);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	class UShort2 : public LValue<UShort2>
-	{
-	public:
-		explicit UShort2(RValue<UShort4> cast);
+class UShort2 : public LValue<UShort2>
+{
+public:
+	explicit UShort2(RValue<UShort4> cast);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	class Short4 : public LValue<Short4>
-	{
-	public:
-		explicit Short4(RValue<Int> cast);
-		explicit Short4(RValue<Int4> cast);
-	//	explicit Short4(RValue<Float> cast);
-		explicit Short4(RValue<Float4> cast);
+class Short4 : public LValue<Short4>
+{
+public:
+	explicit Short4(RValue<Int> cast);
+	explicit Short4(RValue<Int4> cast);
+//	explicit Short4(RValue<Float> cast);
+	explicit Short4(RValue<Float4> cast);
 
-		Short4() = default;
-		Short4(short xyzw);
-		Short4(short x, short y, short z, short w);
-		Short4(RValue<Short4> rhs);
-		Short4(const Short4 &rhs);
-		Short4(const Reference<Short4> &rhs);
-		Short4(RValue<UShort4> rhs);
-		Short4(const UShort4 &rhs);
-		Short4(const Reference<UShort4> &rhs);
+	Short4() = default;
+	Short4(short xyzw);
+	Short4(short x, short y, short z, short w);
+	Short4(RValue<Short4> rhs);
+	Short4(const Short4 &rhs);
+	Short4(const Reference<Short4> &rhs);
+	Short4(RValue<UShort4> rhs);
+	Short4(const UShort4 &rhs);
+	Short4(const Reference<UShort4> &rhs);
 
-		RValue<Short4> operator=(RValue<Short4> rhs);
-		RValue<Short4> operator=(const Short4 &rhs);
-		RValue<Short4> operator=(const Reference<Short4> &rhs);
-		RValue<Short4> operator=(RValue<UShort4> rhs);
-		RValue<Short4> operator=(const UShort4 &rhs);
-		RValue<Short4> operator=(const Reference<UShort4> &rhs);
+	RValue<Short4> operator=(RValue<Short4> rhs);
+	RValue<Short4> operator=(const Short4 &rhs);
+	RValue<Short4> operator=(const Reference<Short4> &rhs);
+	RValue<Short4> operator=(RValue<UShort4> rhs);
+	RValue<Short4> operator=(const UShort4 &rhs);
+	RValue<Short4> operator=(const Reference<UShort4> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Short4> operator+(RValue<Short4> lhs, RValue<Short4> rhs);
-	RValue<Short4> operator-(RValue<Short4> lhs, RValue<Short4> rhs);
-	RValue<Short4> operator*(RValue<Short4> lhs, RValue<Short4> rhs);
+RValue<Short4> operator+(RValue<Short4> lhs, RValue<Short4> rhs);
+RValue<Short4> operator-(RValue<Short4> lhs, RValue<Short4> rhs);
+RValue<Short4> operator*(RValue<Short4> lhs, RValue<Short4> rhs);
 //	RValue<Short4> operator/(RValue<Short4> lhs, RValue<Short4> rhs);
 //	RValue<Short4> operator%(RValue<Short4> lhs, RValue<Short4> rhs);
-	RValue<Short4> operator&(RValue<Short4> lhs, RValue<Short4> rhs);
-	RValue<Short4> operator|(RValue<Short4> lhs, RValue<Short4> rhs);
-	RValue<Short4> operator^(RValue<Short4> lhs, RValue<Short4> rhs);
-	RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs);
-	RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs);
-	RValue<Short4> operator+=(Short4 &lhs, RValue<Short4> rhs);
-	RValue<Short4> operator-=(Short4 &lhs, RValue<Short4> rhs);
-	RValue<Short4> operator*=(Short4 &lhs, RValue<Short4> rhs);
+RValue<Short4> operator&(RValue<Short4> lhs, RValue<Short4> rhs);
+RValue<Short4> operator|(RValue<Short4> lhs, RValue<Short4> rhs);
+RValue<Short4> operator^(RValue<Short4> lhs, RValue<Short4> rhs);
+RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs);
+RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs);
+RValue<Short4> operator+=(Short4 &lhs, RValue<Short4> rhs);
+RValue<Short4> operator-=(Short4 &lhs, RValue<Short4> rhs);
+RValue<Short4> operator*=(Short4 &lhs, RValue<Short4> rhs);
 //	RValue<Short4> operator/=(Short4 &lhs, RValue<Short4> rhs);
 //	RValue<Short4> operator%=(Short4 &lhs, RValue<Short4> rhs);
-	RValue<Short4> operator&=(Short4 &lhs, RValue<Short4> rhs);
-	RValue<Short4> operator|=(Short4 &lhs, RValue<Short4> rhs);
-	RValue<Short4> operator^=(Short4 &lhs, RValue<Short4> rhs);
-	RValue<Short4> operator<<=(Short4 &lhs, unsigned char rhs);
-	RValue<Short4> operator>>=(Short4 &lhs, unsigned char rhs);
+RValue<Short4> operator&=(Short4 &lhs, RValue<Short4> rhs);
+RValue<Short4> operator|=(Short4 &lhs, RValue<Short4> rhs);
+RValue<Short4> operator^=(Short4 &lhs, RValue<Short4> rhs);
+RValue<Short4> operator<<=(Short4 &lhs, unsigned char rhs);
+RValue<Short4> operator>>=(Short4 &lhs, unsigned char rhs);
 //	RValue<Short4> operator+(RValue<Short4> val);
-	RValue<Short4> operator-(RValue<Short4> val);
-	RValue<Short4> operator~(RValue<Short4> val);
+RValue<Short4> operator-(RValue<Short4> val);
+RValue<Short4> operator~(RValue<Short4> val);
 //	RValue<Short4> operator++(Short4 &val, int);   // Post-increment
 //	const Short4 &operator++(Short4 &val);   // Pre-increment
 //	RValue<Short4> operator--(Short4 &val, int);   // Post-decrement
@@ -872,59 +872,59 @@
 //	RValue<Bool> operator!=(RValue<Short4> lhs, RValue<Short4> rhs);
 //	RValue<Bool> operator==(RValue<Short4> lhs, RValue<Short4> rhs);
 
-	RValue<Short4> RoundShort4(RValue<Float4> cast);
-	RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y);
-	RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y);
-	RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y);
-	RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y);
-	RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y);
-	RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y);
-	RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y);
-	RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y);
-	RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y);
-	RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y);
-	RValue<Short4> Swizzle(RValue<Short4> x, uint16_t select);
-	RValue<Short4> Insert(RValue<Short4> val, RValue<Short> element, int i);
-	RValue<Short> Extract(RValue<Short4> val, int i);
-	RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y);
-	RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> RoundShort4(RValue<Float4> cast);
+RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y);
+RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y);
+RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y);
+RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y);
+RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y);
+RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> Swizzle(RValue<Short4> x, uint16_t select);
+RValue<Short4> Insert(RValue<Short4> val, RValue<Short> element, int i);
+RValue<Short> Extract(RValue<Short4> val, int i);
+RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y);
 
-	class UShort4 : public LValue<UShort4>
-	{
-	public:
-		explicit UShort4(RValue<Int4> cast);
-		explicit UShort4(RValue<Float4> cast, bool saturate = false);
+class UShort4 : public LValue<UShort4>
+{
+public:
+	explicit UShort4(RValue<Int4> cast);
+	explicit UShort4(RValue<Float4> cast, bool saturate = false);
 
-		UShort4() = default;
-		UShort4(unsigned short xyzw);
-		UShort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
-		UShort4(RValue<UShort4> rhs);
-		UShort4(const UShort4 &rhs);
-		UShort4(const Reference<UShort4> &rhs);
-		UShort4(RValue<Short4> rhs);
-		UShort4(const Short4 &rhs);
-		UShort4(const Reference<Short4> &rhs);
+	UShort4() = default;
+	UShort4(unsigned short xyzw);
+	UShort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
+	UShort4(RValue<UShort4> rhs);
+	UShort4(const UShort4 &rhs);
+	UShort4(const Reference<UShort4> &rhs);
+	UShort4(RValue<Short4> rhs);
+	UShort4(const Short4 &rhs);
+	UShort4(const Reference<Short4> &rhs);
 
-		RValue<UShort4> operator=(RValue<UShort4> rhs);
-		RValue<UShort4> operator=(const UShort4 &rhs);
-		RValue<UShort4> operator=(const Reference<UShort4> &rhs);
-		RValue<UShort4> operator=(RValue<Short4> rhs);
-		RValue<UShort4> operator=(const Short4 &rhs);
-		RValue<UShort4> operator=(const Reference<Short4> &rhs);
+	RValue<UShort4> operator=(RValue<UShort4> rhs);
+	RValue<UShort4> operator=(const UShort4 &rhs);
+	RValue<UShort4> operator=(const Reference<UShort4> &rhs);
+	RValue<UShort4> operator=(RValue<Short4> rhs);
+	RValue<UShort4> operator=(const Short4 &rhs);
+	RValue<UShort4> operator=(const Reference<Short4> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<UShort4> operator+(RValue<UShort4> lhs, RValue<UShort4> rhs);
-	RValue<UShort4> operator-(RValue<UShort4> lhs, RValue<UShort4> rhs);
-	RValue<UShort4> operator*(RValue<UShort4> lhs, RValue<UShort4> rhs);
+RValue<UShort4> operator+(RValue<UShort4> lhs, RValue<UShort4> rhs);
+RValue<UShort4> operator-(RValue<UShort4> lhs, RValue<UShort4> rhs);
+RValue<UShort4> operator*(RValue<UShort4> lhs, RValue<UShort4> rhs);
 //	RValue<UShort4> operator/(RValue<UShort4> lhs, RValue<UShort4> rhs);
 //	RValue<UShort4> operator%(RValue<UShort4> lhs, RValue<UShort4> rhs);
-	RValue<UShort4> operator&(RValue<UShort4> lhs, RValue<UShort4> rhs);
-	RValue<UShort4> operator|(RValue<UShort4> lhs, RValue<UShort4> rhs);
-	RValue<UShort4> operator^(RValue<UShort4> lhs, RValue<UShort4> rhs);
-	RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs);
-	RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs);
+RValue<UShort4> operator&(RValue<UShort4> lhs, RValue<UShort4> rhs);
+RValue<UShort4> operator|(RValue<UShort4> lhs, RValue<UShort4> rhs);
+RValue<UShort4> operator^(RValue<UShort4> lhs, RValue<UShort4> rhs);
+RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs);
+RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs);
 //	RValue<UShort4> operator+=(UShort4 &lhs, RValue<UShort4> rhs);
 //	RValue<UShort4> operator-=(UShort4 &lhs, RValue<UShort4> rhs);
 //	RValue<UShort4> operator*=(UShort4 &lhs, RValue<UShort4> rhs);
@@ -933,51 +933,51 @@
 //	RValue<UShort4> operator&=(UShort4 &lhs, RValue<UShort4> rhs);
 //	RValue<UShort4> operator|=(UShort4 &lhs, RValue<UShort4> rhs);
 //	RValue<UShort4> operator^=(UShort4 &lhs, RValue<UShort4> rhs);
-	RValue<UShort4> operator<<=(UShort4 &lhs, unsigned char rhs);
-	RValue<UShort4> operator>>=(UShort4 &lhs, unsigned char rhs);
+RValue<UShort4> operator<<=(UShort4 &lhs, unsigned char rhs);
+RValue<UShort4> operator>>=(UShort4 &lhs, unsigned char rhs);
 //	RValue<UShort4> operator+(RValue<UShort4> val);
 //	RValue<UShort4> operator-(RValue<UShort4> val);
-	RValue<UShort4> operator~(RValue<UShort4> val);
+RValue<UShort4> operator~(RValue<UShort4> val);
 //	RValue<UShort4> operator++(UShort4 &val, int);   // Post-increment
 //	const UShort4 &operator++(UShort4 &val);   // Pre-increment
 //	RValue<UShort4> operator--(UShort4 &val, int);   // Post-decrement
 //	const UShort4 &operator--(UShort4 &val);   // Pre-decrement
 
-	RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y);
-	RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y);
-	RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y);
-	RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y);
-	RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y);
-	RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y);
+RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y);
+RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y);
+RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y);
+RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y);
+RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y);
+RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y);
 
-	class Short8 : public LValue<Short8>
-	{
-	public:
-		Short8() = default;
-		Short8(short c);
-		Short8(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7);
-		Short8(RValue<Short8> rhs);
-	//	Short8(const Short8 &rhs);
-		Short8(const Reference<Short8> &rhs);
-		Short8(RValue<Short4> lo, RValue<Short4> hi);
+class Short8 : public LValue<Short8>
+{
+public:
+	Short8() = default;
+	Short8(short c);
+	Short8(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7);
+	Short8(RValue<Short8> rhs);
+//	Short8(const Short8 &rhs);
+	Short8(const Reference<Short8> &rhs);
+	Short8(RValue<Short4> lo, RValue<Short4> hi);
 
-		RValue<Short8> operator=(RValue<Short8> rhs);
-		RValue<Short8> operator=(const Short8 &rhs);
-		RValue<Short8> operator=(const Reference<Short8> &rhs);
+	RValue<Short8> operator=(RValue<Short8> rhs);
+	RValue<Short8> operator=(const Short8 &rhs);
+	RValue<Short8> operator=(const Reference<Short8> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Short8> operator+(RValue<Short8> lhs, RValue<Short8> rhs);
+RValue<Short8> operator+(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Short8> operator-(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Short8> operator*(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Short8> operator/(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Short8> operator%(RValue<Short8> lhs, RValue<Short8> rhs);
-	RValue<Short8> operator&(RValue<Short8> lhs, RValue<Short8> rhs);
+RValue<Short8> operator&(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Short8> operator|(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Short8> operator^(RValue<Short8> lhs, RValue<Short8> rhs);
-	RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs);
-	RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs);
+RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs);
+RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs);
 //	RValue<Short8> operator<<(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Short8> operator>>(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Short8> operator+=(Short8 &lhs, RValue<Short8> rhs);
@@ -1004,41 +1004,41 @@
 //	RValue<Bool> operator!=(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Bool> operator==(RValue<Short8> lhs, RValue<Short8> rhs);
 
-	RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y);
-	RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y);
-	RValue<Int4> Abs(RValue<Int4> x);
+RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y);
+RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y);
+RValue<Int4> Abs(RValue<Int4> x);
 
-	class UShort8 : public LValue<UShort8>
-	{
-	public:
-		UShort8() = default;
-		UShort8(unsigned short c);
-		UShort8(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7);
-		UShort8(RValue<UShort8> rhs);
-	//	UShort8(const UShort8 &rhs);
-		UShort8(const Reference<UShort8> &rhs);
-		UShort8(RValue<UShort4> lo, RValue<UShort4> hi);
+class UShort8 : public LValue<UShort8>
+{
+public:
+	UShort8() = default;
+	UShort8(unsigned short c);
+	UShort8(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7);
+	UShort8(RValue<UShort8> rhs);
+//	UShort8(const UShort8 &rhs);
+	UShort8(const Reference<UShort8> &rhs);
+	UShort8(RValue<UShort4> lo, RValue<UShort4> hi);
 
-		RValue<UShort8> operator=(RValue<UShort8> rhs);
-		RValue<UShort8> operator=(const UShort8 &rhs);
-		RValue<UShort8> operator=(const Reference<UShort8> &rhs);
+	RValue<UShort8> operator=(RValue<UShort8> rhs);
+	RValue<UShort8> operator=(const UShort8 &rhs);
+	RValue<UShort8> operator=(const Reference<UShort8> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<UShort8> operator+(RValue<UShort8> lhs, RValue<UShort8> rhs);
+RValue<UShort8> operator+(RValue<UShort8> lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator-(RValue<UShort8> lhs, RValue<UShort8> rhs);
-	RValue<UShort8> operator*(RValue<UShort8> lhs, RValue<UShort8> rhs);
+RValue<UShort8> operator*(RValue<UShort8> lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator/(RValue<UShort8> lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator%(RValue<UShort8> lhs, RValue<UShort8> rhs);
-	RValue<UShort8> operator&(RValue<UShort8> lhs, RValue<UShort8> rhs);
+RValue<UShort8> operator&(RValue<UShort8> lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator|(RValue<UShort8> lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator^(RValue<UShort8> lhs, RValue<UShort8> rhs);
-	RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs);
-	RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs);
+RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs);
+RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs);
 //	RValue<UShort8> operator<<(RValue<UShort8> lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator>>(RValue<UShort8> lhs, RValue<UShort8> rhs);
-	RValue<UShort8> operator+=(UShort8 &lhs, RValue<UShort8> rhs);
+RValue<UShort8> operator+=(UShort8 &lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator-=(UShort8 &lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator*=(UShort8 &lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator/=(UShort8 &lhs, RValue<UShort8> rhs);
@@ -1050,7 +1050,7 @@
 //	RValue<UShort8> operator>>=(UShort8 &lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator+(RValue<UShort8> val);
 //	RValue<UShort8> operator-(RValue<UShort8> val);
-	RValue<UShort8> operator~(RValue<UShort8> val);
+RValue<UShort8> operator~(RValue<UShort8> val);
 //	RValue<UShort8> operator++(UShort8 &val, int);   // Post-increment
 //	const UShort8 &operator++(UShort8 &val);   // Pre-increment
 //	RValue<UShort8> operator--(UShort8 &val, int);   // Post-decrement
@@ -1062,124 +1062,124 @@
 //	RValue<Bool> operator!=(RValue<UShort8> lhs, RValue<UShort8> rhs);
 //	RValue<Bool> operator==(RValue<UShort8> lhs, RValue<UShort8> rhs);
 
-	RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7);
-	RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y);
+RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7);
+RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y);
 
-	class Int : public LValue<Int>
-	{
-	public:
-		Int(Argument<Int> argument);
+class Int : public LValue<Int>
+{
+public:
+	Int(Argument<Int> argument);
 
-		explicit Int(RValue<Byte> cast);
-		explicit Int(RValue<SByte> cast);
-		explicit Int(RValue<Short> cast);
-		explicit Int(RValue<UShort> cast);
-		explicit Int(RValue<Int2> cast);
-		explicit Int(RValue<Long> cast);
-		explicit Int(RValue<Float> cast);
+	explicit Int(RValue<Byte> cast);
+	explicit Int(RValue<SByte> cast);
+	explicit Int(RValue<Short> cast);
+	explicit Int(RValue<UShort> cast);
+	explicit Int(RValue<Int2> cast);
+	explicit Int(RValue<Long> cast);
+	explicit Int(RValue<Float> cast);
 
-		Int() = default;
-		Int(int x);
-		Int(RValue<Int> rhs);
-		Int(RValue<UInt> rhs);
-		Int(const Int &rhs);
-		Int(const UInt &rhs);
-		Int(const Reference<Int> &rhs);
-		Int(const Reference<UInt> &rhs);
+	Int() = default;
+	Int(int x);
+	Int(RValue<Int> rhs);
+	Int(RValue<UInt> rhs);
+	Int(const Int &rhs);
+	Int(const UInt &rhs);
+	Int(const Reference<Int> &rhs);
+	Int(const Reference<UInt> &rhs);
 
-		RValue<Int> operator=(int rhs);
-		RValue<Int> operator=(RValue<Int> rhs);
-		RValue<Int> operator=(RValue<UInt> rhs);
-		RValue<Int> operator=(const Int &rhs);
-		RValue<Int> operator=(const UInt &rhs);
-		RValue<Int> operator=(const Reference<Int> &rhs);
-		RValue<Int> operator=(const Reference<UInt> &rhs);
+	RValue<Int> operator=(int rhs);
+	RValue<Int> operator=(RValue<Int> rhs);
+	RValue<Int> operator=(RValue<UInt> rhs);
+	RValue<Int> operator=(const Int &rhs);
+	RValue<Int> operator=(const UInt &rhs);
+	RValue<Int> operator=(const Reference<Int> &rhs);
+	RValue<Int> operator=(const Reference<UInt> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Int> operator+(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator-(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator*(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator/(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator%(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator&(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator|(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator^(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator<<(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator>>(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator+=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator-=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator*=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator/=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator%=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator&=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator|=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator^=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator<<=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator>>=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator+(RValue<Int> val);
-	RValue<Int> operator-(RValue<Int> val);
-	RValue<Int> operator~(RValue<Int> val);
-	RValue<Int> operator++(Int &val, int);   // Post-increment
-	const Int &operator++(Int &val);   // Pre-increment
-	RValue<Int> operator--(Int &val, int);   // Post-decrement
-	const Int &operator--(Int &val);   // Pre-decrement
-	RValue<Bool> operator<(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Bool> operator<=(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Bool> operator>(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Bool> operator>=(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Bool> operator!=(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Bool> operator==(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator+(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator-(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator*(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator/(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator%(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator&(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator|(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator^(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator<<(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator>>(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator+=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator-=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator*=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator/=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator%=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator&=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator|=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator^=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator<<=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator>>=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator+(RValue<Int> val);
+RValue<Int> operator-(RValue<Int> val);
+RValue<Int> operator~(RValue<Int> val);
+RValue<Int> operator++(Int &val, int);   // Post-increment
+const Int &operator++(Int &val);   // Pre-increment
+RValue<Int> operator--(Int &val, int);   // Post-decrement
+const Int &operator--(Int &val);   // Pre-decrement
+RValue<Bool> operator<(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Bool> operator<=(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Bool> operator>(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Bool> operator>=(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Bool> operator!=(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Bool> operator==(RValue<Int> lhs, RValue<Int> rhs);
 
-	RValue<Int> Max(RValue<Int> x, RValue<Int> y);
-	RValue<Int> Min(RValue<Int> x, RValue<Int> y);
-	RValue<Int> Clamp(RValue<Int> x, RValue<Int> min, RValue<Int> max);
-	RValue<Int> RoundInt(RValue<Float> cast);
+RValue<Int> Max(RValue<Int> x, RValue<Int> y);
+RValue<Int> Min(RValue<Int> x, RValue<Int> y);
+RValue<Int> Clamp(RValue<Int> x, RValue<Int> min, RValue<Int> max);
+RValue<Int> RoundInt(RValue<Float> cast);
 
-	class Long : public LValue<Long>
-	{
-	public:
-	//	Long(Argument<Long> argument);
+class Long : public LValue<Long>
+{
+public:
+//	Long(Argument<Long> argument);
 
-	//	explicit Long(RValue<Short> cast);
-	//	explicit Long(RValue<UShort> cast);
-		explicit Long(RValue<Int> cast);
-		explicit Long(RValue<UInt> cast);
-	//	explicit Long(RValue<Float> cast);
+//	explicit Long(RValue<Short> cast);
+//	explicit Long(RValue<UShort> cast);
+	explicit Long(RValue<Int> cast);
+	explicit Long(RValue<UInt> cast);
+//	explicit Long(RValue<Float> cast);
 
-		Long() = default;
-	//	Long(qword x);
-		Long(RValue<Long> rhs);
-	//	Long(RValue<ULong> rhs);
-	//	Long(const Long &rhs);
-	//	Long(const Reference<Long> &rhs);
-	//	Long(const ULong &rhs);
-	//	Long(const Reference<ULong> &rhs);
+	Long() = default;
+//	Long(qword x);
+	Long(RValue<Long> rhs);
+//	Long(RValue<ULong> rhs);
+//	Long(const Long &rhs);
+//	Long(const Reference<Long> &rhs);
+//	Long(const ULong &rhs);
+//	Long(const Reference<ULong> &rhs);
 
-		RValue<Long> operator=(int64_t rhs);
-		RValue<Long> operator=(RValue<Long> rhs);
-	//	RValue<Long> operator=(RValue<ULong> rhs);
-		RValue<Long> operator=(const Long &rhs);
-		RValue<Long> operator=(const Reference<Long> &rhs);
-	//	RValue<Long> operator=(const ULong &rhs);
-	//	RValue<Long> operator=(const Reference<ULong> &rhs);
+	RValue<Long> operator=(int64_t rhs);
+	RValue<Long> operator=(RValue<Long> rhs);
+//	RValue<Long> operator=(RValue<ULong> rhs);
+	RValue<Long> operator=(const Long &rhs);
+	RValue<Long> operator=(const Reference<Long> &rhs);
+//	RValue<Long> operator=(const ULong &rhs);
+//	RValue<Long> operator=(const Reference<ULong> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Long> operator+(RValue<Long> lhs, RValue<Long> rhs);
-	RValue<Long> operator-(RValue<Long> lhs, RValue<Long> rhs);
-	RValue<Long> operator*(RValue<Long> lhs, RValue<Long> rhs);
+RValue<Long> operator+(RValue<Long> lhs, RValue<Long> rhs);
+RValue<Long> operator-(RValue<Long> lhs, RValue<Long> rhs);
+RValue<Long> operator*(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator/(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator%(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator&(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator|(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator^(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator<<(RValue<Long> lhs, RValue<Long> rhs);
-	RValue<Long> operator>>(RValue<Long> lhs, RValue<Long> rhs);
-	RValue<Long> operator+=(Long &lhs, RValue<Long> rhs);
-	RValue<Long> operator-=(Long &lhs, RValue<Long> rhs);
+RValue<Long> operator>>(RValue<Long> lhs, RValue<Long> rhs);
+RValue<Long> operator+=(Long &lhs, RValue<Long> rhs);
+RValue<Long> operator-=(Long &lhs, RValue<Long> rhs);
 //	RValue<Long> operator*=(Long &lhs, RValue<Long> rhs);
 //	RValue<Long> operator/=(Long &lhs, RValue<Long> rhs);
 //	RValue<Long> operator%=(Long &lhs, RValue<Long> rhs);
@@ -1203,133 +1203,133 @@
 //	RValue<Bool> operator==(RValue<Long> lhs, RValue<Long> rhs);
 
 //	RValue<Long> RoundLong(RValue<Float> cast);
-	RValue<Long> AddAtomic( RValue<Pointer<Long>> x, RValue<Long> y);
+RValue<Long> AddAtomic( RValue<Pointer<Long>> x, RValue<Long> y);
 
-	class UInt : public LValue<UInt>
-	{
-	public:
-		UInt(Argument<UInt> argument);
+class UInt : public LValue<UInt>
+{
+public:
+	UInt(Argument<UInt> argument);
 
-		explicit UInt(RValue<UShort> cast);
-		explicit UInt(RValue<Long> cast);
-		explicit UInt(RValue<Float> cast);
+	explicit UInt(RValue<UShort> cast);
+	explicit UInt(RValue<Long> cast);
+	explicit UInt(RValue<Float> cast);
 
-		UInt() = default;
-		UInt(int x);
-		UInt(unsigned int x);
-		UInt(RValue<UInt> rhs);
-		UInt(RValue<Int> rhs);
-		UInt(const UInt &rhs);
-		UInt(const Int &rhs);
-		UInt(const Reference<UInt> &rhs);
-		UInt(const Reference<Int> &rhs);
+	UInt() = default;
+	UInt(int x);
+	UInt(unsigned int x);
+	UInt(RValue<UInt> rhs);
+	UInt(RValue<Int> rhs);
+	UInt(const UInt &rhs);
+	UInt(const Int &rhs);
+	UInt(const Reference<UInt> &rhs);
+	UInt(const Reference<Int> &rhs);
 
-		RValue<UInt> operator=(unsigned int rhs);
-		RValue<UInt> operator=(RValue<UInt> rhs);
-		RValue<UInt> operator=(RValue<Int> rhs);
-		RValue<UInt> operator=(const UInt &rhs);
-		RValue<UInt> operator=(const Int &rhs);
-		RValue<UInt> operator=(const Reference<UInt> &rhs);
-		RValue<UInt> operator=(const Reference<Int> &rhs);
+	RValue<UInt> operator=(unsigned int rhs);
+	RValue<UInt> operator=(RValue<UInt> rhs);
+	RValue<UInt> operator=(RValue<Int> rhs);
+	RValue<UInt> operator=(const UInt &rhs);
+	RValue<UInt> operator=(const Int &rhs);
+	RValue<UInt> operator=(const Reference<UInt> &rhs);
+	RValue<UInt> operator=(const Reference<Int> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<UInt> operator+(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator-(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator*(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator/(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator%(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator&(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator|(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator^(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator<<(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator>>(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator+=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator-=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator*=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator/=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator%=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator&=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator|=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator^=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator<<=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator>>=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator+(RValue<UInt> val);
-	RValue<UInt> operator-(RValue<UInt> val);
-	RValue<UInt> operator~(RValue<UInt> val);
-	RValue<UInt> operator++(UInt &val, int);   // Post-increment
-	const UInt &operator++(UInt &val);   // Pre-increment
-	RValue<UInt> operator--(UInt &val, int);   // Post-decrement
-	const UInt &operator--(UInt &val);   // Pre-decrement
-	RValue<Bool> operator<(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<Bool> operator<=(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<Bool> operator>(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<Bool> operator>=(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<Bool> operator!=(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<Bool> operator==(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator+(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator-(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator*(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator/(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator%(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator&(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator|(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator^(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator<<(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator>>(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator+=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator-=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator*=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator/=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator%=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator&=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator|=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator^=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator<<=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator>>=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator+(RValue<UInt> val);
+RValue<UInt> operator-(RValue<UInt> val);
+RValue<UInt> operator~(RValue<UInt> val);
+RValue<UInt> operator++(UInt &val, int);   // Post-increment
+const UInt &operator++(UInt &val);   // Pre-increment
+RValue<UInt> operator--(UInt &val, int);   // Post-decrement
+const UInt &operator--(UInt &val);   // Pre-decrement
+RValue<Bool> operator<(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<Bool> operator<=(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<Bool> operator>(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<Bool> operator>=(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<Bool> operator!=(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<Bool> operator==(RValue<UInt> lhs, RValue<UInt> rhs);
 
-	RValue<UInt> Max(RValue<UInt> x, RValue<UInt> y);
-	RValue<UInt> Min(RValue<UInt> x, RValue<UInt> y);
-	RValue<UInt> Clamp(RValue<UInt> x, RValue<UInt> min, RValue<UInt> max);
+RValue<UInt> Max(RValue<UInt> x, RValue<UInt> y);
+RValue<UInt> Min(RValue<UInt> x, RValue<UInt> y);
+RValue<UInt> Clamp(RValue<UInt> x, RValue<UInt> min, RValue<UInt> max);
 
-	RValue<UInt> AddAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-	RValue<UInt> SubAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-	RValue<UInt> AndAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-	RValue<UInt> OrAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-	RValue<UInt> XorAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-	RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder);
-	RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder);
-	RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-	RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-	RValue<UInt> ExchangeAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-	RValue<UInt> CompareExchangeAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, RValue<UInt> compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal);
+RValue<UInt> AddAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
+RValue<UInt> SubAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
+RValue<UInt> AndAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
+RValue<UInt> OrAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
+RValue<UInt> XorAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
+RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder);
+RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder);
+RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
+RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
+RValue<UInt> ExchangeAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
+RValue<UInt> CompareExchangeAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, RValue<UInt> compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal);
 
 //	RValue<UInt> RoundUInt(RValue<Float> cast);
 
-	class Int2 : public LValue<Int2>
-	{
-	public:
-	//	explicit Int2(RValue<Int> cast);
-		explicit Int2(RValue<Int4> cast);
+class Int2 : public LValue<Int2>
+{
+public:
+//	explicit Int2(RValue<Int> cast);
+	explicit Int2(RValue<Int4> cast);
 
-		Int2() = default;
-		Int2(int x, int y);
-		Int2(RValue<Int2> rhs);
-		Int2(const Int2 &rhs);
-		Int2(const Reference<Int2> &rhs);
-		Int2(RValue<Int> lo, RValue<Int> hi);
+	Int2() = default;
+	Int2(int x, int y);
+	Int2(RValue<Int2> rhs);
+	Int2(const Int2 &rhs);
+	Int2(const Reference<Int2> &rhs);
+	Int2(RValue<Int> lo, RValue<Int> hi);
 
-		RValue<Int2> operator=(RValue<Int2> rhs);
-		RValue<Int2> operator=(const Int2 &rhs);
-		RValue<Int2> operator=(const Reference<Int2> &rhs);
+	RValue<Int2> operator=(RValue<Int2> rhs);
+	RValue<Int2> operator=(const Int2 &rhs);
+	RValue<Int2> operator=(const Reference<Int2> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Int2> operator+(RValue<Int2> lhs, RValue<Int2> rhs);
-	RValue<Int2> operator-(RValue<Int2> lhs, RValue<Int2> rhs);
+RValue<Int2> operator+(RValue<Int2> lhs, RValue<Int2> rhs);
+RValue<Int2> operator-(RValue<Int2> lhs, RValue<Int2> rhs);
 //	RValue<Int2> operator*(RValue<Int2> lhs, RValue<Int2> rhs);
 //	RValue<Int2> operator/(RValue<Int2> lhs, RValue<Int2> rhs);
 //	RValue<Int2> operator%(RValue<Int2> lhs, RValue<Int2> rhs);
-	RValue<Int2> operator&(RValue<Int2> lhs, RValue<Int2> rhs);
-	RValue<Int2> operator|(RValue<Int2> lhs, RValue<Int2> rhs);
-	RValue<Int2> operator^(RValue<Int2> lhs, RValue<Int2> rhs);
-	RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs);
-	RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs);
-	RValue<Int2> operator+=(Int2 &lhs, RValue<Int2> rhs);
-	RValue<Int2> operator-=(Int2 &lhs, RValue<Int2> rhs);
+RValue<Int2> operator&(RValue<Int2> lhs, RValue<Int2> rhs);
+RValue<Int2> operator|(RValue<Int2> lhs, RValue<Int2> rhs);
+RValue<Int2> operator^(RValue<Int2> lhs, RValue<Int2> rhs);
+RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs);
+RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs);
+RValue<Int2> operator+=(Int2 &lhs, RValue<Int2> rhs);
+RValue<Int2> operator-=(Int2 &lhs, RValue<Int2> rhs);
 //	RValue<Int2> operator*=(Int2 &lhs, RValue<Int2> rhs);
 //	RValue<Int2> operator/=(Int2 &lhs, RValue<Int2> rhs);
 //	RValue<Int2> operator%=(Int2 &lhs, RValue<Int2> rhs);
-	RValue<Int2> operator&=(Int2 &lhs, RValue<Int2> rhs);
-	RValue<Int2> operator|=(Int2 &lhs, RValue<Int2> rhs);
-	RValue<Int2> operator^=(Int2 &lhs, RValue<Int2> rhs);
-	RValue<Int2> operator<<=(Int2 &lhs, unsigned char rhs);
-	RValue<Int2> operator>>=(Int2 &lhs, unsigned char rhs);
+RValue<Int2> operator&=(Int2 &lhs, RValue<Int2> rhs);
+RValue<Int2> operator|=(Int2 &lhs, RValue<Int2> rhs);
+RValue<Int2> operator^=(Int2 &lhs, RValue<Int2> rhs);
+RValue<Int2> operator<<=(Int2 &lhs, unsigned char rhs);
+RValue<Int2> operator>>=(Int2 &lhs, unsigned char rhs);
 //	RValue<Int2> operator+(RValue<Int2> val);
 //	RValue<Int2> operator-(RValue<Int2> val);
-	RValue<Int2> operator~(RValue<Int2> val);
+RValue<Int2> operator~(RValue<Int2> val);
 //	RValue<Int2> operator++(Int2 &val, int);   // Post-increment
 //	const Int2 &operator++(Int2 &val);   // Pre-increment
 //	RValue<Int2> operator--(Int2 &val, int);   // Post-decrement
@@ -1342,50 +1342,50 @@
 //	RValue<Bool> operator==(RValue<Int2> lhs, RValue<Int2> rhs);
 
 //	RValue<Int2> RoundInt(RValue<Float4> cast);
-	RValue<Short4> UnpackLow(RValue<Int2> x, RValue<Int2> y);
-	RValue<Short4> UnpackHigh(RValue<Int2> x, RValue<Int2> y);
-	RValue<Int> Extract(RValue<Int2> val, int i);
-	RValue<Int2> Insert(RValue<Int2> val, RValue<Int> element, int i);
+RValue<Short4> UnpackLow(RValue<Int2> x, RValue<Int2> y);
+RValue<Short4> UnpackHigh(RValue<Int2> x, RValue<Int2> y);
+RValue<Int> Extract(RValue<Int2> val, int i);
+RValue<Int2> Insert(RValue<Int2> val, RValue<Int> element, int i);
 
-	class UInt2 : public LValue<UInt2>
-	{
-	public:
-		UInt2() = default;
-		UInt2(unsigned int x, unsigned int y);
-		UInt2(RValue<UInt2> rhs);
-		UInt2(const UInt2 &rhs);
-		UInt2(const Reference<UInt2> &rhs);
+class UInt2 : public LValue<UInt2>
+{
+public:
+	UInt2() = default;
+	UInt2(unsigned int x, unsigned int y);
+	UInt2(RValue<UInt2> rhs);
+	UInt2(const UInt2 &rhs);
+	UInt2(const Reference<UInt2> &rhs);
 
-		RValue<UInt2> operator=(RValue<UInt2> rhs);
-		RValue<UInt2> operator=(const UInt2 &rhs);
-		RValue<UInt2> operator=(const Reference<UInt2> &rhs);
+	RValue<UInt2> operator=(RValue<UInt2> rhs);
+	RValue<UInt2> operator=(const UInt2 &rhs);
+	RValue<UInt2> operator=(const Reference<UInt2> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<UInt2> operator+(RValue<UInt2> lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator-(RValue<UInt2> lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator+(RValue<UInt2> lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator-(RValue<UInt2> lhs, RValue<UInt2> rhs);
 //	RValue<UInt2> operator*(RValue<UInt2> lhs, RValue<UInt2> rhs);
 //	RValue<UInt2> operator/(RValue<UInt2> lhs, RValue<UInt2> rhs);
 //	RValue<UInt2> operator%(RValue<UInt2> lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator&(RValue<UInt2> lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator|(RValue<UInt2> lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator^(RValue<UInt2> lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs);
-	RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs);
-	RValue<UInt2> operator+=(UInt2 &lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator-=(UInt2 &lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator&(RValue<UInt2> lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator|(RValue<UInt2> lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator^(RValue<UInt2> lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs);
+RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs);
+RValue<UInt2> operator+=(UInt2 &lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator-=(UInt2 &lhs, RValue<UInt2> rhs);
 //	RValue<UInt2> operator*=(UInt2 &lhs, RValue<UInt2> rhs);
 //	RValue<UInt2> operator/=(UInt2 &lhs, RValue<UInt2> rhs);
 //	RValue<UInt2> operator%=(UInt2 &lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator&=(UInt2 &lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator|=(UInt2 &lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator^=(UInt2 &lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator<<=(UInt2 &lhs, unsigned char rhs);
-	RValue<UInt2> operator>>=(UInt2 &lhs, unsigned char rhs);
+RValue<UInt2> operator&=(UInt2 &lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator|=(UInt2 &lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator^=(UInt2 &lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator<<=(UInt2 &lhs, unsigned char rhs);
+RValue<UInt2> operator>>=(UInt2 &lhs, unsigned char rhs);
 //	RValue<UInt2> operator+(RValue<UInt2> val);
 //	RValue<UInt2> operator-(RValue<UInt2> val);
-	RValue<UInt2> operator~(RValue<UInt2> val);
+RValue<UInt2> operator~(RValue<UInt2> val);
 //	RValue<UInt2> operator++(UInt2 &val, int);   // Post-increment
 //	const UInt2 &operator++(UInt2 &val);   // Pre-increment
 //	RValue<UInt2> operator--(UInt2 &val, int);   // Post-decrement
@@ -1398,516 +1398,516 @@
 //	RValue<Bool> operator==(RValue<UInt2> lhs, RValue<UInt2> rhs);
 
 //	RValue<UInt2> RoundInt(RValue<Float4> cast);
-	RValue<UInt> Extract(RValue<UInt2> val, int i);
-	RValue<UInt2> Insert(RValue<UInt2> val, RValue<UInt> element, int i);
+RValue<UInt> Extract(RValue<UInt2> val, int i);
+RValue<UInt2> Insert(RValue<UInt2> val, RValue<UInt> element, int i);
 
-	template<class T>
-	struct Scalar;
+template<class T>
+struct Scalar;
 
-	template<class Vector4>
-	struct XYZW;
+template<class Vector4>
+struct XYZW;
 
-	template<class Vector4, int T>
-	class Swizzle2
+template<class Vector4, int T>
+class Swizzle2
+{
+	friend Vector4;
+
+public:
+	operator RValue<Vector4>() const;
+
+private:
+	Vector4 *parent;
+};
+
+template<class Vector4, int T>
+class Swizzle4
+{
+public:
+	operator RValue<Vector4>() const;
+
+private:
+	Vector4 *parent;
+};
+
+template<class Vector4, int T>
+class SwizzleMask4
+{
+	friend XYZW<Vector4>;
+
+public:
+	operator RValue<Vector4>() const;
+
+	RValue<Vector4> operator=(RValue<Vector4> rhs);
+	RValue<Vector4> operator=(RValue<typename Scalar<Vector4>::Type> rhs);
+
+private:
+	Vector4 *parent;
+};
+
+template<>
+struct Scalar<Float4>
+{
+	using Type = Float;
+};
+
+template<>
+struct Scalar<Int4>
+{
+	using Type = Int;
+};
+
+template<>
+struct Scalar<UInt4>
+{
+	using Type = UInt;
+};
+
+template<class Vector4, int T>
+class SwizzleMask1
+{
+public:
+	operator RValue<typename Scalar<Vector4>::Type>() const;
+	operator RValue<Vector4>() const;
+
+	RValue<Vector4> operator=(float x);
+	RValue<Vector4> operator=(RValue<Vector4> rhs);
+	RValue<Vector4> operator=(RValue<typename Scalar<Vector4>::Type> rhs);
+
+private:
+	Vector4 *parent;
+};
+
+template<class Vector4, int T>
+class SwizzleMask2
+{
+	friend class Float4;
+
+public:
+	operator RValue<Vector4>() const;
+
+	RValue<Vector4> operator=(RValue<Vector4> rhs);
+
+private:
+	Float4 *parent;
+};
+
+template<class Vector4>
+struct XYZW
+{
+	friend Vector4;
+
+private:
+	XYZW(Vector4 *parent)
 	{
-		friend Vector4;
+		xyzw.parent = parent;
+	}
 
-	public:
-		operator RValue<Vector4>() const;
-
-	private:
-		Vector4 *parent;
-	};
-
-	template<class Vector4, int T>
-	class Swizzle4
+public:
+	union
 	{
-	public:
-		operator RValue<Vector4>() const;
-
-	private:
-		Vector4 *parent;
+		SwizzleMask1<Vector4, 0x0000> x;
+		SwizzleMask1<Vector4, 0x1111> y;
+		SwizzleMask1<Vector4, 0x2222> z;
+		SwizzleMask1<Vector4, 0x3333> w;
+		Swizzle2<Vector4, 0x0000>     xx;
+		Swizzle2<Vector4, 0x1000>     yx;
+		Swizzle2<Vector4, 0x2000>     zx;
+		Swizzle2<Vector4, 0x3000>     wx;
+		SwizzleMask2<Vector4, 0x0111> xy;
+		Swizzle2<Vector4, 0x1111>     yy;
+		Swizzle2<Vector4, 0x2111>     zy;
+		Swizzle2<Vector4, 0x3111>     wy;
+		SwizzleMask2<Vector4, 0x0222> xz;
+		SwizzleMask2<Vector4, 0x1222> yz;
+		Swizzle2<Vector4, 0x2222>     zz;
+		Swizzle2<Vector4, 0x3222>     wz;
+		SwizzleMask2<Vector4, 0x0333> xw;
+		SwizzleMask2<Vector4, 0x1333> yw;
+		SwizzleMask2<Vector4, 0x2333> zw;
+		Swizzle2<Vector4, 0x3333>     ww;
+		Swizzle4<Vector4, 0x0000>     xxx;
+		Swizzle4<Vector4, 0x1000>     yxx;
+		Swizzle4<Vector4, 0x2000>     zxx;
+		Swizzle4<Vector4, 0x3000>     wxx;
+		Swizzle4<Vector4, 0x0100>     xyx;
+		Swizzle4<Vector4, 0x1100>     yyx;
+		Swizzle4<Vector4, 0x2100>     zyx;
+		Swizzle4<Vector4, 0x3100>     wyx;
+		Swizzle4<Vector4, 0x0200>     xzx;
+		Swizzle4<Vector4, 0x1200>     yzx;
+		Swizzle4<Vector4, 0x2200>     zzx;
+		Swizzle4<Vector4, 0x3200>     wzx;
+		Swizzle4<Vector4, 0x0300>     xwx;
+		Swizzle4<Vector4, 0x1300>     ywx;
+		Swizzle4<Vector4, 0x2300>     zwx;
+		Swizzle4<Vector4, 0x3300>     wwx;
+		Swizzle4<Vector4, 0x0011>     xxy;
+		Swizzle4<Vector4, 0x1011>     yxy;
+		Swizzle4<Vector4, 0x2011>     zxy;
+		Swizzle4<Vector4, 0x3011>     wxy;
+		Swizzle4<Vector4, 0x0111>     xyy;
+		Swizzle4<Vector4, 0x1111>     yyy;
+		Swizzle4<Vector4, 0x2111>     zyy;
+		Swizzle4<Vector4, 0x3111>     wyy;
+		Swizzle4<Vector4, 0x0211>     xzy;
+		Swizzle4<Vector4, 0x1211>     yzy;
+		Swizzle4<Vector4, 0x2211>     zzy;
+		Swizzle4<Vector4, 0x3211>     wzy;
+		Swizzle4<Vector4, 0x0311>     xwy;
+		Swizzle4<Vector4, 0x1311>     ywy;
+		Swizzle4<Vector4, 0x2311>     zwy;
+		Swizzle4<Vector4, 0x3311>     wwy;
+		Swizzle4<Vector4, 0x0022>     xxz;
+		Swizzle4<Vector4, 0x1022>     yxz;
+		Swizzle4<Vector4, 0x2022>     zxz;
+		Swizzle4<Vector4, 0x3022>     wxz;
+		SwizzleMask4<Vector4, 0x0122> xyz;
+		Swizzle4<Vector4, 0x1122>     yyz;
+		Swizzle4<Vector4, 0x2122>     zyz;
+		Swizzle4<Vector4, 0x3122>     wyz;
+		Swizzle4<Vector4, 0x0222>     xzz;
+		Swizzle4<Vector4, 0x1222>     yzz;
+		Swizzle4<Vector4, 0x2222>     zzz;
+		Swizzle4<Vector4, 0x3222>     wzz;
+		Swizzle4<Vector4, 0x0322>     xwz;
+		Swizzle4<Vector4, 0x1322>     ywz;
+		Swizzle4<Vector4, 0x2322>     zwz;
+		Swizzle4<Vector4, 0x3322>     wwz;
+		Swizzle4<Vector4, 0x0033>     xxw;
+		Swizzle4<Vector4, 0x1033>     yxw;
+		Swizzle4<Vector4, 0x2033>     zxw;
+		Swizzle4<Vector4, 0x3033>     wxw;
+		SwizzleMask4<Vector4, 0x0133> xyw;
+		Swizzle4<Vector4, 0x1133>     yyw;
+		Swizzle4<Vector4, 0x2133>     zyw;
+		Swizzle4<Vector4, 0x3133>     wyw;
+		SwizzleMask4<Vector4, 0x0233> xzw;
+		SwizzleMask4<Vector4, 0x1233> yzw;
+		Swizzle4<Vector4, 0x2233>     zzw;
+		Swizzle4<Vector4, 0x3233>     wzw;
+		Swizzle4<Vector4, 0x0333>     xww;
+		Swizzle4<Vector4, 0x1333>     yww;
+		Swizzle4<Vector4, 0x2333>     zww;
+		Swizzle4<Vector4, 0x3333>     www;
+		Swizzle4<Vector4, 0x0000>     xxxx;
+		Swizzle4<Vector4, 0x1000>     yxxx;
+		Swizzle4<Vector4, 0x2000>     zxxx;
+		Swizzle4<Vector4, 0x3000>     wxxx;
+		Swizzle4<Vector4, 0x0100>     xyxx;
+		Swizzle4<Vector4, 0x1100>     yyxx;
+		Swizzle4<Vector4, 0x2100>     zyxx;
+		Swizzle4<Vector4, 0x3100>     wyxx;
+		Swizzle4<Vector4, 0x0200>     xzxx;
+		Swizzle4<Vector4, 0x1200>     yzxx;
+		Swizzle4<Vector4, 0x2200>     zzxx;
+		Swizzle4<Vector4, 0x3200>     wzxx;
+		Swizzle4<Vector4, 0x0300>     xwxx;
+		Swizzle4<Vector4, 0x1300>     ywxx;
+		Swizzle4<Vector4, 0x2300>     zwxx;
+		Swizzle4<Vector4, 0x3300>     wwxx;
+		Swizzle4<Vector4, 0x0010>     xxyx;
+		Swizzle4<Vector4, 0x1010>     yxyx;
+		Swizzle4<Vector4, 0x2010>     zxyx;
+		Swizzle4<Vector4, 0x3010>     wxyx;
+		Swizzle4<Vector4, 0x0110>     xyyx;
+		Swizzle4<Vector4, 0x1110>     yyyx;
+		Swizzle4<Vector4, 0x2110>     zyyx;
+		Swizzle4<Vector4, 0x3110>     wyyx;
+		Swizzle4<Vector4, 0x0210>     xzyx;
+		Swizzle4<Vector4, 0x1210>     yzyx;
+		Swizzle4<Vector4, 0x2210>     zzyx;
+		Swizzle4<Vector4, 0x3210>     wzyx;
+		Swizzle4<Vector4, 0x0310>     xwyx;
+		Swizzle4<Vector4, 0x1310>     ywyx;
+		Swizzle4<Vector4, 0x2310>     zwyx;
+		Swizzle4<Vector4, 0x3310>     wwyx;
+		Swizzle4<Vector4, 0x0020>     xxzx;
+		Swizzle4<Vector4, 0x1020>     yxzx;
+		Swizzle4<Vector4, 0x2020>     zxzx;
+		Swizzle4<Vector4, 0x3020>     wxzx;
+		Swizzle4<Vector4, 0x0120>     xyzx;
+		Swizzle4<Vector4, 0x1120>     yyzx;
+		Swizzle4<Vector4, 0x2120>     zyzx;
+		Swizzle4<Vector4, 0x3120>     wyzx;
+		Swizzle4<Vector4, 0x0220>     xzzx;
+		Swizzle4<Vector4, 0x1220>     yzzx;
+		Swizzle4<Vector4, 0x2220>     zzzx;
+		Swizzle4<Vector4, 0x3220>     wzzx;
+		Swizzle4<Vector4, 0x0320>     xwzx;
+		Swizzle4<Vector4, 0x1320>     ywzx;
+		Swizzle4<Vector4, 0x2320>     zwzx;
+		Swizzle4<Vector4, 0x3320>     wwzx;
+		Swizzle4<Vector4, 0x0030>     xxwx;
+		Swizzle4<Vector4, 0x1030>     yxwx;
+		Swizzle4<Vector4, 0x2030>     zxwx;
+		Swizzle4<Vector4, 0x3030>     wxwx;
+		Swizzle4<Vector4, 0x0130>     xywx;
+		Swizzle4<Vector4, 0x1130>     yywx;
+		Swizzle4<Vector4, 0x2130>     zywx;
+		Swizzle4<Vector4, 0x3130>     wywx;
+		Swizzle4<Vector4, 0x0230>     xzwx;
+		Swizzle4<Vector4, 0x1230>     yzwx;
+		Swizzle4<Vector4, 0x2230>     zzwx;
+		Swizzle4<Vector4, 0x3230>     wzwx;
+		Swizzle4<Vector4, 0x0330>     xwwx;
+		Swizzle4<Vector4, 0x1330>     ywwx;
+		Swizzle4<Vector4, 0x2330>     zwwx;
+		Swizzle4<Vector4, 0x3330>     wwwx;
+		Swizzle4<Vector4, 0x0001>     xxxy;
+		Swizzle4<Vector4, 0x1001>     yxxy;
+		Swizzle4<Vector4, 0x2001>     zxxy;
+		Swizzle4<Vector4, 0x3001>     wxxy;
+		Swizzle4<Vector4, 0x0101>     xyxy;
+		Swizzle4<Vector4, 0x1101>     yyxy;
+		Swizzle4<Vector4, 0x2101>     zyxy;
+		Swizzle4<Vector4, 0x3101>     wyxy;
+		Swizzle4<Vector4, 0x0201>     xzxy;
+		Swizzle4<Vector4, 0x1201>     yzxy;
+		Swizzle4<Vector4, 0x2201>     zzxy;
+		Swizzle4<Vector4, 0x3201>     wzxy;
+		Swizzle4<Vector4, 0x0301>     xwxy;
+		Swizzle4<Vector4, 0x1301>     ywxy;
+		Swizzle4<Vector4, 0x2301>     zwxy;
+		Swizzle4<Vector4, 0x3301>     wwxy;
+		Swizzle4<Vector4, 0x0011>     xxyy;
+		Swizzle4<Vector4, 0x1011>     yxyy;
+		Swizzle4<Vector4, 0x2011>     zxyy;
+		Swizzle4<Vector4, 0x3011>     wxyy;
+		Swizzle4<Vector4, 0x0111>     xyyy;
+		Swizzle4<Vector4, 0x1111>     yyyy;
+		Swizzle4<Vector4, 0x2111>     zyyy;
+		Swizzle4<Vector4, 0x3111>     wyyy;
+		Swizzle4<Vector4, 0x0211>     xzyy;
+		Swizzle4<Vector4, 0x1211>     yzyy;
+		Swizzle4<Vector4, 0x2211>     zzyy;
+		Swizzle4<Vector4, 0x3211>     wzyy;
+		Swizzle4<Vector4, 0x0311>     xwyy;
+		Swizzle4<Vector4, 0x1311>     ywyy;
+		Swizzle4<Vector4, 0x2311>     zwyy;
+		Swizzle4<Vector4, 0x3311>     wwyy;
+		Swizzle4<Vector4, 0x0021>     xxzy;
+		Swizzle4<Vector4, 0x1021>     yxzy;
+		Swizzle4<Vector4, 0x2021>     zxzy;
+		Swizzle4<Vector4, 0x3021>     wxzy;
+		Swizzle4<Vector4, 0x0121>     xyzy;
+		Swizzle4<Vector4, 0x1121>     yyzy;
+		Swizzle4<Vector4, 0x2121>     zyzy;
+		Swizzle4<Vector4, 0x3121>     wyzy;
+		Swizzle4<Vector4, 0x0221>     xzzy;
+		Swizzle4<Vector4, 0x1221>     yzzy;
+		Swizzle4<Vector4, 0x2221>     zzzy;
+		Swizzle4<Vector4, 0x3221>     wzzy;
+		Swizzle4<Vector4, 0x0321>     xwzy;
+		Swizzle4<Vector4, 0x1321>     ywzy;
+		Swizzle4<Vector4, 0x2321>     zwzy;
+		Swizzle4<Vector4, 0x3321>     wwzy;
+		Swizzle4<Vector4, 0x0031>     xxwy;
+		Swizzle4<Vector4, 0x1031>     yxwy;
+		Swizzle4<Vector4, 0x2031>     zxwy;
+		Swizzle4<Vector4, 0x3031>     wxwy;
+		Swizzle4<Vector4, 0x0131>     xywy;
+		Swizzle4<Vector4, 0x1131>     yywy;
+		Swizzle4<Vector4, 0x2131>     zywy;
+		Swizzle4<Vector4, 0x3131>     wywy;
+		Swizzle4<Vector4, 0x0231>     xzwy;
+		Swizzle4<Vector4, 0x1231>     yzwy;
+		Swizzle4<Vector4, 0x2231>     zzwy;
+		Swizzle4<Vector4, 0x3231>     wzwy;
+		Swizzle4<Vector4, 0x0331>     xwwy;
+		Swizzle4<Vector4, 0x1331>     ywwy;
+		Swizzle4<Vector4, 0x2331>     zwwy;
+		Swizzle4<Vector4, 0x3331>     wwwy;
+		Swizzle4<Vector4, 0x0002>     xxxz;
+		Swizzle4<Vector4, 0x1002>     yxxz;
+		Swizzle4<Vector4, 0x2002>     zxxz;
+		Swizzle4<Vector4, 0x3002>     wxxz;
+		Swizzle4<Vector4, 0x0102>     xyxz;
+		Swizzle4<Vector4, 0x1102>     yyxz;
+		Swizzle4<Vector4, 0x2102>     zyxz;
+		Swizzle4<Vector4, 0x3102>     wyxz;
+		Swizzle4<Vector4, 0x0202>     xzxz;
+		Swizzle4<Vector4, 0x1202>     yzxz;
+		Swizzle4<Vector4, 0x2202>     zzxz;
+		Swizzle4<Vector4, 0x3202>     wzxz;
+		Swizzle4<Vector4, 0x0302>     xwxz;
+		Swizzle4<Vector4, 0x1302>     ywxz;
+		Swizzle4<Vector4, 0x2302>     zwxz;
+		Swizzle4<Vector4, 0x3302>     wwxz;
+		Swizzle4<Vector4, 0x0012>     xxyz;
+		Swizzle4<Vector4, 0x1012>     yxyz;
+		Swizzle4<Vector4, 0x2012>     zxyz;
+		Swizzle4<Vector4, 0x3012>     wxyz;
+		Swizzle4<Vector4, 0x0112>     xyyz;
+		Swizzle4<Vector4, 0x1112>     yyyz;
+		Swizzle4<Vector4, 0x2112>     zyyz;
+		Swizzle4<Vector4, 0x3112>     wyyz;
+		Swizzle4<Vector4, 0x0212>     xzyz;
+		Swizzle4<Vector4, 0x1212>     yzyz;
+		Swizzle4<Vector4, 0x2212>     zzyz;
+		Swizzle4<Vector4, 0x3212>     wzyz;
+		Swizzle4<Vector4, 0x0312>     xwyz;
+		Swizzle4<Vector4, 0x1312>     ywyz;
+		Swizzle4<Vector4, 0x2312>     zwyz;
+		Swizzle4<Vector4, 0x3312>     wwyz;
+		Swizzle4<Vector4, 0x0022>     xxzz;
+		Swizzle4<Vector4, 0x1022>     yxzz;
+		Swizzle4<Vector4, 0x2022>     zxzz;
+		Swizzle4<Vector4, 0x3022>     wxzz;
+		Swizzle4<Vector4, 0x0122>     xyzz;
+		Swizzle4<Vector4, 0x1122>     yyzz;
+		Swizzle4<Vector4, 0x2122>     zyzz;
+		Swizzle4<Vector4, 0x3122>     wyzz;
+		Swizzle4<Vector4, 0x0222>     xzzz;
+		Swizzle4<Vector4, 0x1222>     yzzz;
+		Swizzle4<Vector4, 0x2222>     zzzz;
+		Swizzle4<Vector4, 0x3222>     wzzz;
+		Swizzle4<Vector4, 0x0322>     xwzz;
+		Swizzle4<Vector4, 0x1322>     ywzz;
+		Swizzle4<Vector4, 0x2322>     zwzz;
+		Swizzle4<Vector4, 0x3322>     wwzz;
+		Swizzle4<Vector4, 0x0032>     xxwz;
+		Swizzle4<Vector4, 0x1032>     yxwz;
+		Swizzle4<Vector4, 0x2032>     zxwz;
+		Swizzle4<Vector4, 0x3032>     wxwz;
+		Swizzle4<Vector4, 0x0132>     xywz;
+		Swizzle4<Vector4, 0x1132>     yywz;
+		Swizzle4<Vector4, 0x2132>     zywz;
+		Swizzle4<Vector4, 0x3132>     wywz;
+		Swizzle4<Vector4, 0x0232>     xzwz;
+		Swizzle4<Vector4, 0x1232>     yzwz;
+		Swizzle4<Vector4, 0x2232>     zzwz;
+		Swizzle4<Vector4, 0x3232>     wzwz;
+		Swizzle4<Vector4, 0x0332>     xwwz;
+		Swizzle4<Vector4, 0x1332>     ywwz;
+		Swizzle4<Vector4, 0x2332>     zwwz;
+		Swizzle4<Vector4, 0x3332>     wwwz;
+		Swizzle4<Vector4, 0x0003>     xxxw;
+		Swizzle4<Vector4, 0x1003>     yxxw;
+		Swizzle4<Vector4, 0x2003>     zxxw;
+		Swizzle4<Vector4, 0x3003>     wxxw;
+		Swizzle4<Vector4, 0x0103>     xyxw;
+		Swizzle4<Vector4, 0x1103>     yyxw;
+		Swizzle4<Vector4, 0x2103>     zyxw;
+		Swizzle4<Vector4, 0x3103>     wyxw;
+		Swizzle4<Vector4, 0x0203>     xzxw;
+		Swizzle4<Vector4, 0x1203>     yzxw;
+		Swizzle4<Vector4, 0x2203>     zzxw;
+		Swizzle4<Vector4, 0x3203>     wzxw;
+		Swizzle4<Vector4, 0x0303>     xwxw;
+		Swizzle4<Vector4, 0x1303>     ywxw;
+		Swizzle4<Vector4, 0x2303>     zwxw;
+		Swizzle4<Vector4, 0x3303>     wwxw;
+		Swizzle4<Vector4, 0x0013>     xxyw;
+		Swizzle4<Vector4, 0x1013>     yxyw;
+		Swizzle4<Vector4, 0x2013>     zxyw;
+		Swizzle4<Vector4, 0x3013>     wxyw;
+		Swizzle4<Vector4, 0x0113>     xyyw;
+		Swizzle4<Vector4, 0x1113>     yyyw;
+		Swizzle4<Vector4, 0x2113>     zyyw;
+		Swizzle4<Vector4, 0x3113>     wyyw;
+		Swizzle4<Vector4, 0x0213>     xzyw;
+		Swizzle4<Vector4, 0x1213>     yzyw;
+		Swizzle4<Vector4, 0x2213>     zzyw;
+		Swizzle4<Vector4, 0x3213>     wzyw;
+		Swizzle4<Vector4, 0x0313>     xwyw;
+		Swizzle4<Vector4, 0x1313>     ywyw;
+		Swizzle4<Vector4, 0x2313>     zwyw;
+		Swizzle4<Vector4, 0x3313>     wwyw;
+		Swizzle4<Vector4, 0x0023>     xxzw;
+		Swizzle4<Vector4, 0x1023>     yxzw;
+		Swizzle4<Vector4, 0x2023>     zxzw;
+		Swizzle4<Vector4, 0x3023>     wxzw;
+		SwizzleMask4<Vector4, 0x0123> xyzw;
+		Swizzle4<Vector4, 0x1123>     yyzw;
+		Swizzle4<Vector4, 0x2123>     zyzw;
+		Swizzle4<Vector4, 0x3123>     wyzw;
+		Swizzle4<Vector4, 0x0223>     xzzw;
+		Swizzle4<Vector4, 0x1223>     yzzw;
+		Swizzle4<Vector4, 0x2223>     zzzw;
+		Swizzle4<Vector4, 0x3223>     wzzw;
+		Swizzle4<Vector4, 0x0323>     xwzw;
+		Swizzle4<Vector4, 0x1323>     ywzw;
+		Swizzle4<Vector4, 0x2323>     zwzw;
+		Swizzle4<Vector4, 0x3323>     wwzw;
+		Swizzle4<Vector4, 0x0033>     xxww;
+		Swizzle4<Vector4, 0x1033>     yxww;
+		Swizzle4<Vector4, 0x2033>     zxww;
+		Swizzle4<Vector4, 0x3033>     wxww;
+		Swizzle4<Vector4, 0x0133>     xyww;
+		Swizzle4<Vector4, 0x1133>     yyww;
+		Swizzle4<Vector4, 0x2133>     zyww;
+		Swizzle4<Vector4, 0x3133>     wyww;
+		Swizzle4<Vector4, 0x0233>     xzww;
+		Swizzle4<Vector4, 0x1233>     yzww;
+		Swizzle4<Vector4, 0x2233>     zzww;
+		Swizzle4<Vector4, 0x3233>     wzww;
+		Swizzle4<Vector4, 0x0333>     xwww;
+		Swizzle4<Vector4, 0x1333>     ywww;
+		Swizzle4<Vector4, 0x2333>     zwww;
+		Swizzle4<Vector4, 0x3333>     wwww;
 	};
+};
 
-	template<class Vector4, int T>
-	class SwizzleMask4
-	{
-		friend XYZW<Vector4>;
+class Int4 : public LValue<Int4>, public XYZW<Int4>
+{
+public:
+	explicit Int4(RValue<Byte4> cast);
+	explicit Int4(RValue<SByte4> cast);
+	explicit Int4(RValue<Float4> cast);
+	explicit Int4(RValue<Short4> cast);
+	explicit Int4(RValue<UShort4> cast);
 
-	public:
-		operator RValue<Vector4>() const;
+	Int4();
+	Int4(int xyzw);
+	Int4(int x, int yzw);
+	Int4(int x, int y, int zw);
+	Int4(int x, int y, int z, int w);
+	Int4(RValue<Int4> rhs);
+	Int4(const Int4 &rhs);
+	Int4(const Reference<Int4> &rhs);
+	Int4(RValue<UInt4> rhs);
+	Int4(const UInt4 &rhs);
+	Int4(const Reference<UInt4> &rhs);
+	Int4(RValue<Int2> lo, RValue<Int2> hi);
+	Int4(RValue<Int> rhs);
+	Int4(const Int &rhs);
+	Int4(const Reference<Int> &rhs);
 
-		RValue<Vector4> operator=(RValue<Vector4> rhs);
-		RValue<Vector4> operator=(RValue<typename Scalar<Vector4>::Type> rhs);
+	RValue<Int4> operator=(RValue<Int4> rhs);
+	RValue<Int4> operator=(const Int4 &rhs);
+	RValue<Int4> operator=(const Reference<Int4> &rhs);
 
-	private:
-		Vector4 *parent;
-	};
+	static Type *getType();
 
-	template<>
-	struct Scalar<Float4>
-	{
-		using Type = Float;
-	};
+private:
+	void constant(int x, int y, int z, int w);
+};
 
-	template<>
-	struct Scalar<Int4>
-	{
-		using Type = Int;
-	};
-
-	template<>
-	struct Scalar<UInt4>
-	{
-		using Type = UInt;
-	};
-
-	template<class Vector4, int T>
-	class SwizzleMask1
-	{
-	public:
-		operator RValue<typename Scalar<Vector4>::Type>() const;
-		operator RValue<Vector4>() const;
-
-		RValue<Vector4> operator=(float x);
-		RValue<Vector4> operator=(RValue<Vector4> rhs);
-		RValue<Vector4> operator=(RValue<typename Scalar<Vector4>::Type> rhs);
-
-	private:
-		Vector4 *parent;
-	};
-
-	template<class Vector4, int T>
-	class SwizzleMask2
-	{
-		friend class Float4;
-
-	public:
-		operator RValue<Vector4>() const;
-
-		RValue<Vector4> operator=(RValue<Vector4> rhs);
-
-	private:
-		Float4 *parent;
-	};
-
-	template<class Vector4>
-	struct XYZW
-	{
-		friend Vector4;
-
-	private:
-		XYZW(Vector4 *parent)
-		{
-			xyzw.parent = parent;
-		}
-
-	public:
-		union
-		{
-			SwizzleMask1<Vector4, 0x0000> x;
-			SwizzleMask1<Vector4, 0x1111> y;
-			SwizzleMask1<Vector4, 0x2222> z;
-			SwizzleMask1<Vector4, 0x3333> w;
-			Swizzle2<Vector4, 0x0000>     xx;
-			Swizzle2<Vector4, 0x1000>     yx;
-			Swizzle2<Vector4, 0x2000>     zx;
-			Swizzle2<Vector4, 0x3000>     wx;
-			SwizzleMask2<Vector4, 0x0111> xy;
-			Swizzle2<Vector4, 0x1111>     yy;
-			Swizzle2<Vector4, 0x2111>     zy;
-			Swizzle2<Vector4, 0x3111>     wy;
-			SwizzleMask2<Vector4, 0x0222> xz;
-			SwizzleMask2<Vector4, 0x1222> yz;
-			Swizzle2<Vector4, 0x2222>     zz;
-			Swizzle2<Vector4, 0x3222>     wz;
-			SwizzleMask2<Vector4, 0x0333> xw;
-			SwizzleMask2<Vector4, 0x1333> yw;
-			SwizzleMask2<Vector4, 0x2333> zw;
-			Swizzle2<Vector4, 0x3333>     ww;
-			Swizzle4<Vector4, 0x0000>     xxx;
-			Swizzle4<Vector4, 0x1000>     yxx;
-			Swizzle4<Vector4, 0x2000>     zxx;
-			Swizzle4<Vector4, 0x3000>     wxx;
-			Swizzle4<Vector4, 0x0100>     xyx;
-			Swizzle4<Vector4, 0x1100>     yyx;
-			Swizzle4<Vector4, 0x2100>     zyx;
-			Swizzle4<Vector4, 0x3100>     wyx;
-			Swizzle4<Vector4, 0x0200>     xzx;
-			Swizzle4<Vector4, 0x1200>     yzx;
-			Swizzle4<Vector4, 0x2200>     zzx;
-			Swizzle4<Vector4, 0x3200>     wzx;
-			Swizzle4<Vector4, 0x0300>     xwx;
-			Swizzle4<Vector4, 0x1300>     ywx;
-			Swizzle4<Vector4, 0x2300>     zwx;
-			Swizzle4<Vector4, 0x3300>     wwx;
-			Swizzle4<Vector4, 0x0011>     xxy;
-			Swizzle4<Vector4, 0x1011>     yxy;
-			Swizzle4<Vector4, 0x2011>     zxy;
-			Swizzle4<Vector4, 0x3011>     wxy;
-			Swizzle4<Vector4, 0x0111>     xyy;
-			Swizzle4<Vector4, 0x1111>     yyy;
-			Swizzle4<Vector4, 0x2111>     zyy;
-			Swizzle4<Vector4, 0x3111>     wyy;
-			Swizzle4<Vector4, 0x0211>     xzy;
-			Swizzle4<Vector4, 0x1211>     yzy;
-			Swizzle4<Vector4, 0x2211>     zzy;
-			Swizzle4<Vector4, 0x3211>     wzy;
-			Swizzle4<Vector4, 0x0311>     xwy;
-			Swizzle4<Vector4, 0x1311>     ywy;
-			Swizzle4<Vector4, 0x2311>     zwy;
-			Swizzle4<Vector4, 0x3311>     wwy;
-			Swizzle4<Vector4, 0x0022>     xxz;
-			Swizzle4<Vector4, 0x1022>     yxz;
-			Swizzle4<Vector4, 0x2022>     zxz;
-			Swizzle4<Vector4, 0x3022>     wxz;
-			SwizzleMask4<Vector4, 0x0122> xyz;
-			Swizzle4<Vector4, 0x1122>     yyz;
-			Swizzle4<Vector4, 0x2122>     zyz;
-			Swizzle4<Vector4, 0x3122>     wyz;
-			Swizzle4<Vector4, 0x0222>     xzz;
-			Swizzle4<Vector4, 0x1222>     yzz;
-			Swizzle4<Vector4, 0x2222>     zzz;
-			Swizzle4<Vector4, 0x3222>     wzz;
-			Swizzle4<Vector4, 0x0322>     xwz;
-			Swizzle4<Vector4, 0x1322>     ywz;
-			Swizzle4<Vector4, 0x2322>     zwz;
-			Swizzle4<Vector4, 0x3322>     wwz;
-			Swizzle4<Vector4, 0x0033>     xxw;
-			Swizzle4<Vector4, 0x1033>     yxw;
-			Swizzle4<Vector4, 0x2033>     zxw;
-			Swizzle4<Vector4, 0x3033>     wxw;
-			SwizzleMask4<Vector4, 0x0133> xyw;
-			Swizzle4<Vector4, 0x1133>     yyw;
-			Swizzle4<Vector4, 0x2133>     zyw;
-			Swizzle4<Vector4, 0x3133>     wyw;
-			SwizzleMask4<Vector4, 0x0233> xzw;
-			SwizzleMask4<Vector4, 0x1233> yzw;
-			Swizzle4<Vector4, 0x2233>     zzw;
-			Swizzle4<Vector4, 0x3233>     wzw;
-			Swizzle4<Vector4, 0x0333>     xww;
-			Swizzle4<Vector4, 0x1333>     yww;
-			Swizzle4<Vector4, 0x2333>     zww;
-			Swizzle4<Vector4, 0x3333>     www;
-			Swizzle4<Vector4, 0x0000>     xxxx;
-			Swizzle4<Vector4, 0x1000>     yxxx;
-			Swizzle4<Vector4, 0x2000>     zxxx;
-			Swizzle4<Vector4, 0x3000>     wxxx;
-			Swizzle4<Vector4, 0x0100>     xyxx;
-			Swizzle4<Vector4, 0x1100>     yyxx;
-			Swizzle4<Vector4, 0x2100>     zyxx;
-			Swizzle4<Vector4, 0x3100>     wyxx;
-			Swizzle4<Vector4, 0x0200>     xzxx;
-			Swizzle4<Vector4, 0x1200>     yzxx;
-			Swizzle4<Vector4, 0x2200>     zzxx;
-			Swizzle4<Vector4, 0x3200>     wzxx;
-			Swizzle4<Vector4, 0x0300>     xwxx;
-			Swizzle4<Vector4, 0x1300>     ywxx;
-			Swizzle4<Vector4, 0x2300>     zwxx;
-			Swizzle4<Vector4, 0x3300>     wwxx;
-			Swizzle4<Vector4, 0x0010>     xxyx;
-			Swizzle4<Vector4, 0x1010>     yxyx;
-			Swizzle4<Vector4, 0x2010>     zxyx;
-			Swizzle4<Vector4, 0x3010>     wxyx;
-			Swizzle4<Vector4, 0x0110>     xyyx;
-			Swizzle4<Vector4, 0x1110>     yyyx;
-			Swizzle4<Vector4, 0x2110>     zyyx;
-			Swizzle4<Vector4, 0x3110>     wyyx;
-			Swizzle4<Vector4, 0x0210>     xzyx;
-			Swizzle4<Vector4, 0x1210>     yzyx;
-			Swizzle4<Vector4, 0x2210>     zzyx;
-			Swizzle4<Vector4, 0x3210>     wzyx;
-			Swizzle4<Vector4, 0x0310>     xwyx;
-			Swizzle4<Vector4, 0x1310>     ywyx;
-			Swizzle4<Vector4, 0x2310>     zwyx;
-			Swizzle4<Vector4, 0x3310>     wwyx;
-			Swizzle4<Vector4, 0x0020>     xxzx;
-			Swizzle4<Vector4, 0x1020>     yxzx;
-			Swizzle4<Vector4, 0x2020>     zxzx;
-			Swizzle4<Vector4, 0x3020>     wxzx;
-			Swizzle4<Vector4, 0x0120>     xyzx;
-			Swizzle4<Vector4, 0x1120>     yyzx;
-			Swizzle4<Vector4, 0x2120>     zyzx;
-			Swizzle4<Vector4, 0x3120>     wyzx;
-			Swizzle4<Vector4, 0x0220>     xzzx;
-			Swizzle4<Vector4, 0x1220>     yzzx;
-			Swizzle4<Vector4, 0x2220>     zzzx;
-			Swizzle4<Vector4, 0x3220>     wzzx;
-			Swizzle4<Vector4, 0x0320>     xwzx;
-			Swizzle4<Vector4, 0x1320>     ywzx;
-			Swizzle4<Vector4, 0x2320>     zwzx;
-			Swizzle4<Vector4, 0x3320>     wwzx;
-			Swizzle4<Vector4, 0x0030>     xxwx;
-			Swizzle4<Vector4, 0x1030>     yxwx;
-			Swizzle4<Vector4, 0x2030>     zxwx;
-			Swizzle4<Vector4, 0x3030>     wxwx;
-			Swizzle4<Vector4, 0x0130>     xywx;
-			Swizzle4<Vector4, 0x1130>     yywx;
-			Swizzle4<Vector4, 0x2130>     zywx;
-			Swizzle4<Vector4, 0x3130>     wywx;
-			Swizzle4<Vector4, 0x0230>     xzwx;
-			Swizzle4<Vector4, 0x1230>     yzwx;
-			Swizzle4<Vector4, 0x2230>     zzwx;
-			Swizzle4<Vector4, 0x3230>     wzwx;
-			Swizzle4<Vector4, 0x0330>     xwwx;
-			Swizzle4<Vector4, 0x1330>     ywwx;
-			Swizzle4<Vector4, 0x2330>     zwwx;
-			Swizzle4<Vector4, 0x3330>     wwwx;
-			Swizzle4<Vector4, 0x0001>     xxxy;
-			Swizzle4<Vector4, 0x1001>     yxxy;
-			Swizzle4<Vector4, 0x2001>     zxxy;
-			Swizzle4<Vector4, 0x3001>     wxxy;
-			Swizzle4<Vector4, 0x0101>     xyxy;
-			Swizzle4<Vector4, 0x1101>     yyxy;
-			Swizzle4<Vector4, 0x2101>     zyxy;
-			Swizzle4<Vector4, 0x3101>     wyxy;
-			Swizzle4<Vector4, 0x0201>     xzxy;
-			Swizzle4<Vector4, 0x1201>     yzxy;
-			Swizzle4<Vector4, 0x2201>     zzxy;
-			Swizzle4<Vector4, 0x3201>     wzxy;
-			Swizzle4<Vector4, 0x0301>     xwxy;
-			Swizzle4<Vector4, 0x1301>     ywxy;
-			Swizzle4<Vector4, 0x2301>     zwxy;
-			Swizzle4<Vector4, 0x3301>     wwxy;
-			Swizzle4<Vector4, 0x0011>     xxyy;
-			Swizzle4<Vector4, 0x1011>     yxyy;
-			Swizzle4<Vector4, 0x2011>     zxyy;
-			Swizzle4<Vector4, 0x3011>     wxyy;
-			Swizzle4<Vector4, 0x0111>     xyyy;
-			Swizzle4<Vector4, 0x1111>     yyyy;
-			Swizzle4<Vector4, 0x2111>     zyyy;
-			Swizzle4<Vector4, 0x3111>     wyyy;
-			Swizzle4<Vector4, 0x0211>     xzyy;
-			Swizzle4<Vector4, 0x1211>     yzyy;
-			Swizzle4<Vector4, 0x2211>     zzyy;
-			Swizzle4<Vector4, 0x3211>     wzyy;
-			Swizzle4<Vector4, 0x0311>     xwyy;
-			Swizzle4<Vector4, 0x1311>     ywyy;
-			Swizzle4<Vector4, 0x2311>     zwyy;
-			Swizzle4<Vector4, 0x3311>     wwyy;
-			Swizzle4<Vector4, 0x0021>     xxzy;
-			Swizzle4<Vector4, 0x1021>     yxzy;
-			Swizzle4<Vector4, 0x2021>     zxzy;
-			Swizzle4<Vector4, 0x3021>     wxzy;
-			Swizzle4<Vector4, 0x0121>     xyzy;
-			Swizzle4<Vector4, 0x1121>     yyzy;
-			Swizzle4<Vector4, 0x2121>     zyzy;
-			Swizzle4<Vector4, 0x3121>     wyzy;
-			Swizzle4<Vector4, 0x0221>     xzzy;
-			Swizzle4<Vector4, 0x1221>     yzzy;
-			Swizzle4<Vector4, 0x2221>     zzzy;
-			Swizzle4<Vector4, 0x3221>     wzzy;
-			Swizzle4<Vector4, 0x0321>     xwzy;
-			Swizzle4<Vector4, 0x1321>     ywzy;
-			Swizzle4<Vector4, 0x2321>     zwzy;
-			Swizzle4<Vector4, 0x3321>     wwzy;
-			Swizzle4<Vector4, 0x0031>     xxwy;
-			Swizzle4<Vector4, 0x1031>     yxwy;
-			Swizzle4<Vector4, 0x2031>     zxwy;
-			Swizzle4<Vector4, 0x3031>     wxwy;
-			Swizzle4<Vector4, 0x0131>     xywy;
-			Swizzle4<Vector4, 0x1131>     yywy;
-			Swizzle4<Vector4, 0x2131>     zywy;
-			Swizzle4<Vector4, 0x3131>     wywy;
-			Swizzle4<Vector4, 0x0231>     xzwy;
-			Swizzle4<Vector4, 0x1231>     yzwy;
-			Swizzle4<Vector4, 0x2231>     zzwy;
-			Swizzle4<Vector4, 0x3231>     wzwy;
-			Swizzle4<Vector4, 0x0331>     xwwy;
-			Swizzle4<Vector4, 0x1331>     ywwy;
-			Swizzle4<Vector4, 0x2331>     zwwy;
-			Swizzle4<Vector4, 0x3331>     wwwy;
-			Swizzle4<Vector4, 0x0002>     xxxz;
-			Swizzle4<Vector4, 0x1002>     yxxz;
-			Swizzle4<Vector4, 0x2002>     zxxz;
-			Swizzle4<Vector4, 0x3002>     wxxz;
-			Swizzle4<Vector4, 0x0102>     xyxz;
-			Swizzle4<Vector4, 0x1102>     yyxz;
-			Swizzle4<Vector4, 0x2102>     zyxz;
-			Swizzle4<Vector4, 0x3102>     wyxz;
-			Swizzle4<Vector4, 0x0202>     xzxz;
-			Swizzle4<Vector4, 0x1202>     yzxz;
-			Swizzle4<Vector4, 0x2202>     zzxz;
-			Swizzle4<Vector4, 0x3202>     wzxz;
-			Swizzle4<Vector4, 0x0302>     xwxz;
-			Swizzle4<Vector4, 0x1302>     ywxz;
-			Swizzle4<Vector4, 0x2302>     zwxz;
-			Swizzle4<Vector4, 0x3302>     wwxz;
-			Swizzle4<Vector4, 0x0012>     xxyz;
-			Swizzle4<Vector4, 0x1012>     yxyz;
-			Swizzle4<Vector4, 0x2012>     zxyz;
-			Swizzle4<Vector4, 0x3012>     wxyz;
-			Swizzle4<Vector4, 0x0112>     xyyz;
-			Swizzle4<Vector4, 0x1112>     yyyz;
-			Swizzle4<Vector4, 0x2112>     zyyz;
-			Swizzle4<Vector4, 0x3112>     wyyz;
-			Swizzle4<Vector4, 0x0212>     xzyz;
-			Swizzle4<Vector4, 0x1212>     yzyz;
-			Swizzle4<Vector4, 0x2212>     zzyz;
-			Swizzle4<Vector4, 0x3212>     wzyz;
-			Swizzle4<Vector4, 0x0312>     xwyz;
-			Swizzle4<Vector4, 0x1312>     ywyz;
-			Swizzle4<Vector4, 0x2312>     zwyz;
-			Swizzle4<Vector4, 0x3312>     wwyz;
-			Swizzle4<Vector4, 0x0022>     xxzz;
-			Swizzle4<Vector4, 0x1022>     yxzz;
-			Swizzle4<Vector4, 0x2022>     zxzz;
-			Swizzle4<Vector4, 0x3022>     wxzz;
-			Swizzle4<Vector4, 0x0122>     xyzz;
-			Swizzle4<Vector4, 0x1122>     yyzz;
-			Swizzle4<Vector4, 0x2122>     zyzz;
-			Swizzle4<Vector4, 0x3122>     wyzz;
-			Swizzle4<Vector4, 0x0222>     xzzz;
-			Swizzle4<Vector4, 0x1222>     yzzz;
-			Swizzle4<Vector4, 0x2222>     zzzz;
-			Swizzle4<Vector4, 0x3222>     wzzz;
-			Swizzle4<Vector4, 0x0322>     xwzz;
-			Swizzle4<Vector4, 0x1322>     ywzz;
-			Swizzle4<Vector4, 0x2322>     zwzz;
-			Swizzle4<Vector4, 0x3322>     wwzz;
-			Swizzle4<Vector4, 0x0032>     xxwz;
-			Swizzle4<Vector4, 0x1032>     yxwz;
-			Swizzle4<Vector4, 0x2032>     zxwz;
-			Swizzle4<Vector4, 0x3032>     wxwz;
-			Swizzle4<Vector4, 0x0132>     xywz;
-			Swizzle4<Vector4, 0x1132>     yywz;
-			Swizzle4<Vector4, 0x2132>     zywz;
-			Swizzle4<Vector4, 0x3132>     wywz;
-			Swizzle4<Vector4, 0x0232>     xzwz;
-			Swizzle4<Vector4, 0x1232>     yzwz;
-			Swizzle4<Vector4, 0x2232>     zzwz;
-			Swizzle4<Vector4, 0x3232>     wzwz;
-			Swizzle4<Vector4, 0x0332>     xwwz;
-			Swizzle4<Vector4, 0x1332>     ywwz;
-			Swizzle4<Vector4, 0x2332>     zwwz;
-			Swizzle4<Vector4, 0x3332>     wwwz;
-			Swizzle4<Vector4, 0x0003>     xxxw;
-			Swizzle4<Vector4, 0x1003>     yxxw;
-			Swizzle4<Vector4, 0x2003>     zxxw;
-			Swizzle4<Vector4, 0x3003>     wxxw;
-			Swizzle4<Vector4, 0x0103>     xyxw;
-			Swizzle4<Vector4, 0x1103>     yyxw;
-			Swizzle4<Vector4, 0x2103>     zyxw;
-			Swizzle4<Vector4, 0x3103>     wyxw;
-			Swizzle4<Vector4, 0x0203>     xzxw;
-			Swizzle4<Vector4, 0x1203>     yzxw;
-			Swizzle4<Vector4, 0x2203>     zzxw;
-			Swizzle4<Vector4, 0x3203>     wzxw;
-			Swizzle4<Vector4, 0x0303>     xwxw;
-			Swizzle4<Vector4, 0x1303>     ywxw;
-			Swizzle4<Vector4, 0x2303>     zwxw;
-			Swizzle4<Vector4, 0x3303>     wwxw;
-			Swizzle4<Vector4, 0x0013>     xxyw;
-			Swizzle4<Vector4, 0x1013>     yxyw;
-			Swizzle4<Vector4, 0x2013>     zxyw;
-			Swizzle4<Vector4, 0x3013>     wxyw;
-			Swizzle4<Vector4, 0x0113>     xyyw;
-			Swizzle4<Vector4, 0x1113>     yyyw;
-			Swizzle4<Vector4, 0x2113>     zyyw;
-			Swizzle4<Vector4, 0x3113>     wyyw;
-			Swizzle4<Vector4, 0x0213>     xzyw;
-			Swizzle4<Vector4, 0x1213>     yzyw;
-			Swizzle4<Vector4, 0x2213>     zzyw;
-			Swizzle4<Vector4, 0x3213>     wzyw;
-			Swizzle4<Vector4, 0x0313>     xwyw;
-			Swizzle4<Vector4, 0x1313>     ywyw;
-			Swizzle4<Vector4, 0x2313>     zwyw;
-			Swizzle4<Vector4, 0x3313>     wwyw;
-			Swizzle4<Vector4, 0x0023>     xxzw;
-			Swizzle4<Vector4, 0x1023>     yxzw;
-			Swizzle4<Vector4, 0x2023>     zxzw;
-			Swizzle4<Vector4, 0x3023>     wxzw;
-			SwizzleMask4<Vector4, 0x0123> xyzw;
-			Swizzle4<Vector4, 0x1123>     yyzw;
-			Swizzle4<Vector4, 0x2123>     zyzw;
-			Swizzle4<Vector4, 0x3123>     wyzw;
-			Swizzle4<Vector4, 0x0223>     xzzw;
-			Swizzle4<Vector4, 0x1223>     yzzw;
-			Swizzle4<Vector4, 0x2223>     zzzw;
-			Swizzle4<Vector4, 0x3223>     wzzw;
-			Swizzle4<Vector4, 0x0323>     xwzw;
-			Swizzle4<Vector4, 0x1323>     ywzw;
-			Swizzle4<Vector4, 0x2323>     zwzw;
-			Swizzle4<Vector4, 0x3323>     wwzw;
-			Swizzle4<Vector4, 0x0033>     xxww;
-			Swizzle4<Vector4, 0x1033>     yxww;
-			Swizzle4<Vector4, 0x2033>     zxww;
-			Swizzle4<Vector4, 0x3033>     wxww;
-			Swizzle4<Vector4, 0x0133>     xyww;
-			Swizzle4<Vector4, 0x1133>     yyww;
-			Swizzle4<Vector4, 0x2133>     zyww;
-			Swizzle4<Vector4, 0x3133>     wyww;
-			Swizzle4<Vector4, 0x0233>     xzww;
-			Swizzle4<Vector4, 0x1233>     yzww;
-			Swizzle4<Vector4, 0x2233>     zzww;
-			Swizzle4<Vector4, 0x3233>     wzww;
-			Swizzle4<Vector4, 0x0333>     xwww;
-			Swizzle4<Vector4, 0x1333>     ywww;
-			Swizzle4<Vector4, 0x2333>     zwww;
-			Swizzle4<Vector4, 0x3333>     wwww;
-		};
-	};
-
-	class Int4 : public LValue<Int4>, public XYZW<Int4>
-	{
-	public:
-		explicit Int4(RValue<Byte4> cast);
-		explicit Int4(RValue<SByte4> cast);
-		explicit Int4(RValue<Float4> cast);
-		explicit Int4(RValue<Short4> cast);
-		explicit Int4(RValue<UShort4> cast);
-
-		Int4();
-		Int4(int xyzw);
-		Int4(int x, int yzw);
-		Int4(int x, int y, int zw);
-		Int4(int x, int y, int z, int w);
-		Int4(RValue<Int4> rhs);
-		Int4(const Int4 &rhs);
-		Int4(const Reference<Int4> &rhs);
-		Int4(RValue<UInt4> rhs);
-		Int4(const UInt4 &rhs);
-		Int4(const Reference<UInt4> &rhs);
-		Int4(RValue<Int2> lo, RValue<Int2> hi);
-		Int4(RValue<Int> rhs);
-		Int4(const Int &rhs);
-		Int4(const Reference<Int> &rhs);
-
-		RValue<Int4> operator=(RValue<Int4> rhs);
-		RValue<Int4> operator=(const Int4 &rhs);
-		RValue<Int4> operator=(const Reference<Int4> &rhs);
-
-		static Type *getType();
-
-	private:
-		void constant(int x, int y, int z, int w);
-	};
-
-	RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator-(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator*(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator/(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator%(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator&(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator|(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator^(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs);
-	RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs);
-	RValue<Int4> operator<<(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator>>(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator+=(Int4 &lhs, RValue<Int4> rhs);
-	RValue<Int4> operator-=(Int4 &lhs, RValue<Int4> rhs);
-	RValue<Int4> operator*=(Int4 &lhs, RValue<Int4> rhs);
+RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator-(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator*(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator/(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator%(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator&(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator|(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator^(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs);
+RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs);
+RValue<Int4> operator<<(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator>>(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator+=(Int4 &lhs, RValue<Int4> rhs);
+RValue<Int4> operator-=(Int4 &lhs, RValue<Int4> rhs);
+RValue<Int4> operator*=(Int4 &lhs, RValue<Int4> rhs);
 //	RValue<Int4> operator/=(Int4 &lhs, RValue<Int4> rhs);
 //	RValue<Int4> operator%=(Int4 &lhs, RValue<Int4> rhs);
-	RValue<Int4> operator&=(Int4 &lhs, RValue<Int4> rhs);
-	RValue<Int4> operator|=(Int4 &lhs, RValue<Int4> rhs);
-	RValue<Int4> operator^=(Int4 &lhs, RValue<Int4> rhs);
-	RValue<Int4> operator<<=(Int4 &lhs, unsigned char rhs);
-	RValue<Int4> operator>>=(Int4 &lhs, unsigned char rhs);
-	RValue<Int4> operator+(RValue<Int4> val);
-	RValue<Int4> operator-(RValue<Int4> val);
-	RValue<Int4> operator~(RValue<Int4> val);
+RValue<Int4> operator&=(Int4 &lhs, RValue<Int4> rhs);
+RValue<Int4> operator|=(Int4 &lhs, RValue<Int4> rhs);
+RValue<Int4> operator^=(Int4 &lhs, RValue<Int4> rhs);
+RValue<Int4> operator<<=(Int4 &lhs, unsigned char rhs);
+RValue<Int4> operator>>=(Int4 &lhs, unsigned char rhs);
+RValue<Int4> operator+(RValue<Int4> val);
+RValue<Int4> operator-(RValue<Int4> val);
+RValue<Int4> operator~(RValue<Int4> val);
 //	RValue<Int4> operator++(Int4 &val, int);   // Post-increment
 //	const Int4 &operator++(Int4 &val);   // Pre-increment
 //	RValue<Int4> operator--(Int4 &val, int);   // Post-decrement
@@ -1919,92 +1919,92 @@
 //	RValue<Bool> operator!=(RValue<Int4> lhs, RValue<Int4> rhs);
 //	RValue<Bool> operator==(RValue<Int4> lhs, RValue<Int4> rhs);
 
-	inline RValue<Int4> operator+(RValue<Int> lhs, RValue<Int4> rhs)
-	{
-		return Int4(lhs) + rhs;
-	}
+inline RValue<Int4> operator+(RValue<Int> lhs, RValue<Int4> rhs)
+{
+	return Int4(lhs) + rhs;
+}
 
-	inline RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int> rhs)
-	{
-		return lhs + Int4(rhs);
-	}
+inline RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int> rhs)
+{
+	return lhs + Int4(rhs);
+}
 
-	RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y);
-	RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y);
-	RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y);
-	RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y);
-	RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y);
-	RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y);
-	inline RValue<Int4> CmpGT(RValue<Int4> x, RValue<Int4> y) { return CmpNLE(x, y); }
-	inline RValue<Int4> CmpGE(RValue<Int4> x, RValue<Int4> y) { return CmpNLT(x, y); }
-	RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y);
-	RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y);
-	RValue<Int4> RoundInt(RValue<Float4> cast);
-	RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y);
-	RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y);
-	RValue<Int> Extract(RValue<Int4> val, int i);
-	RValue<Int4> Insert(RValue<Int4> val, RValue<Int> element, int i);
-	RValue<Int> SignMask(RValue<Int4> x);
-	RValue<Int4> Swizzle(RValue<Int4> x, uint16_t select);
-	RValue<Int4> Shuffle(RValue<Int4> x, RValue<Int4> y, uint16_t select);
-	RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y);
+inline RValue<Int4> CmpGT(RValue<Int4> x, RValue<Int4> y) { return CmpNLE(x, y); }
+inline RValue<Int4> CmpGE(RValue<Int4> x, RValue<Int4> y) { return CmpNLT(x, y); }
+RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> RoundInt(RValue<Float4> cast);
+RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y);
+RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y);
+RValue<Int> Extract(RValue<Int4> val, int i);
+RValue<Int4> Insert(RValue<Int4> val, RValue<Int> element, int i);
+RValue<Int> SignMask(RValue<Int4> x);
+RValue<Int4> Swizzle(RValue<Int4> x, uint16_t select);
+RValue<Int4> Shuffle(RValue<Int4> x, RValue<Int4> y, uint16_t select);
+RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y);
 
-	class UInt4 : public LValue<UInt4>, public XYZW<UInt4>
-	{
-	public:
-		explicit UInt4(RValue<Float4> cast);
+class UInt4 : public LValue<UInt4>, public XYZW<UInt4>
+{
+public:
+	explicit UInt4(RValue<Float4> cast);
 
-		UInt4();
-		UInt4(int xyzw);
-		UInt4(int x, int yzw);
-		UInt4(int x, int y, int zw);
-		UInt4(int x, int y, int z, int w);
-		UInt4(RValue<UInt4> rhs);
-		UInt4(const UInt4 &rhs);
-		UInt4(const Reference<UInt4> &rhs);
-		UInt4(RValue<Int4> rhs);
-		UInt4(const Int4 &rhs);
-		UInt4(const Reference<Int4> &rhs);
-		UInt4(RValue<UInt2> lo, RValue<UInt2> hi);
-		UInt4(RValue<UInt> rhs);
-		UInt4(const UInt &rhs);
-		UInt4(const Reference<UInt> &rhs);
+	UInt4();
+	UInt4(int xyzw);
+	UInt4(int x, int yzw);
+	UInt4(int x, int y, int zw);
+	UInt4(int x, int y, int z, int w);
+	UInt4(RValue<UInt4> rhs);
+	UInt4(const UInt4 &rhs);
+	UInt4(const Reference<UInt4> &rhs);
+	UInt4(RValue<Int4> rhs);
+	UInt4(const Int4 &rhs);
+	UInt4(const Reference<Int4> &rhs);
+	UInt4(RValue<UInt2> lo, RValue<UInt2> hi);
+	UInt4(RValue<UInt> rhs);
+	UInt4(const UInt &rhs);
+	UInt4(const Reference<UInt> &rhs);
 
-		RValue<UInt4> operator=(RValue<UInt4> rhs);
-		RValue<UInt4> operator=(const UInt4 &rhs);
-		RValue<UInt4> operator=(const Reference<UInt4> &rhs);
+	RValue<UInt4> operator=(RValue<UInt4> rhs);
+	RValue<UInt4> operator=(const UInt4 &rhs);
+	RValue<UInt4> operator=(const Reference<UInt4> &rhs);
 
-		static Type *getType();
+	static Type *getType();
 
-	private:
-		void constant(int x, int y, int z, int w);
-	};
+private:
+	void constant(int x, int y, int z, int w);
+};
 
-	RValue<UInt4> operator+(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator-(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator*(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator/(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator%(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator&(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator|(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator^(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs);
-	RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs);
-	RValue<UInt4> operator<<(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator>>(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator+=(UInt4 &lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator-=(UInt4 &lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator*=(UInt4 &lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator+(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator-(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator*(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator/(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator%(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator&(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator|(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator^(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs);
+RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs);
+RValue<UInt4> operator<<(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator>>(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator+=(UInt4 &lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator-=(UInt4 &lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator*=(UInt4 &lhs, RValue<UInt4> rhs);
 //	RValue<UInt4> operator/=(UInt4 &lhs, RValue<UInt4> rhs);
 //	RValue<UInt4> operator%=(UInt4 &lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator&=(UInt4 &lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator|=(UInt4 &lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator^=(UInt4 &lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator<<=(UInt4 &lhs, unsigned char rhs);
-	RValue<UInt4> operator>>=(UInt4 &lhs, unsigned char rhs);
-	RValue<UInt4> operator+(RValue<UInt4> val);
-	RValue<UInt4> operator-(RValue<UInt4> val);
-	RValue<UInt4> operator~(RValue<UInt4> val);
+RValue<UInt4> operator&=(UInt4 &lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator|=(UInt4 &lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator^=(UInt4 &lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator<<=(UInt4 &lhs, unsigned char rhs);
+RValue<UInt4> operator>>=(UInt4 &lhs, unsigned char rhs);
+RValue<UInt4> operator+(RValue<UInt4> val);
+RValue<UInt4> operator-(RValue<UInt4> val);
+RValue<UInt4> operator~(RValue<UInt4> val);
 //	RValue<UInt4> operator++(UInt4 &val, int);   // Post-increment
 //	const UInt4 &operator++(UInt4 &val);   // Pre-increment
 //	RValue<UInt4> operator--(UInt4 &val, int);   // Post-decrement
@@ -2016,93 +2016,93 @@
 //	RValue<Bool> operator!=(RValue<UInt4> lhs, RValue<UInt4> rhs);
 //	RValue<Bool> operator==(RValue<UInt4> lhs, RValue<UInt4> rhs);
 
-	RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y);
-	RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y);
-	RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y);
-	RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y);
-	RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y);
-	RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y);
-	inline RValue<UInt4> CmpGT(RValue<UInt4> x, RValue<UInt4> y) { return CmpNLE(x, y); }
-	inline RValue<UInt4> CmpGE(RValue<UInt4> x, RValue<UInt4> y) { return CmpNLT(x, y); }
-	RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y);
-	RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y);
-	RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y);
-	RValue<UInt> Extract(RValue<UInt4> val, int i);
-	RValue<UInt4> Insert(RValue<UInt4> val, RValue<UInt> element, int i);
+RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y);
+inline RValue<UInt4> CmpGT(RValue<UInt4> x, RValue<UInt4> y) { return CmpNLE(x, y); }
+inline RValue<UInt4> CmpGE(RValue<UInt4> x, RValue<UInt4> y) { return CmpNLT(x, y); }
+RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt> Extract(RValue<UInt4> val, int i);
+RValue<UInt4> Insert(RValue<UInt4> val, RValue<UInt> element, int i);
 //	RValue<UInt4> RoundInt(RValue<Float4> cast);
-	RValue<UInt4> Swizzle(RValue<UInt4> x, uint16_t select);
-	RValue<UInt4> Shuffle(RValue<UInt4> x, RValue<UInt4> y, uint16_t select);
+RValue<UInt4> Swizzle(RValue<UInt4> x, uint16_t select);
+RValue<UInt4> Shuffle(RValue<UInt4> x, RValue<UInt4> y, uint16_t select);
 
-	class Half : public LValue<Half>
-	{
-	public:
-		explicit Half(RValue<Float> cast);
+class Half : public LValue<Half>
+{
+public:
+	explicit Half(RValue<Float> cast);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	class Float : public LValue<Float>
-	{
-	public:
-		explicit Float(RValue<Int> cast);
-		explicit Float(RValue<UInt> cast);
-		explicit Float(RValue<Half> cast);
+class Float : public LValue<Float>
+{
+public:
+	explicit Float(RValue<Int> cast);
+	explicit Float(RValue<UInt> cast);
+	explicit Float(RValue<Half> cast);
 
-		Float() = default;
-		Float(float x);
-		Float(RValue<Float> rhs);
-		Float(const Float &rhs);
-		Float(const Reference<Float> &rhs);
-		Float(Argument<Float> argument);
+	Float() = default;
+	Float(float x);
+	Float(RValue<Float> rhs);
+	Float(const Float &rhs);
+	Float(const Reference<Float> &rhs);
+	Float(Argument<Float> argument);
 
-		template<int T>
-		Float(const SwizzleMask1<Float4, T> &rhs);
+	template<int T>
+	Float(const SwizzleMask1<Float4, T> &rhs);
 
-	//	RValue<Float> operator=(float rhs);   // FIXME: Implement
-		RValue<Float> operator=(RValue<Float> rhs);
-		RValue<Float> operator=(const Float &rhs);
-		RValue<Float> operator=(const Reference<Float> &rhs);
+//	RValue<Float> operator=(float rhs);   // FIXME: Implement
+	RValue<Float> operator=(RValue<Float> rhs);
+	RValue<Float> operator=(const Float &rhs);
+	RValue<Float> operator=(const Reference<Float> &rhs);
 
-		template<int T>
-		RValue<Float> operator=(const SwizzleMask1<Float4, T> &rhs);
+	template<int T>
+	RValue<Float> operator=(const SwizzleMask1<Float4, T> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Float> operator+(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Float> operator-(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Float> operator*(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Float> operator/(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Float> operator+=(Float &lhs, RValue<Float> rhs);
-	RValue<Float> operator-=(Float &lhs, RValue<Float> rhs);
-	RValue<Float> operator*=(Float &lhs, RValue<Float> rhs);
-	RValue<Float> operator/=(Float &lhs, RValue<Float> rhs);
-	RValue<Float> operator+(RValue<Float> val);
-	RValue<Float> operator-(RValue<Float> val);
-	RValue<Bool> operator<(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Bool> operator<=(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Bool> operator>(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Bool> operator>=(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Bool> operator!=(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Bool> operator==(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Float> operator+(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Float> operator-(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Float> operator*(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Float> operator/(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Float> operator+=(Float &lhs, RValue<Float> rhs);
+RValue<Float> operator-=(Float &lhs, RValue<Float> rhs);
+RValue<Float> operator*=(Float &lhs, RValue<Float> rhs);
+RValue<Float> operator/=(Float &lhs, RValue<Float> rhs);
+RValue<Float> operator+(RValue<Float> val);
+RValue<Float> operator-(RValue<Float> val);
+RValue<Bool> operator<(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Bool> operator<=(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Bool> operator>(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Bool> operator>=(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Bool> operator!=(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Bool> operator==(RValue<Float> lhs, RValue<Float> rhs);
 
-	RValue<Float> Abs(RValue<Float> x);
-	RValue<Float> Max(RValue<Float> x, RValue<Float> y);
-	RValue<Float> Min(RValue<Float> x, RValue<Float> y);
-	RValue<Float> Rcp_pp(RValue<Float> val, bool exactAtPow2 = false);
-	RValue<Float> RcpSqrt_pp(RValue<Float> val);
-	RValue<Float> Sqrt(RValue<Float> x);
+RValue<Float> Abs(RValue<Float> x);
+RValue<Float> Max(RValue<Float> x, RValue<Float> y);
+RValue<Float> Min(RValue<Float> x, RValue<Float> y);
+RValue<Float> Rcp_pp(RValue<Float> val, bool exactAtPow2 = false);
+RValue<Float> RcpSqrt_pp(RValue<Float> val);
+RValue<Float> Sqrt(RValue<Float> x);
 
 //	RValue<Int4> IsInf(RValue<Float> x);
 //	RValue<Int4> IsNan(RValue<Float> x);
-	RValue<Float> Round(RValue<Float> x);
-	RValue<Float> Trunc(RValue<Float> x);
-	RValue<Float> Frac(RValue<Float> x);
-	RValue<Float> Floor(RValue<Float> x);
-	RValue<Float> Ceil(RValue<Float> x);
+RValue<Float> Round(RValue<Float> x);
+RValue<Float> Trunc(RValue<Float> x);
+RValue<Float> Frac(RValue<Float> x);
+RValue<Float> Floor(RValue<Float> x);
+RValue<Float> Ceil(RValue<Float> x);
 
-	// Trigonometric functions
-	// TODO: Currently unimplemented for Subzero.
+// Trigonometric functions
+// TODO: Currently unimplemented for Subzero.
 //	RValue<Float> Sin(RValue<Float> x);
 //	RValue<Float> Cos(RValue<Float> x);
 //	RValue<Float> Tan(RValue<Float> x);
@@ -2117,49 +2117,49 @@
 //	RValue<Float> Atanh(RValue<Float> x);
 //	RValue<Float> Atan2(RValue<Float> x, RValue<Float> y);
 
-	// Exponential functions
-	// TODO: Currently unimplemented for Subzero.
+// Exponential functions
+// TODO: Currently unimplemented for Subzero.
 //	RValue<Float> Pow(RValue<Float> x, RValue<Float> y);
 //	RValue<Float> Exp(RValue<Float> x);
 //	RValue<Float> Log(RValue<Float> x);
-	RValue<Float> Exp2(RValue<Float> x);
-	RValue<Float> Log2(RValue<Float> x);
+RValue<Float> Exp2(RValue<Float> x);
+RValue<Float> Log2(RValue<Float> x);
 
-	class Float2 : public LValue<Float2>
-	{
-	public:
-	//	explicit Float2(RValue<Byte2> cast);
-	//	explicit Float2(RValue<Short2> cast);
-	//	explicit Float2(RValue<UShort2> cast);
-	//	explicit Float2(RValue<Int2> cast);
-	//	explicit Float2(RValue<UInt2> cast);
-		explicit Float2(RValue<Float4> cast);
+class Float2 : public LValue<Float2>
+{
+public:
+//	explicit Float2(RValue<Byte2> cast);
+//	explicit Float2(RValue<Short2> cast);
+//	explicit Float2(RValue<UShort2> cast);
+//	explicit Float2(RValue<Int2> cast);
+//	explicit Float2(RValue<UInt2> cast);
+	explicit Float2(RValue<Float4> cast);
 
-		Float2() = default;
-	//	Float2(float x, float y);
-	//	Float2(RValue<Float2> rhs);
-	//	Float2(const Float2 &rhs);
-	//	Float2(const Reference<Float2> &rhs);
-	//	Float2(RValue<Float> rhs);
-	//	Float2(const Float &rhs);
-	//	Float2(const Reference<Float> &rhs);
+	Float2() = default;
+//	Float2(float x, float y);
+//	Float2(RValue<Float2> rhs);
+//	Float2(const Float2 &rhs);
+//	Float2(const Reference<Float2> &rhs);
+//	Float2(RValue<Float> rhs);
+//	Float2(const Float &rhs);
+//	Float2(const Reference<Float> &rhs);
 
-	//	template<int T>
-	//	Float2(const SwizzleMask1<T> &rhs);
+//	template<int T>
+//	Float2(const SwizzleMask1<T> &rhs);
 
-	//	RValue<Float2> operator=(float replicate);
-	//	RValue<Float2> operator=(RValue<Float2> rhs);
-	//	RValue<Float2> operator=(const Float2 &rhs);
-	//	RValue<Float2> operator=(const Reference<Float2> &rhs);
-	//	RValue<Float2> operator=(RValue<Float> rhs);
-	//	RValue<Float2> operator=(const Float &rhs);
-	//	RValue<Float2> operator=(const Reference<Float> &rhs);
+//	RValue<Float2> operator=(float replicate);
+//	RValue<Float2> operator=(RValue<Float2> rhs);
+//	RValue<Float2> operator=(const Float2 &rhs);
+//	RValue<Float2> operator=(const Reference<Float2> &rhs);
+//	RValue<Float2> operator=(RValue<Float> rhs);
+//	RValue<Float2> operator=(const Float &rhs);
+//	RValue<Float2> operator=(const Reference<Float> &rhs);
 
-	//	template<int T>
-	//	RValue<Float2> operator=(const SwizzleMask1<T> &rhs);
+//	template<int T>
+//	RValue<Float2> operator=(const SwizzleMask1<T> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
 //	RValue<Float2> operator+(RValue<Float2> lhs, RValue<Float2> rhs);
 //	RValue<Float2> operator-(RValue<Float2> lhs, RValue<Float2> rhs);
@@ -2180,868 +2180,871 @@
 //	RValue<Float2> Swizzle(RValue<Float2> x, uint16_t select);
 //	RValue<Float2> Mask(Float2 &lhs, RValue<Float2> rhs, uint16_t select);
 
-	class Float4 : public LValue<Float4>, public XYZW<Float4>
+class Float4 : public LValue<Float4>, public XYZW<Float4>
+{
+public:
+	explicit Float4(RValue<Byte4> cast);
+	explicit Float4(RValue<SByte4> cast);
+	explicit Float4(RValue<Short4> cast);
+	explicit Float4(RValue<UShort4> cast);
+	explicit Float4(RValue<Int4> cast);
+	explicit Float4(RValue<UInt4> cast);
+
+	Float4();
+	Float4(float xyzw);
+	Float4(float x, float yzw);
+	Float4(float x, float y, float zw);
+	Float4(float x, float y, float z, float w);
+	Float4(RValue<Float4> rhs);
+	Float4(const Float4 &rhs);
+	Float4(const Reference<Float4> &rhs);
+	Float4(RValue<Float> rhs);
+	Float4(const Float &rhs);
+	Float4(const Reference<Float> &rhs);
+
+	template<int T>
+	Float4(const SwizzleMask1<Float4, T> &rhs);
+	template<int T>
+	Float4(const Swizzle4<Float4, T> &rhs);
+	template<int X, int Y>
+	Float4(const Swizzle2<Float4, X> &x, const Swizzle2<Float4, Y> &y);
+	template<int X, int Y>
+	Float4(const SwizzleMask2<Float4, X> &x, const Swizzle2<Float4, Y> &y);
+	template<int X, int Y>
+	Float4(const Swizzle2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y);
+	template<int X, int Y>
+	Float4(const SwizzleMask2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y);
+
+	RValue<Float4> operator=(float replicate);
+	RValue<Float4> operator=(RValue<Float4> rhs);
+	RValue<Float4> operator=(const Float4 &rhs);
+	RValue<Float4> operator=(const Reference<Float4> &rhs);
+	RValue<Float4> operator=(RValue<Float> rhs);
+	RValue<Float4> operator=(const Float &rhs);
+	RValue<Float4> operator=(const Reference<Float> &rhs);
+
+	template<int T>
+	RValue<Float4> operator=(const SwizzleMask1<Float4, T> &rhs);
+	template<int T>
+	RValue<Float4> operator=(const Swizzle4<Float4, T> &rhs);
+
+	static Type *getType();
+	static Float4 negative_inf();
+	static Float4 positive_inf();
+private:
+	void constant(float x, float y, float z, float w);
+	void infinity_constant(bool negative);
+};
+
+RValue<Float4> operator+(RValue<Float4> lhs, RValue<Float4> rhs);
+RValue<Float4> operator-(RValue<Float4> lhs, RValue<Float4> rhs);
+RValue<Float4> operator*(RValue<Float4> lhs, RValue<Float4> rhs);
+RValue<Float4> operator/(RValue<Float4> lhs, RValue<Float4> rhs);
+RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs);
+RValue<Float4> operator+=(Float4 &lhs, RValue<Float4> rhs);
+RValue<Float4> operator-=(Float4 &lhs, RValue<Float4> rhs);
+RValue<Float4> operator*=(Float4 &lhs, RValue<Float4> rhs);
+RValue<Float4> operator/=(Float4 &lhs, RValue<Float4> rhs);
+RValue<Float4> operator%=(Float4 &lhs, RValue<Float4> rhs);
+RValue<Float4> operator+(RValue<Float4> val);
+RValue<Float4> operator-(RValue<Float4> val);
+
+RValue<Float4> Abs(RValue<Float4> x);
+RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y);
+RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y);
+RValue<Float4> Rcp_pp(RValue<Float4> val, bool exactAtPow2 = false);
+RValue<Float4> RcpSqrt_pp(RValue<Float4> val);
+RValue<Float4> Sqrt(RValue<Float4> x);
+RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i);
+RValue<Float> Extract(RValue<Float4> x, int i);
+RValue<Float4> Swizzle(RValue<Float4> x, uint16_t select);
+RValue<Float4> Shuffle(RValue<Float4> x, RValue<Float4> y, uint16_t select);
+RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, uint16_t imm);
+RValue<Float4> UnpackLow(RValue<Float4> x, RValue<Float4> y);
+RValue<Float4> UnpackHigh(RValue<Float4> x, RValue<Float4> y);
+RValue<Float4> Mask(Float4 &lhs, RValue<Float4> rhs, uint16_t select);
+RValue<Int> SignMask(RValue<Float4> x);
+
+// Ordered comparison functions
+RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y);
+inline RValue<Int4> CmpGT(RValue<Float4> x, RValue<Float4> y) { return CmpNLE(x, y); }
+inline RValue<Int4> CmpGE(RValue<Float4> x, RValue<Float4> y) { return CmpNLT(x, y); }
+
+// Unordered comparison functions
+RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y);
+inline RValue<Int4> CmpUGT(RValue<Float4> x, RValue<Float4> y) { return CmpUNLE(x, y); }
+inline RValue<Int4> CmpUGE(RValue<Float4> x, RValue<Float4> y) { return CmpUNLT(x, y); }
+
+RValue<Int4> IsInf(RValue<Float4> x);
+RValue<Int4> IsNan(RValue<Float4> x);
+RValue<Float4> Round(RValue<Float4> x);
+RValue<Float4> Trunc(RValue<Float4> x);
+RValue<Float4> Frac(RValue<Float4> x);
+RValue<Float4> Floor(RValue<Float4> x);
+RValue<Float4> Ceil(RValue<Float4> x);
+
+// Trigonometric functions
+// TODO: Currently unimplemented for Subzero.
+RValue<Float4> Sin(RValue<Float4> x);
+RValue<Float4> Cos(RValue<Float4> x);
+RValue<Float4> Tan(RValue<Float4> x);
+RValue<Float4> Asin(RValue<Float4> x);
+RValue<Float4> Acos(RValue<Float4> x);
+RValue<Float4> Atan(RValue<Float4> x);
+RValue<Float4> Sinh(RValue<Float4> x);
+RValue<Float4> Cosh(RValue<Float4> x);
+RValue<Float4> Tanh(RValue<Float4> x);
+RValue<Float4> Asinh(RValue<Float4> x);
+RValue<Float4> Acosh(RValue<Float4> x);
+RValue<Float4> Atanh(RValue<Float4> x);
+RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y);
+
+// Exponential functions
+// TODO: Currently unimplemented for Subzero.
+RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
+RValue<Float4> Exp(RValue<Float4> x);
+RValue<Float4> Log(RValue<Float4> x);
+RValue<Float4> Exp2(RValue<Float4> x);
+RValue<Float4> Log2(RValue<Float4> x);
+
+// Bit Manipulation functions.
+// TODO: Currently unimplemented for Subzero.
+
+// Count leading zeros.
+// Returns 32 when: !isZeroUndef && x == 0.
+// Returns an undefined value when: isZeroUndef && x == 0.
+RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef);
+RValue<UInt4> Ctlz(RValue<UInt4> x, bool isZeroUndef);
+
+// Count trailing zeros.
+// Returns 32 when: !isZeroUndef && x == 0.
+// Returns an undefined value when: isZeroUndef && x == 0.
+RValue<UInt> Cttz(RValue<UInt> x, bool isZeroUndef);
+RValue<UInt4> Cttz(RValue<UInt4> x, bool isZeroUndef);
+
+template<class T>
+class Pointer : public LValue<Pointer<T>>
+{
+public:
+	template<class S>
+	Pointer(RValue<Pointer<S>> pointerS, int alignment = 1) : alignment(alignment)
 	{
-	public:
-		explicit Float4(RValue<Byte4> cast);
-		explicit Float4(RValue<SByte4> cast);
-		explicit Float4(RValue<Short4> cast);
-		explicit Float4(RValue<UShort4> cast);
-		explicit Float4(RValue<Int4> cast);
-		explicit Float4(RValue<UInt4> cast);
-
-		Float4();
-		Float4(float xyzw);
-		Float4(float x, float yzw);
-		Float4(float x, float y, float zw);
-		Float4(float x, float y, float z, float w);
-		Float4(RValue<Float4> rhs);
-		Float4(const Float4 &rhs);
-		Float4(const Reference<Float4> &rhs);
-		Float4(RValue<Float> rhs);
-		Float4(const Float &rhs);
-		Float4(const Reference<Float> &rhs);
-
-		template<int T>
-		Float4(const SwizzleMask1<Float4, T> &rhs);
-		template<int T>
-		Float4(const Swizzle4<Float4, T> &rhs);
-		template<int X, int Y>
-		Float4(const Swizzle2<Float4, X> &x, const Swizzle2<Float4, Y> &y);
-		template<int X, int Y>
-		Float4(const SwizzleMask2<Float4, X> &x, const Swizzle2<Float4, Y> &y);
-		template<int X, int Y>
-		Float4(const Swizzle2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y);
-		template<int X, int Y>
-		Float4(const SwizzleMask2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y);
-
-		RValue<Float4> operator=(float replicate);
-		RValue<Float4> operator=(RValue<Float4> rhs);
-		RValue<Float4> operator=(const Float4 &rhs);
-		RValue<Float4> operator=(const Reference<Float4> &rhs);
-		RValue<Float4> operator=(RValue<Float> rhs);
-		RValue<Float4> operator=(const Float &rhs);
-		RValue<Float4> operator=(const Reference<Float> &rhs);
-
-		template<int T>
-		RValue<Float4> operator=(const SwizzleMask1<Float4, T> &rhs);
-		template<int T>
-		RValue<Float4> operator=(const Swizzle4<Float4, T> &rhs);
-
-		static Type *getType();
-		static Float4 negative_inf();
-		static Float4 positive_inf();
-	private:
-		void constant(float x, float y, float z, float w);
-		void infinity_constant(bool negative);
-	};
-
-	RValue<Float4> operator+(RValue<Float4> lhs, RValue<Float4> rhs);
-	RValue<Float4> operator-(RValue<Float4> lhs, RValue<Float4> rhs);
-	RValue<Float4> operator*(RValue<Float4> lhs, RValue<Float4> rhs);
-	RValue<Float4> operator/(RValue<Float4> lhs, RValue<Float4> rhs);
-	RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs);
-	RValue<Float4> operator+=(Float4 &lhs, RValue<Float4> rhs);
-	RValue<Float4> operator-=(Float4 &lhs, RValue<Float4> rhs);
-	RValue<Float4> operator*=(Float4 &lhs, RValue<Float4> rhs);
-	RValue<Float4> operator/=(Float4 &lhs, RValue<Float4> rhs);
-	RValue<Float4> operator%=(Float4 &lhs, RValue<Float4> rhs);
-	RValue<Float4> operator+(RValue<Float4> val);
-	RValue<Float4> operator-(RValue<Float4> val);
-
-	RValue<Float4> Abs(RValue<Float4> x);
-	RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y);
-	RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y);
-	RValue<Float4> Rcp_pp(RValue<Float4> val, bool exactAtPow2 = false);
-	RValue<Float4> RcpSqrt_pp(RValue<Float4> val);
-	RValue<Float4> Sqrt(RValue<Float4> x);
-	RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i);
-	RValue<Float> Extract(RValue<Float4> x, int i);
-	RValue<Float4> Swizzle(RValue<Float4> x, uint16_t select);
-	RValue<Float4> Shuffle(RValue<Float4> x, RValue<Float4> y, uint16_t select);
-	RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, uint16_t imm);
-	RValue<Float4> UnpackLow(RValue<Float4> x, RValue<Float4> y);
-	RValue<Float4> UnpackHigh(RValue<Float4> x, RValue<Float4> y);
-	RValue<Float4> Mask(Float4 &lhs, RValue<Float4> rhs, uint16_t select);
-	RValue<Int> SignMask(RValue<Float4> x);
-
-	// Ordered comparison functions
-	RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y);
-	inline RValue<Int4> CmpGT(RValue<Float4> x, RValue<Float4> y) { return CmpNLE(x, y); }
-	inline RValue<Int4> CmpGE(RValue<Float4> x, RValue<Float4> y) { return CmpNLT(x, y); }
-
-	// Unordered comparison functions
-	RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y);
-	inline RValue<Int4> CmpUGT(RValue<Float4> x, RValue<Float4> y) { return CmpUNLE(x, y); }
-	inline RValue<Int4> CmpUGE(RValue<Float4> x, RValue<Float4> y) { return CmpUNLT(x, y); }
-
-	RValue<Int4> IsInf(RValue<Float4> x);
-	RValue<Int4> IsNan(RValue<Float4> x);
-	RValue<Float4> Round(RValue<Float4> x);
-	RValue<Float4> Trunc(RValue<Float4> x);
-	RValue<Float4> Frac(RValue<Float4> x);
-	RValue<Float4> Floor(RValue<Float4> x);
-	RValue<Float4> Ceil(RValue<Float4> x);
-
-	// Trigonometric functions
-	// TODO: Currently unimplemented for Subzero.
-	RValue<Float4> Sin(RValue<Float4> x);
-	RValue<Float4> Cos(RValue<Float4> x);
-	RValue<Float4> Tan(RValue<Float4> x);
-	RValue<Float4> Asin(RValue<Float4> x);
-	RValue<Float4> Acos(RValue<Float4> x);
-	RValue<Float4> Atan(RValue<Float4> x);
-	RValue<Float4> Sinh(RValue<Float4> x);
-	RValue<Float4> Cosh(RValue<Float4> x);
-	RValue<Float4> Tanh(RValue<Float4> x);
-	RValue<Float4> Asinh(RValue<Float4> x);
-	RValue<Float4> Acosh(RValue<Float4> x);
-	RValue<Float4> Atanh(RValue<Float4> x);
-	RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y);
-
-	// Exponential functions
-	// TODO: Currently unimplemented for Subzero.
-	RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
-	RValue<Float4> Exp(RValue<Float4> x);
-	RValue<Float4> Log(RValue<Float4> x);
-	RValue<Float4> Exp2(RValue<Float4> x);
-	RValue<Float4> Log2(RValue<Float4> x);
-
-	// Bit Manipulation functions.
-	// TODO: Currently unimplemented for Subzero.
-
-	// Count leading zeros.
-	// Returns 32 when: !isZeroUndef && x == 0.
-	// Returns an undefined value when: isZeroUndef && x == 0.
-	RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef);
-	RValue<UInt4> Ctlz(RValue<UInt4> x, bool isZeroUndef);
-
-	// Count trailing zeros.
-	// Returns 32 when: !isZeroUndef && x == 0.
-	// Returns an undefined value when: isZeroUndef && x == 0.
-	RValue<UInt> Cttz(RValue<UInt> x, bool isZeroUndef);
-	RValue<UInt4> Cttz(RValue<UInt4> x, bool isZeroUndef);
-
-	template<class T>
-	class Pointer : public LValue<Pointer<T>>
-	{
-	public:
-		template<class S>
-		Pointer(RValue<Pointer<S>> pointerS, int alignment = 1) : alignment(alignment)
-		{
-			Value *pointerT = Nucleus::createBitCast(pointerS.value, Nucleus::getPointerType(T::getType()));
-			LValue<Pointer<T>>::storeValue(pointerT);
-		}
-
-		template<class S>
-		Pointer(const Pointer<S> &pointer, int alignment = 1) : alignment(alignment)
-		{
-			Value *pointerS = pointer.loadValue();
-			Value *pointerT = Nucleus::createBitCast(pointerS, Nucleus::getPointerType(T::getType()));
-			LValue<Pointer<T>>::storeValue(pointerT);
-		}
-
-		Pointer(Argument<Pointer<T>> argument);
-
-		Pointer();
-		Pointer(RValue<Pointer<T>> rhs);
-		Pointer(const Pointer<T> &rhs);
-		Pointer(const Reference<Pointer<T>> &rhs);
-		Pointer(std::nullptr_t);
-
-		RValue<Pointer<T>> operator=(RValue<Pointer<T>> rhs);
-		RValue<Pointer<T>> operator=(const Pointer<T> &rhs);
-		RValue<Pointer<T>> operator=(const Reference<Pointer<T>> &rhs);
-		RValue<Pointer<T>> operator=(std::nullptr_t);
-
-		Reference<T> operator*();
-		Reference<T> operator[](int index);
-		Reference<T> operator[](unsigned int index);
-		Reference<T> operator[](RValue<Int> index);
-		Reference<T> operator[](RValue<UInt> index);
-
-		static Type *getType();
-
-	private:
-		const int alignment;
-	};
-
-	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, int offset);
-	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<Int> offset);
-	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<UInt> offset);
-	RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, int offset);
-	RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<Int> offset);
-	RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<UInt> offset);
-
-	RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, int offset);
-	RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<Int> offset);
-	RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<UInt> offset);
-	RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, int offset);
-	RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<Int> offset);
-	RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<UInt> offset);
-
-	template <typename T>
-	RValue<Bool> operator==(const Pointer<T> &lhs, const Pointer<T> &rhs)
-	{
-		return RValue<Bool>(Nucleus::createPtrEQ(lhs.loadValue(), rhs.loadValue()));
+		Value *pointerT = Nucleus::createBitCast(pointerS.value, Nucleus::getPointerType(T::getType()));
+		LValue<Pointer<T>>::storeValue(pointerT);
 	}
 
-	template<typename T>
-	RValue<T> Load(RValue<Pointer<T>> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
+	template<class S>
+	Pointer(const Pointer<S> &pointer, int alignment = 1) : alignment(alignment)
 	{
-		return RValue<T>(Nucleus::createLoad(pointer.value, T::getType(), false, alignment, atomic, memoryOrder));
+		Value *pointerS = pointer.loadValue();
+		Value *pointerT = Nucleus::createBitCast(pointerS, Nucleus::getPointerType(T::getType()));
+		LValue<Pointer<T>>::storeValue(pointerT);
 	}
 
-	template<typename T>
-	RValue<T> Load(Pointer<T> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
-	{
-		return Load(RValue<Pointer<T>>(pointer), alignment, atomic, memoryOrder);
-	}
+	Pointer(Argument<Pointer<T>> argument);
 
-	// TODO: Use SIMD to template these.
-	RValue<Float4> MaskedLoad(RValue<Pointer<Float4>> base, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-	RValue<Int4> MaskedLoad(RValue<Pointer<Int4>> base, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-	void MaskedStore(RValue<Pointer<Float4>> base, RValue<Float4> val, RValue<Int4> mask, unsigned int alignment);
-	void MaskedStore(RValue<Pointer<Int4>> base, RValue<Int4> val, RValue<Int4> mask, unsigned int alignment);
+	Pointer();
+	Pointer(RValue<Pointer<T>> rhs);
+	Pointer(const Pointer<T> &rhs);
+	Pointer(const Reference<Pointer<T>> &rhs);
+	Pointer(std::nullptr_t);
 
-	RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-	RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-	void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
-	void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+	RValue<Pointer<T>> operator=(RValue<Pointer<T>> rhs);
+	RValue<Pointer<T>> operator=(const Pointer<T> &rhs);
+	RValue<Pointer<T>> operator=(const Reference<Pointer<T>> &rhs);
+	RValue<Pointer<T>> operator=(std::nullptr_t);
 
-	template<typename T>
-	void Store(RValue<T> value, RValue<Pointer<T>> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
-	{
-		Nucleus::createStore(value.value, pointer.value, T::getType(), false, alignment, atomic, memoryOrder);
-	}
+	Reference<T> operator*();
+	Reference<T> operator[](int index);
+	Reference<T> operator[](unsigned int index);
+	Reference<T> operator[](RValue<Int> index);
+	Reference<T> operator[](RValue<UInt> index);
 
-	template<typename T>
-	void Store(RValue<T> value, Pointer<T> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
-	{
-		Store(value, RValue<Pointer<T>>(pointer), alignment, atomic, memoryOrder);
-	}
+	static Type *getType();
 
-	template<typename T>
-	void Store(T value, Pointer<T> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
-	{
-		Store(RValue<T>(value), RValue<Pointer<T>>(pointer), alignment, atomic, memoryOrder);
-	}
+private:
+	const int alignment;
+};
 
-	// Fence adds a memory barrier that enforces ordering constraints on memory
-	// operations. memoryOrder can only be one of:
-	// std::memory_order_acquire, std::memory_order_release,
-	// std::memory_order_acq_rel, or std::memory_order_seq_cst.
-	void Fence(std::memory_order memoryOrder);
+RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, int offset);
+RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<Int> offset);
+RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<UInt> offset);
+RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, int offset);
+RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<Int> offset);
+RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<UInt> offset);
 
-	template<class T, int S = 1>
-	class Array : public LValue<T>
-	{
-	public:
-		Array(int size = S);
+RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, int offset);
+RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<Int> offset);
+RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<UInt> offset);
+RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, int offset);
+RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<Int> offset);
+RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<UInt> offset);
 
-		Reference<T> operator[](int index);
-		Reference<T> operator[](unsigned int index);
-		Reference<T> operator[](RValue<Int> index);
-		Reference<T> operator[](RValue<UInt> index);
+template <typename T>
+RValue<Bool> operator==(const Pointer<T> &lhs, const Pointer<T> &rhs)
+{
+	return RValue<Bool>(Nucleus::createPtrEQ(lhs.loadValue(), rhs.loadValue()));
+}
 
-		// self() returns the this pointer to this Array object.
-		// This function exists because operator&() is overloaded by LValue<T>.
-		inline Array* self() { return this; }
-	};
+template<typename T>
+RValue<T> Load(RValue<Pointer<T>> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
+{
+	return RValue<T>(Nucleus::createLoad(pointer.value, T::getType(), false, alignment, atomic, memoryOrder));
+}
+
+template<typename T>
+RValue<T> Load(Pointer<T> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
+{
+	return Load(RValue<Pointer<T>>(pointer), alignment, atomic, memoryOrder);
+}
+
+// TODO: Use SIMD to template these.
+RValue<Float4> MaskedLoad(RValue<Pointer<Float4>> base, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+RValue<Int4> MaskedLoad(RValue<Pointer<Int4>> base, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+void MaskedStore(RValue<Pointer<Float4>> base, RValue<Float4> val, RValue<Int4> mask, unsigned int alignment);
+void MaskedStore(RValue<Pointer<Int4>> base, RValue<Int4> val, RValue<Int4> mask, unsigned int alignment);
+
+RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+
+template<typename T>
+void Store(RValue<T> value, RValue<Pointer<T>> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)  // Emits a store of 'value' through 'pointer'.
+{
+	Nucleus::createStore(value.value, pointer.value, T::getType(), false, alignment, atomic, memoryOrder);  // NOTE(review): the literal 'false' is presumably createStore's volatile flag - confirm against Nucleus.
+}
+
+template<typename T>
+void Store(RValue<T> value, Pointer<T> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)  // Convenience overload taking the destination as a Pointer<T> variable.
+{
+	Store(value, RValue<Pointer<T>>(pointer), alignment, atomic, memoryOrder);  // Convert the pointer variable to an RValue and forward.
+}
+
+template<typename T>
+void Store(T value, Pointer<T> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)  // Convenience overload taking both the value and the destination as variables.
+{
+	Store(RValue<T>(value), RValue<Pointer<T>>(pointer), alignment, atomic, memoryOrder);  // Convert both operands to RValues and forward to the RValue/RValue overload.
+}
+
+// Fence adds a memory barrier that enforces ordering constraints on memory
+// operations. memoryOrder can only be one of:
+// std::memory_order_acquire, std::memory_order_release,
+// std::memory_order_acq_rel, or std::memory_order_seq_cst.
+void Fence(std::memory_order memoryOrder);
+
+template<class T, int S = 1>
+class Array : public LValue<T>  // Reactor array of T elements; S only supplies the default element count.
+{
+public:
+	Array(int size = S);  // 'size' is passed to the LValue<T> base as its array size.
+
+	Reference<T> operator[](int index);  // Constant signed index; checked with assert() against the array size.
+	Reference<T> operator[](unsigned int index);  // Constant unsigned index; checked with assert() against the array size.
+	Reference<T> operator[](RValue<Int> index);  // Signed Reactor index value; no bounds check is emitted.
+	Reference<T> operator[](RValue<UInt> index);  // Unsigned Reactor index value; no bounds check is emitted.
+
+	// self() returns the this pointer to this Array object.
+	// This function exists because operator&() is overloaded by LValue<T>.
+	inline Array* self() { return this; }
+};
 
 //	RValue<Array<T>> operator++(Array<T> &val, int);   // Post-increment
 //	const Array<T> &operator++(Array<T> &val);   // Pre-increment
 //	RValue<Array<T>> operator--(Array<T> &val, int);   // Post-decrement
 //	const Array<T> &operator--(Array<T> &val);   // Pre-decrement
 
-	void branch(RValue<Bool> cmp, BasicBlock *bodyBB, BasicBlock *endBB);
+void branch(RValue<Bool> cmp, BasicBlock *bodyBB, BasicBlock *endBB);
 
-	// ValueOf returns a rr::Value* for the given C-type, RValue<T>, LValue<T>
-	// or Reference<T>.
-	template <typename T>
-	inline Value* ValueOf(const T &v)
-	{
-		return ReactorType<T>::cast(v).loadValue();
-	}
-
-	void Return();
-
-	template<class T>
-	void Return(const T &ret)
-	{
-		static_assert(CanBeUsedAsReturn< ReactorTypeT<T> >::value, "Unsupported type for Return()");
-		Nucleus::createRet(ValueOf<T>(ret));
-		// Place any unreachable instructions in an unreferenced block.
-		Nucleus::setInsertBlock(Nucleus::createBasicBlock());
-	}
-
-	// Generic template, leave undefined!
-	template<typename FunctionType>
-	class Function;
-
-	// Specialized for function types
-	template<typename Return, typename... Arguments>
-	class Function<Return(Arguments...)>
-	{
-		// Static assert that the function signature is valid.
-		static_assert(sizeof(AssertFunctionSignatureIsValid<Return(Arguments...)>) >= 0, "Invalid function signature");
-
-	public:
-		Function();
-
-		virtual ~Function();
-
-		template<int index>
-		Argument<typename std::tuple_element<index, std::tuple<Arguments...>>::type> Arg() const
-		{
-			Value *arg = Nucleus::getArgument(index);
-			return Argument<typename std::tuple_element<index, std::tuple<Arguments...>>::type>(arg);
-		}
-
-		std::shared_ptr<Routine> operator()(const char *name, ...);
-		std::shared_ptr<Routine> operator()(const Config::Edit &cfg, const char *name, ...);
-
-	protected:
-		Nucleus *core;
-		std::vector<Type*> arguments;
-	};
-
-	template<typename Return>
-	class Function<Return()> : public Function<Return(Void)>
-	{
-	};
-
-	// FunctionT accepts a C-style function type template argument, allowing it to return a type-safe RoutineT wrapper
-	template<typename FunctionType>
-	class FunctionT;
-
-	template<typename Return, typename... Arguments>
-	class FunctionT<Return(Arguments...)> : public Function<CToReactorT<Return>(CToReactorT<Arguments>...)>
-	{
-	public:
-		// Type of base class
-		using BaseType = Function<CToReactorT<Return>(CToReactorT<Arguments>...)>;
-
-		// Function type, e.g. void(int,float)
-		using CFunctionType = Return(Arguments...);
-
-		// Reactor function type, e.g. Void(Int, Float)
-		using ReactorFunctionType = CToReactorT<Return>(CToReactorT<Arguments>...);
-
-		// Returned RoutineT type
-		using RoutineType = RoutineT<CFunctionType>;
-
-		// Hide base implementations of operator()
-
-		RoutineType operator()(const char* name, ...)
-		{
-			return RoutineType(BaseType::operator()(name));
-		}
-
-		RoutineType operator()(const Config::Edit& cfg, const char* name, ...)
-		{
-			return RoutineType(BaseType::operator()(cfg, name));
-		}
-	};
-
-	RValue<Long> Ticks();
+// ValueOf returns a rr::Value* for the given C-type, RValue<T>, LValue<T>
+// or Reference<T>.
+template <typename T>
+inline Value* ValueOf(const T &v)
+{
+	return ReactorType<T>::cast(v).loadValue();
 }
 
-namespace rr
+void Return();
+
+template<class T>
+void Return(const T &ret)
 {
-	template<class T>
-	LValue<T>::LValue(int arraySize) : Variable(T::getType(), arraySize)
+	static_assert(CanBeUsedAsReturn< ReactorTypeT<T> >::value, "Unsupported type for Return()");
+	Nucleus::createRet(ValueOf<T>(ret));
+	// Place any unreachable instructions in an unreferenced block.
+	Nucleus::setInsertBlock(Nucleus::createBasicBlock());
+}
+
+// Generic template, leave undefined!
+template<typename FunctionType>
+class Function;
+
+// Specialized for function types
+template<typename Return, typename... Arguments>
+class Function<Return(Arguments...)>
+{
+	// Static assert that the function signature is valid.
+	static_assert(sizeof(AssertFunctionSignatureIsValid<Return(Arguments...)>) >= 0, "Invalid function signature");
+
+public:
+	Function();  // The definition allocates a Nucleus with 'new' and stores it in 'core'.
+
+	virtual ~Function();
+
+	template<int index>
+	Argument<typename std::tuple_element<index, std::tuple<Arguments...>>::type> Arg() const  // Returns the index-th function argument, typed as the index-th entry of Arguments.
 	{
+		Value *arg = Nucleus::getArgument(index);
+		return Argument<typename std::tuple_element<index, std::tuple<Arguments...>>::type>(arg);
+	}
+
+	std::shared_ptr<Routine> operator()(const char *name, ...);  // NOTE(review): presumably 'name' is a printf-style format for the routine name - confirm against the definition.
+	std::shared_ptr<Routine> operator()(const Config::Edit &cfg, const char *name, ...);  // Same as above with an additional configuration edit.
+
+protected:
+	Nucleus *core;  // Raw pointer assigned from 'new Nucleus()'. NOTE(review): presumably released by ~Function(); the class is copyable, so copies would alias it - confirm.
+	std::vector<Type*> arguments;
+};
+
+template<typename Return>
+class Function<Return()> : public Function<Return(Void)>  // An empty parameter list is expressed as the single-parameter signature Return(Void).
+{
+};
+
+// FunctionT accepts a C-style function type template argument, allowing it to return a type-safe RoutineT wrapper
+template<typename FunctionType>
+class FunctionT;
+
+template<typename Return, typename... Arguments>
+class FunctionT<Return(Arguments...)> : public Function<CToReactorT<Return>(CToReactorT<Arguments>...)>  // Maps each C type in the signature to its Reactor type via CToReactorT.
+{
+public:
+	// Type of base class
+	using BaseType = Function<CToReactorT<Return>(CToReactorT<Arguments>...)>;
+
+	// Function type, e.g. void(int,float)
+	using CFunctionType = Return(Arguments...);
+
+	// Reactor function type, e.g. Void(Int, Float)
+	using ReactorFunctionType = CToReactorT<Return>(CToReactorT<Arguments>...);
+
+	// Returned RoutineT type
+	using RoutineType = RoutineT<CFunctionType>;
+
+	// Hide base implementations of operator()
+
+	RoutineType operator()(const char* name, ...)  // Wraps the base result in the type-safe RoutineType.
+	{
+		return RoutineType(BaseType::operator()(name));  // NOTE(review): only 'name' is forwarded; the variadic arguments are dropped, so 'name' must not depend on them - confirm how the base consumes its varargs.
+	}
+
+	RoutineType operator()(const Config::Edit& cfg, const char* name, ...)  // Same as above with a configuration edit.
+	{
+		return RoutineType(BaseType::operator()(cfg, name));  // NOTE(review): the variadic arguments are likewise not forwarded here.
+	}
+};
+
+RValue<Long> Ticks();
+
+}  // namespace rr
+
+/* Inline implementations */
+
+namespace rr {
+
+template<class T>
+LValue<T>::LValue(int arraySize) : Variable(T::getType(), arraySize)
+{
 #ifdef ENABLE_RR_DEBUG_INFO
-		materialize();
+	materialize();
 #endif // ENABLE_RR_DEBUG_INFO
-	}
+}
 
-	inline void Variable::materialize() const
+inline void Variable::materialize() const
+{
+	if(!address)
 	{
-		if(!address)
-		{
-			address = Nucleus::allocateStackVariable(type, arraySize);
-			RR_DEBUG_INFO_EMIT_VAR(address);
+		address = Nucleus::allocateStackVariable(type, arraySize);
+		RR_DEBUG_INFO_EMIT_VAR(address);
 
-			if(rvalue)
-			{
-				storeValue(rvalue);
-				rvalue = nullptr;
-			}
-		}
-	}
-
-	inline Value *Variable::loadValue() const
-	{
 		if(rvalue)
 		{
-			return rvalue;
+			storeValue(rvalue);
+			rvalue = nullptr;
 		}
+	}
+}
 
-		if(!address)
-		{
-			// TODO: Return undef instead.
-			materialize();
-		}
-
-		return Nucleus::createLoad(address, type, false, 0);
+inline Value *Variable::loadValue() const
+{
+	if(rvalue)
+	{
+		return rvalue;
 	}
 
-	inline Value *Variable::storeValue(Value *value) const
+	if(!address)
 	{
-		if(address)
-		{
-			return Nucleus::createStore(value, address, type, false, 0);
-		}
-
-		rvalue = value;
-
-		return value;
-	}
-
-	inline Value *Variable::getBaseAddress() const
-	{
+		// TODO: Return undef instead.
 		materialize();
-
-		return address;
 	}
 
-	inline Value *Variable::getElementPointer(Value *index, bool unsignedIndex) const
+	return Nucleus::createLoad(address, type, false, 0);
+}
+
+inline Value *Variable::storeValue(Value *value) const
+{
+	if(address)
 	{
-		return Nucleus::createGEP(getBaseAddress(), type, index, unsignedIndex);
+		return Nucleus::createStore(value, address, type, false, 0);
 	}
 
-	template<class T>
-	RValue<Pointer<T>> LValue<T>::operator&()
-	{
-		return RValue<Pointer<T>>(getBaseAddress());
-	}
+	rvalue = value;
 
-	template<class T>
-	Reference<T>::Reference(Value *pointer, int alignment) : alignment(alignment)
-	{
-		address = pointer;
-	}
+	return value;
+}
 
-	template<class T>
-	RValue<T> Reference<T>::operator=(RValue<T> rhs) const
-	{
-		Nucleus::createStore(rhs.value, address, T::getType(), false, alignment);
+inline Value *Variable::getBaseAddress() const
+{
+	materialize();
 
-		return rhs;
-	}
+	return address;
+}
 
-	template<class T>
-	RValue<T> Reference<T>::operator=(const Reference<T> &ref) const
-	{
-		Value *tmp = Nucleus::createLoad(ref.address, T::getType(), false, ref.alignment);
-		Nucleus::createStore(tmp, address, T::getType(), false, alignment);
+inline Value *Variable::getElementPointer(Value *index, bool unsignedIndex) const
+{
+	return Nucleus::createGEP(getBaseAddress(), type, index, unsignedIndex);
+}
 
-		return RValue<T>(tmp);
-	}
+template<class T>
+RValue<Pointer<T>> LValue<T>::operator&()
+{
+	return RValue<Pointer<T>>(getBaseAddress());
+}
 
-	template<class T>
-	RValue<T> Reference<T>::operator+=(RValue<T> rhs) const
-	{
-		return *this = *this + rhs;
-	}
+template<class T>
+Reference<T>::Reference(Value *pointer, int alignment) : alignment(alignment)
+{
+	address = pointer;
+}
 
-	template<class T>
-	Value *Reference<T>::loadValue() const
-	{
-		return Nucleus::createLoad(address, T::getType(), false, alignment);
-	}
+template<class T>
+RValue<T> Reference<T>::operator=(RValue<T> rhs) const
+{
+	Nucleus::createStore(rhs.value, address, T::getType(), false, alignment);
 
-	template<class T>
-	int Reference<T>::getAlignment() const
-	{
-		return alignment;
-	}
+	return rhs;
+}
+
+template<class T>
+RValue<T> Reference<T>::operator=(const Reference<T> &ref) const
+{
+	Value *tmp = Nucleus::createLoad(ref.address, T::getType(), false, ref.alignment);
+	Nucleus::createStore(tmp, address, T::getType(), false, alignment);
+
+	return RValue<T>(tmp);
+}
+
+template<class T>
+RValue<T> Reference<T>::operator+=(RValue<T> rhs) const
+{
+	return *this = *this + rhs;
+}
+
+template<class T>
+Value *Reference<T>::loadValue() const
+{
+	return Nucleus::createLoad(address, T::getType(), false, alignment);
+}
+
+template<class T>
+int Reference<T>::getAlignment() const
+{
+	return alignment;
+}
 
 #ifdef ENABLE_RR_DEBUG_INFO
-	template<class T>
-	RValue<T>::RValue(const RValue<T> &rvalue) : value(rvalue.value)
-	{
-		RR_DEBUG_INFO_EMIT_VAR(value);
-	}
+template<class T>
+RValue<T>::RValue(const RValue<T> &rvalue) : value(rvalue.value)
+{
+	RR_DEBUG_INFO_EMIT_VAR(value);
+}
 #endif // ENABLE_RR_DEBUG_INFO
 
-	template<class T>
-	RValue<T>::RValue(Value *rvalue)
-	{
-		assert(Nucleus::createBitCast(rvalue, T::getType()) == rvalue);   // Run-time type should match T, so bitcast is no-op.
+template<class T>
+RValue<T>::RValue(Value *rvalue)
+{
+	assert(Nucleus::createBitCast(rvalue, T::getType()) == rvalue);   // Run-time type should match T, so bitcast is no-op.
 
-		value = rvalue;
-		RR_DEBUG_INFO_EMIT_VAR(value);
-	}
+	value = rvalue;
+	RR_DEBUG_INFO_EMIT_VAR(value);
+}
 
-	template<class T>
-	RValue<T>::RValue(const T &lvalue)
-	{
-		value = lvalue.loadValue();
-		RR_DEBUG_INFO_EMIT_VAR(value);
-	}
+template<class T>
+RValue<T>::RValue(const T &lvalue)
+{
+	value = lvalue.loadValue();
+	RR_DEBUG_INFO_EMIT_VAR(value);
+}
 
-	template<class T>
-	RValue<T>::RValue(typename BoolLiteral<T>::type i)
-	{
-		value = Nucleus::createConstantBool(i);
-		RR_DEBUG_INFO_EMIT_VAR(value);
-	}
+template<class T>
+RValue<T>::RValue(typename BoolLiteral<T>::type i)
+{
+	value = Nucleus::createConstantBool(i);
+	RR_DEBUG_INFO_EMIT_VAR(value);
+}
 
-	template<class T>
-	RValue<T>::RValue(typename IntLiteral<T>::type i)
-	{
-		value = Nucleus::createConstantInt(i);
-		RR_DEBUG_INFO_EMIT_VAR(value);
-	}
+template<class T>
+RValue<T>::RValue(typename IntLiteral<T>::type i)
+{
+	value = Nucleus::createConstantInt(i);
+	RR_DEBUG_INFO_EMIT_VAR(value);
+}
 
-	template<class T>
-	RValue<T>::RValue(typename FloatLiteral<T>::type f)
-	{
-		value = Nucleus::createConstantFloat(f);
-		RR_DEBUG_INFO_EMIT_VAR(value);
-	}
+template<class T>
+RValue<T>::RValue(typename FloatLiteral<T>::type f)
+{
+	value = Nucleus::createConstantFloat(f);
+	RR_DEBUG_INFO_EMIT_VAR(value);
+}
 
-	template<class T>
-	RValue<T>::RValue(const Reference<T> &ref)
-	{
-		value = ref.loadValue();
-		RR_DEBUG_INFO_EMIT_VAR(value);
-	}
+template<class T>
+RValue<T>::RValue(const Reference<T> &ref)
+{
+	value = ref.loadValue();
+	RR_DEBUG_INFO_EMIT_VAR(value);
+}
 
-	template<class Vector4, int T>
-	Swizzle2<Vector4, T>::operator RValue<Vector4>() const
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *vector = parent->loadValue();
+template<class Vector4, int T>
+Swizzle2<Vector4, T>::operator RValue<Vector4>() const
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *vector = parent->loadValue();
 
-		return Swizzle(RValue<Vector4>(vector), T);
-	}
+	return Swizzle(RValue<Vector4>(vector), T);
+}
 
-	template<class Vector4, int T>
-	Swizzle4<Vector4, T>::operator RValue<Vector4>() const
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *vector = parent->loadValue();
+template<class Vector4, int T>
+Swizzle4<Vector4, T>::operator RValue<Vector4>() const
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *vector = parent->loadValue();
 
-		return Swizzle(RValue<Vector4>(vector), T);
-	}
+	return Swizzle(RValue<Vector4>(vector), T);
+}
 
-	template<class Vector4, int T>
-	SwizzleMask4<Vector4, T>::operator RValue<Vector4>() const
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *vector = parent->loadValue();
+template<class Vector4, int T>
+SwizzleMask4<Vector4, T>::operator RValue<Vector4>() const
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *vector = parent->loadValue();
 
-		return Swizzle(RValue<Vector4>(vector), T);
-	}
+	return Swizzle(RValue<Vector4>(vector), T);
+}
 
-	template<class Vector4, int T>
-	RValue<Vector4> SwizzleMask4<Vector4, T>::operator=(RValue<Vector4> rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return Mask(*parent, rhs, T);
-	}
+template<class Vector4, int T>
+RValue<Vector4> SwizzleMask4<Vector4, T>::operator=(RValue<Vector4> rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return Mask(*parent, rhs, T);
+}
 
-	template<class Vector4, int T>
-	RValue<Vector4> SwizzleMask4<Vector4, T>::operator=(RValue<typename Scalar<Vector4>::Type> rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return Mask(*parent, Vector4(rhs), T);
-	}
+template<class Vector4, int T>
+RValue<Vector4> SwizzleMask4<Vector4, T>::operator=(RValue<typename Scalar<Vector4>::Type> rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return Mask(*parent, Vector4(rhs), T);
+}
 
-	template<class Vector4, int T>
-	SwizzleMask1<Vector4, T>::operator RValue<typename Scalar<Vector4>::Type>() const   // FIXME: Call a non-template function
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return Extract(*parent, T & 0x3);
-	}
+template<class Vector4, int T>
+SwizzleMask1<Vector4, T>::operator RValue<typename Scalar<Vector4>::Type>() const   // FIXME: Call a non-template function
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return Extract(*parent, T & 0x3);
+}
 
-	template<class Vector4, int T>
-	SwizzleMask1<Vector4, T>::operator RValue<Vector4>() const
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *vector = parent->loadValue();
+template<class Vector4, int T>
+SwizzleMask1<Vector4, T>::operator RValue<Vector4>() const
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *vector = parent->loadValue();
 
-		return Swizzle(RValue<Vector4>(vector), T);
-	}
+	return Swizzle(RValue<Vector4>(vector), T);
+}
 
-	template<class Vector4, int T>
-	RValue<Vector4> SwizzleMask1<Vector4, T>::operator=(float x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return *parent = Insert(*parent, Float(x), T & 0x3);
-	}
+template<class Vector4, int T>
+RValue<Vector4> SwizzleMask1<Vector4, T>::operator=(float x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return *parent = Insert(*parent, Float(x), T & 0x3);
+}
 
-	template<class Vector4, int T>
-	RValue<Vector4> SwizzleMask1<Vector4, T>::operator=(RValue<Vector4> rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return Mask(*parent, Float4(rhs), T);
-	}
+template<class Vector4, int T>
+RValue<Vector4> SwizzleMask1<Vector4, T>::operator=(RValue<Vector4> rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return Mask(*parent, Float4(rhs), T);
+}
 
-	template<class Vector4, int T>
-	RValue<Vector4> SwizzleMask1<Vector4, T>::operator=(RValue<typename Scalar<Vector4>::Type> rhs)   // FIXME: Call a non-template function
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return *parent = Insert(*parent, rhs, T & 0x3);
-	}
+template<class Vector4, int T>
+RValue<Vector4> SwizzleMask1<Vector4, T>::operator=(RValue<typename Scalar<Vector4>::Type> rhs)   // FIXME: Call a non-template function
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return *parent = Insert(*parent, rhs, T & 0x3);
+}
 
-	template<class Vector4, int T>
-	SwizzleMask2<Vector4, T>::operator RValue<Vector4>() const
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *vector = parent->loadValue();
+template<class Vector4, int T>
+SwizzleMask2<Vector4, T>::operator RValue<Vector4>() const  // Reads the parent vector and returns it swizzled by T.
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *vector = parent->loadValue();
 
-		return Swizzle(RValue<Float4>(vector), T);
-	}
+	return Swizzle(RValue<Vector4>(vector), T);  // Wrap as the template's own vector type, like the other swizzle conversions, instead of hard-coding Float4.
+}
 
-	template<class Vector4, int T>
-	RValue<Vector4> SwizzleMask2<Vector4, T>::operator=(RValue<Vector4> rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return Mask(*parent, Float4(rhs), T);
-	}
+template<class Vector4, int T>
+RValue<Vector4> SwizzleMask2<Vector4, T>::operator=(RValue<Vector4> rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return Mask(*parent, Float4(rhs), T);
+}
 
-	template<int T>
-	Float::Float(const SwizzleMask1<Float4, T> &rhs)
-	{
-		*this = rhs.operator RValue<Float>();
-	}
+template<int T>
+Float::Float(const SwizzleMask1<Float4, T> &rhs)
+{
+	*this = rhs.operator RValue<Float>();
+}
 
-	template<int T>
-	RValue<Float> Float::operator=(const SwizzleMask1<Float4, T> &rhs)
-	{
-		return *this = rhs.operator RValue<Float>();
-	}
+template<int T>
+RValue<Float> Float::operator=(const SwizzleMask1<Float4, T> &rhs)
+{
+	return *this = rhs.operator RValue<Float>();
+}
 
-	template<int T>
-	Float4::Float4(const SwizzleMask1<Float4, T> &rhs) : XYZW(this)
-	{
-		*this = rhs.operator RValue<Float4>();
-	}
+template<int T>
+Float4::Float4(const SwizzleMask1<Float4, T> &rhs) : XYZW(this)
+{
+	*this = rhs.operator RValue<Float4>();
+}
 
-	template<int T>
-	Float4::Float4(const Swizzle4<Float4, T> &rhs) : XYZW(this)
-	{
-		*this = rhs.operator RValue<Float4>();
-	}
+template<int T>
+Float4::Float4(const Swizzle4<Float4, T> &rhs) : XYZW(this)
+{
+	*this = rhs.operator RValue<Float4>();
+}
 
-	template<int X, int Y>
-	Float4::Float4(const Swizzle2<Float4, X> &x, const Swizzle2<Float4, Y> &y) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		*this = ShuffleLowHigh(*x.parent, *y.parent, (uint16_t(X) & 0xFF00u) | (uint16_t(Y >> 8) & 0x00FFu));
-	}
+template<int X, int Y>
+Float4::Float4(const Swizzle2<Float4, X> &x, const Swizzle2<Float4, Y> &y) : XYZW(this)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	*this = ShuffleLowHigh(*x.parent, *y.parent, (uint16_t(X) & 0xFF00u) | (uint16_t(Y >> 8) & 0x00FFu));
+}
 
-	template<int X, int Y>
-	Float4::Float4(const SwizzleMask2<Float4, X> &x, const Swizzle2<Float4, Y> &y) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		*this = ShuffleLowHigh(*x.parent, *y.parent, (uint16_t(X) & 0xFF00u) | (uint16_t(Y >> 8) & 0x00FFu));
-	}
+template<int X, int Y>
+Float4::Float4(const SwizzleMask2<Float4, X> &x, const Swizzle2<Float4, Y> &y) : XYZW(this)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	*this = ShuffleLowHigh(*x.parent, *y.parent, (uint16_t(X) & 0xFF00u) | (uint16_t(Y >> 8) & 0x00FFu));
+}
 
-	template<int X, int Y>
-	Float4::Float4(const Swizzle2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		*this = ShuffleLowHigh(*x.parent, *y.parent, (uint16_t(X) & 0xFF00u) | (uint16_t(Y >> 8) & 0x00FFu));
-	}
+template<int X, int Y>
+Float4::Float4(const Swizzle2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y) : XYZW(this)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	*this = ShuffleLowHigh(*x.parent, *y.parent, (uint16_t(X) & 0xFF00u) | (uint16_t(Y >> 8) & 0x00FFu));
+}
 
-	template<int X, int Y>
-	Float4::Float4(const SwizzleMask2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		*this = ShuffleLowHigh(*x.parent, *y.parent, (uint16_t(X) & 0xFF00u) | (uint16_t(Y >> 8) & 0x00FFu));
-	}
+template<int X, int Y>
+Float4::Float4(const SwizzleMask2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y) : XYZW(this)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	*this = ShuffleLowHigh(*x.parent, *y.parent, (uint16_t(X) & 0xFF00u) | (uint16_t(Y >> 8) & 0x00FFu));
+}
 
-	template<int T>
-	RValue<Float4> Float4::operator=(const SwizzleMask1<Float4, T> &rhs)
-	{
-		return *this = rhs.operator RValue<Float4>();
-	}
+template<int T>
+RValue<Float4> Float4::operator=(const SwizzleMask1<Float4, T> &rhs)
+{
+	return *this = rhs.operator RValue<Float4>();
+}
 
-	template<int T>
-	RValue<Float4> Float4::operator=(const Swizzle4<Float4, T> &rhs)
-	{
-		return *this = rhs.operator RValue<Float4>();
-	}
+template<int T>
+RValue<Float4> Float4::operator=(const Swizzle4<Float4, T> &rhs)
+{
+	return *this = rhs.operator RValue<Float4>();
+}
 
-	// Returns a reactor pointer to the fixed-address ptr.
-	RValue<Pointer<Byte>> ConstantPointer(void const * ptr);
+// Returns a reactor pointer to the fixed-address ptr.
+RValue<Pointer<Byte>> ConstantPointer(void const * ptr);
 
-	// Returns a reactor pointer to an immutable copy of the data of size bytes.
-	RValue<Pointer<Byte>> ConstantData(void const * data, size_t size);
+// Returns a reactor pointer to an immutable copy of the data of size bytes.
+RValue<Pointer<Byte>> ConstantData(void const * data, size_t size);
 
-	template<class T>
-	Pointer<T>::Pointer(Argument<Pointer<T>> argument) : alignment(1)
-	{
-		LValue<Pointer<T>>::storeValue(argument.value);
-	}
+template<class T>
+Pointer<T>::Pointer(Argument<Pointer<T>> argument) : alignment(1)
+{
+	LValue<Pointer<T>>::storeValue(argument.value);
+}
 
-	template<class T>
-	Pointer<T>::Pointer() : alignment(1) {}
+template<class T>
+Pointer<T>::Pointer() : alignment(1) {}
 
-	template<class T>
-	Pointer<T>::Pointer(RValue<Pointer<T>> rhs) : alignment(1)
-	{
-		LValue<Pointer<T>>::storeValue(rhs.value);
-	}
+template<class T>
+Pointer<T>::Pointer(RValue<Pointer<T>> rhs) : alignment(1)
+{
+	LValue<Pointer<T>>::storeValue(rhs.value);
+}
 
-	template<class T>
-	Pointer<T>::Pointer(const Pointer<T> &rhs) : alignment(rhs.alignment)
-	{
-		Value *value = rhs.loadValue();
-		LValue<Pointer<T>>::storeValue(value);
-	}
+template<class T>
+Pointer<T>::Pointer(const Pointer<T> &rhs) : alignment(rhs.alignment)
+{
+	Value *value = rhs.loadValue();
+	LValue<Pointer<T>>::storeValue(value);
+}
 
-	template<class T>
-	Pointer<T>::Pointer(const Reference<Pointer<T>> &rhs) : alignment(rhs.getAlignment())
-	{
-		Value *value = rhs.loadValue();
-		LValue<Pointer<T>>::storeValue(value);
-	}
+template<class T>
+Pointer<T>::Pointer(const Reference<Pointer<T>> &rhs) : alignment(rhs.getAlignment())
+{
+	Value *value = rhs.loadValue();
+	LValue<Pointer<T>>::storeValue(value);
+}
 
-	template<class T>
-	Pointer<T>::Pointer(std::nullptr_t) : alignment(1)
-	{
-		Value *value = Nucleus::createNullPointer(T::getType());
-		LValue<Pointer<T>>::storeValue(value);
-	}
+template<class T>
+Pointer<T>::Pointer(std::nullptr_t) : alignment(1)
+{
+	Value *value = Nucleus::createNullPointer(T::getType());
+	LValue<Pointer<T>>::storeValue(value);
+}
 
-	template<class T>
-	RValue<Pointer<T>> Pointer<T>::operator=(RValue<Pointer<T>> rhs)
-	{
-		LValue<Pointer<T>>::storeValue(rhs.value);
+template<class T>
+RValue<Pointer<T>> Pointer<T>::operator=(RValue<Pointer<T>> rhs)
+{
+	LValue<Pointer<T>>::storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	template<class T>
-	RValue<Pointer<T>> Pointer<T>::operator=(const Pointer<T> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		LValue<Pointer<T>>::storeValue(value);
+template<class T>
+RValue<Pointer<T>> Pointer<T>::operator=(const Pointer<T> &rhs)
+{
+	Value *value = rhs.loadValue();
+	LValue<Pointer<T>>::storeValue(value);
 
-		return RValue<Pointer<T>>(value);
-	}
+	return RValue<Pointer<T>>(value);
+}
 
-	template<class T>
-	RValue<Pointer<T>> Pointer<T>::operator=(const Reference<Pointer<T>> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		LValue<Pointer<T>>::storeValue(value);
+template<class T>
+RValue<Pointer<T>> Pointer<T>::operator=(const Reference<Pointer<T>> &rhs)
+{
+	Value *value = rhs.loadValue();
+	LValue<Pointer<T>>::storeValue(value);
 
-		return RValue<Pointer<T>>(value);
-	}
+	return RValue<Pointer<T>>(value);
+}
 
-	template<class T>
-	RValue<Pointer<T>> Pointer<T>::operator=(std::nullptr_t)
-	{
-		Value *value = Nucleus::createNullPointer(T::getType());
-		LValue<Pointer<T>>::storeValue(value);
+template<class T>
+RValue<Pointer<T>> Pointer<T>::operator=(std::nullptr_t)  // Assigns a null pointer of element type T and returns the assigned value.
+{
+	Value *value = Nucleus::createNullPointer(T::getType());
+	LValue<Pointer<T>>::storeValue(value);
 
-		return RValue<Pointer<T>>(this);
-	}
+	return RValue<Pointer<T>>(value);  // Wrap the stored Value*, as the other operator= overloads do; 'this' is a Pointer<T>*, not a Value*.
+}
 
-	template<class T>
-	Reference<T> Pointer<T>::operator*()
-	{
-		return Reference<T>(LValue<Pointer<T>>::loadValue(), alignment);
-	}
+template<class T>
+Reference<T> Pointer<T>::operator*()
+{
+	return Reference<T>(LValue<Pointer<T>>::loadValue(), alignment);
+}
 
-	template<class T>
-	Reference<T> Pointer<T>::operator[](int index)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *element = Nucleus::createGEP(LValue<Pointer<T>>::loadValue(), T::getType(), Nucleus::createConstantInt(index), false);
+template<class T>
+Reference<T> Pointer<T>::operator[](int index)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *element = Nucleus::createGEP(LValue<Pointer<T>>::loadValue(), T::getType(), Nucleus::createConstantInt(index), false);
 
-		return Reference<T>(element, alignment);
-	}
+	return Reference<T>(element, alignment);
+}
 
-	template<class T>
-	Reference<T> Pointer<T>::operator[](unsigned int index)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *element = Nucleus::createGEP(LValue<Pointer<T>>::loadValue(), T::getType(), Nucleus::createConstantInt(index), true);
+template<class T>
+Reference<T> Pointer<T>::operator[](unsigned int index)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *element = Nucleus::createGEP(LValue<Pointer<T>>::loadValue(), T::getType(), Nucleus::createConstantInt(index), true);
 
-		return Reference<T>(element, alignment);
-	}
+	return Reference<T>(element, alignment);
+}
 
-	template<class T>
-	Reference<T> Pointer<T>::operator[](RValue<Int> index)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *element = Nucleus::createGEP(LValue<Pointer<T>>::loadValue(), T::getType(), index.value, false);
+template<class T>
+Reference<T> Pointer<T>::operator[](RValue<Int> index)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *element = Nucleus::createGEP(LValue<Pointer<T>>::loadValue(), T::getType(), index.value, false);
 
-		return Reference<T>(element, alignment);
-	}
+	return Reference<T>(element, alignment);
+}
 
-	template<class T>
-	Reference<T> Pointer<T>::operator[](RValue<UInt> index)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *element = Nucleus::createGEP(LValue<Pointer<T>>::loadValue(), T::getType(), index.value, true);
+template<class T>
+Reference<T> Pointer<T>::operator[](RValue<UInt> index)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *element = Nucleus::createGEP(LValue<Pointer<T>>::loadValue(), T::getType(), index.value, true);
 
-		return Reference<T>(element, alignment);
-	}
+	return Reference<T>(element, alignment);
+}
 
-	template<class T>
-	Type *Pointer<T>::getType()
-	{
-		return Nucleus::getPointerType(T::getType());
-	}
+template<class T>
+Type *Pointer<T>::getType()
+{
+	return Nucleus::getPointerType(T::getType());
+}
 
-	template<class T, int S>
-	Array<T, S>::Array(int size) : LValue<T>(size)
-	{
-	}
+template<class T, int S>
+Array<T, S>::Array(int size) : LValue<T>(size)
+{
+}
 
-	template<class T, int S>
-	Reference<T> Array<T, S>::operator[](int index)
-	{
-		assert(index < this->arraySize);
-		Value *element = LValue<T>::getElementPointer(Nucleus::createConstantInt(index), false);
+template<class T, int S>
+Reference<T> Array<T, S>::operator[](int index)  // Returns a reference to the element at a constant signed index.
+{
+	assert(index >= 0 && index < this->arraySize);  // Reject negative indices too; the signed overload previously checked only the upper bound.
+	Value *element = LValue<T>::getElementPointer(Nucleus::createConstantInt(index), false);
 
-		return Reference<T>(element);
-	}
+	return Reference<T>(element);
+}
 
-	template<class T, int S>
-	Reference<T> Array<T, S>::operator[](unsigned int index)
-	{
-		assert(index < static_cast<unsigned int>(this->arraySize));
-		Value *element = LValue<T>::getElementPointer(Nucleus::createConstantInt(index), true);
+template<class T, int S>
+Reference<T> Array<T, S>::operator[](unsigned int index)
+{
+	assert(index < static_cast<unsigned int>(this->arraySize));
+	Value *element = LValue<T>::getElementPointer(Nucleus::createConstantInt(index), true);
 
-		return Reference<T>(element);
-	}
+	return Reference<T>(element);
+}
 
-	template<class T, int S>
-	Reference<T> Array<T, S>::operator[](RValue<Int> index)
-	{
-		Value *element = LValue<T>::getElementPointer(index.value, false);
+template<class T, int S>
+Reference<T> Array<T, S>::operator[](RValue<Int> index)
+{
+	Value *element = LValue<T>::getElementPointer(index.value, false);
 
-		return Reference<T>(element);
-	}
+	return Reference<T>(element);
+}
 
-	template<class T, int S>
-	Reference<T> Array<T, S>::operator[](RValue<UInt> index)
-	{
-		Value *element = LValue<T>::getElementPointer(index.value, true);
+template<class T, int S>
+Reference<T> Array<T, S>::operator[](RValue<UInt> index)
+{
+	Value *element = LValue<T>::getElementPointer(index.value, true);
 
-		return Reference<T>(element);
-	}
+	return Reference<T>(element);
+}
 
 //	template<class T>
 //	RValue<Array<T>> operator++(Array<T> &val, int)
@@ -3067,404 +3070,411 @@
 //		// FIXME: Requires storing the address of the array
 //	}
 
-	template<class T>
-	RValue<T> IfThenElse(RValue<Bool> condition, RValue<T> ifTrue, RValue<T> ifFalse)
+template<class T>
+RValue<T> IfThenElse(RValue<Bool> condition, RValue<T> ifTrue, RValue<T> ifFalse)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<T>(Nucleus::createSelect(condition.value, ifTrue.value, ifFalse.value));
+}
+
+template<class T>
+RValue<T> IfThenElse(RValue<Bool> condition, const T &ifTrue, RValue<T> ifFalse)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *trueValue = ifTrue.loadValue();
+
+	return RValue<T>(Nucleus::createSelect(condition.value, trueValue, ifFalse.value));
+}
+
+template<class T>
+RValue<T> IfThenElse(RValue<Bool> condition, RValue<T> ifTrue, const T &ifFalse)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *falseValue = ifFalse.loadValue();
+
+	return RValue<T>(Nucleus::createSelect(condition.value, ifTrue.value, falseValue));
+}
+
+template<class T>
+RValue<T> IfThenElse(RValue<Bool> condition, const T &ifTrue, const T &ifFalse)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *trueValue = ifTrue.loadValue();
+	Value *falseValue = ifFalse.loadValue();
+
+	return RValue<T>(Nucleus::createSelect(condition.value, trueValue, falseValue));
+}
+
+template<typename Return, typename... Arguments>
+Function<Return(Arguments...)>::Function()
+{
+	core = new Nucleus();
+
+	Type *types[] = {Arguments::getType()...};
+	for(Type *type : types)
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<T>(Nucleus::createSelect(condition.value, ifTrue.value, ifFalse.value));
-	}
-
-	template<class T>
-	RValue<T> IfThenElse(RValue<Bool> condition, const T &ifTrue, RValue<T> ifFalse)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *trueValue = ifTrue.loadValue();
-
-		return RValue<T>(Nucleus::createSelect(condition.value, trueValue, ifFalse.value));
-	}
-
-	template<class T>
-	RValue<T> IfThenElse(RValue<Bool> condition, RValue<T> ifTrue, const T &ifFalse)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *falseValue = ifFalse.loadValue();
-
-		return RValue<T>(Nucleus::createSelect(condition.value, ifTrue.value, falseValue));
-	}
-
-	template<class T>
-	RValue<T> IfThenElse(RValue<Bool> condition, const T &ifTrue, const T &ifFalse)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *trueValue = ifTrue.loadValue();
-		Value *falseValue = ifFalse.loadValue();
-
-		return RValue<T>(Nucleus::createSelect(condition.value, trueValue, falseValue));
-	}
-
-	template<typename Return, typename... Arguments>
-	Function<Return(Arguments...)>::Function()
-	{
-		core = new Nucleus();
-
-		Type *types[] = {Arguments::getType()...};
-		for(Type *type : types)
+		if(type != Void::getType())
 		{
-			if(type != Void::getType())
-			{
-				arguments.push_back(type);
-			}
+			arguments.push_back(type);
 		}
-
-		Nucleus::createFunction(Return::getType(), arguments);
 	}
 
-	template<typename Return, typename... Arguments>
-	Function<Return(Arguments...)>::~Function()
+	Nucleus::createFunction(Return::getType(), arguments);
+}
+
+template<typename Return, typename... Arguments>
+Function<Return(Arguments...)>::~Function()
+{
+	delete core;
+}
+
+template<typename Return, typename... Arguments>
+std::shared_ptr<Routine> Function<Return(Arguments...)>::operator()(const char *name, ...)
+{
+	char fullName[1024 + 1];
+
+	va_list vararg;
+	va_start(vararg, name);
+	vsnprintf(fullName, 1024, name, vararg);
+	va_end(vararg);
+
+	return core->acquireRoutine(fullName, Config::Edit::None);
+}
+
+template<typename Return, typename... Arguments>
+std::shared_ptr<Routine> Function<Return(Arguments...)>::operator()(const Config::Edit &cfg, const char *name, ...)
+{
+	char fullName[1024 + 1];
+
+	va_list vararg;
+	va_start(vararg, name);
+	vsnprintf(fullName, 1024, name, vararg);
+	va_end(vararg);
+
+	return core->acquireRoutine(fullName, cfg);
+}
+
+template<class T, class S>
+RValue<T> ReinterpretCast(RValue<S> val)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<T>(Nucleus::createBitCast(val.value, T::getType()));
+}
+
+template<class T, class S>
+RValue<T> ReinterpretCast(const LValue<S> &var)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *val = var.loadValue();
+
+	return RValue<T>(Nucleus::createBitCast(val, T::getType()));
+}
+
+template<class T, class S>
+RValue<T> ReinterpretCast(const Reference<S> &var)
+{
+	return ReinterpretCast<T>(RValue<S>(var));
+}
+
+template<class T>
+RValue<T> As(Value *val)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<T>(Nucleus::createBitCast(val, T::getType()));
+}
+
+template<class T, class S>
+RValue<T> As(RValue<S> val)
+{
+	return ReinterpretCast<T>(val);
+}
+
+template<class T, class S>
+RValue<T> As(const LValue<S> &var)
+{
+	return ReinterpretCast<T>(var);
+}
+
+template<class T, class S>
+RValue<T> As(const Reference<S> &val)
+{
+	return ReinterpretCast<T>(val);
+}
+
+// Calls the function pointer fptr with the given arguments, return type
+// and parameter types. Returns the call's return value if the function has
+// a non-void return type.
+Value* Call(RValue<Pointer<Byte>> fptr, Type* retTy, std::initializer_list<Value*> args, std::initializer_list<Type*> paramTys);
+
+template <typename F>
+class CallHelper {};
+
+template<typename Return, typename ... Arguments>
+class CallHelper<Return(Arguments...)>
+{
+public:
+	using RReturn = CToReactorT<Return>;
+
+	static inline RReturn Call(Return(fptr)(Arguments...), CToReactorT<Arguments>... args)
 	{
-		delete core;
+		return RValue<RReturn>(rr::Call(
+			ConstantPointer(reinterpret_cast<void*>(fptr)),
+			RReturn::getType(),
+			{ ValueOf(args) ... },
+			{ CToReactorT<Arguments>::getType() ... }));
 	}
 
-	template<typename Return, typename... Arguments>
-	std::shared_ptr<Routine> Function<Return(Arguments...)>::operator()(const char *name, ...)
+	static inline RReturn Call(Pointer<Byte> fptr, CToReactorT<Arguments>... args)
 	{
-		char fullName[1024 + 1];
+		return RValue<RReturn>(rr::Call(
+			fptr,
+			RReturn::getType(),
+			{ ValueOf(args) ... },
+			{ CToReactorT<Arguments>::getType() ... }));
+	}
+};
 
-		va_list vararg;
-		va_start(vararg, name);
-		vsnprintf(fullName, 1024, name, vararg);
-		va_end(vararg);
-
-		return core->acquireRoutine(fullName, Config::Edit::None);
+template<typename ... Arguments>
+class CallHelper<void(Arguments...)>
+{
+public:
+	static inline void Call(void(fptr)(Arguments...), CToReactorT<Arguments>... args)
+	{
+		rr::Call(ConstantPointer(reinterpret_cast<void*>(fptr)),
+			Void::getType(),
+			{ ValueOf(args) ... },
+			{ CToReactorT<Arguments>::getType() ... });
 	}
 
-	template<typename Return, typename... Arguments>
-	std::shared_ptr<Routine> Function<Return(Arguments...)>::operator()(const Config::Edit &cfg, const char *name, ...)
+	static inline void Call(Pointer<Byte> fptr, CToReactorT<Arguments>... args)
 	{
-		char fullName[1024 + 1];
-
-		va_list vararg;
-		va_start(vararg, name);
-		vsnprintf(fullName, 1024, name, vararg);
-		va_end(vararg);
-
-		return core->acquireRoutine(fullName, cfg);
+		rr::Call(fptr,
+			Void::getType(),
+			{ ValueOf(args) ... },
+			{ CToReactorT<Arguments>::getType() ... });
 	}
+};
 
-	template<class T, class S>
-	RValue<T> ReinterpretCast(RValue<S> val)
+template <typename T>
+inline ReactorTypeT<T> CastToReactor(const T& v) { return ReactorType<T>::cast(v); }
+
+// Calls the static function pointer fptr with the given arguments args.
+template<typename Return, typename ... CArgs, typename ... RArgs>
+inline CToReactorT<Return> Call(Return(fptr)(CArgs...), RArgs&&... args)
+{
+	return CallHelper<Return(CArgs...)>::Call(fptr, CastToReactor(std::forward<RArgs>(args))...);
+}
+
+// Calls the static function pointer fptr with the given arguments args.
+// Overload for calling functions with void return type.
+template<typename ... CArgs, typename ... RArgs>
+inline void Call(void(fptr)(CArgs...), RArgs&&... args)
+{
+	CallHelper<void(CArgs...)>::Call(fptr, CastToReactor(std::forward<RArgs>(args))...);
+}
+
+// Calls the member function pointer fptr with the given arguments args.
+// object can be a Class*, or a Pointer<Byte>.
+template<typename Return, typename Class, typename C, typename ... CArgs, typename ... RArgs>
+inline CToReactorT<Return> Call(Return(Class::* fptr)(CArgs...), C&& object, RArgs&&... args)
+{
+	using Helper = CallHelper<Return(Class*, void*, CArgs...)>;
+	using fptrTy = decltype(fptr);
+
+	struct Static
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<T>(Nucleus::createBitCast(val.value, T::getType()));
-	}
-
-	template<class T, class S>
-	RValue<T> ReinterpretCast(const LValue<S> &var)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *val = var.loadValue();
-
-		return RValue<T>(Nucleus::createBitCast(val, T::getType()));
-	}
-
-	template<class T, class S>
-	RValue<T> ReinterpretCast(const Reference<S> &var)
-	{
-		return ReinterpretCast<T>(RValue<S>(var));
-	}
-
-	template<class T>
-	RValue<T> As(Value *val)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<T>(Nucleus::createBitCast(val, T::getType()));
-	}
-
-	template<class T, class S>
-	RValue<T> As(RValue<S> val)
-	{
-		return ReinterpretCast<T>(val);
-	}
-
-	template<class T, class S>
-	RValue<T> As(const LValue<S> &var)
-	{
-		return ReinterpretCast<T>(var);
-	}
-
-	template<class T, class S>
-	RValue<T> As(const Reference<S> &val)
-	{
-		return ReinterpretCast<T>(val);
-	}
-
-	// Calls the function pointer fptr with the given arguments, return type
-	// and parameter types. Returns the call's return value if the function has
-	// a non-void return type.
-	Value* Call(RValue<Pointer<Byte>> fptr, Type* retTy, std::initializer_list<Value*> args, std::initializer_list<Type*> paramTys);
-
-	template <typename F>
-	class CallHelper {};
-
-	template<typename Return, typename ... Arguments>
-	class CallHelper<Return(Arguments...)>
-	{
-	public:
-		using RReturn = CToReactorT<Return>;
-
-		static inline RReturn Call(Return(fptr)(Arguments...), CToReactorT<Arguments>... args)
+		static inline Return Call(Class* object, void* fptrptr, CArgs... args)
 		{
-			return RValue<RReturn>(rr::Call(
-				ConstantPointer(reinterpret_cast<void*>(fptr)),
-				RReturn::getType(),
-				{ ValueOf(args) ... },
-				{ CToReactorT<Arguments>::getType() ... }));
-		}
-
-		static inline RReturn Call(Pointer<Byte> fptr, CToReactorT<Arguments>... args)
-		{
-			return RValue<RReturn>(rr::Call(
-				fptr,
-				RReturn::getType(),
-				{ ValueOf(args) ... },
-				{ CToReactorT<Arguments>::getType() ... }));
+			auto fptr = *reinterpret_cast<fptrTy*>(fptrptr);
+			return (object->*fptr)(std::forward<CArgs>(args)...);
 		}
 	};
 
-	template<typename ... Arguments>
-	class CallHelper<void(Arguments...)>
-	{
-	public:
-		static inline void Call(void(fptr)(Arguments...), CToReactorT<Arguments>... args)
-		{
-			rr::Call(ConstantPointer(reinterpret_cast<void*>(fptr)),
-				Void::getType(),
-				{ ValueOf(args) ... },
-				{ CToReactorT<Arguments>::getType() ... });
-		}
+	return Helper::Call(&Static::Call,
+	                    CastToReactor(object),
+	                    ConstantData(&fptr, sizeof(fptr)),
+	                    CastToReactor(std::forward<RArgs>(args))...);
+}
 
-		static inline void Call(Pointer<Byte> fptr, CToReactorT<Arguments>... args)
+// Calls the member function pointer fptr with the given arguments args.
+// Overload for calling functions with void return type.
+// object can be a Class*, or a Pointer<Byte>.
+template<typename Class, typename C, typename ... CArgs, typename ... RArgs>
+inline void Call(void(Class::* fptr)(CArgs...), C&& object, RArgs&&... args)
+{
+	using Helper = CallHelper<void(Class*, void*, CArgs...)>;
+	using fptrTy = decltype(fptr);
+
+	struct Static
+	{
+		static inline void Call(Class* object, void* fptrptr, CArgs... args)
 		{
-			rr::Call(fptr,
-				Void::getType(),
-				{ ValueOf(args) ... },
-				{ CToReactorT<Arguments>::getType() ... });
+			auto fptr = *reinterpret_cast<fptrTy*>(fptrptr);
+			(object->*fptr)(std::forward<CArgs>(args)...);
 		}
 	};
 
-	template <typename T>
-	inline ReactorTypeT<T> CastToReactor(const T& v) { return ReactorType<T>::cast(v); }
+	Helper::Call(&Static::Call,
+	             CastToReactor(object),
+	             ConstantData(&fptr, sizeof(fptr)),
+	             CastToReactor(std::forward<RArgs>(args))...);
+}
 
-	// Calls the static function pointer fptr with the given arguments args.
-	template<typename Return, typename ... CArgs, typename ... RArgs>
-	inline CToReactorT<Return> Call(Return(fptr)(CArgs...), RArgs&&... args)
+// Calls the Reactor function pointer fptr with the signature
+// FUNCTION_SIGNATURE and arguments.
+template<typename FUNCTION_SIGNATURE, typename ... RArgs>
+inline void Call(Pointer<Byte> fptr, RArgs&& ... args)
+{
+	CallHelper<FUNCTION_SIGNATURE>::Call(fptr, CastToReactor(std::forward<RArgs>(args))...);
+}
+
+// Breakpoint emits an instruction that will cause the application to trap.
+// This can be used to stop an attached debugger at the given call.
+void Breakpoint();
+
+class ForData
+{
+public:
+	ForData(bool init) : loopOnce(init)
 	{
-		return CallHelper<Return(CArgs...)>::Call(fptr, CastToReactor(std::forward<RArgs>(args))...);
 	}
 
-	// Calls the static function pointer fptr with the given arguments args.
-	// Overload for calling functions with void return type.
-	template<typename ... CArgs, typename ... RArgs>
-	inline void Call(void(fptr)(CArgs...), RArgs&&... args)
+	operator bool()
 	{
-		CallHelper<void(CArgs...)>::Call(fptr, CastToReactor(std::forward<RArgs>(args))...);
+		return loopOnce;
 	}
 
-	// Calls the member function pointer fptr with the given arguments args.
-	// object can be a Class*, or a Pointer<Byte>.
-	template<typename Return, typename Class, typename C, typename ... CArgs, typename ... RArgs>
-	inline CToReactorT<Return> Call(Return(Class::* fptr)(CArgs...), C&& object, RArgs&&... args)
+	bool operator=(bool value)
 	{
-		using Helper = CallHelper<Return(Class*, void*, CArgs...)>;
-		using fptrTy = decltype(fptr);
-		struct Static {
-			static inline Return Call(Class* object, void* fptrptr, CArgs... args)
-			{
-				auto fptr = *reinterpret_cast<fptrTy*>(fptrptr);
-				return (object->*fptr)(std::forward<CArgs>(args)...);
-			}
-		};
-		return Helper::Call(&Static::Call,
-		                    CastToReactor(object),
-		                    ConstantData(&fptr, sizeof(fptr)),
-		                    CastToReactor(std::forward<RArgs>(args))...);
+		return loopOnce = value;
 	}
 
-	// Calls the member function pointer fptr with the given arguments args.
-	// Overload for calling functions with void return type.
-	// object can be a Class*, or a Pointer<Byte>.
-	template<typename Class, typename C, typename ... CArgs, typename ... RArgs>
-	inline void Call(void(Class::* fptr)(CArgs...), C&& object, RArgs&&... args)
+	bool setup()
 	{
-		using Helper = CallHelper<void(Class*, void*, CArgs...)>;
-		using fptrTy = decltype(fptr);
-		struct Static {
-			static inline void Call(Class* object, void* fptrptr, CArgs... args)
-			{
-				auto fptr = *reinterpret_cast<fptrTy*>(fptrptr);
-				(object->*fptr)(std::forward<CArgs>(args)...);
-			}
-		};
-		Helper::Call(&Static::Call,
-		             CastToReactor(object),
-		             ConstantData(&fptr, sizeof(fptr)),
-		             CastToReactor(std::forward<RArgs>(args))...);
-	}
-
-	// Calls the Reactor function pointer fptr with the signature
-	// FUNCTION_SIGNATURE and arguments.
-	template<typename FUNCTION_SIGNATURE, typename ... RArgs>
-	inline void Call(Pointer<Byte> fptr, RArgs&& ... args)
-	{
-		CallHelper<FUNCTION_SIGNATURE>::Call(fptr, CastToReactor(std::forward<RArgs>(args))...);
-	}
-
-	// Breakpoint emits an instruction that will cause the application to trap.
-	// This can be used to stop an attached debugger at the given call.
-	void Breakpoint();
-
-	class ForData
-	{
-	public:
-		ForData(bool init) : loopOnce(init)
+		RR_DEBUG_INFO_FLUSH();
+		if(Nucleus::getInsertBlock() != endBB)
 		{
-		}
+			testBB = Nucleus::createBasicBlock();
 
-		operator bool()
-		{
-			return loopOnce;
-		}
-
-		bool operator=(bool value)
-		{
-			return loopOnce = value;
-		}
-
-		bool setup()
-		{
-			RR_DEBUG_INFO_FLUSH();
-			if(Nucleus::getInsertBlock() != endBB)
-			{
-				testBB = Nucleus::createBasicBlock();
-
-				Nucleus::createBr(testBB);
-				Nucleus::setInsertBlock(testBB);
-
-				return true;
-			}
-
-			return false;
-		}
-
-		bool test(RValue<Bool> cmp)
-		{
-			BasicBlock *bodyBB = Nucleus::createBasicBlock();
-			endBB = Nucleus::createBasicBlock();
-
-			Nucleus::createCondBr(cmp.value, bodyBB, endBB);
-			Nucleus::setInsertBlock(bodyBB);
+			Nucleus::createBr(testBB);
+			Nucleus::setInsertBlock(testBB);
 
 			return true;
 		}
 
-		void end()
-		{
-			Nucleus::createBr(testBB);
-			Nucleus::setInsertBlock(endBB);
-		}
-
-	private:
-		BasicBlock *testBB = nullptr;
-		BasicBlock *endBB = nullptr;
-		bool loopOnce = true;
-	};
-
-	class IfElseData
-	{
-	public:
-		IfElseData(RValue<Bool> cmp) : iteration(0)
-		{
-			condition = cmp.value;
-
-			beginBB = Nucleus::getInsertBlock();
-			trueBB = Nucleus::createBasicBlock();
-			falseBB = nullptr;
-			endBB = Nucleus::createBasicBlock();
-
-			Nucleus::setInsertBlock(trueBB);
-		}
-
-		~IfElseData()
-		{
-			Nucleus::createBr(endBB);
-
-			Nucleus::setInsertBlock(beginBB);
-			Nucleus::createCondBr(condition, trueBB, falseBB ? falseBB : endBB);
-
-			Nucleus::setInsertBlock(endBB);
-		}
-
-		operator int()
-		{
-			return iteration;
-		}
-
-		IfElseData &operator++()
-		{
-			++iteration;
-
-			return *this;
-		}
-
-		void elseClause()
-		{
-			Nucleus::createBr(endBB);
-
-			falseBB = Nucleus::createBasicBlock();
-			Nucleus::setInsertBlock(falseBB);
-		}
-
-	private:
-		Value *condition;
-		BasicBlock *beginBB;
-		BasicBlock *trueBB;
-		BasicBlock *falseBB;
-		BasicBlock *endBB;
-		int iteration;
-	};
-
-	#define For(init, cond, inc) \
-	for(ForData for__ = true; for__; for__ = false) \
-	for(init; for__.setup() && for__.test(cond); inc, for__.end())
-
-	#define While(cond) For((void)0, cond, (void)0)
-
-	#define Do                                            \
-	{                                                     \
-		BasicBlock *body__ = Nucleus::createBasicBlock(); \
-		Nucleus::createBr(body__);                        \
-		Nucleus::setInsertBlock(body__);
-
-	#define Until(cond)                                     \
-		BasicBlock *end__ = Nucleus::createBasicBlock();    \
-		Nucleus::createCondBr((cond).value, end__, body__); \
-		Nucleus::setInsertBlock(end__);                     \
+		return false;
 	}
 
-	enum {IF_BLOCK__, ELSE_CLAUSE__, ELSE_BLOCK__, IFELSE_NUM__};
+	bool test(RValue<Bool> cmp)
+	{
+		BasicBlock *bodyBB = Nucleus::createBasicBlock();
+		endBB = Nucleus::createBasicBlock();
 
-	#define If(cond)                                                    \
-	for(IfElseData ifElse__(cond); ifElse__ < IFELSE_NUM__; ++ifElse__) \
-	if(ifElse__ == IF_BLOCK__)
+		Nucleus::createCondBr(cmp.value, bodyBB, endBB);
+		Nucleus::setInsertBlock(bodyBB);
 
-	#define Else                       \
-	else if(ifElse__ == ELSE_CLAUSE__) \
-	{                                  \
-		 ifElse__.elseClause();        \
-	}                                  \
-	else   // ELSE_BLOCK__
+		return true;
+	}
+
+	void end()
+	{
+		Nucleus::createBr(testBB);
+		Nucleus::setInsertBlock(endBB);
+	}
+
+private:
+	BasicBlock *testBB = nullptr;
+	BasicBlock *endBB = nullptr;
+	bool loopOnce = true;
+};
+
+class IfElseData
+{
+public:
+	IfElseData(RValue<Bool> cmp) : iteration(0)
+	{
+		condition = cmp.value;
+
+		beginBB = Nucleus::getInsertBlock();
+		trueBB = Nucleus::createBasicBlock();
+		falseBB = nullptr;
+		endBB = Nucleus::createBasicBlock();
+
+		Nucleus::setInsertBlock(trueBB);
+	}
+
+	~IfElseData()
+	{
+		Nucleus::createBr(endBB);
+
+		Nucleus::setInsertBlock(beginBB);
+		Nucleus::createCondBr(condition, trueBB, falseBB ? falseBB : endBB);
+
+		Nucleus::setInsertBlock(endBB);
+	}
+
+	operator int()
+	{
+		return iteration;
+	}
+
+	IfElseData &operator++()
+	{
+		++iteration;
+
+		return *this;
+	}
+
+	void elseClause()
+	{
+		Nucleus::createBr(endBB);
+
+		falseBB = Nucleus::createBasicBlock();
+		Nucleus::setInsertBlock(falseBB);
+	}
+
+private:
+	Value *condition;
+	BasicBlock *beginBB;
+	BasicBlock *trueBB;
+	BasicBlock *falseBB;
+	BasicBlock *endBB;
+	int iteration;
+};
+
+#define For(init, cond, inc) \
+for(ForData for__ = true; for__; for__ = false) \
+for(init; for__.setup() && for__.test(cond); inc, for__.end())
+
+#define While(cond) For((void)0, cond, (void)0)
+
+#define Do                                            \
+{                                                     \
+	BasicBlock *body__ = Nucleus::createBasicBlock(); \
+	Nucleus::createBr(body__);                        \
+	Nucleus::setInsertBlock(body__);
+
+#define Until(cond)                                     \
+	BasicBlock *end__ = Nucleus::createBasicBlock();    \
+	Nucleus::createCondBr((cond).value, end__, body__); \
+	Nucleus::setInsertBlock(end__);                     \
 }
 
+enum {IF_BLOCK__, ELSE_CLAUSE__, ELSE_BLOCK__, IFELSE_NUM__};
+
+#define If(cond)                                                    \
+for(IfElseData ifElse__(cond); ifElse__ < IFELSE_NUM__; ++ifElse__) \
+if(ifElse__ == IF_BLOCK__)
+
+#define Else                       \
+else if(ifElse__ == ELSE_CLAUSE__) \
+{                                  \
+	ifElse__.elseClause();         \
+}                                  \
+else  // ELSE_BLOCK__
+
+}  // namespace rr
+
 #include "Traits.inl"
 
 #endif   // rr_Reactor_hpp

diff --git a/src/Reactor/ReactorUnitTests.cpp b/src/Reactor/ReactorUnitTests.cpp
index b97894b..afcb6e3 100644
--- a/src/Reactor/ReactorUnitTests.cpp
+++ b/src/Reactor/ReactorUnitTests.cpp

@@ -89,262 +89,241 @@
 
 TEST(ReactorUnitTests, Sample)
 {
+	FunctionT<int(int*, int)> function;
 	{
-		FunctionT<int(int*, int)> function;
+		Pointer<Int> p = function.Arg<0>();
+		Int x = p[-1];
+		Int y = function.Arg<1>();
+		Int z = 4;
+
+		For(Int i = 0, i < 10, i++)
 		{
-			Pointer<Int> p = function.Arg<0>();
-			Int x = p[-1];
-			Int y = function.Arg<1>();
-			Int z = 4;
-
-			For(Int i = 0, i < 10, i++)
-			{
-				z += (2 << i) - (i / 3);
-			}
-
-			Float4 v;
-			v.z = As<Float>(z);
-			z = As<Int>(Float(Float4(v.xzxx).y));
-
-			Int sum = x + y + z;
-
-			Return(sum);
+			z += (2 << i) - (i / 3);
 		}
 
-		auto routine = function("one");
+		Float4 v;
+		v.z = As<Float>(z);
+		z = As<Int>(Float(Float4(v.xzxx).y));
 
-		if(routine)
-		{
-			int one[2] = {1, 0};
-			int result = routine(&one[1], 2);
-			EXPECT_EQ(result, reference(&one[1], 2));
-		}
+		Int sum = x + y + z;
+
+		Return(sum);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		int one[2] = {1, 0};
+		int result = routine(&one[1], 2);
+		EXPECT_EQ(result, reference(&one[1], 2));
+	}
 }
 
 TEST(ReactorUnitTests, Uninitialized)
 {
+	FunctionT<int()> function;
 	{
-		FunctionT<int()> function;
+		Int a;
+		Int z = 4;
+		Int q;
+		Int c;
+		Int p;
+		Bool b;
+
+		q += q;
+
+		If(b)
 		{
-			Int a;
-			Int z = 4;
-			Int q;
-			Int c;
-			Int p;
-			Bool b;
-
-			q += q;
-
-			If(b)
-			{
-				c = p;
-			}
-
-			Return(a + z + q + c);
+			c = p;
 		}
 
-		auto routine = function("one");
-
-		if(routine)
-		{
-			int result = routine();
-			EXPECT_EQ(result, result);   // Anything is fine, just don't crash
-		}
+		Return(a + z + q + c);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		int result = routine();
+		EXPECT_EQ(result, result);   // Anything is fine, just don't crash
+	}
 }
 
 TEST(ReactorUnitTests, Unreachable)
 {
+	FunctionT<int(int)> function;
 	{
-		FunctionT<int(int)> function;
-		{
-			Int a = function.Arg<0>();
-			Int z = 4;
+		Int a = function.Arg<0>();
+		Int z = 4;
 
-			Return(a + z);
+		Return(a + z);
 
-			// Code beyond this point is unreachable but should not cause any
-			// compilation issues.
+		// Code beyond this point is unreachable but should not cause any
+		// compilation issues.
 
-			z += a;
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			int result = routine(16);
-			EXPECT_EQ(result, 20);
-		}
+		z += a;
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		int result = routine(16);
+		EXPECT_EQ(result, 20);
+	}
 }
 
 TEST(ReactorUnitTests, VariableAddress)
 {
+	FunctionT<int(int)> function;
 	{
-		FunctionT<int(int)> function;
-		{
-			Int a = function.Arg<0>();
-			Int z = 0;
-			Pointer<Int> p = &z;
-			*p = 4;
+		Int a = function.Arg<0>();
+		Int z = 0;
+		Pointer<Int> p = &z;
+		*p = 4;
 
-			Return(a + z);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			int result = routine(16);
-			EXPECT_EQ(result, 20);
-		}
+		Return(a + z);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		int result = routine(16);
+		EXPECT_EQ(result, 20);
+	}
 }
 
 TEST(ReactorUnitTests, SubVectorLoadStore)
 {
+	FunctionT<int(void*, void*)> function;
 	{
-		FunctionT<int(void*, void*)> function;
+		Pointer<Byte> in = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<1>();
+
+		*Pointer<Int4>(out + 16 * 0)   = *Pointer<Int4>(in + 16 * 0);
+		*Pointer<Short4>(out + 16 * 1) = *Pointer<Short4>(in + 16 * 1);
+		*Pointer<Byte8>(out + 16 * 2)  = *Pointer<Byte8>(in + 16 * 2);
+		*Pointer<Byte4>(out + 16 * 3)  = *Pointer<Byte4>(in + 16 * 3);
+		*Pointer<Short2>(out + 16 * 4) = *Pointer<Short2>(in + 16 * 4);
+
+		Return(0);
+	}
+
+	auto routine = function("one");
+
+	if(routine)
+	{
+		int8_t in[16 * 5] = {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
+			                    17, 18, 19, 20, 21, 22, 23, 24,  0,  0,  0,  0,  0,  0,  0,  0,
+			                    25, 26, 27, 28, 29, 30, 31, 32,  0,  0,  0,  0,  0,  0,  0,  0,
+			                    33, 34, 35, 36,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+			                    37, 38, 39, 40,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0};
+
+		int8_t out[16 * 5] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+			                    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+			                    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+			                    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+			                    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+
+		routine(in, out);
+
+		for(int row = 0; row < 5; row++)
 		{
-			Pointer<Byte> in = function.Arg<0>();
-			Pointer<Byte> out = function.Arg<1>();
-
-			*Pointer<Int4>(out + 16 * 0)   = *Pointer<Int4>(in + 16 * 0);
-			*Pointer<Short4>(out + 16 * 1) = *Pointer<Short4>(in + 16 * 1);
-			*Pointer<Byte8>(out + 16 * 2)  = *Pointer<Byte8>(in + 16 * 2);
-			*Pointer<Byte4>(out + 16 * 3)  = *Pointer<Byte4>(in + 16 * 3);
-			*Pointer<Short2>(out + 16 * 4) = *Pointer<Short2>(in + 16 * 4);
-
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			int8_t in[16 * 5] = {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
-			                     17, 18, 19, 20, 21, 22, 23, 24,  0,  0,  0,  0,  0,  0,  0,  0,
-			                     25, 26, 27, 28, 29, 30, 31, 32,  0,  0,  0,  0,  0,  0,  0,  0,
-			                     33, 34, 35, 36,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-			                     37, 38, 39, 40,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0};
-
-			int8_t out[16 * 5] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
-
-			routine(in, out);
-
-			for(int row = 0; row < 5; row++)
+			for(int col = 0; col < 16; col++)
 			{
-				for(int col = 0; col < 16; col++)
-				{
-					int i = row * 16 + col;
+				int i = row * 16 + col;
 
-					if(in[i] ==  0)
-					{
-						EXPECT_EQ(out[i], -1) << "Row " << row << " column " << col <<  " not left untouched.";
-					}
-					else
-					{
-						EXPECT_EQ(out[i], in[i]) << "Row " << row << " column " << col << " not equal to input.";
-					}
+				if(in[i] ==  0)
+				{
+					EXPECT_EQ(out[i], -1) << "Row " << row << " column " << col <<  " not left untouched.";
+				}
+				else
+				{
+					EXPECT_EQ(out[i], in[i]) << "Row " << row << " column " << col << " not equal to input.";
 				}
 			}
 		}
 	}
-
 }
 
 TEST(ReactorUnitTests, VectorConstant)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
+		Pointer<Byte> out = function.Arg<0>();
+
+		*Pointer<Int4>(out + 16 * 0) = Int4(0x04030201, 0x08070605, 0x0C0B0A09, 0x100F0E0D);
+		*Pointer<Short4>(out + 16 * 1) = Short4(0x1211, 0x1413, 0x1615, 0x1817);
+		*Pointer<Byte8>(out + 16 * 2) = Byte8(0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20);
+		*Pointer<Int2>(out + 16 * 3) = Int2(0x24232221, 0x28272625);
+
+		Return(0);
+	}
+
+	auto routine = function("one");
+
+	if(routine)
+	{
+		int8_t out[16 * 4] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+			                    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+			                    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+			                    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+
+		int8_t exp[16 * 4] = {1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
+			                    17, 18, 19, 20, 21, 22, 23, 24, -1, -1, -1, -1, -1, -1, -1, -1,
+			                    25, 26, 27, 28, 29, 30, 31, 32, -1, -1, -1, -1, -1, -1, -1, -1,
+			                    33, 34, 35, 36, 37, 38, 39, 40, -1, -1, -1, -1, -1, -1, -1, -1};
+
+		routine(out);
+
+		for(int row = 0; row < 4; row++)
 		{
-			Pointer<Byte> out = function.Arg<0>();
-
-			*Pointer<Int4>(out + 16 * 0) = Int4(0x04030201, 0x08070605, 0x0C0B0A09, 0x100F0E0D);
-			*Pointer<Short4>(out + 16 * 1) = Short4(0x1211, 0x1413, 0x1615, 0x1817);
-			*Pointer<Byte8>(out + 16 * 2) = Byte8(0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20);
-			*Pointer<Int2>(out + 16 * 3) = Int2(0x24232221, 0x28272625);
-
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			int8_t out[16 * 4] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
-
-			int8_t exp[16 * 4] = {1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
-			                      17, 18, 19, 20, 21, 22, 23, 24, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      25, 26, 27, 28, 29, 30, 31, 32, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      33, 34, 35, 36, 37, 38, 39, 40, -1, -1, -1, -1, -1, -1, -1, -1};
-
-			routine(out);
-
-			for(int row = 0; row < 4; row++)
+			for(int col = 0; col < 16; col++)
 			{
-				for(int col = 0; col < 16; col++)
-				{
-					int i = row * 16 + col;
+				int i = row * 16 + col;
 
-					EXPECT_EQ(out[i], exp[i]);
-				}
+				EXPECT_EQ(out[i], exp[i]);
 			}
 		}
 	}
-
 }
 
 TEST(ReactorUnitTests, Concatenate)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
+		Pointer<Byte> out = function.Arg<0>();
+
+		*Pointer<Int4>(out + 16 * 0)   = Int4(Int2(0x04030201, 0x08070605), Int2(0x0C0B0A09, 0x100F0E0D));
+		*Pointer<Short8>(out + 16 * 1) = Short8(Short4(0x0201, 0x0403, 0x0605, 0x0807), Short4(0x0A09, 0x0C0B, 0x0E0D, 0x100F));
+
+		Return(0);
+	}
+
+	auto routine = function("one");
+
+	if(routine)
+	{
+		int8_t ref[16 * 5] = {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
+			                    1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16};
+
+		int8_t out[16 * 5] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+			                    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+
+		routine(out);
+
+		for(int row = 0; row < 2; row++)
 		{
-			Pointer<Byte> out = function.Arg<0>();
-
-			*Pointer<Int4>(out + 16 * 0)   = Int4(Int2(0x04030201, 0x08070605), Int2(0x0C0B0A09, 0x100F0E0D));
-			*Pointer<Short8>(out + 16 * 1) = Short8(Short4(0x0201, 0x0403, 0x0605, 0x0807), Short4(0x0A09, 0x0C0B, 0x0E0D, 0x100F));
-
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			int8_t ref[16 * 5] = {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
-			                      1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16};
-
-			int8_t out[16 * 5] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
-
-			routine(out);
-
-			for(int row = 0; row < 2; row++)
+			for(int col = 0; col < 16; col++)
 			{
-				for(int col = 0; col < 16; col++)
-				{
-					int i = row * 16 + col;
+				int i = row * 16 + col;
 
-					EXPECT_EQ(out[i], ref[i]) << "Row " << row << " column " << col << " not equal to reference.";
-				}
+				EXPECT_EQ(out[i], ref[i]) << "Row " << row << " column " << col << " not equal to reference.";
 			}
 		}
 	}
-
 }
 
 TEST(ReactorUnitTests, Swizzle)
@@ -476,7 +455,6 @@
 			}
 		}
 	}
-
 }
 
 TEST(ReactorUnitTests, Blend)
@@ -581,231 +559,223 @@
 
 TEST(ReactorUnitTests, Branching)
 {
+	FunctionT<int()> function;
 	{
-		FunctionT<int()> function;
+		Int x = 0;
+
+		For(Int i = 0, i < 8, i++)
 		{
-			Int x = 0;
-
-			For(Int i = 0, i < 8, i++)
+			If(i < 2)
 			{
-				If(i < 2)
-				{
-					x += 1;
-				}
-				Else If(i < 4)
-				{
-					x += 10;
-				}
-				Else If(i < 6)
-				{
-					x += 100;
-				}
-				Else
-				{
-					x += 1000;
-				}
-
-				For(Int i = 0, i < 5, i++)
-					x += 10000;
+				x += 1;
+			}
+			Else If(i < 4)
+			{
+				x += 10;
+			}
+			Else If(i < 6)
+			{
+				x += 100;
+			}
+			Else
+			{
+				x += 1000;
 			}
 
-			For(Int i = 0, i < 10, i++)
-				for(int i = 0; i < 10; i++)
-					For(Int i = 0, i < 10, i++)
-					{
-						x += 1000000;
-					}
+			For(Int i = 0, i < 5, i++)
+				x += 10000;
+		}
 
-			For(Int i = 0, i < 2, i++)
-				If(x == 1000402222)
+		For(Int i = 0, i < 10, i++)
+			for(int i = 0; i < 10; i++)
+				For(Int i = 0, i < 10, i++)
 				{
-					If(x != 1000402222)
-						x += 1000000000;
+					x += 1000000;
 				}
-				Else
-					x = -5;
 
-			Return(x);
-		}
+		For(Int i = 0, i < 2, i++)
+			If(x == 1000402222)
+			{
+				If(x != 1000402222)
+					x += 1000000000;
+			}
+			Else
+				x = -5;
 
-		auto routine = function("one");
+		Return(x);
+	}
 
-		if(routine)
-		{
-			int result = routine();
+	auto routine = function("one");
 
-			EXPECT_EQ(result, 1000402222);
-		}
+	if(routine)
+	{
+		int result = routine();
+
+		EXPECT_EQ(result, 1000402222);
 	}
 
 }
 
 TEST(ReactorUnitTests, MinMax)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
-		{
-			Pointer<Byte> out = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<0>();
 
-			*Pointer<Float4>(out + 16 * 0) = Min(Float4(1.0f, 0.0f, -0.0f, +0.0f), Float4(0.0f, 1.0f, +0.0f, -0.0f));
-			*Pointer<Float4>(out + 16 * 1) = Max(Float4(1.0f, 0.0f, -0.0f, +0.0f), Float4(0.0f, 1.0f, +0.0f, -0.0f));
+		*Pointer<Float4>(out + 16 * 0) = Min(Float4(1.0f, 0.0f, -0.0f, +0.0f), Float4(0.0f, 1.0f, +0.0f, -0.0f));
+		*Pointer<Float4>(out + 16 * 1) = Max(Float4(1.0f, 0.0f, -0.0f, +0.0f), Float4(0.0f, 1.0f, +0.0f, -0.0f));
 
-			*Pointer<Int4>(out + 16 * 2) = Min(Int4(1, 0, -1, -0), Int4(0, 1, 0, +0));
-			*Pointer<Int4>(out + 16 * 3) = Max(Int4(1, 0, -1, -0), Int4(0, 1, 0, +0));
-			*Pointer<UInt4>(out + 16 * 4) = Min(UInt4(1, 0, -1, -0), UInt4(0, 1, 0, +0));
-			*Pointer<UInt4>(out + 16 * 5) = Max(UInt4(1, 0, -1, -0), UInt4(0, 1, 0, +0));
+		*Pointer<Int4>(out + 16 * 2) = Min(Int4(1, 0, -1, -0), Int4(0, 1, 0, +0));
+		*Pointer<Int4>(out + 16 * 3) = Max(Int4(1, 0, -1, -0), Int4(0, 1, 0, +0));
+		*Pointer<UInt4>(out + 16 * 4) = Min(UInt4(1, 0, -1, -0), UInt4(0, 1, 0, +0));
+		*Pointer<UInt4>(out + 16 * 5) = Max(UInt4(1, 0, -1, -0), UInt4(0, 1, 0, +0));
 
-			*Pointer<Short4>(out + 16 * 6) = Min(Short4(1, 0, -1, -0), Short4(0, 1, 0, +0));
-			*Pointer<Short4>(out + 16 * 7) = Max(Short4(1, 0, -1, -0), Short4(0, 1, 0, +0));
-			*Pointer<UShort4>(out + 16 * 8) = Min(UShort4(1, 0, -1, -0), UShort4(0, 1, 0, +0));
-			*Pointer<UShort4>(out + 16 * 9) = Max(UShort4(1, 0, -1, -0), UShort4(0, 1, 0, +0));
+		*Pointer<Short4>(out + 16 * 6) = Min(Short4(1, 0, -1, -0), Short4(0, 1, 0, +0));
+		*Pointer<Short4>(out + 16 * 7) = Max(Short4(1, 0, -1, -0), Short4(0, 1, 0, +0));
+		*Pointer<UShort4>(out + 16 * 8) = Min(UShort4(1, 0, -1, -0), UShort4(0, 1, 0, +0));
+		*Pointer<UShort4>(out + 16 * 9) = Max(UShort4(1, 0, -1, -0), UShort4(0, 1, 0, +0));
 
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			unsigned int out[10][4];
-
-			memset(&out, 0, sizeof(out));
-
-			routine(&out);
-
-			EXPECT_EQ(out[0][0], 0x00000000u);
-			EXPECT_EQ(out[0][1], 0x00000000u);
-			EXPECT_EQ(out[0][2], 0x00000000u);
-			EXPECT_EQ(out[0][3], 0x80000000u);
-
-			EXPECT_EQ(out[1][0], 0x3F800000u);
-			EXPECT_EQ(out[1][1], 0x3F800000u);
-			EXPECT_EQ(out[1][2], 0x00000000u);
-			EXPECT_EQ(out[1][3], 0x80000000u);
-
-			EXPECT_EQ(out[2][0], 0x00000000u);
-			EXPECT_EQ(out[2][1], 0x00000000u);
-			EXPECT_EQ(out[2][2], 0xFFFFFFFFu);
-			EXPECT_EQ(out[2][3], 0x00000000u);
-
-			EXPECT_EQ(out[3][0], 0x00000001u);
-			EXPECT_EQ(out[3][1], 0x00000001u);
-			EXPECT_EQ(out[3][2], 0x00000000u);
-			EXPECT_EQ(out[3][3], 0x00000000u);
-
-			EXPECT_EQ(out[4][0], 0x00000000u);
-			EXPECT_EQ(out[4][1], 0x00000000u);
-			EXPECT_EQ(out[4][2], 0x00000000u);
-			EXPECT_EQ(out[4][3], 0x00000000u);
-
-			EXPECT_EQ(out[5][0], 0x00000001u);
-			EXPECT_EQ(out[5][1], 0x00000001u);
-			EXPECT_EQ(out[5][2], 0xFFFFFFFFu);
-			EXPECT_EQ(out[5][3], 0x00000000u);
-
-			EXPECT_EQ(out[6][0], 0x00000000u);
-			EXPECT_EQ(out[6][1], 0x0000FFFFu);
-			EXPECT_EQ(out[6][2], 0x00000000u);
-			EXPECT_EQ(out[6][3], 0x00000000u);
-
-			EXPECT_EQ(out[7][0], 0x00010001u);
-			EXPECT_EQ(out[7][1], 0x00000000u);
-			EXPECT_EQ(out[7][2], 0x00000000u);
-			EXPECT_EQ(out[7][3], 0x00000000u);
-
-			EXPECT_EQ(out[8][0], 0x00000000u);
-			EXPECT_EQ(out[8][1], 0x00000000u);
-			EXPECT_EQ(out[8][2], 0x00000000u);
-			EXPECT_EQ(out[8][3], 0x00000000u);
-
-			EXPECT_EQ(out[9][0], 0x00010001u);
-			EXPECT_EQ(out[9][1], 0x0000FFFFu);
-			EXPECT_EQ(out[9][2], 0x00000000u);
-			EXPECT_EQ(out[9][3], 0x00000000u);
-		}
+		Return(0);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		unsigned int out[10][4];
+
+		memset(&out, 0, sizeof(out));
+
+		routine(&out);
+
+		EXPECT_EQ(out[0][0], 0x00000000u);
+		EXPECT_EQ(out[0][1], 0x00000000u);
+		EXPECT_EQ(out[0][2], 0x00000000u);
+		EXPECT_EQ(out[0][3], 0x80000000u);
+
+		EXPECT_EQ(out[1][0], 0x3F800000u);
+		EXPECT_EQ(out[1][1], 0x3F800000u);
+		EXPECT_EQ(out[1][2], 0x00000000u);
+		EXPECT_EQ(out[1][3], 0x80000000u);
+
+		EXPECT_EQ(out[2][0], 0x00000000u);
+		EXPECT_EQ(out[2][1], 0x00000000u);
+		EXPECT_EQ(out[2][2], 0xFFFFFFFFu);
+		EXPECT_EQ(out[2][3], 0x00000000u);
+
+		EXPECT_EQ(out[3][0], 0x00000001u);
+		EXPECT_EQ(out[3][1], 0x00000001u);
+		EXPECT_EQ(out[3][2], 0x00000000u);
+		EXPECT_EQ(out[3][3], 0x00000000u);
+
+		EXPECT_EQ(out[4][0], 0x00000000u);
+		EXPECT_EQ(out[4][1], 0x00000000u);
+		EXPECT_EQ(out[4][2], 0x00000000u);
+		EXPECT_EQ(out[4][3], 0x00000000u);
+
+		EXPECT_EQ(out[5][0], 0x00000001u);
+		EXPECT_EQ(out[5][1], 0x00000001u);
+		EXPECT_EQ(out[5][2], 0xFFFFFFFFu);
+		EXPECT_EQ(out[5][3], 0x00000000u);
+
+		EXPECT_EQ(out[6][0], 0x00000000u);
+		EXPECT_EQ(out[6][1], 0x0000FFFFu);
+		EXPECT_EQ(out[6][2], 0x00000000u);
+		EXPECT_EQ(out[6][3], 0x00000000u);
+
+		EXPECT_EQ(out[7][0], 0x00010001u);
+		EXPECT_EQ(out[7][1], 0x00000000u);
+		EXPECT_EQ(out[7][2], 0x00000000u);
+		EXPECT_EQ(out[7][3], 0x00000000u);
+
+		EXPECT_EQ(out[8][0], 0x00000000u);
+		EXPECT_EQ(out[8][1], 0x00000000u);
+		EXPECT_EQ(out[8][2], 0x00000000u);
+		EXPECT_EQ(out[8][3], 0x00000000u);
+
+		EXPECT_EQ(out[9][0], 0x00010001u);
+		EXPECT_EQ(out[9][1], 0x0000FFFFu);
+		EXPECT_EQ(out[9][2], 0x00000000u);
+		EXPECT_EQ(out[9][3], 0x00000000u);
+	}
 }
 
 TEST(ReactorUnitTests, NotNeg)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
-		{
-			Pointer<Byte> out = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<0>();
 
-			*Pointer<Int>(out + 16 * 0) = ~Int(0x55555555);
-			*Pointer<Short>(out + 16 * 1) = ~Short(0x5555);
-			*Pointer<Int4>(out + 16 * 2) = ~Int4(0x55555555, 0xAAAAAAAA, 0x00000000, 0xFFFFFFFF);
-			*Pointer<Short4>(out + 16 * 3) = ~Short4(0x5555, 0xAAAA, 0x0000, 0xFFFF);
+		*Pointer<Int>(out + 16 * 0) = ~Int(0x55555555);
+		*Pointer<Short>(out + 16 * 1) = ~Short(0x5555);
+		*Pointer<Int4>(out + 16 * 2) = ~Int4(0x55555555, 0xAAAAAAAA, 0x00000000, 0xFFFFFFFF);
+		*Pointer<Short4>(out + 16 * 3) = ~Short4(0x5555, 0xAAAA, 0x0000, 0xFFFF);
 
-			*Pointer<Int>(out + 16 * 4) = -Int(0x55555555);
-			*Pointer<Short>(out + 16 * 5) = -Short(0x5555);
-			*Pointer<Int4>(out + 16 * 6) = -Int4(0x55555555, 0xAAAAAAAA, 0x00000000, 0xFFFFFFFF);
-			*Pointer<Short4>(out + 16 * 7) = -Short4(0x5555, 0xAAAA, 0x0000, 0xFFFF);
+		*Pointer<Int>(out + 16 * 4) = -Int(0x55555555);
+		*Pointer<Short>(out + 16 * 5) = -Short(0x5555);
+		*Pointer<Int4>(out + 16 * 6) = -Int4(0x55555555, 0xAAAAAAAA, 0x00000000, 0xFFFFFFFF);
+		*Pointer<Short4>(out + 16 * 7) = -Short4(0x5555, 0xAAAA, 0x0000, 0xFFFF);
 
-			*Pointer<Float4>(out + 16 * 8) = -Float4(1.0f, -1.0f, 0.0f, -0.0f);
+		*Pointer<Float4>(out + 16 * 8) = -Float4(1.0f, -1.0f, 0.0f, -0.0f);
 
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			unsigned int out[10][4];
-
-			memset(&out, 0, sizeof(out));
-
-			routine(&out);
-
-			EXPECT_EQ(out[0][0], 0xAAAAAAAAu);
-			EXPECT_EQ(out[0][1], 0x00000000u);
-			EXPECT_EQ(out[0][2], 0x00000000u);
-			EXPECT_EQ(out[0][3], 0x00000000u);
-
-			EXPECT_EQ(out[1][0], 0x0000AAAAu);
-			EXPECT_EQ(out[1][1], 0x00000000u);
-			EXPECT_EQ(out[1][2], 0x00000000u);
-			EXPECT_EQ(out[1][3], 0x00000000u);
-
-			EXPECT_EQ(out[2][0], 0xAAAAAAAAu);
-			EXPECT_EQ(out[2][1], 0x55555555u);
-			EXPECT_EQ(out[2][2], 0xFFFFFFFFu);
-			EXPECT_EQ(out[2][3], 0x00000000u);
-
-			EXPECT_EQ(out[3][0], 0x5555AAAAu);
-			EXPECT_EQ(out[3][1], 0x0000FFFFu);
-			EXPECT_EQ(out[3][2], 0x00000000u);
-			EXPECT_EQ(out[3][3], 0x00000000u);
-
-			EXPECT_EQ(out[4][0], 0xAAAAAAABu);
-			EXPECT_EQ(out[4][1], 0x00000000u);
-			EXPECT_EQ(out[4][2], 0x00000000u);
-			EXPECT_EQ(out[4][3], 0x00000000u);
-
-			EXPECT_EQ(out[5][0], 0x0000AAABu);
-			EXPECT_EQ(out[5][1], 0x00000000u);
-			EXPECT_EQ(out[5][2], 0x00000000u);
-			EXPECT_EQ(out[5][3], 0x00000000u);
-
-			EXPECT_EQ(out[6][0], 0xAAAAAAABu);
-			EXPECT_EQ(out[6][1], 0x55555556u);
-			EXPECT_EQ(out[6][2], 0x00000000u);
-			EXPECT_EQ(out[6][3], 0x00000001u);
-
-			EXPECT_EQ(out[7][0], 0x5556AAABu);
-			EXPECT_EQ(out[7][1], 0x00010000u);
-			EXPECT_EQ(out[7][2], 0x00000000u);
-			EXPECT_EQ(out[7][3], 0x00000000u);
-
-			EXPECT_EQ(out[8][0], 0xBF800000u);
-			EXPECT_EQ(out[8][1], 0x3F800000u);
-			EXPECT_EQ(out[8][2], 0x80000000u);
-			EXPECT_EQ(out[8][3], 0x00000000u);
-		}
+		Return(0);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		unsigned int out[10][4];
+
+		memset(&out, 0, sizeof(out));
+
+		routine(&out);
+
+		EXPECT_EQ(out[0][0], 0xAAAAAAAAu);
+		EXPECT_EQ(out[0][1], 0x00000000u);
+		EXPECT_EQ(out[0][2], 0x00000000u);
+		EXPECT_EQ(out[0][3], 0x00000000u);
+
+		EXPECT_EQ(out[1][0], 0x0000AAAAu);
+		EXPECT_EQ(out[1][1], 0x00000000u);
+		EXPECT_EQ(out[1][2], 0x00000000u);
+		EXPECT_EQ(out[1][3], 0x00000000u);
+
+		EXPECT_EQ(out[2][0], 0xAAAAAAAAu);
+		EXPECT_EQ(out[2][1], 0x55555555u);
+		EXPECT_EQ(out[2][2], 0xFFFFFFFFu);
+		EXPECT_EQ(out[2][3], 0x00000000u);
+
+		EXPECT_EQ(out[3][0], 0x5555AAAAu);
+		EXPECT_EQ(out[3][1], 0x0000FFFFu);
+		EXPECT_EQ(out[3][2], 0x00000000u);
+		EXPECT_EQ(out[3][3], 0x00000000u);
+
+		EXPECT_EQ(out[4][0], 0xAAAAAAABu);
+		EXPECT_EQ(out[4][1], 0x00000000u);
+		EXPECT_EQ(out[4][2], 0x00000000u);
+		EXPECT_EQ(out[4][3], 0x00000000u);
+
+		EXPECT_EQ(out[5][0], 0x0000AAABu);
+		EXPECT_EQ(out[5][1], 0x00000000u);
+		EXPECT_EQ(out[5][2], 0x00000000u);
+		EXPECT_EQ(out[5][3], 0x00000000u);
+
+		EXPECT_EQ(out[6][0], 0xAAAAAAABu);
+		EXPECT_EQ(out[6][1], 0x55555556u);
+		EXPECT_EQ(out[6][2], 0x00000000u);
+		EXPECT_EQ(out[6][3], 0x00000001u);
+
+		EXPECT_EQ(out[7][0], 0x5556AAABu);
+		EXPECT_EQ(out[7][1], 0x00010000u);
+		EXPECT_EQ(out[7][2], 0x00000000u);
+		EXPECT_EQ(out[7][3], 0x00000000u);
+
+		EXPECT_EQ(out[8][0], 0xBF800000u);
+		EXPECT_EQ(out[8][1], 0x3F800000u);
+		EXPECT_EQ(out[8][2], 0x80000000u);
+		EXPECT_EQ(out[8][3], 0x00000000u);
+	}
 }
 
 TEST(ReactorUnitTests, FPtoUI)
@@ -848,375 +818,357 @@
 
 TEST(ReactorUnitTests, VectorCompare)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
-		{
-			Pointer<Byte> out = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<0>();
 
-			*Pointer<Int4>(out + 16 * 0) = CmpEQ(Float4(1.0f, 1.0f, -0.0f, +0.0f), Float4(0.0f, 1.0f, +0.0f, -0.0f));
-			*Pointer<Int4>(out + 16 * 1) = CmpEQ(Int4(1, 0, -1, -0), Int4(0, 1, 0, +0));
-			*Pointer<Byte8>(out + 16 * 2) = CmpEQ(SByte8(1, 2, 3, 4, 5, 6, 7, 8), SByte8(7, 6, 5, 4, 3, 2, 1, 0));
+		*Pointer<Int4>(out + 16 * 0) = CmpEQ(Float4(1.0f, 1.0f, -0.0f, +0.0f), Float4(0.0f, 1.0f, +0.0f, -0.0f));
+		*Pointer<Int4>(out + 16 * 1) = CmpEQ(Int4(1, 0, -1, -0), Int4(0, 1, 0, +0));
+		*Pointer<Byte8>(out + 16 * 2) = CmpEQ(SByte8(1, 2, 3, 4, 5, 6, 7, 8), SByte8(7, 6, 5, 4, 3, 2, 1, 0));
 
-			*Pointer<Int4>(out + 16 * 3) = CmpNLT(Float4(1.0f, 1.0f, -0.0f, +0.0f), Float4(0.0f, 1.0f, +0.0f, -0.0f));
-			*Pointer<Int4>(out + 16 * 4) = CmpNLT(Int4(1, 0, -1, -0), Int4(0, 1, 0, +0));
-			*Pointer<Byte8>(out + 16 * 5) = CmpGT(SByte8(1, 2, 3, 4, 5, 6, 7, 8), SByte8(7, 6, 5, 4, 3, 2, 1, 0));
+		*Pointer<Int4>(out + 16 * 3) = CmpNLT(Float4(1.0f, 1.0f, -0.0f, +0.0f), Float4(0.0f, 1.0f, +0.0f, -0.0f));
+		*Pointer<Int4>(out + 16 * 4) = CmpNLT(Int4(1, 0, -1, -0), Int4(0, 1, 0, +0));
+		*Pointer<Byte8>(out + 16 * 5) = CmpGT(SByte8(1, 2, 3, 4, 5, 6, 7, 8), SByte8(7, 6, 5, 4, 3, 2, 1, 0));
 
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			unsigned int out[6][4];
-
-			memset(&out, 0, sizeof(out));
-
-			routine(&out);
-
-			EXPECT_EQ(out[0][0], 0x00000000u);
-			EXPECT_EQ(out[0][1], 0xFFFFFFFFu);
-			EXPECT_EQ(out[0][2], 0xFFFFFFFFu);
-			EXPECT_EQ(out[0][3], 0xFFFFFFFFu);
-
-			EXPECT_EQ(out[1][0], 0x00000000u);
-			EXPECT_EQ(out[1][1], 0x00000000u);
-			EXPECT_EQ(out[1][2], 0x00000000u);
-			EXPECT_EQ(out[1][3], 0xFFFFFFFFu);
-
-			EXPECT_EQ(out[2][0], 0xFF000000u);
-			EXPECT_EQ(out[2][1], 0x00000000u);
-
-			EXPECT_EQ(out[3][0], 0xFFFFFFFFu);
-			EXPECT_EQ(out[3][1], 0xFFFFFFFFu);
-			EXPECT_EQ(out[3][2], 0xFFFFFFFFu);
-			EXPECT_EQ(out[3][3], 0xFFFFFFFFu);
-
-			EXPECT_EQ(out[4][0], 0xFFFFFFFFu);
-			EXPECT_EQ(out[4][1], 0x00000000u);
-			EXPECT_EQ(out[4][2], 0x00000000u);
-			EXPECT_EQ(out[4][3], 0xFFFFFFFFu);
-
-			EXPECT_EQ(out[5][0], 0x00000000u);
-			EXPECT_EQ(out[5][1], 0xFFFFFFFFu);
-		}
+		Return(0);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		unsigned int out[6][4];
+
+		memset(&out, 0, sizeof(out));
+
+		routine(&out);
+
+		EXPECT_EQ(out[0][0], 0x00000000u);
+		EXPECT_EQ(out[0][1], 0xFFFFFFFFu);
+		EXPECT_EQ(out[0][2], 0xFFFFFFFFu);
+		EXPECT_EQ(out[0][3], 0xFFFFFFFFu);
+
+		EXPECT_EQ(out[1][0], 0x00000000u);
+		EXPECT_EQ(out[1][1], 0x00000000u);
+		EXPECT_EQ(out[1][2], 0x00000000u);
+		EXPECT_EQ(out[1][3], 0xFFFFFFFFu);
+
+		EXPECT_EQ(out[2][0], 0xFF000000u);
+		EXPECT_EQ(out[2][1], 0x00000000u);
+
+		EXPECT_EQ(out[3][0], 0xFFFFFFFFu);
+		EXPECT_EQ(out[3][1], 0xFFFFFFFFu);
+		EXPECT_EQ(out[3][2], 0xFFFFFFFFu);
+		EXPECT_EQ(out[3][3], 0xFFFFFFFFu);
+
+		EXPECT_EQ(out[4][0], 0xFFFFFFFFu);
+		EXPECT_EQ(out[4][1], 0x00000000u);
+		EXPECT_EQ(out[4][2], 0x00000000u);
+		EXPECT_EQ(out[4][3], 0xFFFFFFFFu);
+
+		EXPECT_EQ(out[5][0], 0x00000000u);
+		EXPECT_EQ(out[5][1], 0xFFFFFFFFu);
+	}
 }
 
 TEST(ReactorUnitTests, SaturatedAddAndSubtract)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
-		{
-			Pointer<Byte> out = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<0>();
 
-			*Pointer<Byte8>(out + 8 * 0) =
-				AddSat(Byte8(1, 2, 3, 4, 5, 6, 7, 8),
-				       Byte8(7, 6, 5, 4, 3, 2, 1, 0));
-			*Pointer<Byte8>(out + 8 * 1) =
-				AddSat(Byte8(0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE),
-				       Byte8(7, 6, 5, 4, 3, 2, 1, 0));
-			*Pointer<Byte8>(out + 8 * 2) =
-				SubSat(Byte8(1, 2, 3, 4, 5, 6, 7, 8),
-				       Byte8(7, 6, 5, 4, 3, 2, 1, 0));
+		*Pointer<Byte8>(out + 8 * 0) =
+			AddSat(Byte8(1, 2, 3, 4, 5, 6, 7, 8),
+				    Byte8(7, 6, 5, 4, 3, 2, 1, 0));
+		*Pointer<Byte8>(out + 8 * 1) =
+			AddSat(Byte8(0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE),
+				    Byte8(7, 6, 5, 4, 3, 2, 1, 0));
+		*Pointer<Byte8>(out + 8 * 2) =
+			SubSat(Byte8(1, 2, 3, 4, 5, 6, 7, 8),
+				    Byte8(7, 6, 5, 4, 3, 2, 1, 0));
 
-			*Pointer<SByte8>(out + 8 * 3) =
-				AddSat(SByte8(1, 2, 3, 4, 5, 6, 7, 8),
-				       SByte8(7, 6, 5, 4, 3, 2, 1, 0));
-			*Pointer<SByte8>(out + 8 * 4) =
-				AddSat(SByte8(0x7E, 0x7E, 0x7E, 0x7E, 0x7E, 0x7E, 0x7E, 0x7E),
-				       SByte8(7, 6, 5, 4, 3, 2, 1, 0));
-			*Pointer<SByte8>(out + 8 * 5) =
-				AddSat(SByte8(0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88),
-				       SByte8(-7, -6, -5, -4, -3, -2, -1, -0));
-			*Pointer<SByte8>(out + 8 * 6) =
-				SubSat(SByte8(0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88),
-				       SByte8(7, 6, 5, 4, 3, 2, 1, 0));
+		*Pointer<SByte8>(out + 8 * 3) =
+			AddSat(SByte8(1, 2, 3, 4, 5, 6, 7, 8),
+				    SByte8(7, 6, 5, 4, 3, 2, 1, 0));
+		*Pointer<SByte8>(out + 8 * 4) =
+			AddSat(SByte8(0x7E, 0x7E, 0x7E, 0x7E, 0x7E, 0x7E, 0x7E, 0x7E),
+				    SByte8(7, 6, 5, 4, 3, 2, 1, 0));
+		*Pointer<SByte8>(out + 8 * 5) =
+			AddSat(SByte8(0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88),
+				    SByte8(-7, -6, -5, -4, -3, -2, -1, -0));
+		*Pointer<SByte8>(out + 8 * 6) =
+			SubSat(SByte8(0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88),
+				    SByte8(7, 6, 5, 4, 3, 2, 1, 0));
 
-			*Pointer<Short4>(out + 8 * 7) =
-				AddSat(Short4(1, 2, 3, 4), Short4(3, 2, 1, 0));
-			*Pointer<Short4>(out + 8 * 8) =
-				AddSat(Short4(0x7FFE, 0x7FFE, 0x7FFE, 0x7FFE),
-				       Short4(3, 2, 1, 0));
-			*Pointer<Short4>(out + 8 * 9) =
-				AddSat(Short4(0x8001, 0x8002, 0x8003, 0x8004),
-				       Short4(-3, -2, -1, -0));
-			*Pointer<Short4>(out + 8 * 10) =
-				SubSat(Short4(0x8001, 0x8002, 0x8003, 0x8004),
-				       Short4(3, 2, 1, 0));
+		*Pointer<Short4>(out + 8 * 7) =
+			AddSat(Short4(1, 2, 3, 4), Short4(3, 2, 1, 0));
+		*Pointer<Short4>(out + 8 * 8) =
+			AddSat(Short4(0x7FFE, 0x7FFE, 0x7FFE, 0x7FFE),
+				    Short4(3, 2, 1, 0));
+		*Pointer<Short4>(out + 8 * 9) =
+			AddSat(Short4(0x8001, 0x8002, 0x8003, 0x8004),
+				    Short4(-3, -2, -1, -0));
+		*Pointer<Short4>(out + 8 * 10) =
+			SubSat(Short4(0x8001, 0x8002, 0x8003, 0x8004),
+				    Short4(3, 2, 1, 0));
 
-			*Pointer<UShort4>(out + 8 * 11) =
-				AddSat(UShort4(1, 2, 3, 4), UShort4(3, 2, 1, 0));
-			*Pointer<UShort4>(out + 8 * 12) =
-				AddSat(UShort4(0xFFFE, 0xFFFE, 0xFFFE, 0xFFFE),
-				       UShort4(3, 2, 1, 0));
-			*Pointer<UShort4>(out + 8 * 13) =
-				SubSat(UShort4(1, 2, 3, 4), UShort4(3, 2, 1, 0));
+		*Pointer<UShort4>(out + 8 * 11) =
+			AddSat(UShort4(1, 2, 3, 4), UShort4(3, 2, 1, 0));
+		*Pointer<UShort4>(out + 8 * 12) =
+			AddSat(UShort4(0xFFFE, 0xFFFE, 0xFFFE, 0xFFFE),
+				    UShort4(3, 2, 1, 0));
+		*Pointer<UShort4>(out + 8 * 13) =
+			SubSat(UShort4(1, 2, 3, 4), UShort4(3, 2, 1, 0));
 
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			unsigned int out[14][2];
-
-			memset(&out, 0, sizeof(out));
-
-			routine(&out);
-
-			EXPECT_EQ(out[0][0], 0x08080808u);
-			EXPECT_EQ(out[0][1], 0x08080808u);
-
-			EXPECT_EQ(out[1][0], 0xFFFFFFFFu);
-			EXPECT_EQ(out[1][1], 0xFEFFFFFFu);
-
-			EXPECT_EQ(out[2][0], 0x00000000u);
-			EXPECT_EQ(out[2][1], 0x08060402u);
-
-			EXPECT_EQ(out[3][0], 0x08080808u);
-			EXPECT_EQ(out[3][1], 0x08080808u);
-
-			EXPECT_EQ(out[4][0], 0x7F7F7F7Fu);
-			EXPECT_EQ(out[4][1], 0x7E7F7F7Fu);
-
-			EXPECT_EQ(out[5][0], 0x80808080u);
-			EXPECT_EQ(out[5][1], 0x88868482u);
-
-			EXPECT_EQ(out[6][0], 0x80808080u);
-			EXPECT_EQ(out[6][1], 0x88868482u);
-
-			EXPECT_EQ(out[7][0], 0x00040004u);
-			EXPECT_EQ(out[7][1], 0x00040004u);
-
-			EXPECT_EQ(out[8][0], 0x7FFF7FFFu);
-			EXPECT_EQ(out[8][1], 0x7FFE7FFFu);
-
-			EXPECT_EQ(out[9][0], 0x80008000u);
-			EXPECT_EQ(out[9][1], 0x80048002u);
-
-			EXPECT_EQ(out[10][0], 0x80008000u);
-			EXPECT_EQ(out[10][1], 0x80048002u);
-
-			EXPECT_EQ(out[11][0], 0x00040004u);
-			EXPECT_EQ(out[11][1], 0x00040004u);
-
-			EXPECT_EQ(out[12][0], 0xFFFFFFFFu);
-			EXPECT_EQ(out[12][1], 0xFFFEFFFFu);
-
-			EXPECT_EQ(out[13][0], 0x00000000u);
-			EXPECT_EQ(out[13][1], 0x00040002u);
-		}
+		Return(0);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		unsigned int out[14][2];
+
+		memset(&out, 0, sizeof(out));
+
+		routine(&out);
+
+		EXPECT_EQ(out[0][0], 0x08080808u);
+		EXPECT_EQ(out[0][1], 0x08080808u);
+
+		EXPECT_EQ(out[1][0], 0xFFFFFFFFu);
+		EXPECT_EQ(out[1][1], 0xFEFFFFFFu);
+
+		EXPECT_EQ(out[2][0], 0x00000000u);
+		EXPECT_EQ(out[2][1], 0x08060402u);
+
+		EXPECT_EQ(out[3][0], 0x08080808u);
+		EXPECT_EQ(out[3][1], 0x08080808u);
+
+		EXPECT_EQ(out[4][0], 0x7F7F7F7Fu);
+		EXPECT_EQ(out[4][1], 0x7E7F7F7Fu);
+
+		EXPECT_EQ(out[5][0], 0x80808080u);
+		EXPECT_EQ(out[5][1], 0x88868482u);
+
+		EXPECT_EQ(out[6][0], 0x80808080u);
+		EXPECT_EQ(out[6][1], 0x88868482u);
+
+		EXPECT_EQ(out[7][0], 0x00040004u);
+		EXPECT_EQ(out[7][1], 0x00040004u);
+
+		EXPECT_EQ(out[8][0], 0x7FFF7FFFu);
+		EXPECT_EQ(out[8][1], 0x7FFE7FFFu);
+
+		EXPECT_EQ(out[9][0], 0x80008000u);
+		EXPECT_EQ(out[9][1], 0x80048002u);
+
+		EXPECT_EQ(out[10][0], 0x80008000u);
+		EXPECT_EQ(out[10][1], 0x80048002u);
+
+		EXPECT_EQ(out[11][0], 0x00040004u);
+		EXPECT_EQ(out[11][1], 0x00040004u);
+
+		EXPECT_EQ(out[12][0], 0xFFFFFFFFu);
+		EXPECT_EQ(out[12][1], 0xFFFEFFFFu);
+
+		EXPECT_EQ(out[13][0], 0x00000000u);
+		EXPECT_EQ(out[13][1], 0x00040002u);
+	}
 }
 
 TEST(ReactorUnitTests, Unpack)
 {
+	FunctionT<int(void*, void*)> function;
 	{
-		FunctionT<int(void*, void*)> function;
-		{
-			Pointer<Byte> in = function.Arg<0>();
-			Pointer<Byte> out = function.Arg<1>();
+		Pointer<Byte> in = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<1>();
 
-			Byte4 test_byte_a = *Pointer<Byte4>(in + 4 * 0);
-			Byte4 test_byte_b = *Pointer<Byte4>(in + 4 * 1);
+		Byte4 test_byte_a = *Pointer<Byte4>(in + 4 * 0);
+		Byte4 test_byte_b = *Pointer<Byte4>(in + 4 * 1);
 
-			*Pointer<Short4>(out + 8 * 0) =
-				Unpack(test_byte_a, test_byte_b);
+		*Pointer<Short4>(out + 8 * 0) =
+			Unpack(test_byte_a, test_byte_b);
 
-			*Pointer<Short4>(out + 8 * 1) = Unpack(test_byte_a);
+		*Pointer<Short4>(out + 8 * 1) = Unpack(test_byte_a);
 
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			unsigned int in[1][2];
-			unsigned int out[2][2];
-
-			memset(&out, 0, sizeof(out));
-
-			in[0][0] = 0xABCDEF12u;
-			in[0][1] = 0x34567890u;
-
-			routine(&in, &out);
-
-			EXPECT_EQ(out[0][0], 0x78EF9012u);
-			EXPECT_EQ(out[0][1], 0x34AB56CDu);
-
-			EXPECT_EQ(out[1][0], 0xEFEF1212u);
-			EXPECT_EQ(out[1][1], 0xABABCDCDu);
-		}
+		Return(0);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		unsigned int in[1][2];
+		unsigned int out[2][2];
+
+		memset(&out, 0, sizeof(out));
+
+		in[0][0] = 0xABCDEF12u;
+		in[0][1] = 0x34567890u;
+
+		routine(&in, &out);
+
+		EXPECT_EQ(out[0][0], 0x78EF9012u);
+		EXPECT_EQ(out[0][1], 0x34AB56CDu);
+
+		EXPECT_EQ(out[1][0], 0xEFEF1212u);
+		EXPECT_EQ(out[1][1], 0xABABCDCDu);
+	}
 }
 
 TEST(ReactorUnitTests, Pack)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
-		{
-			Pointer<Byte> out = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<0>();
 
-			*Pointer<SByte8>(out + 8 * 0) =
-				PackSigned(Short4(-1, -2, 1, 2),
-					   Short4(3, 4, -3, -4));
+		*Pointer<SByte8>(out + 8 * 0) =
+			PackSigned(Short4(-1, -2, 1, 2),
+					Short4(3, 4, -3, -4));
 
-			*Pointer<Byte8>(out + 8 * 1) =
-				PackUnsigned(Short4(-1, -2, 1, 2),
-					     Short4(3, 4, -3, -4));
+		*Pointer<Byte8>(out + 8 * 1) =
+			PackUnsigned(Short4(-1, -2, 1, 2),
+					    Short4(3, 4, -3, -4));
 
-			*Pointer<Short8>(out + 8 * 2) =
-				PackSigned(Int4(-1, -2, 1, 2),
-					   Int4(3, 4, -3, -4));
+		*Pointer<Short8>(out + 8 * 2) =
+			PackSigned(Int4(-1, -2, 1, 2),
+					Int4(3, 4, -3, -4));
 
-			*Pointer<UShort8>(out + 8 * 4) =
-				PackUnsigned(Int4(-1, -2, 1, 2),
-					     Int4(3, 4, -3, -4));
+		*Pointer<UShort8>(out + 8 * 4) =
+			PackUnsigned(Int4(-1, -2, 1, 2),
+					    Int4(3, 4, -3, -4));
 
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			unsigned int out[6][2];
-
-			memset(&out, 0, sizeof(out));
-
-			routine(&out);
-
-			EXPECT_EQ(out[0][0], 0x0201FEFFu);
-			EXPECT_EQ(out[0][1], 0xFCFD0403u);
-
-			EXPECT_EQ(out[1][0], 0x02010000u);
-			EXPECT_EQ(out[1][1], 0x00000403u);
-
-			EXPECT_EQ(out[2][0], 0xFFFEFFFFu);
-			EXPECT_EQ(out[2][1], 0x00020001u);
-
-			EXPECT_EQ(out[3][0], 0x00040003u);
-			EXPECT_EQ(out[3][1], 0xFFFCFFFDu);
-
-			EXPECT_EQ(out[4][0], 0x00000000u);
-			EXPECT_EQ(out[4][1], 0x00020001u);
-
-			EXPECT_EQ(out[5][0], 0x00040003u);
-			EXPECT_EQ(out[5][1], 0x00000000u);
-		}
+		Return(0);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		unsigned int out[6][2];
+
+		memset(&out, 0, sizeof(out));
+
+		routine(&out);
+
+		EXPECT_EQ(out[0][0], 0x0201FEFFu);
+		EXPECT_EQ(out[0][1], 0xFCFD0403u);
+
+		EXPECT_EQ(out[1][0], 0x02010000u);
+		EXPECT_EQ(out[1][1], 0x00000403u);
+
+		EXPECT_EQ(out[2][0], 0xFFFEFFFFu);
+		EXPECT_EQ(out[2][1], 0x00020001u);
+
+		EXPECT_EQ(out[3][0], 0x00040003u);
+		EXPECT_EQ(out[3][1], 0xFFFCFFFDu);
+
+		EXPECT_EQ(out[4][0], 0x00000000u);
+		EXPECT_EQ(out[4][1], 0x00020001u);
+
+		EXPECT_EQ(out[5][0], 0x00040003u);
+		EXPECT_EQ(out[5][1], 0x00000000u);
+	}
 }
 
 TEST(ReactorUnitTests, MulHigh)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
-		{
-			Pointer<Byte> out = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<0>();
 
-			*Pointer<Short4>(out + 16 * 0) =
-				MulHigh(Short4(0x01AA, 0x02DD, 0x03EE, 0xF422),
-				        Short4(0x01BB, 0x02CC, 0x03FF, 0xF411));
-			*Pointer<UShort4>(out + 16 * 1) =
-				MulHigh(UShort4(0x01AA, 0x02DD, 0x03EE, 0xF422),
-				        UShort4(0x01BB, 0x02CC, 0x03FF, 0xF411));
+		*Pointer<Short4>(out + 16 * 0) =
+			MulHigh(Short4(0x01AA, 0x02DD, 0x03EE, 0xF422),
+				    Short4(0x01BB, 0x02CC, 0x03FF, 0xF411));
+		*Pointer<UShort4>(out + 16 * 1) =
+			MulHigh(UShort4(0x01AA, 0x02DD, 0x03EE, 0xF422),
+				    UShort4(0x01BB, 0x02CC, 0x03FF, 0xF411));
 
-			*Pointer<Int4>(out + 16 * 2) =
-				MulHigh(Int4(0x000001AA, 0x000002DD, 0xC8000000, 0xF8000000),
-				        Int4(0x000001BB, 0x84000000, 0x000003EE, 0xD7000000));
-			*Pointer<UInt4>(out + 16 * 3) =
-				MulHigh(UInt4(0x000001AAu, 0x000002DDu, 0xC8000000u, 0xD8000000u),
-				        UInt4(0x000001BBu, 0x84000000u, 0x000003EEu, 0xD7000000u));
+		*Pointer<Int4>(out + 16 * 2) =
+			MulHigh(Int4(0x000001AA, 0x000002DD, 0xC8000000, 0xF8000000),
+				    Int4(0x000001BB, 0x84000000, 0x000003EE, 0xD7000000));
+		*Pointer<UInt4>(out + 16 * 3) =
+			MulHigh(UInt4(0x000001AAu, 0x000002DDu, 0xC8000000u, 0xD8000000u),
+				    UInt4(0x000001BBu, 0x84000000u, 0x000003EEu, 0xD7000000u));
 
-			*Pointer<Int4>(out + 16 * 4) =
-				MulHigh(Int4(0x7FFFFFFF, 0x7FFFFFFF, 0x80008000, 0xFFFFFFFF),
-				        Int4(0x7FFFFFFF, 0x80000000, 0x80008000, 0xFFFFFFFF));
-			*Pointer<UInt4>(out + 16 * 5) =
-				MulHigh(UInt4(0x7FFFFFFFu, 0x7FFFFFFFu, 0x80008000u, 0xFFFFFFFFu),
-				        UInt4(0x7FFFFFFFu, 0x80000000u, 0x80008000u, 0xFFFFFFFFu));
+		*Pointer<Int4>(out + 16 * 4) =
+			MulHigh(Int4(0x7FFFFFFF, 0x7FFFFFFF, 0x80008000, 0xFFFFFFFF),
+				    Int4(0x7FFFFFFF, 0x80000000, 0x80008000, 0xFFFFFFFF));
+		*Pointer<UInt4>(out + 16 * 5) =
+			MulHigh(UInt4(0x7FFFFFFFu, 0x7FFFFFFFu, 0x80008000u, 0xFFFFFFFFu),
+				    UInt4(0x7FFFFFFFu, 0x80000000u, 0x80008000u, 0xFFFFFFFFu));
 
-			// (U)Short8 variants currently unimplemented.
+		// (U)Short8 variants currently unimplemented.
 
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			unsigned int out[6][4];
-
-			memset(&out, 0, sizeof(out));
-
-			routine(&out);
-
-			EXPECT_EQ(out[0][0], 0x00080002u);
-			EXPECT_EQ(out[0][1], 0x008D000Fu);
-
-			EXPECT_EQ(out[1][0], 0x00080002u);
-			EXPECT_EQ(out[1][1], 0xE8C0000Fu);
-
-			EXPECT_EQ(out[2][0], 0x00000000u);
-			EXPECT_EQ(out[2][1], 0xFFFFFE9Cu);
-			EXPECT_EQ(out[2][2], 0xFFFFFF23u);
-			EXPECT_EQ(out[2][3], 0x01480000u);
-
-			EXPECT_EQ(out[3][0], 0x00000000u);
-			EXPECT_EQ(out[3][1], 0x00000179u);
-			EXPECT_EQ(out[3][2], 0x00000311u);
-			EXPECT_EQ(out[3][3], 0xB5680000u);
-
-			EXPECT_EQ(out[4][0], 0x3FFFFFFFu);
-			EXPECT_EQ(out[4][1], 0xC0000000u);
-			EXPECT_EQ(out[4][2], 0x3FFF8000u);
-			EXPECT_EQ(out[4][3], 0x00000000u);
-
-			EXPECT_EQ(out[5][0], 0x3FFFFFFFu);
-			EXPECT_EQ(out[5][1], 0x3FFFFFFFu);
-			EXPECT_EQ(out[5][2], 0x40008000u);
-			EXPECT_EQ(out[5][3], 0xFFFFFFFEu);
-		}
+		Return(0);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		unsigned int out[6][4];
+
+		memset(&out, 0, sizeof(out));
+
+		routine(&out);
+
+		EXPECT_EQ(out[0][0], 0x00080002u);
+		EXPECT_EQ(out[0][1], 0x008D000Fu);
+
+		EXPECT_EQ(out[1][0], 0x00080002u);
+		EXPECT_EQ(out[1][1], 0xE8C0000Fu);
+
+		EXPECT_EQ(out[2][0], 0x00000000u);
+		EXPECT_EQ(out[2][1], 0xFFFFFE9Cu);
+		EXPECT_EQ(out[2][2], 0xFFFFFF23u);
+		EXPECT_EQ(out[2][3], 0x01480000u);
+
+		EXPECT_EQ(out[3][0], 0x00000000u);
+		EXPECT_EQ(out[3][1], 0x00000179u);
+		EXPECT_EQ(out[3][2], 0x00000311u);
+		EXPECT_EQ(out[3][3], 0xB5680000u);
+
+		EXPECT_EQ(out[4][0], 0x3FFFFFFFu);
+		EXPECT_EQ(out[4][1], 0xC0000000u);
+		EXPECT_EQ(out[4][2], 0x3FFF8000u);
+		EXPECT_EQ(out[4][3], 0x00000000u);
+
+		EXPECT_EQ(out[5][0], 0x3FFFFFFFu);
+		EXPECT_EQ(out[5][1], 0x3FFFFFFFu);
+		EXPECT_EQ(out[5][2], 0x40008000u);
+		EXPECT_EQ(out[5][3], 0xFFFFFFFEu);
+	}
 }
 
 TEST(ReactorUnitTests, MulAdd)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
-		{
-			Pointer<Byte> out = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<0>();
 
-			*Pointer<Int2>(out + 8 * 0) =
-				MulAdd(Short4(0x1aa, 0x2dd, 0x3ee, 0xF422),
-				       Short4(0x1bb, 0x2cc, 0x3ff, 0xF411));
+		*Pointer<Int2>(out + 8 * 0) =
+			MulAdd(Short4(0x1aa, 0x2dd, 0x3ee, 0xF422),
+				    Short4(0x1bb, 0x2cc, 0x3ff, 0xF411));
 
-			// (U)Short8 variant is mentioned but unimplemented
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			unsigned int out[1][2];
-
-			memset(&out, 0, sizeof(out));
-
-			routine(&out);
-
-			EXPECT_EQ(out[0][0], 0x000AE34Au);
-			EXPECT_EQ(out[0][1], 0x009D5254u);
-		}
+		// (U)Short8 variant is mentioned but unimplemented
+		Return(0);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		unsigned int out[1][2];
+
+		memset(&out, 0, sizeof(out));
+
+		routine(&out);
+
+		EXPECT_EQ(out[0][0], 0x000AE34Au);
+		EXPECT_EQ(out[0][1], 0x009D5254u);
+	}
 }
 
 TEST(ReactorUnitTests, PointersEqual)
@@ -1638,80 +1590,77 @@
 // It's necessary to inspect the registers in a debugger to actually verify.)
 TEST(ReactorUnitTests, PreserveXMMRegisters)
 {
+    FunctionT<void(void*, void*)> function;
     {
-        FunctionT<void(void*, void*)> function;
-        {
-            Pointer<Byte> in = function.Arg<0>();
-            Pointer<Byte> out = function.Arg<1>();
+        Pointer<Byte> in = function.Arg<0>();
+        Pointer<Byte> out = function.Arg<1>();
 
-            Float4 a = *Pointer<Float4>(in + 16 * 0);
-            Float4 b = *Pointer<Float4>(in + 16 * 1);
-            Float4 c = *Pointer<Float4>(in + 16 * 2);
-            Float4 d = *Pointer<Float4>(in + 16 * 3);
-            Float4 e = *Pointer<Float4>(in + 16 * 4);
-            Float4 f = *Pointer<Float4>(in + 16 * 5);
-            Float4 g = *Pointer<Float4>(in + 16 * 6);
-            Float4 h = *Pointer<Float4>(in + 16 * 7);
-            Float4 i = *Pointer<Float4>(in + 16 * 8);
-            Float4 j = *Pointer<Float4>(in + 16 * 9);
-            Float4 k = *Pointer<Float4>(in + 16 * 10);
-            Float4 l = *Pointer<Float4>(in + 16 * 11);
-            Float4 m = *Pointer<Float4>(in + 16 * 12);
-            Float4 n = *Pointer<Float4>(in + 16 * 13);
-            Float4 o = *Pointer<Float4>(in + 16 * 14);
-            Float4 p = *Pointer<Float4>(in + 16 * 15);
+        Float4 a = *Pointer<Float4>(in + 16 * 0);
+        Float4 b = *Pointer<Float4>(in + 16 * 1);
+        Float4 c = *Pointer<Float4>(in + 16 * 2);
+        Float4 d = *Pointer<Float4>(in + 16 * 3);
+        Float4 e = *Pointer<Float4>(in + 16 * 4);
+        Float4 f = *Pointer<Float4>(in + 16 * 5);
+        Float4 g = *Pointer<Float4>(in + 16 * 6);
+        Float4 h = *Pointer<Float4>(in + 16 * 7);
+        Float4 i = *Pointer<Float4>(in + 16 * 8);
+        Float4 j = *Pointer<Float4>(in + 16 * 9);
+        Float4 k = *Pointer<Float4>(in + 16 * 10);
+        Float4 l = *Pointer<Float4>(in + 16 * 11);
+        Float4 m = *Pointer<Float4>(in + 16 * 12);
+        Float4 n = *Pointer<Float4>(in + 16 * 13);
+        Float4 o = *Pointer<Float4>(in + 16 * 14);
+        Float4 p = *Pointer<Float4>(in + 16 * 15);
 
-            Float4 ab = a + b;
-            Float4 cd = c + d;
-            Float4 ef = e + f;
-            Float4 gh = g + h;
-            Float4 ij = i + j;
-            Float4 kl = k + l;
-            Float4 mn = m + n;
-            Float4 op = o + p;
+        Float4 ab = a + b;
+        Float4 cd = c + d;
+        Float4 ef = e + f;
+        Float4 gh = g + h;
+        Float4 ij = i + j;
+        Float4 kl = k + l;
+        Float4 mn = m + n;
+        Float4 op = o + p;
 
-            Float4 abcd = ab + cd;
-            Float4 efgh = ef + gh;
-            Float4 ijkl = ij + kl;
-            Float4 mnop = mn + op;
+        Float4 abcd = ab + cd;
+        Float4 efgh = ef + gh;
+        Float4 ijkl = ij + kl;
+        Float4 mnop = mn + op;
 
-            Float4 abcdefgh = abcd + efgh;
-            Float4 ijklmnop = ijkl + mnop;
-            Float4 sum = abcdefgh + ijklmnop;
-            *Pointer<Float4>(out) = sum;
-            Return();
-        }
-
-        auto routine = function("one");
-        assert(routine);
-
-        float input[64] = { 1.0f,  0.0f,   0.0f, 0.0f,
-                           -1.0f,  1.0f,  -1.0f, 0.0f,
-                            1.0f,  2.0f,  -2.0f, 0.0f,
-                           -1.0f,  3.0f,  -3.0f, 0.0f,
-                            1.0f,  4.0f,  -4.0f, 0.0f,
-                           -1.0f,  5.0f,  -5.0f, 0.0f,
-                            1.0f,  6.0f,  -6.0f, 0.0f,
-                           -1.0f,  7.0f,  -7.0f, 0.0f,
-                            1.0f,  8.0f,  -8.0f, 0.0f,
-                           -1.0f,  9.0f,  -9.0f, 0.0f,
-                            1.0f, 10.0f, -10.0f, 0.0f,
-                           -1.0f, 11.0f, -11.0f, 0.0f,
-                            1.0f, 12.0f, -12.0f, 0.0f,
-                           -1.0f, 13.0f, -13.0f, 0.0f,
-                            1.0f, 14.0f, -14.0f, 0.0f,
-                           -1.0f, 15.0f, -15.0f, 0.0f };
-
-        float result[4];
-
-        routine(input, result);
-
-        EXPECT_EQ(result[0], 0.0f);
-        EXPECT_EQ(result[1], 120.0f);
-        EXPECT_EQ(result[2], -120.0f);
-        EXPECT_EQ(result[3], 0.0f);
+        Float4 abcdefgh = abcd + efgh;
+        Float4 ijklmnop = ijkl + mnop;
+        Float4 sum = abcdefgh + ijklmnop;
+        *Pointer<Float4>(out) = sum;
+        Return();
     }
 
+    auto routine = function("one");
+    assert(routine);
+
+    float input[64] = { 1.0f,  0.0f,   0.0f, 0.0f,
+                        -1.0f,  1.0f,  -1.0f, 0.0f,
+                        1.0f,  2.0f,  -2.0f, 0.0f,
+                        -1.0f,  3.0f,  -3.0f, 0.0f,
+                        1.0f,  4.0f,  -4.0f, 0.0f,
+                        -1.0f,  5.0f,  -5.0f, 0.0f,
+                        1.0f,  6.0f,  -6.0f, 0.0f,
+                        -1.0f,  7.0f,  -7.0f, 0.0f,
+                        1.0f,  8.0f,  -8.0f, 0.0f,
+                        -1.0f,  9.0f,  -9.0f, 0.0f,
+                        1.0f, 10.0f, -10.0f, 0.0f,
+                        -1.0f, 11.0f, -11.0f, 0.0f,
+                        1.0f, 12.0f, -12.0f, 0.0f,
+                        -1.0f, 13.0f, -13.0f, 0.0f,
+                        1.0f, 14.0f, -14.0f, 0.0f,
+                        -1.0f, 15.0f, -15.0f, 0.0f };
+
+    float result[4];
+
+    routine(input, result);
+
+    EXPECT_EQ(result[0], 0.0f);
+    EXPECT_EQ(result[1], 120.0f);
+    EXPECT_EQ(result[2], -120.0f);
+    EXPECT_EQ(result[3], 0.0f);
 }
 
 template <typename T>

diff --git a/src/Reactor/Routine.hpp b/src/Reactor/Routine.hpp
index 4e643fd..922d3ab 100644
--- a/src/Reactor/Routine.hpp
+++ b/src/Reactor/Routine.hpp

@@ -17,57 +17,58 @@
 
 #include <memory>
 
-namespace rr
+namespace rr {
+
+class Routine
 {
-	class Routine
+public:
+	Routine() = default;
+	virtual ~Routine() = default;
+
+	virtual const void *getEntry(int index = 0) const = 0;
+};
+
+// RoutineT is a type-safe wrapper around a Routine and its callable entry, returned by FunctionT
+template<typename FunctionType>
+class RoutineT;
+
+template<typename Return, typename... Arguments>
+class RoutineT<Return(Arguments...)>
+{
+public:
+	RoutineT() = default;
+
+	explicit RoutineT(const std::shared_ptr<Routine>& routine)
+		: routine(routine)
 	{
-	public:
-		Routine() = default;
-		virtual ~Routine() = default;
+		if (routine)
+		{
+			callable = reinterpret_cast<CallableType>(const_cast<void*>(routine->getEntry(0)));
+		}
+	}
 
-		virtual const void *getEntry(int index = 0) const = 0;
-	};
-
-	// RoutineT is a type-safe wrapper around a Routine and its callable entry, returned by FunctionT
-	template<typename FunctionType>
-	class RoutineT;
-
-	template<typename Return, typename... Arguments>
-	class RoutineT<Return(Arguments...)>
+	operator bool() const
 	{
-	public:
-		RoutineT() = default;
+		return callable != nullptr;
+	}
 
-		explicit RoutineT(const std::shared_ptr<Routine>& routine)
-			: routine(routine)
-		{
-			if (routine)
-			{
-				callable = reinterpret_cast<CallableType>(const_cast<void*>(routine->getEntry(0)));
-			}
-		}
+	template <typename... Args>
+	Return operator()(Args&&... args) const
+	{
+		return callable(std::forward<Args>(args)...);
+	}
 
-		operator bool() const
-		{
-			return callable != nullptr;
-		}
+	const void* getEntry() const
+	{
+		return reinterpret_cast<void*>(callable);
+	}
 
-		template <typename... Args>
-		Return operator()(Args&&... args) const
-		{
-			return callable(std::forward<Args>(args)...);
-		}
+private:
+	std::shared_ptr<Routine> routine;
+	using CallableType = Return(*)(Arguments...);
+	CallableType callable = nullptr;
+};
 
-		const void* getEntry() const
-		{
-			return reinterpret_cast<void*>(callable);
-		}
-
-	private:
-		std::shared_ptr<Routine> routine;
-		using CallableType = Return(*)(Arguments...);
-		CallableType callable = nullptr;
-	};
-}
+}  // namespace rr
 
 #endif   // rr_Routine_hpp

diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 8b2d19c..8683862 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp

@@ -54,1811 +54,1810 @@
 #include <limits>
 #include <iostream>
 
-namespace rr
+namespace rr { class ELFMemoryStreamer; }
+
+namespace {
+
+// Default configuration settings. Must be accessed under mutex lock.
+std::mutex defaultConfigLock;
+rr::Config &defaultConfig()
 {
-	class ELFMemoryStreamer;
+	// This uses a static in a function to avoid the cost of a global static
+	// initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
+	static rr::Config config = rr::Config::Edit()
+		.apply({});
+	return config;
 }
 
-namespace
+Ice::GlobalContext *context = nullptr;
+Ice::Cfg *function = nullptr;
+Ice::CfgNode *basicBlock = nullptr;
+Ice::CfgLocalAllocatorScope *allocator = nullptr;
+rr::ELFMemoryStreamer *routine = nullptr;
+
+std::mutex codegenMutex;
+
+Ice::ELFFileStreamer *elfFile = nullptr;
+Ice::Fdstream *out = nullptr;
+
+}  // anonymous namespace
+
+namespace {
+
+#if !defined(__i386__) && defined(_M_IX86)
+	#define __i386__ 1
+#endif
+
+#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
+	#define __x86_64__ 1
+#endif
+
+static Ice::OptLevel toIce(rr::Optimization::Level level)
 {
-	// Default configuration settings. Must be accessed under mutex lock.
-	std::mutex defaultConfigLock;
-	rr::Config &defaultConfig()
+	switch (level)
 	{
-		// This uses a static in a function to avoid the cost of a global static
-		// initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
-		static rr::Config config = rr::Config::Edit()
-			.apply({});
-		return config;
+		// Note that Opt_0 and Opt_1 are not implemented by Subzero
+		case rr::Optimization::Level::None:       return Ice::Opt_m1;
+		case rr::Optimization::Level::Less:       return Ice::Opt_m1;
+		case rr::Optimization::Level::Default:    return Ice::Opt_2;
+		case rr::Optimization::Level::Aggressive: return Ice::Opt_2;
+		default: UNREACHABLE("Unknown Optimization Level %d", int(level));
 	}
-
-	Ice::GlobalContext *context = nullptr;
-	Ice::Cfg *function = nullptr;
-	Ice::CfgNode *basicBlock = nullptr;
-	Ice::CfgLocalAllocatorScope *allocator = nullptr;
-	rr::ELFMemoryStreamer *routine = nullptr;
-
-	std::mutex codegenMutex;
-
-	Ice::ELFFileStreamer *elfFile = nullptr;
-	Ice::Fdstream *out = nullptr;
+	return Ice::Opt_2;
 }
 
-namespace
+class CPUID
 {
-	#if !defined(__i386__) && defined(_M_IX86)
-		#define __i386__ 1
-	#endif
+public:
+	const static bool ARM;
+	const static bool SSE4_1;
 
-	#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
-		#define __x86_64__ 1
-	#endif
-
-	static Ice::OptLevel toIce(rr::Optimization::Level level)
+private:
+	static void cpuid(int registers[4], int info)
 	{
-		switch (level)
-		{
-			// Note that Opt_0 and Opt_1 are not implemented by Subzero
-			case rr::Optimization::Level::None:       return Ice::Opt_m1;
-			case rr::Optimization::Level::Less:       return Ice::Opt_m1;
-			case rr::Optimization::Level::Default:    return Ice::Opt_2;
-			case rr::Optimization::Level::Aggressive: return Ice::Opt_2;
-			default: UNREACHABLE("Unknown Optimization Level %d", int(level));
-		}
-		return Ice::Opt_2;
+		#if defined(__i386__) || defined(__x86_64__)
+			#if defined(_WIN32)
+				__cpuid(registers, info);
+			#else
+				__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
+			#endif
+		#else
+			registers[0] = 0;
+			registers[1] = 0;
+			registers[2] = 0;
+			registers[3] = 0;
+		#endif
 	}
 
-	class CPUID
+	static bool detectARM()
 	{
-	public:
-		const static bool ARM;
-		const static bool SSE4_1;
+		#if defined(__arm__) || defined(__aarch64__)
+			return true;
+		#elif defined(__i386__) || defined(__x86_64__)
+			return false;
+		#elif defined(__mips__)
+			return false;
+		#else
+			#error "Unknown architecture"
+		#endif
+	}
 
-	private:
-		static void cpuid(int registers[4], int info)
-		{
-			#if defined(__i386__) || defined(__x86_64__)
-				#if defined(_WIN32)
-					__cpuid(registers, info);
-				#else
-					__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
-				#endif
-			#else
-				registers[0] = 0;
-				registers[1] = 0;
-				registers[2] = 0;
-				registers[3] = 0;
-			#endif
-		}
+	static bool detectSSE4_1()
+	{
+		#if defined(__i386__) || defined(__x86_64__)
+			int registers[4];
+			cpuid(registers, 1);
+			return (registers[2] & 0x00080000) != 0;
+		#else
+			return false;
+		#endif
+	}
+};
 
-		static bool detectARM()
-		{
-			#if defined(__arm__) || defined(__aarch64__)
-				return true;
-			#elif defined(__i386__) || defined(__x86_64__)
-				return false;
-			#elif defined(__mips__)
-				return false;
-			#else
-				#error "Unknown architecture"
-			#endif
-		}
+const bool CPUID::ARM = CPUID::detectARM();
+const bool CPUID::SSE4_1 = CPUID::detectSSE4_1();
+const bool emulateIntrinsics = false;
+const bool emulateMismatchedBitCast = CPUID::ARM;
 
-		static bool detectSSE4_1()
-		{
-			#if defined(__i386__) || defined(__x86_64__)
-				int registers[4];
-				cpuid(registers, 1);
-				return (registers[2] & 0x00080000) != 0;
-			#else
-				return false;
-			#endif
-		}
-	};
-
-	const bool CPUID::ARM = CPUID::detectARM();
-	const bool CPUID::SSE4_1 = CPUID::detectSSE4_1();
-	const bool emulateIntrinsics = false;
-	const bool emulateMismatchedBitCast = CPUID::ARM;
-
-	constexpr bool subzeroDumpEnabled = false;
-	constexpr bool subzeroEmitTextAsm = false;
+constexpr bool subzeroDumpEnabled = false;
+constexpr bool subzeroEmitTextAsm = false;
 
 #if !ALLOW_DUMP
-	static_assert(!subzeroDumpEnabled, "Compile Subzero with ALLOW_DUMP=1 for subzeroDumpEnabled");
-	static_assert(!subzeroEmitTextAsm, "Compile Subzero with ALLOW_DUMP=1 for subzeroEmitTextAsm");
+static_assert(!subzeroDumpEnabled, "Compile Subzero with ALLOW_DUMP=1 for subzeroDumpEnabled");
+static_assert(!subzeroEmitTextAsm, "Compile Subzero with ALLOW_DUMP=1 for subzeroEmitTextAsm");
 #endif
+
+}  // anonymous namespace
+
+namespace rr {
+
+const Capabilities Caps =
+{
+	false, // CoroutinesSupported
+};
+
+enum EmulatedType
+{
+	EmulatedShift = 16,
+	EmulatedV2 = 2 << EmulatedShift,
+	EmulatedV4 = 4 << EmulatedShift,
+	EmulatedV8 = 8 << EmulatedShift,
+	EmulatedBits = EmulatedV2 | EmulatedV4 | EmulatedV8,
+
+	Type_v2i32 = Ice::IceType_v4i32 | EmulatedV2,
+	Type_v4i16 = Ice::IceType_v8i16 | EmulatedV4,
+	Type_v2i16 = Ice::IceType_v8i16 | EmulatedV2,
+	Type_v8i8 =  Ice::IceType_v16i8 | EmulatedV8,
+	Type_v4i8 =  Ice::IceType_v16i8 | EmulatedV4,
+	Type_v2f32 = Ice::IceType_v4f32 | EmulatedV2,
+};
+
+class Value : public Ice::Operand {};
+class SwitchCases : public Ice::InstSwitch {};
+class BasicBlock : public Ice::CfgNode {};
+
+Ice::Type T(Type *t)
+{
+	static_assert(static_cast<unsigned int>(Ice::IceType_NUM) < static_cast<unsigned int>(EmulatedBits), "Ice::Type overlaps with our emulated types!");
+	return (Ice::Type)(reinterpret_cast<std::intptr_t>(t) & ~EmulatedBits);
 }
 
-namespace rr
+Type *T(Ice::Type t)
 {
-	const Capabilities Caps =
+	return reinterpret_cast<Type*>(t);
+}
+
+Type *T(EmulatedType t)
+{
+	return reinterpret_cast<Type*>(t);
+}
+
+Value *V(Ice::Operand *v)
+{
+	return reinterpret_cast<Value*>(v);
+}
+
+BasicBlock *B(Ice::CfgNode *b)
+{
+	return reinterpret_cast<BasicBlock*>(b);
+}
+
+static size_t typeSize(Type *type)
+{
+	if(reinterpret_cast<std::intptr_t>(type) & EmulatedBits)
 	{
-		false, // CoroutinesSupported
-	};
-
-	enum EmulatedType
-	{
-		EmulatedShift = 16,
-		EmulatedV2 = 2 << EmulatedShift,
-		EmulatedV4 = 4 << EmulatedShift,
-		EmulatedV8 = 8 << EmulatedShift,
-		EmulatedBits = EmulatedV2 | EmulatedV4 | EmulatedV8,
-
-		Type_v2i32 = Ice::IceType_v4i32 | EmulatedV2,
-		Type_v4i16 = Ice::IceType_v8i16 | EmulatedV4,
-		Type_v2i16 = Ice::IceType_v8i16 | EmulatedV2,
-		Type_v8i8 =  Ice::IceType_v16i8 | EmulatedV8,
-		Type_v4i8 =  Ice::IceType_v16i8 | EmulatedV4,
-		Type_v2f32 = Ice::IceType_v4f32 | EmulatedV2,
-	};
-
-	class Value : public Ice::Operand {};
-	class SwitchCases : public Ice::InstSwitch {};
-	class BasicBlock : public Ice::CfgNode {};
-
-	Ice::Type T(Type *t)
-	{
-		static_assert(static_cast<unsigned int>(Ice::IceType_NUM) < static_cast<unsigned int>(EmulatedBits), "Ice::Type overlaps with our emulated types!");
-		return (Ice::Type)(reinterpret_cast<std::intptr_t>(t) & ~EmulatedBits);
-	}
-
-	Type *T(Ice::Type t)
-	{
-		return reinterpret_cast<Type*>(t);
-	}
-
-	Type *T(EmulatedType t)
-	{
-		return reinterpret_cast<Type*>(t);
-	}
-
-	Value *V(Ice::Operand *v)
-	{
-		return reinterpret_cast<Value*>(v);
-	}
-
-	BasicBlock *B(Ice::CfgNode *b)
-	{
-		return reinterpret_cast<BasicBlock*>(b);
-	}
-
-	static size_t typeSize(Type *type)
-	{
-		if(reinterpret_cast<std::intptr_t>(type) & EmulatedBits)
+		switch(reinterpret_cast<std::intptr_t>(type))
 		{
-			switch(reinterpret_cast<std::intptr_t>(type))
-			{
-			case Type_v2i32: return 8;
-			case Type_v4i16: return 8;
-			case Type_v2i16: return 4;
-			case Type_v8i8:  return 8;
-			case Type_v4i8:  return 4;
-			case Type_v2f32: return 8;
-			default: ASSERT(false);
-			}
+		case Type_v2i32: return 8;
+		case Type_v4i16: return 8;
+		case Type_v2i16: return 4;
+		case Type_v8i8:  return 8;
+		case Type_v4i8:  return 4;
+		case Type_v2f32: return 8;
+		default: ASSERT(false);
+		}
+	}
+
+	return Ice::typeWidthInBytes(T(type));
+}
+
+using ElfHeader = std::conditional<sizeof(void*) == 8, Elf64_Ehdr, Elf32_Ehdr>::type;
+using SectionHeader = std::conditional<sizeof(void*) == 8, Elf64_Shdr, Elf32_Shdr>::type;
+
+inline const SectionHeader *sectionHeader(const ElfHeader *elfHeader)
+{
+	return reinterpret_cast<const SectionHeader*>((intptr_t)elfHeader + elfHeader->e_shoff);
+}
+
+inline const SectionHeader *elfSection(const ElfHeader *elfHeader, int index)
+{
+	return &sectionHeader(elfHeader)[index];
+}
+
+static void *relocateSymbol(const ElfHeader *elfHeader, const Elf32_Rel &relocation, const SectionHeader &relocationTable)
+{
+	const SectionHeader *target = elfSection(elfHeader, relocationTable.sh_info);
+
+	uint32_t index = relocation.getSymbol();
+	int table = relocationTable.sh_link;
+	void *symbolValue = nullptr;
+
+	if(index != SHN_UNDEF)
+	{
+		if(table == SHN_UNDEF) return nullptr;
+		const SectionHeader *symbolTable = elfSection(elfHeader, table);
+
+		uint32_t symtab_entries = symbolTable->sh_size / symbolTable->sh_entsize;
+		if(index >= symtab_entries)
+		{
+			ASSERT(index < symtab_entries && "Symbol Index out of range");
+			return nullptr;
 		}
 
-		return Ice::typeWidthInBytes(T(type));
-	}
+		intptr_t symbolAddress = (intptr_t)elfHeader + symbolTable->sh_offset;
+		Elf32_Sym &symbol = ((Elf32_Sym*)symbolAddress)[index];
+		uint16_t section = symbol.st_shndx;
 
-	using ElfHeader = std::conditional<sizeof(void*) == 8, Elf64_Ehdr, Elf32_Ehdr>::type;
-	using SectionHeader = std::conditional<sizeof(void*) == 8, Elf64_Shdr, Elf32_Shdr>::type;
-
-	inline const SectionHeader *sectionHeader(const ElfHeader *elfHeader)
-	{
-		return reinterpret_cast<const SectionHeader*>((intptr_t)elfHeader + elfHeader->e_shoff);
-	}
-
-	inline const SectionHeader *elfSection(const ElfHeader *elfHeader, int index)
-	{
-		return &sectionHeader(elfHeader)[index];
-	}
-
-	static void *relocateSymbol(const ElfHeader *elfHeader, const Elf32_Rel &relocation, const SectionHeader &relocationTable)
-	{
-		const SectionHeader *target = elfSection(elfHeader, relocationTable.sh_info);
-
-		uint32_t index = relocation.getSymbol();
-		int table = relocationTable.sh_link;
-		void *symbolValue = nullptr;
-
-		if(index != SHN_UNDEF)
+		if(section != SHN_UNDEF && section < SHN_LORESERVE)
 		{
-			if(table == SHN_UNDEF) return nullptr;
-			const SectionHeader *symbolTable = elfSection(elfHeader, table);
-
-			uint32_t symtab_entries = symbolTable->sh_size / symbolTable->sh_entsize;
-			if(index >= symtab_entries)
-			{
-				ASSERT(index < symtab_entries && "Symbol Index out of range");
-				return nullptr;
-			}
-
-			intptr_t symbolAddress = (intptr_t)elfHeader + symbolTable->sh_offset;
-			Elf32_Sym &symbol = ((Elf32_Sym*)symbolAddress)[index];
-			uint16_t section = symbol.st_shndx;
-
-			if(section != SHN_UNDEF && section < SHN_LORESERVE)
-			{
-				const SectionHeader *target = elfSection(elfHeader, symbol.st_shndx);
-				symbolValue = reinterpret_cast<void*>((intptr_t)elfHeader + symbol.st_value + target->sh_offset);
-			}
-			else
-			{
-				return nullptr;
-			}
-		}
-
-		intptr_t address = (intptr_t)elfHeader + target->sh_offset;
-		unaligned_ptr<int32_t> patchSite = (int32_t*)(address + relocation.r_offset);
-
-		if(CPUID::ARM)
-		{
-			switch(relocation.getType())
-			{
-			case R_ARM_NONE:
-				// No relocation
-				break;
-			case R_ARM_MOVW_ABS_NC:
-				{
-					uint32_t thumb = 0;   // Calls to Thumb code not supported.
-					uint32_t lo = (uint32_t)(intptr_t)symbolValue | thumb;
-					*patchSite = (*patchSite & 0xFFF0F000) | ((lo & 0xF000) << 4) | (lo & 0x0FFF);
-				}
-				break;
-			case R_ARM_MOVT_ABS:
-				{
-					uint32_t hi = (uint32_t)(intptr_t)(symbolValue) >> 16;
-					*patchSite = (*patchSite & 0xFFF0F000) | ((hi & 0xF000) << 4) | (hi & 0x0FFF);
-				}
-				break;
-			default:
-				ASSERT(false && "Unsupported relocation type");
-				return nullptr;
-			}
+			const SectionHeader *target = elfSection(elfHeader, symbol.st_shndx);
+			symbolValue = reinterpret_cast<void*>((intptr_t)elfHeader + symbol.st_value + target->sh_offset);
 		}
 		else
 		{
-			switch(relocation.getType())
-			{
-			case R_386_NONE:
-				// No relocation
-				break;
-			case R_386_32:
-				*patchSite = (int32_t)((intptr_t)symbolValue + *patchSite);
-				break;
-			case R_386_PC32:
-				*patchSite = (int32_t)((intptr_t)symbolValue + *patchSite - (intptr_t)patchSite);
-				break;
-			default:
-				ASSERT(false && "Unsupported relocation type");
-				return nullptr;
-			}
+			return nullptr;
 		}
-
-		return symbolValue;
 	}
 
-	static void *relocateSymbol(const ElfHeader *elfHeader, const Elf64_Rela &relocation, const SectionHeader &relocationTable)
+	intptr_t address = (intptr_t)elfHeader + target->sh_offset;
+	unaligned_ptr<int32_t> patchSite = (int32_t*)(address + relocation.r_offset);
+
+	if(CPUID::ARM)
 	{
-		const SectionHeader *target = elfSection(elfHeader, relocationTable.sh_info);
-
-		uint32_t index = relocation.getSymbol();
-		int table = relocationTable.sh_link;
-		void *symbolValue = nullptr;
-
-		if(index != SHN_UNDEF)
-		{
-			if(table == SHN_UNDEF) return nullptr;
-			const SectionHeader *symbolTable = elfSection(elfHeader, table);
-
-			uint32_t symtab_entries = symbolTable->sh_size / symbolTable->sh_entsize;
-			if(index >= symtab_entries)
-			{
-				ASSERT(index < symtab_entries && "Symbol Index out of range");
-				return nullptr;
-			}
-
-			intptr_t symbolAddress = (intptr_t)elfHeader + symbolTable->sh_offset;
-			Elf64_Sym &symbol = ((Elf64_Sym*)symbolAddress)[index];
-			uint16_t section = symbol.st_shndx;
-
-			if(section != SHN_UNDEF && section < SHN_LORESERVE)
-			{
-				const SectionHeader *target = elfSection(elfHeader, symbol.st_shndx);
-				symbolValue = reinterpret_cast<void*>((intptr_t)elfHeader + symbol.st_value + target->sh_offset);
-			}
-			else
-			{
-				return nullptr;
-			}
-		}
-
-		intptr_t address = (intptr_t)elfHeader + target->sh_offset;
-		unaligned_ptr<int32_t> patchSite32 = (int32_t*)(address + relocation.r_offset);
-		unaligned_ptr<int64_t> patchSite64 = (int64_t*)(address + relocation.r_offset);
-
 		switch(relocation.getType())
 		{
-		case R_X86_64_NONE:
+		case R_ARM_NONE:
 			// No relocation
 			break;
-		case R_X86_64_64:
-			*patchSite64 = (int64_t)((intptr_t)symbolValue + *patchSite64 + relocation.r_addend);
+		case R_ARM_MOVW_ABS_NC:
+			{
+				uint32_t thumb = 0;   // Calls to Thumb code not supported.
+				uint32_t lo = (uint32_t)(intptr_t)symbolValue | thumb;
+				*patchSite = (*patchSite & 0xFFF0F000) | ((lo & 0xF000) << 4) | (lo & 0x0FFF);
+			}
 			break;
-		case R_X86_64_PC32:
-			*patchSite32 = (int32_t)((intptr_t)symbolValue + *patchSite32 - (intptr_t)patchSite32 + relocation.r_addend);
-			break;
-		case R_X86_64_32S:
-			*patchSite32 = (int32_t)((intptr_t)symbolValue + *patchSite32 + relocation.r_addend);
+		case R_ARM_MOVT_ABS:
+			{
+				uint32_t hi = (uint32_t)(intptr_t)(symbolValue) >> 16;
+				*patchSite = (*patchSite & 0xFFF0F000) | ((hi & 0xF000) << 4) | (hi & 0x0FFF);
+			}
 			break;
 		default:
 			ASSERT(false && "Unsupported relocation type");
 			return nullptr;
 		}
-
-		return symbolValue;
+	}
+	else
+	{
+		switch(relocation.getType())
+		{
+		case R_386_NONE:
+			// No relocation
+			break;
+		case R_386_32:
+			*patchSite = (int32_t)((intptr_t)symbolValue + *patchSite);
+			break;
+		case R_386_PC32:
+			*patchSite = (int32_t)((intptr_t)symbolValue + *patchSite - (intptr_t)patchSite);
+			break;
+		default:
+			ASSERT(false && "Unsupported relocation type");
+			return nullptr;
+		}
 	}
 
-	void *loadImage(uint8_t *const elfImage, size_t &codeSize)
-	{
-		ElfHeader *elfHeader = (ElfHeader*)elfImage;
+	return symbolValue;
+}
 
-		if(!elfHeader->checkMagic())
+static void *relocateSymbol(const ElfHeader *elfHeader, const Elf64_Rela &relocation, const SectionHeader &relocationTable)
+{
+	const SectionHeader *target = elfSection(elfHeader, relocationTable.sh_info);
+
+	uint32_t index = relocation.getSymbol();
+	int table = relocationTable.sh_link;
+	void *symbolValue = nullptr;
+
+	if(index != SHN_UNDEF)
+	{
+		if(table == SHN_UNDEF) return nullptr;
+		const SectionHeader *symbolTable = elfSection(elfHeader, table);
+
+		uint32_t symtab_entries = symbolTable->sh_size / symbolTable->sh_entsize;
+		if(index >= symtab_entries)
 		{
+			ASSERT(index < symtab_entries && "Symbol Index out of range");
 			return nullptr;
 		}
 
-		// Expect ELF bitness to match platform
-		ASSERT(sizeof(void*) == 8 ? elfHeader->getFileClass() == ELFCLASS64 : elfHeader->getFileClass() == ELFCLASS32);
-		#if defined(__i386__)
-			ASSERT(sizeof(void*) == 4 && elfHeader->e_machine == EM_386);
-		#elif defined(__x86_64__)
-			ASSERT(sizeof(void*) == 8 && elfHeader->e_machine == EM_X86_64);
-		#elif defined(__arm__)
-			ASSERT(sizeof(void*) == 4 && elfHeader->e_machine == EM_ARM);
-		#elif defined(__aarch64__)
-			ASSERT(sizeof(void*) == 8 && elfHeader->e_machine == EM_AARCH64);
-		#elif defined(__mips__)
-			ASSERT(sizeof(void*) == 4 && elfHeader->e_machine == EM_MIPS);
-		#else
-			#error "Unsupported platform"
-		#endif
+		intptr_t symbolAddress = (intptr_t)elfHeader + symbolTable->sh_offset;
+		Elf64_Sym &symbol = ((Elf64_Sym*)symbolAddress)[index];
+		uint16_t section = symbol.st_shndx;
 
-		SectionHeader *sectionHeader = (SectionHeader*)(elfImage + elfHeader->e_shoff);
-		void *entry = nullptr;
-
-		for(int i = 0; i < elfHeader->e_shnum; i++)
+		if(section != SHN_UNDEF && section < SHN_LORESERVE)
 		{
-			if(sectionHeader[i].sh_type == SHT_PROGBITS)
-			{
-				if(sectionHeader[i].sh_flags & SHF_EXECINSTR)
-				{
-					entry = elfImage + sectionHeader[i].sh_offset;
-					codeSize = sectionHeader[i].sh_size;
-				}
-			}
-			else if(sectionHeader[i].sh_type == SHT_REL)
-			{
-				ASSERT(sizeof(void*) == 4 && "UNIMPLEMENTED");   // Only expected/implemented for 32-bit code
+			const SectionHeader *target = elfSection(elfHeader, symbol.st_shndx);
+			symbolValue = reinterpret_cast<void*>((intptr_t)elfHeader + symbol.st_value + target->sh_offset);
+		}
+		else
+		{
+			return nullptr;
+		}
+	}
 
-				for(Elf32_Word index = 0; index < sectionHeader[i].sh_size / sectionHeader[i].sh_entsize; index++)
-				{
-					const Elf32_Rel &relocation = ((const Elf32_Rel*)(elfImage + sectionHeader[i].sh_offset))[index];
-					relocateSymbol(elfHeader, relocation, sectionHeader[i]);
-				}
-			}
-			else if(sectionHeader[i].sh_type == SHT_RELA)
-			{
-				ASSERT(sizeof(void*) == 8 && "UNIMPLEMENTED");   // Only expected/implemented for 64-bit code
+	intptr_t address = (intptr_t)elfHeader + target->sh_offset;
+	unaligned_ptr<int32_t> patchSite32 = (int32_t*)(address + relocation.r_offset);
+	unaligned_ptr<int64_t> patchSite64 = (int64_t*)(address + relocation.r_offset);
 
-				for(Elf32_Word index = 0; index < sectionHeader[i].sh_size / sectionHeader[i].sh_entsize; index++)
-				{
-					const Elf64_Rela &relocation = ((const Elf64_Rela*)(elfImage + sectionHeader[i].sh_offset))[index];
-					relocateSymbol(elfHeader, relocation, sectionHeader[i]);
-				}
+	switch(relocation.getType())
+	{
+	case R_X86_64_NONE:
+		// No relocation
+		break;
+	case R_X86_64_64:
+		*patchSite64 = (int64_t)((intptr_t)symbolValue + *patchSite64 + relocation.r_addend);
+		break;
+	case R_X86_64_PC32:
+		*patchSite32 = (int32_t)((intptr_t)symbolValue + *patchSite32 - (intptr_t)patchSite32 + relocation.r_addend);
+		break;
+	case R_X86_64_32S:
+		*patchSite32 = (int32_t)((intptr_t)symbolValue + *patchSite32 + relocation.r_addend);
+		break;
+	default:
+		ASSERT(false && "Unsupported relocation type");
+		return nullptr;
+	}
+
+	return symbolValue;
+}
+
+void *loadImage(uint8_t *const elfImage, size_t &codeSize)
+{
+	ElfHeader *elfHeader = (ElfHeader*)elfImage;
+
+	if(!elfHeader->checkMagic())
+	{
+		return nullptr;
+	}
+
+	// Expect ELF bitness to match platform
+	ASSERT(sizeof(void*) == 8 ? elfHeader->getFileClass() == ELFCLASS64 : elfHeader->getFileClass() == ELFCLASS32);
+	#if defined(__i386__)
+		ASSERT(sizeof(void*) == 4 && elfHeader->e_machine == EM_386);
+	#elif defined(__x86_64__)
+		ASSERT(sizeof(void*) == 8 && elfHeader->e_machine == EM_X86_64);
+	#elif defined(__arm__)
+		ASSERT(sizeof(void*) == 4 && elfHeader->e_machine == EM_ARM);
+	#elif defined(__aarch64__)
+		ASSERT(sizeof(void*) == 8 && elfHeader->e_machine == EM_AARCH64);
+	#elif defined(__mips__)
+		ASSERT(sizeof(void*) == 4 && elfHeader->e_machine == EM_MIPS);
+	#else
+		#error "Unsupported platform"
+	#endif
+
+	SectionHeader *sectionHeader = (SectionHeader*)(elfImage + elfHeader->e_shoff);
+	void *entry = nullptr;
+
+	for(int i = 0; i < elfHeader->e_shnum; i++)
+	{
+		if(sectionHeader[i].sh_type == SHT_PROGBITS)
+		{
+			if(sectionHeader[i].sh_flags & SHF_EXECINSTR)
+			{
+				entry = elfImage + sectionHeader[i].sh_offset;
+				codeSize = sectionHeader[i].sh_size;
 			}
 		}
+		else if(sectionHeader[i].sh_type == SHT_REL)
+		{
+			ASSERT(sizeof(void*) == 4 && "UNIMPLEMENTED");   // Only expected/implemented for 32-bit code
 
+			for(Elf32_Word index = 0; index < sectionHeader[i].sh_size / sectionHeader[i].sh_entsize; index++)
+			{
+				const Elf32_Rel &relocation = ((const Elf32_Rel*)(elfImage + sectionHeader[i].sh_offset))[index];
+				relocateSymbol(elfHeader, relocation, sectionHeader[i]);
+			}
+		}
+		else if(sectionHeader[i].sh_type == SHT_RELA)
+		{
+			ASSERT(sizeof(void*) == 8 && "UNIMPLEMENTED");   // Only expected/implemented for 64-bit code
+
+			for(Elf32_Word index = 0; index < sectionHeader[i].sh_size / sectionHeader[i].sh_entsize; index++)
+			{
+				const Elf64_Rela &relocation = ((const Elf64_Rela*)(elfImage + sectionHeader[i].sh_offset))[index];
+				relocateSymbol(elfHeader, relocation, sectionHeader[i]);
+			}
+		}
+	}
+
+	return entry;
+}
+
+template<typename T>
+struct ExecutableAllocator
+{
+	ExecutableAllocator() {}
+	template<class U> ExecutableAllocator(const ExecutableAllocator<U> &other) {}
+
+	using value_type = T;
+	using size_type = std::size_t;
+
+	T *allocate(size_type n)
+	{
+		return (T*)allocateExecutable(sizeof(T) * n);
+	}
+
+	void deallocate(T *p, size_type n)
+	{
+		deallocateExecutable(p, sizeof(T) * n);
+	}
+};
+
+class ELFMemoryStreamer : public Ice::ELFStreamer, public Routine   // Collects streamed ELF bytes in memory and exposes entry points as a Routine
+{
+	ELFMemoryStreamer(const ELFMemoryStreamer &) = delete;
+	ELFMemoryStreamer &operator=(const ELFMemoryStreamer &) = delete;
+
+public:
+	ELFMemoryStreamer() : Routine()
+	{
+		position = 0;
+		buffer.reserve(0x1000);   // Reserve 4096 bytes up front
+	}
+
+	~ELFMemoryStreamer() override
+	{
+		#if defined(_WIN32)
+			if(buffer.size() != 0)
+			{
+				DWORD exeProtection;
+				VirtualProtect(&buffer[0], buffer.size(), oldProtection, &exeProtection);   // NOTE(review): oldProtection is only assigned in finalizeEntryBegin()
+			}
+		#endif
+	}
+
+	void write8(uint8_t Value) override
+	{
+		if(position == (uint64_t)buffer.size())   // At the end: grow by one byte
+		{
+			buffer.push_back(Value);
+			position++;
+		}
+		else if(position < (uint64_t)buffer.size())   // Inside the buffer: overwrite in place
+		{
+			buffer[position] = Value;
+			position++;
+		}
+		else ASSERT(false && "UNIMPLEMENTED");   // position beyond the end is not handled
+	}
+
+	void writeBytes(llvm::StringRef Bytes) override
+	{
+		std::size_t oldSize = buffer.size();   // NOTE(review): bytes are always appended at the end, independent of position
+		buffer.resize(oldSize + Bytes.size());
+		memcpy(&buffer[oldSize], Bytes.begin(), Bytes.size());
+		position += Bytes.size();
+	}
+
+	uint64_t tell() const override { return position; }
+
+	void seek(uint64_t Off) override { position = Off; }
+
+	const void* finalizeEntryBegin()
+	{
+		position = std::numeric_limits<std::size_t>::max();   // Can't stream more data after this
+
+		size_t codeSize = 0;
+		const void *entry = loadImage(&buffer[0], codeSize);   // loadImage receives codeSize by name and it is used below as the code length
+
+#if defined(_WIN32)
+		VirtualProtect(&buffer[0], buffer.size(), PAGE_EXECUTE_READ, &oldProtection);   // Previous protection is kept for the destructor
+		FlushInstructionCache(GetCurrentProcess(), NULL, 0);
+#else
+		mprotect(&buffer[0], buffer.size(), PROT_READ | PROT_EXEC);   // Return value is not checked
+		__builtin___clear_cache((char*)entry, (char*)entry + codeSize);
+#endif
 		return entry;
 	}
 
-	template<typename T>
-	struct ExecutableAllocator
+	void setEntry(int index, const void* func)
 	{
-		ExecutableAllocator() {}
-		template<class U> ExecutableAllocator(const ExecutableAllocator<U> &other) {}
+		ASSERT(func);   // Null entry points are rejected
+		funcs[index] = func;
+	}
 
-		using value_type = T;
-		using size_type = std::size_t;
-
-		T *allocate(size_type n)
-		{
-			return (T*)allocateExecutable(sizeof(T) * n);
-		}
-
-		void deallocate(T *p, size_type n)
-		{
-			deallocateExecutable(p, sizeof(T) * n);
-		}
-	};
-
-	class ELFMemoryStreamer : public Ice::ELFStreamer, public Routine
+	const void *getEntry(int index) const override
 	{
-		ELFMemoryStreamer(const ELFMemoryStreamer &) = delete;
-		ELFMemoryStreamer &operator=(const ELFMemoryStreamer &) = delete;
+		ASSERT(funcs[index]);   // The slot must have been filled by setEntry()
+		return funcs[index];
+	}
 
-	public:
-		ELFMemoryStreamer() : Routine()
-		{
-			position = 0;
-			buffer.reserve(0x1000);
-		}
+	const void* addConstantData(const void* data, size_t size)
+	{
+		auto buf = std::unique_ptr<uint8_t[]>(new uint8_t[size]);   // Private copy owned by this object
+		memcpy(buf.get(), data, size);
+		auto ptr = buf.get();   // Taken before the move below empties buf
+		constantData.emplace_back(std::move(buf));
+		return ptr;
+	}
 
-		~ELFMemoryStreamer() override
-		{
-			#if defined(_WIN32)
-				if(buffer.size() != 0)
-				{
-					DWORD exeProtection;
-					VirtualProtect(&buffer[0], buffer.size(), oldProtection, &exeProtection);
-				}
-			#endif
-		}
+private:
+	std::array<const void*, Nucleus::CoroutineEntryCount> funcs = {};   // Entry points accessed through setEntry()/getEntry()
+	std::vector<uint8_t, ExecutableAllocator<uint8_t>> buffer;   // Streamed ELF bytes, allocated via ExecutableAllocator
+	std::size_t position;   // Current stream offset reported by tell()
+	std::vector<std::unique_ptr<uint8_t[]>> constantData;   // Owns the copies made by addConstantData()
 
-		void write8(uint8_t Value) override
+	#if defined(_WIN32)
+	DWORD oldProtection;   // Written by finalizeEntryBegin(), read by the destructor
+	#endif
+};
+
+Nucleus::Nucleus()
+{
+	::codegenMutex.lock();   // Reactor is currently not thread safe
+
+	Ice::ClFlags &Flags = Ice::ClFlags::Flags;
+	Ice::ClFlags::getParsedClFlags(Flags);
+
+	#if defined(__arm__)
+		Flags.setTargetArch(Ice::Target_ARM32);
+		Flags.setTargetInstructionSet(Ice::ARM32InstructionSet_HWDivArm);
+	#elif defined(__mips__)
+		Flags.setTargetArch(Ice::Target_MIPS32);
+		Flags.setTargetInstructionSet(Ice::BaseInstructionSet);
+	#else   // x86
+		Flags.setTargetArch(sizeof(void*) == 8 ? Ice::Target_X8664 : Ice::Target_X8632);   // Chosen from the pointer size of this build
+		Flags.setTargetInstructionSet(CPUID::SSE4_1 ? Ice::X86InstructionSet_SSE4_1 : Ice::X86InstructionSet_SSE2);
+	#endif
+	Flags.setOutFileType(Ice::FT_Elf);
+	Flags.setOptLevel(toIce(getDefaultConfig().getOptimization().getLevel()));   // Optimization level comes from the default config
+	Flags.setApplicationBinaryInterface(Ice::ABI_Platform);
+	Flags.setVerbose(subzeroDumpEnabled ? Ice::IceV_Most : Ice::IceV_None);
+	Flags.setDisableHybridAssembly(true);
+
+	static llvm::raw_os_ostream cout(std::cout);   // Function-local statics; their addresses are passed to GlobalContext below
+	static llvm::raw_os_ostream cerr(std::cerr);
+
+	if (subzeroEmitTextAsm)
+	{
+		// Decorate text asm with liveness info
+		Flags.setDecorateAsm(true);
+	}
+
+	if(false)   // Write out to a file
+	{
+		std::error_code errorCode;
+		::out = new Ice::Fdstream("out.o", errorCode, llvm::sys::fs::F_None);
+		::elfFile = new Ice::ELFFileStreamer(*out);
+		::context = new Ice::GlobalContext(&cout, &cout, &cerr, elfFile);
+	}
+	else
+	{
+		ELFMemoryStreamer *elfMemory = new ELFMemoryStreamer();   // Also becomes ::routine, so it serves as both ELF sink and result
+		::context = new Ice::GlobalContext(&cout, &cout, &cerr, elfMemory);
+		::routine = elfMemory;
+	}
+}
+
+Nucleus::~Nucleus()
+{
+	delete ::routine;   // Null if acquireRoutine() already handed the routine off
+
+	delete ::allocator;
+	delete ::function;
+	delete ::context;
+
+	delete ::elfFile;
+	delete ::out;
+
+	::codegenMutex.unlock();   // Pairs with the lock() taken in the constructor
+}
+
+void Nucleus::setDefaultConfig(const Config &cfg)
+{
+	std::unique_lock<std::mutex> lock(::defaultConfigLock);   // Held for the duration of the assignment
+	::defaultConfig() = cfg;
+}
+
+void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
+{
+	std::unique_lock<std::mutex> lock(::defaultConfigLock);   // Covers both the read and the write-back
+	auto &config = ::defaultConfig();
+	config = cfgEdit.apply(config);   // Replace the default with the edited version of itself
+}
+
+Config Nucleus::getDefaultConfig()
+{
+	std::unique_lock<std::mutex> lock(::defaultConfigLock);
+	return ::defaultConfig();   // Returned by value, copied while the lock is held
+}
+
+std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
+{
+	if (subzeroDumpEnabled)   // NOTE(review): cfgEdit is not referenced anywhere in this function body
+	{
+		// Output dump strings immediately, rather than once buffer is full. Useful for debugging.
+		context->getStrDump().SetUnbuffered();
+	}
+
+	if(basicBlock->getInsts().empty() || basicBlock->getInsts().back().getKind() != Ice::Inst::Ret)   // Current block does not end in a Ret
+	{
+		createRetVoid();
+	}
+
+	::function->setFunctionName(Ice::GlobalString::createWithString(::context, name));
+
+	rr::optimize(::function);
+
+	::function->computeInOutEdges();
+	ASSERT(!::function->hasError());
+
+	::function->translate();
+	ASSERT(!::function->hasError());
+
+	auto globals = ::function->getGlobalInits();
+
+	if(globals && !globals->empty())
+	{
+		::context->getGlobals()->merge(globals.get());
+	}
+
+	::context->emitFileHeader();
+
+	if (subzeroEmitTextAsm)
+	{
+		::function->emit();
+	}
+
+	::function->emitIAS();
+	auto assembler = ::function->releaseAssembler();
+	auto objectWriter = ::context->getObjectWriter();
+	assembler->alignFunction();
+	objectWriter->writeFunctionCode(::function->getFunctionName(), false, assembler.get());
+	::context->lowerGlobals("last");
+	::context->lowerConstants();
+	::context->lowerJumpTables();
+	objectWriter->setUndefinedSyms(::context->getConstantExternSyms());
+	objectWriter->writeNonUserSections();
+
+	const void* entryBegin = ::routine->finalizeEntryBegin();
+	::routine->setEntry(Nucleus::CoroutineEntryBegin, entryBegin);
+
+	Routine *handoffRoutine = ::routine;
+	::routine = nullptr;   // Ownership moves to the shared_ptr returned below
+
+	return std::shared_ptr<Routine>(handoffRoutine);
+}
+
+Value *Nucleus::allocateStackVariable(Type *t, int arraySize)
+{
+	Ice::Type type = T(t);
+	int typeSize = Ice::typeWidthInBytes(type);
+	int totalSize = typeSize * (arraySize ? arraySize : 1);   // arraySize == 0 is treated as a single element
+
+	auto bytes = Ice::ConstantInteger32::create(::context, Ice::IceType_i32, totalSize);
+	auto address = ::function->makeVariable(T(getPointerType(t)));
+	auto alloca = Ice::InstAlloca::create(::function, address, bytes, typeSize);   // typeSize is passed as the last argument as well as sizing the total
+	::function->getEntryNode()->getInsts().push_front(alloca);   // Inserted at the front of the entry node, not in the current block
+
+	return V(address);
+}
+
+BasicBlock *Nucleus::createBasicBlock()
+{   // Returns a new node made by ::function
+	return B(::function->makeNode());
+}
+
+BasicBlock *Nucleus::getInsertBlock()
+{   // Returns the block that instructions are currently appended to
+	return B(::basicBlock);
+}
+
+void Nucleus::setInsertBlock(BasicBlock *basicBlock)
+{
+//	ASSERT(::basicBlock->getInsts().back().getTerminatorEdges().size() >= 0 && "Previous basic block must have a terminator");
+
+	Variable::materializeAll();   // Runs before the switch, while the previous block is still current
+
+	::basicBlock = basicBlock;
+}
+
+void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
+{   // NOTE(review): ReturnType is not referenced in this body
+	uint32_t sequenceNumber = 0;
+	::function = Ice::Cfg::create(::context, sequenceNumber).release();   // Raw pointer is deleted in ~Nucleus()
+	::allocator = new Ice::CfgLocalAllocatorScope(::function);
+
+	for(Type *type : Params)   // One Cfg argument per entry in Params, in order
+	{
+		Ice::Variable *arg = ::function->makeVariable(T(type));
+		::function->addArg(arg);
+	}
+
+	Ice::CfgNode *node = ::function->makeNode();
+	::function->setEntryNode(node);
+	::basicBlock = node;   // The entry node becomes the current insert block
+}
+
+Value *Nucleus::getArgument(unsigned int index)
+{   // Returns the index-th argument of ::function; index is not range-checked here
+	return V(::function->getArgs()[index]);
+}
+
+void Nucleus::createRetVoid()
+{
+	// Code generated after this point is unreachable, so any variables
+	// being read can safely return an undefined value. We have to avoid
+	// materializing variables after the terminator ret instruction.
+	Variable::killUnmaterialized();
+
+	Ice::InstRet *ret = Ice::InstRet::create(::function);   // Ret without a value
+	::basicBlock->appendInst(ret);
+}
+
+void Nucleus::createRet(Value *v)
+{
+	// Code generated after this point is unreachable, so any variables
+	// being read can safely return an undefined value. We have to avoid
+	// materializing variables after the terminator ret instruction.
+	Variable::killUnmaterialized();
+
+	Ice::InstRet *ret = Ice::InstRet::create(::function, v);   // Ret carrying v as the return value
+	::basicBlock->appendInst(ret);
+}
+
+void Nucleus::createBr(BasicBlock *dest)
+{
+	Variable::materializeAll();   // Done before the branch is appended
+
+	auto br = Ice::InstBr::create(::function, dest);   // Unconditional branch to dest
+	::basicBlock->appendInst(br);
+}
+
+void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
+{
+	Variable::materializeAll();   // Done before the branch is appended
+
+	auto br = Ice::InstBr::create(::function, cond, ifTrue, ifFalse);   // Conditional branch on cond
+	::basicBlock->appendInst(br);
+}
+
+static bool isCommutative(Ice::InstArithmetic::OpKind op)
+{   // True for Add, Fadd, Mul, Fmul, And, Or and Xor; false for every other OpKind
+	switch(op)
+	{
+	case Ice::InstArithmetic::Add:
+	case Ice::InstArithmetic::Fadd:
+	case Ice::InstArithmetic::Mul:
+	case Ice::InstArithmetic::Fmul:
+	case Ice::InstArithmetic::And:
+	case Ice::InstArithmetic::Or:
+	case Ice::InstArithmetic::Xor:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static Value *createArithmetic(Ice::InstArithmetic::OpKind op, Value *lhs, Value *rhs)
+{
+	ASSERT(lhs->getType() == rhs->getType() || llvm::isa<Ice::Constant>(rhs));   // Types must match unless rhs is a constant
+
+	bool swapOperands = llvm::isa<Ice::Constant>(lhs) && isCommutative(op);   // A constant lhs is moved to the right for commutative ops
+
+	Ice::Variable *result = ::function->makeVariable(lhs->getType());   // Result type follows the original lhs
+	Ice::InstArithmetic *arithmetic = Ice::InstArithmetic::create(::function, op, result, swapOperands ? rhs : lhs, swapOperands ? lhs : rhs);
+	::basicBlock->appendInst(arithmetic);
+
+	return V(result);
+}
+
+Value *Nucleus::createAdd(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Add
+	return createArithmetic(Ice::InstArithmetic::Add, lhs, rhs);
+}
+
+Value *Nucleus::createSub(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Sub
+	return createArithmetic(Ice::InstArithmetic::Sub, lhs, rhs);
+}
+
+Value *Nucleus::createMul(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Mul
+	return createArithmetic(Ice::InstArithmetic::Mul, lhs, rhs);
+}
+
+Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Udiv
+	return createArithmetic(Ice::InstArithmetic::Udiv, lhs, rhs);
+}
+
+Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Sdiv
+	return createArithmetic(Ice::InstArithmetic::Sdiv, lhs, rhs);
+}
+
+Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Fadd
+	return createArithmetic(Ice::InstArithmetic::Fadd, lhs, rhs);
+}
+
+Value *Nucleus::createFSub(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Fsub
+	return createArithmetic(Ice::InstArithmetic::Fsub, lhs, rhs);
+}
+
+Value *Nucleus::createFMul(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Fmul
+	return createArithmetic(Ice::InstArithmetic::Fmul, lhs, rhs);
+}
+
+Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Fdiv
+	return createArithmetic(Ice::InstArithmetic::Fdiv, lhs, rhs);
+}
+
+Value *Nucleus::createURem(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Urem
+	return createArithmetic(Ice::InstArithmetic::Urem, lhs, rhs);
+}
+
+Value *Nucleus::createSRem(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Srem
+	return createArithmetic(Ice::InstArithmetic::Srem, lhs, rhs);
+}
+
+Value *Nucleus::createFRem(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Frem
+	return createArithmetic(Ice::InstArithmetic::Frem, lhs, rhs);
+}
+
+Value *Nucleus::createShl(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Shl
+	return createArithmetic(Ice::InstArithmetic::Shl, lhs, rhs);
+}
+
+Value *Nucleus::createLShr(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Lshr
+	return createArithmetic(Ice::InstArithmetic::Lshr, lhs, rhs);
+}
+
+Value *Nucleus::createAShr(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Ashr
+	return createArithmetic(Ice::InstArithmetic::Ashr, lhs, rhs);
+}
+
+Value *Nucleus::createAnd(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::And
+	return createArithmetic(Ice::InstArithmetic::And, lhs, rhs);
+}
+
+Value *Nucleus::createOr(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Or
+	return createArithmetic(Ice::InstArithmetic::Or, lhs, rhs);
+}
+
+Value *Nucleus::createXor(Value *lhs, Value *rhs)
+{   // Forwards to createArithmetic with InstArithmetic::Xor
+	return createArithmetic(Ice::InstArithmetic::Xor, lhs, rhs);
+}
+
+Value *Nucleus::createNeg(Value *v)
+{   // Negation is emitted as (null value of v's type) - v
+	return createSub(createNullValue(T(v->getType())), v);
+}
+
+Value *Nucleus::createFNeg(Value *v)
+{   // Floating-point negation is emitted as (-0.0) - v
+	double c[4] = {-0.0, -0.0, -0.0, -0.0};
+	Value *negativeZero = Ice::isVectorType(v->getType()) ?
+	                      createConstantVector(c, T(v->getType())) :
+	                      V(::context->getConstantFloat(-0.0f));
+
+	return createFSub(negativeZero, v);
+}
+
+Value *Nucleus::createNot(Value *v)
+{   // Bitwise NOT is emitted as v XOR (all elements -1)
+	if(Ice::isScalarIntegerType(v->getType()))
+	{
+		return createXor(v, V(::context->getConstantInt(v->getType(), -1)));
+	}
+	else   // Vector
+	{
+		int64_t c[16] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+		return createXor(v, createConstantVector(c, T(v->getType())));
+	}
+}
+
+Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int align, bool atomic, std::memory_order memoryOrder)
+{
+	ASSERT(!atomic);  // Unimplemented
+	ASSERT(memoryOrder == std::memory_order_relaxed);  // Unimplemented
+
+	int valueType = (int)reinterpret_cast<intptr_t>(type);   // The Type pointer is reinterpreted as an integer so it can be tested against EmulatedBits
+	Ice::Variable *result = ::function->makeVariable(T(type));
+
+	if((valueType & EmulatedBits) && (align != 0))   // Narrow vector not stored on stack.
+	{
+		if(emulateIntrinsics)
 		{
-			if(position == (uint64_t)buffer.size())
+			if(typeSize(type) == 4)   // Read one Int and place it in element 0
 			{
-				buffer.push_back(Value);
-				position++;
+				auto pointer = RValue<Pointer<Byte>>(ptr);
+				Int x = *Pointer<Int>(pointer);
+
+				Int4 vector;
+				vector = Insert(vector, x, 0);
+
+				auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
+				::basicBlock->appendInst(bitcast);
 			}
-			else if(position < (uint64_t)buffer.size())
+			else if(typeSize(type) == 8)   // Read two Ints (byte offsets 0 and 4) into elements 0 and 1
 			{
-				buffer[position] = Value;
-				position++;
+				auto pointer = RValue<Pointer<Byte>>(ptr);
+				Int x = *Pointer<Int>(pointer);
+				Int y = *Pointer<Int>(pointer + 4);
+
+				Int4 vector;
+				vector = Insert(vector, x, 0);
+				vector = Insert(vector, y, 1);
+
+				auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
+				::basicBlock->appendInst(bitcast);
 			}
-			else ASSERT(false && "UNIMPLEMENTED");
+			else UNREACHABLE("typeSize(type): %d", int(typeSize(type)));
 		}
-
-		void writeBytes(llvm::StringRef Bytes) override
+		else   // Emit a LoadSubVector intrinsic call taking the pointer and the byte size
 		{
-			std::size_t oldSize = buffer.size();
-			buffer.resize(oldSize + Bytes.size());
-			memcpy(&buffer[oldSize], Bytes.begin(), Bytes.size());
-			position += Bytes.size();
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto load = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			load->addArg(ptr);
+			load->addArg(::context->getConstantInt32(typeSize(type)));
+			::basicBlock->appendInst(load);
 		}
+	}
+	else   // Everything else uses a plain InstLoad with the given alignment
+	{
+		auto load = Ice::InstLoad::create(::function, result, ptr, align);
+		::basicBlock->appendInst(load);
+	}
 
-		uint64_t tell() const override { return position; }
+	return V(result);
+}
 
-		void seek(uint64_t Off) override { position = Off; }
+Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int align, bool atomic, std::memory_order memoryOrder)
+{
+	ASSERT(!atomic);  // Unimplemented
+	ASSERT(memoryOrder == std::memory_order_relaxed);  // Unimplemented
 
-		const void* finalizeEntryBegin()
+	#if __has_feature(memory_sanitizer)
+		// Mark all (non-stack) memory writes as initialized by calling __msan_unpoison
+		if(align != 0)
 		{
-			position = std::numeric_limits<std::size_t>::max();   // Can't stream more data after this
-
-			size_t codeSize = 0;
-			const void *entry = loadImage(&buffer[0], codeSize);
-
-#if defined(_WIN32)
-			VirtualProtect(&buffer[0], buffer.size(), PAGE_EXECUTE_READ, &oldProtection);
-			FlushInstructionCache(GetCurrentProcess(), NULL, 0);
-#else
-			mprotect(&buffer[0], buffer.size(), PROT_READ | PROT_EXEC);
-			__builtin___clear_cache((char*)entry, (char*)entry + codeSize);
-#endif
-			return entry;
+			auto call = Ice::InstCall::create(::function, 2, nullptr, ::context->getConstantInt64(reinterpret_cast<intptr_t>(__msan_unpoison)), false);
+			call->addArg(ptr);
+			call->addArg(::context->getConstantInt64(typeSize(type)));
+			::basicBlock->appendInst(call);
 		}
+	#endif
 
-		void setEntry(int index, const void* func)
+	int valueType = (int)reinterpret_cast<intptr_t>(type);   // The Type pointer is reinterpreted as an integer so it can be tested against EmulatedBits
+
+	if((valueType & EmulatedBits) && (align != 0))   // Narrow vector not stored on stack.
+	{
+		if(emulateIntrinsics)
 		{
-			ASSERT(func);
-			funcs[index] = func;
+			if(typeSize(type) == 4)   // Write element 0 as one Int
+			{
+				Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
+				auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
+				::basicBlock->appendInst(bitcast);
+
+				RValue<Int4> v(V(vector));
+
+				auto pointer = RValue<Pointer<Byte>>(ptr);
+				Int x = Extract(v, 0);
+				*Pointer<Int>(pointer) = x;
+			}
+			else if(typeSize(type) == 8)   // Write elements 0 and 1 as two Ints at byte offsets 0 and 4
+			{
+				Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
+				auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
+				::basicBlock->appendInst(bitcast);
+
+				RValue<Int4> v(V(vector));
+
+				auto pointer = RValue<Pointer<Byte>>(ptr);
+				Int x = Extract(v, 0);
+				*Pointer<Int>(pointer) = x;
+				Int y = Extract(v, 1);
+				*Pointer<Int>(pointer + 4) = y;
+			}
+			else UNREACHABLE("typeSize(type): %d", int(typeSize(type)));
 		}
-
-		const void *getEntry(int index) const override
+		else   // Emit a StoreSubVector intrinsic call taking the value, the pointer and the byte size
 		{
-			ASSERT(funcs[index]);
-			return funcs[index];
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto store = Ice::InstIntrinsicCall::create(::function, 3, nullptr, target, intrinsic);
+			store->addArg(value);
+			store->addArg(ptr);
+			store->addArg(::context->getConstantInt32(typeSize(type)));
+			::basicBlock->appendInst(store);
 		}
+	}
+	else   // Everything else uses a plain InstStore with the given alignment
+	{
+		ASSERT(value->getType() == T(type));
 
-		const void* addConstantData(const void* data, size_t size)
+		auto store = Ice::InstStore::create(::function, value, ptr, align);
+		::basicBlock->appendInst(store);
+	}
+
+	return value;   // The stored value is handed back unchanged
+}
+
+Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
+{
+	ASSERT(index->getType() == Ice::IceType_i32);   // Only 32-bit indices are accepted
+
+	if(auto *constant = llvm::dyn_cast<Ice::ConstantInteger32>(index))   // Constant index: compute the byte offset here
+	{
+		int32_t offset = constant->getValue() * (int)typeSize(type);
+
+		if(offset == 0)   // Zero offset needs no add instruction
 		{
-			auto buf = std::unique_ptr<uint8_t[]>(new uint8_t[size]);
-			memcpy(buf.get(), data, size);
-			auto ptr = buf.get();
-			constantData.emplace_back(std::move(buf));
 			return ptr;
 		}
 
-	private:
-		std::array<const void*, Nucleus::CoroutineEntryCount> funcs = {};
-		std::vector<uint8_t, ExecutableAllocator<uint8_t>> buffer;
-		std::size_t position;
-		std::vector<std::unique_ptr<uint8_t[]>> constantData;
+		return createAdd(ptr, createConstantInt(offset));
+	}
 
-		#if defined(_WIN32)
-		DWORD oldProtection;
+	if(!Ice::isByteSizedType(T(type)))   // Scale the index by the element size unless the type is byte-sized
+	{
+		index = createMul(index, createConstantInt((int)typeSize(type)));
+	}
+
+	if(sizeof(void*) == 8)   // With 8-byte pointers, widen the i32 index to i64 before adding
+	{
+		if(unsignedIndex)
+		{
+			index = createZExt(index, T(Ice::IceType_i64));
+		}
+		else
+		{
+			index = createSExt(index, T(Ice::IceType_i64));
+		}
+	}
+
+	return createAdd(ptr, index);
+}
+
+Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
+{   // Stub: reports UNIMPLEMENTED and returns nullptr; the arguments are ignored
+	UNIMPLEMENTED("createAtomicAdd");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
+{   // Stub: reports UNIMPLEMENTED and returns nullptr; the arguments are ignored
+	UNIMPLEMENTED("createAtomicSub");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
+{   // Stub: reports UNIMPLEMENTED and returns nullptr; the arguments are ignored
+	UNIMPLEMENTED("createAtomicAnd");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
+{   // Stub: reports UNIMPLEMENTED and returns nullptr; the arguments are ignored
+	UNIMPLEMENTED("createAtomicOr");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
+{   // Stub: reports UNIMPLEMENTED and returns nullptr; the arguments are ignored
+	UNIMPLEMENTED("createAtomicXor");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
+{   // Stub: reports UNIMPLEMENTED and returns nullptr; the arguments are ignored
+	UNIMPLEMENTED("createAtomicMin");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
+{   // Stub: reports UNIMPLEMENTED and returns nullptr; the arguments are ignored
+	UNIMPLEMENTED("createAtomicMax");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
+{   // Stub: reports UNIMPLEMENTED and returns nullptr; the arguments are ignored
+	UNIMPLEMENTED("createAtomicUMin");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
+{   // Stub: reports UNIMPLEMENTED and returns nullptr; the arguments are ignored
+	UNIMPLEMENTED("createAtomicUMax");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
+{   // Stub: reports UNIMPLEMENTED and returns nullptr; the arguments are ignored
+	UNIMPLEMENTED("createAtomicExchange");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
+{   // Stub: reports UNIMPLEMENTED and returns nullptr; the arguments are ignored
+	UNIMPLEMENTED("createAtomicCompareExchange");
+	return nullptr;
+}
+
+static Value *createCast(Ice::InstCast::OpKind op, Value *v, Type *destType)
+{
+	if(v->getType() == T(destType))   // Already the destination type: no instruction is emitted
+	{
+		return v;
+	}
+
+	Ice::Variable *result = ::function->makeVariable(T(destType));
+	Ice::InstCast *cast = Ice::InstCast::create(::function, op, result, v);
+	::basicBlock->appendInst(cast);
+
+	return V(result);
+}
+
+Value *Nucleus::createTrunc(Value *v, Type *destType)
+{   // Forwards to createCast with InstCast::Trunc
+	return createCast(Ice::InstCast::Trunc, v, destType);
+}
+
+Value *Nucleus::createZExt(Value *v, Type *destType)
+{   // Forwards to createCast with InstCast::Zext
+	return createCast(Ice::InstCast::Zext, v, destType);
+}
+
+Value *Nucleus::createSExt(Value *v, Type *destType)
+{   // Forwards to createCast with InstCast::Sext
+	return createCast(Ice::InstCast::Sext, v, destType);
+}
+
+Value *Nucleus::createFPToUI(Value *v, Type *destType)
+{   // Forwards to createCast with InstCast::Fptoui
+	return createCast(Ice::InstCast::Fptoui, v, destType);
+}
+
+Value *Nucleus::createFPToSI(Value *v, Type *destType)
+{   // Forwards to createCast with InstCast::Fptosi
+	return createCast(Ice::InstCast::Fptosi, v, destType);
+}
+
+Value *Nucleus::createSIToFP(Value *v, Type *destType)
+{   // Forwards to createCast with InstCast::Sitofp
+	return createCast(Ice::InstCast::Sitofp, v, destType);
+}
+
+Value *Nucleus::createFPTrunc(Value *v, Type *destType)
+{   // Forwards to createCast with InstCast::Fptrunc
+	return createCast(Ice::InstCast::Fptrunc, v, destType);
+}
+
+Value *Nucleus::createFPExt(Value *v, Type *destType)
+{   // Forwards to createCast with InstCast::Fpext
+	return createCast(Ice::InstCast::Fpext, v, destType);
+}
+
+Value *Nucleus::createBitCast(Value *v, Type *destType)
+{
+	// Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
+	// support for casting between scalars and wide vectors. For platforms where this is not supported,
+	// emulate them by writing to the stack and reading back as the destination type.
+	if(emulateMismatchedBitCast)
+	{
+		if(!Ice::isVectorType(v->getType()) && Ice::isVectorType(T(destType)))   // Scalar to vector: the stack slot has the destination type
+		{
+			Value *address = allocateStackVariable(destType);
+			createStore(v, address, T(v->getType()));
+			return createLoad(address, destType);
+		}
+		else if(Ice::isVectorType(v->getType()) && !Ice::isVectorType(T(destType)))   // Vector to scalar: the stack slot has the source type
+		{
+			Value *address = allocateStackVariable(T(v->getType()));
+			createStore(v, address, T(v->getType()));
+			return createLoad(address, destType);
+		}
+	}
+
+	return createCast(Ice::InstCast::Bitcast, v, destType);   // All remaining cases use a regular Bitcast
+}
+
+static Value *createIntCompare(Ice::InstIcmp::ICond condition, Value *lhs, Value *rhs)
+{   // Result is i1 for scalar integer operands, otherwise the operand type itself
+	ASSERT(lhs->getType() == rhs->getType());
+
+	auto result = ::function->makeVariable(Ice::isScalarIntegerType(lhs->getType()) ? Ice::IceType_i1 : lhs->getType());
+	auto cmp = Ice::InstIcmp::create(::function, condition, result, lhs, rhs);
+	::basicBlock->appendInst(cmp);
+
+	return V(result);
+}
+
+Value *Nucleus::createPtrEQ(Value *lhs, Value *rhs)
+{   // Pointer equality uses the same integer Eq compare as createICmpEQ
+	return createIntCompare(Ice::InstIcmp::Eq, lhs, rhs);
+}
+
+Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
+{   // Forwards to createIntCompare with InstIcmp::Eq
+	return createIntCompare(Ice::InstIcmp::Eq, lhs, rhs);
+}
+
+Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
+{   // Forwards to createIntCompare with InstIcmp::Ne
+	return createIntCompare(Ice::InstIcmp::Ne, lhs, rhs);
+}
+
+Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
+{   // Forwards to createIntCompare with InstIcmp::Ugt
+	return createIntCompare(Ice::InstIcmp::Ugt, lhs, rhs);
+}
+
+Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
+{   // Forwards to createIntCompare with InstIcmp::Uge
+	return createIntCompare(Ice::InstIcmp::Uge, lhs, rhs);
+}
+
+Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
+{   // Forwards to createIntCompare with InstIcmp::Ult
+	return createIntCompare(Ice::InstIcmp::Ult, lhs, rhs);
+}
+
+Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
+{   // Forwards to createIntCompare with InstIcmp::Ule
+	return createIntCompare(Ice::InstIcmp::Ule, lhs, rhs);
+}
+
+Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
+{   // Forwards to createIntCompare with InstIcmp::Sgt
+	return createIntCompare(Ice::InstIcmp::Sgt, lhs, rhs);
+}
+
+Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
+{   // Forwards to createIntCompare with InstIcmp::Sge
+	return createIntCompare(Ice::InstIcmp::Sge, lhs, rhs);
+}
+
+Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
+{   // Forwards to createIntCompare with InstIcmp::Slt
+	return createIntCompare(Ice::InstIcmp::Slt, lhs, rhs);
+}
+
+Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
+{   // Forwards to createIntCompare with InstIcmp::Sle
+	return createIntCompare(Ice::InstIcmp::Sle, lhs, rhs);
+}
+
+static Value *createFloatCompare(Ice::InstFcmp::FCond condition, Value *lhs, Value *rhs)
+{   // Result is i1 for scalar floating-point operands and v4i32 for v4f32 operands
+	ASSERT(lhs->getType() == rhs->getType());
+	ASSERT(Ice::isScalarFloatingType(lhs->getType()) || lhs->getType() == Ice::IceType_v4f32);   // v4f32 is the only vector type accepted
+
+	auto result = ::function->makeVariable(Ice::isScalarFloatingType(lhs->getType()) ? Ice::IceType_i1 : Ice::IceType_v4i32);
+	auto cmp = Ice::InstFcmp::create(::function, condition, result, lhs, rhs);
+	::basicBlock->appendInst(cmp);
+
+	return V(result);
+}
+
+Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
+{   // Forwards to createFloatCompare with InstFcmp::Oeq
+	return createFloatCompare(Ice::InstFcmp::Oeq, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
+{   // Forwards to createFloatCompare with InstFcmp::Ogt
+	return createFloatCompare(Ice::InstFcmp::Ogt, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
+{   // Forwards to createFloatCompare with InstFcmp::Oge
+	return createFloatCompare(Ice::InstFcmp::Oge, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
+{   // Forwards to createFloatCompare with InstFcmp::Olt
+	return createFloatCompare(Ice::InstFcmp::Olt, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
+{   // Forwards to createFloatCompare with InstFcmp::Ole
+	return createFloatCompare(Ice::InstFcmp::Ole, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
+{   // Forwards to createFloatCompare with InstFcmp::One
+	return createFloatCompare(Ice::InstFcmp::One, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
+{   // Forwards to createFloatCompare with InstFcmp::Ord
+	return createFloatCompare(Ice::InstFcmp::Ord, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
+{   // Forwards to createFloatCompare with InstFcmp::Uno
+	return createFloatCompare(Ice::InstFcmp::Uno, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
+{   // Forwards to createFloatCompare with InstFcmp::Ueq
+	return createFloatCompare(Ice::InstFcmp::Ueq, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
+{   // Forwards to createFloatCompare with InstFcmp::Ugt
+	return createFloatCompare(Ice::InstFcmp::Ugt, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
+{   // Forwards to createFloatCompare with InstFcmp::Uge
+	return createFloatCompare(Ice::InstFcmp::Uge, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
+{   // Forwards to createFloatCompare with InstFcmp::Ult
+	return createFloatCompare(Ice::InstFcmp::Ult, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
+{   // Forwards to createFloatCompare with InstFcmp::Ule
+	return createFloatCompare(Ice::InstFcmp::Ule, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
+{   // Forwards to createFloatCompare with InstFcmp::Une
+	return createFloatCompare(Ice::InstFcmp::Une, lhs, rhs);
+}
+
+Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
+{   // Extracts element index of vector into a new variable of the given type
+	auto result = ::function->makeVariable(T(type));
+	auto extract = Ice::InstExtractElement::create(::function, result, vector, ::context->getConstantInt32(index));
+	::basicBlock->appendInst(extract);
+
+	return V(result);
+}
+
+Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
+{   // Returns a new variable of vector's type holding the result of inserting element at index
+	auto result = ::function->makeVariable(vector->getType());
+	auto insert = Ice::InstInsertElement::create(::function, result, vector, element, ::context->getConstantInt32(index));
+	::basicBlock->appendInst(insert);
+
+	return V(result);
+}
+
+Value *Nucleus::createShuffleVector(Value *V1, Value *V2, const int *select)
+{
+	ASSERT(V1->getType() == V2->getType());
+
+	int size = Ice::typeNumElements(V1->getType());   // select must provide at least this many entries
+	auto result = ::function->makeVariable(V1->getType());
+	auto shuffle = Ice::InstShuffleVector::create(::function, result, V1, V2);
+
+	for(int i = 0; i < size; i++)   // One index per element of the vector type
+	{
+		shuffle->addIndex(llvm::cast<Ice::ConstantInteger32>(::context->getConstantInt32(select[i])));
+	}
+
+	::basicBlock->appendInst(shuffle);
+
+	return V(result);
+}
+
+Value *Nucleus::createSelect(Value *C, Value *ifTrue, Value *ifFalse)
+{   // Emits an InstSelect on C; the result has the type shared by ifTrue and ifFalse
+	ASSERT(ifTrue->getType() == ifFalse->getType());
+
+	auto result = ::function->makeVariable(ifTrue->getType());
+	auto *select = Ice::InstSelect::create(::function, result, C, ifTrue, ifFalse);
+	::basicBlock->appendInst(select);
+
+	return V(result);
+}
+
+SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
+{
+	auto switchInst = Ice::InstSwitch::create(::function, numCases, control, defaultBranch);
+	::basicBlock->appendInst(switchInst);
+
+	return reinterpret_cast<SwitchCases*>(switchInst);   // The InstSwitch is handed back under the SwitchCases pointer type
+}
+
+void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
+{   // label is passed for both of addBranch's first two arguments
+	switchCases->addBranch(label, label, branch);
+}
+
+void Nucleus::createUnreachable()
+{   // Appends an InstUnreachable to the current block
+	Ice::InstUnreachable *unreachable = Ice::InstUnreachable::create(::function);
+	::basicBlock->appendInst(unreachable);
+}
+
+Type *Nucleus::getPointerType(Type *ElementType)
+{   // Pointers are i64 or i32 depending on sizeof(void*); ElementType is not consulted
+	if(sizeof(void*) == 8)
+	{
+		return T(Ice::IceType_i64);
+	}
+	else
+	{
+		return T(Ice::IceType_i32);
+	}
+}
+
+Value *Nucleus::createNullValue(Type *Ty)
+{
+	if(Ice::isVectorType(T(Ty)))   // Vectors are built from a zero-filled constant array
+	{
+		ASSERT(Ice::typeNumElements(T(Ty)) <= 16);   // The array below only has 16 entries
+		int64_t c[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+		return createConstantVector(c, Ty);
+	}
+	else
+	{
+		return V(::context->getConstantZero(T(Ty)));
+	}
+}
+
+Value *Nucleus::createConstantLong(int64_t i)
+{   // Wraps ::context->getConstantInt64
+	return V(::context->getConstantInt64(i));
+}
+
+Value *Nucleus::createConstantInt(int i)
+{   // Wraps ::context->getConstantInt32
+	return V(::context->getConstantInt32(i));
+}
+
+Value *Nucleus::createConstantInt(unsigned int i)
+{   // Unsigned overload; also wraps ::context->getConstantInt32
+	return V(::context->getConstantInt32(i));
+}
+
+Value *Nucleus::createConstantBool(bool b)
+{   // Wraps ::context->getConstantInt1
+	return V(::context->getConstantInt1(b));
+}
+
+Value *Nucleus::createConstantByte(signed char i)
+{   // Wraps ::context->getConstantInt8
+	return V(::context->getConstantInt8(i));
+}
+
+Value *Nucleus::createConstantByte(unsigned char i)
+{   // Unsigned overload; also wraps ::context->getConstantInt8
+	return V(::context->getConstantInt8(i));
+}
+
+Value *Nucleus::createConstantShort(short i)
+{   // Wraps ::context->getConstantInt16
+	return V(::context->getConstantInt16(i));
+}
+
+Value *Nucleus::createConstantShort(unsigned short i)
+{   // Unsigned overload; also wraps ::context->getConstantInt16
+	return V(::context->getConstantInt16(i));
+}
+
+Value *Nucleus::createConstantFloat(float x)
+{   // Wraps ::context->getConstantFloat
+	return V(::context->getConstantFloat(x));
+}
+
+Value *Nucleus::createNullPointer(Type *Ty)
+{   // Null value of the pointer-sized integer type (i64 or i32); Ty is not consulted
+	return createNullValue(T(sizeof(void*) == 8 ? Ice::IceType_i64 : Ice::IceType_i32));
+}
+
+Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
+{
+	const int vectorSize = 16;
+	ASSERT(Ice::typeWidthInBytes(T(type)) == vectorSize);
+	const int alignment = vectorSize;
+	auto globalPool = ::function->getGlobalPool();
+
+	const int64_t *i = constants;
+	const double *f = reinterpret_cast<const double*>(constants);
+	Ice::VariableDeclaration::DataInitializer *dataInitializer = nullptr;
+
+	switch((int)reinterpret_cast<intptr_t>(type))
+	{
+	case Ice::IceType_v4i32:
+	case Ice::IceType_v4i1:
+		{
+			const int initializer[4] = {(int)i[0], (int)i[1], (int)i[2], (int)i[3]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	case Ice::IceType_v4f32:
+		{
+			const float initializer[4] = {(float)f[0], (float)f[1], (float)f[2], (float)f[3]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	case Ice::IceType_v8i16:
+	case Ice::IceType_v8i1:
+		{
+			const short initializer[8] = {(short)i[0], (short)i[1], (short)i[2], (short)i[3], (short)i[4], (short)i[5], (short)i[6], (short)i[7]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	case Ice::IceType_v16i8:
+	case Ice::IceType_v16i1:
+		{
+			const char initializer[16] = {(char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7], (char)i[8], (char)i[9], (char)i[10], (char)i[11], (char)i[12], (char)i[13], (char)i[14], (char)i[15]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	case Type_v2i32:
+		{
+			const int initializer[4] = {(int)i[0], (int)i[1], (int)i[0], (int)i[1]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	case Type_v2f32:
+		{
+			const float initializer[4] = {(float)f[0], (float)f[1], (float)f[0], (float)f[1]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	case Type_v4i16:
+		{
+			const short initializer[8] = {(short)i[0], (short)i[1], (short)i[2], (short)i[3], (short)i[0], (short)i[1], (short)i[2], (short)i[3]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	case Type_v8i8:
+		{
+			const char initializer[16] = {(char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	case Type_v4i8:
+		{
+			const char initializer[16] = {(char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	default:
+		UNREACHABLE("Unknown constant vector type: %d", (int)reinterpret_cast<intptr_t>(type));
+	}
+
+	auto name = Ice::GlobalString::createWithoutString(::context);
+	auto *variableDeclaration = Ice::VariableDeclaration::create(globalPool);
+	variableDeclaration->setName(name);
+	variableDeclaration->setAlignment(alignment);
+	variableDeclaration->setIsConstant(true);
+	variableDeclaration->addInitializer(dataInitializer);
+
+	::function->addGlobal(variableDeclaration);
+
+	constexpr int32_t offset = 0;
+	Ice::Operand *ptr = ::context->getConstantSym(offset, name);
+
+	Ice::Variable *result = ::function->makeVariable(T(type));
+	auto load = Ice::InstLoad::create(::function, result, ptr, alignment);
+	::basicBlock->appendInst(load);
+
+	return V(result);
+}
+
+Value *Nucleus::createConstantVector(const double *constants, Type *type)
+{
+	return createConstantVector((const int64_t*)constants, type);
+}
+
+Type *Void::getType()
+{
+	return T(Ice::IceType_void);
+}
+
+Type *Bool::getType()
+{
+	return T(Ice::IceType_i1);
+}
+
+Type *Byte::getType()
+{
+	return T(Ice::IceType_i8);
+}
+
+Type *SByte::getType()
+{
+	return T(Ice::IceType_i8);
+}
+
+Type *Short::getType()
+{
+	return T(Ice::IceType_i16);
+}
+
+Type *UShort::getType()
+{
+	return T(Ice::IceType_i16);
+}
+
+Type *Byte4::getType()
+{
+	return T(Type_v4i8);
+}
+
+Type *SByte4::getType()
+{
+	return T(Type_v4i8);
+}
+
+namespace
+{
+	RValue<Byte> SaturateUnsigned(RValue<Short> x)
+	{
+		return Byte(IfThenElse(Int(x) > 0xFF, Int(0xFF), IfThenElse(Int(x) < 0, Int(0), Int(x))));
+	}
+
+	RValue<Byte> Extract(RValue<Byte8> val, int i)
+	{
+		return RValue<Byte>(Nucleus::createExtractElement(val.value, Byte::getType(), i));
+	}
+
+	RValue<Byte8> Insert(RValue<Byte8> val, RValue<Byte> element, int i)
+	{
+		return RValue<Byte8>(Nucleus::createInsertElement(val.value, element.value, i));
+	}
+}
+
+RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
+{
+	if(emulateIntrinsics)
+	{
+		Byte8 result;
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto paddusb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		paddusb->addArg(x.value);
+		paddusb->addArg(y.value);
+		::basicBlock->appendInst(paddusb);
+
+		return RValue<Byte8>(V(result));
+	}
+}
+
+RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
+{
+	if(emulateIntrinsics)
+	{
+		Byte8 result;
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto psubusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		psubusw->addArg(x.value);
+		psubusw->addArg(y.value);
+		::basicBlock->appendInst(psubusw);
+
+		return RValue<Byte8>(V(result));
+	}
+}
+
+RValue<SByte> Extract(RValue<SByte8> val, int i)
+{
+	return RValue<SByte>(Nucleus::createExtractElement(val.value, SByte::getType(), i));
+}
+
+RValue<SByte8> Insert(RValue<SByte8> val, RValue<SByte> element, int i)
+{
+	return RValue<SByte8>(Nucleus::createInsertElement(val.value, element.value, i));
+}
+
+RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
+	{
+		SByte8 result;
+		result = Insert(result, Extract(lhs, 0) >> SByte(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> SByte(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) >> SByte(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) >> SByte(rhs), 3);
+		result = Insert(result, Extract(lhs, 4) >> SByte(rhs), 4);
+		result = Insert(result, Extract(lhs, 5) >> SByte(rhs), 5);
+		result = Insert(result, Extract(lhs, 6) >> SByte(rhs), 6);
+		result = Insert(result, Extract(lhs, 7) >> SByte(rhs), 7);
+
+		return result;
+	}
+	else
+	{
+		#if defined(__i386__) || defined(__x86_64__)
+			// SSE2 doesn't support byte vector shifts, so shift as shorts and recombine.
+			RValue<Short4> hi = (As<Short4>(lhs) >> rhs) & Short4(0xFF00u);
+			RValue<Short4> lo = As<Short4>(As<UShort4>((As<Short4>(lhs) << 8) >> rhs) >> 8);
+
+			return As<SByte8>(hi | lo);
+		#else
+			return RValue<SByte8>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
 		#endif
-	};
-
-	Nucleus::Nucleus()
-	{
-		::codegenMutex.lock();   // Reactor is currently not thread safe
-
-		Ice::ClFlags &Flags = Ice::ClFlags::Flags;
-		Ice::ClFlags::getParsedClFlags(Flags);
-
-		#if defined(__arm__)
-			Flags.setTargetArch(Ice::Target_ARM32);
-			Flags.setTargetInstructionSet(Ice::ARM32InstructionSet_HWDivArm);
-		#elif defined(__mips__)
-			Flags.setTargetArch(Ice::Target_MIPS32);
-			Flags.setTargetInstructionSet(Ice::BaseInstructionSet);
-		#else   // x86
-			Flags.setTargetArch(sizeof(void*) == 8 ? Ice::Target_X8664 : Ice::Target_X8632);
-			Flags.setTargetInstructionSet(CPUID::SSE4_1 ? Ice::X86InstructionSet_SSE4_1 : Ice::X86InstructionSet_SSE2);
-		#endif
-		Flags.setOutFileType(Ice::FT_Elf);
-		Flags.setOptLevel(toIce(getDefaultConfig().getOptimization().getLevel()));
-		Flags.setApplicationBinaryInterface(Ice::ABI_Platform);
-		Flags.setVerbose(subzeroDumpEnabled ? Ice::IceV_Most : Ice::IceV_None);
-		Flags.setDisableHybridAssembly(true);
-
-		static llvm::raw_os_ostream cout(std::cout);
-		static llvm::raw_os_ostream cerr(std::cerr);
-
-		if (subzeroEmitTextAsm)
-		{
-			// Decorate text asm with liveness info
-			Flags.setDecorateAsm(true);
-		}
-
-		if(false)   // Write out to a file
-		{
-			std::error_code errorCode;
-			::out = new Ice::Fdstream("out.o", errorCode, llvm::sys::fs::F_None);
-			::elfFile = new Ice::ELFFileStreamer(*out);
-			::context = new Ice::GlobalContext(&cout, &cout, &cerr, elfFile);
-		}
-		else
-		{
-			ELFMemoryStreamer *elfMemory = new ELFMemoryStreamer();
-			::context = new Ice::GlobalContext(&cout, &cout, &cerr, elfMemory);
-			::routine = elfMemory;
-		}
 	}
+}
 
-	Nucleus::~Nucleus()
+RValue<Int> SignMask(RValue<Byte8> x)
+{
+	if(emulateIntrinsics || CPUID::ARM)
 	{
-		delete ::routine;
-
-		delete ::allocator;
-		delete ::function;
-		delete ::context;
-
-		delete ::elfFile;
-		delete ::out;
-
-		::codegenMutex.unlock();
+		Byte8 xx = As<Byte8>(As<SByte8>(x) >> 7) & Byte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
+		return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));
 	}
-
-	void Nucleus::setDefaultConfig(const Config &cfg)
+	else
 	{
-		std::unique_lock<std::mutex> lock(::defaultConfigLock);
-		::defaultConfig() = cfg;
-	}
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		movmsk->addArg(x.value);
+		::basicBlock->appendInst(movmsk);
 
-	void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
-	{
-		std::unique_lock<std::mutex> lock(::defaultConfigLock);
-		auto &config = ::defaultConfig();
-		config = cfgEdit.apply(config);
+		return RValue<Int>(V(result)) & 0xFF;
 	}
-
-	Config Nucleus::getDefaultConfig()
-	{
-		std::unique_lock<std::mutex> lock(::defaultConfigLock);
-		return ::defaultConfig();
-	}
-
-	std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
-	{
-		if (subzeroDumpEnabled)
-		{
-			// Output dump strings immediately, rather than once buffer is full. Useful for debugging.
-			context->getStrDump().SetUnbuffered();
-		}
-
-		if(basicBlock->getInsts().empty() || basicBlock->getInsts().back().getKind() != Ice::Inst::Ret)
-		{
-			createRetVoid();
-		}
-
-		::function->setFunctionName(Ice::GlobalString::createWithString(::context, name));
-
-		rr::optimize(::function);
-
-		::function->computeInOutEdges();
-		ASSERT(!::function->hasError());
-
-		::function->translate();
-		ASSERT(!::function->hasError());
-
-		auto globals = ::function->getGlobalInits();
-
-		if(globals && !globals->empty())
-		{
-			::context->getGlobals()->merge(globals.get());
-		}
-
-		::context->emitFileHeader();
-
-		if (subzeroEmitTextAsm)
-		{
-			::function->emit();
-		}
-
-		::function->emitIAS();
-		auto assembler = ::function->releaseAssembler();
-		auto objectWriter = ::context->getObjectWriter();
-		assembler->alignFunction();
-		objectWriter->writeFunctionCode(::function->getFunctionName(), false, assembler.get());
-		::context->lowerGlobals("last");
-		::context->lowerConstants();
-		::context->lowerJumpTables();
-		objectWriter->setUndefinedSyms(::context->getConstantExternSyms());
-		objectWriter->writeNonUserSections();
-
-		const void* entryBegin = ::routine->finalizeEntryBegin();
-		::routine->setEntry(Nucleus::CoroutineEntryBegin, entryBegin);
-
-		Routine *handoffRoutine = ::routine;
-		::routine = nullptr;
-
-		return std::shared_ptr<Routine>(handoffRoutine);
-	}
-
-	Value *Nucleus::allocateStackVariable(Type *t, int arraySize)
-	{
-		Ice::Type type = T(t);
-		int typeSize = Ice::typeWidthInBytes(type);
-		int totalSize = typeSize * (arraySize ? arraySize : 1);
-
-		auto bytes = Ice::ConstantInteger32::create(::context, Ice::IceType_i32, totalSize);
-		auto address = ::function->makeVariable(T(getPointerType(t)));
-		auto alloca = Ice::InstAlloca::create(::function, address, bytes, typeSize);
-		::function->getEntryNode()->getInsts().push_front(alloca);
-
-		return V(address);
-	}
-
-	BasicBlock *Nucleus::createBasicBlock()
-	{
-		return B(::function->makeNode());
-	}
-
-	BasicBlock *Nucleus::getInsertBlock()
-	{
-		return B(::basicBlock);
-	}
-
-	void Nucleus::setInsertBlock(BasicBlock *basicBlock)
-	{
-	//	ASSERT(::basicBlock->getInsts().back().getTerminatorEdges().size() >= 0 && "Previous basic block must have a terminator");
-
-		Variable::materializeAll();
-
-		::basicBlock = basicBlock;
-	}
-
-	void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
-	{
-		uint32_t sequenceNumber = 0;
-		::function = Ice::Cfg::create(::context, sequenceNumber).release();
-		::allocator = new Ice::CfgLocalAllocatorScope(::function);
-
-		for(Type *type : Params)
-		{
-			Ice::Variable *arg = ::function->makeVariable(T(type));
-			::function->addArg(arg);
-		}
-
-		Ice::CfgNode *node = ::function->makeNode();
-		::function->setEntryNode(node);
-		::basicBlock = node;
-	}
-
-	Value *Nucleus::getArgument(unsigned int index)
-	{
-		return V(::function->getArgs()[index]);
-	}
-
-	void Nucleus::createRetVoid()
-	{
-		// Code generated after this point is unreachable, so any variables
-		// being read can safely return an undefined value. We have to avoid
-		// materializing variables after the terminator ret instruction.
-		Variable::killUnmaterialized();
-
-		Ice::InstRet *ret = Ice::InstRet::create(::function);
-		::basicBlock->appendInst(ret);
-	}
-
-	void Nucleus::createRet(Value *v)
-	{
-		// Code generated after this point is unreachable, so any variables
-		// being read can safely return an undefined value. We have to avoid
-		// materializing variables after the terminator ret instruction.
-		Variable::killUnmaterialized();
-
-		Ice::InstRet *ret = Ice::InstRet::create(::function, v);
-		::basicBlock->appendInst(ret);
-	}
-
-	void Nucleus::createBr(BasicBlock *dest)
-	{
-		Variable::materializeAll();
-
-		auto br = Ice::InstBr::create(::function, dest);
-		::basicBlock->appendInst(br);
-	}
-
-	void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
-	{
-		Variable::materializeAll();
-
-		auto br = Ice::InstBr::create(::function, cond, ifTrue, ifFalse);
-		::basicBlock->appendInst(br);
-	}
-
-	static bool isCommutative(Ice::InstArithmetic::OpKind op)
-	{
-		switch(op)
-		{
-		case Ice::InstArithmetic::Add:
-		case Ice::InstArithmetic::Fadd:
-		case Ice::InstArithmetic::Mul:
-		case Ice::InstArithmetic::Fmul:
-		case Ice::InstArithmetic::And:
-		case Ice::InstArithmetic::Or:
-		case Ice::InstArithmetic::Xor:
-			return true;
-		default:
-			return false;
-		}
-	}
-
-	static Value *createArithmetic(Ice::InstArithmetic::OpKind op, Value *lhs, Value *rhs)
-	{
-		ASSERT(lhs->getType() == rhs->getType() || llvm::isa<Ice::Constant>(rhs));
-
-		bool swapOperands = llvm::isa<Ice::Constant>(lhs) && isCommutative(op);
-
-		Ice::Variable *result = ::function->makeVariable(lhs->getType());
-		Ice::InstArithmetic *arithmetic = Ice::InstArithmetic::create(::function, op, result, swapOperands ? rhs : lhs, swapOperands ? lhs : rhs);
-		::basicBlock->appendInst(arithmetic);
-
-		return V(result);
-	}
-
-	Value *Nucleus::createAdd(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Add, lhs, rhs);
-	}
-
-	Value *Nucleus::createSub(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Sub, lhs, rhs);
-	}
-
-	Value *Nucleus::createMul(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Mul, lhs, rhs);
-	}
-
-	Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Udiv, lhs, rhs);
-	}
-
-	Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Sdiv, lhs, rhs);
-	}
-
-	Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Fadd, lhs, rhs);
-	}
-
-	Value *Nucleus::createFSub(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Fsub, lhs, rhs);
-	}
-
-	Value *Nucleus::createFMul(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Fmul, lhs, rhs);
-	}
-
-	Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Fdiv, lhs, rhs);
-	}
-
-	Value *Nucleus::createURem(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Urem, lhs, rhs);
-	}
-
-	Value *Nucleus::createSRem(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Srem, lhs, rhs);
-	}
-
-	Value *Nucleus::createFRem(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Frem, lhs, rhs);
-	}
-
-	Value *Nucleus::createShl(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Shl, lhs, rhs);
-	}
-
-	Value *Nucleus::createLShr(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Lshr, lhs, rhs);
-	}
-
-	Value *Nucleus::createAShr(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Ashr, lhs, rhs);
-	}
-
-	Value *Nucleus::createAnd(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::And, lhs, rhs);
-	}
-
-	Value *Nucleus::createOr(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Or, lhs, rhs);
-	}
-
-	Value *Nucleus::createXor(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Xor, lhs, rhs);
-	}
-
-	Value *Nucleus::createNeg(Value *v)
-	{
-		return createSub(createNullValue(T(v->getType())), v);
-	}
-
-	Value *Nucleus::createFNeg(Value *v)
-	{
-		double c[4] = {-0.0, -0.0, -0.0, -0.0};
-		Value *negativeZero = Ice::isVectorType(v->getType()) ?
-		                      createConstantVector(c, T(v->getType())) :
-		                      V(::context->getConstantFloat(-0.0f));
-
-		return createFSub(negativeZero, v);
-	}
-
-	Value *Nucleus::createNot(Value *v)
-	{
-		if(Ice::isScalarIntegerType(v->getType()))
-		{
-			return createXor(v, V(::context->getConstantInt(v->getType(), -1)));
-		}
-		else   // Vector
-		{
-			int64_t c[16] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
-			return createXor(v, createConstantVector(c, T(v->getType())));
-		}
-	}
-
-	Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int align, bool atomic, std::memory_order memoryOrder)
-	{
-		ASSERT(!atomic);  // Unimplemented
-		ASSERT(memoryOrder == std::memory_order_relaxed);  // Unimplemented
-
-		int valueType = (int)reinterpret_cast<intptr_t>(type);
-		Ice::Variable *result = ::function->makeVariable(T(type));
-
-		if((valueType & EmulatedBits) && (align != 0))   // Narrow vector not stored on stack.
-		{
-			if(emulateIntrinsics)
-			{
-				if(typeSize(type) == 4)
-				{
-					auto pointer = RValue<Pointer<Byte>>(ptr);
-					Int x = *Pointer<Int>(pointer);
-
-					Int4 vector;
-					vector = Insert(vector, x, 0);
-
-					auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
-					::basicBlock->appendInst(bitcast);
-				}
-				else if(typeSize(type) == 8)
-				{
-					auto pointer = RValue<Pointer<Byte>>(ptr);
-					Int x = *Pointer<Int>(pointer);
-					Int y = *Pointer<Int>(pointer + 4);
-
-					Int4 vector;
-					vector = Insert(vector, x, 0);
-					vector = Insert(vector, y, 1);
-
-					auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
-					::basicBlock->appendInst(bitcast);
-				}
-				else UNREACHABLE("typeSize(type): %d", int(typeSize(type)));
-			}
-			else
-			{
-				const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-				auto target = ::context->getConstantUndef(Ice::IceType_i32);
-				auto load = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-				load->addArg(ptr);
-				load->addArg(::context->getConstantInt32(typeSize(type)));
-				::basicBlock->appendInst(load);
-			}
-		}
-		else
-		{
-			auto load = Ice::InstLoad::create(::function, result, ptr, align);
-			::basicBlock->appendInst(load);
-		}
-
-		return V(result);
-	}
-
-	Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int align, bool atomic, std::memory_order memoryOrder)
-	{
-		ASSERT(!atomic);  // Unimplemented
-		ASSERT(memoryOrder == std::memory_order_relaxed);  // Unimplemented
-
-		#if __has_feature(memory_sanitizer)
-			// Mark all (non-stack) memory writes as initialized by calling __msan_unpoison
-			if(align != 0)
-			{
-				auto call = Ice::InstCall::create(::function, 2, nullptr, ::context->getConstantInt64(reinterpret_cast<intptr_t>(__msan_unpoison)), false);
-				call->addArg(ptr);
-				call->addArg(::context->getConstantInt64(typeSize(type)));
-				::basicBlock->appendInst(call);
-			}
-		#endif
-
-		int valueType = (int)reinterpret_cast<intptr_t>(type);
-
-		if((valueType & EmulatedBits) && (align != 0))   // Narrow vector not stored on stack.
-		{
-			if(emulateIntrinsics)
-			{
-				if(typeSize(type) == 4)
-				{
-					Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
-					auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
-					::basicBlock->appendInst(bitcast);
-
-					RValue<Int4> v(V(vector));
-
-					auto pointer = RValue<Pointer<Byte>>(ptr);
-					Int x = Extract(v, 0);
-					*Pointer<Int>(pointer) = x;
-				}
-				else if(typeSize(type) == 8)
-				{
-					Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
-					auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
-					::basicBlock->appendInst(bitcast);
-
-					RValue<Int4> v(V(vector));
-
-					auto pointer = RValue<Pointer<Byte>>(ptr);
-					Int x = Extract(v, 0);
-					*Pointer<Int>(pointer) = x;
-					Int y = Extract(v, 1);
-					*Pointer<Int>(pointer + 4) = y;
-				}
-				else UNREACHABLE("typeSize(type): %d", int(typeSize(type)));
-			}
-			else
-			{
-				const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
-				auto target = ::context->getConstantUndef(Ice::IceType_i32);
-				auto store = Ice::InstIntrinsicCall::create(::function, 3, nullptr, target, intrinsic);
-				store->addArg(value);
-				store->addArg(ptr);
-				store->addArg(::context->getConstantInt32(typeSize(type)));
-				::basicBlock->appendInst(store);
-			}
-		}
-		else
-		{
-			ASSERT(value->getType() == T(type));
-
-			auto store = Ice::InstStore::create(::function, value, ptr, align);
-			::basicBlock->appendInst(store);
-		}
-
-		return value;
-	}
-
-	Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
-	{
-		ASSERT(index->getType() == Ice::IceType_i32);
-
-		if(auto *constant = llvm::dyn_cast<Ice::ConstantInteger32>(index))
-		{
-			int32_t offset = constant->getValue() * (int)typeSize(type);
-
-			if(offset == 0)
-			{
-				return ptr;
-			}
-
-			return createAdd(ptr, createConstantInt(offset));
-		}
-
-		if(!Ice::isByteSizedType(T(type)))
-		{
-			index = createMul(index, createConstantInt((int)typeSize(type)));
-		}
-
-		if(sizeof(void*) == 8)
-		{
-			if(unsignedIndex)
-			{
-				index = createZExt(index, T(Ice::IceType_i64));
-			}
-			else
-			{
-				index = createSExt(index, T(Ice::IceType_i64));
-			}
-		}
-
-		return createAdd(ptr, index);
-	}
-
-	Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicAdd");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicSub");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicAnd");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicOr");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicXor");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicMin");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicMax");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicUMin");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicUMax");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicExchange");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
-	{
-		UNIMPLEMENTED("createAtomicCompareExchange");
-		return nullptr;
-	}
-
-	static Value *createCast(Ice::InstCast::OpKind op, Value *v, Type *destType)
-	{
-		if(v->getType() == T(destType))
-		{
-			return v;
-		}
-
-		Ice::Variable *result = ::function->makeVariable(T(destType));
-		Ice::InstCast *cast = Ice::InstCast::create(::function, op, result, v);
-		::basicBlock->appendInst(cast);
-
-		return V(result);
-	}
-
-	Value *Nucleus::createTrunc(Value *v, Type *destType)
-	{
-		return createCast(Ice::InstCast::Trunc, v, destType);
-	}
-
-	Value *Nucleus::createZExt(Value *v, Type *destType)
-	{
-		return createCast(Ice::InstCast::Zext, v, destType);
-	}
-
-	Value *Nucleus::createSExt(Value *v, Type *destType)
-	{
-		return createCast(Ice::InstCast::Sext, v, destType);
-	}
-
-	Value *Nucleus::createFPToUI(Value *v, Type *destType)
-	{
-		return createCast(Ice::InstCast::Fptoui, v, destType);
-	}
-
-	Value *Nucleus::createFPToSI(Value *v, Type *destType)
-	{
-		return createCast(Ice::InstCast::Fptosi, v, destType);
-	}
-
-	Value *Nucleus::createSIToFP(Value *v, Type *destType)
-	{
-		return createCast(Ice::InstCast::Sitofp, v, destType);
-	}
-
-	Value *Nucleus::createFPTrunc(Value *v, Type *destType)
-	{
-		return createCast(Ice::InstCast::Fptrunc, v, destType);
-	}
-
-	Value *Nucleus::createFPExt(Value *v, Type *destType)
-	{
-		return createCast(Ice::InstCast::Fpext, v, destType);
-	}
-
-	Value *Nucleus::createBitCast(Value *v, Type *destType)
-	{
-		// Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
-		// support for casting between scalars and wide vectors. For platforms where this is not supported,
-		// emulate them by writing to the stack and reading back as the destination type.
-		if(emulateMismatchedBitCast)
-		{
-			if(!Ice::isVectorType(v->getType()) && Ice::isVectorType(T(destType)))
-			{
-				Value *address = allocateStackVariable(destType);
-				createStore(v, address, T(v->getType()));
-				return createLoad(address, destType);
-			}
-			else if(Ice::isVectorType(v->getType()) && !Ice::isVectorType(T(destType)))
-			{
-				Value *address = allocateStackVariable(T(v->getType()));
-				createStore(v, address, T(v->getType()));
-				return createLoad(address, destType);
-			}
-		}
-
-		return createCast(Ice::InstCast::Bitcast, v, destType);
-	}
-
-	static Value *createIntCompare(Ice::InstIcmp::ICond condition, Value *lhs, Value *rhs)
-	{
-		ASSERT(lhs->getType() == rhs->getType());
-
-		auto result = ::function->makeVariable(Ice::isScalarIntegerType(lhs->getType()) ? Ice::IceType_i1 : lhs->getType());
-		auto cmp = Ice::InstIcmp::create(::function, condition, result, lhs, rhs);
-		::basicBlock->appendInst(cmp);
-
-		return V(result);
-	}
-
-	Value *Nucleus::createPtrEQ(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Eq, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Eq, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Ne, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Ugt, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Uge, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Ult, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Ule, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Sgt, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Sge, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Slt, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Sle, lhs, rhs);
-	}
-
-	static Value *createFloatCompare(Ice::InstFcmp::FCond condition, Value *lhs, Value *rhs)
-	{
-		ASSERT(lhs->getType() == rhs->getType());
-		ASSERT(Ice::isScalarFloatingType(lhs->getType()) || lhs->getType() == Ice::IceType_v4f32);
-
-		auto result = ::function->makeVariable(Ice::isScalarFloatingType(lhs->getType()) ? Ice::IceType_i1 : Ice::IceType_v4i32);
-		auto cmp = Ice::InstFcmp::create(::function, condition, result, lhs, rhs);
-		::basicBlock->appendInst(cmp);
-
-		return V(result);
-	}
-
-	Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Oeq, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Ogt, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Oge, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Olt, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Ole, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::One, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Ord, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Uno, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Ueq, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Ugt, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Uge, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Ult, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Ule, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Une, lhs, rhs);
-	}
-
-	Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
-	{
-		auto result = ::function->makeVariable(T(type));
-		auto extract = Ice::InstExtractElement::create(::function, result, vector, ::context->getConstantInt32(index));
-		::basicBlock->appendInst(extract);
-
-		return V(result);
-	}
-
-	Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
-	{
-		auto result = ::function->makeVariable(vector->getType());
-		auto insert = Ice::InstInsertElement::create(::function, result, vector, element, ::context->getConstantInt32(index));
-		::basicBlock->appendInst(insert);
-
-		return V(result);
-	}
-
-	Value *Nucleus::createShuffleVector(Value *V1, Value *V2, const int *select)
-	{
-		ASSERT(V1->getType() == V2->getType());
-
-		int size = Ice::typeNumElements(V1->getType());
-		auto result = ::function->makeVariable(V1->getType());
-		auto shuffle = Ice::InstShuffleVector::create(::function, result, V1, V2);
-
-		for(int i = 0; i < size; i++)
-		{
-			shuffle->addIndex(llvm::cast<Ice::ConstantInteger32>(::context->getConstantInt32(select[i])));
-		}
-
-		::basicBlock->appendInst(shuffle);
-
-		return V(result);
-	}
-
-	Value *Nucleus::createSelect(Value *C, Value *ifTrue, Value *ifFalse)
-	{
-		ASSERT(ifTrue->getType() == ifFalse->getType());
-
-		auto result = ::function->makeVariable(ifTrue->getType());
-		auto *select = Ice::InstSelect::create(::function, result, C, ifTrue, ifFalse);
-		::basicBlock->appendInst(select);
-
-		return V(result);
-	}
-
-	SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
-	{
-		auto switchInst = Ice::InstSwitch::create(::function, numCases, control, defaultBranch);
-		::basicBlock->appendInst(switchInst);
-
-		return reinterpret_cast<SwitchCases*>(switchInst);
-	}
-
-	void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
-	{
-		switchCases->addBranch(label, label, branch);
-	}
-
-	void Nucleus::createUnreachable()
-	{
-		Ice::InstUnreachable *unreachable = Ice::InstUnreachable::create(::function);
-		::basicBlock->appendInst(unreachable);
-	}
-
-	Type *Nucleus::getPointerType(Type *ElementType)
-	{
-		if(sizeof(void*) == 8)
-		{
-			return T(Ice::IceType_i64);
-		}
-		else
-		{
-			return T(Ice::IceType_i32);
-		}
-	}
-
-	Value *Nucleus::createNullValue(Type *Ty)
-	{
-		if(Ice::isVectorType(T(Ty)))
-		{
-			ASSERT(Ice::typeNumElements(T(Ty)) <= 16);
-			int64_t c[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-			return createConstantVector(c, Ty);
-		}
-		else
-		{
-			return V(::context->getConstantZero(T(Ty)));
-		}
-	}
-
-	Value *Nucleus::createConstantLong(int64_t i)
-	{
-		return V(::context->getConstantInt64(i));
-	}
-
-	Value *Nucleus::createConstantInt(int i)
-	{
-		return V(::context->getConstantInt32(i));
-	}
-
-	Value *Nucleus::createConstantInt(unsigned int i)
-	{
-		return V(::context->getConstantInt32(i));
-	}
-
-	Value *Nucleus::createConstantBool(bool b)
-	{
-		return V(::context->getConstantInt1(b));
-	}
-
-	Value *Nucleus::createConstantByte(signed char i)
-	{
-		return V(::context->getConstantInt8(i));
-	}
-
-	Value *Nucleus::createConstantByte(unsigned char i)
-	{
-		return V(::context->getConstantInt8(i));
-	}
-
-	Value *Nucleus::createConstantShort(short i)
-	{
-		return V(::context->getConstantInt16(i));
-	}
-
-	Value *Nucleus::createConstantShort(unsigned short i)
-	{
-		return V(::context->getConstantInt16(i));
-	}
-
-	Value *Nucleus::createConstantFloat(float x)
-	{
-		return V(::context->getConstantFloat(x));
-	}
-
-	Value *Nucleus::createNullPointer(Type *Ty)
-	{
-		return createNullValue(T(sizeof(void*) == 8 ? Ice::IceType_i64 : Ice::IceType_i32));
-	}
-
-	Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
-	{
-		const int vectorSize = 16;
-		ASSERT(Ice::typeWidthInBytes(T(type)) == vectorSize);
-		const int alignment = vectorSize;
-		auto globalPool = ::function->getGlobalPool();
-
-		const int64_t *i = constants;
-		const double *f = reinterpret_cast<const double*>(constants);
-		Ice::VariableDeclaration::DataInitializer *dataInitializer = nullptr;
-
-		switch((int)reinterpret_cast<intptr_t>(type))
-		{
-		case Ice::IceType_v4i32:
-		case Ice::IceType_v4i1:
-			{
-				const int initializer[4] = {(int)i[0], (int)i[1], (int)i[2], (int)i[3]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		case Ice::IceType_v4f32:
-			{
-				const float initializer[4] = {(float)f[0], (float)f[1], (float)f[2], (float)f[3]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		case Ice::IceType_v8i16:
-		case Ice::IceType_v8i1:
-			{
-				const short initializer[8] = {(short)i[0], (short)i[1], (short)i[2], (short)i[3], (short)i[4], (short)i[5], (short)i[6], (short)i[7]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		case Ice::IceType_v16i8:
-		case Ice::IceType_v16i1:
-			{
-				const char initializer[16] = {(char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7], (char)i[8], (char)i[9], (char)i[10], (char)i[11], (char)i[12], (char)i[13], (char)i[14], (char)i[15]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		case Type_v2i32:
-			{
-				const int initializer[4] = {(int)i[0], (int)i[1], (int)i[0], (int)i[1]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		case Type_v2f32:
-			{
-				const float initializer[4] = {(float)f[0], (float)f[1], (float)f[0], (float)f[1]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		case Type_v4i16:
-			{
-				const short initializer[8] = {(short)i[0], (short)i[1], (short)i[2], (short)i[3], (short)i[0], (short)i[1], (short)i[2], (short)i[3]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		case Type_v8i8:
-			{
-				const char initializer[16] = {(char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		case Type_v4i8:
-			{
-				const char initializer[16] = {(char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		default:
-			UNREACHABLE("Unknown constant vector type: %d", (int)reinterpret_cast<intptr_t>(type));
-		}
-
-		auto name = Ice::GlobalString::createWithoutString(::context);
-		auto *variableDeclaration = Ice::VariableDeclaration::create(globalPool);
-		variableDeclaration->setName(name);
-		variableDeclaration->setAlignment(alignment);
-		variableDeclaration->setIsConstant(true);
-		variableDeclaration->addInitializer(dataInitializer);
-
-		::function->addGlobal(variableDeclaration);
-
-		constexpr int32_t offset = 0;
-		Ice::Operand *ptr = ::context->getConstantSym(offset, name);
-
-		Ice::Variable *result = ::function->makeVariable(T(type));
-		auto load = Ice::InstLoad::create(::function, result, ptr, alignment);
-		::basicBlock->appendInst(load);
-
-		return V(result);
-	}
-
-	Value *Nucleus::createConstantVector(const double *constants, Type *type)
-	{
-		return createConstantVector((const int64_t*)constants, type);
-	}
-
-	Type *Void::getType()
-	{
-		return T(Ice::IceType_void);
-	}
-
-	Type *Bool::getType()
-	{
-		return T(Ice::IceType_i1);
-	}
-
-	Type *Byte::getType()
-	{
-		return T(Ice::IceType_i8);
-	}
-
-	Type *SByte::getType()
-	{
-		return T(Ice::IceType_i8);
-	}
-
-	Type *Short::getType()
-	{
-		return T(Ice::IceType_i16);
-	}
-
-	Type *UShort::getType()
-	{
-		return T(Ice::IceType_i16);
-	}
-
-	Type *Byte4::getType()
-	{
-		return T(Type_v4i8);
-	}
-
-	Type *SByte4::getType()
-	{
-		return T(Type_v4i8);
-	}
-
-	namespace
-	{
-		RValue<Byte> SaturateUnsigned(RValue<Short> x)
-		{
-			return Byte(IfThenElse(Int(x) > 0xFF, Int(0xFF), IfThenElse(Int(x) < 0, Int(0), Int(x))));
-		}
-
-		RValue<Byte> Extract(RValue<Byte8> val, int i)
-		{
-			return RValue<Byte>(Nucleus::createExtractElement(val.value, Byte::getType(), i));
-		}
-
-		RValue<Byte8> Insert(RValue<Byte8> val, RValue<Byte> element, int i)
-		{
-			return RValue<Byte8>(Nucleus::createInsertElement(val.value, element.value, i));
-		}
-	}
-
-	RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
-	{
-		if(emulateIntrinsics)
-		{
-			Byte8 result;
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto paddusb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			paddusb->addArg(x.value);
-			paddusb->addArg(y.value);
-			::basicBlock->appendInst(paddusb);
-
-			return RValue<Byte8>(V(result));
-		}
-	}
-
-	RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
-	{
-		if(emulateIntrinsics)
-		{
-			Byte8 result;
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto psubusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			psubusw->addArg(x.value);
-			psubusw->addArg(y.value);
-			::basicBlock->appendInst(psubusw);
-
-			return RValue<Byte8>(V(result));
-		}
-	}
-
-	RValue<SByte> Extract(RValue<SByte8> val, int i)
-	{
-		return RValue<SByte>(Nucleus::createExtractElement(val.value, SByte::getType(), i));
-	}
-
-	RValue<SByte8> Insert(RValue<SByte8> val, RValue<SByte> element, int i)
-	{
-		return RValue<SByte8>(Nucleus::createInsertElement(val.value, element.value, i));
-	}
-
-	RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
-	{
-		if(emulateIntrinsics)
-		{
-			SByte8 result;
-			result = Insert(result, Extract(lhs, 0) >> SByte(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> SByte(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) >> SByte(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) >> SByte(rhs), 3);
-			result = Insert(result, Extract(lhs, 4) >> SByte(rhs), 4);
-			result = Insert(result, Extract(lhs, 5) >> SByte(rhs), 5);
-			result = Insert(result, Extract(lhs, 6) >> SByte(rhs), 6);
-			result = Insert(result, Extract(lhs, 7) >> SByte(rhs), 7);
-
-			return result;
-		}
-		else
-		{
-			#if defined(__i386__) || defined(__x86_64__)
-				// SSE2 doesn't support byte vector shifts, so shift as shorts and recombine.
-				RValue<Short4> hi = (As<Short4>(lhs) >> rhs) & Short4(0xFF00u);
-				RValue<Short4> lo = As<Short4>(As<UShort4>((As<Short4>(lhs) << 8) >> rhs) >> 8);
-
-				return As<SByte8>(hi | lo);
-			#else
-				return RValue<SByte8>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
-			#endif
-		}
-	}
-
-	RValue<Int> SignMask(RValue<Byte8> x)
-	{
-		if(emulateIntrinsics || CPUID::ARM)
-		{
-			Byte8 xx = As<Byte8>(As<SByte8>(x) >> 7) & Byte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
-			return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			movmsk->addArg(x.value);
-			::basicBlock->appendInst(movmsk);
-
-			return RValue<Int>(V(result)) & 0xFF;
-		}
-	}
+}
 
 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
 //	{
 //		return RValue<Byte8>(createIntCompare(Ice::InstIcmp::Ugt, x.value, y.value));
 //	}
 
-	RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
-	{
-		return RValue<Byte8>(Nucleus::createICmpEQ(x.value, y.value));
-	}
+RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
+{
+	return RValue<Byte8>(Nucleus::createICmpEQ(x.value, y.value));
+}
 
-	Type *Byte8::getType()
-	{
-		return T(Type_v8i8);
-	}
+Type *Byte8::getType()
+{
+	return T(Type_v8i8);
+}
 
 //	RValue<SByte8> operator<<(RValue<SByte8> lhs, unsigned char rhs)
 //	{
@@ -1870,886 +1869,886 @@
 //		return RValue<SByte8>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
 //	}
 
-	RValue<SByte> SaturateSigned(RValue<Short> x)
+RValue<SByte> SaturateSigned(RValue<Short> x)
+{
+	return SByte(IfThenElse(Int(x) > 0x7F, Int(0x7F), IfThenElse(Int(x) < -0x80, Int(0x80), Int(x))));
+}
+
+RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
+{
+	if(emulateIntrinsics)
 	{
-		return SByte(IfThenElse(Int(x) > 0x7F, Int(0x7F), IfThenElse(Int(x) < -0x80, Int(0x80), Int(x))));
-	}
+		SByte8 result;
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
 
-	RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
+		return result;
+	}
+	else
 	{
-		if(emulateIntrinsics)
-		{
-			SByte8 result;
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto paddsb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		paddsb->addArg(x.value);
+		paddsb->addArg(y.value);
+		::basicBlock->appendInst(paddsb);
 
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto paddsb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			paddsb->addArg(x.value);
-			paddsb->addArg(y.value);
-			::basicBlock->appendInst(paddsb);
-
-			return RValue<SByte8>(V(result));
-		}
+		return RValue<SByte8>(V(result));
 	}
+}
 
-	RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
+RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
+{
+	if(emulateIntrinsics)
 	{
-		if(emulateIntrinsics)
-		{
-			SByte8 result;
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
+		SByte8 result;
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
 
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto psubsb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			psubsb->addArg(x.value);
-			psubsb->addArg(y.value);
-			::basicBlock->appendInst(psubsb);
-
-			return RValue<SByte8>(V(result));
-		}
+		return result;
 	}
-
-	RValue<Int> SignMask(RValue<SByte8> x)
+	else
 	{
-		if(emulateIntrinsics || CPUID::ARM)
-		{
-			SByte8 xx = (x >> 7) & SByte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
-			return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			movmsk->addArg(x.value);
-			::basicBlock->appendInst(movmsk);
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto psubsb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		psubsb->addArg(x.value);
+		psubsb->addArg(y.value);
+		::basicBlock->appendInst(psubsb);
 
-			return RValue<Int>(V(result)) & 0xFF;
-		}
+		return RValue<SByte8>(V(result));
 	}
+}
 
-	RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
+RValue<Int> SignMask(RValue<SByte8> x)
+{
+	if(emulateIntrinsics || CPUID::ARM)
 	{
-		return RValue<Byte8>(createIntCompare(Ice::InstIcmp::Sgt, x.value, y.value));
+		SByte8 xx = (x >> 7) & SByte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
+		return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));
 	}
-
-	RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
+	else
 	{
-		return RValue<Byte8>(Nucleus::createICmpEQ(x.value, y.value));
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		movmsk->addArg(x.value);
+		::basicBlock->appendInst(movmsk);
+
+		return RValue<Int>(V(result)) & 0xFF;
 	}
+}
 
-	Type *SByte8::getType()
-	{
-		return T(Type_v8i8);
-	}
+RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
+{
+	return RValue<Byte8>(createIntCompare(Ice::InstIcmp::Sgt, x.value, y.value));
+}
 
-	Type *Byte16::getType()
-	{
-		return T(Ice::IceType_v16i8);
-	}
+RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
+{
+	return RValue<Byte8>(Nucleus::createICmpEQ(x.value, y.value));
+}
 
-	Type *SByte16::getType()
-	{
-		return T(Ice::IceType_v16i8);
-	}
+Type *SByte8::getType()
+{
+	return T(Type_v8i8);
+}
 
-	Type *Short2::getType()
-	{
-		return T(Type_v2i16);
-	}
+Type *Byte16::getType()
+{
+	return T(Ice::IceType_v16i8);
+}
 
-	Type *UShort2::getType()
-	{
-		return T(Type_v2i16);
-	}
+Type *SByte16::getType()
+{
+	return T(Ice::IceType_v16i8);
+}
 
-	Short4::Short4(RValue<Int4> cast)
-	{
-		int select[8] = {0, 2, 4, 6, 0, 2, 4, 6};
-		Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
-		Value *packed = Nucleus::createShuffleVector(short8, short8, select);
+Type *Short2::getType()
+{
+	return T(Type_v2i16);
+}
 
-		Value *int2 = RValue<Int2>(Int2(As<Int4>(packed))).value;
-		Value *short4 = Nucleus::createBitCast(int2, Short4::getType());
+Type *UShort2::getType()
+{
+	return T(Type_v2i16);
+}
 
-		storeValue(short4);
-	}
+Short4::Short4(RValue<Int4> cast)
+{
+	int select[8] = {0, 2, 4, 6, 0, 2, 4, 6};
+	Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
+	Value *packed = Nucleus::createShuffleVector(short8, short8, select);
+
+	Value *int2 = RValue<Int2>(Int2(As<Int4>(packed))).value;
+	Value *short4 = Nucleus::createBitCast(int2, Short4::getType());
+
+	storeValue(short4);
+}
 
 //	Short4::Short4(RValue<Float> cast)
 //	{
 //	}
 
-	Short4::Short4(RValue<Float4> cast)
+Short4::Short4(RValue<Float4> cast)
+{
+	UNIMPLEMENTED("Short4::Short4(RValue<Float4> cast)");
+}
+
+RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		UNIMPLEMENTED("Short4::Short4(RValue<Float4> cast)");
-	}
-
-	RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
-	{
-		if(emulateIntrinsics)
-		{
-			Short4 result;
-			result = Insert(result, Extract(lhs, 0) << Short(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) << Short(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) << Short(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) << Short(rhs), 3);
-
-			return result;
-		}
-		else
-		{
-			return RValue<Short4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
-	}
-
-	RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
-	{
-		if(emulateIntrinsics)
-		{
-			Short4 result;
-			result = Insert(result, Extract(lhs, 0) >> Short(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> Short(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) >> Short(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) >> Short(rhs), 3);
-
-			return result;
-		}
-		else
-		{
-			return RValue<Short4>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
-	}
-
-	RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
-		auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-		auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
-		::basicBlock->appendInst(select);
-
-		return RValue<Short4>(V(result));
-	}
-
-	RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
-		auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-		auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
-		::basicBlock->appendInst(select);
-
-		return RValue<Short4>(V(result));
-	}
-
-	RValue<Short> SaturateSigned(RValue<Int> x)
-	{
-		return Short(IfThenElse(x > 0x7FFF, Int(0x7FFF), IfThenElse(x < -0x8000, Int(0x8000), x)));
-	}
-
-	RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			Short4 result;
-			result = Insert(result, SaturateSigned(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
-			result = Insert(result, SaturateSigned(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
-			result = Insert(result, SaturateSigned(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
-			result = Insert(result, SaturateSigned(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto paddsw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			paddsw->addArg(x.value);
-			paddsw->addArg(y.value);
-			::basicBlock->appendInst(paddsw);
-
-			return RValue<Short4>(V(result));
-		}
-	}
-
-	RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			Short4 result;
-			result = Insert(result, SaturateSigned(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
-			result = Insert(result, SaturateSigned(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
-			result = Insert(result, SaturateSigned(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
-			result = Insert(result, SaturateSigned(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto psubsw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			psubsw->addArg(x.value);
-			psubsw->addArg(y.value);
-			::basicBlock->appendInst(psubsw);
-
-			return RValue<Short4>(V(result));
-		}
-	}
-
-	RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			Short4 result;
-			result = Insert(result, Short((Int(Extract(x, 0)) * Int(Extract(y, 0))) >> 16), 0);
-			result = Insert(result, Short((Int(Extract(x, 1)) * Int(Extract(y, 1))) >> 16), 1);
-			result = Insert(result, Short((Int(Extract(x, 2)) * Int(Extract(y, 2))) >> 16), 2);
-			result = Insert(result, Short((Int(Extract(x, 3)) * Int(Extract(y, 3))) >> 16), 3);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyHighSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto pmulhw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			pmulhw->addArg(x.value);
-			pmulhw->addArg(y.value);
-			::basicBlock->appendInst(pmulhw);
-
-			return RValue<Short4>(V(result));
-		}
-	}
-
-	RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			Int2 result;
-			result = Insert(result, Int(Extract(x, 0)) * Int(Extract(y, 0)) + Int(Extract(x, 1)) * Int(Extract(y, 1)), 0);
-			result = Insert(result, Int(Extract(x, 2)) * Int(Extract(y, 2)) + Int(Extract(x, 3)) * Int(Extract(y, 3)), 1);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyAddPairs, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto pmaddwd = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			pmaddwd->addArg(x.value);
-			pmaddwd->addArg(y.value);
-			::basicBlock->appendInst(pmaddwd);
-
-			return As<Int2>(V(result));
-		}
-	}
-
-	RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			SByte8 result;
-			result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
-			result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
-			result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
-			result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
-			result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
-			result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
-			result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
-			result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			pack->addArg(x.value);
-			pack->addArg(y.value);
-			::basicBlock->appendInst(pack);
-
-			return As<SByte8>(Swizzle(As<Int4>(V(result)), 0x0202));
-		}
-	}
-
-	RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			Byte8 result;
-			result = Insert(result, SaturateUnsigned(Extract(x, 0)), 0);
-			result = Insert(result, SaturateUnsigned(Extract(x, 1)), 1);
-			result = Insert(result, SaturateUnsigned(Extract(x, 2)), 2);
-			result = Insert(result, SaturateUnsigned(Extract(x, 3)), 3);
-			result = Insert(result, SaturateUnsigned(Extract(y, 0)), 4);
-			result = Insert(result, SaturateUnsigned(Extract(y, 1)), 5);
-			result = Insert(result, SaturateUnsigned(Extract(y, 2)), 6);
-			result = Insert(result, SaturateUnsigned(Extract(y, 3)), 7);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			pack->addArg(x.value);
-			pack->addArg(y.value);
-			::basicBlock->appendInst(pack);
-
-			return As<Byte8>(Swizzle(As<Int4>(V(result)), 0x0202));
-		}
-	}
-
-	RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
-	{
-		return RValue<Short4>(createIntCompare(Ice::InstIcmp::Sgt, x.value, y.value));
-	}
-
-	RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
-	{
-		return RValue<Short4>(Nucleus::createICmpEQ(x.value, y.value));
-	}
-
-	Type *Short4::getType()
-	{
-		return T(Type_v4i16);
-	}
-
-	UShort4::UShort4(RValue<Float4> cast, bool saturate)
-	{
-		if(saturate)
-		{
-			if(CPUID::SSE4_1)
-			{
-				// x86 produces 0x80000000 on 32-bit integer overflow/underflow.
-				// PackUnsigned takes care of 0x0000 saturation.
-				Int4 int4(Min(cast, Float4(0xFFFF)));
-				*this = As<UShort4>(PackUnsigned(int4, int4));
-			}
-			else if(CPUID::ARM)
-			{
-				// ARM saturates the 32-bit integer result on overflow/undeflow.
-				Int4 int4(cast);
-				*this = As<UShort4>(PackUnsigned(int4, int4));
-			}
-			else
-			{
-				*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
-			}
-		}
-		else
-		{
-			*this = Short4(Int4(cast));
-		}
-	}
-
-	RValue<UShort> Extract(RValue<UShort4> val, int i)
-	{
-		return RValue<UShort>(Nucleus::createExtractElement(val.value, UShort::getType(), i));
-	}
-
-	RValue<UShort4> Insert(RValue<UShort4> val, RValue<UShort> element, int i)
-	{
-		return RValue<UShort4>(Nucleus::createInsertElement(val.value, element.value, i));
-	}
-
-	RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
-	{
-		if(emulateIntrinsics)
-		{
-			UShort4 result;
-			result = Insert(result, Extract(lhs, 0) << UShort(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) << UShort(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) << UShort(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) << UShort(rhs), 3);
-
-			return result;
-		}
-		else
-		{
-			return RValue<UShort4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
-	}
-
-	RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
-	{
-		if(emulateIntrinsics)
-		{
-			UShort4 result;
-			result = Insert(result, Extract(lhs, 0) >> UShort(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> UShort(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) >> UShort(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) >> UShort(rhs), 3);
-
-			return result;
-		}
-		else
-		{
-			return RValue<UShort4>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
-	}
-
-	RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
-		auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-		auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
-		::basicBlock->appendInst(select);
-
-		return RValue<UShort4>(V(result));
-	}
-
-	RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
-		auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-		auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
-		::basicBlock->appendInst(select);
-
-		return RValue<UShort4>(V(result));
-	}
-
-	RValue<UShort> SaturateUnsigned(RValue<Int> x)
-	{
-		return UShort(IfThenElse(x > 0xFFFF, Int(0xFFFF), IfThenElse(x < 0, Int(0), x)));
-	}
-
-	RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			UShort4 result;
-			result = Insert(result, SaturateUnsigned(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
-			result = Insert(result, SaturateUnsigned(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
-			result = Insert(result, SaturateUnsigned(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
-			result = Insert(result, SaturateUnsigned(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto paddusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			paddusw->addArg(x.value);
-			paddusw->addArg(y.value);
-			::basicBlock->appendInst(paddusw);
-
-			return RValue<UShort4>(V(result));
-		}
-	}
-
-	RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			UShort4 result;
-			result = Insert(result, SaturateUnsigned(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
-			result = Insert(result, SaturateUnsigned(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
-			result = Insert(result, SaturateUnsigned(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
-			result = Insert(result, SaturateUnsigned(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto psubusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			psubusw->addArg(x.value);
-			psubusw->addArg(y.value);
-			::basicBlock->appendInst(psubusw);
-
-			return RValue<UShort4>(V(result));
-		}
-	}
-
-	RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			UShort4 result;
-			result = Insert(result, UShort((UInt(Extract(x, 0)) * UInt(Extract(y, 0))) >> 16), 0);
-			result = Insert(result, UShort((UInt(Extract(x, 1)) * UInt(Extract(y, 1))) >> 16), 1);
-			result = Insert(result, UShort((UInt(Extract(x, 2)) * UInt(Extract(y, 2))) >> 16), 2);
-			result = Insert(result, UShort((UInt(Extract(x, 3)) * UInt(Extract(y, 3))) >> 16), 3);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyHighUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto pmulhuw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			pmulhuw->addArg(x.value);
-			pmulhuw->addArg(y.value);
-			::basicBlock->appendInst(pmulhuw);
-
-			return RValue<UShort4>(V(result));
-		}
-	}
-
-	RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
-	{
-		// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
-
-		// Scalarized implementation.
-		Int4 result;
-		result = Insert(result, Int((Long(Extract(x, 0)) * Long(Extract(y, 0))) >> Long(Int(32))), 0);
-		result = Insert(result, Int((Long(Extract(x, 1)) * Long(Extract(y, 1))) >> Long(Int(32))), 1);
-		result = Insert(result, Int((Long(Extract(x, 2)) * Long(Extract(y, 2))) >> Long(Int(32))), 2);
-		result = Insert(result, Int((Long(Extract(x, 3)) * Long(Extract(y, 3))) >> Long(Int(32))), 3);
+		Short4 result;
+		result = Insert(result, Extract(lhs, 0) << Short(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) << Short(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) << Short(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) << Short(rhs), 3);
 
 		return result;
 	}
-
-	RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
+	else
 	{
-		// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
+		return RValue<Short4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+	}
+}
 
-		if(false)  // Partial product based implementation.
+RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
+	{
+		Short4 result;
+		result = Insert(result, Extract(lhs, 0) >> Short(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> Short(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) >> Short(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) >> Short(rhs), 3);
+
+		return result;
+	}
+	else
+	{
+		return RValue<Short4>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
+	}
+}
+
+RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
+	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+	auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<Short4>(V(result));
+}
+
+RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
+	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+	auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<Short4>(V(result));
+}
+
+RValue<Short> SaturateSigned(RValue<Int> x)
+{
+	return Short(IfThenElse(x > 0x7FFF, Int(0x7FFF), IfThenElse(x < -0x8000, Int(0x8000), x)));
+}
+
+RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
+{
+	if(emulateIntrinsics)
+	{
+		Short4 result;
+		result = Insert(result, SaturateSigned(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
+		result = Insert(result, SaturateSigned(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
+		result = Insert(result, SaturateSigned(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
+		result = Insert(result, SaturateSigned(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto paddsw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		paddsw->addArg(x.value);
+		paddsw->addArg(y.value);
+		::basicBlock->appendInst(paddsw);
+
+		return RValue<Short4>(V(result));
+	}
+}
+
+RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
+{
+	if(emulateIntrinsics)
+	{
+		Short4 result;
+		result = Insert(result, SaturateSigned(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
+		result = Insert(result, SaturateSigned(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
+		result = Insert(result, SaturateSigned(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
+		result = Insert(result, SaturateSigned(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto psubsw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		psubsw->addArg(x.value);
+		psubsw->addArg(y.value);
+		::basicBlock->appendInst(psubsw);
+
+		return RValue<Short4>(V(result));
+	}
+}
+
+RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
+{
+	if(emulateIntrinsics)
+	{
+		Short4 result;
+		result = Insert(result, Short((Int(Extract(x, 0)) * Int(Extract(y, 0))) >> 16), 0);
+		result = Insert(result, Short((Int(Extract(x, 1)) * Int(Extract(y, 1))) >> 16), 1);
+		result = Insert(result, Short((Int(Extract(x, 2)) * Int(Extract(y, 2))) >> 16), 2);
+		result = Insert(result, Short((Int(Extract(x, 3)) * Int(Extract(y, 3))) >> 16), 3);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyHighSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto pmulhw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		pmulhw->addArg(x.value);
+		pmulhw->addArg(y.value);
+		::basicBlock->appendInst(pmulhw);
+
+		return RValue<Short4>(V(result));
+	}
+}
+
+RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
+{
+	if(emulateIntrinsics)
+	{
+		Int2 result;
+		result = Insert(result, Int(Extract(x, 0)) * Int(Extract(y, 0)) + Int(Extract(x, 1)) * Int(Extract(y, 1)), 0);
+		result = Insert(result, Int(Extract(x, 2)) * Int(Extract(y, 2)) + Int(Extract(x, 3)) * Int(Extract(y, 3)), 1);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyAddPairs, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto pmaddwd = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		pmaddwd->addArg(x.value);
+		pmaddwd->addArg(y.value);
+		::basicBlock->appendInst(pmaddwd);
+
+		return As<Int2>(V(result));
+	}
+}
+
+RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
+{
+	if(emulateIntrinsics)
+	{
+		SByte8 result;
+		result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
+		result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
+		result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
+		result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
+		result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
+		result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
+		result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
+		result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		pack->addArg(x.value);
+		pack->addArg(y.value);
+		::basicBlock->appendInst(pack);
+
+		return As<SByte8>(Swizzle(As<Int4>(V(result)), 0x0202));
+	}
+}
+
+RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
+{
+	if(emulateIntrinsics)
+	{
+		Byte8 result;
+		result = Insert(result, SaturateUnsigned(Extract(x, 0)), 0);
+		result = Insert(result, SaturateUnsigned(Extract(x, 1)), 1);
+		result = Insert(result, SaturateUnsigned(Extract(x, 2)), 2);
+		result = Insert(result, SaturateUnsigned(Extract(x, 3)), 3);
+		result = Insert(result, SaturateUnsigned(Extract(y, 0)), 4);
+		result = Insert(result, SaturateUnsigned(Extract(y, 1)), 5);
+		result = Insert(result, SaturateUnsigned(Extract(y, 2)), 6);
+		result = Insert(result, SaturateUnsigned(Extract(y, 3)), 7);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		pack->addArg(x.value);
+		pack->addArg(y.value);
+		::basicBlock->appendInst(pack);
+
+		return As<Byte8>(Swizzle(As<Int4>(V(result)), 0x0202));
+	}
+}
+
+RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
+{
+	return RValue<Short4>(createIntCompare(Ice::InstIcmp::Sgt, x.value, y.value));
+}
+
+RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
+{
+	return RValue<Short4>(Nucleus::createICmpEQ(x.value, y.value));
+}
+
+Type *Short4::getType()
+{
+	return T(Type_v4i16);
+}
+
+UShort4::UShort4(RValue<Float4> cast, bool saturate)
+{
+	if(saturate)
+	{
+		if(CPUID::SSE4_1)
 		{
-			auto xh = x >> 16;
-			auto yh = y >> 16;
-			auto xl = x & UInt4(0x0000FFFF);
-			auto yl = y & UInt4(0x0000FFFF);
-			auto xlyh = xl * yh;
-			auto xhyl = xh * yl;
-			auto xlyhh = xlyh >> 16;
-			auto xhylh = xhyl >> 16;
-			auto xlyhl = xlyh & UInt4(0x0000FFFF);
-			auto xhyll = xhyl & UInt4(0x0000FFFF);
-			auto xlylh = (xl * yl) >> 16;
-			auto oflow = (xlyhl + xhyll + xlylh) >> 16;
-
-			return (xh * yh) + (xlyhh + xhylh) + oflow;
+			// x86 produces 0x80000000 on 32-bit integer overflow/underflow.
+			// PackUnsigned takes care of 0x0000 saturation.
+			Int4 int4(Min(cast, Float4(0xFFFF)));
+			*this = As<UShort4>(PackUnsigned(int4, int4));
 		}
-
-		// Scalarized implementation.
-		Int4 result;
-		result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 0))) * Long(UInt(Extract(As<Int4>(y), 0)))) >> Long(Int(32))), 0);
-		result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 1))) * Long(UInt(Extract(As<Int4>(y), 1)))) >> Long(Int(32))), 1);
-		result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 2))) * Long(UInt(Extract(As<Int4>(y), 2)))) >> Long(Int(32))), 2);
-		result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 3))) * Long(UInt(Extract(As<Int4>(y), 3)))) >> Long(Int(32))), 3);
-
-		return As<UInt4>(result);
-	}
-
-	RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		UNIMPLEMENTED("RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)");
-		return UShort4(0);
-	}
-
-	Type *UShort4::getType()
-	{
-		return T(Type_v4i16);
-	}
-
-	RValue<Short> Extract(RValue<Short8> val, int i)
-	{
-		return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));
-	}
-
-	RValue<Short8> Insert(RValue<Short8> val, RValue<Short> element, int i)
-	{
-		return RValue<Short8>(Nucleus::createInsertElement(val.value, element.value, i));
-	}
-
-	RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
-	{
-		if(emulateIntrinsics)
+		else if(CPUID::ARM)
 		{
-			Short8 result;
-			result = Insert(result, Extract(lhs, 0) << Short(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) << Short(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) << Short(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) << Short(rhs), 3);
-			result = Insert(result, Extract(lhs, 4) << Short(rhs), 4);
-			result = Insert(result, Extract(lhs, 5) << Short(rhs), 5);
-			result = Insert(result, Extract(lhs, 6) << Short(rhs), 6);
-			result = Insert(result, Extract(lhs, 7) << Short(rhs), 7);
-
-			return result;
+			// ARM saturates the 32-bit integer result on overflow/underflow.
+			Int4 int4(cast);
+			*this = As<UShort4>(PackUnsigned(int4, int4));
 		}
 		else
 		{
-			return RValue<Short8>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+			*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
 		}
 	}
-
-	RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
+	else
 	{
-		if(emulateIntrinsics)
-		{
-			Short8 result;
-			result = Insert(result, Extract(lhs, 0) >> Short(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> Short(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) >> Short(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) >> Short(rhs), 3);
-			result = Insert(result, Extract(lhs, 4) >> Short(rhs), 4);
-			result = Insert(result, Extract(lhs, 5) >> Short(rhs), 5);
-			result = Insert(result, Extract(lhs, 6) >> Short(rhs), 6);
-			result = Insert(result, Extract(lhs, 7) >> Short(rhs), 7);
+		*this = Short4(Int4(cast));
+	}
+}
 
-			return result;
-		}
-		else
-		{
-			return RValue<Short8>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+RValue<UShort> Extract(RValue<UShort4> val, int i)
+{
+	return RValue<UShort>(Nucleus::createExtractElement(val.value, UShort::getType(), i));
+}
+
+RValue<UShort4> Insert(RValue<UShort4> val, RValue<UShort> element, int i)
+{
+	return RValue<UShort4>(Nucleus::createInsertElement(val.value, element.value, i));
+}
+
+RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
+	{
+		UShort4 result;
+		result = Insert(result, Extract(lhs, 0) << UShort(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) << UShort(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) << UShort(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) << UShort(rhs), 3);
+
+		return result;
+	}
+	else
+	{
+		return RValue<UShort4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+	}
+}
+
+RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
+	{
+		UShort4 result;
+		result = Insert(result, Extract(lhs, 0) >> UShort(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> UShort(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) >> UShort(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) >> UShort(rhs), 3);
+
+		return result;
+	}
+	else
+	{
+		return RValue<UShort4>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
+	}
+}
+
+RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
+	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+	auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<UShort4>(V(result));
+}
+
+RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
+	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+	auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<UShort4>(V(result));
+}
+
+RValue<UShort> SaturateUnsigned(RValue<Int> x)
+{
+	return UShort(IfThenElse(x > 0xFFFF, Int(0xFFFF), IfThenElse(x < 0, Int(0), x)));
+}
+
+RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
+{
+	if(emulateIntrinsics)
+	{
+		UShort4 result;
+		result = Insert(result, SaturateUnsigned(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
+		result = Insert(result, SaturateUnsigned(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
+		result = Insert(result, SaturateUnsigned(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
+		result = Insert(result, SaturateUnsigned(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto paddusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		paddusw->addArg(x.value);
+		paddusw->addArg(y.value);
+		::basicBlock->appendInst(paddusw);
+
+		return RValue<UShort4>(V(result));
+	}
+}
+
+RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
+{
+	if(emulateIntrinsics)
+	{
+		UShort4 result;
+		result = Insert(result, SaturateUnsigned(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
+		result = Insert(result, SaturateUnsigned(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
+		result = Insert(result, SaturateUnsigned(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
+		result = Insert(result, SaturateUnsigned(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto psubusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		psubusw->addArg(x.value);
+		psubusw->addArg(y.value);
+		::basicBlock->appendInst(psubusw);
+
+		return RValue<UShort4>(V(result));
+	}
+}
+
+RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
+{
+	if(emulateIntrinsics)
+	{
+		UShort4 result;
+		result = Insert(result, UShort((UInt(Extract(x, 0)) * UInt(Extract(y, 0))) >> 16), 0);
+		result = Insert(result, UShort((UInt(Extract(x, 1)) * UInt(Extract(y, 1))) >> 16), 1);
+		result = Insert(result, UShort((UInt(Extract(x, 2)) * UInt(Extract(y, 2))) >> 16), 2);
+		result = Insert(result, UShort((UInt(Extract(x, 3)) * UInt(Extract(y, 3))) >> 16), 3);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyHighUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto pmulhuw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		pmulhuw->addArg(x.value);
+		pmulhuw->addArg(y.value);
+		::basicBlock->appendInst(pmulhuw);
+
+		return RValue<UShort4>(V(result));
+	}
+}
+
+RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)  // Per lane: product formed in Long, shifted right by 32, narrowed to Int
+{
+	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
+
+	// Scalarized implementation.
+	Int4 result;
+	result = Insert(result, Int((Long(Extract(x, 0)) * Long(Extract(y, 0))) >> Long(Int(32))), 0);  // Widen both lanes to Long before multiplying
+	result = Insert(result, Int((Long(Extract(x, 1)) * Long(Extract(y, 1))) >> Long(Int(32))), 1);
+	result = Insert(result, Int((Long(Extract(x, 2)) * Long(Extract(y, 2))) >> Long(Int(32))), 2);
+	result = Insert(result, Int((Long(Extract(x, 3)) * Long(Extract(y, 3))) >> Long(Int(32))), 3);
+
+	return result;
+}
+
+RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
+{
+	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
+
+	if(false)  // Partial product based implementation.
+	{
+		auto xh = x >> 16;
+		auto yh = y >> 16;
+		auto xl = x & UInt4(0x0000FFFF);
+		auto yl = y & UInt4(0x0000FFFF);
+		auto xlyh = xl * yh;
+		auto xhyl = xh * yl;
+		auto xlyhh = xlyh >> 16;
+		auto xhylh = xhyl >> 16;
+		auto xlyhl = xlyh & UInt4(0x0000FFFF);
+		auto xhyll = xhyl & UInt4(0x0000FFFF);
+		auto xlylh = (xl * yl) >> 16;
+		auto oflow = (xlyhl + xhyll + xlylh) >> 16;
+
+		return (xh * yh) + (xlyhh + xhylh) + oflow;
 	}
 
-	RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
+	// Scalarized implementation.
+	Int4 result;
+	result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 0))) * Long(UInt(Extract(As<Int4>(y), 0)))) >> Long(Int(32))), 0);
+	result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 1))) * Long(UInt(Extract(As<Int4>(y), 1)))) >> Long(Int(32))), 1);
+	result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 2))) * Long(UInt(Extract(As<Int4>(y), 2)))) >> Long(Int(32))), 2);
+	result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 3))) * Long(UInt(Extract(As<Int4>(y), 3)))) >> Long(Int(32))), 3);
+
+	return As<UInt4>(result);
+}
+
+RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)  // Placeholder: no averaging is performed here
+{
+	UNIMPLEMENTED("RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)");  // Flags the call with its own signature as the message
+	return UShort4(0);  // Inputs are ignored; the result is built from 0
+}
+
+Type *UShort4::getType()  // Type descriptor for UShort4
+{
+	return T(Type_v4i16);  // Type_v4i16 converted through T()
+}
+
+RValue<Short> Extract(RValue<Short8> val, int i)  // Reads lane i of val as a Short
+{
+	return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));  // i is passed through unchecked
+}
+
+RValue<Short8> Insert(RValue<Short8> val, RValue<Short> element, int i)  // Inserts element at lane i of val and returns the resulting vector
+{
+	return RValue<Short8>(Nucleus::createInsertElement(val.value, element.value, i));  // i is passed through unchecked
+}
+
+RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		UNIMPLEMENTED("RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)");
-		return Int4(0);
-	}
+		Short8 result;
+		result = Insert(result, Extract(lhs, 0) << Short(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) << Short(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) << Short(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) << Short(rhs), 3);
+		result = Insert(result, Extract(lhs, 4) << Short(rhs), 4);
+		result = Insert(result, Extract(lhs, 5) << Short(rhs), 5);
+		result = Insert(result, Extract(lhs, 6) << Short(rhs), 6);
+		result = Insert(result, Extract(lhs, 7) << Short(rhs), 7);
 
-	RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
+		return result;
+	}
+	else
 	{
-		UNIMPLEMENTED("RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)");
-		return Short8(0);
+		return RValue<Short8>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	Type *Short8::getType()
+RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		return T(Ice::IceType_v8i16);
-	}
+		Short8 result;
+		result = Insert(result, Extract(lhs, 0) >> Short(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> Short(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) >> Short(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) >> Short(rhs), 3);
+		result = Insert(result, Extract(lhs, 4) >> Short(rhs), 4);
+		result = Insert(result, Extract(lhs, 5) >> Short(rhs), 5);
+		result = Insert(result, Extract(lhs, 6) >> Short(rhs), 6);
+		result = Insert(result, Extract(lhs, 7) >> Short(rhs), 7);
 
-	RValue<UShort> Extract(RValue<UShort8> val, int i)
+		return result;
+	}
+	else
 	{
-		return RValue<UShort>(Nucleus::createExtractElement(val.value, UShort::getType(), i));
+		return RValue<Short8>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	RValue<UShort8> Insert(RValue<UShort8> val, RValue<UShort> element, int i)
+RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)  // Placeholder: no multiply-add is performed here
+{
+	UNIMPLEMENTED("RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)");  // Flags the call with its own signature as the message
+	return Int4(0);  // Inputs are ignored; the result is built from 0
+}
+
+RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)  // Placeholder: the Short8 overload does no multiplication
+{
+	UNIMPLEMENTED("RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)");  // Flags the call with its own signature as the message
+	return Short8(0);  // Inputs are ignored; the result is built from 0
+}
+
+Type *Short8::getType()  // Type descriptor for Short8
+{
+	return T(Ice::IceType_v8i16);  // Same Ice type as UShort8::getType() returns
+}
+
+RValue<UShort> Extract(RValue<UShort8> val, int i)  // Reads lane i of val as a UShort
+{
+	return RValue<UShort>(Nucleus::createExtractElement(val.value, UShort::getType(), i));  // i is passed through unchecked
+}
+
+RValue<UShort8> Insert(RValue<UShort8> val, RValue<UShort> element, int i)  // Inserts element at lane i of val and returns the resulting vector
+{
+	return RValue<UShort8>(Nucleus::createInsertElement(val.value, element.value, i));  // i is passed through unchecked
+}
+
+RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		return RValue<UShort8>(Nucleus::createInsertElement(val.value, element.value, i));
-	}
+		UShort8 result;
+		result = Insert(result, Extract(lhs, 0) << UShort(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) << UShort(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) << UShort(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) << UShort(rhs), 3);
+		result = Insert(result, Extract(lhs, 4) << UShort(rhs), 4);
+		result = Insert(result, Extract(lhs, 5) << UShort(rhs), 5);
+		result = Insert(result, Extract(lhs, 6) << UShort(rhs), 6);
+		result = Insert(result, Extract(lhs, 7) << UShort(rhs), 7);
 
-	RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
+		return result;
+	}
+	else
 	{
-		if(emulateIntrinsics)
-		{
-			UShort8 result;
-			result = Insert(result, Extract(lhs, 0) << UShort(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) << UShort(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) << UShort(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) << UShort(rhs), 3);
-			result = Insert(result, Extract(lhs, 4) << UShort(rhs), 4);
-			result = Insert(result, Extract(lhs, 5) << UShort(rhs), 5);
-			result = Insert(result, Extract(lhs, 6) << UShort(rhs), 6);
-			result = Insert(result, Extract(lhs, 7) << UShort(rhs), 7);
-
-			return result;
-		}
-		else
-		{
-			return RValue<UShort8>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return RValue<UShort8>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
+RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		if(emulateIntrinsics)
-		{
-			UShort8 result;
-			result = Insert(result, Extract(lhs, 0) >> UShort(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> UShort(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) >> UShort(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) >> UShort(rhs), 3);
-			result = Insert(result, Extract(lhs, 4) >> UShort(rhs), 4);
-			result = Insert(result, Extract(lhs, 5) >> UShort(rhs), 5);
-			result = Insert(result, Extract(lhs, 6) >> UShort(rhs), 6);
-			result = Insert(result, Extract(lhs, 7) >> UShort(rhs), 7);
+		UShort8 result;
+		result = Insert(result, Extract(lhs, 0) >> UShort(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> UShort(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) >> UShort(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) >> UShort(rhs), 3);
+		result = Insert(result, Extract(lhs, 4) >> UShort(rhs), 4);
+		result = Insert(result, Extract(lhs, 5) >> UShort(rhs), 5);
+		result = Insert(result, Extract(lhs, 6) >> UShort(rhs), 6);
+		result = Insert(result, Extract(lhs, 7) >> UShort(rhs), 7);
 
-			return result;
-		}
-		else
-		{
-			return RValue<UShort8>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return result;
 	}
-
-	RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)
+	else
 	{
-		UNIMPLEMENTED("RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)");
-		return UShort8(0);
+		return RValue<UShort8>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
-	{
-		UNIMPLEMENTED("RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)");
-		return UShort8(0);
-	}
+RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)  // Placeholder: the select arguments are not used
+{
+	UNIMPLEMENTED("RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)");  // Flags the call with its own signature as the message
+	return UShort8(0);  // Input is ignored; the result is built from 0
+}
 
-	// FIXME: Implement as Shuffle(x, y, Select(i0, ..., i16)) and Shuffle(x, y, SELECT_PACK_REPEAT(element))
+RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)  // Placeholder: the UShort8 overload does no multiplication
+{
+	UNIMPLEMENTED("RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)");  // Flags the call with its own signature as the message
+	return UShort8(0);  // Inputs are ignored; the result is built from 0
+}
+
+// FIXME: Implement as Shuffle(x, y, Select(i0, ..., i16)) and Shuffle(x, y, SELECT_PACK_REPEAT(element))
 //	RValue<UShort8> PackRepeat(RValue<Byte16> x, RValue<Byte16> y, int element)
 //	{
 //		ASSERT(false && "UNIMPLEMENTED"); return RValue<UShort8>(V(nullptr));
 //	}
 
-	Type *UShort8::getType()
+Type *UShort8::getType()  // Type descriptor for UShort8
+{
+	return T(Ice::IceType_v8i16);  // Same Ice type as Short8::getType() returns
+}
+
+RValue<Int> operator++(Int &val, int)   // Post-increment: yields the value held before the update
+{
+	RValue<Int> res = val;  // Capture the current value first
+	val += 1;
+	return res;
+}
+
+const Int &operator++(Int &val)   // Pre-increment: updates val, then returns a reference to it
+{
+	val += 1;
+	return val;
+}
+
+RValue<Int> operator--(Int &val, int)   // Post-decrement: yields the value held before the update
+{
+	RValue<Int> res = val;  // Capture the current value first
+	val -= 1;
+	return res;
+}
+
+const Int &operator--(Int &val)   // Pre-decrement: updates val, then returns a reference to it
+{
+	val -= 1;
+	return val;
+}
+
+RValue<Int> RoundInt(RValue<Float> cast)
+{
+	if(emulateIntrinsics || CPUID::ARM)
 	{
-		return T(Ice::IceType_v8i16);
+		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
+		return Int((cast + Float(0x00C00000)) - Float(0x00C00000));
 	}
-
-	RValue<Int> operator++(Int &val, int)   // Post-increment
+	else
 	{
-		RValue<Int> res = val;
-		val += 1;
-		return res;
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		nearbyint->addArg(cast.value);
+		::basicBlock->appendInst(nearbyint);
+
+		return RValue<Int>(V(result));
 	}
+}
 
-	const Int &operator++(Int &val)   // Pre-increment
-	{
-		val += 1;
-		return val;
-	}
+Type *Int::getType()  // Type descriptor for Int
+{
+	return T(Ice::IceType_i32);  // Same Ice type as UInt::getType() returns
+}
 
-	RValue<Int> operator--(Int &val, int)   // Post-decrement
-	{
-		RValue<Int> res = val;
-		val -= 1;
-		return res;
-	}
+Type *Long::getType()  // Type descriptor for Long
+{
+	return T(Ice::IceType_i64);
+}
 
-	const Int &operator--(Int &val)   // Pre-decrement
-	{
-		val -= 1;
-		return val;
-	}
+UInt::UInt(RValue<Float> cast)
+{
+	// Smallest positive value representable in UInt, but not in Int
+	const unsigned int ustart = 0x80000000u;
+	const float ustartf = float(ustart);
 
-	RValue<Int> RoundInt(RValue<Float> cast)
-	{
-		if(emulateIntrinsics || CPUID::ARM)
-		{
-			// Push the fractional part off the mantissa. Accurate up to +/-2^22.
-			return Int((cast + Float(0x00C00000)) - Float(0x00C00000));
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			nearbyint->addArg(cast.value);
-			::basicBlock->appendInst(nearbyint);
+	// If the value is negative, store 0, otherwise store the result of the conversion
+	storeValue((~(As<Int>(cast) >> 31) &
+	// Check if the value can be represented as an Int
+		IfThenElse(cast >= ustartf,
+	// If the value is too large, subtract ustart and re-add it after conversion.
+			As<Int>(As<UInt>(Int(cast - Float(ustartf))) + UInt(ustart)),
+	// Otherwise, just convert normally
+			Int(cast))).value);
+}
 
-			return RValue<Int>(V(result));
-		}
-	}
+RValue<UInt> operator++(UInt &val, int)   // Post-increment: yields the value held before the update
+{
+	RValue<UInt> res = val;  // Capture the current value first
+	val += 1;
+	return res;
+}
 
-	Type *Int::getType()
-	{
-		return T(Ice::IceType_i32);
-	}
+const UInt &operator++(UInt &val)   // Pre-increment: updates val, then returns a reference to it
+{
+	val += 1;
+	return val;
+}
 
-	Type *Long::getType()
-	{
-		return T(Ice::IceType_i64);
-	}
+RValue<UInt> operator--(UInt &val, int)   // Post-decrement: yields the value held before the update
+{
+	RValue<UInt> res = val;  // Capture the current value first
+	val -= 1;
+	return res;
+}
 
-	UInt::UInt(RValue<Float> cast)
-	{
-		// Smallest positive value representable in UInt, but not in Int
-		const unsigned int ustart = 0x80000000u;
-		const float ustartf = float(ustart);
-
-		// If the value is negative, store 0, otherwise store the result of the conversion
-		storeValue((~(As<Int>(cast) >> 31) &
-		// Check if the value can be represented as an Int
-			IfThenElse(cast >= ustartf,
-		// If the value is too large, subtract ustart and re-add it after conversion.
-				As<Int>(As<UInt>(Int(cast - Float(ustartf))) + UInt(ustart)),
-		// Otherwise, just convert normally
-				Int(cast))).value);
-	}
-
-	RValue<UInt> operator++(UInt &val, int)   // Post-increment
-	{
-		RValue<UInt> res = val;
-		val += 1;
-		return res;
-	}
-
-	const UInt &operator++(UInt &val)   // Pre-increment
-	{
-		val += 1;
-		return val;
-	}
-
-	RValue<UInt> operator--(UInt &val, int)   // Post-decrement
-	{
-		RValue<UInt> res = val;
-		val -= 1;
-		return res;
-	}
-
-	const UInt &operator--(UInt &val)   // Pre-decrement
-	{
-		val -= 1;
-		return val;
-	}
+const UInt &operator--(UInt &val)   // Pre-decrement: updates val, then returns a reference to it
+{
+	val -= 1;
+	return val;
+}
 
 //	RValue<UInt> RoundUInt(RValue<Float> cast)
 //	{
 //		ASSERT(false && "UNIMPLEMENTED"); return RValue<UInt>(V(nullptr));
 //	}
 
-	Type *UInt::getType()
-	{
-		return T(Ice::IceType_i32);
-	}
+Type *UInt::getType()  // Type descriptor for UInt
+{
+	return T(Ice::IceType_i32);  // Same Ice type as Int::getType() returns
+}
 
 //	Int2::Int2(RValue<Int> cast)
 //	{
@@ -2765,1052 +2764,1052 @@
 //		storeValue(replicate);
 //	}
 
-	RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
+RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		if(emulateIntrinsics)
-		{
-			Int2 result;
-			result = Insert(result, Extract(lhs, 0) << Int(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) << Int(rhs), 1);
+		Int2 result;
+		result = Insert(result, Extract(lhs, 0) << Int(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) << Int(rhs), 1);
 
-			return result;
-		}
-		else
-		{
-			return RValue<Int2>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return result;
 	}
-
-	RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
+	else
 	{
-		if(emulateIntrinsics)
-		{
-			Int2 result;
-			result = Insert(result, Extract(lhs, 0) >> Int(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> Int(rhs), 1);
-
-			return result;
-		}
-		else
-		{
-			return RValue<Int2>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return RValue<Int2>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	Type *Int2::getType()
+RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		return T(Type_v2i32);
+		Int2 result;
+		result = Insert(result, Extract(lhs, 0) >> Int(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> Int(rhs), 1);
+
+		return result;
 	}
-
-	RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
+	else
 	{
-		if(emulateIntrinsics)
-		{
-			UInt2 result;
-			result = Insert(result, Extract(lhs, 0) << UInt(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) << UInt(rhs), 1);
-
-			return result;
-		}
-		else
-		{
-			return RValue<UInt2>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return RValue<Int2>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
+Type *Int2::getType()  // Type descriptor for Int2
+{
+	return T(Type_v2i32);  // Same value as UInt2::getType() returns
+}
+
+RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		if(emulateIntrinsics)
-		{
-			UInt2 result;
-			result = Insert(result, Extract(lhs, 0) >> UInt(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> UInt(rhs), 1);
+		UInt2 result;
+		result = Insert(result, Extract(lhs, 0) << UInt(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) << UInt(rhs), 1);
 
-			return result;
-		}
-		else
-		{
-			return RValue<UInt2>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return result;
 	}
-
-	Type *UInt2::getType()
+	else
 	{
-		return T(Type_v2i32);
+		return RValue<UInt2>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	Int4::Int4(RValue<Byte4> cast) : XYZW(this)
+RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		Value *x = Nucleus::createBitCast(cast.value, Int::getType());
-		Value *a = Nucleus::createInsertElement(loadValue(), x, 0);
+		UInt2 result;
+		result = Insert(result, Extract(lhs, 0) >> UInt(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> UInt(rhs), 1);
 
-		Value *e;
-		int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
-		Value *b = Nucleus::createBitCast(a, Byte16::getType());
-		Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Byte16::getType())), swizzle);
-
-		int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};
-		Value *d = Nucleus::createBitCast(c, Short8::getType());
-		e = Nucleus::createShuffleVector(d, V(Nucleus::createNullValue(Short8::getType())), swizzle2);
-
-		Value *f = Nucleus::createBitCast(e, Int4::getType());
-		storeValue(f);
+		return result;
 	}
-
-	Int4::Int4(RValue<SByte4> cast) : XYZW(this)
+	else
 	{
-		Value *x = Nucleus::createBitCast(cast.value, Int::getType());
-		Value *a = Nucleus::createInsertElement(loadValue(), x, 0);
-
-		int swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
-		Value *b = Nucleus::createBitCast(a, Byte16::getType());
-		Value *c = Nucleus::createShuffleVector(b, b, swizzle);
-
-		int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
-		Value *d = Nucleus::createBitCast(c, Short8::getType());
-		Value *e = Nucleus::createShuffleVector(d, d, swizzle2);
-
-		*this = As<Int4>(e) >> 24;
+		return RValue<UInt2>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	Int4::Int4(RValue<Short4> cast) : XYZW(this)
+Type *UInt2::getType()  // Type descriptor for UInt2
+{
+	return T(Type_v2i32);  // Same value as Int2::getType() returns
+}
+
+Int4::Int4(RValue<Byte4> cast) : XYZW(this)  // Spreads four bytes over four Int lanes by interleaving with null vectors twice
+{
+	Value *x = Nucleus::createBitCast(cast.value, Int::getType());  // View the packed bytes as one Int
+	Value *a = Nucleus::createInsertElement(loadValue(), x, 0);  // Put that Int in lane 0 of this vector's current value
+
+	Value *e;
+	int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};  // Alternates low indices with indices 16..23
+	Value *b = Nucleus::createBitCast(a, Byte16::getType());
+	Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Byte16::getType())), swizzle);  // Second operand is a null Byte16; presumably indices >= 16 select from it - confirm
+
+	int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};  // Same alternating pattern at 16-bit granularity
+	Value *d = Nucleus::createBitCast(c, Short8::getType());
+	e = Nucleus::createShuffleVector(d, V(Nucleus::createNullValue(Short8::getType())), swizzle2);  // Second operand is a null Short8
+
+	Value *f = Nucleus::createBitCast(e, Int4::getType());
+	storeValue(f);
+}
+
+Int4::Int4(RValue<SByte4> cast) : XYZW(this)  // Replicates each byte across a 32-bit lane, then shifts right by 24
+{
+	Value *x = Nucleus::createBitCast(cast.value, Int::getType());  // View the packed bytes as one Int
+	Value *a = Nucleus::createInsertElement(loadValue(), x, 0);  // Put that Int in lane 0 of this vector's current value
+
+	int swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};  // Every byte index appears twice
+	Value *b = Nucleus::createBitCast(a, Byte16::getType());
+	Value *c = Nucleus::createShuffleVector(b, b, swizzle);  // Both operands are b
+
+	int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};  // Every 16-bit index appears twice
+	Value *d = Nucleus::createBitCast(c, Short8::getType());
+	Value *e = Nucleus::createShuffleVector(d, d, swizzle2);  // Both operands are d
+
+	*this = As<Int4>(e) >> 24;  // Uses the Int4 operator>> (createAShr when not emulated)
+}
+
+Int4::Int4(RValue<Short4> cast) : XYZW(this)  // Replicates each 16-bit lane into a 32-bit lane, then shifts right by 16
+{
+	int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};  // Every 16-bit index appears twice
+	Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);  // Both operands are the input
+
+	*this = As<Int4>(c) >> 16;  // Uses the Int4 operator>> (createAShr when not emulated)
+}
+
+Int4::Int4(RValue<UShort4> cast) : XYZW(this)  // Interleaves the 16-bit lanes with an all-zero Short8 and stores the result as Int4
+{
+	int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};  // Alternates low indices with indices 8..11
+	Value *c = Nucleus::createShuffleVector(cast.value, Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);  // Presumably indices >= 8 select from the zero vector - confirm
+	Value *d = Nucleus::createBitCast(c, Int4::getType());
+	storeValue(d);
+}
+
+Int4::Int4(RValue<Int> rhs) : XYZW(this)  // Replicates a single Int into all four lanes
+{
+	Value *vector = Nucleus::createBitCast(rhs.value, Int4::getType());  // Scalar-to-vector bitcast; presumably the value lands in lane 0 - confirm
+
+	int swizzle[4] = {0, 0, 0, 0};  // Every output lane reads index 0
+	Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
+
+	storeValue(replicate);
+}
+
+RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
-		Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);
+		Int4 result;
+		result = Insert(result, Extract(lhs, 0) << Int(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) << Int(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) << Int(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) << Int(rhs), 3);
 
-		*this = As<Int4>(c) >> 16;
+		return result;
 	}
-
-	Int4::Int4(RValue<UShort4> cast) : XYZW(this)
+	else
 	{
-		int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
-		Value *c = Nucleus::createShuffleVector(cast.value, Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
-		Value *d = Nucleus::createBitCast(c, Int4::getType());
-		storeValue(d);
+		return RValue<Int4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	Int4::Int4(RValue<Int> rhs) : XYZW(this)
+RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		Value *vector = Nucleus::createBitCast(rhs.value, Int4::getType());
+		Int4 result;
+		result = Insert(result, Extract(lhs, 0) >> Int(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> Int(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) >> Int(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) >> Int(rhs), 3);
 
-		int swizzle[4] = {0, 0, 0, 0};
-		Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
-
-		storeValue(replicate);
+		return result;
 	}
-
-	RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
+	else
 	{
-		if(emulateIntrinsics)
-		{
-			Int4 result;
-			result = Insert(result, Extract(lhs, 0) << Int(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) << Int(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) << Int(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) << Int(rhs), 3);
-
-			return result;
-		}
-		else
-		{
-			return RValue<Int4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return RValue<Int4>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
+RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)  // Equality compare of x and y, result typed as Int4
+{
+	return RValue<Int4>(Nucleus::createICmpEQ(x.value, y.value));
+}
+
+RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)  // Less-than compare via createICmpSLT, result typed as Int4
+{
+	return RValue<Int4>(Nucleus::createICmpSLT(x.value, y.value));
+}
+
+RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)  // Less-or-equal compare via createICmpSLE, result typed as Int4
+{
+	return RValue<Int4>(Nucleus::createICmpSLE(x.value, y.value));
+}
+
+RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)  // Inequality compare via createICmpNE, result typed as Int4
+{
+	return RValue<Int4>(Nucleus::createICmpNE(x.value, y.value));
+}
+
+RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)  // "Not less than", expressed as createICmpSGE
+{
+	return RValue<Int4>(Nucleus::createICmpSGE(x.value, y.value));
+}
+
+RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)  // "Not less or equal", expressed as createICmpSGT
+{
+	return RValue<Int4>(Nucleus::createICmpSGT(x.value, y.value));
+}
+
+RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)  // Built from an Ice compare followed by a select
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);  // Holds the v4i1 compare result
+	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value, y.value);  // condition = x Sle y
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
+	auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);  // Presumably y where the condition holds, else x - confirm operand order
+	::basicBlock->appendInst(select);
+
+	return RValue<Int4>(V(result));
+}
+
+RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)  // Built from an Ice compare followed by a select
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);  // Holds the v4i1 compare result
+	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value, y.value);  // condition = x Sgt y
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
+	auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);  // Presumably y where the condition holds, else x - confirm operand order
+	::basicBlock->appendInst(select);
+
+	return RValue<Int4>(V(result));
+}
+
+RValue<Int4> RoundInt(RValue<Float4> cast)
+{
+	if(emulateIntrinsics || CPUID::ARM)
 	{
-		if(emulateIntrinsics)
-		{
-			Int4 result;
-			result = Insert(result, Extract(lhs, 0) >> Int(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> Int(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) >> Int(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) >> Int(rhs), 3);
-
-			return result;
-		}
-		else
-		{
-			return RValue<Int4>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
+		return Int4((cast + Float4(0x00C00000)) - Float4(0x00C00000));
 	}
-
-	RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
+	else
 	{
-		return RValue<Int4>(Nucleus::createICmpEQ(x.value, y.value));
-	}
-
-	RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
-	{
-		return RValue<Int4>(Nucleus::createICmpSLT(x.value, y.value));
-	}
-
-	RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
-	{
-		return RValue<Int4>(Nucleus::createICmpSLE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
-	{
-		return RValue<Int4>(Nucleus::createICmpNE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
-	{
-		return RValue<Int4>(Nucleus::createICmpSGE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
-	{
-		return RValue<Int4>(Nucleus::createICmpSGT(x.value, y.value));
-	}
-
-	RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
-		auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
-		auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
-		::basicBlock->appendInst(select);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		nearbyint->addArg(cast.value);
+		::basicBlock->appendInst(nearbyint);
 
 		return RValue<Int4>(V(result));
 	}
+}
 
-	RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
+RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
+{
+	if(emulateIntrinsics)
 	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
-		auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
+		Short8 result;
+		result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
+		result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
+		result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
+		result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
+		result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
+		result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
+		result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
+		result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
 
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
-		auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
-		::basicBlock->appendInst(select);
-
-		return RValue<Int4>(V(result));
+		return result;
 	}
-
-	RValue<Int4> RoundInt(RValue<Float4> cast)
+	else
 	{
-		if(emulateIntrinsics || CPUID::ARM)
-		{
-			// Push the fractional part off the mantissa. Accurate up to +/-2^22.
-			return Int4((cast + Float4(0x00C00000)) - Float4(0x00C00000));
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			nearbyint->addArg(cast.value);
-			::basicBlock->appendInst(nearbyint);
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		pack->addArg(x.value);
+		pack->addArg(y.value);
+		::basicBlock->appendInst(pack);
 
-			return RValue<Int4>(V(result));
-		}
+		return RValue<Short8>(V(result));
 	}
+}
 
-	RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
+RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
+{
+	if(emulateIntrinsics || !(CPUID::SSE4_1 || CPUID::ARM))
 	{
-		if(emulateIntrinsics)
-		{
-			Short8 result;
-			result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
-			result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
-			result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
-			result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
-			result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
-			result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
-			result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
-			result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
+		RValue<Int4> sx = As<Int4>(x);
+		RValue<Int4> bx = (sx & ~(sx >> 31)) - Int4(0x8000);
 
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			pack->addArg(x.value);
-			pack->addArg(y.value);
-			::basicBlock->appendInst(pack);
+		RValue<Int4> sy = As<Int4>(y);
+		RValue<Int4> by = (sy & ~(sy >> 31)) - Int4(0x8000);
 
-			return RValue<Short8>(V(result));
-		}
+		return As<UShort8>(PackSigned(bx, by) + Short8(0x8000u));
 	}
-
-	RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
+	else
 	{
-		if(emulateIntrinsics || !(CPUID::SSE4_1 || CPUID::ARM))
-		{
-			RValue<Int4> sx = As<Int4>(x);
-			RValue<Int4> bx = (sx & ~(sx >> 31)) - Int4(0x8000);
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		pack->addArg(x.value);
+		pack->addArg(y.value);
+		::basicBlock->appendInst(pack);
 
-			RValue<Int4> sy = As<Int4>(y);
-			RValue<Int4> by = (sy & ~(sy >> 31)) - Int4(0x8000);
-
-			return As<UShort8>(PackSigned(bx, by) + Short8(0x8000u));
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			pack->addArg(x.value);
-			pack->addArg(y.value);
-			::basicBlock->appendInst(pack);
-
-			return RValue<UShort8>(V(result));
-		}
+		return RValue<UShort8>(V(result));
 	}
+}
 
-	RValue<Int> SignMask(RValue<Int4> x)
+RValue<Int> SignMask(RValue<Int4> x)
+{
+	if(emulateIntrinsics || CPUID::ARM)
 	{
-		if(emulateIntrinsics || CPUID::ARM)
-		{
-			Int4 xx = (x >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);
-			return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			movmsk->addArg(x.value);
-			::basicBlock->appendInst(movmsk);
-
-			return RValue<Int>(V(result));
-		}
+		Int4 xx = (x >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);
+		return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);
 	}
-
-	Type *Int4::getType()
+	else
 	{
-		return T(Ice::IceType_v4i32);
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		movmsk->addArg(x.value);
+		::basicBlock->appendInst(movmsk);
+
+		return RValue<Int>(V(result));
 	}
+}
 
-	UInt4::UInt4(RValue<Float4> cast) : XYZW(this)
+Type *Int4::getType()
+{
+	return T(Ice::IceType_v4i32);
+}
+
+UInt4::UInt4(RValue<Float4> cast) : XYZW(this)
+{
+	// Smallest positive value representable in UInt, but not in Int
+	const unsigned int ustart = 0x80000000u;
+	const float ustartf = float(ustart);
+
+	// Check if the value can be represented as an Int
+	Int4 uiValue = CmpNLT(cast, Float4(ustartf));
+	// If the value is too large, subtract ustart and re-add it after conversion.
+	uiValue = (uiValue & As<Int4>(As<UInt4>(Int4(cast - Float4(ustartf))) + UInt4(ustart))) |
+	// Otherwise, just convert normally
+	          (~uiValue & Int4(cast));
+	// If the value is negative, store 0, otherwise store the result of the conversion
+	storeValue((~(As<Int4>(cast) >> 31) & uiValue).value);
+}
+
+UInt4::UInt4(RValue<UInt> rhs) : XYZW(this)
+{
+	Value *vector = Nucleus::createBitCast(rhs.value, UInt4::getType());
+
+	int swizzle[4] = {0, 0, 0, 0};
+	Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
+
+	storeValue(replicate);
+}
+
+RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		// Smallest positive value representable in UInt, but not in Int
-		const unsigned int ustart = 0x80000000u;
-		const float ustartf = float(ustart);
+		UInt4 result;
+		result = Insert(result, Extract(lhs, 0) << UInt(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) << UInt(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) << UInt(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) << UInt(rhs), 3);
 
-		// Check if the value can be represented as an Int
-		Int4 uiValue = CmpNLT(cast, Float4(ustartf));
-		// If the value is too large, subtract ustart and re-add it after conversion.
-		uiValue = (uiValue & As<Int4>(As<UInt4>(Int4(cast - Float4(ustartf))) + UInt4(ustart))) |
-		// Otherwise, just convert normally
-		          (~uiValue & Int4(cast));
-		// If the value is negative, store 0, otherwise store the result of the conversion
-		storeValue((~(As<Int4>(cast) >> 31) & uiValue).value);
+		return result;
 	}
-
-	UInt4::UInt4(RValue<UInt> rhs) : XYZW(this)
+	else
 	{
-		Value *vector = Nucleus::createBitCast(rhs.value, UInt4::getType());
-
-		int swizzle[4] = {0, 0, 0, 0};
-		Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
-
-		storeValue(replicate);
+		return RValue<UInt4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
+RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		if(emulateIntrinsics)
-		{
-			UInt4 result;
-			result = Insert(result, Extract(lhs, 0) << UInt(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) << UInt(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) << UInt(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) << UInt(rhs), 3);
+		UInt4 result;
+		result = Insert(result, Extract(lhs, 0) >> UInt(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> UInt(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) >> UInt(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) >> UInt(rhs), 3);
 
-			return result;
-		}
-		else
-		{
-			return RValue<UInt4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return result;
 	}
-
-	RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
+	else
 	{
-		if(emulateIntrinsics)
-		{
-			UInt4 result;
-			result = Insert(result, Extract(lhs, 0) >> UInt(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> UInt(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) >> UInt(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) >> UInt(rhs), 3);
-
-			return result;
-		}
-		else
-		{
-			return RValue<UInt4>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return RValue<UInt4>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
+RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
+{
+	return RValue<UInt4>(Nucleus::createICmpEQ(x.value, y.value));
+}
+
+RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
+{
+	return RValue<UInt4>(Nucleus::createICmpULT(x.value, y.value));
+}
+
+RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
+{
+	return RValue<UInt4>(Nucleus::createICmpULE(x.value, y.value));
+}
+
+RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
+{
+	return RValue<UInt4>(Nucleus::createICmpNE(x.value, y.value));
+}
+
+RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
+{
+	return RValue<UInt4>(Nucleus::createICmpUGE(x.value, y.value));
+}
+
+RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
+{
+	return RValue<UInt4>(Nucleus::createICmpUGT(x.value, y.value));
+}
+
+RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
+	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
+	auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<UInt4>(V(result));
+}
+
+RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
+	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
+	auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<UInt4>(V(result));
+}
+
+Type *UInt4::getType()
+{
+	return T(Ice::IceType_v4i32);
+}
+
+Type *Half::getType()
+{
+	return T(Ice::IceType_i16);
+}
+
+RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
+{
+	return 1.0f / x;
+}
+
+RValue<Float> RcpSqrt_pp(RValue<Float> x)
+{
+	return Rcp_pp(Sqrt(x));
+}
+
+RValue<Float> Sqrt(RValue<Float> x)
+{
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_f32);
+	const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+	auto target = ::context->getConstantUndef(Ice::IceType_i32);
+	auto sqrt = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+	sqrt->addArg(x.value);
+	::basicBlock->appendInst(sqrt);
+
+	return RValue<Float>(V(result));
+}
+
+RValue<Float> Round(RValue<Float> x)
+{
+	return Float4(Round(Float4(x))).x;
+}
+
+RValue<Float> Trunc(RValue<Float> x)
+{
+	return Float4(Trunc(Float4(x))).x;
+}
+
+RValue<Float> Frac(RValue<Float> x)
+{
+	return Float4(Frac(Float4(x))).x;
+}
+
+RValue<Float> Floor(RValue<Float> x)
+{
+	return Float4(Floor(Float4(x))).x;
+}
+
+RValue<Float> Ceil(RValue<Float> x)
+{
+	return Float4(Ceil(Float4(x))).x;
+}
+
+Type *Float::getType()
+{
+	return T(Ice::IceType_f32);
+}
+
+Type *Float2::getType()
+{
+	return T(Type_v2f32);
+}
+
+Float4::Float4(RValue<Float> rhs) : XYZW(this)
+{
+	Value *vector = Nucleus::createBitCast(rhs.value, Float4::getType());
+
+	int swizzle[4] = {0, 0, 0, 0};
+	Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
+
+	storeValue(replicate);
+}
+
+RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
+	auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Ogt, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
+	auto select = Ice::InstSelect::create(::function, result, condition, x.value, y.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<Float4>(V(result));
+}
+
+RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
+	auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Olt, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
+	auto select = Ice::InstSelect::create(::function, result, condition, x.value, y.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<Float4>(V(result));
+}
+
+RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
+{
+	return Float4(1.0f) / x;
+}
+
+RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
+{
+	return Rcp_pp(Sqrt(x));
+}
+
+RValue<Float4> Sqrt(RValue<Float4> x)
+{
+	if(emulateIntrinsics || CPUID::ARM)
 	{
-		return RValue<UInt4>(Nucleus::createICmpEQ(x.value, y.value));
+		Float4 result;
+		result.x = Sqrt(Float(Float4(x).x));
+		result.y = Sqrt(Float(Float4(x).y));
+		result.z = Sqrt(Float(Float4(x).z));
+		result.w = Sqrt(Float(Float4(x).w));
+
+		return result;
 	}
-
-	RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
+	else
 	{
-		return RValue<UInt4>(Nucleus::createICmpULT(x.value, y.value));
-	}
-
-	RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		return RValue<UInt4>(Nucleus::createICmpULE(x.value, y.value));
-	}
-
-	RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		return RValue<UInt4>(Nucleus::createICmpNE(x.value, y.value));
-	}
-
-	RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		return RValue<UInt4>(Nucleus::createICmpUGE(x.value, y.value));
-	}
-
-	RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		return RValue<UInt4>(Nucleus::createICmpUGT(x.value, y.value));
-	}
-
-	RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
-		auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
-		auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
-		::basicBlock->appendInst(select);
-
-		return RValue<UInt4>(V(result));
-	}
-
-	RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
-		auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
-		auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
-		::basicBlock->appendInst(select);
-
-		return RValue<UInt4>(V(result));
-	}
-
-	Type *UInt4::getType()
-	{
-		return T(Ice::IceType_v4i32);
-	}
-
-	Type *Half::getType()
-	{
-		return T(Ice::IceType_i16);
-	}
-
-	RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
-	{
-		return 1.0f / x;
-	}
-
-	RValue<Float> RcpSqrt_pp(RValue<Float> x)
-	{
-		return Rcp_pp(Sqrt(x));
-	}
-
-	RValue<Float> Sqrt(RValue<Float> x)
-	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_f32);
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
 		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
 		auto target = ::context->getConstantUndef(Ice::IceType_i32);
 		auto sqrt = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
 		sqrt->addArg(x.value);
 		::basicBlock->appendInst(sqrt);
 
-		return RValue<Float>(V(result));
-	}
-
-	RValue<Float> Round(RValue<Float> x)
-	{
-		return Float4(Round(Float4(x))).x;
-	}
-
-	RValue<Float> Trunc(RValue<Float> x)
-	{
-		return Float4(Trunc(Float4(x))).x;
-	}
-
-	RValue<Float> Frac(RValue<Float> x)
-	{
-		return Float4(Frac(Float4(x))).x;
-	}
-
-	RValue<Float> Floor(RValue<Float> x)
-	{
-		return Float4(Floor(Float4(x))).x;
-	}
-
-	RValue<Float> Ceil(RValue<Float> x)
-	{
-		return Float4(Ceil(Float4(x))).x;
-	}
-
-	Type *Float::getType()
-	{
-		return T(Ice::IceType_f32);
-	}
-
-	Type *Float2::getType()
-	{
-		return T(Type_v2f32);
-	}
-
-	Float4::Float4(RValue<Float> rhs) : XYZW(this)
-	{
-		Value *vector = Nucleus::createBitCast(rhs.value, Float4::getType());
-
-		int swizzle[4] = {0, 0, 0, 0};
-		Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
-
-		storeValue(replicate);
-	}
-
-	RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
-		auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Ogt, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
-		auto select = Ice::InstSelect::create(::function, result, condition, x.value, y.value);
-		::basicBlock->appendInst(select);
-
 		return RValue<Float4>(V(result));
 	}
-
-	RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
-		auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Olt, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
-		auto select = Ice::InstSelect::create(::function, result, condition, x.value, y.value);
-		::basicBlock->appendInst(select);
-
-		return RValue<Float4>(V(result));
-	}
-
-	RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
-	{
-		return Float4(1.0f) / x;
-	}
-
-	RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
-	{
-		return Rcp_pp(Sqrt(x));
-	}
-
-	RValue<Float4> Sqrt(RValue<Float4> x)
-	{
-		if(emulateIntrinsics || CPUID::ARM)
-		{
-			Float4 result;
-			result.x = Sqrt(Float(Float4(x).x));
-			result.y = Sqrt(Float(Float4(x).y));
-			result.z = Sqrt(Float(Float4(x).z));
-			result.w = Sqrt(Float(Float4(x).w));
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto sqrt = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			sqrt->addArg(x.value);
-			::basicBlock->appendInst(sqrt);
-
-			return RValue<Float4>(V(result));
-		}
-	}
-
-	RValue<Int> SignMask(RValue<Float4> x)
-	{
-		if(emulateIntrinsics || CPUID::ARM)
-		{
-			Int4 xx = (As<Int4>(x) >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);
-			return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			movmsk->addArg(x.value);
-			::basicBlock->appendInst(movmsk);
-
-			return RValue<Int>(V(result));
-		}
-	}
-
-	RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpOEQ(x.value, y.value));
-	}
-
-	RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpOLT(x.value, y.value));
-	}
-
-	RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpOLE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpONE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpOGE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpOGT(x.value, y.value));
-	}
-
-	RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpUEQ(x.value, y.value));
-	}
-
-	RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpULT(x.value, y.value));
-	}
-
-	RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpULE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpUNE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpUGE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpUGT(x.value, y.value));
-	}
-
-	RValue<Float4> Round(RValue<Float4> x)
-	{
-		if(emulateIntrinsics || CPUID::ARM)
-		{
-			// Push the fractional part off the mantissa. Accurate up to +/-2^22.
-			return (x + Float4(0x00C00000)) - Float4(0x00C00000);
-		}
-		else if(CPUID::SSE4_1)
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			round->addArg(x.value);
-			round->addArg(::context->getConstantInt32(0));
-			::basicBlock->appendInst(round);
-
-			return RValue<Float4>(V(result));
-		}
-		else
-		{
-			return Float4(RoundInt(x));
-		}
-	}
-
-	RValue<Float4> Trunc(RValue<Float4> x)
-	{
-		if(CPUID::SSE4_1)
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			round->addArg(x.value);
-			round->addArg(::context->getConstantInt32(3));
-			::basicBlock->appendInst(round);
-
-			return RValue<Float4>(V(result));
-		}
-		else
-		{
-			return Float4(Int4(x));
-		}
-	}
-
-	RValue<Float4> Frac(RValue<Float4> x)
-	{
-		Float4 frc;
-
-		if(CPUID::SSE4_1)
-		{
-			frc = x - Floor(x);
-		}
-		else
-		{
-			frc = x - Float4(Int4(x));   // Signed fractional part.
-
-			frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1, 1, 1, 1)));   // Add 1.0 if negative.
-		}
-
-		// x - floor(x) can be 1.0 for very small negative x.
-		// Clamp against the value just below 1.0.
-		return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
-	}
-
-	RValue<Float4> Floor(RValue<Float4> x)
-	{
-		if(CPUID::SSE4_1)
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			round->addArg(x.value);
-			round->addArg(::context->getConstantInt32(1));
-			::basicBlock->appendInst(round);
-
-			return RValue<Float4>(V(result));
-		}
-		else
-		{
-			return x - Frac(x);
-		}
-	}
-
-	RValue<Float4> Ceil(RValue<Float4> x)
-	{
-		if(CPUID::SSE4_1)
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			round->addArg(x.value);
-			round->addArg(::context->getConstantInt32(2));
-			::basicBlock->appendInst(round);
-
-			return RValue<Float4>(V(result));
-		}
-		else
-		{
-			return -Floor(-x);
-		}
-	}
-
-	Type *Float4::getType()
-	{
-		return T(Ice::IceType_v4f32);
-	}
-
-	RValue<Long> Ticks()
-	{
-		UNIMPLEMENTED("RValue<Long> Ticks()");
-		return Long(Int(0));
-	}
-
-	RValue<Pointer<Byte>> ConstantPointer(void const * ptr)
-	{
-		if (sizeof(void*) == 8)
-		{
-			return RValue<Pointer<Byte>>(V(::context->getConstantInt64(reinterpret_cast<intptr_t>(ptr))));
-		}
-		else
-		{
-			return RValue<Pointer<Byte>>(V(::context->getConstantInt32(reinterpret_cast<intptr_t>(ptr))));
-		}
-	}
-
-	RValue<Pointer<Byte>> ConstantData(void const * data, size_t size)
-	{
-		// TODO: Try to use Ice::VariableDeclaration::DataInitializer and
-		// getConstantSym instead of tagging data on the routine.
-		return ConstantPointer(::routine->addConstantData(data, size));
-	}
-
-	Value* Call(RValue<Pointer<Byte>> fptr, Type* retTy, std::initializer_list<Value*> args, std::initializer_list<Type*> argTys)
-	{
-		Ice::Variable *ret = nullptr;
-		if (retTy != nullptr)
-		{
-			ret = ::function->makeVariable(T(retTy));
-		}
-		auto call = Ice::InstCall::create(::function, args.size(), ret, V(fptr.value), false);
-		for (auto arg : args)
-		{
-			call->addArg(V(arg));
-		}
-		::basicBlock->appendInst(call);
-		return V(ret);
-	}
-
-	void Breakpoint()
-	{
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Trap, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto trap = Ice::InstIntrinsicCall::create(::function, 0, nullptr, target, intrinsic);
-		::basicBlock->appendInst(trap);
-	}
-
-	void Nucleus::createFence(std::memory_order memoryOrder) { UNIMPLEMENTED("Subzero createFence()"); }
-	Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes) { UNIMPLEMENTED("Subzero createMaskedLoad()"); return nullptr; }
-	void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment) { UNIMPLEMENTED("Subzero createMaskedStore()"); }
-
-	RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-	{
-		return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
-	}
-
-	RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-	{
-		return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
-	}
-
-	void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-	{
-		return emulated::Scatter(base, val, offsets, mask, alignment);
-	}
-
-	void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-	{
-		return emulated::Scatter(base, val, offsets, mask, alignment);
-	}
-
-	RValue<Float> Exp2(RValue<Float> x)
-	{
-		return emulated::Exp2(x);
-	}
-
-	RValue<Float> Log2(RValue<Float> x)
-	{
-		return emulated::Log2(x);
-	}
-
-	RValue<Float4> Sin(RValue<Float4> x)
-	{
-		return emulated::Sin(x);
-	}
-
-	RValue<Float4> Cos(RValue<Float4> x)
-	{
-		return emulated::Cos(x);
-	}
-
-	RValue<Float4> Tan(RValue<Float4> x)
-	{
-		return emulated::Tan(x);
-	}
-
-	RValue<Float4> Asin(RValue<Float4> x)
-	{
-		return emulated::Asin(x);
-	}
-
-	RValue<Float4> Acos(RValue<Float4> x)
-	{
-		return emulated::Acos(x);
-	}
-
-	RValue<Float4> Atan(RValue<Float4> x)
-	{
-		return emulated::Atan(x);
-	}
-
-	RValue<Float4> Sinh(RValue<Float4> x)
-	{
-		return emulated::Sinh(x);
-	}
-
-	RValue<Float4> Cosh(RValue<Float4> x)
-	{
-		return emulated::Cosh(x);
-	}
-
-	RValue<Float4> Tanh(RValue<Float4> x)
-	{
-		return emulated::Tanh(x);
-	}
-
-	RValue<Float4> Asinh(RValue<Float4> x)
-	{
-		return emulated::Asinh(x);
-	}
-
-	RValue<Float4> Acosh(RValue<Float4> x)
-	{
-		return emulated::Acosh(x);
-	}
-
-	RValue<Float4> Atanh(RValue<Float4> x)
-	{
-		return emulated::Atanh(x);
-	}
-
-	RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
-	{
-		return emulated::Atan2(x, y);
-	}
-
-	RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
-	{
-		return emulated::Pow(x, y);
-	}
-
-	RValue<Float4> Exp(RValue<Float4> x)
-	{
-		return emulated::Exp(x);
-	}
-
-	RValue<Float4> Log(RValue<Float4> x)
-	{
-		return emulated::Log(x);
-	}
-
-	RValue<Float4> Exp2(RValue<Float4> x)
-	{
-		return emulated::Exp2(x);
-	}
-
-	RValue<Float4> Log2(RValue<Float4> x)
-	{
-		return emulated::Log2(x);
-	}
-
-	RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef)
-	{
-		if (emulateIntrinsics)
-		{
-			UNIMPLEMENTED("Subzero Ctlz()"); return UInt(0);
-		}
-		else
-		{
-			Ice::Variable* result = ::function->makeVariable(Ice::IceType_i32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Ctlz, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto ctlz = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			ctlz->addArg(x.value);
-			::basicBlock->appendInst(ctlz);
-
-			return RValue<UInt>(V(result));
-		}
-	}
-
-	RValue<UInt4> Ctlz(RValue<UInt4> x, bool isZeroUndef)
-	{
-		if (emulateIntrinsics)
-		{
-			UNIMPLEMENTED("Subzero Ctlz()"); return UInt4(0);
-		}
-		else
-		{
-			// TODO: implement vectorized version in Subzero
-			UInt4 result;
-			result = Insert(result, Ctlz(Extract(x, 0), isZeroUndef), 0);
-			result = Insert(result, Ctlz(Extract(x, 1), isZeroUndef), 1);
-			result = Insert(result, Ctlz(Extract(x, 2), isZeroUndef), 2);
-			result = Insert(result, Ctlz(Extract(x, 3), isZeroUndef), 3);
-			return result;
-		}
-	}
-
-	RValue<UInt> Cttz(RValue<UInt> x, bool isZeroUndef)
-	{
-		if (emulateIntrinsics)
-		{
-			UNIMPLEMENTED("Subzero Cttz()"); return UInt(0);
-		}
-		else
-		{
-			Ice::Variable* result = ::function->makeVariable(Ice::IceType_i32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Cttz, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto ctlz = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			ctlz->addArg(x.value);
-			::basicBlock->appendInst(ctlz);
-
-			return RValue<UInt>(V(result));
-		}
-	}
-
-	RValue<UInt4> Cttz(RValue<UInt4> x, bool isZeroUndef)
-	{
-		if (emulateIntrinsics)
-		{
-			UNIMPLEMENTED("Subzero Cttz()"); return UInt4(0);
-		}
-		else
-		{
-			// TODO: implement vectorized version in Subzero
-			UInt4 result;
-			result = Insert(result, Cttz(Extract(x, 0), isZeroUndef), 0);
-			result = Insert(result, Cttz(Extract(x, 1), isZeroUndef), 1);
-			result = Insert(result, Cttz(Extract(x, 2), isZeroUndef), 2);
-			result = Insert(result, Cttz(Extract(x, 3), isZeroUndef), 3);
-			return result;
-		}
-	}
-
-	void EmitDebugLocation() {}
-	void EmitDebugVariable(Value* value) {}
-	void FlushDebug() {}
-
-	void Nucleus::createCoroutine(Type *YieldType, std::vector<Type*> &Params)
-	{
-		// Subzero currently only supports coroutines as functions (i.e. that do not yield)
-		createFunction(YieldType, Params);
-	}
-
-	static bool coroutineEntryAwaitStub(Nucleus::CoroutineHandle, void* yieldValue) { return false; }
-	static void coroutineEntryDestroyStub(Nucleus::CoroutineHandle) {}
-
-	std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
-	{
-		// acquireRoutine sets the CoroutineEntryBegin entry
-		auto coroutineEntry = acquireRoutine(name, cfgEdit);
-
-		// For now, set the await and destroy entries to stubs, until we add proper coroutine support to the Subzero backend
-		auto routine = std::static_pointer_cast<ELFMemoryStreamer>(coroutineEntry);
-		routine->setEntry(Nucleus::CoroutineEntryAwait, reinterpret_cast<const void*>(&coroutineEntryAwaitStub));
-		routine->setEntry(Nucleus::CoroutineEntryDestroy, reinterpret_cast<const void*>(&coroutineEntryDestroyStub));
-
-		return coroutineEntry;
-	}
-
-	void Nucleus::yield(Value* val) { UNIMPLEMENTED("Yield"); }
-
 }
+
+RValue<Int> SignMask(RValue<Float4> x)
+{
+	if(emulateIntrinsics || CPUID::ARM)
+	{
+		Int4 xx = (As<Int4>(x) >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);
+		return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		movmsk->addArg(x.value);
+		::basicBlock->appendInst(movmsk);
+
+		return RValue<Int>(V(result));
+	}
+}
+
+RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpOEQ(x.value, y.value));
+}
+
+RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpOLT(x.value, y.value));
+}
+
+RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpOLE(x.value, y.value));
+}
+
+RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpONE(x.value, y.value));
+}
+
+RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpOGE(x.value, y.value));
+}
+
+RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpOGT(x.value, y.value));
+}
+
+RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpUEQ(x.value, y.value));
+}
+
+RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpULT(x.value, y.value));
+}
+
+RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpULE(x.value, y.value));
+}
+
+RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpUNE(x.value, y.value));
+}
+
+RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpUGE(x.value, y.value));
+}
+
+RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpUGT(x.value, y.value));
+}
+
+RValue<Float4> Round(RValue<Float4> x)
+{
+	if(emulateIntrinsics || CPUID::ARM)
+	{
+		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
+		return (x + Float4(0x00C00000)) - Float4(0x00C00000);
+	}
+	else if(CPUID::SSE4_1)
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		round->addArg(x.value);
+		round->addArg(::context->getConstantInt32(0));
+		::basicBlock->appendInst(round);
+
+		return RValue<Float4>(V(result));
+	}
+	else
+	{
+		return Float4(RoundInt(x));
+	}
+}
+
+RValue<Float4> Trunc(RValue<Float4> x)
+{
+	if(CPUID::SSE4_1)
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		round->addArg(x.value);
+		round->addArg(::context->getConstantInt32(3));
+		::basicBlock->appendInst(round);
+
+		return RValue<Float4>(V(result));
+	}
+	else
+	{
+		return Float4(Int4(x));
+	}
+}
+
+RValue<Float4> Frac(RValue<Float4> x)
+{
+	Float4 frc;
+
+	if(CPUID::SSE4_1)
+	{
+		frc = x - Floor(x);
+	}
+	else
+	{
+		frc = x - Float4(Int4(x));   // Signed fractional part.
+
+		frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1, 1, 1, 1)));   // Add 1.0 if negative.
+	}
+
+	// x - floor(x) can be 1.0 for very small negative x.
+	// Clamp against the value just below 1.0.
+	return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
+}
+
+RValue<Float4> Floor(RValue<Float4> x)
+{
+	if(CPUID::SSE4_1)
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		round->addArg(x.value);
+		round->addArg(::context->getConstantInt32(1));
+		::basicBlock->appendInst(round);
+
+		return RValue<Float4>(V(result));
+	}
+	else
+	{
+		return x - Frac(x);
+	}
+}
+
+RValue<Float4> Ceil(RValue<Float4> x)
+{
+	if(CPUID::SSE4_1)
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		round->addArg(x.value);
+		round->addArg(::context->getConstantInt32(2));
+		::basicBlock->appendInst(round);
+
+		return RValue<Float4>(V(result));
+	}
+	else
+	{
+		return -Floor(-x);
+	}
+}
+
+Type *Float4::getType()
+{
+	return T(Ice::IceType_v4f32);
+}
+
+RValue<Long> Ticks()
+{
+	UNIMPLEMENTED("RValue<Long> Ticks()");
+	return Long(Int(0));
+}
+
+RValue<Pointer<Byte>> ConstantPointer(void const * ptr)
+{
+	if (sizeof(void*) == 8)
+	{
+		return RValue<Pointer<Byte>>(V(::context->getConstantInt64(reinterpret_cast<intptr_t>(ptr))));
+	}
+	else
+	{
+		return RValue<Pointer<Byte>>(V(::context->getConstantInt32(reinterpret_cast<intptr_t>(ptr))));
+	}
+}
+
+RValue<Pointer<Byte>> ConstantData(void const * data, size_t size)
+{
+	// TODO: Try to use Ice::VariableDeclaration::DataInitializer and
+	// getConstantSym instead of tagging data on the routine.
+	return ConstantPointer(::routine->addConstantData(data, size));
+}
+
+Value* Call(RValue<Pointer<Byte>> fptr, Type* retTy, std::initializer_list<Value*> args, std::initializer_list<Type*> argTys)
+{
+	Ice::Variable *ret = nullptr;
+	if (retTy != nullptr)
+	{
+		ret = ::function->makeVariable(T(retTy));
+	}
+	auto call = Ice::InstCall::create(::function, args.size(), ret, V(fptr.value), false);
+	for (auto arg : args)
+	{
+		call->addArg(V(arg));
+	}
+	::basicBlock->appendInst(call);
+	return V(ret);
+}
+
+void Breakpoint()
+{
+	const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Trap, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+	auto target = ::context->getConstantUndef(Ice::IceType_i32);
+	auto trap = Ice::InstIntrinsicCall::create(::function, 0, nullptr, target, intrinsic);
+	::basicBlock->appendInst(trap);
+}
+
+void Nucleus::createFence(std::memory_order memoryOrder) { UNIMPLEMENTED("Subzero createFence()"); }
+Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes) { UNIMPLEMENTED("Subzero createMaskedLoad()"); return nullptr; }
+void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment) { UNIMPLEMENTED("Subzero createMaskedStore()"); }
+
+RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+{
+	return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
+}
+
+RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+{
+	return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
+}
+
+void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+{
+	return emulated::Scatter(base, val, offsets, mask, alignment);
+}
+
+void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+{
+	return emulated::Scatter(base, val, offsets, mask, alignment);
+}
+
+RValue<Float> Exp2(RValue<Float> x)
+{
+	return emulated::Exp2(x);
+}
+
+RValue<Float> Log2(RValue<Float> x)
+{
+	return emulated::Log2(x);
+}
+
+RValue<Float4> Sin(RValue<Float4> x)
+{
+	return emulated::Sin(x);
+}
+
+RValue<Float4> Cos(RValue<Float4> x)
+{
+	return emulated::Cos(x);
+}
+
+RValue<Float4> Tan(RValue<Float4> x)
+{
+	return emulated::Tan(x);
+}
+
+RValue<Float4> Asin(RValue<Float4> x)
+{
+	return emulated::Asin(x);
+}
+
+RValue<Float4> Acos(RValue<Float4> x)
+{
+	return emulated::Acos(x);
+}
+
+RValue<Float4> Atan(RValue<Float4> x)
+{
+	return emulated::Atan(x);
+}
+
+RValue<Float4> Sinh(RValue<Float4> x)
+{
+	return emulated::Sinh(x);
+}
+
+RValue<Float4> Cosh(RValue<Float4> x)
+{
+	return emulated::Cosh(x);
+}
+
+RValue<Float4> Tanh(RValue<Float4> x)
+{
+	return emulated::Tanh(x);
+}
+
+RValue<Float4> Asinh(RValue<Float4> x)
+{
+	return emulated::Asinh(x);
+}
+
+RValue<Float4> Acosh(RValue<Float4> x)
+{
+	return emulated::Acosh(x);
+}
+
+RValue<Float4> Atanh(RValue<Float4> x)
+{
+	return emulated::Atanh(x);
+}
+
+RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
+{
+	return emulated::Atan2(x, y);
+}
+
+RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
+{
+	return emulated::Pow(x, y);
+}
+
+RValue<Float4> Exp(RValue<Float4> x)
+{
+	return emulated::Exp(x);
+}
+
+RValue<Float4> Log(RValue<Float4> x)
+{
+	return emulated::Log(x);
+}
+
+RValue<Float4> Exp2(RValue<Float4> x)
+{
+	return emulated::Exp2(x);
+}
+
+RValue<Float4> Log2(RValue<Float4> x)
+{
+	return emulated::Log2(x);
+}
+
+RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef)
+{
+	if (emulateIntrinsics)
+	{
+		UNIMPLEMENTED("Subzero Ctlz()"); return UInt(0);
+	}
+	else
+	{
+		Ice::Variable* result = ::function->makeVariable(Ice::IceType_i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Ctlz, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto ctlz = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		ctlz->addArg(x.value);
+		::basicBlock->appendInst(ctlz);
+
+		return RValue<UInt>(V(result));
+	}
+}
+
+RValue<UInt4> Ctlz(RValue<UInt4> x, bool isZeroUndef)
+{
+	if (emulateIntrinsics)
+	{
+		UNIMPLEMENTED("Subzero Ctlz()"); return UInt4(0);
+	}
+	else
+	{
+		// TODO: implement vectorized version in Subzero
+		UInt4 result;
+		result = Insert(result, Ctlz(Extract(x, 0), isZeroUndef), 0);
+		result = Insert(result, Ctlz(Extract(x, 1), isZeroUndef), 1);
+		result = Insert(result, Ctlz(Extract(x, 2), isZeroUndef), 2);
+		result = Insert(result, Ctlz(Extract(x, 3), isZeroUndef), 3);
+		return result;
+	}
+}
+
+RValue<UInt> Cttz(RValue<UInt> x, bool isZeroUndef)
+{
+	if (emulateIntrinsics)
+	{
+		UNIMPLEMENTED("Subzero Cttz()"); return UInt(0);
+	}
+	else
+	{
+		Ice::Variable* result = ::function->makeVariable(Ice::IceType_i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Cttz, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto cttz = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		cttz->addArg(x.value);
+		::basicBlock->appendInst(cttz);
+		// Wrap the intrinsic's i32 result. Note: isZeroUndef is not consulted on this path.
+		return RValue<UInt>(V(result));
+	}
+}
+
+RValue<UInt4> Cttz(RValue<UInt4> x, bool isZeroUndef)
+{
+	if (emulateIntrinsics)
+	{
+		UNIMPLEMENTED("Subzero Cttz()"); return UInt4(0);
+	}
+	else
+	{
+		// TODO: implement vectorized version in Subzero
+		UInt4 result;
+		result = Insert(result, Cttz(Extract(x, 0), isZeroUndef), 0);
+		result = Insert(result, Cttz(Extract(x, 1), isZeroUndef), 1);
+		result = Insert(result, Cttz(Extract(x, 2), isZeroUndef), 2);
+		result = Insert(result, Cttz(Extract(x, 3), isZeroUndef), 3);
+		return result;
+	}
+}
+
+void EmitDebugLocation() {}
+void EmitDebugVariable(Value* value) {}
+void FlushDebug() {}
+
+void Nucleus::createCoroutine(Type *YieldType, std::vector<Type*> &Params)
+{
+	// Subzero currently only supports coroutines as functions (i.e. that do not yield)
+	createFunction(YieldType, Params);
+}
+
+static bool coroutineEntryAwaitStub(Nucleus::CoroutineHandle, void* yieldValue) { return false; }
+static void coroutineEntryDestroyStub(Nucleus::CoroutineHandle) {}
+
+std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
+{
+	// acquireRoutine sets the CoroutineEntryBegin entry
+	auto coroutineEntry = acquireRoutine(name, cfgEdit);
+
+	// For now, set the await and destroy entries to stubs, until we add proper coroutine support to the Subzero backend
+	auto routine = std::static_pointer_cast<ELFMemoryStreamer>(coroutineEntry);
+	routine->setEntry(Nucleus::CoroutineEntryAwait, reinterpret_cast<const void*>(&coroutineEntryAwaitStub));
+	routine->setEntry(Nucleus::CoroutineEntryDestroy, reinterpret_cast<const void*>(&coroutineEntryDestroyStub));
+
+	return coroutineEntry;
+}
+
+void Nucleus::yield(Value* val) { UNIMPLEMENTED("Yield"); }
+
+}  // namespace rr

diff --git a/src/Reactor/Thread.cpp b/src/Reactor/Thread.cpp
index 7a9b07d..46f3550 100644
--- a/src/Reactor/Thread.cpp
+++ b/src/Reactor/Thread.cpp

@@ -14,78 +14,79 @@
 
 #include "Thread.hpp"
 
-namespace rr
+namespace rr {
+
+Thread::Thread(void (*threadFunction)(void *parameters), void *parameters)
 {
-	Thread::Thread(void (*threadFunction)(void *parameters), void *parameters)
-	{
-		Event init;
-		Entry entry = {threadFunction, parameters, &init};
-
-		#if defined(_WIN32)
-			handle = CreateThread(NULL, 1024 * 1024, startFunction, &entry, 0, NULL);
-		#else
-			pthread_create(&handle, NULL, startFunction, &entry);
-		#endif
-
-		init.wait();
-	}
-
-	Thread::~Thread()
-	{
-		join();   // Make threads exit before deleting them to not block here
-	}
-
-	void Thread::join()
-	{
-		if(!hasJoined)
-		{
-			#if defined(_WIN32)
-				WaitForSingleObject(handle, INFINITE);
-				CloseHandle(handle);
-			#else
-				pthread_join(handle, NULL);
-			#endif
-
-			hasJoined = true;
-		}
-	}
+	Event init;
+	Entry entry = {threadFunction, parameters, &init};
 
 	#if defined(_WIN32)
-		unsigned long __stdcall Thread::startFunction(void *parameters)
-		{
-			Entry entry = *(Entry*)parameters;
-			entry.init->signal();
-			entry.threadFunction(entry.threadParameters);
-			return 0;
-		}
+		handle = CreateThread(NULL, 1024 * 1024, startFunction, &entry, 0, NULL);
 	#else
-		void *Thread::startFunction(void *parameters)
-		{
-			Entry entry = *(Entry*)parameters;
-			entry.init->signal();
-			entry.threadFunction(entry.threadParameters);
-			return nullptr;
-		}
+		pthread_create(&handle, NULL, startFunction, &entry);
 	#endif
 
-	Event::Event()
-	{
-		#if defined(_WIN32)
-			handle = CreateEvent(NULL, FALSE, FALSE, NULL);
-		#else
-			pthread_cond_init(&handle, NULL);
-			pthread_mutex_init(&mutex, NULL);
-			signaled = false;
-		#endif
-	}
+	init.wait();
+}
 
-	Event::~Event()
+Thread::~Thread()
+{
+	join();   // Make threads exit before deleting them to not block here
+}
+
+void Thread::join()
+{
+	if(!hasJoined)
 	{
 		#if defined(_WIN32)
+			WaitForSingleObject(handle, INFINITE);
 			CloseHandle(handle);
 		#else
-			pthread_cond_destroy(&handle);
-			pthread_mutex_destroy(&mutex);
+			pthread_join(handle, NULL);
 		#endif
+
+		hasJoined = true;
 	}
 }
+
+#if defined(_WIN32)
+	unsigned long __stdcall Thread::startFunction(void *parameters)
+	{
+		Entry entry = *(Entry*)parameters;
+		entry.init->signal();
+		entry.threadFunction(entry.threadParameters);
+		return 0;
+	}
+#else
+	void *Thread::startFunction(void *parameters)
+	{
+		Entry entry = *(Entry*)parameters;
+		entry.init->signal();
+		entry.threadFunction(entry.threadParameters);
+		return nullptr;
+	}
+#endif
+
+Event::Event()
+{
+	#if defined(_WIN32)
+		handle = CreateEvent(NULL, FALSE, FALSE, NULL);
+	#else
+		pthread_cond_init(&handle, NULL);
+		pthread_mutex_init(&mutex, NULL);
+		signaled = false;
+	#endif
+}
+
+Event::~Event()
+{
+	#if defined(_WIN32)
+		CloseHandle(handle);
+	#else
+		pthread_cond_destroy(&handle);
+		pthread_mutex_destroy(&mutex);
+	#endif
+}
+
+}  // namespace rr

diff --git a/src/Reactor/Thread.hpp b/src/Reactor/Thread.hpp
index ed3bb55..7feb61e 100644
--- a/src/Reactor/Thread.hpp
+++ b/src/Reactor/Thread.hpp

@@ -43,296 +43,300 @@
 #include <atomic>
 #endif
 
-namespace rr
+namespace rr {
+
+class Event;
+
+class Thread
 {
-	class Event;
+public:
+	Thread(void (*threadFunction)(void *parameters), void *parameters);
 
-	class Thread
-	{
-	public:
-		Thread(void (*threadFunction)(void *parameters), void *parameters);
+	~Thread();
 
-		~Thread();
+	void join();
 
-		void join();
+	static void yield();
+	static void sleep(int milliseconds);
 
-		static void yield();
-		static void sleep(int milliseconds);
-
-		#if defined(_WIN32)
-			typedef DWORD LocalStorageKey;
-		#else
-			typedef pthread_key_t LocalStorageKey;
-		#endif
-
-		static LocalStorageKey allocateLocalStorageKey(void (*destructor)(void *storage) = free);
-		static void freeLocalStorageKey(LocalStorageKey key);
-		static void *allocateLocalStorage(LocalStorageKey key, size_t size);
-		static void *getLocalStorage(LocalStorageKey key);
-		static void freeLocalStorage(LocalStorageKey key);
-
-	private:
-		struct Entry
-		{
-			void (*const threadFunction)(void *parameters);
-			void *threadParameters;
-			Event *init;
-		};
-
-		#if defined(_WIN32)
-			static unsigned long __stdcall startFunction(void *parameters);
-			HANDLE handle;
-		#else
-			static void *startFunction(void *parameters);
-			pthread_t handle;
-		#endif
-
-		bool hasJoined = false;
-	};
-
-	class Event
-	{
-		friend class Thread;
-
-	public:
-		Event();
-
-		~Event();
-
-		void signal();
-		void wait();
-
-	private:
-		#if defined(_WIN32)
-			HANDLE handle;
-		#else
-			pthread_cond_t handle;
-			pthread_mutex_t mutex;
-			volatile bool signaled;
-		#endif
-	};
-
-	#if PERF_PROFILE
-	int64_t atomicExchange(int64_t volatile *target, int64_t value);
-	int atomicExchange(int volatile *target, int value);
+	#if defined(_WIN32)
+		typedef DWORD LocalStorageKey;
+	#else
+		typedef pthread_key_t LocalStorageKey;
 	#endif
 
-	int atomicIncrement(int volatile *value);
-	int atomicDecrement(int volatile *value);
-	int atomicAdd(int volatile *target, int value);
-	void nop();
+	static LocalStorageKey allocateLocalStorageKey(void (*destructor)(void *storage) = free);
+	static void freeLocalStorageKey(LocalStorageKey key);
+	static void *allocateLocalStorage(LocalStorageKey key, size_t size);
+	static void *getLocalStorage(LocalStorageKey key);
+	static void freeLocalStorage(LocalStorageKey key);
+
+private:
+	struct Entry
+	{
+		void (*const threadFunction)(void *parameters);
+		void *threadParameters;
+		Event *init;
+	};
+
+	#if defined(_WIN32)
+		static unsigned long __stdcall startFunction(void *parameters);
+		HANDLE handle;
+	#else
+		static void *startFunction(void *parameters);
+		pthread_t handle;
+	#endif
+
+	bool hasJoined = false;
+};
+
+class Event
+{
+	friend class Thread;
+
+public:
+	Event();
+
+	~Event();
+
+	void signal();
+	void wait();
+
+private:
+	#if defined(_WIN32)
+		HANDLE handle;
+	#else
+		pthread_cond_t handle;
+		pthread_mutex_t mutex;
+		volatile bool signaled;
+	#endif
+};
+
+#if PERF_PROFILE
+int64_t atomicExchange(int64_t volatile *target, int64_t value);
+int atomicExchange(int volatile *target, int value);
+#endif
+
+int atomicIncrement(int volatile *value);
+int atomicDecrement(int volatile *value);
+int atomicAdd(int volatile *target, int value);
+void nop();
+
+}  // namespace rr
+
+/* Inline implementation */
+
+namespace rr {
+
+inline void Thread::yield()
+{
+	#if defined(_WIN32)
+		Sleep(0);
+	#elif defined(__APPLE__)
+		pthread_yield_np();
+	#else
+		sched_yield();
+	#endif
 }
 
-namespace rr
+inline void Thread::sleep(int milliseconds)
 {
-	inline void Thread::yield()
+	#if defined(_WIN32)
+		Sleep(milliseconds);
+	#else
+		usleep(1000 * milliseconds);
+	#endif
+}
+
+inline Thread::LocalStorageKey Thread::allocateLocalStorageKey(void (*destructor)(void *storage))
+{
+	#if defined(_WIN32)
+		return TlsAlloc();
+	#else
+		LocalStorageKey key;
+		pthread_key_create(&key, destructor);
+		return key;
+	#endif
+}
+
+inline void Thread::freeLocalStorageKey(LocalStorageKey key)
+{
+	#if defined(_WIN32)
+		TlsFree(key);
+	#else
+		pthread_key_delete(key);   // Using an invalid key is an error but not undefined behavior.
+	#endif
+}
+
+inline void *Thread::allocateLocalStorage(LocalStorageKey key, size_t size)
+{
+	if(key == TLS_OUT_OF_INDEXES)
 	{
-		#if defined(_WIN32)
-			Sleep(0);
-		#elif defined(__APPLE__)
-			pthread_yield_np();
-		#else
-			sched_yield();
-		#endif
+		return nullptr;
 	}
 
-	inline void Thread::sleep(int milliseconds)
-	{
-		#if defined(_WIN32)
-			Sleep(milliseconds);
-		#else
-			usleep(1000 * milliseconds);
-		#endif
-	}
+	freeLocalStorage(key);
 
-	inline Thread::LocalStorageKey Thread::allocateLocalStorageKey(void (*destructor)(void *storage))
-	{
-		#if defined(_WIN32)
-			return TlsAlloc();
-		#else
-			LocalStorageKey key;
-			pthread_key_create(&key, destructor);
-			return key;
-		#endif
-	}
+	void *storage = malloc(size);
 
-	inline void Thread::freeLocalStorageKey(LocalStorageKey key)
-	{
-		#if defined(_WIN32)
-			TlsFree(key);
-		#else
-			pthread_key_delete(key);   // Using an invalid key is an error but not undefined behavior.
-		#endif
-	}
+	#if defined(_WIN32)
+		TlsSetValue(key, storage);
+	#else
+		pthread_setspecific(key, storage);
+	#endif
 
-	inline void *Thread::allocateLocalStorage(LocalStorageKey key, size_t size)
-	{
-		if(key == TLS_OUT_OF_INDEXES)
+	return storage;
+}
+
+inline void *Thread::getLocalStorage(LocalStorageKey key)
+{
+	#if defined(_WIN32)
+		return TlsGetValue(key);
+	#else
+		if(key == TLS_OUT_OF_INDEXES)   // Avoid undefined behavior.
 		{
 			return nullptr;
 		}
 
-		freeLocalStorage(key);
-
-		void *storage = malloc(size);
-
-		#if defined(_WIN32)
-			TlsSetValue(key, storage);
-		#else
-			pthread_setspecific(key, storage);
-		#endif
-
-		return storage;
-	}
-
-	inline void *Thread::getLocalStorage(LocalStorageKey key)
-	{
-		#if defined(_WIN32)
-			return TlsGetValue(key);
-		#else
-			if(key == TLS_OUT_OF_INDEXES)   // Avoid undefined behavior.
-			{
-				return nullptr;
-			}
-
-			return pthread_getspecific(key);
-		#endif
-	}
-
-	inline void Thread::freeLocalStorage(LocalStorageKey key)
-	{
-		free(getLocalStorage(key));
-
-		#if defined(_WIN32)
-			TlsSetValue(key, nullptr);
-		#else
-			pthread_setspecific(key, nullptr);
-		#endif
-	}
-
-	inline void Event::signal()
-	{
-		#if defined(_WIN32)
-			SetEvent(handle);
-		#else
-			pthread_mutex_lock(&mutex);
-			signaled = true;
-			pthread_cond_signal(&handle);
-			pthread_mutex_unlock(&mutex);
-		#endif
-	}
-
-	inline void Event::wait()
-	{
-		#if defined(_WIN32)
-			WaitForSingleObject(handle, INFINITE);
-		#else
-			pthread_mutex_lock(&mutex);
-			while(!signaled) pthread_cond_wait(&handle, &mutex);
-			signaled = false;
-			pthread_mutex_unlock(&mutex);
-		#endif
-	}
-
-	#if PERF_PROFILE
-	inline int64_t atomicExchange(volatile int64_t *target, int64_t value)
-	{
-		#if defined(_WIN32)
-			return InterlockedExchange64(target, value);
-		#else
-			int ret;
-			__asm__ __volatile__("lock; xchg8 %x0,(%x1)" : "=r" (ret) :"r" (target), "0" (value) : "memory" );
-			return ret;
-		#endif
-	}
-
-	inline int atomicExchange(volatile int *target, int value)
-	{
-		#if defined(_WIN32)
-			return InterlockedExchange((volatile long*)target, (long)value);
-		#else
-			int ret;
-			__asm__ __volatile__("lock; xchgl %x0,(%x1)" : "=r" (ret) :"r" (target), "0" (value) : "memory" );
-			return ret;
-		#endif
-	}
-	#endif
-
-	inline int atomicIncrement(volatile int *value)
-	{
-		#if defined(_WIN32)
-			return InterlockedIncrement((volatile long*)value);
-		#else
-			return __sync_add_and_fetch(value, 1);
-		#endif
-	}
-
-	inline int atomicDecrement(volatile int *value)
-	{
-		#if defined(_WIN32)
-			return InterlockedDecrement((volatile long*)value);
-		#else
-			return __sync_sub_and_fetch(value, 1);
-		#endif
-	}
-
-	inline int atomicAdd(volatile int* target, int value)
-	{
-		#if defined(_WIN32)
-			return InterlockedExchangeAdd((volatile long*)target, value) + value;
-		#else
-			return __sync_add_and_fetch(target, value);
-		#endif
-	}
-
-	inline void nop()
-	{
-		#if defined(_WIN32)
-			__nop();
-		#else
-			__asm__ __volatile__ ("nop");
-		#endif
-	}
-
-	#if USE_STD_ATOMIC
-		class AtomicInt
-		{
-		public:
-			AtomicInt() : ai() {}
-			AtomicInt(int i) : ai(i) {}
-
-			inline operator int() const { return ai.load(std::memory_order_acquire); }
-			inline void operator=(const AtomicInt& i) { ai.store(i.ai.load(std::memory_order_acquire), std::memory_order_release); }
-			inline void operator=(int i) { ai.store(i, std::memory_order_release); }
-			inline void operator--() { ai.fetch_sub(1, std::memory_order_acq_rel); }
-			inline void operator++() { ai.fetch_add(1, std::memory_order_acq_rel); }
-			inline int operator--(int) { return ai.fetch_sub(1, std::memory_order_acq_rel) - 1; }
-			inline int operator++(int) { return ai.fetch_add(1, std::memory_order_acq_rel) + 1; }
-			inline void operator-=(int i) { ai.fetch_sub(i, std::memory_order_acq_rel); }
-			inline void operator+=(int i) { ai.fetch_add(i, std::memory_order_acq_rel); }
-		private:
-			std::atomic<int> ai;
-		};
-	#else
-		class AtomicInt
-		{
-		public:
-			AtomicInt() {}
-			AtomicInt(int i) : vi(i) {}
-
-			inline operator int() const { return vi; } // Note: this isn't a guaranteed atomic operation
-			inline void operator=(const AtomicInt& i) { atomicExchange(&vi, i.vi); }
-			inline void operator=(int i) { atomicExchange(&vi, i); }
-			inline void operator--() { atomicDecrement(&vi); }
-			inline void operator++() { atomicIncrement(&vi); }
-			inline int operator--(int) { return atomicDecrement(&vi); }
-			inline int operator++(int) { return atomicIncrement(&vi); }
-			inline void operator-=(int i) { atomicAdd(&vi, -i); }
-			inline void operator+=(int i) { atomicAdd(&vi, i); }
-		private:
-			volatile int vi;
-		};
+		return pthread_getspecific(key);
 	#endif
 }
 
+inline void Thread::freeLocalStorage(LocalStorageKey key)
+{
+	free(getLocalStorage(key));
+
+	#if defined(_WIN32)
+		TlsSetValue(key, nullptr);
+	#else
+		pthread_setspecific(key, nullptr);
+	#endif
+}
+
+inline void Event::signal()
+{
+	#if defined(_WIN32)
+		SetEvent(handle);
+	#else
+		pthread_mutex_lock(&mutex);
+		signaled = true;
+		pthread_cond_signal(&handle);
+		pthread_mutex_unlock(&mutex);
+	#endif
+}
+
+inline void Event::wait()
+{
+	#if defined(_WIN32)
+		WaitForSingleObject(handle, INFINITE);
+	#else
+		pthread_mutex_lock(&mutex);
+		while(!signaled) pthread_cond_wait(&handle, &mutex);
+		signaled = false;
+		pthread_mutex_unlock(&mutex);
+	#endif
+}
+
+#if PERF_PROFILE
+inline int64_t atomicExchange(volatile int64_t *target, int64_t value)
+{
+	#if defined(_WIN32)
+		return InterlockedExchange64(target, value);
+	#else
+		// Atomically store 'value' in *target and return the previous contents. The builtin
+		// yields the full 64-bit old value and is portable, unlike x86-only inline assembly.
+		return __atomic_exchange_n(target, value, __ATOMIC_SEQ_CST);
+	#endif
+}
+
+inline int atomicExchange(volatile int *target, int value)
+{
+	#if defined(_WIN32)
+		return InterlockedExchange((volatile long*)target, (long)value);
+	#else
+		// Atomically store 'value' in *target and return the previous contents. A compiler
+		// builtin is used so this also builds on non-x86 targets such as ARM.
+		return __atomic_exchange_n(target, value, __ATOMIC_SEQ_CST);
+	#endif
+}
+#endif
+
+inline int atomicIncrement(volatile int *value)
+{
+	#if defined(_WIN32)
+		return InterlockedIncrement((volatile long*)value);
+	#else
+		return __sync_add_and_fetch(value, 1);
+	#endif
+}
+
+inline int atomicDecrement(volatile int *value)
+{
+	#if defined(_WIN32)
+		return InterlockedDecrement((volatile long*)value);
+	#else
+		return __sync_sub_and_fetch(value, 1);
+	#endif
+}
+
+inline int atomicAdd(volatile int* target, int value)
+{
+	#if defined(_WIN32)
+		return InterlockedExchangeAdd((volatile long*)target, value) + value;
+	#else
+		return __sync_add_and_fetch(target, value);
+	#endif
+}
+
+inline void nop()
+{
+	#if defined(_WIN32)
+		__nop();
+	#else
+		__asm__ __volatile__ ("nop");
+	#endif
+}
+
+#if USE_STD_ATOMIC
+	class AtomicInt
+	{
+	public:
+		AtomicInt() : ai() {}
+		AtomicInt(int i) : ai(i) {}
+
+		inline operator int() const { return ai.load(std::memory_order_acquire); }
+		inline void operator=(const AtomicInt& i) { ai.store(i.ai.load(std::memory_order_acquire), std::memory_order_release); }
+		inline void operator=(int i) { ai.store(i, std::memory_order_release); }
+		inline void operator--() { ai.fetch_sub(1, std::memory_order_acq_rel); }
+		inline void operator++() { ai.fetch_add(1, std::memory_order_acq_rel); }
+		inline int operator--(int) { return ai.fetch_sub(1, std::memory_order_acq_rel) - 1; }
+		inline int operator++(int) { return ai.fetch_add(1, std::memory_order_acq_rel) + 1; }
+		inline void operator-=(int i) { ai.fetch_sub(i, std::memory_order_acq_rel); }
+		inline void operator+=(int i) { ai.fetch_add(i, std::memory_order_acq_rel); }
+	private:
+		std::atomic<int> ai;
+	};
+#else
+	class AtomicInt   // Fallback used when USE_STD_ATOMIC is not set; built on the atomic* helpers in this header.
+	{
+	public:
+		AtomicInt() : vi(0) {}   // Start at zero, matching the value-initialized std::atomic variant.
+		AtomicInt(int i) : vi(i) {}
+
+		inline operator int() const { return vi; } // Note: this isn't a guaranteed atomic operation
+		inline void operator=(const AtomicInt& i) { atomicExchange(&vi, i.vi); }   // NOTE(review): atomicExchange is only declared under PERF_PROFILE.
+		inline void operator=(int i) { atomicExchange(&vi, i); }
+		inline void operator--() { atomicDecrement(&vi); }
+		inline void operator++() { atomicIncrement(&vi); }
+		inline int operator--(int) { return atomicDecrement(&vi); }   // Returns the updated (decremented) value.
+		inline int operator++(int) { return atomicIncrement(&vi); }   // Returns the updated (incremented) value.
+		inline void operator-=(int i) { atomicAdd(&vi, -i); }
+		inline void operator+=(int i) { atomicAdd(&vi, i); }
+	private:
+		volatile int vi;
+	};
+#endif
+
+}  // namespace rr
+
 #endif   // rr_Thread_hpp

diff --git a/src/Reactor/Traits.hpp b/src/Reactor/Traits.hpp
index d26abc7..53f36f0 100644
--- a/src/Reactor/Traits.hpp
+++ b/src/Reactor/Traits.hpp

@@ -22,216 +22,216 @@
 #undef Bool // b/127920555
 #endif
 
-namespace rr
+namespace rr {
+
+// Forward declarations
+class Value;
+
+class Void;
+class Bool;
+class Byte;
+class SByte;
+class Short;
+class UShort;
+class Int;
+class UInt;
+class Long;
+class Half;
+class Float;
+class Float4;
+
+template<class T> class Pointer;
+template<class T> class LValue;
+template<class T> class RValue;
+
+// enable_if_t is identical to C++14's std::enable_if_t.
+// std::enable_if_t was introduced in C++14, but Reactor must support
+// C++11.
+template<bool Condition, class TrueType = void>
+using enable_if_t = typename std::enable_if<Condition, TrueType>::type;
+
+// IsDefined<T>::value is true if T is a valid type, otherwise false.
+template <typename T, typename Enable = void>
+struct IsDefined
 {
-	// Forward declarations
-	class Value;
+	static constexpr bool value = false;
+};
 
-	class Void;
-	class Bool;
-	class Byte;
-	class SByte;
-	class Short;
-	class UShort;
-	class Int;
-	class UInt;
-	class Long;
-	class Half;
-	class Float;
-	class Float4;
+template <typename T>
+struct IsDefined<T, enable_if_t<(sizeof(T)>0)> >
+{
+	static constexpr bool value = true;
+};
 
-	template<class T> class Pointer;
-	template<class T> class LValue;
-	template<class T> class RValue;
+template <>
+struct IsDefined<void>
+{
+	static constexpr bool value = true;
+};
 
-	// enabled_if_t is identical to C++14's std::enable_if_t.
-	// std::enable_if_t was introduced in C++14, but Reactor must support
-	// C++11.
-	template<bool Condition, class TrueType = void>
-	using enable_if_t = typename std::enable_if<Condition, TrueType>::type;
+// CToReactorT<T> resolves to the corresponding Reactor type for the given C
+// template type T.
+template<typename T, typename ENABLE = void> struct CToReactor;
+template<typename T> using CToReactorT = typename CToReactor<T>::type;
 
-	// IsDefined<T>::value is true if T is a valid type, otherwise false.
-	template <typename T, typename Enable = void>
-	struct IsDefined
-	{
-		static constexpr bool value = false;
-	};
+// CToReactor specializations for POD types.
+template<> struct CToReactor<void>    	{ using type = Void; };
+template<> struct CToReactor<bool>    	{ using type = Bool;   static Bool   cast(bool);     };
+template<> struct CToReactor<uint8_t> 	{ using type = Byte;   static Byte   cast(uint8_t);  };
+template<> struct CToReactor<int8_t>  	{ using type = SByte;  static SByte  cast(int8_t);   };
+template<> struct CToReactor<int16_t> 	{ using type = Short;  static Short  cast(int16_t);  };
+template<> struct CToReactor<uint16_t>	{ using type = UShort; static UShort cast(uint16_t); };
+template<> struct CToReactor<int32_t> 	{ using type = Int;    static Int    cast(int32_t);  };
+template<> struct CToReactor<uint32_t>	{ using type = UInt;   static UInt   cast(uint32_t); };
+template<> struct CToReactor<float>   	{ using type = Float;  static Float  cast(float);    };
+template<> struct CToReactor<float[4]>	{ using type = Float4; static Float4 cast(float[4]); };
 
-	template <typename T>
-	struct IsDefined<T, enable_if_t<(sizeof(T)>0)> >
-	{
-		static constexpr bool value = true;
-	};
+// TODO: Long has no constructor that takes a uint64_t
+template<> struct CToReactor<uint64_t>	{ using type = Long;  /* static Long   cast(uint64_t); */ };
 
-	template <>
-	struct IsDefined<void>
-	{
-		static constexpr bool value = true;
-	};
+// HasReactorType<T>::value resolves to true iff there exists a
+// CToReactorT specialization for type T.
+template<typename T>
+using HasReactorType = IsDefined< CToReactorT<T> >;
 
-	// CToReactorT<T> resolves to the corresponding Reactor type for the given C
-	// template type T.
-	template<typename T, typename ENABLE = void> struct CToReactor;
-	template<typename T> using CToReactorT = typename CToReactor<T>::type;
+// CToReactorPtr<T>::type resolves to the corresponding Reactor Pointer<>
+// type for T*.
+// For T types that have a CToReactorT<> specialization,
+// CToReactorPtr<T>::type resolves to Pointer< CToReactorT<T> >, otherwise
+// CToReactorPtr<T>::type resolves to Pointer<Byte>.
+template<typename T, typename ENABLE = void> struct CToReactorPtr
+{
+	using type = Pointer<Byte>;
+	static inline type cast(const T* v); // implemented in Traits.inl
+};
 
-	// CToReactor specializations for POD types.
-	template<> struct CToReactor<void>    	{ using type = Void; };
-	template<> struct CToReactor<bool>    	{ using type = Bool;   static Bool   cast(bool);     };
-	template<> struct CToReactor<uint8_t> 	{ using type = Byte;   static Byte   cast(uint8_t);  };
-	template<> struct CToReactor<int8_t>  	{ using type = SByte;  static SByte  cast(int8_t);   };
-	template<> struct CToReactor<int16_t> 	{ using type = Short;  static Short  cast(int16_t);  };
-	template<> struct CToReactor<uint16_t>	{ using type = UShort; static UShort cast(uint16_t); };
-	template<> struct CToReactor<int32_t> 	{ using type = Int;    static Int    cast(int32_t);  };
-	template<> struct CToReactor<uint32_t>	{ using type = UInt;   static UInt   cast(uint32_t); };
-	template<> struct CToReactor<float>   	{ using type = Float;  static Float  cast(float);    };
-	template<> struct CToReactor<float[4]>	{ using type = Float4; static Float4 cast(float[4]); };
+// CToReactorPtr specialization for T types that have a CToReactorT<>
+// specialization.
+template<typename T> struct CToReactorPtr<T, enable_if_t< HasReactorType<T>::value > >
+{
+	using type = Pointer< CToReactorT<T> >;
+	static inline type cast(const T* v); // implemented in Traits.inl
+};
 
-	// TODO: Long has no constructor that takes a uint64_t
-	template<> struct CToReactor<uint64_t>	{ using type = Long;  /* static Long   cast(uint64_t); */ };
+// CToReactorPtr specialization for void*.
+// Maps to Pointer<Byte> instead of Pointer<Void>.
+template<> struct CToReactorPtr<void, void>
+{
+	using type = Pointer<Byte>;
+	static inline type cast(const void* v); // implemented in Traits.inl
+};
 
-	// HasReactorType<T>::value resolves to true iff there exists a
-	// CToReactorT specialization for type T.
-	template<typename T>
-	using HasReactorType = IsDefined< CToReactorT<T> >;
+// CToReactorPtr specialization for function pointer types.
+// Maps to Pointer<Byte>.
+// Drops the 'const' qualifier from the cast() method to avoid warnings
+// about const having no meaning for function types.
+template<typename T> struct CToReactorPtr<T, enable_if_t< std::is_function<T>::value > >
+{
+	using type = Pointer<Byte>;
+	static inline type cast(T* v); // implemented in Traits.inl
+};
 
-	// CToReactorPtr<T>::type resolves to the corresponding Reactor Pointer<>
-	// type for T*.
-	// For T types that have a CToReactorT<> specialization,
-	// CToReactorPtr<T>::type resolves to Pointer< CToReactorT<T> >, otherwise
-	// CToReactorPtr<T>::type resolves to Pointer<Byte>.
-	template<typename T, typename ENABLE = void> struct CToReactorPtr
-	{
-		using type = Pointer<Byte>;
-		static inline type cast(const T* v); // implemented in Traits.inl
-	};
+template<typename T> using CToReactorPtrT = typename CToReactorPtr<T>::type;
 
-	// CToReactorPtr specialization for T types that have a CToReactorT<>
-	// specialization.
-	template<typename T> struct CToReactorPtr<T, enable_if_t< HasReactorType<T>::value > >
-	{
-		using type = Pointer< CToReactorT<T> >;
-		static inline type cast(const T* v); // implemented in Traits.inl
-	};
+// CToReactor specialization for pointer types.
+// For T types that have a CToReactorT<> specialization,
+// CToReactorT<T*> resolves to Pointer< CToReactorT<T> >, otherwise
+// CToReactorT<T*> resolves to Pointer<Byte>.
+template<typename T>
+struct CToReactor<T, enable_if_t<std::is_pointer<T>::value> >
+{
+	using elem = typename std::remove_pointer<T>::type;
+	using type = CToReactorPtrT<elem>;
+	static inline type cast(T v); // implemented in Traits.inl
+};
 
-	// CToReactorPtr specialization for void*.
-	// Maps to Pointer<Byte> instead of Pointer<Void>.
-	template<> struct CToReactorPtr<void, void>
-	{
-		using type = Pointer<Byte>;
-		static inline type cast(const void* v); // implemented in Traits.inl
-	};
+// CToReactor specialization for enum types.
+template<typename T>
+struct CToReactor<T, enable_if_t<std::is_enum<T>::value> >
+{
+	using underlying = typename std::underlying_type<T>::type;
+	using type = CToReactorT<underlying>;
+	static type cast(T v); // implemented in Traits.inl
+};
 
-	// CToReactorPtr specialization for function pointer types.
-	// Maps to Pointer<Byte>.
-	// Drops the 'const' qualifier from the cast() method to avoid warnings
-	// about const having no meaning for function types.
-	template<typename T> struct CToReactorPtr<T, enable_if_t< std::is_function<T>::value > >
-	{
-		using type = Pointer<Byte>;
-		static inline type cast(T* v); // implemented in Traits.inl
-	};
+// IsRValue::value is true if T is of type RValue<X>, where X is any type.
+template <typename T, typename Enable = void> struct IsRValue { static constexpr bool value = false; };
+template <typename T> struct IsRValue<T, enable_if_t<IsDefined<typename T::rvalue_underlying_type>::value> > { static constexpr bool value = true; };
 
-	template<typename T> using CToReactorPtrT = typename CToReactorPtr<T>::type;
+// IsLValue::value is true if T is of, or derives from type LValue<T>.
+template <typename T> struct IsLValue { static constexpr bool value = std::is_base_of<LValue<T>, T>::value; };
 
-	// CToReactor specialization for pointer types.
-	// For T types that have a CToReactorT<> specialization,
-	// CToReactorT<T*>::type resolves to Pointer< CToReactorT<T> >, otherwise
-	// CToReactorT<T*>::type resolves to Pointer<Byte>.
-	template<typename T>
-	struct CToReactor<T, enable_if_t<std::is_pointer<T>::value> >
-	{
-		using elem = typename std::remove_pointer<T>::type;
-		using type = CToReactorPtrT<elem>;
-		static inline type cast(T v); // implemented in Traits.inl
-	};
+// IsReference::value is true if T is of type Reference<X>, where X is any type.
+template <typename T, typename Enable = void> struct IsReference { static constexpr bool value = false; };
+template <typename T> struct IsReference<T, enable_if_t<IsDefined<typename T::reference_underlying_type>::value> > { static constexpr bool value = true; };
 
-	// CToReactor specialization for enum types.
-	template<typename T>
-	struct CToReactor<T, enable_if_t<std::is_enum<T>::value> >
-	{
-		using underlying = typename std::underlying_type<T>::type;
-		using type = CToReactorT<underlying>;
-		static type cast(T v); // implemented in Traits.inl
-	};
-
-	// IsRValue::value is true if T is of type RValue<X>, where X is any type.
-	template <typename T, typename Enable = void> struct IsRValue { static constexpr bool value = false; };
-	template <typename T> struct IsRValue<T, enable_if_t<IsDefined<typename T::rvalue_underlying_type>::value> > { static constexpr bool value = true; };
-
-	// IsLValue::value is true if T is of, or derives from type LValue<T>.
-	template <typename T> struct IsLValue { static constexpr bool value = std::is_base_of<LValue<T>, T>::value; };
-
-	// IsReference::value is true if T is of type Reference<X>, where X is any type.
-	template <typename T, typename Enable = void> struct IsReference { static constexpr bool value = false; };
-	template <typename T> struct IsReference<T, enable_if_t<IsDefined<typename T::reference_underlying_type>::value> > { static constexpr bool value = true; };
-
-	// ReactorTypeT<T> returns the LValue Reactor type for T.
-	// T can be a C-type, RValue or LValue.
-	template<typename T, typename ENABLE = void> struct ReactorType;
-	template<typename T> using ReactorTypeT = typename ReactorType<T>::type;
-	template<typename T> struct ReactorType<T, enable_if_t<IsDefined<CToReactorT<T>>::value> >
-	{
-		using type = CToReactorT<T>;
-		static type cast(T v) { return CToReactor<T>::cast(v); }
-	};
-	template<typename T> struct ReactorType<T, enable_if_t<IsRValue<T>::value> >
-	{
-		using type = typename T::rvalue_underlying_type;
-		static type cast(T v) { return type(v); }
-	};
-	template<typename T> struct ReactorType<T, enable_if_t<IsLValue<T>::value> >
-	{
-		using type = T;
-		static type cast(T v) { return type(v); }
-	};
-	template<typename T> struct ReactorType<T, enable_if_t<IsReference<T>::value> >
-	{
-		using type = T;
-		static type cast(T v) { return type(v); }
-	};
+// ReactorTypeT<T> returns the LValue Reactor type for T.
+// T can be a C-type, RValue or LValue.
+template<typename T, typename ENABLE = void> struct ReactorType;
+template<typename T> using ReactorTypeT = typename ReactorType<T>::type;
+template<typename T> struct ReactorType<T, enable_if_t<IsDefined<CToReactorT<T>>::value> >
+{
+	using type = CToReactorT<T>;
+	static type cast(T v) { return CToReactor<T>::cast(v); }
+};
+template<typename T> struct ReactorType<T, enable_if_t<IsRValue<T>::value> >
+{
+	using type = typename T::rvalue_underlying_type;
+	static type cast(T v) { return type(v); }
+};
+template<typename T> struct ReactorType<T, enable_if_t<IsLValue<T>::value> >
+{
+	using type = T;
+	static type cast(T v) { return type(v); }
+};
+template<typename T> struct ReactorType<T, enable_if_t<IsReference<T>::value> >
+{
+	using type = T;
+	static type cast(T v) { return type(v); }
+};
 
 
-	// Reactor types that can be used as a return type for a function.
-	template <typename T> struct CanBeUsedAsReturn { static constexpr bool value = false; };
-	template <> struct CanBeUsedAsReturn<Void>     { static constexpr bool value = true; };
-	template <> struct CanBeUsedAsReturn<Int>      { static constexpr bool value = true; };
-	template <> struct CanBeUsedAsReturn<UInt>     { static constexpr bool value = true; };
-	template <> struct CanBeUsedAsReturn<Float>    { static constexpr bool value = true; };
-	template <typename T> struct CanBeUsedAsReturn<Pointer<T>> { static constexpr bool value = true; };
+// Reactor types that can be used as a return type for a function.
+template <typename T> struct CanBeUsedAsReturn { static constexpr bool value = false; };
+template <> struct CanBeUsedAsReturn<Void>     { static constexpr bool value = true; };
+template <> struct CanBeUsedAsReturn<Int>      { static constexpr bool value = true; };
+template <> struct CanBeUsedAsReturn<UInt>     { static constexpr bool value = true; };
+template <> struct CanBeUsedAsReturn<Float>    { static constexpr bool value = true; };
+template <typename T> struct CanBeUsedAsReturn<Pointer<T>> { static constexpr bool value = true; };
 
-	// Reactor types that can be used as a parameter types for a function.
-	template <typename T> struct CanBeUsedAsParameter { static constexpr bool value = false; };
-	template <> struct CanBeUsedAsParameter<Int>      { static constexpr bool value = true; };
-	template <> struct CanBeUsedAsParameter<UInt>     { static constexpr bool value = true; };
-	template <> struct CanBeUsedAsParameter<Float>    { static constexpr bool value = true; };
-	template <typename T> struct CanBeUsedAsParameter<Pointer<T>> { static constexpr bool value = true; };
+// Reactor types that can be used as parameter types for a function.
+template <typename T> struct CanBeUsedAsParameter { static constexpr bool value = false; };
+template <> struct CanBeUsedAsParameter<Int>      { static constexpr bool value = true; };
+template <> struct CanBeUsedAsParameter<UInt>     { static constexpr bool value = true; };
+template <> struct CanBeUsedAsParameter<Float>    { static constexpr bool value = true; };
+template <typename T> struct CanBeUsedAsParameter<Pointer<T>> { static constexpr bool value = true; };
 
-	// AssertParameterTypeIsValid statically asserts that all template parameter
-	// types can be used as a Reactor function parameter.
-	template<typename T, typename ... other>
-	struct AssertParameterTypeIsValid : AssertParameterTypeIsValid<other...>
-	{
-		static_assert(CanBeUsedAsParameter<T>::value, "Invalid parameter type");
-	};
-	template<typename T>
-	struct AssertParameterTypeIsValid<T>
-	{
-		static_assert(CanBeUsedAsParameter<T>::value, "Invalid parameter type");
-	};
+// AssertParameterTypeIsValid statically asserts that all template parameter
+// types can be used as a Reactor function parameter.
+template<typename T, typename ... other>
+struct AssertParameterTypeIsValid : AssertParameterTypeIsValid<other...>
+{
+	static_assert(CanBeUsedAsParameter<T>::value, "Invalid parameter type");
+};
+template<typename T>
+struct AssertParameterTypeIsValid<T>
+{
+	static_assert(CanBeUsedAsParameter<T>::value, "Invalid parameter type");
+};
 
-	// AssertFunctionSignatureIsValid statically asserts that the Reactor
-	// function signature is valid.
-	template<typename Return, typename... Arguments>
-	class AssertFunctionSignatureIsValid;
-	template<typename Return>
-	class AssertFunctionSignatureIsValid<Return(Void)> {};
-	template<typename Return, typename... Arguments>
-	class AssertFunctionSignatureIsValid<Return(Arguments...)>
-	{
-		static_assert(CanBeUsedAsReturn<Return>::value, "Invalid return type");
-		static_assert(sizeof(AssertParameterTypeIsValid<Arguments...>) >= 0, "");
-	};
+// AssertFunctionSignatureIsValid statically asserts that the Reactor
+// function signature is valid.
+template<typename Return, typename... Arguments>
+class AssertFunctionSignatureIsValid;
+template<typename Return>
+class AssertFunctionSignatureIsValid<Return(Void)> {};
+template<typename Return, typename... Arguments>
+class AssertFunctionSignatureIsValid<Return(Arguments...)>
+{
+	static_assert(CanBeUsedAsReturn<Return>::value, "Invalid return type");
+	static_assert(sizeof(AssertParameterTypeIsValid<Arguments...>) >= 0, "");
+};
 
-} // namespace rr
+}  // namespace rr
 
 #endif // rr_Traits_hpp

diff --git a/src/Reactor/Traits.inl b/src/Reactor/Traits.inl
index 2e10568..23a5941 100644
--- a/src/Reactor/Traits.inl
+++ b/src/Reactor/Traits.inl

@@ -15,55 +15,55 @@
 #ifndef rr_Traits_inl
 #define rr_Traits_inl
 
-namespace rr
+namespace rr {
+
+// Non-specialized implementation of CToReactorPtr::cast() defaults to
+// returning a ConstantPointer for v.
+template<typename T, typename ENABLE>
+Pointer<Byte> CToReactorPtr<T, ENABLE>::cast(const T* v)
 {
-	// Non-specialized implementation of CToReactorPtr::cast() defaults to
-	// returning a ConstantPointer for v.
-	template<typename T, typename ENABLE>
-	Pointer<Byte> CToReactorPtr<T, ENABLE>::cast(const T* v)
-	{
-		return ConstantPointer(v);
-	}
+	return ConstantPointer(v);
+}
 
-	// CToReactorPtr specialization for T types that have a CToReactorT<>
-	// specialization.
-	template<typename T>
-	Pointer<CToReactorT<T>>
-	CToReactorPtr<T, enable_if_t< HasReactorType<T>::value > >::cast(const T* v)
-	{
-		return type(v);
-	}
+// CToReactorPtr specialization for T types that have a CToReactorT<>
+// specialization.
+template<typename T>
+Pointer<CToReactorT<T>>
+CToReactorPtr<T, enable_if_t< HasReactorType<T>::value > >::cast(const T* v)
+{
+	return type(v);
+}
 
-	// CToReactorPtr specialization for void*.
-	Pointer<Byte> CToReactorPtr<void, void>::cast(const void* v)
-	{
-		return ConstantPointer(v);
-	}
+// CToReactorPtr specialization for void*.
+Pointer<Byte> CToReactorPtr<void, void>::cast(const void* v)
+{
+	return ConstantPointer(v);
+}
 
-	// CToReactorPtrT specialization for function pointer types.
-	template<typename T>
-	Pointer<Byte>
-	CToReactorPtr<T, enable_if_t< std::is_function<T>::value > >::cast(T* v)
-	{
-		return ConstantPointer(v);
-	}
+// CToReactorPtr specialization for function pointer types.
+template<typename T>
+Pointer<Byte>
+CToReactorPtr<T, enable_if_t< std::is_function<T>::value > >::cast(T* v)
+{
+	return ConstantPointer(v);
+}
 
-	// CToReactor specialization for pointer types.
-	template<typename T>
-	CToReactorPtrT<typename std::remove_pointer<T>::type>
-	CToReactor<T, enable_if_t<std::is_pointer<T>::value> >::cast(T v)
-	{
-		return CToReactorPtr<elem>::cast(v);
-	}
+// CToReactor specialization for pointer types.
+template<typename T>
+CToReactorPtrT<typename std::remove_pointer<T>::type>
+CToReactor<T, enable_if_t<std::is_pointer<T>::value> >::cast(T v)
+{
+	return CToReactorPtr<elem>::cast(v);
+}
 
-	// CToReactor specialization for enum types.
-	template<typename T>
-	CToReactorT<typename std::underlying_type<T>::type>
-	CToReactor<T, enable_if_t<std::is_enum<T>::value> >::cast(T v)
-	{
-		return CToReactor<underlying>::cast(v);
-	}
+// CToReactor specialization for enum types.
+template<typename T>
+CToReactorT<typename std::underlying_type<T>::type>
+CToReactor<T, enable_if_t<std::is_enum<T>::value> >::cast(T v)
+{
+	return CToReactor<underlying>::cast(v);
+}
 
-} // namespace rr
+}  // namespace rr
 
 #endif // rr_Traits_inl

diff --git a/src/Reactor/x86.hpp b/src/Reactor/x86.hpp
index 6d3e8e8..dd98173 100644
--- a/src/Reactor/x86.hpp
+++ b/src/Reactor/x86.hpp

@@ -17,93 +17,93 @@
 
 #include "Reactor.hpp"
 
-namespace rr
-{
-	namespace x86
-	{
-		RValue<Int> cvtss2si(RValue<Float> val);
-		RValue<Int4> cvtps2dq(RValue<Float4> val);
+namespace rr {
+namespace x86 {
 
-		RValue<Float> rcpss(RValue<Float> val);
-		RValue<Float> sqrtss(RValue<Float> val);
-		RValue<Float> rsqrtss(RValue<Float> val);
+RValue<Int> cvtss2si(RValue<Float> val);
+RValue<Int4> cvtps2dq(RValue<Float4> val);
 
-		RValue<Float4> rcpps(RValue<Float4> val);
-		RValue<Float4> sqrtps(RValue<Float4> val);
-		RValue<Float4> rsqrtps(RValue<Float4> val);
-		RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y);
-		RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y);
+RValue<Float> rcpss(RValue<Float> val);
+RValue<Float> sqrtss(RValue<Float> val);
+RValue<Float> rsqrtss(RValue<Float> val);
 
-		RValue<Float> roundss(RValue<Float> val, unsigned char imm);
-		RValue<Float> floorss(RValue<Float> val);
-		RValue<Float> ceilss(RValue<Float> val);
+RValue<Float4> rcpps(RValue<Float4> val);
+RValue<Float4> sqrtps(RValue<Float4> val);
+RValue<Float4> rsqrtps(RValue<Float4> val);
+RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y);
+RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y);
 
-		RValue<Float4> roundps(RValue<Float4> val, unsigned char imm);
-		RValue<Float4> floorps(RValue<Float4> val);
-		RValue<Float4> ceilps(RValue<Float4> val);
+RValue<Float> roundss(RValue<Float> val, unsigned char imm);
+RValue<Float> floorss(RValue<Float> val);
+RValue<Float> ceilss(RValue<Float> val);
 
-		RValue<Int4> pabsd(RValue<Int4> x);
+RValue<Float4> roundps(RValue<Float4> val, unsigned char imm);
+RValue<Float4> floorps(RValue<Float4> val);
+RValue<Float4> ceilps(RValue<Float4> val);
 
-		RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y);
-		RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y);
-		RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y);
-		RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y);
-		RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y);
-		RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y);
-		RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y);
-		RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Int4> pabsd(RValue<Int4> x);
 
-		RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y);
+RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y);
+RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y);
+RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y);
+RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y);
+RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y);
+RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y);
 
-		RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y);
-		RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y);
+RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y);
 
-		RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y);
-		RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y);
-		RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y);
-		RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y);
 
-		RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y);
-		RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y);
-		RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y);
-		RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y);
+RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y);
+RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y);
 
-		RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y);
+RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y);
+RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y);
+RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y);
+RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y);
 
-		RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y);
-		RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y);
-		RValue<Short4> psraw(RValue<Short4> x, unsigned char y);
-		RValue<Short8> psraw(RValue<Short8> x, unsigned char y);
-		RValue<Short4> psllw(RValue<Short4> x, unsigned char y);
-		RValue<Short8> psllw(RValue<Short8> x, unsigned char y);
-		RValue<Int2> pslld(RValue<Int2> x, unsigned char y);
-		RValue<Int4> pslld(RValue<Int4> x, unsigned char y);
-		RValue<Int2> psrad(RValue<Int2> x, unsigned char y);
-		RValue<Int4> psrad(RValue<Int4> x, unsigned char y);
-		RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y);
-		RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y);
+RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y);
 
-		RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y);
-		RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y);
-		RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y);
-		RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y);
+RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y);
+RValue<Short4> psraw(RValue<Short4> x, unsigned char y);
+RValue<Short8> psraw(RValue<Short8> x, unsigned char y);
+RValue<Short4> psllw(RValue<Short4> x, unsigned char y);
+RValue<Short8> psllw(RValue<Short8> x, unsigned char y);
+RValue<Int2> pslld(RValue<Int2> x, unsigned char y);
+RValue<Int4> pslld(RValue<Int4> x, unsigned char y);
+RValue<Int2> psrad(RValue<Int2> x, unsigned char y);
+RValue<Int4> psrad(RValue<Int4> x, unsigned char y);
+RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y);
+RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y);
 
-		RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y);
-		RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y);
-		RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y);
+RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y);
+RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y);
 
-		RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y);
-		RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y);
-		RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y);
+RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y);
+RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y);
+RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y);
 
-		RValue<Int> movmskps(RValue<Float4> x);
-		RValue<Int> pmovmskb(RValue<Byte8> x);
+RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y);
+RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y);
+RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y);
 
-		RValue<Int4> pmovzxbd(RValue<Byte16> x);
-		RValue<Int4> pmovsxbd(RValue<SByte16> x);
-		RValue<Int4> pmovzxwd(RValue<UShort8> x);
-		RValue<Int4> pmovsxwd(RValue<Short8> x);
-	}
-}
+RValue<Int> movmskps(RValue<Float4> x);
+RValue<Int> pmovmskb(RValue<Byte8> x);
+
+RValue<Int4> pmovzxbd(RValue<Byte16> x);
+RValue<Int4> pmovsxbd(RValue<SByte16> x);
+RValue<Int4> pmovzxwd(RValue<UShort8> x);
+RValue<Int4> pmovsxwd(RValue<Short8> x);
+
+}  // namespace x86
+}  // namespace rr
 
 #endif   // rr_x86_hpp
\ No newline at end of file