Do not indent C++ namespace contents

This is a style change. Visual Studio defaults to indenting namespace
contents, and this convention was adopted for a long time, but the new
Vulkan implementation abandoned it. However, the legacy code borrowed
from the OpenGL ES implementation still used indentation, so the
codebase was inconsistent.

The justification for not indenting namespace contents is that
namespaces are merely a way to avoid name clashes with other projects
we don't control directly (and, in rare cases, with internal
subprojects when we want to reuse the same names). Hence the vast
majority of files contain a single namespace. Unlike control flow
blocks, class contents, or function contents, which can become deeply
nested and are easier to discern when indented, namespace contents
gain no such readability advantage from indentation.

This is also the Google style recommendation (though no justification or
discussion is provided):
https://google.github.io/styleguide/cppguide.html#Namespace_Formatting

One reasonable counter-argument is consistency with other blocks of
curly brackets, but since most namespaces span almost the entire file,
indenting their contents wastes a substantial amount of line width.

Because there is no indentation, there is also no need for the opening
and closing brackets to line up as a visual aid, as we prefer for other
uses of curly brackets. So we place the opening bracket on the same
line as the namespace keyword.

A comment is added to the closing bracket to distinguish it from other
closing brackets. It also makes it easier to find the end of anonymous
namespaces, which typically go at the top of the source file.
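
For illustration, the resulting style looks as follows (a minimal
sketch with placeholder names, not code taken from this change):

    namespace sw {

    namespace {
    int helper = 0;  // anonymous namespace contents are also unindented
    }  // anonymous namespace

    class Example
    {
    public:
        void method();
    };

    }  // namespace sw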

This change is made separately from applying clang-format because diff
tools mark all these unindented lines as changed, which makes it hard
to review the smaller style changes made by clang-format. The OpenGL ES
and Direct3D code is left untouched because it is in maintenance mode,
and in case of regressions we want easy 'blame' tool usage.

Bug: b/144825072
Change-Id: Ie2925ebd697e1ffa7c4cbdc9a946531f11f4d934
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/39348
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Ben Clayton <bclayton@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index e06601b..0b4bdab 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp
@@ -24,1435 +24,1462 @@
 
 #include <utility>
 
-namespace sw
+namespace sw {
+
+Blitter::Blitter() :
+	blitMutex(),
+	blitCache(1024),
+	cornerUpdateMutex(),
+	cornerUpdateCache(64) // We only need one of these per format
 {
-	Blitter::Blitter() :
-		blitMutex(),
-		blitCache(1024),
-		cornerUpdateMutex(),
-		cornerUpdateCache(64) // We only need one of these per format
+}
+
+Blitter::~Blitter()
+{
+}
+
+void Blitter::clear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format& viewFormat, const VkImageSubresourceRange& subresourceRange, const VkRect2D* renderArea)
+{
+	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
+	vk::Format dstFormat = viewFormat.getAspectFormat(aspect);
+	if(dstFormat == VK_FORMAT_UNDEFINED)
 	{
+		return;
 	}
 
-	Blitter::~Blitter()
+	float *pPixel = static_cast<float *>(pixel);
+	if (viewFormat.isUnsignedNormalized())
 	{
+		pPixel[0] = sw::clamp(pPixel[0], 0.0f, 1.0f);
+		pPixel[1] = sw::clamp(pPixel[1], 0.0f, 1.0f);
+		pPixel[2] = sw::clamp(pPixel[2], 0.0f, 1.0f);
+		pPixel[3] = sw::clamp(pPixel[3], 0.0f, 1.0f);
+	}
+	else if (viewFormat.isSignedNormalized())
+	{
+		pPixel[0] = sw::clamp(pPixel[0], -1.0f, 1.0f);
+		pPixel[1] = sw::clamp(pPixel[1], -1.0f, 1.0f);
+		pPixel[2] = sw::clamp(pPixel[2], -1.0f, 1.0f);
+		pPixel[3] = sw::clamp(pPixel[3], -1.0f, 1.0f);
 	}
 
-	void Blitter::clear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format& viewFormat, const VkImageSubresourceRange& subresourceRange, const VkRect2D* renderArea)
+	if(fastClear(pixel, format, dest, dstFormat, subresourceRange, renderArea))
 	{
-		VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
-		vk::Format dstFormat = viewFormat.getAspectFormat(aspect);
-		if(dstFormat == VK_FORMAT_UNDEFINED)
+		return;
+	}
+
+	State state(format, dstFormat, 1, dest->getSampleCountFlagBits(), Options{ 0xF });
+	auto blitRoutine = getBlitRoutine(state);
+	if(!blitRoutine)
+	{
+		return;
+	}
+
+	VkImageSubresourceLayers subresLayers =
+	{
+		subresourceRange.aspectMask,
+		subresourceRange.baseMipLevel,
+		subresourceRange.baseArrayLayer,
+		1
+	};
+
+	uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
+	uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);
+
+	VkRect2D area = { { 0, 0 }, { 0, 0 } };
+	if(renderArea)
+	{
+		ASSERT(subresourceRange.levelCount == 1);
+		area = *renderArea;
+	}
+
+	for(; subresLayers.mipLevel <= lastMipLevel; subresLayers.mipLevel++)
+	{
+		VkExtent3D extent = dest->getMipLevelExtent(aspect, subresLayers.mipLevel);
+		if(!renderArea)
 		{
-			return;
+			area.extent.width = extent.width;
+			area.extent.height = extent.height;
 		}
 
-		float *pPixel = static_cast<float *>(pixel);
-		if (viewFormat.isUnsignedNormalized())
+		BlitData data =
 		{
-			pPixel[0] = sw::clamp(pPixel[0], 0.0f, 1.0f);
-			pPixel[1] = sw::clamp(pPixel[1], 0.0f, 1.0f);
-			pPixel[2] = sw::clamp(pPixel[2], 0.0f, 1.0f);
-			pPixel[3] = sw::clamp(pPixel[3], 0.0f, 1.0f);
-		}
-		else if (viewFormat.isSignedNormalized())
-		{
-			pPixel[0] = sw::clamp(pPixel[0], -1.0f, 1.0f);
-			pPixel[1] = sw::clamp(pPixel[1], -1.0f, 1.0f);
-			pPixel[2] = sw::clamp(pPixel[2], -1.0f, 1.0f);
-			pPixel[3] = sw::clamp(pPixel[3], -1.0f, 1.0f);
-		}
+			pixel, nullptr, // source, dest
 
-		if(fastClear(pixel, format, dest, dstFormat, subresourceRange, renderArea))
-		{
-			return;
-		}
+			format.bytes(),                                       // sPitchB
+			dest->rowPitchBytes(aspect, subresLayers.mipLevel),   // dPitchB
+			0,                                                    // sSliceB (unused in clear operations)
+			dest->slicePitchBytes(aspect, subresLayers.mipLevel), // dSliceB
 
-		State state(format, dstFormat, 1, dest->getSampleCountFlagBits(), Options{ 0xF });
-		auto blitRoutine = getBlitRoutine(state);
-		if(!blitRoutine)
-		{
-			return;
-		}
+			0.5f, 0.5f, 0.0f, 0.0f, // x0, y0, w, h
 
-		VkImageSubresourceLayers subresLayers =
-		{
-			subresourceRange.aspectMask,
-			subresourceRange.baseMipLevel,
-			subresourceRange.baseArrayLayer,
-			1
+			area.offset.y, static_cast<int>(area.offset.y + area.extent.height), // y0d, y1d
+			area.offset.x, static_cast<int>(area.offset.x + area.extent.width),  // x0d, x1d
+
+			0, 0, // sWidth, sHeight
 		};
 
-		uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
-		uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);
-
-		VkRect2D area = { { 0, 0 }, { 0, 0 } };
-		if(renderArea)
+		if (renderArea && dest->is3DSlice())
 		{
-			ASSERT(subresourceRange.levelCount == 1);
-			area = *renderArea;
-		}
-
-		for(; subresLayers.mipLevel <= lastMipLevel; subresLayers.mipLevel++)
-		{
-			VkExtent3D extent = dest->getMipLevelExtent(aspect, subresLayers.mipLevel);
-			if(!renderArea)
+			// Reinterpret layers as depth slices
+			subresLayers.baseArrayLayer = 0;
+			subresLayers.layerCount = 1;
+			for (uint32_t depth = subresourceRange.baseArrayLayer; depth <= lastLayer; depth++)
 			{
-				area.extent.width = extent.width;
-				area.extent.height = extent.height;
-			}
-
-			BlitData data =
-			{
-				pixel, nullptr, // source, dest
-
-				format.bytes(),                                       // sPitchB
-				dest->rowPitchBytes(aspect, subresLayers.mipLevel),   // dPitchB
-				0,                                                    // sSliceB (unused in clear operations)
-				dest->slicePitchBytes(aspect, subresLayers.mipLevel), // dSliceB
-
-				0.5f, 0.5f, 0.0f, 0.0f, // x0, y0, w, h
-
-				area.offset.y, static_cast<int>(area.offset.y + area.extent.height), // y0d, y1d
-				area.offset.x, static_cast<int>(area.offset.x + area.extent.width),  // x0d, x1d
-
-				0, 0, // sWidth, sHeight
-			};
-
-			if (renderArea && dest->is3DSlice())
-			{
-				// Reinterpret layers as depth slices
-				subresLayers.baseArrayLayer = 0;
-				subresLayers.layerCount = 1;
-				for (uint32_t depth = subresourceRange.baseArrayLayer; depth <= lastLayer; depth++)
-				{
-					data.dest = dest->getTexelPointer({0, 0, static_cast<int32_t>(depth)}, subresLayers);
-					blitRoutine(&data);
-				}
-			}
-			else
-			{
-				for(subresLayers.baseArrayLayer = subresourceRange.baseArrayLayer; subresLayers.baseArrayLayer <= lastLayer; subresLayers.baseArrayLayer++)
-				{
-					for(uint32_t depth = 0; depth < extent.depth; depth++)
-					{
-						data.dest = dest->getTexelPointer({ 0, 0, static_cast<int32_t>(depth) }, subresLayers);
-
-						blitRoutine(&data);
-					}
-				}
-			}
-		}
-	}
-
-	bool Blitter::fastClear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format& viewFormat, const VkImageSubresourceRange& subresourceRange, const VkRect2D* renderArea)
-	{
-		if(format != VK_FORMAT_R32G32B32A32_SFLOAT)
-		{
-			return false;
-		}
-
-		float *color = (float*)pixel;
-		float r = color[0];
-		float g = color[1];
-		float b = color[2];
-		float a = color[3];
-
-		uint32_t packed;
-
-		VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
-		switch(viewFormat)
-		{
-		case VK_FORMAT_R5G6B5_UNORM_PACK16:
-			packed = ((uint16_t)(31 * b + 0.5f) << 0) |
-			         ((uint16_t)(63 * g + 0.5f) << 5) |
-			         ((uint16_t)(31 * r + 0.5f) << 11);
-			break;
-		case VK_FORMAT_B5G6R5_UNORM_PACK16:
-			packed = ((uint16_t)(31 * r + 0.5f) << 0) |
-			         ((uint16_t)(63 * g + 0.5f) << 5) |
-			         ((uint16_t)(31 * b + 0.5f) << 11);
-			break;
-		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
-		case VK_FORMAT_R8G8B8A8_UNORM:
-			packed = ((uint32_t)(255 * a + 0.5f) << 24) |
-			         ((uint32_t)(255 * b + 0.5f) << 16) |
-			         ((uint32_t)(255 * g + 0.5f) << 8) |
-			         ((uint32_t)(255 * r + 0.5f) << 0);
-			break;
-		case VK_FORMAT_B8G8R8A8_UNORM:
-			packed = ((uint32_t)(255 * a + 0.5f) << 24) |
-			         ((uint32_t)(255 * r + 0.5f) << 16) |
-			         ((uint32_t)(255 * g + 0.5f) << 8) |
-			         ((uint32_t)(255 * b + 0.5f) << 0);
-			break;
-		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
-			packed = R11G11B10F(color);
-			break;
-		case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
-			packed = RGB9E5(color);
-			break;
-		default:
-			return false;
-		}
-
-		VkImageSubresourceLayers subresLayers =
-		{
-			subresourceRange.aspectMask,
-			subresourceRange.baseMipLevel,
-			subresourceRange.baseArrayLayer,
-			1
-		};
-		uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
-		uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);
-
-		VkRect2D area = { { 0, 0 }, { 0, 0 } };
-		if(renderArea)
-		{
-			ASSERT(subresourceRange.levelCount == 1);
-			area = *renderArea;
-		}
-
-		for(; subresLayers.mipLevel <= lastMipLevel; subresLayers.mipLevel++)
-		{
-			int rowPitchBytes = dest->rowPitchBytes(aspect, subresLayers.mipLevel);
-			int slicePitchBytes = dest->slicePitchBytes(aspect, subresLayers.mipLevel);
-			VkExtent3D extent = dest->getMipLevelExtent(aspect, subresLayers.mipLevel);
-			if(!renderArea)
-			{
-				area.extent.width = extent.width;
-				area.extent.height = extent.height;
-			}
-			if(dest->is3DSlice())
-			{
-				extent.depth = 1; // The 3D image is instead interpreted as a 2D image with layers
-			}
-
-			for(subresLayers.baseArrayLayer = subresourceRange.baseArrayLayer; subresLayers.baseArrayLayer <= lastLayer; subresLayers.baseArrayLayer++)
-			{
-				for(uint32_t depth = 0; depth < extent.depth; depth++)
-				{
-					uint8_t *slice = (uint8_t*)dest->getTexelPointer(
-						{ area.offset.x, area.offset.y, static_cast<int32_t>(depth) }, subresLayers);
-
-					for(int j = 0; j < dest->getSampleCountFlagBits(); j++)
-					{
-						uint8_t *d = slice;
-
-						switch(viewFormat.bytes())
-						{
-						case 2:
-							for(uint32_t i = 0; i < area.extent.height; i++)
-							{
-								ASSERT(d < dest->end());
-								sw::clear((uint16_t*)d, static_cast<uint16_t>(packed), area.extent.width);
-								d += rowPitchBytes;
-							}
-							break;
-						case 4:
-							for(uint32_t i = 0; i < area.extent.height; i++)
-							{
-								ASSERT(d < dest->end());
-								sw::clear((uint32_t*)d, packed, area.extent.width);
-								d += rowPitchBytes;
-							}
-							break;
-						default:
-							assert(false);
-						}
-
-						slice += slicePitchBytes;
-					}
-				}
-			}
-		}
-
-		return true;
-	}
-
-	Float4 Blitter::readFloat4(Pointer<Byte> element, const State &state)
-	{
-		Float4 c(0.0f, 0.0f, 0.0f, 1.0f);
-
-		switch(state.sourceFormat)
-		{
-		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
-			c.w = Float(Int(*Pointer<Byte>(element)) & Int(0xF));
-			c.x = Float((Int(*Pointer<Byte>(element)) >> 4) & Int(0xF));
-			c.y = Float(Int(*Pointer<Byte>(element + 1)) & Int(0xF));
-			c.z = Float((Int(*Pointer<Byte>(element + 1)) >> 4) & Int(0xF));
-			break;
-		case VK_FORMAT_R8_SINT:
-		case VK_FORMAT_R8_SNORM:
-			c.x = Float(Int(*Pointer<SByte>(element)));
-			c.w = float(0x7F);
-			break;
-		case VK_FORMAT_R8_UNORM:
-		case VK_FORMAT_R8_UINT:
-		case VK_FORMAT_R8_SRGB:
-			c.x = Float(Int(*Pointer<Byte>(element)));
-			c.w = float(0xFF);
-			break;
-		case VK_FORMAT_R16_SINT:
-		case VK_FORMAT_R16_SNORM:
-			c.x = Float(Int(*Pointer<Short>(element)));
-			c.w = float(0x7FFF);
-			break;
-		case VK_FORMAT_R16_UNORM:
-		case VK_FORMAT_R16_UINT:
-			c.x = Float(Int(*Pointer<UShort>(element)));
-			c.w = float(0xFFFF);
-			break;
-		case VK_FORMAT_R32_SINT:
-			c.x = Float(*Pointer<Int>(element));
-			c.w = float(0x7FFFFFFF);
-			break;
-		case VK_FORMAT_R32_UINT:
-			c.x = Float(*Pointer<UInt>(element));
-			c.w = float(0xFFFFFFFF);
-			break;
-		case VK_FORMAT_B8G8R8A8_SRGB:
-		case VK_FORMAT_B8G8R8A8_UNORM:
-			c = Float4(*Pointer<Byte4>(element)).zyxw;
-			break;
-		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_SINT:
-		case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
-		case VK_FORMAT_R8G8B8A8_SNORM:
-			c = Float4(*Pointer<SByte4>(element));
-			break;
-		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
-		case VK_FORMAT_R8G8B8A8_UNORM:
-		case VK_FORMAT_R8G8B8A8_UINT:
-		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
-		case VK_FORMAT_R8G8B8A8_SRGB:
-			c = Float4(*Pointer<Byte4>(element));
-			break;
-		case VK_FORMAT_R16G16B16A16_SINT:
-			c = Float4(*Pointer<Short4>(element));
-			break;
-		case VK_FORMAT_R16G16B16A16_UNORM:
-		case VK_FORMAT_R16G16B16A16_UINT:
-			c = Float4(*Pointer<UShort4>(element));
-			break;
-		case VK_FORMAT_R32G32B32A32_SINT:
-			c = Float4(*Pointer<Int4>(element));
-			break;
-		case VK_FORMAT_R32G32B32A32_UINT:
-			c = Float4(*Pointer<UInt4>(element));
-			break;
-		case VK_FORMAT_R8G8_SINT:
-		case VK_FORMAT_R8G8_SNORM:
-			c.x = Float(Int(*Pointer<SByte>(element + 0)));
-			c.y = Float(Int(*Pointer<SByte>(element + 1)));
-			c.w = float(0x7F);
-			break;
-		case VK_FORMAT_R8G8_UNORM:
-		case VK_FORMAT_R8G8_UINT:
-		case VK_FORMAT_R8G8_SRGB:
-			c.x = Float(Int(*Pointer<Byte>(element + 0)));
-			c.y = Float(Int(*Pointer<Byte>(element + 1)));
-			c.w = float(0xFF);
-			break;
-		case VK_FORMAT_R16G16_SINT:
-		case VK_FORMAT_R16G16_SNORM:
-			c.x = Float(Int(*Pointer<Short>(element + 0)));
-			c.y = Float(Int(*Pointer<Short>(element + 2)));
-			c.w = float(0x7FFF);
-			break;
-		case VK_FORMAT_R16G16_UNORM:
-		case VK_FORMAT_R16G16_UINT:
-			c.x = Float(Int(*Pointer<UShort>(element + 0)));
-			c.y = Float(Int(*Pointer<UShort>(element + 2)));
-			c.w = float(0xFFFF);
-			break;
-		case VK_FORMAT_R32G32_SINT:
-			c.x = Float(*Pointer<Int>(element + 0));
-			c.y = Float(*Pointer<Int>(element + 4));
-			c.w = float(0x7FFFFFFF);
-			break;
-		case VK_FORMAT_R32G32_UINT:
-			c.x = Float(*Pointer<UInt>(element + 0));
-			c.y = Float(*Pointer<UInt>(element + 4));
-			c.w = float(0xFFFFFFFF);
-			break;
-		case VK_FORMAT_R32G32B32A32_SFLOAT:
-			c = *Pointer<Float4>(element);
-			break;
-		case VK_FORMAT_R32G32_SFLOAT:
-			c.x = *Pointer<Float>(element + 0);
-			c.y = *Pointer<Float>(element + 4);
-			break;
-		case VK_FORMAT_R32_SFLOAT:
-			c.x = *Pointer<Float>(element);
-			break;
-		case VK_FORMAT_R16G16B16A16_SFLOAT:
-			c.w = Float(*Pointer<Half>(element + 6));
-		case VK_FORMAT_R16G16B16_SFLOAT:
-			c.z = Float(*Pointer<Half>(element + 4));
-		case VK_FORMAT_R16G16_SFLOAT:
-			c.y = Float(*Pointer<Half>(element + 2));
-		case VK_FORMAT_R16_SFLOAT:
-			c.x = Float(*Pointer<Half>(element));
-			break;
-		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
-			// 10 (or 11) bit float formats are unsigned formats with a 5 bit exponent and a 5 (or 6) bit mantissa.
-			// Since the Half float format also has a 5 bit exponent, we can convert these formats to half by
-			// copy/pasting the bits so that the exponent bits and top mantissa bits are aligned to the half format.
-			// In this case, we have:
-			//              B B B B B B B B B B G G G G G G G G G G G R R R R R R R R R R R
-			// 1st Short:                                  |xxxxxxxxxx---------------------|
-			// 2nd Short:                  |xxxx---------------------xxxxxx|
-			// 3rd Short: |--------------------xxxxxxxxxxxx|
-			// These memory reads overlap, but each of them contains an entire channel, so we can read this without
-			// any int -> short conversion.
-			c.x = Float(As<Half>((*Pointer<UShort>(element + 0) & UShort(0x07FF)) << UShort(4)));
-			c.y = Float(As<Half>((*Pointer<UShort>(element + 1) & UShort(0x3FF8)) << UShort(1)));
-			c.z = Float(As<Half>((*Pointer<UShort>(element + 2) & UShort(0xFFC0)) >> UShort(1)));
-			break;
-		case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
-			// This type contains a common 5 bit exponent (E) and a 9 bit mantissa for R, G and B.
-			c.x = Float(*Pointer<UInt>(element) & UInt(0x000001FF));         // R's mantissa (bits 0-8)
-			c.y = Float((*Pointer<UInt>(element) & UInt(0x0003FE00)) >> 9);  // G's mantissa (bits 9-17)
-			c.z = Float((*Pointer<UInt>(element) & UInt(0x07FC0000)) >> 18); // B's mantissa (bits 18-26)
-			c *= Float4(
-				// 2^E, using the exponent (bits 27-31) and treating it as an unsigned integer value
-				Float(UInt(1) << ((*Pointer<UInt>(element) & UInt(0xF8000000)) >> 27)) *
-				// Since the 9 bit mantissa values currently stored in RGB were converted straight
-				// from int to float (in the [0, 1<<9] range instead of the [0, 1] range), they
-				// are (1 << 9) times too high.
-				// Also, the exponent has 5 bits and we compute the exponent bias of floating point
-				// formats using "2^(k-1) - 1", so, in this case, the exponent bias is 2^(5-1)-1 = 15
-				// Exponent bias (15) + number of mantissa bits per component (9) = 24
-				Float(1.0f / (1 << 24)));
-			c.w = 1.0f;
-			break;
-		case VK_FORMAT_R5G6B5_UNORM_PACK16:
-			c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
-			c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
-			c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
-			break;
-		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
-			c.w = Float(Int((*Pointer<UShort>(element) & UShort(0x8000)) >> UShort(15)));
-			c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x7C00)) >> UShort(10)));
-			c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x03E0)) >> UShort(5)));
-			c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
-			break;
-		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
-		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
-			c.x = Float(Int((*Pointer<UInt>(element) & UInt(0x000003FF))));
-			c.y = Float(Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10));
-			c.z = Float(Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20));
-			c.w = Float(Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30));
-			break;
-		case VK_FORMAT_D16_UNORM:
-			c.x = Float(Int((*Pointer<UShort>(element))));
-			break;
-		case VK_FORMAT_X8_D24_UNORM_PACK32:
-			c.x = Float(Int((*Pointer<UInt>(element) & UInt(0xFFFFFF00)) >> 8));
-			break;
-		case VK_FORMAT_D32_SFLOAT:
-			c.x = *Pointer<Float>(element);
-			break;
-		case VK_FORMAT_S8_UINT:
-			c.x = Float(Int(*Pointer<Byte>(element)));
-			break;
-		default:
-			UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
-		}
-
-		return c;
-	}
-
-	void Blitter::write(Float4 &c, Pointer<Byte> element, const State &state)
-	{
-		bool writeR = state.writeRed;
-		bool writeG = state.writeGreen;
-		bool writeB = state.writeBlue;
-		bool writeA = state.writeAlpha;
-		bool writeRGBA = writeR && writeG && writeB && writeA;
-
-		switch(state.destFormat)
-		{
-		case VK_FORMAT_R4G4_UNORM_PACK8:
-			if(writeR | writeG)
-			{
-				if(!writeR)
-				{
-					*Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
-				                              (*Pointer<Byte>(element) & Byte(0xF0));
-				}
-				else if(!writeG)
-				{
-					*Pointer<Byte>(element) = (*Pointer<Byte>(element) & Byte(0xF)) |
-				                              (Byte(RoundInt(Float(c.x))) << Byte(4));
-				}
-				else
-				{
-					*Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
-				                              (Byte(RoundInt(Float(c.x))) << Byte(4));
-				}
-			}
-			break;
-		case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
-			if(writeR || writeG || writeB || writeA)
-			{
-				*Pointer<UShort>(element) = (writeR ? ((UShort(RoundInt(Float(c.x))) & UShort(0xF)) << UShort(12)) :
-				                                      (*Pointer<UShort>(element) & UShort(0x000F))) |
-				                            (writeG ? ((UShort(RoundInt(Float(c.y))) & UShort(0xF)) << UShort(8)) :
-				                                      (*Pointer<UShort>(element) & UShort(0x00F0))) |
-				                            (writeB ? ((UShort(RoundInt(Float(c.z))) & UShort(0xF)) << UShort(4)) :
-			                                          (*Pointer<UShort>(element) & UShort(0x0F00))) |
-			                                (writeA ? (UShort(RoundInt(Float(c.w))) & UShort(0xF)) :
-			                                          (*Pointer<UShort>(element) & UShort(0xF000)));
-			}
-			break;
-		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
-			if(writeRGBA)
-			{
-				*Pointer<UShort>(element) = UShort(RoundInt(Float(c.w)) & Int(0xF)) |
-				                            UShort((RoundInt(Float(c.x)) & Int(0xF)) << 4) |
-				                            UShort((RoundInt(Float(c.y)) & Int(0xF)) << 8) |
-				                            UShort((RoundInt(Float(c.z)) & Int(0xF)) << 12);
-			}
-			else
-			{
-				unsigned short mask = (writeA ? 0x000F : 0x0000) |
-				                      (writeR ? 0x00F0 : 0x0000) |
-				                      (writeG ? 0x0F00 : 0x0000) |
-				                      (writeB ? 0xF000 : 0x0000);
-				unsigned short unmask = ~mask;
-				*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
-				                            ((UShort(RoundInt(Float(c.w)) & Int(0xF)) |
-				                              UShort((RoundInt(Float(c.x)) & Int(0xF)) << 4) |
-				                              UShort((RoundInt(Float(c.y)) & Int(0xF)) << 8) |
-				                              UShort((RoundInt(Float(c.z)) & Int(0xF)) << 12)) & UShort(mask));
-			}
-			break;
-		case VK_FORMAT_B8G8R8A8_SRGB:
-		case VK_FORMAT_B8G8R8A8_UNORM:
-			if(writeRGBA)
-			{
-				Short4 c0 = RoundShort4(c.zyxw);
-				*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
-			}
-			else
-			{
-				if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
-				if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
-				if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
-				if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
-			}
-			break;
-		case VK_FORMAT_B8G8R8_SNORM:
-			if(writeB) { *Pointer<SByte>(element + 0) = SByte(RoundInt(Float(c.z))); }
-			if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
-			if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
-			break;
-		case VK_FORMAT_B8G8R8_UNORM:
-		case VK_FORMAT_B8G8R8_SRGB:
-			if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
-			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
-			if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
-			break;
-		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
-		case VK_FORMAT_R8G8B8A8_UNORM:
-		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
-		case VK_FORMAT_R8G8B8A8_SRGB:
-		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_UINT:
-		case VK_FORMAT_R8G8B8A8_USCALED:
-		case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
-			if(writeRGBA)
-			{
-				Short4 c0 = RoundShort4(c);
-				*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
-			}
-			else
-			{
-				if(writeR) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.x))); }
-				if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
-				if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
-				if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
-			}
-			break;
-		case VK_FORMAT_R32G32B32A32_SFLOAT:
-			if(writeRGBA)
-			{
-				*Pointer<Float4>(element) = c;
-			}
-			else
-			{
-				if(writeR) { *Pointer<Float>(element) = c.x; }
-				if(writeG) { *Pointer<Float>(element + 4) = c.y; }
-				if(writeB) { *Pointer<Float>(element + 8) = c.z; }
-				if(writeA) { *Pointer<Float>(element + 12) = c.w; }
-			}
-			break;
-		case VK_FORMAT_R32G32B32_SFLOAT:
-			if(writeR) { *Pointer<Float>(element) = c.x; }
-			if(writeG) { *Pointer<Float>(element + 4) = c.y; }
-			if(writeB) { *Pointer<Float>(element + 8) = c.z; }
-			break;
-		case VK_FORMAT_R32G32_SFLOAT:
-			if(writeR && writeG)
-			{
-				*Pointer<Float2>(element) = Float2(c);
-			}
-			else
-			{
-				if(writeR) { *Pointer<Float>(element) = c.x; }
-				if(writeG) { *Pointer<Float>(element + 4) = c.y; }
-			}
-			break;
-		case VK_FORMAT_R32_SFLOAT:
-			if(writeR) { *Pointer<Float>(element) = c.x; }
-			break;
-		case VK_FORMAT_R16G16B16A16_SFLOAT:
-			if(writeA) { *Pointer<Half>(element + 6) = Half(c.w); }
-		case VK_FORMAT_R16G16B16_SFLOAT:
-			if(writeB) { *Pointer<Half>(element + 4) = Half(c.z); }
-		case VK_FORMAT_R16G16_SFLOAT:
-			if(writeG) { *Pointer<Half>(element + 2) = Half(c.y); }
-		case VK_FORMAT_R16_SFLOAT:
-			if(writeR) { *Pointer<Half>(element) = Half(c.x); }
-			break;
-		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
-			{
-				// 10 (or 11) bit float formats are unsigned formats with a 5 bit exponent and a 5 (or 6) bit mantissa.
-				// Since the 16-bit half-precision float format also has a 5 bit exponent, we can extract these minifloats from them.
-
-				// FIXME(b/138944025): Handle negative values, Inf, and NaN.
-				// FIXME(b/138944025): Perform rounding before truncating the mantissa.
-				UInt r = (UInt(As<UShort>(Half(c.x))) & 0x00007FF0) >> 4;
-				UInt g = (UInt(As<UShort>(Half(c.y))) & 0x00007FF0) << 7;
-				UInt b = (UInt(As<UShort>(Half(c.z))) & 0x00007FE0) << 17;
-
-				UInt rgb = r | g | b;
-
-				UInt old = *Pointer<UInt>(element);
-
-				unsigned int mask = (writeR ? 0x000007FF : 0) |
-				                    (writeG ? 0x003FF800 : 0) |
-				                    (writeB ? 0xFFC00000 : 0);
-
-				*Pointer<UInt>(element) = (rgb & mask) | (old & ~mask);
-			}
-			break;
-		case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
-			{
-				ASSERT(writeRGBA);  // Can't sensibly write just part of this format.
-
-				// Vulkan 1.1.117 section 15.2.1 RGB to Shared Exponent Conversion
-
-				constexpr int N = 9;       // number of mantissa bits per component
-				constexpr int B = 15;      // exponent bias
-				constexpr int E_max = 31;  // maximum possible biased exponent value
-
-				// Maximum representable value.
-				constexpr float sharedexp_max = ((static_cast<float>(1 << N) - 1) / static_cast<float>(1 << N)) * static_cast<float>(1 << (E_max - B));
-
-				// Clamp components to valid range. NaN becomes 0.
-				Float red_c =   Min(IfThenElse(!(c.x > 0), Float(0), Float(c.x)), sharedexp_max);
-				Float green_c = Min(IfThenElse(!(c.y > 0), Float(0), Float(c.y)), sharedexp_max);
-				Float blue_c =  Min(IfThenElse(!(c.z > 0), Float(0), Float(c.z)), sharedexp_max);
-
-				// We're reducing the mantissa to 9 bits, so we must round up if the next
-				// bit is 1. In other words add 0.5 to the new mantissa's position and
-				// allow overflow into the exponent so we can scale correctly.
-				constexpr int half = 1 << (23 - N);
-				Float red_r = As<Float>(As<Int>(red_c) + half);
-				Float green_r = As<Float>(As<Int>(green_c) + half);
-				Float blue_r = As<Float>(As<Int>(blue_c) + half);
-
-				// The largest component determines the shared exponent. It can't be lower
-				// than 0 (after bias subtraction) so also limit to the minimum representable.
-				constexpr float min_s = 0.5f / (1 << B);
-				Float max_s = Max(Max(red_r, green_r), Max(blue_r, min_s));
-
-				// Obtain the reciprocal of the shared exponent by inverting the bits,
-				// and scale by the new mantissa's size. Note that the IEEE-754 single-precision
-				// format has an implicit leading 1, but this shared component format does not.
-				Float scale = As<Float>((As<Int>(max_s) & 0x7F800000) ^ 0x7F800000) * (1 << (N - 2));
-
-				UInt R9 = RoundInt(red_c * scale);
-				UInt G9 = UInt(RoundInt(green_c * scale));
-				UInt B9 = UInt(RoundInt(blue_c * scale));
-				UInt E5 = (As<UInt>(max_s) >> 23) - 127 + 15 + 1;
-
-				UInt E5B9G9R9 = (E5 << 27) | (B9 << 18) | (G9 << 9) | R9;
-
-				*Pointer<UInt>(element) = E5B9G9R9;
-			}
-			break;
-		case VK_FORMAT_B8G8R8A8_SNORM:
-			if(writeB) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.z))); }
-			if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
-			if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
-			if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
-			break;
-		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_SINT:
-		case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
-		case VK_FORMAT_R8G8B8A8_SNORM:
-		case VK_FORMAT_R8G8B8A8_SSCALED:
-		case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
-			if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
-		case VK_FORMAT_R8G8B8_SINT:
-		case VK_FORMAT_R8G8B8_SNORM:
-		case VK_FORMAT_R8G8B8_SSCALED:
-			if(writeB) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.z))); }
-		case VK_FORMAT_R8G8_SINT:
-		case VK_FORMAT_R8G8_SNORM:
-		case VK_FORMAT_R8G8_SSCALED:
-			if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
-		case VK_FORMAT_R8_SINT:
-		case VK_FORMAT_R8_SNORM:
-		case VK_FORMAT_R8_SSCALED:
-			if(writeR) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.x))); }
-			break;
-		case VK_FORMAT_R8G8B8_UINT:
-		case VK_FORMAT_R8G8B8_UNORM:
-		case VK_FORMAT_R8G8B8_USCALED:
-		case VK_FORMAT_R8G8B8_SRGB:
-			if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
-		case VK_FORMAT_R8G8_UINT:
-		case VK_FORMAT_R8G8_UNORM:
-		case VK_FORMAT_R8G8_USCALED:
-		case VK_FORMAT_R8G8_SRGB:
-			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
-		case VK_FORMAT_R8_UINT:
-		case VK_FORMAT_R8_UNORM:
-		case VK_FORMAT_R8_USCALED:
-		case VK_FORMAT_R8_SRGB:
-			if(writeR) { *Pointer<Byte>(element) = Byte(RoundInt(Float(c.x))); }
-			break;
-		case VK_FORMAT_R16G16B16A16_SINT:
-		case VK_FORMAT_R16G16B16A16_SNORM:
-		case VK_FORMAT_R16G16B16A16_SSCALED:
-			if(writeRGBA)
-			{
-				*Pointer<Short4>(element) = Short4(RoundInt(c));
-			}
-			else
-			{
-				if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
-				if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
-				if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
-				if(writeA) { *Pointer<Short>(element + 6) = Short(RoundInt(Float(c.w))); }
-			}
-			break;
-		case VK_FORMAT_R16G16B16_SINT:
-		case VK_FORMAT_R16G16B16_SNORM:
-		case VK_FORMAT_R16G16B16_SSCALED:
-			if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
-			if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
-			if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
-			break;
-		case VK_FORMAT_R16G16_SINT:
-		case VK_FORMAT_R16G16_SNORM:
-		case VK_FORMAT_R16G16_SSCALED:
-			if(writeR && writeG)
-			{
-				*Pointer<Short2>(element) = Short2(Short4(RoundInt(c)));
-			}
-			else
-			{
-				if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
-				if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
-			}
-			break;
-		case VK_FORMAT_R16_SINT:
-		case VK_FORMAT_R16_SNORM:
-		case VK_FORMAT_R16_SSCALED:
-			if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
-			break;
-		case VK_FORMAT_R16G16B16A16_UINT:
-		case VK_FORMAT_R16G16B16A16_UNORM:
-		case VK_FORMAT_R16G16B16A16_USCALED:
-			if(writeRGBA)
-			{
-				*Pointer<UShort4>(element) = UShort4(RoundInt(c));
-			}
-			else
-			{
-				if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
-				if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
-				if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
-				if(writeA) { *Pointer<UShort>(element + 6) = UShort(RoundInt(Float(c.w))); }
-			}
-			break;
-		case VK_FORMAT_R16G16B16_UINT:
-		case VK_FORMAT_R16G16B16_UNORM:
-		case VK_FORMAT_R16G16B16_USCALED:
-			if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
-			if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
-			if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
-			break;
-		case VK_FORMAT_R16G16_UINT:
-		case VK_FORMAT_R16G16_UNORM:
-		case VK_FORMAT_R16G16_USCALED:
-			if(writeR && writeG)
-			{
-				*Pointer<UShort2>(element) = UShort2(UShort4(RoundInt(c)));
-			}
-			else
-			{
-				if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
-				if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
-			}
-			break;
-		case VK_FORMAT_R16_UINT:
-		case VK_FORMAT_R16_UNORM:
-		case VK_FORMAT_R16_USCALED:
-			if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
-			break;
-		case VK_FORMAT_R32G32B32A32_SINT:
-			if(writeRGBA)
-			{
-				*Pointer<Int4>(element) = RoundInt(c);
-			}
-			else
-			{
-				if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
-				if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
-				if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
-				if(writeA) { *Pointer<Int>(element + 12) = RoundInt(Float(c.w)); }
-			}
-			break;
-		case VK_FORMAT_R32G32B32_SINT:
-			if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
-		case VK_FORMAT_R32G32_SINT:
-			if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
-		case VK_FORMAT_R32_SINT:
-			if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
-			break;
-		case VK_FORMAT_R32G32B32A32_UINT:
-			if(writeRGBA)
-			{
-				*Pointer<UInt4>(element) = UInt4(RoundInt(c));
-			}
-			else
-			{
-				if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
-				if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
-				if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
-				if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(RoundInt(Float(c.w))); }
-			}
-			break;
-		case VK_FORMAT_R32G32B32_UINT:
-			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
-		case VK_FORMAT_R32G32_UINT:
-			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
-		case VK_FORMAT_R32_UINT:
-			if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
-			break;
-		case VK_FORMAT_R5G6B5_UNORM_PACK16:
-			if(writeR && writeG && writeB)
-			{
-				*Pointer<UShort>(element) = UShort(RoundInt(Float(c.z)) |
-				                                  (RoundInt(Float(c.y)) << Int(5)) |
-				                                  (RoundInt(Float(c.x)) << Int(11)));
-			}
-			else
-			{
-				unsigned short mask = (writeB ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeR ? 0xF800 : 0x0000);
-				unsigned short unmask = ~mask;
-				*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
-				                            (UShort(RoundInt(Float(c.z)) |
-				                                   (RoundInt(Float(c.y)) << Int(5)) |
-				                                   (RoundInt(Float(c.x)) << Int(11))) & UShort(mask));
-			}
-			break;
-		case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
-			if(writeRGBA)
-			{
-				*Pointer<UShort>(element) = UShort(RoundInt(Float(c.w)) |
-				                                  (RoundInt(Float(c.z)) << Int(1)) |
-				                                  (RoundInt(Float(c.y)) << Int(6)) |
-				                                  (RoundInt(Float(c.x)) << Int(11)));
-			}
-			else
-			{
-				unsigned short mask = (writeA ? 0x8000 : 0x0000) |
-				                      (writeR ? 0x7C00 : 0x0000) |
-				                      (writeG ? 0x03E0 : 0x0000) |
-				                      (writeB ? 0x001F : 0x0000);
-				unsigned short unmask = ~mask;
-				*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
-				                            (UShort(RoundInt(Float(c.w)) |
-				                                   (RoundInt(Float(c.z)) << Int(1)) |
-				                                   (RoundInt(Float(c.y)) << Int(6)) |
-				                                   (RoundInt(Float(c.x)) << Int(11))) & UShort(mask));
-			}
-			break;
-		case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
-			if(writeRGBA)
-			{
-				*Pointer<UShort>(element) = UShort(RoundInt(Float(c.w)) |
-				                                  (RoundInt(Float(c.x)) << Int(1)) |
-				                                  (RoundInt(Float(c.y)) << Int(6)) |
-				                                  (RoundInt(Float(c.z)) << Int(11)));
-			}
-			else
-			{
-				unsigned short mask = (writeA ? 0x8000 : 0x0000) |
-				                      (writeR ? 0x7C00 : 0x0000) |
-				                      (writeG ? 0x03E0 : 0x0000) |
-				                      (writeB ? 0x001F : 0x0000);
-				unsigned short unmask = ~mask;
-				*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
-				                            (UShort(RoundInt(Float(c.w)) |
-				                                   (RoundInt(Float(c.x)) << Int(1)) |
-				                                   (RoundInt(Float(c.y)) << Int(6)) |
-				                                   (RoundInt(Float(c.z)) << Int(11))) & UShort(mask));
-			}
-			break;
-		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
-			if(writeRGBA)
-			{
-				*Pointer<UShort>(element) = UShort(RoundInt(Float(c.z)) |
-				                                  (RoundInt(Float(c.y)) << Int(5)) |
-				                                  (RoundInt(Float(c.x)) << Int(10)) |
-				                                  (RoundInt(Float(c.w)) << Int(15)));
-			}
-			else
-			{
-				unsigned short mask = (writeA ? 0x8000 : 0x0000) |
-				                      (writeR ? 0x7C00 : 0x0000) |
-				                      (writeG ? 0x03E0 : 0x0000) |
-				                      (writeB ? 0x001F : 0x0000);
-				unsigned short unmask = ~mask;
-				*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
-				                            (UShort(RoundInt(Float(c.z)) |
-				                                   (RoundInt(Float(c.y)) << Int(5)) |
-				                                   (RoundInt(Float(c.x)) << Int(10)) |
-				                                   (RoundInt(Float(c.w)) << Int(15))) & UShort(mask));
-			}
-			break;
-		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
-		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
-		case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
-			if(writeRGBA)
-			{
-				*Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) |
-				                              (RoundInt(Float(c.y)) << 10) |
-				                              (RoundInt(Float(c.z)) << 20) |
-				                              (RoundInt(Float(c.w)) << 30));
-			}
-			else
-			{
-				unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
-				                    (writeB ? 0x3FF00000 : 0x0000) |
-				                    (writeG ? 0x000FFC00 : 0x0000) |
-				                    (writeR ? 0x000003FF : 0x0000);
-				unsigned int unmask = ~mask;
-				*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
-				                            (UInt(RoundInt(Float(c.x)) |
-				                                 (RoundInt(Float(c.y)) << 10) |
-				                                 (RoundInt(Float(c.z)) << 20) |
-				                                 (RoundInt(Float(c.w)) << 30)) & UInt(mask));
-			}
-			break;
-		case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
-		case VK_FORMAT_A2R10G10B10_UINT_PACK32:
-		case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
-			if(writeRGBA)
-			{
-				*Pointer<UInt>(element) = UInt(RoundInt(Float(c.z)) |
-				                              (RoundInt(Float(c.y)) << 10) |
-				                              (RoundInt(Float(c.x)) << 20) |
-				                              (RoundInt(Float(c.w)) << 30));
-			}
-			else
-			{
-				unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
-				                    (writeR ? 0x3FF00000 : 0x0000) |
-				                    (writeG ? 0x000FFC00 : 0x0000) |
-				                    (writeB ? 0x000003FF : 0x0000);
-				unsigned int unmask = ~mask;
-				*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
-				                            (UInt(RoundInt(Float(c.z)) |
-				                                 (RoundInt(Float(c.y)) << 10) |
-				                                 (RoundInt(Float(c.x)) << 20) |
-				                                 (RoundInt(Float(c.w)) << 30)) & UInt(mask));
-			}
-			break;
-		case VK_FORMAT_D16_UNORM:
-			*Pointer<UShort>(element) = UShort(RoundInt(Float(c.x)));
-			break;
-		case VK_FORMAT_X8_D24_UNORM_PACK32:
-			*Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) << 8);
-			break;
-		case VK_FORMAT_D32_SFLOAT:
-			*Pointer<Float>(element) = c.x;
-			break;
-		case VK_FORMAT_S8_UINT:
-			*Pointer<Byte>(element) = Byte(RoundInt(Float(c.x)));
-			break;
-		default:
-			UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
-			break;
-		}
-	}
-
-	Int4 Blitter::readInt4(Pointer<Byte> element, const State &state)
-	{
-		Int4 c(0, 0, 0, 1);
-
-		switch(state.sourceFormat)
-		{
-		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_SINT:
-			c = Insert(c, Int(*Pointer<SByte>(element + 3)), 3);
-			c = Insert(c, Int(*Pointer<SByte>(element + 2)), 2);
-		case VK_FORMAT_R8G8_SINT:
-			c = Insert(c, Int(*Pointer<SByte>(element + 1)), 1);
-		case VK_FORMAT_R8_SINT:
-			c = Insert(c, Int(*Pointer<SByte>(element)), 0);
-			break;
-		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
-			c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000003FF))), 0);
-			c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10), 1);
-			c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20), 2);
-			c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30), 3);
-			break;
-		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_UINT:
-			c = Insert(c, Int(*Pointer<Byte>(element + 3)), 3);
-			c = Insert(c, Int(*Pointer<Byte>(element + 2)), 2);
-		case VK_FORMAT_R8G8_UINT:
-			c = Insert(c, Int(*Pointer<Byte>(element + 1)), 1);
-		case VK_FORMAT_R8_UINT:
-		case VK_FORMAT_S8_UINT:
-			c = Insert(c, Int(*Pointer<Byte>(element)), 0);
-			break;
-		case VK_FORMAT_R16G16B16A16_SINT:
-			c = Insert(c, Int(*Pointer<Short>(element + 6)), 3);
-			c = Insert(c, Int(*Pointer<Short>(element + 4)), 2);
-		case VK_FORMAT_R16G16_SINT:
-			c = Insert(c, Int(*Pointer<Short>(element + 2)), 1);
-		case VK_FORMAT_R16_SINT:
-			c = Insert(c, Int(*Pointer<Short>(element)), 0);
-			break;
-		case VK_FORMAT_R16G16B16A16_UINT:
-			c = Insert(c, Int(*Pointer<UShort>(element + 6)), 3);
-			c = Insert(c, Int(*Pointer<UShort>(element + 4)), 2);
-		case VK_FORMAT_R16G16_UINT:
-			c = Insert(c, Int(*Pointer<UShort>(element + 2)), 1);
-		case VK_FORMAT_R16_UINT:
-			c = Insert(c, Int(*Pointer<UShort>(element)), 0);
-			break;
-		case VK_FORMAT_R32G32B32A32_SINT:
-		case VK_FORMAT_R32G32B32A32_UINT:
-			c = *Pointer<Int4>(element);
-			break;
-		case VK_FORMAT_R32G32_SINT:
-		case VK_FORMAT_R32G32_UINT:
-			c = Insert(c, *Pointer<Int>(element + 4), 1);
-		case VK_FORMAT_R32_SINT:
-		case VK_FORMAT_R32_UINT:
-			c = Insert(c, *Pointer<Int>(element), 0);
-			break;
-		default:
-			UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
-		}
-
-		return c;
-	}
-
-	void Blitter::write(Int4 &c, Pointer<Byte> element, const State &state)
-	{
-		bool writeR = state.writeRed;
-		bool writeG = state.writeGreen;
-		bool writeB = state.writeBlue;
-		bool writeA = state.writeAlpha;
-		bool writeRGBA = writeR && writeG && writeB && writeA;
-
-		switch(state.destFormat)
-		{
-		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
-			c = Min(As<UInt4>(c), UInt4(0x03FF, 0x03FF, 0x03FF, 0x0003));
-			break;
-		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_UINT:
-		case VK_FORMAT_R8G8B8_UINT:
-		case VK_FORMAT_R8G8_UINT:
-		case VK_FORMAT_R8_UINT:
-		case VK_FORMAT_R8G8B8A8_USCALED:
-		case VK_FORMAT_R8G8B8_USCALED:
-		case VK_FORMAT_R8G8_USCALED:
-		case VK_FORMAT_R8_USCALED:
-		case VK_FORMAT_S8_UINT:
-			c = Min(As<UInt4>(c), UInt4(0xFF));
-			break;
-		case VK_FORMAT_R16G16B16A16_UINT:
-		case VK_FORMAT_R16G16B16_UINT:
-		case VK_FORMAT_R16G16_UINT:
-		case VK_FORMAT_R16_UINT:
-		case VK_FORMAT_R16G16B16A16_USCALED:
-		case VK_FORMAT_R16G16B16_USCALED:
-		case VK_FORMAT_R16G16_USCALED:
-		case VK_FORMAT_R16_USCALED:
-			c = Min(As<UInt4>(c), UInt4(0xFFFF));
-			break;
-		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_SINT:
-		case VK_FORMAT_R8G8_SINT:
-		case VK_FORMAT_R8_SINT:
-		case VK_FORMAT_R8G8B8A8_SSCALED:
-		case VK_FORMAT_R8G8B8_SSCALED:
-		case VK_FORMAT_R8G8_SSCALED:
-		case VK_FORMAT_R8_SSCALED:
-			c = Min(Max(c, Int4(-0x80)), Int4(0x7F));
-			break;
-		case VK_FORMAT_R16G16B16A16_SINT:
-		case VK_FORMAT_R16G16B16_SINT:
-		case VK_FORMAT_R16G16_SINT:
-		case VK_FORMAT_R16_SINT:
-		case VK_FORMAT_R16G16B16A16_SSCALED:
-		case VK_FORMAT_R16G16B16_SSCALED:
-		case VK_FORMAT_R16G16_SSCALED:
-		case VK_FORMAT_R16_SSCALED:
-			c = Min(Max(c, Int4(-0x8000)), Int4(0x7FFF));
-			break;
-		default:
-			break;
-		}
-
-		switch(state.destFormat)
-		{
-		case VK_FORMAT_B8G8R8A8_SINT:
-		case VK_FORMAT_B8G8R8A8_SSCALED:
-			if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
-		case VK_FORMAT_B8G8R8_SINT:
-		case VK_FORMAT_B8G8R8_SSCALED:
-			if(writeB) { *Pointer<SByte>(element) = SByte(Extract(c, 2)); }
-			if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
-			if(writeR) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 0)); }
-			break;
-		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_SINT:
-		case VK_FORMAT_R8G8B8A8_SSCALED:
-		case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
-			if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
-		case VK_FORMAT_R8G8B8_SINT:
-		case VK_FORMAT_R8G8B8_SSCALED:
-			if(writeB) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 2)); }
-		case VK_FORMAT_R8G8_SINT:
-		case VK_FORMAT_R8G8_SSCALED:
-			if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
-		case VK_FORMAT_R8_SINT:
-		case VK_FORMAT_R8_SSCALED:
-			if(writeR) { *Pointer<SByte>(element) = SByte(Extract(c, 0)); }
-			break;
-		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
-		case VK_FORMAT_A2B10G10R10_SINT_PACK32:
-		case VK_FORMAT_A2B10G10R10_USCALED_PACK32:
-		case VK_FORMAT_A2B10G10R10_SSCALED_PACK32:
-			if(writeRGBA)
-			{
-				*Pointer<UInt>(element) =
-					UInt((Extract(c, 0)) | (Extract(c, 1) << 10) | (Extract(c, 2) << 20) | (Extract(c, 3) << 30));
-			}
-			else
-			{
-				unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
-				                    (writeB ? 0x3FF00000 : 0x0000) |
-				                    (writeG ? 0x000FFC00 : 0x0000) |
-				                    (writeR ? 0x000003FF : 0x0000);
-				unsigned int unmask = ~mask;
-				*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
-					(UInt(Extract(c, 0) | (Extract(c, 1) << 10) | (Extract(c, 2) << 20) | (Extract(c, 3) << 30)) & UInt(mask));
-			}
-			break;
-		case VK_FORMAT_A2R10G10B10_UINT_PACK32:
-		case VK_FORMAT_A2R10G10B10_SINT_PACK32:
-		case VK_FORMAT_A2R10G10B10_USCALED_PACK32:
-		case VK_FORMAT_A2R10G10B10_SSCALED_PACK32:
-			if(writeRGBA)
-			{
-				*Pointer<UInt>(element) =
-					UInt((Extract(c, 2)) | (Extract(c, 1) << 10) | (Extract(c, 0) << 20) | (Extract(c, 3) << 30));
-			}
-			else
-			{
-				unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
-				                    (writeR ? 0x3FF00000 : 0x0000) |
-				                    (writeG ? 0x000FFC00 : 0x0000) |
-				                    (writeB ? 0x000003FF : 0x0000);
-				unsigned int unmask = ~mask;
-				*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
-					(UInt(Extract(c, 2) | (Extract(c, 1) << 10) | (Extract(c, 0) << 20) | (Extract(c, 3) << 30)) & UInt(mask));
-			}
-			break;
-		case VK_FORMAT_B8G8R8A8_UINT:
-		case VK_FORMAT_B8G8R8A8_USCALED:
-			if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
-		case VK_FORMAT_B8G8R8_UINT:
-		case VK_FORMAT_B8G8R8_USCALED:
-		case VK_FORMAT_B8G8R8_SRGB:
-			if(writeB) { *Pointer<Byte>(element) = Byte(Extract(c, 2)); }
-			if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
-			if(writeR) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 0)); }
-			break;
-		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_UINT:
-		case VK_FORMAT_R8G8B8A8_USCALED:
-		case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
-			if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
-		case VK_FORMAT_R8G8B8_UINT:
-		case VK_FORMAT_R8G8B8_USCALED:
-			if(writeB) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 2)); }
-		case VK_FORMAT_R8G8_UINT:
-		case VK_FORMAT_R8G8_USCALED:
-			if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
-		case VK_FORMAT_R8_UINT:
-		case VK_FORMAT_R8_USCALED:
-		case VK_FORMAT_S8_UINT:
-			if(writeR) { *Pointer<Byte>(element) = Byte(Extract(c, 0)); }
-			break;
-		case VK_FORMAT_R16G16B16A16_SINT:
-		case VK_FORMAT_R16G16B16A16_SSCALED:
-			if(writeA) { *Pointer<Short>(element + 6) = Short(Extract(c, 3)); }
-		case VK_FORMAT_R16G16B16_SINT:
-		case VK_FORMAT_R16G16B16_SSCALED:
-			if(writeB) { *Pointer<Short>(element + 4) = Short(Extract(c, 2)); }
-		case VK_FORMAT_R16G16_SINT:
-		case VK_FORMAT_R16G16_SSCALED:
-			if(writeG) { *Pointer<Short>(element + 2) = Short(Extract(c, 1)); }
-		case VK_FORMAT_R16_SINT:
-		case VK_FORMAT_R16_SSCALED:
-			if(writeR) { *Pointer<Short>(element) = Short(Extract(c, 0)); }
-			break;
-		case VK_FORMAT_R16G16B16A16_UINT:
-		case VK_FORMAT_R16G16B16A16_USCALED:
-			if(writeA) { *Pointer<UShort>(element + 6) = UShort(Extract(c, 3)); }
-		case VK_FORMAT_R16G16B16_UINT:
-		case VK_FORMAT_R16G16B16_USCALED:
-			if(writeB) { *Pointer<UShort>(element + 4) = UShort(Extract(c, 2)); }
-		case VK_FORMAT_R16G16_UINT:
-		case VK_FORMAT_R16G16_USCALED:
-			if(writeG) { *Pointer<UShort>(element + 2) = UShort(Extract(c, 1)); }
-		case VK_FORMAT_R16_UINT:
-		case VK_FORMAT_R16_USCALED:
-			if(writeR) { *Pointer<UShort>(element) = UShort(Extract(c, 0)); }
-			break;
-		case VK_FORMAT_R32G32B32A32_SINT:
-			if(writeRGBA)
-			{
-				*Pointer<Int4>(element) = c;
-			}
-			else
-			{
-				if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
-				if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
-				if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
-				if(writeA) { *Pointer<Int>(element + 12) = Extract(c, 3); }
-			}
-			break;
-		case VK_FORMAT_R32G32B32_SINT:
-			if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
-			if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
-			if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
-			break;
-		case VK_FORMAT_R32G32_SINT:
-			if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
-			if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
-			break;
-		case VK_FORMAT_R32_SINT:
-			if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
-			break;
-		case VK_FORMAT_R32G32B32A32_UINT:
-			if(writeRGBA)
-			{
-				*Pointer<UInt4>(element) = As<UInt4>(c);
-			}
-			else
-			{
-				if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
-				if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
-				if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
-				if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(Extract(c, 3)); }
-			}
-			break;
-		case VK_FORMAT_R32G32B32_UINT:
-			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
-		case VK_FORMAT_R32G32_UINT:
-			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
-		case VK_FORMAT_R32_UINT:
-			if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
-			break;
-		default:
-			UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
-		}
-	}
-
-	void Blitter::ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled)
-	{
-		float4 scale{}, unscale{};
-
-		if(state.clearOperation &&
-		   state.sourceFormat.isNonNormalizedInteger() &&
-		   !state.destFormat.isNonNormalizedInteger())
-		{
-			// If we're clearing a buffer from an int or uint color into a normalized color,
-			// then the whole range of the int or uint color must be scaled between 0 and 1.
-			switch(state.sourceFormat)
-			{
-			case VK_FORMAT_R32G32B32A32_SINT:
-				unscale = replicate(static_cast<float>(0x7FFFFFFF));
-				break;
-			case VK_FORMAT_R32G32B32A32_UINT:
-				unscale = replicate(static_cast<float>(0xFFFFFFFF));
-				break;
-			default:
-				UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
+				data.dest = dest->getTexelPointer({0, 0, static_cast<int32_t>(depth)}, subresLayers);
+				blitRoutine(&data);
 			}
 		}
 		else
 		{
-			unscale = state.sourceFormat.getScale();
-		}
-
-		scale = state.destFormat.getScale();
-
-		bool srcSRGB = state.sourceFormat.isSRGBformat();
-		bool dstSRGB = state.destFormat.isSRGBformat();
-
-		if(state.allowSRGBConversion && ((srcSRGB && !preScaled) || dstSRGB))   // One of the formats is sRGB encoded.
-		{
-			value *= preScaled ? Float4(1.0f / scale.x, 1.0f / scale.y, 1.0f / scale.z, 1.0f / scale.w) : // Unapply scale
-			                     Float4(1.0f / unscale.x, 1.0f / unscale.y, 1.0f / unscale.z, 1.0f / unscale.w); // Apply unscale
-			value = (srcSRGB && !preScaled) ? sRGBtoLinear(value) : LinearToSRGB(value);
-			value *= Float4(scale.x, scale.y, scale.z, scale.w); // Apply scale
-		}
-		else if(unscale != scale)
-		{
-			value *= Float4(scale.x / unscale.x, scale.y / unscale.y, scale.z / unscale.z, scale.w / unscale.w);
-		}
-
-		if(state.sourceFormat.isFloatFormat() && !state.destFormat.isFloatFormat())
-		{
-			value = Min(value, Float4(scale.x, scale.y, scale.z, scale.w));
-
-			value = Max(value, Float4(state.destFormat.isUnsignedComponent(0) ? 0.0f : -scale.x,
-			                          state.destFormat.isUnsignedComponent(1) ? 0.0f : -scale.y,
-			                          state.destFormat.isUnsignedComponent(2) ? 0.0f : -scale.z,
-			                          state.destFormat.isUnsignedComponent(3) ? 0.0f : -scale.w));
-		}
-	}
-
-	Int Blitter::ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes)
-	{
-		return y * pitchB + x * bytes;
-	}
-
-	Float4 Blitter::LinearToSRGB(Float4 &c)
-	{
-		Float4 lc = Min(c, Float4(0.0031308f)) * Float4(12.92f);
-		Float4 ec = Float4(1.055f) * power(c, Float4(1.0f / 2.4f)) - Float4(0.055f);
-
-		Float4 s = c;
-		s.xyz = Max(lc, ec);
-
-		return s;
-	}
-
-	Float4 Blitter::sRGBtoLinear(Float4 &c)
-	{
-		Float4 lc = c * Float4(1.0f / 12.92f);
-		Float4 ec = power((c + Float4(0.055f)) * Float4(1.0f / 1.055f), Float4(2.4f));
-
-		Int4 linear = CmpLT(c, Float4(0.04045f));
-
-		Float4 s = c;
-		s.xyz = As<Float4>((linear & As<Int4>(lc)) | (~linear & As<Int4>(ec)));   // TODO: IfThenElse()
-
-		return s;
-	}
-
-	Blitter::BlitRoutineType Blitter::generate(const State &state)
-	{
-		BlitFunction function;
-		{
-			Pointer<Byte> blit(function.Arg<0>());
-
-			Pointer<Byte> source = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData,source));
-			Pointer<Byte> dest = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData,dest));
-			Int sPitchB = *Pointer<Int>(blit + OFFSET(BlitData,sPitchB));
-			Int dPitchB = *Pointer<Int>(blit + OFFSET(BlitData,dPitchB));
-
-			Float x0 = *Pointer<Float>(blit + OFFSET(BlitData,x0));
-			Float y0 = *Pointer<Float>(blit + OFFSET(BlitData,y0));
-			Float w = *Pointer<Float>(blit + OFFSET(BlitData,w));
-			Float h = *Pointer<Float>(blit + OFFSET(BlitData,h));
-
-			Int x0d = *Pointer<Int>(blit + OFFSET(BlitData,x0d));
-			Int x1d = *Pointer<Int>(blit + OFFSET(BlitData,x1d));
-			Int y0d = *Pointer<Int>(blit + OFFSET(BlitData,y0d));
-			Int y1d = *Pointer<Int>(blit + OFFSET(BlitData,y1d));
-
-			Int sWidth = *Pointer<Int>(blit + OFFSET(BlitData,sWidth));
-			Int sHeight = *Pointer<Int>(blit + OFFSET(BlitData,sHeight));
-
-			bool intSrc = state.sourceFormat.isNonNormalizedInteger();
-			bool intDst = state.destFormat.isNonNormalizedInteger();
-			bool intBoth = intSrc && intDst;
-			int srcBytes = state.sourceFormat.bytes();
-			int dstBytes = state.destFormat.bytes();
-
-			bool hasConstantColorI = false;
-			Int4 constantColorI;
-			bool hasConstantColorF = false;
-			Float4 constantColorF;
-			if(state.clearOperation)
+			for(subresLayers.baseArrayLayer = subresourceRange.baseArrayLayer; subresLayers.baseArrayLayer <= lastLayer; subresLayers.baseArrayLayer++)
 			{
-				if(intBoth) // Integer types
+				for(uint32_t depth = 0; depth < extent.depth; depth++)
 				{
-					constantColorI = readInt4(source, state);
-					hasConstantColorI = true;
+					data.dest = dest->getTexelPointer({ 0, 0, static_cast<int32_t>(depth) }, subresLayers);
+
+					blitRoutine(&data);
+				}
+			}
+		}
+	}
+}
+
+bool Blitter::fastClear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format& viewFormat, const VkImageSubresourceRange& subresourceRange, const VkRect2D* renderArea)
+{
+	if(format != VK_FORMAT_R32G32B32A32_SFLOAT)
+	{
+		return false;
+	}
+
+	float *color = (float*)pixel;
+	float r = color[0];
+	float g = color[1];
+	float b = color[2];
+	float a = color[3];
+
+	uint32_t packed;
+
+	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
+	switch(viewFormat)
+	{
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		packed = ((uint16_t)(31 * b + 0.5f) << 0) |
+		         ((uint16_t)(63 * g + 0.5f) << 5) |
+		         ((uint16_t)(31 * r + 0.5f) << 11);
+		break;
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+		packed = ((uint16_t)(31 * r + 0.5f) << 0) |
+		         ((uint16_t)(63 * g + 0.5f) << 5) |
+		         ((uint16_t)(31 * b + 0.5f) << 11);
+		break;
+	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+	case VK_FORMAT_R8G8B8A8_UNORM:
+		packed = ((uint32_t)(255 * a + 0.5f) << 24) |
+		         ((uint32_t)(255 * b + 0.5f) << 16) |
+		         ((uint32_t)(255 * g + 0.5f) << 8) |
+		         ((uint32_t)(255 * r + 0.5f) << 0);
+		break;
+	case VK_FORMAT_B8G8R8A8_UNORM:
+		packed = ((uint32_t)(255 * a + 0.5f) << 24) |
+		         ((uint32_t)(255 * r + 0.5f) << 16) |
+		         ((uint32_t)(255 * g + 0.5f) << 8) |
+		         ((uint32_t)(255 * b + 0.5f) << 0);
+		break;
+	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+		packed = R11G11B10F(color);
+		break;
+	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
+		packed = RGB9E5(color);
+		break;
+	default:
+		return false;
+	}
+
+	VkImageSubresourceLayers subresLayers =
+	{
+		subresourceRange.aspectMask,
+		subresourceRange.baseMipLevel,
+		subresourceRange.baseArrayLayer,
+		1
+	};
+	uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
+	uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);
+
+	VkRect2D area = { { 0, 0 }, { 0, 0 } };
+	if(renderArea)
+	{
+		ASSERT(subresourceRange.levelCount == 1);
+		area = *renderArea;
+	}
+
+	for(; subresLayers.mipLevel <= lastMipLevel; subresLayers.mipLevel++)
+	{
+		int rowPitchBytes = dest->rowPitchBytes(aspect, subresLayers.mipLevel);
+		int slicePitchBytes = dest->slicePitchBytes(aspect, subresLayers.mipLevel);
+		VkExtent3D extent = dest->getMipLevelExtent(aspect, subresLayers.mipLevel);
+		if(!renderArea)
+		{
+			area.extent.width = extent.width;
+			area.extent.height = extent.height;
+		}
+		if(dest->is3DSlice())
+		{
+			extent.depth = 1; // The 3D image is instead interpreted as a 2D image with layers
+		}
+
+		for(subresLayers.baseArrayLayer = subresourceRange.baseArrayLayer; subresLayers.baseArrayLayer <= lastLayer; subresLayers.baseArrayLayer++)
+		{
+			for(uint32_t depth = 0; depth < extent.depth; depth++)
+			{
+				uint8_t *slice = (uint8_t*)dest->getTexelPointer(
+					{ area.offset.x, area.offset.y, static_cast<int32_t>(depth) }, subresLayers);
+
+				for(int j = 0; j < dest->getSampleCountFlagBits(); j++)
+				{
+					uint8_t *d = slice;
+
+					switch(viewFormat.bytes())
+					{
+					case 2:
+						for(uint32_t i = 0; i < area.extent.height; i++)
+						{
+							ASSERT(d < dest->end());
+							sw::clear((uint16_t*)d, static_cast<uint16_t>(packed), area.extent.width);
+							d += rowPitchBytes;
+						}
+						break;
+					case 4:
+						for(uint32_t i = 0; i < area.extent.height; i++)
+						{
+							ASSERT(d < dest->end());
+							sw::clear((uint32_t*)d, packed, area.extent.width);
+							d += rowPitchBytes;
+						}
+						break;
+					default:
+						assert(false);
+					}
+
+					slice += slicePitchBytes;
+				}
+			}
+		}
+	}
+
+	return true;
+}
+
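fastClear packs the clear color into the destination's bit layout once, then replicates the 16- or 32-bit pattern across each row with sw::clear. A scalar sketch of the R5G6B5 packing (our illustration; the helper name is ours, not part of the patch):

```cpp
#include <cstdint>

// Scalar sketch of fastClear's one-time R5G6B5 packing; the resulting
// 16-bit pattern is then replicated across whole rows.
uint16_t packR5G6B5(float r, float g, float b)
{
	return (uint16_t)(((uint16_t)(31 * b + 0.5f) << 0) |
	                  ((uint16_t)(63 * g + 0.5f) << 5) |
	                  ((uint16_t)(31 * r + 0.5f) << 11));
}
```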
+Float4 Blitter::readFloat4(Pointer<Byte> element, const State &state)
+{
+	Float4 c(0.0f, 0.0f, 0.0f, 1.0f);
+
+	switch(state.sourceFormat)
+	{
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		c.w = Float(Int(*Pointer<Byte>(element)) & Int(0xF));
+		c.x = Float((Int(*Pointer<Byte>(element)) >> 4) & Int(0xF));
+		c.y = Float(Int(*Pointer<Byte>(element + 1)) & Int(0xF));
+		c.z = Float((Int(*Pointer<Byte>(element + 1)) >> 4) & Int(0xF));
+		break;
+	case VK_FORMAT_R8_SINT:
+	case VK_FORMAT_R8_SNORM:
+		c.x = Float(Int(*Pointer<SByte>(element)));
+		c.w = float(0x7F);
+		break;
+	case VK_FORMAT_R8_UNORM:
+	case VK_FORMAT_R8_UINT:
+	case VK_FORMAT_R8_SRGB:
+		c.x = Float(Int(*Pointer<Byte>(element)));
+		c.w = float(0xFF);
+		break;
+	case VK_FORMAT_R16_SINT:
+	case VK_FORMAT_R16_SNORM:
+		c.x = Float(Int(*Pointer<Short>(element)));
+		c.w = float(0x7FFF);
+		break;
+	case VK_FORMAT_R16_UNORM:
+	case VK_FORMAT_R16_UINT:
+		c.x = Float(Int(*Pointer<UShort>(element)));
+		c.w = float(0xFFFF);
+		break;
+	case VK_FORMAT_R32_SINT:
+		c.x = Float(*Pointer<Int>(element));
+		c.w = float(0x7FFFFFFF);
+		break;
+	case VK_FORMAT_R32_UINT:
+		c.x = Float(*Pointer<UInt>(element));
+		c.w = float(0xFFFFFFFF);
+		break;
+	case VK_FORMAT_B8G8R8A8_SRGB:
+	case VK_FORMAT_B8G8R8A8_UNORM:
+		c = Float4(*Pointer<Byte4>(element)).zyxw;
+		break;
+	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
+	case VK_FORMAT_R8G8B8A8_SINT:
+	case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
+	case VK_FORMAT_R8G8B8A8_SNORM:
+		c = Float4(*Pointer<SByte4>(element));
+		break;
+	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+	case VK_FORMAT_R8G8B8A8_UNORM:
+	case VK_FORMAT_R8G8B8A8_UINT:
+	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+	case VK_FORMAT_R8G8B8A8_SRGB:
+		c = Float4(*Pointer<Byte4>(element));
+		break;
+	case VK_FORMAT_R16G16B16A16_SINT:
+		c = Float4(*Pointer<Short4>(element));
+		break;
+	case VK_FORMAT_R16G16B16A16_UNORM:
+	case VK_FORMAT_R16G16B16A16_UINT:
+		c = Float4(*Pointer<UShort4>(element));
+		break;
+	case VK_FORMAT_R32G32B32A32_SINT:
+		c = Float4(*Pointer<Int4>(element));
+		break;
+	case VK_FORMAT_R32G32B32A32_UINT:
+		c = Float4(*Pointer<UInt4>(element));
+		break;
+	case VK_FORMAT_R8G8_SINT:
+	case VK_FORMAT_R8G8_SNORM:
+		c.x = Float(Int(*Pointer<SByte>(element + 0)));
+		c.y = Float(Int(*Pointer<SByte>(element + 1)));
+		c.w = float(0x7F);
+		break;
+	case VK_FORMAT_R8G8_UNORM:
+	case VK_FORMAT_R8G8_UINT:
+	case VK_FORMAT_R8G8_SRGB:
+		c.x = Float(Int(*Pointer<Byte>(element + 0)));
+		c.y = Float(Int(*Pointer<Byte>(element + 1)));
+		c.w = float(0xFF);
+		break;
+	case VK_FORMAT_R16G16_SINT:
+	case VK_FORMAT_R16G16_SNORM:
+		c.x = Float(Int(*Pointer<Short>(element + 0)));
+		c.y = Float(Int(*Pointer<Short>(element + 2)));
+		c.w = float(0x7FFF);
+		break;
+	case VK_FORMAT_R16G16_UNORM:
+	case VK_FORMAT_R16G16_UINT:
+		c.x = Float(Int(*Pointer<UShort>(element + 0)));
+		c.y = Float(Int(*Pointer<UShort>(element + 2)));
+		c.w = float(0xFFFF);
+		break;
+	case VK_FORMAT_R32G32_SINT:
+		c.x = Float(*Pointer<Int>(element + 0));
+		c.y = Float(*Pointer<Int>(element + 4));
+		c.w = float(0x7FFFFFFF);
+		break;
+	case VK_FORMAT_R32G32_UINT:
+		c.x = Float(*Pointer<UInt>(element + 0));
+		c.y = Float(*Pointer<UInt>(element + 4));
+		c.w = float(0xFFFFFFFF);
+		break;
+	case VK_FORMAT_R32G32B32A32_SFLOAT:
+		c = *Pointer<Float4>(element);
+		break;
+	case VK_FORMAT_R32G32_SFLOAT:
+		c.x = *Pointer<Float>(element + 0);
+		c.y = *Pointer<Float>(element + 4);
+		break;
+	case VK_FORMAT_R32_SFLOAT:
+		c.x = *Pointer<Float>(element);
+		break;
+	case VK_FORMAT_R16G16B16A16_SFLOAT:
+		c.w = Float(*Pointer<Half>(element + 6));
+	case VK_FORMAT_R16G16B16_SFLOAT:
+		c.z = Float(*Pointer<Half>(element + 4));
+	case VK_FORMAT_R16G16_SFLOAT:
+		c.y = Float(*Pointer<Half>(element + 2));
+	case VK_FORMAT_R16_SFLOAT:
+		c.x = Float(*Pointer<Half>(element));
+		break;
+	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+		// 10 (or 11) bit float formats are unsigned formats with a 5 bit exponent and a 5 (or 6) bit mantissa.
+		// Since the Half float format also has a 5 bit exponent, we can convert these formats to half by
+	// copy/pasting the bits so that the exponent bits and top mantissa bits are aligned to the half format.
+		// In this case, we have:
+		//              B B B B B B B B B B G G G G G G G G G G G R R R R R R R R R R R
+		// 1st Short:                                  |xxxxxxxxxx---------------------|
+		// 2nd Short:                  |xxxx---------------------xxxxxx|
+		// 3rd Short: |--------------------xxxxxxxxxxxx|
+		// These memory reads overlap, but each of them contains an entire channel, so we can read this without
+		// any int -> short conversion.
+		c.x = Float(As<Half>((*Pointer<UShort>(element + 0) & UShort(0x07FF)) << UShort(4)));
+		c.y = Float(As<Half>((*Pointer<UShort>(element + 1) & UShort(0x3FF8)) << UShort(1)));
+		c.z = Float(As<Half>((*Pointer<UShort>(element + 2) & UShort(0xFFC0)) >> UShort(1)));
+		break;
+	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
+	// This type contains a common 5 bit exponent (E) and a 9 bit mantissa for each of R, G and B.
+		c.x = Float(*Pointer<UInt>(element) & UInt(0x000001FF));         // R's mantissa (bits 0-8)
+		c.y = Float((*Pointer<UInt>(element) & UInt(0x0003FE00)) >> 9);  // G's mantissa (bits 9-17)
+		c.z = Float((*Pointer<UInt>(element) & UInt(0x07FC0000)) >> 18); // B's mantissa (bits 18-26)
+		c *= Float4(
+			// 2^E, using the exponent (bits 27-31) and treating it as an unsigned integer value
+			Float(UInt(1) << ((*Pointer<UInt>(element) & UInt(0xF8000000)) >> 27)) *
+			// Since the 9 bit mantissa values currently stored in RGB were converted straight
+			// from int to float (in the [0, 1<<9] range instead of the [0, 1] range), they
+			// are (1 << 9) times too high.
+			// Also, the exponent has 5 bits and we compute the exponent bias of floating point
+			// formats using "2^(k-1) - 1", so, in this case, the exponent bias is 2^(5-1)-1 = 15
+			// Exponent bias (15) + number of mantissa bits per component (9) = 24
+			Float(1.0f / (1 << 24)));
+		c.w = 1.0f;
+		break;
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
+		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
+		c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
+		break;
+	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+		c.w = Float(Int((*Pointer<UShort>(element) & UShort(0x8000)) >> UShort(15)));
+		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x7C00)) >> UShort(10)));
+		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x03E0)) >> UShort(5)));
+		c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
+		break;
+	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
+		c.x = Float(Int((*Pointer<UInt>(element) & UInt(0x000003FF))));
+		c.y = Float(Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10));
+		c.z = Float(Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20));
+		c.w = Float(Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30));
+		break;
+	case VK_FORMAT_D16_UNORM:
+		c.x = Float(Int((*Pointer<UShort>(element))));
+		break;
+	case VK_FORMAT_X8_D24_UNORM_PACK32:
+		c.x = Float(Int((*Pointer<UInt>(element) & UInt(0xFFFFFF00)) >> 8));
+		break;
+	case VK_FORMAT_D32_SFLOAT:
+		c.x = *Pointer<Float>(element);
+		break;
+	case VK_FORMAT_S8_UINT:
+		c.x = Float(Int(*Pointer<Byte>(element)));
+		break;
+	default:
+		UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
+	}
+
+	return c;
+}
+
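The B10G11R11 case above widens each packed channel to half precision by bit alignment alone. A scalar sketch for the red channel (our illustration; the helper name is ours):

```cpp
#include <cstdint>

// An 11-bit unsigned float (5-bit exponent, 6-bit mantissa) shares its
// exponent width with half precision (5-bit exponent, 10-bit mantissa),
// so shifting the bit pattern left by 4 lines up exponent and mantissa
// with the half layout; the sign bit stays 0.
uint16_t r11BitsToHalfBits(uint32_t packed)
{
	uint16_t r11 = (uint16_t)(packed & 0x7FF);  // red occupies bits 0-10
	return (uint16_t)(r11 << 4);                // aligned to the half format
}
```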
+void Blitter::write(Float4 &c, Pointer<Byte> element, const State &state)
+{
+	bool writeR = state.writeRed;
+	bool writeG = state.writeGreen;
+	bool writeB = state.writeBlue;
+	bool writeA = state.writeAlpha;
+	bool writeRGBA = writeR && writeG && writeB && writeA;
+
+	switch(state.destFormat)
+	{
+	case VK_FORMAT_R4G4_UNORM_PACK8:
+		if(writeR | writeG)
+		{
+			if(!writeR)
+			{
+				*Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
+			                              (*Pointer<Byte>(element) & Byte(0xF0));
+			}
+			else if(!writeG)
+			{
+				*Pointer<Byte>(element) = (*Pointer<Byte>(element) & Byte(0xF)) |
+			                              (Byte(RoundInt(Float(c.x))) << Byte(4));
+			}
+			else
+			{
+				*Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
+			                              (Byte(RoundInt(Float(c.x))) << Byte(4));
+			}
+		}
+		break;
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+		if(writeR || writeG || writeB || writeA)
+		{
+			*Pointer<UShort>(element) = (writeR ? ((UShort(RoundInt(Float(c.x))) & UShort(0xF)) << UShort(12)) :
+			                                      (*Pointer<UShort>(element) & UShort(0x000F))) |
+			                            (writeG ? ((UShort(RoundInt(Float(c.y))) & UShort(0xF)) << UShort(8)) :
+			                                      (*Pointer<UShort>(element) & UShort(0x00F0))) |
+			                            (writeB ? ((UShort(RoundInt(Float(c.z))) & UShort(0xF)) << UShort(4)) :
+		                                          (*Pointer<UShort>(element) & UShort(0x0F00))) |
+		                                (writeA ? (UShort(RoundInt(Float(c.w))) & UShort(0xF)) :
+		                                          (*Pointer<UShort>(element) & UShort(0xF000)));
+		}
+		break;
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		if(writeRGBA)
+		{
+			*Pointer<UShort>(element) = UShort(RoundInt(Float(c.w)) & Int(0xF)) |
+			                            UShort((RoundInt(Float(c.x)) & Int(0xF)) << 4) |
+			                            UShort((RoundInt(Float(c.y)) & Int(0xF)) << 8) |
+			                            UShort((RoundInt(Float(c.z)) & Int(0xF)) << 12);
+		}
+		else
+		{
+			unsigned short mask = (writeA ? 0x000F : 0x0000) |
+			                      (writeR ? 0x00F0 : 0x0000) |
+			                      (writeG ? 0x0F00 : 0x0000) |
+			                      (writeB ? 0xF000 : 0x0000);
+			unsigned short unmask = ~mask;
+			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
+			                            ((UShort(RoundInt(Float(c.w)) & Int(0xF)) |
+			                              UShort((RoundInt(Float(c.x)) & Int(0xF)) << 4) |
+			                              UShort((RoundInt(Float(c.y)) & Int(0xF)) << 8) |
+			                              UShort((RoundInt(Float(c.z)) & Int(0xF)) << 12)) & UShort(mask));
+		}
+		break;
+	case VK_FORMAT_B8G8R8A8_SRGB:
+	case VK_FORMAT_B8G8R8A8_UNORM:
+		if(writeRGBA)
+		{
+			Short4 c0 = RoundShort4(c.zyxw);
+			*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
+		}
+		else
+		{
+			if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
+			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+			if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
+			if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
+		}
+		break;
+	case VK_FORMAT_B8G8R8_SNORM:
+		if(writeB) { *Pointer<SByte>(element + 0) = SByte(RoundInt(Float(c.z))); }
+		if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
+		if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
+		break;
+	case VK_FORMAT_B8G8R8_UNORM:
+	case VK_FORMAT_B8G8R8_SRGB:
+		if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
+		if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+		if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
+		break;
+	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+	case VK_FORMAT_R8G8B8A8_UNORM:
+	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+	case VK_FORMAT_R8G8B8A8_SRGB:
+	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+	case VK_FORMAT_R8G8B8A8_UINT:
+	case VK_FORMAT_R8G8B8A8_USCALED:
+	case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
+		if(writeRGBA)
+		{
+			Short4 c0 = RoundShort4(c);
+			*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
+		}
+		else
+		{
+			if(writeR) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.x))); }
+			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+			if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
+			if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
+		}
+		break;
+	case VK_FORMAT_R32G32B32A32_SFLOAT:
+		if(writeRGBA)
+		{
+			*Pointer<Float4>(element) = c;
+		}
+		else
+		{
+			if(writeR) { *Pointer<Float>(element) = c.x; }
+			if(writeG) { *Pointer<Float>(element + 4) = c.y; }
+			if(writeB) { *Pointer<Float>(element + 8) = c.z; }
+			if(writeA) { *Pointer<Float>(element + 12) = c.w; }
+		}
+		break;
+	case VK_FORMAT_R32G32B32_SFLOAT:
+		if(writeR) { *Pointer<Float>(element) = c.x; }
+		if(writeG) { *Pointer<Float>(element + 4) = c.y; }
+		if(writeB) { *Pointer<Float>(element + 8) = c.z; }
+		break;
+	case VK_FORMAT_R32G32_SFLOAT:
+		if(writeR && writeG)
+		{
+			*Pointer<Float2>(element) = Float2(c);
+		}
+		else
+		{
+			if(writeR) { *Pointer<Float>(element) = c.x; }
+			if(writeG) { *Pointer<Float>(element + 4) = c.y; }
+		}
+		break;
+	case VK_FORMAT_R32_SFLOAT:
+		if(writeR) { *Pointer<Float>(element) = c.x; }
+		break;
+	case VK_FORMAT_R16G16B16A16_SFLOAT:
+		if(writeA) { *Pointer<Half>(element + 6) = Half(c.w); }
+	case VK_FORMAT_R16G16B16_SFLOAT:
+		if(writeB) { *Pointer<Half>(element + 4) = Half(c.z); }
+	case VK_FORMAT_R16G16_SFLOAT:
+		if(writeG) { *Pointer<Half>(element + 2) = Half(c.y); }
+	case VK_FORMAT_R16_SFLOAT:
+		if(writeR) { *Pointer<Half>(element) = Half(c.x); }
+		break;
+	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+		{
+			// 10 (or 11) bit float formats are unsigned formats with a 5 bit exponent and a 5 (or 6) bit mantissa.
+			// Since the 16-bit half-precision float format also has a 5 bit exponent, we can extract these minifloats from half-precision values.
+
+			// FIXME(b/138944025): Handle negative values, Inf, and NaN.
+			// FIXME(b/138944025): Perform rounding before truncating the mantissa.
+			UInt r = (UInt(As<UShort>(Half(c.x))) & 0x00007FF0) >> 4;
+			UInt g = (UInt(As<UShort>(Half(c.y))) & 0x00007FF0) << 7;
+			UInt b = (UInt(As<UShort>(Half(c.z))) & 0x00007FE0) << 17;
+
+			UInt rgb = r | g | b;
+
+			UInt old = *Pointer<UInt>(element);
+
+			unsigned int mask = (writeR ? 0x000007FF : 0) |
+			                    (writeG ? 0x003FF800 : 0) |
+			                    (writeB ? 0xFFC00000 : 0);
+
+			*Pointer<UInt>(element) = (rgb & mask) | (old & ~mask);
+		}
+		break;
+	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
+		{
+			ASSERT(writeRGBA);  // Can't sensibly write just part of this format.
+
+			// Vulkan 1.1.117 section 15.2.1 RGB to Shared Exponent Conversion
+
+			constexpr int N = 9;       // number of mantissa bits per component
+			constexpr int B = 15;      // exponent bias
+			constexpr int E_max = 31;  // maximum possible biased exponent value
+
+			// Maximum representable value.
+			constexpr float sharedexp_max = ((static_cast<float>(1 << N) - 1) / static_cast<float>(1 << N)) * static_cast<float>(1 << (E_max - B));
+
+			// Clamp components to valid range. NaN becomes 0.
+			Float red_c =   Min(IfThenElse(!(c.x > 0), Float(0), Float(c.x)), sharedexp_max);
+			Float green_c = Min(IfThenElse(!(c.y > 0), Float(0), Float(c.y)), sharedexp_max);
+			Float blue_c =  Min(IfThenElse(!(c.z > 0), Float(0), Float(c.z)), sharedexp_max);
+
+			// We're reducing the mantissa to 9 bits, so we must round up if the next
+			// bit is 1. In other words add 0.5 to the new mantissa's position and
+			// allow overflow into the exponent so we can scale correctly.
+			constexpr int half = 1 << (23 - N);
+			Float red_r = As<Float>(As<Int>(red_c) + half);
+			Float green_r = As<Float>(As<Int>(green_c) + half);
+			Float blue_r = As<Float>(As<Int>(blue_c) + half);
+
+			// The largest component determines the shared exponent. It can't be lower
+			// than 0 (after bias subtraction) so also limit to the minimum representable.
+			constexpr float min_s = 0.5f / (1 << B);
+			Float max_s = Max(Max(red_r, green_r), Max(blue_r, min_s));
+
+			// Obtain the reciprocal of the shared exponent by inverting the bits,
+			// and scale by the new mantissa's size. Note that the IEEE-754 single-precision
+			// format has an implicit leading 1, but this shared component format does not.
+			Float scale = As<Float>((As<Int>(max_s) & 0x7F800000) ^ 0x7F800000) * (1 << (N - 2));
+
+			UInt R9 = RoundInt(red_c * scale);
+			UInt G9 = UInt(RoundInt(green_c * scale));
+			UInt B9 = UInt(RoundInt(blue_c * scale));
+			UInt E5 = (As<UInt>(max_s) >> 23) - 127 + 15 + 1;
+
+			UInt E5B9G9R9 = (E5 << 27) | (B9 << 18) | (G9 << 9) | R9;
+
+			*Pointer<UInt>(element) = E5B9G9R9;
+		}
+		break;
+	case VK_FORMAT_B8G8R8A8_SNORM:
+		if(writeB) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.z))); }
+		if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
+		if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
+		if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
+		break;
+	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
+	case VK_FORMAT_R8G8B8A8_SINT:
+	case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
+	case VK_FORMAT_R8G8B8A8_SNORM:
+	case VK_FORMAT_R8G8B8A8_SSCALED:
+	case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
+		if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
+	case VK_FORMAT_R8G8B8_SINT:
+	case VK_FORMAT_R8G8B8_SNORM:
+	case VK_FORMAT_R8G8B8_SSCALED:
+		if(writeB) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.z))); }
+	case VK_FORMAT_R8G8_SINT:
+	case VK_FORMAT_R8G8_SNORM:
+	case VK_FORMAT_R8G8_SSCALED:
+		if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
+	case VK_FORMAT_R8_SINT:
+	case VK_FORMAT_R8_SNORM:
+	case VK_FORMAT_R8_SSCALED:
+		if(writeR) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.x))); }
+		break;
+	case VK_FORMAT_R8G8B8_UINT:
+	case VK_FORMAT_R8G8B8_UNORM:
+	case VK_FORMAT_R8G8B8_USCALED:
+	case VK_FORMAT_R8G8B8_SRGB:
+		if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
+	case VK_FORMAT_R8G8_UINT:
+	case VK_FORMAT_R8G8_UNORM:
+	case VK_FORMAT_R8G8_USCALED:
+	case VK_FORMAT_R8G8_SRGB:
+		if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+	case VK_FORMAT_R8_UINT:
+	case VK_FORMAT_R8_UNORM:
+	case VK_FORMAT_R8_USCALED:
+	case VK_FORMAT_R8_SRGB:
+		if(writeR) { *Pointer<Byte>(element) = Byte(RoundInt(Float(c.x))); }
+		break;
+	case VK_FORMAT_R16G16B16A16_SINT:
+	case VK_FORMAT_R16G16B16A16_SNORM:
+	case VK_FORMAT_R16G16B16A16_SSCALED:
+		if(writeRGBA)
+		{
+			*Pointer<Short4>(element) = Short4(RoundInt(c));
+		}
+		else
+		{
+			if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
+			if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
+			if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
+			if(writeA) { *Pointer<Short>(element + 6) = Short(RoundInt(Float(c.w))); }
+		}
+		break;
+	case VK_FORMAT_R16G16B16_SINT:
+	case VK_FORMAT_R16G16B16_SNORM:
+	case VK_FORMAT_R16G16B16_SSCALED:
+		if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
+		if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
+		if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
+		break;
+	case VK_FORMAT_R16G16_SINT:
+	case VK_FORMAT_R16G16_SNORM:
+	case VK_FORMAT_R16G16_SSCALED:
+		if(writeR && writeG)
+		{
+			*Pointer<Short2>(element) = Short2(Short4(RoundInt(c)));
+		}
+		else
+		{
+			if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
+			if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
+		}
+		break;
+	case VK_FORMAT_R16_SINT:
+	case VK_FORMAT_R16_SNORM:
+	case VK_FORMAT_R16_SSCALED:
+		if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
+		break;
+	case VK_FORMAT_R16G16B16A16_UINT:
+	case VK_FORMAT_R16G16B16A16_UNORM:
+	case VK_FORMAT_R16G16B16A16_USCALED:
+		if(writeRGBA)
+		{
+			*Pointer<UShort4>(element) = UShort4(RoundInt(c));
+		}
+		else
+		{
+			if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
+			if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
+			if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
+			if(writeA) { *Pointer<UShort>(element + 6) = UShort(RoundInt(Float(c.w))); }
+		}
+		break;
+	case VK_FORMAT_R16G16B16_UINT:
+	case VK_FORMAT_R16G16B16_UNORM:
+	case VK_FORMAT_R16G16B16_USCALED:
+		if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
+		if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
+		if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
+		break;
+	case VK_FORMAT_R16G16_UINT:
+	case VK_FORMAT_R16G16_UNORM:
+	case VK_FORMAT_R16G16_USCALED:
+		if(writeR && writeG)
+		{
+			*Pointer<UShort2>(element) = UShort2(UShort4(RoundInt(c)));
+		}
+		else
+		{
+			if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
+			if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
+		}
+		break;
+	case VK_FORMAT_R16_UINT:
+	case VK_FORMAT_R16_UNORM:
+	case VK_FORMAT_R16_USCALED:
+		if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
+		break;
+	case VK_FORMAT_R32G32B32A32_SINT:
+		if(writeRGBA)
+		{
+			*Pointer<Int4>(element) = RoundInt(c);
+		}
+		else
+		{
+			if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
+			if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
+			if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
+			if(writeA) { *Pointer<Int>(element + 12) = RoundInt(Float(c.w)); }
+		}
+		break;
+	case VK_FORMAT_R32G32B32_SINT:
+		if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
+	case VK_FORMAT_R32G32_SINT:
+		if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
+	case VK_FORMAT_R32_SINT:
+		if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
+		break;
+	case VK_FORMAT_R32G32B32A32_UINT:
+		if(writeRGBA)
+		{
+			*Pointer<UInt4>(element) = UInt4(RoundInt(c));
+		}
+		else
+		{
+			if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
+			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
+			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
+			if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(RoundInt(Float(c.w))); }
+		}
+		break;
+	case VK_FORMAT_R32G32B32_UINT:
+		if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
+	case VK_FORMAT_R32G32_UINT:
+		if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
+	case VK_FORMAT_R32_UINT:
+		if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
+		break;
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		if(writeR && writeG && writeB)
+		{
+			*Pointer<UShort>(element) = UShort(RoundInt(Float(c.z)) |
+			                                  (RoundInt(Float(c.y)) << Int(5)) |
+			                                  (RoundInt(Float(c.x)) << Int(11)));
+		}
+		else
+		{
+			unsigned short mask = (writeB ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeR ? 0xF800 : 0x0000);
+			unsigned short unmask = ~mask;
+			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
+			                            (UShort(RoundInt(Float(c.z)) |
+			                                   (RoundInt(Float(c.y)) << Int(5)) |
+			                                   (RoundInt(Float(c.x)) << Int(11))) & UShort(mask));
+		}
+		break;
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+		if(writeRGBA)
+		{
+			*Pointer<UShort>(element) = UShort(RoundInt(Float(c.w)) |
+			                                  (RoundInt(Float(c.z)) << Int(1)) |
+			                                  (RoundInt(Float(c.y)) << Int(6)) |
+			                                  (RoundInt(Float(c.x)) << Int(11)));
+		}
+		else
+		{
+			unsigned short mask = (writeA ? 0x8000 : 0x0000) |
+			                      (writeR ? 0x7C00 : 0x0000) |
+			                      (writeG ? 0x03E0 : 0x0000) |
+			                      (writeB ? 0x001F : 0x0000);
+			unsigned short unmask = ~mask;
+			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
+			                            (UShort(RoundInt(Float(c.w)) |
+			                                   (RoundInt(Float(c.z)) << Int(1)) |
+			                                   (RoundInt(Float(c.y)) << Int(6)) |
+			                                   (RoundInt(Float(c.x)) << Int(11))) & UShort(mask));
+		}
+		break;
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
+		if(writeRGBA)
+		{
+			*Pointer<UShort>(element) = UShort(RoundInt(Float(c.w)) |
+			                                  (RoundInt(Float(c.x)) << Int(1)) |
+			                                  (RoundInt(Float(c.y)) << Int(6)) |
+			                                  (RoundInt(Float(c.z)) << Int(11)));
+		}
+		else
+		{
+			unsigned short mask = (writeA ? 0x8000 : 0x0000) |
+			                      (writeR ? 0x7C00 : 0x0000) |
+			                      (writeG ? 0x03E0 : 0x0000) |
+			                      (writeB ? 0x001F : 0x0000);
+			unsigned short unmask = ~mask;
+			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
+			                            (UShort(RoundInt(Float(c.w)) |
+			                                   (RoundInt(Float(c.x)) << Int(1)) |
+			                                   (RoundInt(Float(c.y)) << Int(6)) |
+			                                   (RoundInt(Float(c.z)) << Int(11))) & UShort(mask));
+		}
+		break;
+	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+		if(writeRGBA)
+		{
+			*Pointer<UShort>(element) = UShort(RoundInt(Float(c.z)) |
+			                                  (RoundInt(Float(c.y)) << Int(5)) |
+			                                  (RoundInt(Float(c.x)) << Int(10)) |
+			                                  (RoundInt(Float(c.w)) << Int(15)));
+		}
+		else
+		{
+			unsigned short mask = (writeA ? 0x8000 : 0x0000) |
+			                      (writeR ? 0x7C00 : 0x0000) |
+			                      (writeG ? 0x03E0 : 0x0000) |
+			                      (writeB ? 0x001F : 0x0000);
+			unsigned short unmask = ~mask;
+			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
+			                            (UShort(RoundInt(Float(c.z)) |
+			                                   (RoundInt(Float(c.y)) << Int(5)) |
+			                                   (RoundInt(Float(c.x)) << Int(10)) |
+			                                   (RoundInt(Float(c.w)) << Int(15))) & UShort(mask));
+		}
+		break;
+	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
+	case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
+		if(writeRGBA)
+		{
+			*Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) |
+			                              (RoundInt(Float(c.y)) << 10) |
+			                              (RoundInt(Float(c.z)) << 20) |
+			                              (RoundInt(Float(c.w)) << 30));
+		}
+		else
+		{
+			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
+			                    (writeB ? 0x3FF00000 : 0x0000) |
+			                    (writeG ? 0x000FFC00 : 0x0000) |
+			                    (writeR ? 0x000003FF : 0x0000);
+			unsigned int unmask = ~mask;
+			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
+			                            (UInt(RoundInt(Float(c.x)) |
+			                                 (RoundInt(Float(c.y)) << 10) |
+			                                 (RoundInt(Float(c.z)) << 20) |
+			                                 (RoundInt(Float(c.w)) << 30)) & UInt(mask));
+		}
+		break;
+	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
+	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
+	case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
+		if(writeRGBA)
+		{
+			*Pointer<UInt>(element) = UInt(RoundInt(Float(c.z)) |
+			                              (RoundInt(Float(c.y)) << 10) |
+			                              (RoundInt(Float(c.x)) << 20) |
+			                              (RoundInt(Float(c.w)) << 30));
+		}
+		else
+		{
+			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
+			                    (writeR ? 0x3FF00000 : 0x0000) |
+			                    (writeG ? 0x000FFC00 : 0x0000) |
+			                    (writeB ? 0x000003FF : 0x0000);
+			unsigned int unmask = ~mask;
+			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
+			                            (UInt(RoundInt(Float(c.z)) |
+			                                 (RoundInt(Float(c.y)) << 10) |
+			                                 (RoundInt(Float(c.x)) << 20) |
+			                                 (RoundInt(Float(c.w)) << 30)) & UInt(mask));
+		}
+		break;
+	case VK_FORMAT_D16_UNORM:
+		*Pointer<UShort>(element) = UShort(RoundInt(Float(c.x)));
+		break;
+	case VK_FORMAT_X8_D24_UNORM_PACK32:
+		*Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) << 8);
+		break;
+	case VK_FORMAT_D32_SFLOAT:
+		*Pointer<Float>(element) = c.x;
+		break;
+	case VK_FORMAT_S8_UINT:
+		*Pointer<Byte>(element) = Byte(RoundInt(Float(c.x)));
+		break;
+	default:
+		UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
+		break;
+	}
+}
+
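The E5B9G9R9 case above follows the spec's conversion but with bit tricks in place of transcendentals. A scalar reference sketch of the same Vulkan 1.1.117 section 15.2.1 algorithm (our illustration, assuming round-to-nearest via floor(x + 0.5)):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar shared-exponent packing; the JIT version above reaches the same
// result by manipulating the float bit patterns directly.
uint32_t packRGB9E5(float r, float g, float b)
{
	constexpr int N = 9, B = 15, E_max = 31;
	static constexpr float sharedexp_max =
		((1 << N) - 1) / (float)(1 << N) * (float)(1 << (E_max - B));

	auto clamp_c = [](float c) {  // NaN > 0 is false, so NaN becomes 0
		return std::min((c > 0.0f) ? c : 0.0f, sharedexp_max);
	};
	float rc = clamp_c(r), gc = clamp_c(g), bc = clamp_c(b);
	float max_c = std::max(std::max(rc, gc), bc);
	if(max_c == 0.0f) return 0;

	// Preliminary shared exponent, bumped when the mantissa would overflow.
	int exp_shared = std::max(-B - 1, (int)std::floor(std::log2(max_c))) + 1 + B;
	float scale = std::exp2((float)(N - (exp_shared - B)));
	if((int)std::floor(max_c * scale + 0.5f) == (1 << N))
	{
		exp_shared++;
		scale *= 0.5f;
	}

	uint32_t R9 = (uint32_t)std::floor(rc * scale + 0.5f);
	uint32_t G9 = (uint32_t)std::floor(gc * scale + 0.5f);
	uint32_t B9 = (uint32_t)std::floor(bc * scale + 0.5f);
	return ((uint32_t)exp_shared << 27) | (B9 << 18) | (G9 << 9) | R9;
}
```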
+Int4 Blitter::readInt4(Pointer<Byte> element, const State &state)
+{
+	Int4 c(0, 0, 0, 1);
+
+	switch(state.sourceFormat)
+	{
+	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
+	case VK_FORMAT_R8G8B8A8_SINT:
+		c = Insert(c, Int(*Pointer<SByte>(element + 3)), 3);
+		c = Insert(c, Int(*Pointer<SByte>(element + 2)), 2);
+	case VK_FORMAT_R8G8_SINT:
+		c = Insert(c, Int(*Pointer<SByte>(element + 1)), 1);
+	case VK_FORMAT_R8_SINT:
+		c = Insert(c, Int(*Pointer<SByte>(element)), 0);
+		break;
+	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
+		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000003FF))), 0);
+		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10), 1);
+		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20), 2);
+		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30), 3);
+		break;
+	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+	case VK_FORMAT_R8G8B8A8_UINT:
+		c = Insert(c, Int(*Pointer<Byte>(element + 3)), 3);
+		c = Insert(c, Int(*Pointer<Byte>(element + 2)), 2);
+	case VK_FORMAT_R8G8_UINT:
+		c = Insert(c, Int(*Pointer<Byte>(element + 1)), 1);
+	case VK_FORMAT_R8_UINT:
+	case VK_FORMAT_S8_UINT:
+		c = Insert(c, Int(*Pointer<Byte>(element)), 0);
+		break;
+	case VK_FORMAT_R16G16B16A16_SINT:
+		c = Insert(c, Int(*Pointer<Short>(element + 6)), 3);
+		c = Insert(c, Int(*Pointer<Short>(element + 4)), 2);
+	case VK_FORMAT_R16G16_SINT:
+		c = Insert(c, Int(*Pointer<Short>(element + 2)), 1);
+	case VK_FORMAT_R16_SINT:
+		c = Insert(c, Int(*Pointer<Short>(element)), 0);
+		break;
+	case VK_FORMAT_R16G16B16A16_UINT:
+		c = Insert(c, Int(*Pointer<UShort>(element + 6)), 3);
+		c = Insert(c, Int(*Pointer<UShort>(element + 4)), 2);
+	case VK_FORMAT_R16G16_UINT:
+		c = Insert(c, Int(*Pointer<UShort>(element + 2)), 1);
+	case VK_FORMAT_R16_UINT:
+		c = Insert(c, Int(*Pointer<UShort>(element)), 0);
+		break;
+	case VK_FORMAT_R32G32B32A32_SINT:
+	case VK_FORMAT_R32G32B32A32_UINT:
+		c = *Pointer<Int4>(element);
+		break;
+	case VK_FORMAT_R32G32_SINT:
+	case VK_FORMAT_R32G32_UINT:
+		c = Insert(c, *Pointer<Int>(element + 4), 1);
+	case VK_FORMAT_R32_SINT:
+	case VK_FORMAT_R32_UINT:
+		c = Insert(c, *Pointer<Int>(element), 0);
+		break;
+	default:
+		UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
+	}
+
+	return c;
+}
+
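readInt4 (like the write functions) leans on deliberate switch fall-through: each case reads the widest format's extra components and then falls into the narrower cases below. A scalar sketch of the idea for the 1/2/4-component 8-bit UINT family (our illustration; the helper name is ours):

```cpp
#include <cstdint>

// Widest-first fall-through, mirroring the R8G8B8A8/R8G8/R8 UINT cases above.
void readUint8Components(const uint8_t *element, int components, int32_t c[4])
{
	switch(components)
	{
	case 4:
		c[3] = element[3];
		c[2] = element[2];
		// fall through
	case 2:
		c[1] = element[1];
		// fall through
	case 1:
		c[0] = element[0];
		break;
	}
}
```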
+void Blitter::write(Int4 &c, Pointer<Byte> element, const State &state)
+{
+	bool writeR = state.writeRed;
+	bool writeG = state.writeGreen;
+	bool writeB = state.writeBlue;
+	bool writeA = state.writeAlpha;
+	bool writeRGBA = writeR && writeG && writeB && writeA;
+
+	switch(state.destFormat)
+	{
+	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
+		c = Min(As<UInt4>(c), UInt4(0x03FF, 0x03FF, 0x03FF, 0x0003));
+		break;
+	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+	case VK_FORMAT_R8G8B8A8_UINT:
+	case VK_FORMAT_R8G8B8_UINT:
+	case VK_FORMAT_R8G8_UINT:
+	case VK_FORMAT_R8_UINT:
+	case VK_FORMAT_R8G8B8A8_USCALED:
+	case VK_FORMAT_R8G8B8_USCALED:
+	case VK_FORMAT_R8G8_USCALED:
+	case VK_FORMAT_R8_USCALED:
+	case VK_FORMAT_S8_UINT:
+		c = Min(As<UInt4>(c), UInt4(0xFF));
+		break;
+	case VK_FORMAT_R16G16B16A16_UINT:
+	case VK_FORMAT_R16G16B16_UINT:
+	case VK_FORMAT_R16G16_UINT:
+	case VK_FORMAT_R16_UINT:
+	case VK_FORMAT_R16G16B16A16_USCALED:
+	case VK_FORMAT_R16G16B16_USCALED:
+	case VK_FORMAT_R16G16_USCALED:
+	case VK_FORMAT_R16_USCALED:
+		c = Min(As<UInt4>(c), UInt4(0xFFFF));
+		break;
+	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
+	case VK_FORMAT_R8G8B8A8_SINT:
+	case VK_FORMAT_R8G8_SINT:
+	case VK_FORMAT_R8_SINT:
+	case VK_FORMAT_R8G8B8A8_SSCALED:
+	case VK_FORMAT_R8G8B8_SSCALED:
+	case VK_FORMAT_R8G8_SSCALED:
+	case VK_FORMAT_R8_SSCALED:
+		c = Min(Max(c, Int4(-0x80)), Int4(0x7F));
+		break;
+	case VK_FORMAT_R16G16B16A16_SINT:
+	case VK_FORMAT_R16G16B16_SINT:
+	case VK_FORMAT_R16G16_SINT:
+	case VK_FORMAT_R16_SINT:
+	case VK_FORMAT_R16G16B16A16_SSCALED:
+	case VK_FORMAT_R16G16B16_SSCALED:
+	case VK_FORMAT_R16G16_SSCALED:
+	case VK_FORMAT_R16_SSCALED:
+		c = Min(Max(c, Int4(-0x8000)), Int4(0x7FFF));
+		break;
+	default:
+		break;
+	}
+
+	switch(state.destFormat)
+	{
+	case VK_FORMAT_B8G8R8A8_SINT:
+	case VK_FORMAT_B8G8R8A8_SSCALED:
+		if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
+	case VK_FORMAT_B8G8R8_SINT:
+	case VK_FORMAT_B8G8R8_SSCALED:
+		if(writeB) { *Pointer<SByte>(element) = SByte(Extract(c, 2)); }
+		if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
+		if(writeR) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 0)); }
+		break;
+	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
+	case VK_FORMAT_R8G8B8A8_SINT:
+	case VK_FORMAT_R8G8B8A8_SSCALED:
+	case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
+		if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
+	case VK_FORMAT_R8G8B8_SINT:
+	case VK_FORMAT_R8G8B8_SSCALED:
+		if(writeB) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 2)); }
+	case VK_FORMAT_R8G8_SINT:
+	case VK_FORMAT_R8G8_SSCALED:
+		if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
+	case VK_FORMAT_R8_SINT:
+	case VK_FORMAT_R8_SSCALED:
+		if(writeR) { *Pointer<SByte>(element) = SByte(Extract(c, 0)); }
+		break;
+	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
+	case VK_FORMAT_A2B10G10R10_SINT_PACK32:
+	case VK_FORMAT_A2B10G10R10_USCALED_PACK32:
+	case VK_FORMAT_A2B10G10R10_SSCALED_PACK32:
+		if(writeRGBA)
+		{
+			*Pointer<UInt>(element) =
+				UInt((Extract(c, 0)) | (Extract(c, 1) << 10) | (Extract(c, 2) << 20) | (Extract(c, 3) << 30));
+		}
+		else
+		{
+			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
+			                    (writeB ? 0x3FF00000 : 0x0000) |
+			                    (writeG ? 0x000FFC00 : 0x0000) |
+			                    (writeR ? 0x000003FF : 0x0000);
+			unsigned int unmask = ~mask;
+			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
+				(UInt(Extract(c, 0) | (Extract(c, 1) << 10) | (Extract(c, 2) << 20) | (Extract(c, 3) << 30)) & UInt(mask));
+		}
+		break;
+	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
+	case VK_FORMAT_A2R10G10B10_SINT_PACK32:
+	case VK_FORMAT_A2R10G10B10_USCALED_PACK32:
+	case VK_FORMAT_A2R10G10B10_SSCALED_PACK32:
+		if(writeRGBA)
+		{
+			*Pointer<UInt>(element) =
+				UInt((Extract(c, 2)) | (Extract(c, 1) << 10) | (Extract(c, 0) << 20) | (Extract(c, 3) << 30));
+		}
+		else
+		{
+			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
+			                    (writeR ? 0x3FF00000 : 0x0000) |
+			                    (writeG ? 0x000FFC00 : 0x0000) |
+			                    (writeB ? 0x000003FF : 0x0000);
+			unsigned int unmask = ~mask;
+			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
+				(UInt(Extract(c, 2) | (Extract(c, 1) << 10) | (Extract(c, 0) << 20) | (Extract(c, 3) << 30)) & UInt(mask));
+		}
+		break;
+	case VK_FORMAT_B8G8R8A8_UINT:
+	case VK_FORMAT_B8G8R8A8_USCALED:
+		if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
+	case VK_FORMAT_B8G8R8_UINT:
+	case VK_FORMAT_B8G8R8_USCALED:
+	case VK_FORMAT_B8G8R8_SRGB:
+		if(writeB) { *Pointer<Byte>(element) = Byte(Extract(c, 2)); }
+		if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
+		if(writeR) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 0)); }
+		break;
+	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+	case VK_FORMAT_R8G8B8A8_UINT:
+	case VK_FORMAT_R8G8B8A8_USCALED:
+	case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
+		if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
+	case VK_FORMAT_R8G8B8_UINT:
+	case VK_FORMAT_R8G8B8_USCALED:
+		if(writeB) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 2)); }
+	case VK_FORMAT_R8G8_UINT:
+	case VK_FORMAT_R8G8_USCALED:
+		if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
+	case VK_FORMAT_R8_UINT:
+	case VK_FORMAT_R8_USCALED:
+	case VK_FORMAT_S8_UINT:
+		if(writeR) { *Pointer<Byte>(element) = Byte(Extract(c, 0)); }
+		break;
+	case VK_FORMAT_R16G16B16A16_SINT:
+	case VK_FORMAT_R16G16B16A16_SSCALED:
+		if(writeA) { *Pointer<Short>(element + 6) = Short(Extract(c, 3)); }
+	case VK_FORMAT_R16G16B16_SINT:
+	case VK_FORMAT_R16G16B16_SSCALED:
+		if(writeB) { *Pointer<Short>(element + 4) = Short(Extract(c, 2)); }
+	case VK_FORMAT_R16G16_SINT:
+	case VK_FORMAT_R16G16_SSCALED:
+		if(writeG) { *Pointer<Short>(element + 2) = Short(Extract(c, 1)); }
+	case VK_FORMAT_R16_SINT:
+	case VK_FORMAT_R16_SSCALED:
+		if(writeR) { *Pointer<Short>(element) = Short(Extract(c, 0)); }
+		break;
+	case VK_FORMAT_R16G16B16A16_UINT:
+	case VK_FORMAT_R16G16B16A16_USCALED:
+		if(writeA) { *Pointer<UShort>(element + 6) = UShort(Extract(c, 3)); }
+	case VK_FORMAT_R16G16B16_UINT:
+	case VK_FORMAT_R16G16B16_USCALED:
+		if(writeB) { *Pointer<UShort>(element + 4) = UShort(Extract(c, 2)); }
+	case VK_FORMAT_R16G16_UINT:
+	case VK_FORMAT_R16G16_USCALED:
+		if(writeG) { *Pointer<UShort>(element + 2) = UShort(Extract(c, 1)); }
+	case VK_FORMAT_R16_UINT:
+	case VK_FORMAT_R16_USCALED:
+		if(writeR) { *Pointer<UShort>(element) = UShort(Extract(c, 0)); }
+		break;
+	case VK_FORMAT_R32G32B32A32_SINT:
+		if(writeRGBA)
+		{
+			*Pointer<Int4>(element) = c;
+		}
+		else
+		{
+			if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
+			if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
+			if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
+			if(writeA) { *Pointer<Int>(element + 12) = Extract(c, 3); }
+		}
+		break;
+	case VK_FORMAT_R32G32B32_SINT:
+		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
+		if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
+		if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
+		break;
+	case VK_FORMAT_R32G32_SINT:
+		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
+		if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
+		break;
+	case VK_FORMAT_R32_SINT:
+		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
+		break;
+	case VK_FORMAT_R32G32B32A32_UINT:
+		if(writeRGBA)
+		{
+			*Pointer<UInt4>(element) = As<UInt4>(c);
+		}
+		else
+		{
+			if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
+			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
+			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
+			if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(Extract(c, 3)); }
+		}
+		break;
+	case VK_FORMAT_R32G32B32_UINT:
+		if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
+	case VK_FORMAT_R32G32_UINT:
+		if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
+	case VK_FORMAT_R32_UINT:
+		if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
+		break;
+	default:
+		UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
+	}
+}
+
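Before the stores above run, write(Int4 &c, ...) saturates the color to the destination's component range in its first switch. Scalar equivalents of two of those clamps (our illustration):

```cpp
#include <algorithm>
#include <cstdint>

// Signed saturation before a narrowing store, e.g. for R8_SINT.
int8_t saturateToSint8(int32_t v)
{
	return (int8_t)std::min(std::max(v, (int32_t)-0x80), (int32_t)0x7F);
}

// Unsigned saturation before a narrowing store, e.g. for R8_UINT.
uint8_t saturateToUint8(uint32_t v)
{
	return (uint8_t)std::min(v, (uint32_t)0xFF);
}
```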
+void Blitter::ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled)
+{
+	float4 scale{}, unscale{};
+
+	if(state.clearOperation &&
+	   state.sourceFormat.isNonNormalizedInteger() &&
+	   !state.destFormat.isNonNormalizedInteger())
+	{
+		// If we're clearing with an int or uint color into a normalized color buffer,
+		// then the whole range of the int or uint color must be scaled between 0 and 1.
+		switch(state.sourceFormat)
+		{
+		case VK_FORMAT_R32G32B32A32_SINT:
+			unscale = replicate(static_cast<float>(0x7FFFFFFF));
+			break;
+		case VK_FORMAT_R32G32B32A32_UINT:
+			unscale = replicate(static_cast<float>(0xFFFFFFFF));
+			break;
+		default:
+			UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
+		}
+	}
+	else
+	{
+		unscale = state.sourceFormat.getScale();
+	}
+
+	scale = state.destFormat.getScale();
+
+	bool srcSRGB = state.sourceFormat.isSRGBformat();
+	bool dstSRGB = state.destFormat.isSRGBformat();
+
+	if(state.allowSRGBConversion && ((srcSRGB && !preScaled) || dstSRGB))   // One of the formats is sRGB encoded.
+	{
+		value *= preScaled ? Float4(1.0f / scale.x, 1.0f / scale.y, 1.0f / scale.z, 1.0f / scale.w) : // Unapply scale
+		                     Float4(1.0f / unscale.x, 1.0f / unscale.y, 1.0f / unscale.z, 1.0f / unscale.w); // Apply unscale
+		value = (srcSRGB && !preScaled) ? sRGBtoLinear(value) : LinearToSRGB(value);
+		value *= Float4(scale.x, scale.y, scale.z, scale.w); // Apply scale
+	}
+	else if(unscale != scale)
+	{
+		value *= Float4(scale.x / unscale.x, scale.y / unscale.y, scale.z / unscale.z, scale.w / unscale.w);
+	}
+
+	if(state.sourceFormat.isFloatFormat() && !state.destFormat.isFloatFormat())
+	{
+		value = Min(value, Float4(scale.x, scale.y, scale.z, scale.w));
+
+		value = Max(value, Float4(state.destFormat.isUnsignedComponent(0) ? 0.0f : -scale.x,
+		                          state.destFormat.isUnsignedComponent(1) ? 0.0f : -scale.y,
+		                          state.destFormat.isUnsignedComponent(2) ? 0.0f : -scale.z,
+		                          state.destFormat.isUnsignedComponent(3) ? 0.0f : -scale.w));
+	}
+}
+
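In the common non-sRGB case, ApplyScaleAndClamp reduces to a single multiply by the ratio of destination to source scale. A per-component sketch (our illustration; 255 and 65535 are the standard UNORM8/UNORM16 maxima):

```cpp
// Remap a value carrying the source format's scale onto the destination's,
// e.g. reading R8_UNORM (scale 255) and writing R16_UNORM (scale 65535).
float rescaleComponent(float v, float unscale, float scale)
{
	return (unscale != scale) ? v * (scale / unscale) : v;
}
```

For example, rescaleComponent(128.0f, 255.0f, 65535.0f) yields 128 * 257 = 32896.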
+Int Blitter::ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes)
+{
+	return y * pitchB + x * bytes;
+}
+
+Float4 Blitter::LinearToSRGB(Float4 &c)
+{
+	Float4 lc = Min(c, Float4(0.0031308f)) * Float4(12.92f);
+	Float4 ec = Float4(1.055f) * power(c, Float4(1.0f / 2.4f)) - Float4(0.055f);
+
+	Float4 s = c;
+	s.xyz = Max(lc, ec);
+
+	return s;
+}
+
+Float4 Blitter::sRGBtoLinear(Float4 &c)
+{
+	Float4 lc = c * Float4(1.0f / 12.92f);
+	Float4 ec = power((c + Float4(0.055f)) * Float4(1.0f / 1.055f), Float4(2.4f));
+
+	Int4 linear = CmpLT(c, Float4(0.04045f));
+
+	Float4 s = c;
+	s.xyz = As<Float4>((linear & As<Int4>(lc)) | (~linear & As<Int4>(ec)));   // TODO: IfThenElse()
+
+	return s;
+}
+
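For reference, the scalar piecewise sRGB transfer functions these vector routines implement (our illustration; LinearToSRGB above takes Max(lc, ec) instead of testing the 0.0031308 threshold explicitly, which traces the same curve):

```cpp
#include <cmath>

// Piecewise sRGB encode, applied to RGB only; alpha stays linear.
float linearToSRGB(float c)
{
	return (c <= 0.0031308f) ? 12.92f * c
	                         : 1.055f * std::pow(c, 1.0f / 2.4f) - 0.055f;
}

// Piecewise sRGB decode, the inverse of the above.
float sRGBToLinear(float c)
{
	return (c < 0.04045f) ? c * (1.0f / 12.92f)
	                      : std::pow((c + 0.055f) * (1.0f / 1.055f), 2.4f);
}
```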
+Blitter::BlitRoutineType Blitter::generate(const State &state)
+{
+	BlitFunction function;
+	{
+		Pointer<Byte> blit(function.Arg<0>());
+
+		Pointer<Byte> source = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData,source));
+		Pointer<Byte> dest = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData,dest));
+		Int sPitchB = *Pointer<Int>(blit + OFFSET(BlitData,sPitchB));
+		Int dPitchB = *Pointer<Int>(blit + OFFSET(BlitData,dPitchB));
+
+		Float x0 = *Pointer<Float>(blit + OFFSET(BlitData,x0));
+		Float y0 = *Pointer<Float>(blit + OFFSET(BlitData,y0));
+		Float w = *Pointer<Float>(blit + OFFSET(BlitData,w));
+		Float h = *Pointer<Float>(blit + OFFSET(BlitData,h));
+
+		Int x0d = *Pointer<Int>(blit + OFFSET(BlitData,x0d));
+		Int x1d = *Pointer<Int>(blit + OFFSET(BlitData,x1d));
+		Int y0d = *Pointer<Int>(blit + OFFSET(BlitData,y0d));
+		Int y1d = *Pointer<Int>(blit + OFFSET(BlitData,y1d));
+
+		Int sWidth = *Pointer<Int>(blit + OFFSET(BlitData,sWidth));
+		Int sHeight = *Pointer<Int>(blit + OFFSET(BlitData,sHeight));
+
+		bool intSrc = state.sourceFormat.isNonNormalizedInteger();
+		bool intDst = state.destFormat.isNonNormalizedInteger();
+		bool intBoth = intSrc && intDst;
+		int srcBytes = state.sourceFormat.bytes();
+		int dstBytes = state.destFormat.bytes();
+
+		bool hasConstantColorI = false;
+		Int4 constantColorI;
+		bool hasConstantColorF = false;
+		Float4 constantColorF;
+		if(state.clearOperation)
+		{
+			if(intBoth) // Integer types
+			{
+				constantColorI = readInt4(source, state);
+				hasConstantColorI = true;
+			}
+			else
+			{
+				constantColorF = readFloat4(source, state);
+				hasConstantColorF = true;
+
+				ApplyScaleAndClamp(constantColorF, state);
+			}
+		}
+
+		For(Int j = y0d, j < y1d, j++)
+		{
+			Float y = state.clearOperation ? RValue<Float>(y0) : y0 + Float(j) * h;
+			Pointer<Byte> destLine = dest + j * dPitchB;
+
+			For(Int i = x0d, i < x1d, i++)
+			{
+				Float x = state.clearOperation ? RValue<Float>(x0) : x0 + Float(i) * w;
+				Pointer<Byte> d = destLine + i * dstBytes;
+
+				if(hasConstantColorI)
+				{
+					for(int s = 0; s < state.destSamples; s++)
+					{
+						write(constantColorI, d, state);
+
+						d += *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
+					}
+				}
+				else if(hasConstantColorF)
+				{
+					for(int s = 0; s < state.destSamples; s++)
+					{
+						write(constantColorF, d, state);
+
+						d += *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
+					}
+				}
+				else if(intBoth) // Integer types do not support filtering
+				{
+					Int X = Int(x);
+					Int Y = Int(y);
+
+					if(state.clampToEdge)
+					{
+						X = Clamp(X, 0, sWidth - 1);
+						Y = Clamp(Y, 0, sHeight - 1);
+					}
+
+					Pointer<Byte> s = source + ComputeOffset(X, Y, sPitchB, srcBytes);
+
+					// When both formats are true integer types, we don't go to float to avoid losing precision
+					Int4 color = readInt4(s, state);
+					for(int s = 0; s < state.destSamples; s++)
+					{
+						write(color, d, state);
+
+						d += *Pointer<Int>(blit + OFFSET(BlitData,dSliceB));
+					}
 				}
 				else
 				{
-					constantColorF = readFloat4(source, state);
-					hasConstantColorF = true;
+					Float4 color;
 
-					ApplyScaleAndClamp(constantColorF, state);
-				}
-			}
-
-			For(Int j = y0d, j < y1d, j++)
-			{
-				Float y = state.clearOperation ? RValue<Float>(y0) : y0 + Float(j) * h;
-				Pointer<Byte> destLine = dest + j * dPitchB;
-
-				For(Int i = x0d, i < x1d, i++)
-				{
-					Float x = state.clearOperation ? RValue<Float>(x0) : x0 + Float(i) * w;
-					Pointer<Byte> d = destLine + i * dstBytes;
-
-					if(hasConstantColorI)
-					{
-						for(int s = 0; s < state.destSamples; s++)
-						{
-							write(constantColorI, d, state);
-
-							d += *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
-						}
-					}
-					else if(hasConstantColorF)
-					{
-						for(int s = 0; s < state.destSamples; s++)
-						{
-							write(constantColorF, d, state);
-
-							d += *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
-						}
-					}
-					else if(intBoth) // Integer types do not support filtering
+					bool preScaled = false;
+					if(!state.filter || intSrc)
 					{
 						Int X = Int(x);
 						Int Y = Int(y);
@@ -1465,611 +1492,585 @@
 
 						Pointer<Byte> s = source + ComputeOffset(X, Y, sPitchB, srcBytes);
 
-						// When both formats are true integer types, we don't go to float to avoid losing precision
-						Int4 color = readInt4(s, state);
-						for(int s = 0; s < state.destSamples; s++)
+						color = readFloat4(s, state);
+
+						if(state.srcSamples > 1) // Resolve multisampled source
 						{
-							write(color, d, state);
-
-							d += *Pointer<Int>(blit + OFFSET(BlitData,dSliceB));
-						}
-					}
-					else
-					{
-						Float4 color;
-
-						bool preScaled = false;
-						if(!state.filter || intSrc)
-						{
-							Int X = Int(x);
-							Int Y = Int(y);
-
-							if(state.clampToEdge)
+							if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat()) // sRGB -> RGB
 							{
-								X = Clamp(X, 0, sWidth - 1);
-								Y = Clamp(Y, 0, sHeight - 1);
+								ApplyScaleAndClamp(color, state);
+								preScaled = true;
 							}
-
-							Pointer<Byte> s = source + ComputeOffset(X, Y, sPitchB, srcBytes);
-
-							color = readFloat4(s, state);
-
-							if(state.srcSamples > 1) // Resolve multisampled source
+							Float4 accum = color;
+							for(int sample = 1; sample < state.srcSamples; sample++)
 							{
+								s += *Pointer<Int>(blit + OFFSET(BlitData, sSliceB));
+								color = readFloat4(s, state);
+
 								if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat()) // sRGB -> RGB
 								{
 									ApplyScaleAndClamp(color, state);
 									preScaled = true;
 								}
-								Float4 accum = color;
-								for(int sample = 1; sample < state.srcSamples; sample++)
-								{
-									s += *Pointer<Int>(blit + OFFSET(BlitData, sSliceB));
-									color = readFloat4(s, state);
-
-									if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat()) // sRGB -> RGB
-									{
-										ApplyScaleAndClamp(color, state);
-										preScaled = true;
-									}
-									accum += color;
-								}
-								color = accum * Float4(1.0f / static_cast<float>(state.srcSamples));
+								accum += color;
 							}
+							color = accum * Float4(1.0f / static_cast<float>(state.srcSamples));
 						}
-						else   // Bilinear filtering
+					}
+					else   // Bilinear filtering
+					{
+						Float X = x;
+						Float Y = y;
+
+						if(state.clampToEdge)
 						{
-							Float X = x;
-							Float Y = y;
-
-							if(state.clampToEdge)
-							{
-								X = Min(Max(x, 0.5f), Float(sWidth) - 0.5f);
-								Y = Min(Max(y, 0.5f), Float(sHeight) - 0.5f);
-							}
-
-							Float x0 = X - 0.5f;
-							Float y0 = Y - 0.5f;
-
-							Int X0 = Max(Int(x0), 0);
-							Int Y0 = Max(Int(y0), 0);
-
-							Int X1 = X0 + 1;
-							Int Y1 = Y0 + 1;
-							X1 = IfThenElse(X1 >= sWidth, X0, X1);
-							Y1 = IfThenElse(Y1 >= sHeight, Y0, Y1);
-
-							Pointer<Byte> s00 = source + ComputeOffset(X0, Y0, sPitchB, srcBytes);
-							Pointer<Byte> s01 = source + ComputeOffset(X1, Y0, sPitchB, srcBytes);
-							Pointer<Byte> s10 = source + ComputeOffset(X0, Y1, sPitchB, srcBytes);
-							Pointer<Byte> s11 = source + ComputeOffset(X1, Y1, sPitchB, srcBytes);
-
-							Float4 c00 = readFloat4(s00, state);
-							Float4 c01 = readFloat4(s01, state);
-							Float4 c10 = readFloat4(s10, state);
-							Float4 c11 = readFloat4(s11, state);
-
-							if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat()) // sRGB -> RGB
-							{
-								ApplyScaleAndClamp(c00, state);
-								ApplyScaleAndClamp(c01, state);
-								ApplyScaleAndClamp(c10, state);
-								ApplyScaleAndClamp(c11, state);
-								preScaled = true;
-							}
-
-							Float4 fx = Float4(x0 - Float(X0));
-							Float4 fy = Float4(y0 - Float(Y0));
-							Float4 ix = Float4(1.0f) - fx;
-							Float4 iy = Float4(1.0f) - fy;
-
-							color = (c00 * ix + c01 * fx) * iy +
-							        (c10 * ix + c11 * fx) * fy;
+							X = Min(Max(x, 0.5f), Float(sWidth) - 0.5f);
+							Y = Min(Max(y, 0.5f), Float(sHeight) - 0.5f);
 						}
 
-						ApplyScaleAndClamp(color, state, preScaled);
+						Float x0 = X - 0.5f;
+						Float y0 = Y - 0.5f;
 
-						for(int s = 0; s < state.destSamples; s++)
+						Int X0 = Max(Int(x0), 0);
+						Int Y0 = Max(Int(y0), 0);
+
+						Int X1 = X0 + 1;
+						Int Y1 = Y0 + 1;
+						X1 = IfThenElse(X1 >= sWidth, X0, X1);
+						Y1 = IfThenElse(Y1 >= sHeight, Y0, Y1);
+
+						Pointer<Byte> s00 = source + ComputeOffset(X0, Y0, sPitchB, srcBytes);
+						Pointer<Byte> s01 = source + ComputeOffset(X1, Y0, sPitchB, srcBytes);
+						Pointer<Byte> s10 = source + ComputeOffset(X0, Y1, sPitchB, srcBytes);
+						Pointer<Byte> s11 = source + ComputeOffset(X1, Y1, sPitchB, srcBytes);
+
+						Float4 c00 = readFloat4(s00, state);
+						Float4 c01 = readFloat4(s01, state);
+						Float4 c10 = readFloat4(s10, state);
+						Float4 c11 = readFloat4(s11, state);
+
+						if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat()) // sRGB -> RGB
 						{
-							write(color, d, state);
-
-							d += *Pointer<Int>(blit + OFFSET(BlitData,dSliceB));
+							ApplyScaleAndClamp(c00, state);
+							ApplyScaleAndClamp(c01, state);
+							ApplyScaleAndClamp(c10, state);
+							ApplyScaleAndClamp(c11, state);
+							preScaled = true;
 						}
+
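+						// fx/fy are the fractional offsets of the sample point from texel
+						// (X0, Y0); ix/iy are the complementary bilinear weights.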
+						Float4 fx = Float4(x0 - Float(X0));
+						Float4 fy = Float4(y0 - Float(Y0));
+						Float4 ix = Float4(1.0f) - fx;
+						Float4 iy = Float4(1.0f) - fy;
+
+						color = (c00 * ix + c01 * fx) * iy +
+						        (c10 * ix + c11 * fx) * fy;
+					}
+
+					ApplyScaleAndClamp(color, state, preScaled);
+
+					for(int s = 0; s < state.destSamples; s++)
+					{
+						write(color, d, state);
+
 						d += *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
 					}
 				}
 			}
 		}
-
-		return function("BlitRoutine");
 	}
 
-	Blitter::BlitRoutineType Blitter::getBlitRoutine(const State &state)
+	return function("BlitRoutine");
+}
+
+Blitter::BlitRoutineType Blitter::getBlitRoutine(const State &state)
+{
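+	// Blit routines are generated per State and memoized; blitMutex guards the cache.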
+	std::unique_lock<std::mutex> lock(blitMutex);
+	auto blitRoutine = blitCache.query(state);
+
+	if(!blitRoutine)
 	{
-		std::unique_lock<std::mutex> lock(blitMutex);
-		auto blitRoutine = blitCache.query(state);
-
-		if(!blitRoutine)
-		{
-			blitRoutine = generate(state);
-			blitCache.add(state, blitRoutine);
-		}
-
-		return blitRoutine;
+		blitRoutine = generate(state);
+		blitCache.add(state, blitRoutine);
 	}
 
-	Blitter::CornerUpdateRoutineType Blitter::getCornerUpdateRoutine(const State &state)
+	return blitRoutine;
+}
+
+Blitter::CornerUpdateRoutineType Blitter::getCornerUpdateRoutine(const State &state)
+{
+	std::unique_lock<std::mutex> lock(cornerUpdateMutex);
+	auto cornerUpdateRoutine = cornerUpdateCache.query(state);
+
+	if(!cornerUpdateRoutine)
 	{
-		std::unique_lock<std::mutex> lock(cornerUpdateMutex);
-		auto cornerUpdateRoutine = cornerUpdateCache.query(state);
-
-		if(!cornerUpdateRoutine)
-		{
-			cornerUpdateRoutine = generateCornerUpdate(state);
-			cornerUpdateCache.add(state, cornerUpdateRoutine);
-		}
-
-		return cornerUpdateRoutine;
+		cornerUpdateRoutine = generateCornerUpdate(state);
+		cornerUpdateCache.add(state, cornerUpdateRoutine);
 	}
 
-	void Blitter::blitToBuffer(const vk::Image *src, VkImageSubresourceLayers subresource, VkOffset3D offset, VkExtent3D extent, uint8_t *dst, int bufferRowPitch, int bufferSlicePitch)
+	return cornerUpdateRoutine;
+}
+
+void Blitter::blitToBuffer(const vk::Image *src, VkImageSubresourceLayers subresource, VkOffset3D offset, VkExtent3D extent, uint8_t *dst, int bufferRowPitch, int bufferSlicePitch)
+{
+	auto aspect = static_cast<VkImageAspectFlagBits>(subresource.aspectMask);
+	auto format = src->getFormat(aspect);
+	State state(format, format, VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_1_BIT, Options{false, false});
+
+	auto blitRoutine = getBlitRoutine(state);
+	if(!blitRoutine)
 	{
-		auto aspect = static_cast<VkImageAspectFlagBits>(subresource.aspectMask);
-		auto format = src->getFormat(aspect);
-		State state(format, format, VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_1_BIT, Options{false, false});
-
-		auto blitRoutine = getBlitRoutine(state);
-		if(!blitRoutine)
-		{
-			return;
-		}
-
-		BlitData data =
-		{
-			nullptr, // source
-			dst, // dest
-			src->rowPitchBytes(aspect, subresource.mipLevel),   // sPitchB
-			bufferRowPitch,   // dPitchB
-			src->slicePitchBytes(aspect, subresource.mipLevel), // sSliceB
-			bufferSlicePitch, // dSliceB
-
-			0, 0, 1, 1,
-
-			0, // y0d
-			static_cast<int>(extent.height), // y1d
-			0, // x0d
-			static_cast<int>(extent.width), // x1d
-
-			static_cast<int>(extent.width), // sWidth
-			static_cast<int>(extent.height) // sHeight;
-		};
-
-		VkOffset3D srcOffset = { 0, 0, offset.z };
-
-		VkImageSubresourceLayers srcSubresLayers = subresource;
-		srcSubresLayers.layerCount = 1;
-
-		VkImageSubresourceRange srcSubresRange =
-		{
-			subresource.aspectMask,
-			subresource.mipLevel,
-			1,
-			subresource.baseArrayLayer,
-			subresource.layerCount
-		};
-
-		uint32_t lastLayer = src->getLastLayerIndex(srcSubresRange);
-
-		for(; srcSubresLayers.baseArrayLayer <= lastLayer; srcSubresLayers.baseArrayLayer++)
-		{
-			srcOffset.z = offset.z;
-
-			for(auto i = 0u; i < extent.depth; i++)
-			{
-				data.source = src->getTexelPointer(srcOffset, srcSubresLayers);
-				ASSERT(data.source < src->end());
-				blitRoutine(&data);
-				srcOffset.z++;
-				data.dest = (dst += bufferSlicePitch);
-			}
-		}
+		return;
 	}
 
-	void Blitter::blitFromBuffer(const vk::Image *dst, VkImageSubresourceLayers subresource, VkOffset3D offset, VkExtent3D extent, uint8_t *src, int bufferRowPitch, int bufferSlicePitch)
+	BlitData data =
 	{
-		auto aspect = static_cast<VkImageAspectFlagBits>(subresource.aspectMask);
-		auto format = dst->getFormat(aspect);
-		State state(format, format, VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_1_BIT, Options{false, false});
+		nullptr, // source
+		dst, // dest
+		src->rowPitchBytes(aspect, subresource.mipLevel),   // sPitchB
+		bufferRowPitch,   // dPitchB
+		src->slicePitchBytes(aspect, subresource.mipLevel), // sSliceB
+		bufferSlicePitch, // dSliceB
 
-		auto blitRoutine = getBlitRoutine(state);
-		if(!blitRoutine)
-		{
-			return;
-		}
+		0, 0, 1, 1, // x0, y0, w, h
 
-		BlitData data =
-		{
-			src, // source
-			nullptr, // dest
-			bufferRowPitch,   // sPitchB
-			dst->rowPitchBytes(aspect, subresource.mipLevel),   // dPitchB
-			bufferSlicePitch, // sSliceB
-			dst->slicePitchBytes(aspect, subresource.mipLevel), // dSliceB
+		0, // y0d
+		static_cast<int>(extent.height), // y1d
+		0, // x0d
+		static_cast<int>(extent.width), // x1d
 
-			static_cast<float>(-offset.x), // x0
-			static_cast<float>(-offset.y), // y0
-			1.0f, // w
-			1.0f, // h
+		static_cast<int>(extent.width), // sWidth
+		static_cast<int>(extent.height) // sHeight
+	};
 
-			offset.y, // y0d
-			static_cast<int>(offset.y + extent.height), // y1d
-			offset.x, // x0d
-			static_cast<int>(offset.x + extent.width), // x1d
+	VkOffset3D srcOffset = { 0, 0, offset.z };
 
-			static_cast<int>(extent.width), // sWidth
-			static_cast<int>(extent.height) // sHeight;
-		};
+	VkImageSubresourceLayers srcSubresLayers = subresource;
+	srcSubresLayers.layerCount = 1;
 
-		VkOffset3D dstOffset = { 0, 0, offset.z };
-
-		VkImageSubresourceLayers dstSubresLayers = subresource;
-		dstSubresLayers.layerCount = 1;
-
-		VkImageSubresourceRange dstSubresRange =
-		{
-			subresource.aspectMask,
-			subresource.mipLevel,
-			1,
-			subresource.baseArrayLayer,
-			subresource.layerCount
-		};
-
-		uint32_t lastLayer = dst->getLastLayerIndex(dstSubresRange);
-
-		for(; dstSubresLayers.baseArrayLayer <= lastLayer; dstSubresLayers.baseArrayLayer++)
-		{
-			dstOffset.z = offset.z;
-
-			for(auto i = 0u; i < extent.depth; i++)
-			{
-				data.dest = dst->getTexelPointer(dstOffset, dstSubresLayers);
-				ASSERT(data.dest < dst->end());
-				blitRoutine(&data);
-				dstOffset.z++;
-				data.source = (src += bufferSlicePitch);
-			}
-		}
-	}
-
-	void Blitter::blit(const vk::Image *src, vk::Image *dst, VkImageBlit region, VkFilter filter)
+	VkImageSubresourceRange srcSubresRange =
 	{
-		if(dst->getFormat() == VK_FORMAT_UNDEFINED)
-		{
-			return;
-		}
+		subresource.aspectMask,
+		subresource.mipLevel,
+		1,
+		subresource.baseArrayLayer,
+		subresource.layerCount
+	};
 
-		if((region.srcSubresource.layerCount != region.dstSubresource.layerCount) ||
-		   (region.srcSubresource.aspectMask != region.dstSubresource.aspectMask))
-		{
-			UNIMPLEMENTED("region");
-		}
+	uint32_t lastLayer = src->getLastLayerIndex(srcSubresRange);
 
-		if(region.dstOffsets[0].x > region.dstOffsets[1].x)
-		{
-			std::swap(region.srcOffsets[0].x, region.srcOffsets[1].x);
-			std::swap(region.dstOffsets[0].x, region.dstOffsets[1].x);
-		}
-
-		if(region.dstOffsets[0].y > region.dstOffsets[1].y)
-		{
-			std::swap(region.srcOffsets[0].y, region.srcOffsets[1].y);
-			std::swap(region.dstOffsets[0].y, region.dstOffsets[1].y);
-		}
-
-		VkImageAspectFlagBits srcAspect = static_cast<VkImageAspectFlagBits>(region.srcSubresource.aspectMask);
-		VkImageAspectFlagBits dstAspect = static_cast<VkImageAspectFlagBits>(region.dstSubresource.aspectMask);
-		VkExtent3D srcExtent = src->getMipLevelExtent(srcAspect, region.srcSubresource.mipLevel);
-
-		int32_t numSlices = (region.srcOffsets[1].z - region.srcOffsets[0].z);
-		ASSERT(numSlices == (region.dstOffsets[1].z - region.dstOffsets[0].z));
-
-		float widthRatio = static_cast<float>(region.srcOffsets[1].x - region.srcOffsets[0].x) /
-		                   static_cast<float>(region.dstOffsets[1].x - region.dstOffsets[0].x);
-		float heightRatio = static_cast<float>(region.srcOffsets[1].y - region.srcOffsets[0].y) /
-		                    static_cast<float>(region.dstOffsets[1].y - region.dstOffsets[0].y);
-		float x0 = region.srcOffsets[0].x + (0.5f - region.dstOffsets[0].x) * widthRatio;
-		float y0 = region.srcOffsets[0].y + (0.5f - region.dstOffsets[0].y) * heightRatio;
-
-		auto srcFormat = src->getFormat(srcAspect);
-		auto dstFormat = dst->getFormat(dstAspect);
-
-		bool doFilter = (filter != VK_FILTER_NEAREST);
-		bool allowSRGBConversion =
-			doFilter ||
-			(src->getSampleCountFlagBits() > 1) ||
-			(srcFormat.isSRGBformat() != dstFormat.isSRGBformat());
-
-		State state(src->getFormat(srcAspect), dst->getFormat(dstAspect), src->getSampleCountFlagBits(), dst->getSampleCountFlagBits(),
-		            Options{ doFilter, allowSRGBConversion });
-		state.clampToEdge = (region.srcOffsets[0].x < 0) ||
-		                    (region.srcOffsets[0].y < 0) ||
-		                    (static_cast<uint32_t>(region.srcOffsets[1].x) > srcExtent.width) ||
-		                    (static_cast<uint32_t>(region.srcOffsets[1].y) > srcExtent.height) ||
-		                    (doFilter && ((x0 < 0.5f) || (y0 < 0.5f)));
-
-		auto blitRoutine = getBlitRoutine(state);
-		if(!blitRoutine)
-		{
-			return;
-		}
-
-		BlitData data =
-		{
-			nullptr, // source
-			nullptr, // dest
-			src->rowPitchBytes(srcAspect, region.srcSubresource.mipLevel),   // sPitchB
-			dst->rowPitchBytes(dstAspect, region.dstSubresource.mipLevel),   // dPitchB
-			src->slicePitchBytes(srcAspect, region.srcSubresource.mipLevel), // sSliceB
-			dst->slicePitchBytes(dstAspect, region.dstSubresource.mipLevel), // dSliceB
-
-			x0,
-			y0,
-			widthRatio,
-			heightRatio,
-
-			region.dstOffsets[0].y, // y0d
-			region.dstOffsets[1].y, // y1d
-			region.dstOffsets[0].x, // x0d
-			region.dstOffsets[1].x, // x1d
-
-			static_cast<int>(srcExtent.width), // sWidth
-			static_cast<int>(srcExtent.height) // sHeight;
-		};
-
-		VkOffset3D srcOffset = { 0, 0, region.srcOffsets[0].z };
-		VkOffset3D dstOffset = { 0, 0, region.dstOffsets[0].z };
-
-		VkImageSubresourceLayers srcSubresLayers =
-		{
-			region.srcSubresource.aspectMask,
-			region.srcSubresource.mipLevel,
-			region.srcSubresource.baseArrayLayer,
-			1
-		};
-
-		VkImageSubresourceLayers dstSubresLayers =
-		{
-			region.dstSubresource.aspectMask,
-			region.dstSubresource.mipLevel,
-			region.dstSubresource.baseArrayLayer,
-			1
-		};
-
-		VkImageSubresourceRange srcSubresRange =
-		{
-			region.srcSubresource.aspectMask,
-			region.srcSubresource.mipLevel,
-			1,
-			region.srcSubresource.baseArrayLayer,
-			region.srcSubresource.layerCount
-		};
-
-		uint32_t lastLayer = src->getLastLayerIndex(srcSubresRange);
-
-		for(; srcSubresLayers.baseArrayLayer <= lastLayer; srcSubresLayers.baseArrayLayer++, dstSubresLayers.baseArrayLayer++)
-		{
-			srcOffset.z = region.srcOffsets[0].z;
-			dstOffset.z = region.dstOffsets[0].z;
-
-			for(int i = 0; i < numSlices; i++)
-			{
-				data.source = src->getTexelPointer(srcOffset, srcSubresLayers);
-				data.dest = dst->getTexelPointer(dstOffset, dstSubresLayers);
-
-				ASSERT(data.source < src->end());
-				ASSERT(data.dest < dst->end());
-
-				blitRoutine(&data);
-				srcOffset.z++;
-				dstOffset.z++;
-			}
-		}
-	}
-
-	void Blitter::computeCubeCorner(Pointer<Byte>& layer, Int& x0, Int& x1, Int& y0, Int& y1, Int& pitchB, const State& state)
+	for(; srcSubresLayers.baseArrayLayer <= lastLayer; srcSubresLayers.baseArrayLayer++)
 	{
-		int bytes = state.sourceFormat.bytes();
+		srcOffset.z = offset.z;
 
-		Float4 c = readFloat4(layer + ComputeOffset(x0, y1, pitchB, bytes), state) +
-		           readFloat4(layer + ComputeOffset(x1, y0, pitchB, bytes), state) +
-		           readFloat4(layer + ComputeOffset(x1, y1, pitchB, bytes), state);
-
-		c *= Float4(1.0f / 3.0f);
-
-		write(c, layer + ComputeOffset(x0, y0, pitchB, bytes), state);
-	}
-
-	Blitter::CornerUpdateRoutineType Blitter::generateCornerUpdate(const State& state)
-	{
-		// Reading and writing from/to the same image
-		ASSERT(state.sourceFormat == state.destFormat);
-		ASSERT(state.srcSamples == state.destSamples);
-
-		if(state.srcSamples != 1)
+		for(auto i = 0u; i < extent.depth; i++)
 		{
-			UNIMPLEMENTED("state.srcSamples %d", state.srcSamples);
-		}
-
-		CornerUpdateFunction function;
-		{
-			Pointer<Byte> blit(function.Arg<0>());
-
-			Pointer<Byte> layers = *Pointer<Pointer<Byte>>(blit + OFFSET(CubeBorderData, layers));
-			Int pitchB = *Pointer<Int>(blit + OFFSET(CubeBorderData, pitchB));
-			UInt layerSize = *Pointer<Int>(blit + OFFSET(CubeBorderData, layerSize));
-			UInt dim = *Pointer<Int>(blit + OFFSET(CubeBorderData, dim));
-
-			// Low Border, Low Pixel, High Border, High Pixel
-			Int LB(-1), LP(0), HB(dim), HP(dim-1);
-
-			for(int face = 0; face < 6; face++)
-			{
-				computeCubeCorner(layers, LB, LP, LB, LP, pitchB, state);
-				computeCubeCorner(layers, LB, LP, HB, HP, pitchB, state);
-				computeCubeCorner(layers, HB, HP, LB, LP, pitchB, state);
-				computeCubeCorner(layers, HB, HP, HB, HP, pitchB, state);
-				layers = layers + layerSize;
-			}
-		}
-
-		return function("BlitRoutine");
-	}
-
-	void Blitter::updateBorders(vk::Image* image, const VkImageSubresourceLayers& subresourceLayers)
-	{
-		if(image->getArrayLayers() < (subresourceLayers.baseArrayLayer + 6))
-		{
-			UNIMPLEMENTED("image->getArrayLayers() %d, baseArrayLayer %d",
-			              image->getArrayLayers(), subresourceLayers.baseArrayLayer);
-		}
-
-		// From Vulkan 1.1 spec, section 11.5. Image Views:
-		// "For cube and cube array image views, the layers of the image view starting
-		//  at baseArrayLayer correspond to faces in the order +X, -X, +Y, -Y, +Z, -Z."
-		VkImageSubresourceLayers posX = subresourceLayers;
-		posX.layerCount = 1;
-		VkImageSubresourceLayers negX = posX;
-		negX.baseArrayLayer++;
-		VkImageSubresourceLayers posY = negX;
-		posY.baseArrayLayer++;
-		VkImageSubresourceLayers negY = posY;
-		negY.baseArrayLayer++;
-		VkImageSubresourceLayers posZ = negY;
-		posZ.baseArrayLayer++;
-		VkImageSubresourceLayers negZ = posZ;
-		negZ.baseArrayLayer++;
-
-		// Copy top / bottom
-		copyCubeEdge(image, posX, BOTTOM, negY, RIGHT);
-		copyCubeEdge(image, posY, BOTTOM, posZ, TOP);
-		copyCubeEdge(image, posZ, BOTTOM, negY, TOP);
-		copyCubeEdge(image, negX, BOTTOM, negY, LEFT);
-		copyCubeEdge(image, negY, BOTTOM, negZ, BOTTOM);
-		copyCubeEdge(image, negZ, BOTTOM, negY, BOTTOM);
-
-		copyCubeEdge(image, posX, TOP, posY, RIGHT);
-		copyCubeEdge(image, posY, TOP, negZ, TOP);
-		copyCubeEdge(image, posZ, TOP, posY, BOTTOM);
-		copyCubeEdge(image, negX, TOP, posY, LEFT);
-		copyCubeEdge(image, negY, TOP, posZ, BOTTOM);
-		copyCubeEdge(image, negZ, TOP, posY, TOP);
-
-		// Copy left / right
-		copyCubeEdge(image, posX, RIGHT, negZ, LEFT);
-		copyCubeEdge(image, posY, RIGHT, posX, TOP);
-		copyCubeEdge(image, posZ, RIGHT, posX, LEFT);
-		copyCubeEdge(image, negX, RIGHT, posZ, LEFT);
-		copyCubeEdge(image, negY, RIGHT, posX, BOTTOM);
-		copyCubeEdge(image, negZ, RIGHT, negX, LEFT);
-
-		copyCubeEdge(image, posX, LEFT, posZ, RIGHT);
-		copyCubeEdge(image, posY, LEFT, negX, TOP);
-		copyCubeEdge(image, posZ, LEFT, negX, RIGHT);
-		copyCubeEdge(image, negX, LEFT, negZ, RIGHT);
-		copyCubeEdge(image, negY, LEFT, negX, BOTTOM);
-		copyCubeEdge(image, negZ, LEFT, posX, RIGHT);
-
-		// Compute corner colors
-		VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceLayers.aspectMask);
-		vk::Format format = image->getFormat(aspect);
-		VkSampleCountFlagBits samples = image->getSampleCountFlagBits();
-		State state(format, format, samples, samples, Options{ 0xF });
-
-		if(samples != VK_SAMPLE_COUNT_1_BIT)
-		{
-			UNIMPLEMENTED("Multi-sampled cube: %d samples", static_cast<int>(samples));
-		}
-
-		auto cornerUpdateRoutine = getCornerUpdateRoutine(state);
-		if(!cornerUpdateRoutine)
-		{
-			return;
-		}
-
-		VkExtent3D extent = image->getMipLevelExtent(aspect, subresourceLayers.mipLevel);
-		CubeBorderData data =
-		{
-			image->getTexelPointer({ 0, 0, 0 }, posX),
-			image->rowPitchBytes(aspect, subresourceLayers.mipLevel),
-			static_cast<uint32_t>(image->getLayerSize(aspect)),
-			extent.width
-		};
-		cornerUpdateRoutine(&data);
-	}
-
-	void Blitter::copyCubeEdge(vk::Image* image,
-	                           const VkImageSubresourceLayers& dstSubresourceLayers, Edge dstEdge,
-	                           const VkImageSubresourceLayers& srcSubresourceLayers, Edge srcEdge)
-	{
-		ASSERT(srcSubresourceLayers.aspectMask == dstSubresourceLayers.aspectMask);
-		ASSERT(srcSubresourceLayers.mipLevel == dstSubresourceLayers.mipLevel);
-		ASSERT(srcSubresourceLayers.baseArrayLayer != dstSubresourceLayers.baseArrayLayer);
-		ASSERT(srcSubresourceLayers.layerCount == 1);
-		ASSERT(dstSubresourceLayers.layerCount == 1);
-
-		// Figure out if the edges to be copied in reverse order respectively from one another
-		// The copy should be reversed whenever the same edges are contiguous or if we're
-		// copying top <-> right or bottom <-> left. This is explained by the layout, which is:
-		//
-		//      | +y |
-		// | -x | +z | +x | -z |
-		//      | -y |
-
-		bool reverse = (srcEdge == dstEdge) ||
-		               ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
-		               ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
-		               ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
-		               ((srcEdge == LEFT) && (dstEdge == BOTTOM));
-
-		VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(srcSubresourceLayers.aspectMask);
-		int bytes = image->getFormat(aspect).bytes();
-		int pitchB = image->rowPitchBytes(aspect, srcSubresourceLayers.mipLevel);
-
-		VkExtent3D extent = image->getMipLevelExtent(aspect, srcSubresourceLayers.mipLevel);
-		int w = extent.width;
-		int h = extent.height;
-		if(w != h)
-		{
-			UNSUPPORTED("Cube doesn't have square faces : (%d, %d)", w, h);
-		}
-
-		// Src is expressed in the regular [0, width-1], [0, height-1] space
-		bool srcHorizontal = ((srcEdge == TOP) || (srcEdge == BOTTOM));
-		int srcDelta = srcHorizontal ? bytes : pitchB;
-		VkOffset3D srcOffset = { (srcEdge == RIGHT) ? (w - 1) : 0, (srcEdge == BOTTOM) ? (h - 1) : 0, 0 };
-
-		// Dst contains borders, so it is expressed in the [-1, width], [-1, height] space
-		bool dstHorizontal = ((dstEdge == TOP) || (dstEdge == BOTTOM));
-		int dstDelta = (dstHorizontal ? bytes : pitchB) * (reverse ? -1 : 1);
-		VkOffset3D dstOffset = { (dstEdge == RIGHT) ? w : -1, (dstEdge == BOTTOM) ? h : -1, 0 };
-
-		// Don't write in the corners
-		if(dstHorizontal)
-		{
-			dstOffset.x += reverse ? w : 1;
-		}
-		else
-		{
-			dstOffset.y += reverse ? h : 1;
-		}
-
-		const uint8_t* src = static_cast<const uint8_t*>(image->getTexelPointer(srcOffset, srcSubresourceLayers));
-		uint8_t *dst = static_cast<uint8_t*>(image->getTexelPointer(dstOffset, dstSubresourceLayers));
-		ASSERT((src < image->end()) && ((src + (w * srcDelta)) < image->end()));
-		ASSERT((dst < image->end()) && ((dst + (w * dstDelta)) < image->end()));
-
-		for(int i = 0; i < w; ++i, dst += dstDelta, src += srcDelta)
-		{
-			memcpy(dst, src, bytes);
+			data.source = src->getTexelPointer(srcOffset, srcSubresLayers);
+			ASSERT(data.source < src->end());
+			blitRoutine(&data);
+			srcOffset.z++;
+			data.dest = (dst += bufferSlicePitch);
 		}
 	}
 }
+
+void Blitter::blitFromBuffer(const vk::Image *dst, VkImageSubresourceLayers subresource, VkOffset3D offset, VkExtent3D extent, uint8_t *src, int bufferRowPitch, int bufferSlicePitch)
+{
+	auto aspect = static_cast<VkImageAspectFlagBits>(subresource.aspectMask);
+	auto format = dst->getFormat(aspect);
+	State state(format, format, VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_1_BIT, Options{false, false});
+
+	auto blitRoutine = getBlitRoutine(state);
+	if(!blitRoutine)
+	{
+		return;
+	}
+
+	BlitData data =
+	{
+		src, // source
+		nullptr, // dest
+		bufferRowPitch,   // sPitchB
+		dst->rowPitchBytes(aspect, subresource.mipLevel),   // dPitchB
+		bufferSlicePitch, // sSliceB
+		dst->slicePitchBytes(aspect, subresource.mipLevel), // dSliceB
+
+		static_cast<float>(-offset.x), // x0
+		static_cast<float>(-offset.y), // y0
+		1.0f, // w
+		1.0f, // h
+
+		offset.y, // y0d
+		static_cast<int>(offset.y + extent.height), // y1d
+		offset.x, // x0d
+		static_cast<int>(offset.x + extent.width), // x1d
+
+		static_cast<int>(extent.width), // sWidth
+		static_cast<int>(extent.height) // sHeight
+	};
+
+	VkOffset3D dstOffset = { 0, 0, offset.z };
+
+	VkImageSubresourceLayers dstSubresLayers = subresource;
+	dstSubresLayers.layerCount = 1;
+
+	VkImageSubresourceRange dstSubresRange =
+	{
+		subresource.aspectMask,
+		subresource.mipLevel,
+		1,
+		subresource.baseArrayLayer,
+		subresource.layerCount
+	};
+
+	uint32_t lastLayer = dst->getLastLayerIndex(dstSubresRange);
+
+	for(; dstSubresLayers.baseArrayLayer <= lastLayer; dstSubresLayers.baseArrayLayer++)
+	{
+		dstOffset.z = offset.z;
+
+		for(auto i = 0u; i < extent.depth; i++)
+		{
+			data.dest = dst->getTexelPointer(dstOffset, dstSubresLayers);
+			ASSERT(data.dest < dst->end());
+			blitRoutine(&data);
+			dstOffset.z++;
+			data.source = (src += bufferSlicePitch);
+		}
+	}
+}
+
+void Blitter::blit(const vk::Image *src, vk::Image *dst, VkImageBlit region, VkFilter filter)
+{
+	if(dst->getFormat() == VK_FORMAT_UNDEFINED)
+	{
+		return;
+	}
+
+	if((region.srcSubresource.layerCount != region.dstSubresource.layerCount) ||
+	   (region.srcSubresource.aspectMask != region.dstSubresource.aspectMask))
+	{
+		UNIMPLEMENTED("region");
+	}
+
+	if(region.dstOffsets[0].x > region.dstOffsets[1].x)
+	{
+		std::swap(region.srcOffsets[0].x, region.srcOffsets[1].x);
+		std::swap(region.dstOffsets[0].x, region.dstOffsets[1].x);
+	}
+
+	if(region.dstOffsets[0].y > region.dstOffsets[1].y)
+	{
+		std::swap(region.srcOffsets[0].y, region.srcOffsets[1].y);
+		std::swap(region.dstOffsets[0].y, region.dstOffsets[1].y);
+	}
+
+	VkImageAspectFlagBits srcAspect = static_cast<VkImageAspectFlagBits>(region.srcSubresource.aspectMask);
+	VkImageAspectFlagBits dstAspect = static_cast<VkImageAspectFlagBits>(region.dstSubresource.aspectMask);
+	VkExtent3D srcExtent = src->getMipLevelExtent(srcAspect, region.srcSubresource.mipLevel);
+
+	int32_t numSlices = (region.srcOffsets[1].z - region.srcOffsets[0].z);
+	ASSERT(numSlices == (region.dstOffsets[1].z - region.dstOffsets[0].z));
+
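+	// x0/y0 and the ratios map destination pixel centers back into source space:
+	// destination pixel x samples the source at x0 + x * widthRatio.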
+	float widthRatio = static_cast<float>(region.srcOffsets[1].x - region.srcOffsets[0].x) /
+	                   static_cast<float>(region.dstOffsets[1].x - region.dstOffsets[0].x);
+	float heightRatio = static_cast<float>(region.srcOffsets[1].y - region.srcOffsets[0].y) /
+	                    static_cast<float>(region.dstOffsets[1].y - region.dstOffsets[0].y);
+	float x0 = region.srcOffsets[0].x + (0.5f - region.dstOffsets[0].x) * widthRatio;
+	float y0 = region.srcOffsets[0].y + (0.5f - region.dstOffsets[0].y) * heightRatio;
+
+	auto srcFormat = src->getFormat(srcAspect);
+	auto dstFormat = dst->getFormat(dstAspect);
+
+	bool doFilter = (filter != VK_FILTER_NEAREST);
+	bool allowSRGBConversion =
+		doFilter ||
+		(src->getSampleCountFlagBits() > 1) ||
+		(srcFormat.isSRGBformat() != dstFormat.isSRGBformat());
+
+	State state(src->getFormat(srcAspect), dst->getFormat(dstAspect), src->getSampleCountFlagBits(), dst->getSampleCountFlagBits(),
+	            Options{ doFilter, allowSRGBConversion });
+	state.clampToEdge = (region.srcOffsets[0].x < 0) ||
+	                    (region.srcOffsets[0].y < 0) ||
+	                    (static_cast<uint32_t>(region.srcOffsets[1].x) > srcExtent.width) ||
+	                    (static_cast<uint32_t>(region.srcOffsets[1].y) > srcExtent.height) ||
+	                    (doFilter && ((x0 < 0.5f) || (y0 < 0.5f)));
+
+	auto blitRoutine = getBlitRoutine(state);
+	if(!blitRoutine)
+	{
+		return;
+	}
+
+	BlitData data =
+	{
+		nullptr, // source
+		nullptr, // dest
+		src->rowPitchBytes(srcAspect, region.srcSubresource.mipLevel),   // sPitchB
+		dst->rowPitchBytes(dstAspect, region.dstSubresource.mipLevel),   // dPitchB
+		src->slicePitchBytes(srcAspect, region.srcSubresource.mipLevel), // sSliceB
+		dst->slicePitchBytes(dstAspect, region.dstSubresource.mipLevel), // dSliceB
+
+		x0,
+		y0,
+		widthRatio,
+		heightRatio,
+
+		region.dstOffsets[0].y, // y0d
+		region.dstOffsets[1].y, // y1d
+		region.dstOffsets[0].x, // x0d
+		region.dstOffsets[1].x, // x1d
+
+		static_cast<int>(srcExtent.width), // sWidth
+		static_cast<int>(srcExtent.height) // sHeight
+	};
+
+	VkOffset3D srcOffset = { 0, 0, region.srcOffsets[0].z };
+	VkOffset3D dstOffset = { 0, 0, region.dstOffsets[0].z };
+
+	VkImageSubresourceLayers srcSubresLayers =
+	{
+		region.srcSubresource.aspectMask,
+		region.srcSubresource.mipLevel,
+		region.srcSubresource.baseArrayLayer,
+		1
+	};
+
+	VkImageSubresourceLayers dstSubresLayers =
+	{
+		region.dstSubresource.aspectMask,
+		region.dstSubresource.mipLevel,
+		region.dstSubresource.baseArrayLayer,
+		1
+	};
+
+	VkImageSubresourceRange srcSubresRange =
+	{
+		region.srcSubresource.aspectMask,
+		region.srcSubresource.mipLevel,
+		1,
+		region.srcSubresource.baseArrayLayer,
+		region.srcSubresource.layerCount
+	};
+
+	uint32_t lastLayer = src->getLastLayerIndex(srcSubresRange);
+
+	for(; srcSubresLayers.baseArrayLayer <= lastLayer; srcSubresLayers.baseArrayLayer++, dstSubresLayers.baseArrayLayer++)
+	{
+		srcOffset.z = region.srcOffsets[0].z;
+		dstOffset.z = region.dstOffsets[0].z;
+
+		for(int i = 0; i < numSlices; i++)
+		{
+			data.source = src->getTexelPointer(srcOffset, srcSubresLayers);
+			data.dest = dst->getTexelPointer(dstOffset, dstSubresLayers);
+
+			ASSERT(data.source < src->end());
+			ASSERT(data.dest < dst->end());
+
+			blitRoutine(&data);
+			srcOffset.z++;
+			dstOffset.z++;
+		}
+	}
+}
+
+void Blitter::computeCubeCorner(Pointer<Byte>& layer, Int& x0, Int& x1, Int& y0, Int& y1, Int& pitchB, const State& state)
+{
+	int bytes = state.sourceFormat.bytes();
+
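+	// The corner border texel (x0, y0) has no source texel of its own; approximate
+	// it as the average of its three direct neighbors.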
+	Float4 c = readFloat4(layer + ComputeOffset(x0, y1, pitchB, bytes), state) +
+	           readFloat4(layer + ComputeOffset(x1, y0, pitchB, bytes), state) +
+	           readFloat4(layer + ComputeOffset(x1, y1, pitchB, bytes), state);
+
+	c *= Float4(1.0f / 3.0f);
+
+	write(c, layer + ComputeOffset(x0, y0, pitchB, bytes), state);
+}
+
+Blitter::CornerUpdateRoutineType Blitter::generateCornerUpdate(const State& state)
+{
+	// Reading and writing from/to the same image
+	ASSERT(state.sourceFormat == state.destFormat);
+	ASSERT(state.srcSamples == state.destSamples);
+
+	if(state.srcSamples != 1)
+	{
+		UNIMPLEMENTED("state.srcSamples %d", state.srcSamples);
+	}
+
+	CornerUpdateFunction function;
+	{
+		Pointer<Byte> blit(function.Arg<0>());
+
+		Pointer<Byte> layers = *Pointer<Pointer<Byte>>(blit + OFFSET(CubeBorderData, layers));
+		Int pitchB = *Pointer<Int>(blit + OFFSET(CubeBorderData, pitchB));
+		UInt layerSize = *Pointer<Int>(blit + OFFSET(CubeBorderData, layerSize));
+		UInt dim = *Pointer<Int>(blit + OFFSET(CubeBorderData, dim));
+
+		// Low Border, Low Pixel, High Border, High Pixel
+		Int LB(-1), LP(0), HB(dim), HP(dim-1);
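+		// Border texels sit one texel outside the face, so coordinates range over [-1, dim].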
+
+		for(int face = 0; face < 6; face++)
+		{
+			computeCubeCorner(layers, LB, LP, LB, LP, pitchB, state);
+			computeCubeCorner(layers, LB, LP, HB, HP, pitchB, state);
+			computeCubeCorner(layers, HB, HP, LB, LP, pitchB, state);
+			computeCubeCorner(layers, HB, HP, HB, HP, pitchB, state);
+			layers = layers + layerSize;
+		}
+	}
+
+	return function("BlitRoutine");
+}
+
+void Blitter::updateBorders(vk::Image* image, const VkImageSubresourceLayers& subresourceLayers)
+{
+	if(image->getArrayLayers() < (subresourceLayers.baseArrayLayer + 6))
+	{
+		UNIMPLEMENTED("image->getArrayLayers() %d, baseArrayLayer %d",
+		              image->getArrayLayers(), subresourceLayers.baseArrayLayer);
+	}
+
+	// From Vulkan 1.1 spec, section 11.5. Image Views:
+	// "For cube and cube array image views, the layers of the image view starting
+	//  at baseArrayLayer correspond to faces in the order +X, -X, +Y, -Y, +Z, -Z."
+	VkImageSubresourceLayers posX = subresourceLayers;
+	posX.layerCount = 1;
+	VkImageSubresourceLayers negX = posX;
+	negX.baseArrayLayer++;
+	VkImageSubresourceLayers posY = negX;
+	posY.baseArrayLayer++;
+	VkImageSubresourceLayers negY = posY;
+	negY.baseArrayLayer++;
+	VkImageSubresourceLayers posZ = negY;
+	posZ.baseArrayLayer++;
+	VkImageSubresourceLayers negZ = posZ;
+	negZ.baseArrayLayer++;
+
+	// Copy top / bottom
+	copyCubeEdge(image, posX, BOTTOM, negY, RIGHT);
+	copyCubeEdge(image, posY, BOTTOM, posZ, TOP);
+	copyCubeEdge(image, posZ, BOTTOM, negY, TOP);
+	copyCubeEdge(image, negX, BOTTOM, negY, LEFT);
+	copyCubeEdge(image, negY, BOTTOM, negZ, BOTTOM);
+	copyCubeEdge(image, negZ, BOTTOM, negY, BOTTOM);
+
+	copyCubeEdge(image, posX, TOP, posY, RIGHT);
+	copyCubeEdge(image, posY, TOP, negZ, TOP);
+	copyCubeEdge(image, posZ, TOP, posY, BOTTOM);
+	copyCubeEdge(image, negX, TOP, posY, LEFT);
+	copyCubeEdge(image, negY, TOP, posZ, BOTTOM);
+	copyCubeEdge(image, negZ, TOP, posY, TOP);
+
+	// Copy left / right
+	copyCubeEdge(image, posX, RIGHT, negZ, LEFT);
+	copyCubeEdge(image, posY, RIGHT, posX, TOP);
+	copyCubeEdge(image, posZ, RIGHT, posX, LEFT);
+	copyCubeEdge(image, negX, RIGHT, posZ, LEFT);
+	copyCubeEdge(image, negY, RIGHT, posX, BOTTOM);
+	copyCubeEdge(image, negZ, RIGHT, negX, LEFT);
+
+	copyCubeEdge(image, posX, LEFT, posZ, RIGHT);
+	copyCubeEdge(image, posY, LEFT, negX, TOP);
+	copyCubeEdge(image, posZ, LEFT, negX, RIGHT);
+	copyCubeEdge(image, negX, LEFT, negZ, RIGHT);
+	copyCubeEdge(image, negY, LEFT, negX, BOTTOM);
+	copyCubeEdge(image, negZ, LEFT, posX, RIGHT);
+
+	// Compute corner colors
+	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceLayers.aspectMask);
+	vk::Format format = image->getFormat(aspect);
+	VkSampleCountFlagBits samples = image->getSampleCountFlagBits();
+	State state(format, format, samples, samples, Options{ 0xF });
+
+	if(samples != VK_SAMPLE_COUNT_1_BIT)
+	{
+		UNIMPLEMENTED("Multi-sampled cube: %d samples", static_cast<int>(samples));
+	}
+
+	auto cornerUpdateRoutine = getCornerUpdateRoutine(state);
+	if(!cornerUpdateRoutine)
+	{
+		return;
+	}
+
+	VkExtent3D extent = image->getMipLevelExtent(aspect, subresourceLayers.mipLevel);
+	CubeBorderData data =
+	{
+		image->getTexelPointer({ 0, 0, 0 }, posX),
+		image->rowPitchBytes(aspect, subresourceLayers.mipLevel),
+		static_cast<uint32_t>(image->getLayerSize(aspect)),
+		extent.width
+	};
+	cornerUpdateRoutine(&data);
+}
+
+void Blitter::copyCubeEdge(vk::Image* image,
+                           const VkImageSubresourceLayers& dstSubresourceLayers, Edge dstEdge,
+                           const VkImageSubresourceLayers& srcSubresourceLayers, Edge srcEdge)
+{
+	ASSERT(srcSubresourceLayers.aspectMask == dstSubresourceLayers.aspectMask);
+	ASSERT(srcSubresourceLayers.mipLevel == dstSubresourceLayers.mipLevel);
+	ASSERT(srcSubresourceLayers.baseArrayLayer != dstSubresourceLayers.baseArrayLayer);
+	ASSERT(srcSubresourceLayers.layerCount == 1);
+	ASSERT(dstSubresourceLayers.layerCount == 1);
+
+	// Figure out whether the edges are to be copied in reverse order relative to one another.
+	// The copy should be reversed whenever the same edges are contiguous or if we're
+	// copying top <-> right or bottom <-> left. This is explained by the layout, which is:
+	//
+	//      | +y |
+	// | -x | +z | +x | -z |
+	//      | -y |
+
+	bool reverse = (srcEdge == dstEdge) ||
+	               ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
+	               ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
+	               ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
+	               ((srcEdge == LEFT) && (dstEdge == BOTTOM));
+
+	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(srcSubresourceLayers.aspectMask);
+	int bytes = image->getFormat(aspect).bytes();
+	int pitchB = image->rowPitchBytes(aspect, srcSubresourceLayers.mipLevel);
+
+	VkExtent3D extent = image->getMipLevelExtent(aspect, srcSubresourceLayers.mipLevel);
+	int w = extent.width;
+	int h = extent.height;
+	if(w != h)
+	{
+		UNSUPPORTED("Cube doesn't have square faces : (%d, %d)", w, h);
+	}
+
+	// Src is expressed in the regular [0, width-1], [0, height-1] space
+	bool srcHorizontal = ((srcEdge == TOP) || (srcEdge == BOTTOM));
+	int srcDelta = srcHorizontal ? bytes : pitchB;
+	VkOffset3D srcOffset = { (srcEdge == RIGHT) ? (w - 1) : 0, (srcEdge == BOTTOM) ? (h - 1) : 0, 0 };
+
+	// Dst contains borders, so it is expressed in the [-1, width], [-1, height] space
+	bool dstHorizontal = ((dstEdge == TOP) || (dstEdge == BOTTOM));
+	int dstDelta = (dstHorizontal ? bytes : pitchB) * (reverse ? -1 : 1);
+	VkOffset3D dstOffset = { (dstEdge == RIGHT) ? w : -1, (dstEdge == BOTTOM) ? h : -1, 0 };
+
+	// Don't write in the corners
+	if(dstHorizontal)
+	{
+		dstOffset.x += reverse ? w : 1;
+	}
+	else
+	{
+		dstOffset.y += reverse ? h : 1;
+	}
+
+	const uint8_t* src = static_cast<const uint8_t*>(image->getTexelPointer(srcOffset, srcSubresourceLayers));
+	uint8_t *dst = static_cast<uint8_t*>(image->getTexelPointer(dstOffset, dstSubresourceLayers));
+	ASSERT((src < image->end()) && ((src + (w * srcDelta)) < image->end()));
+	ASSERT((dst < image->end()) && ((dst + (w * dstDelta)) < image->end()));
+
+	for(int i = 0; i < w; ++i, dst += dstDelta, src += srcDelta)
+	{
+		memcpy(dst, src, bytes);
+	}
+}
+
+}  // namespace sw
diff --git a/src/Device/Blitter.hpp b/src/Device/Blitter.hpp
index 0157e88..317fdcc 100644
--- a/src/Device/Blitter.hpp
+++ b/src/Device/Blitter.hpp
@@ -23,139 +23,141 @@
 #include <mutex>
 #include <cstring>
 
-namespace vk
-{
-	class Image;
-	class Buffer;
-}
+namespace vk {
 
-namespace sw
+class Image;
+class Buffer;
+
+}  // namespace vk
+
+namespace sw {
+
+class Blitter
 {
-	class Blitter
+	struct Options
 	{
-		struct Options
+		explicit Options() = default;
+		explicit Options(bool filter, bool allowSRGBConversion)
+			: writeMask(0xF), clearOperation(false), filter(filter), allowSRGBConversion(allowSRGBConversion), clampToEdge(false) {}
+		explicit Options(unsigned int writeMask)
+			: writeMask(writeMask), clearOperation(true), filter(false), allowSRGBConversion(true), clampToEdge(false) {}
+
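+		// The per-channel write bits alias the writeMask byte, so the mask can be
+		// set for individual channels or as a whole.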
+		union
 		{
-			explicit Options() = default;
-			explicit Options(bool filter, bool allowSRGBConversion)
-				: writeMask(0xF), clearOperation(false), filter(filter), allowSRGBConversion(allowSRGBConversion), clampToEdge(false) {}
-			explicit Options(unsigned int writeMask)
-				: writeMask(writeMask), clearOperation(true), filter(false), allowSRGBConversion(true), clampToEdge(false) {}
-
-			union
+			struct
 			{
-				struct
-				{
-					bool writeRed : 1;
-					bool writeGreen : 1;
-					bool writeBlue : 1;
-					bool writeAlpha : 1;
-				};
-
-				unsigned char writeMask;
+				bool writeRed : 1;
+				bool writeGreen : 1;
+				bool writeBlue : 1;
+				bool writeAlpha : 1;
 			};
 
-			bool clearOperation : 1;
-			bool filter : 1;
-			bool allowSRGBConversion : 1;
-			bool clampToEdge : 1;
+			unsigned char writeMask;
 		};
 
-		struct State : Memset<State>, Options
-		{
-			State() : Memset(this, 0) {}
-			State(const Options &options) : Memset(this, 0), Options(options) {}
-			State(vk::Format sourceFormat, vk::Format destFormat, int srcSamples, int destSamples, const Options &options) :
-				Memset(this, 0), Options(options), sourceFormat(sourceFormat), destFormat(destFormat), srcSamples(srcSamples), destSamples(destSamples) {}
-
-			bool operator==(const State &state) const
-			{
-				static_assert(is_memcmparable<State>::value, "Cannot memcmp State");
-				return memcmp(this, &state, sizeof(State)) == 0;
-			}
-
-			vk::Format sourceFormat;
-			vk::Format destFormat;
-			int srcSamples = 0;
-			int destSamples = 0;
-		};
-
-		struct BlitData
-		{
-			void *source;
-			void *dest;
-			int sPitchB;
-			int dPitchB;
-			int sSliceB;
-			int dSliceB;
-
-			float x0;
-			float y0;
-			float w;
-			float h;
-
-			int y0d;
-			int y1d;
-			int x0d;
-			int x1d;
-
-			int sWidth;
-			int sHeight;
-		};
-
-		struct CubeBorderData
-		{
-			void *layers;
-			int pitchB;
-			uint32_t layerSize;
-			uint32_t dim;
-		};
-
-	public:
-		Blitter();
-		virtual ~Blitter();
-
-		void clear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format& viewFormat, const VkImageSubresourceRange& subresourceRange, const VkRect2D* renderArea = nullptr);
-
-		void blit(const vk::Image *src, vk::Image *dst, VkImageBlit region, VkFilter filter);
-		void blitToBuffer(const vk::Image *src, VkImageSubresourceLayers subresource, VkOffset3D offset, VkExtent3D extent, uint8_t *dst, int bufferRowPitch, int bufferSlicePitch);
-		void blitFromBuffer(const vk::Image *dst, VkImageSubresourceLayers subresource, VkOffset3D offset, VkExtent3D extent, uint8_t *src, int bufferRowPitch, int bufferSlicePitch);
-
-		void updateBorders(vk::Image* image, const VkImageSubresourceLayers& subresourceLayers);
-
-	private:
-		enum Edge { TOP, BOTTOM, RIGHT, LEFT };
-
-		bool fastClear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format& viewFormat, const VkImageSubresourceRange& subresourceRange, const VkRect2D* renderArea);
-
-		Float4 readFloat4(Pointer<Byte> element, const State &state);
-		void write(Float4 &color, Pointer<Byte> element, const State &state);
-		Int4 readInt4(Pointer<Byte> element, const State &state);
-		void write(Int4 &color, Pointer<Byte> element, const State &state);
-		static void ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled = false);
-		static Int ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes);
-		static Float4 LinearToSRGB(Float4 &color);
-		static Float4 sRGBtoLinear(Float4 &color);
-
-		using BlitFunction = FunctionT<void(const BlitData*)>;
-		using BlitRoutineType = BlitFunction::RoutineType;
-		BlitRoutineType getBlitRoutine(const State &state);
-		BlitRoutineType generate(const State &state);
-
-		using CornerUpdateFunction = FunctionT<void(const CubeBorderData*)>;
-		using CornerUpdateRoutineType = CornerUpdateFunction::RoutineType;
-		CornerUpdateRoutineType getCornerUpdateRoutine(const State &state);
-		CornerUpdateRoutineType generateCornerUpdate(const State& state);
-		void computeCubeCorner(Pointer<Byte>& layer, Int& x0, Int& x1, Int& y0, Int& y1, Int& pitchB, const State& state);
-
-		void copyCubeEdge(vk::Image* image,
-	                      const VkImageSubresourceLayers& dstSubresourceLayers, Edge dstEdge,
-	                      const VkImageSubresourceLayers& srcSubresourceLayers, Edge srcEdge);
-
-		std::mutex blitMutex;
-		RoutineCacheT<State, BlitFunction::CFunctionType> blitCache; // guarded by blitMutex
-		std::mutex cornerUpdateMutex;
-		RoutineCacheT<State, CornerUpdateFunction::CFunctionType> cornerUpdateCache; // guarded by cornerUpdateMutex
+		bool clearOperation : 1;
+		bool filter : 1;
+		bool allowSRGBConversion : 1;
+		bool clampToEdge : 1;
 	};
-}
+
+	struct State : Memset<State>, Options
+	{
+		State() : Memset(this, 0) {}
+		State(const Options &options) : Memset(this, 0), Options(options) {}
+		State(vk::Format sourceFormat, vk::Format destFormat, int srcSamples, int destSamples, const Options &options) :
+			Memset(this, 0), Options(options), sourceFormat(sourceFormat), destFormat(destFormat), srcSamples(srcSamples), destSamples(destSamples) {}
+
+		bool operator==(const State &state) const
+		{
+			static_assert(is_memcmparable<State>::value, "Cannot memcmp State");
+			return memcmp(this, &state, sizeof(State)) == 0;
+		}
+
+		vk::Format sourceFormat;
+		vk::Format destFormat;
+		int srcSamples = 0;
+		int destSamples = 0;
+	};
+
+	struct BlitData
+	{
+		void *source;
+		void *dest;
+		int sPitchB;
+		int dPitchB;
+		int sSliceB;
+		int dSliceB;
+
+		float x0;
+		float y0;
+		float w;
+		float h;
+
+		int y0d;
+		int y1d;
+		int x0d;
+		int x1d;
+
+		int sWidth;
+		int sHeight;
+	};
+
+	struct CubeBorderData
+	{
+		void *layers;
+		int pitchB;
+		uint32_t layerSize;
+		uint32_t dim;
+	};
+
+public:
+	Blitter();
+	virtual ~Blitter();
+
+	void clear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format& viewFormat, const VkImageSubresourceRange& subresourceRange, const VkRect2D* renderArea = nullptr);
+
+	void blit(const vk::Image *src, vk::Image *dst, VkImageBlit region, VkFilter filter);
+	void blitToBuffer(const vk::Image *src, VkImageSubresourceLayers subresource, VkOffset3D offset, VkExtent3D extent, uint8_t *dst, int bufferRowPitch, int bufferSlicePitch);
+	void blitFromBuffer(const vk::Image *dst, VkImageSubresourceLayers subresource, VkOffset3D offset, VkExtent3D extent, uint8_t *src, int bufferRowPitch, int bufferSlicePitch);
+
+	void updateBorders(vk::Image* image, const VkImageSubresourceLayers& subresourceLayers);
+
+private:
+	enum Edge { TOP, BOTTOM, RIGHT, LEFT };
+
+	bool fastClear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format& viewFormat, const VkImageSubresourceRange& subresourceRange, const VkRect2D* renderArea);
+
+	Float4 readFloat4(Pointer<Byte> element, const State &state);
+	void write(Float4 &color, Pointer<Byte> element, const State &state);
+	Int4 readInt4(Pointer<Byte> element, const State &state);
+	void write(Int4 &color, Pointer<Byte> element, const State &state);
+	static void ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled = false);
+	static Int ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes);
+	static Float4 LinearToSRGB(Float4 &color);
+	static Float4 sRGBtoLinear(Float4 &color);
+
+	using BlitFunction = FunctionT<void(const BlitData*)>;
+	using BlitRoutineType = BlitFunction::RoutineType;
+	BlitRoutineType getBlitRoutine(const State &state);
+	BlitRoutineType generate(const State &state);
+
+	using CornerUpdateFunction = FunctionT<void(const CubeBorderData*)>;
+	using CornerUpdateRoutineType = CornerUpdateFunction::RoutineType;
+	CornerUpdateRoutineType getCornerUpdateRoutine(const State &state);
+	CornerUpdateRoutineType generateCornerUpdate(const State& state);
+	void computeCubeCorner(Pointer<Byte>& layer, Int& x0, Int& x1, Int& y0, Int& y1, Int& pitchB, const State& state);
+
+	void copyCubeEdge(vk::Image* image,
+	                  const VkImageSubresourceLayers& dstSubresourceLayers, Edge dstEdge,
+	                  const VkImageSubresourceLayers& srcSubresourceLayers, Edge srcEdge);
+
+	std::mutex blitMutex;
+	RoutineCacheT<State, BlitFunction::CFunctionType> blitCache; // guarded by blitMutex
+	std::mutex cornerUpdateMutex;
+	RoutineCacheT<State, CornerUpdateFunction::CFunctionType> cornerUpdateCache; // guarded by cornerUpdateMutex
+};
+
+}  // namespace sw
 
 #endif   // sw_Blitter_hpp
diff --git a/src/Device/Clipper.cpp b/src/Device/Clipper.cpp
index 43fa72b..d36de8c 100644
--- a/src/Device/Clipper.cpp
+++ b/src/Device/Clipper.cpp
@@ -17,278 +17,280 @@
 #include "Polygon.hpp"
 #include "Renderer.hpp"
 
-namespace
+namespace {
+
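+// Computes the intersection of edge Vi-Vj with a clip plane, given the signed
+// distances di and dj from the two endpoints to the plane.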
+inline void clipEdge(sw::float4 &Vo, const sw::float4 &Vi, const sw::float4 &Vj, float di, float dj)
 {
-	inline void clipEdge(sw::float4 &Vo, const sw::float4 &Vi, const sw::float4 &Vj, float di, float dj)
-	{
-		float D = 1.0f / (dj - di);
+	float D = 1.0f / (dj - di);
 
-		Vo.x = (dj * Vi.x - di * Vj.x) * D;
-		Vo.y = (dj * Vi.y - di * Vj.y) * D;
-		Vo.z = (dj * Vi.z - di * Vj.z) * D;
-		Vo.w = (dj * Vi.w - di * Vj.w) * D;
-	}
-
-	void clipNear(sw::Polygon &polygon)
-	{
-		const sw::float4 **V = polygon.P[polygon.i];
-		const sw::float4 **T = polygon.P[polygon.i + 1];
-
-		int t = 0;
-
-		for(int i = 0; i < polygon.n; i++)
-		{
-			int j = i == polygon.n - 1 ? 0 : i + 1;
-
-			float di = V[i]->z;
-			float dj = V[j]->z;
-
-			if(di >= 0)
-			{
-				T[t++] = V[i];
-
-				if(dj < 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-			else
-			{
-				if(dj > 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-		}
-
-		polygon.n = t;
-		polygon.i += 1;
-	}
-
-	void clipFar(sw::Polygon &polygon)
-	{
-		const sw::float4 **V = polygon.P[polygon.i];
-		const sw::float4 **T = polygon.P[polygon.i + 1];
-
-		int t = 0;
-
-		for(int i = 0; i < polygon.n; i++)
-		{
-			int j = i == polygon.n - 1 ? 0 : i + 1;
-
-			float di = V[i]->w - V[i]->z;
-			float dj = V[j]->w - V[j]->z;
-
-			if(di >= 0)
-			{
-				T[t++] = V[i];
-
-				if(dj < 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-			else
-			{
-				if(dj > 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-		}
-
-		polygon.n = t;
-		polygon.i += 1;
-	}
-
-	void clipLeft(sw::Polygon &polygon)
-	{
-		const sw::float4 **V = polygon.P[polygon.i];
-		const sw::float4 **T = polygon.P[polygon.i + 1];
-
-		int t = 0;
-
-		for(int i = 0; i < polygon.n; i++)
-		{
-			int j = i == polygon.n - 1 ? 0 : i + 1;
-
-			float di = V[i]->w + V[i]->x;
-			float dj = V[j]->w + V[j]->x;
-
-			if(di >= 0)
-			{
-				T[t++] = V[i];
-
-				if(dj < 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-			else
-			{
-				if(dj > 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-		}
-
-		polygon.n = t;
-		polygon.i += 1;
-	}
-
-	void clipRight(sw::Polygon &polygon)
-	{
-		const sw::float4 **V = polygon.P[polygon.i];
-		const sw::float4 **T = polygon.P[polygon.i + 1];
-
-		int t = 0;
-
-		for(int i = 0; i < polygon.n; i++)
-		{
-			int j = i == polygon.n - 1 ? 0 : i + 1;
-
-			float di = V[i]->w - V[i]->x;
-			float dj = V[j]->w - V[j]->x;
-
-			if(di >= 0)
-			{
-				T[t++] = V[i];
-
-				if(dj < 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-			else
-			{
-				if(dj > 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-		}
-
-		polygon.n = t;
-		polygon.i += 1;
-	}
-
-	void clipTop(sw::Polygon &polygon)
-	{
-		const sw::float4 **V = polygon.P[polygon.i];
-		const sw::float4 **T = polygon.P[polygon.i + 1];
-
-		int t = 0;
-
-		for(int i = 0; i < polygon.n; i++)
-		{
-			int j = i == polygon.n - 1 ? 0 : i + 1;
-
-			float di = V[i]->w - V[i]->y;
-			float dj = V[j]->w - V[j]->y;
-
-			if(di >= 0)
-			{
-				T[t++] = V[i];
-
-				if(dj < 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-			else
-			{
-				if(dj > 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-		}
-
-		polygon.n = t;
-		polygon.i += 1;
-	}
-
-	void clipBottom(sw::Polygon &polygon)
-	{
-		const sw::float4 **V = polygon.P[polygon.i];
-		const sw::float4 **T = polygon.P[polygon.i + 1];
-
-		int t = 0;
-
-		for(int i = 0; i < polygon.n; i++)
-		{
-			int j = i == polygon.n - 1 ? 0 : i + 1;
-
-			float di = V[i]->w + V[i]->y;
-			float dj = V[j]->w + V[j]->y;
-
-			if(di >= 0)
-			{
-				T[t++] = V[i];
-
-				if(dj < 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-			else
-			{
-				if(dj > 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-		}
-
-		polygon.n = t;
-		polygon.i += 1;
-	}
+	Vo.x = (dj * Vi.x - di * Vj.x) * D;
+	Vo.y = (dj * Vi.y - di * Vj.y) * D;
+	Vo.z = (dj * Vi.z - di * Vj.z) * D;
+	Vo.w = (dj * Vi.w - di * Vj.w) * D;
 }
 
-namespace sw
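+// Each routine below clips the polygon against one frustum plane using a
+// Sutherland-Hodgman pass: vertices with a non-negative signed distance are
+// kept, and a new vertex is interpolated where an edge crosses the plane.
+// The six routines differ only in the plane distance they evaluate.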
+void clipNear(sw::Polygon &polygon)
 {
-	unsigned int Clipper::ComputeClipFlags(const float4 &v)
-	{
-		return ((v.x > v.w)     ? CLIP_RIGHT  : 0) |
-		       ((v.y > v.w)     ? CLIP_TOP    : 0) |
-		       ((v.z > v.w)     ? CLIP_FAR    : 0) |
-		       ((v.x < -v.w)    ? CLIP_LEFT   : 0) |
-		       ((v.y < -v.w)    ? CLIP_BOTTOM : 0) |
-		       ((v.z < 0)       ? CLIP_NEAR   : 0) |
-		       Clipper::CLIP_FINITE;   // FIXME: xyz finite
-	}
+	const sw::float4 **V = polygon.P[polygon.i];
+	const sw::float4 **T = polygon.P[polygon.i + 1];
 
-	bool Clipper::Clip(Polygon &polygon, int clipFlagsOr, const DrawCall &draw)
+	int t = 0;
+
+	for(int i = 0; i < polygon.n; i++)
 	{
-		if(clipFlagsOr & CLIP_FRUSTUM)
+		int j = i == polygon.n - 1 ? 0 : i + 1;
+
+		float di = V[i]->z;
+		float dj = V[j]->z;
+
+		if(di >= 0)
 		{
-			if(clipFlagsOr & CLIP_NEAR)   clipNear(polygon);
-			if(polygon.n >= 3) {
-			if(clipFlagsOr & CLIP_FAR)    clipFar(polygon);
-			if(polygon.n >= 3) {
-			if(clipFlagsOr & CLIP_LEFT)   clipLeft(polygon);
-			if(polygon.n >= 3) {
-			if(clipFlagsOr & CLIP_RIGHT)  clipRight(polygon);
-			if(polygon.n >= 3) {
-			if(clipFlagsOr & CLIP_TOP)    clipTop(polygon);
-			if(polygon.n >= 3) {
-			if(clipFlagsOr & CLIP_BOTTOM) clipBottom(polygon);
-			}}}}}
-		}
+			T[t++] = V[i];
 
-		return polygon.n >= 3;
+			if(dj < 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+		else
+		{
+			if(dj > 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
 	}
+
+	polygon.n = t;
+	polygon.i += 1;
 }
+
+void clipFar(sw::Polygon &polygon)
+{
+	const sw::float4 **V = polygon.P[polygon.i];
+	const sw::float4 **T = polygon.P[polygon.i + 1];
+
+	int t = 0;
+
+	for(int i = 0; i < polygon.n; i++)
+	{
+		int j = i == polygon.n - 1 ? 0 : i + 1;
+
+		float di = V[i]->w - V[i]->z;
+		float dj = V[j]->w - V[j]->z;
+
+		if(di >= 0)
+		{
+			T[t++] = V[i];
+
+			if(dj < 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+		else
+		{
+			if(dj > 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+	}
+
+	polygon.n = t;
+	polygon.i += 1;
+}
+
+void clipLeft(sw::Polygon &polygon)
+{
+	const sw::float4 **V = polygon.P[polygon.i];
+	const sw::float4 **T = polygon.P[polygon.i + 1];
+
+	int t = 0;
+
+	for(int i = 0; i < polygon.n; i++)
+	{
+		int j = i == polygon.n - 1 ? 0 : i + 1;
+
+		float di = V[i]->w + V[i]->x;
+		float dj = V[j]->w + V[j]->x;
+
+		if(di >= 0)
+		{
+			T[t++] = V[i];
+
+			if(dj < 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+		else
+		{
+			if(dj > 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+	}
+
+	polygon.n = t;
+	polygon.i += 1;
+}
+
+void clipRight(sw::Polygon &polygon)
+{
+	const sw::float4 **V = polygon.P[polygon.i];
+	const sw::float4 **T = polygon.P[polygon.i + 1];
+
+	int t = 0;
+
+	for(int i = 0; i < polygon.n; i++)
+	{
+		int j = i == polygon.n - 1 ? 0 : i + 1;
+
+		float di = V[i]->w - V[i]->x;
+		float dj = V[j]->w - V[j]->x;
+
+		if(di >= 0)
+		{
+			T[t++] = V[i];
+
+			if(dj < 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+		else
+		{
+			if(dj > 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+	}
+
+	polygon.n = t;
+	polygon.i += 1;
+}
+
+void clipTop(sw::Polygon &polygon)
+{
+	const sw::float4 **V = polygon.P[polygon.i];
+	const sw::float4 **T = polygon.P[polygon.i + 1];
+
+	int t = 0;
+
+	for(int i = 0; i < polygon.n; i++)
+	{
+		int j = i == polygon.n - 1 ? 0 : i + 1;
+
+		float di = V[i]->w - V[i]->y;
+		float dj = V[j]->w - V[j]->y;
+
+		if(di >= 0)
+		{
+			T[t++] = V[i];
+
+			if(dj < 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+		else
+		{
+			if(dj > 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+	}
+
+	polygon.n = t;
+	polygon.i += 1;
+}
+
+void clipBottom(sw::Polygon &polygon)
+{
+	const sw::float4 **V = polygon.P[polygon.i];
+	const sw::float4 **T = polygon.P[polygon.i + 1];
+
+	int t = 0;
+
+	for(int i = 0; i < polygon.n; i++)
+	{
+		int j = i == polygon.n - 1 ? 0 : i + 1;
+
+		float di = V[i]->w + V[i]->y;
+		float dj = V[j]->w + V[j]->y;
+
+		if(di >= 0)
+		{
+			T[t++] = V[i];
+
+			if(dj < 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+		else
+		{
+			if(dj > 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+	}
+
+	polygon.n = t;
+	polygon.i += 1;
+}
+
+}  // anonymous namespace
+
+namespace sw {
+
+unsigned int Clipper::ComputeClipFlags(const float4 &v)
+{
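+	// A vertex is outside a frustum plane when its clip-space coordinate exceeds the +/-w bound (or has z < 0 for the near plane).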
+	return ((v.x > v.w)     ? CLIP_RIGHT  : 0) |
+	       ((v.y > v.w)     ? CLIP_TOP    : 0) |
+	       ((v.z > v.w)     ? CLIP_FAR    : 0) |
+	       ((v.x < -v.w)    ? CLIP_LEFT   : 0) |
+	       ((v.y < -v.w)    ? CLIP_BOTTOM : 0) |
+	       ((v.z < 0)       ? CLIP_NEAR   : 0) |
+	       Clipper::CLIP_FINITE;   // FIXME: xyz finite
+}
+
+bool Clipper::Clip(Polygon &polygon, int clipFlagsOr, const DrawCall &draw)
+{
+	if(clipFlagsOr & CLIP_FRUSTUM)
+	{
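+		// Clip against each plane in turn; every stage can only shrink the polygon,
+		// so stop as soon as fewer than three vertices remain.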
+		if(clipFlagsOr & CLIP_NEAR)   clipNear(polygon);
+		if(polygon.n >= 3) {
+		if(clipFlagsOr & CLIP_FAR)    clipFar(polygon);
+		if(polygon.n >= 3) {
+		if(clipFlagsOr & CLIP_LEFT)   clipLeft(polygon);
+		if(polygon.n >= 3) {
+		if(clipFlagsOr & CLIP_RIGHT)  clipRight(polygon);
+		if(polygon.n >= 3) {
+		if(clipFlagsOr & CLIP_TOP)    clipTop(polygon);
+		if(polygon.n >= 3) {
+		if(clipFlagsOr & CLIP_BOTTOM) clipBottom(polygon);
+		}}}}}
+	}
+
+	return polygon.n >= 3;
+}
+
+}  // namespace sw
diff --git a/src/Device/Clipper.hpp b/src/Device/Clipper.hpp
index 0d111fd..4992a57 100644
--- a/src/Device/Clipper.hpp
+++ b/src/Device/Clipper.hpp
@@ -15,32 +15,33 @@
 #ifndef sw_Clipper_hpp
 #define sw_Clipper_hpp
 
-namespace sw
+namespace sw {
+
+struct DrawCall;
+struct Polygon;
+struct float4;
+
+struct Clipper
 {
-	struct DrawCall;
-	struct Polygon;
-	struct float4;
-
-	struct Clipper
+	enum ClipFlags
 	{
-		enum ClipFlags
-		{
-			// Indicates the vertex is outside the respective frustum plane
-			CLIP_RIGHT  = 1 << 0,
-			CLIP_TOP    = 1 << 1,
-			CLIP_FAR    = 1 << 2,
-			CLIP_LEFT   = 1 << 3,
-			CLIP_BOTTOM = 1 << 4,
-			CLIP_NEAR   = 1 << 5,
+		// Indicates the vertex is outside the respective frustum plane
+		CLIP_RIGHT  = 1 << 0,
+		CLIP_TOP    = 1 << 1,
+		CLIP_FAR    = 1 << 2,
+		CLIP_LEFT   = 1 << 3,
+		CLIP_BOTTOM = 1 << 4,
+		CLIP_NEAR   = 1 << 5,
 
-			CLIP_FRUSTUM = 0x003F,
+		CLIP_FRUSTUM = 0x003F,
 
-			CLIP_FINITE = 1 << 7,   // All position coordinates are finite
-		};
-
-		static unsigned int ComputeClipFlags(const float4 &v);
-		static bool Clip(Polygon &polygon, int clipFlagsOr, const DrawCall &draw);
+		CLIP_FINITE = 1 << 7,   // All position coordinates are finite
 	};
-}
+
+	static unsigned int ComputeClipFlags(const float4 &v);
+	static bool Clip(Polygon &polygon, int clipFlagsOr, const DrawCall &draw);
+};
+
+}  // namespace sw
 
 #endif   // sw_Clipper_hpp
diff --git a/src/Device/Color.cpp b/src/Device/Color.cpp
index 9ad6767..d028028 100644
--- a/src/Device/Color.cpp
+++ b/src/Device/Color.cpp
@@ -14,6 +14,5 @@
 
 #include "Color.hpp"
 
-namespace sw
-{
-}
+namespace sw {
+}  // namespace sw
diff --git a/src/Device/Color.hpp b/src/Device/Color.hpp
index 0e6fc27..2b27e86 100644
--- a/src/Device/Color.hpp
+++ b/src/Device/Color.hpp
@@ -18,455 +18,456 @@
 #include "System/Types.hpp"
 #include "System/Math.hpp"
 
-namespace sw
+namespace sw {
+
+template<class T>
+struct Color
 {
-	template<class T>
-	struct Color
-	{
-		Color();
+	Color();
+
+	Color(const Color<byte> &c);
+	Color(const Color<short> &c);
+	Color(const Color<float> &c);
 	
-		Color(const Color<byte> &c);
-		Color(const Color<short> &c);
-		Color(const Color<float> &c);
-		
-		Color(int c);
-		Color(unsigned short c);
-		Color(unsigned long c);
-		Color(unsigned int c);
-		
-		Color(T r, T g, T b, T a = 1);
+	Color(int c);
+	Color(unsigned short c);
+	Color(unsigned long c);
+	Color(unsigned int c);
+
+	Color(T r, T g, T b, T a = 1);
 
-		operator unsigned int() const;
+	operator unsigned int() const;
 
-		T &operator[](int i);
-		const T &operator[](int i) const;
+	T &operator[](int i);
+	const T &operator[](int i) const;
 
-		Color<T> operator+() const;
-		Color<T> operator-() const;
+	Color<T> operator+() const;
+	Color<T> operator-() const;
 
-		Color<T>& operator=(const Color<T>& c);
+	Color<T>& operator=(const Color<T>& c);
 
-		Color<T> &operator+=(const Color<T> &c);
-		Color<T> &operator*=(float l);
+	Color<T> &operator+=(const Color<T> &c);
+	Color<T> &operator*=(float l);
 
-		static Color<T> gradient(const Color<T> &c1, const Color<T>  &c2, float d);
-		static Color<T> shade(const Color<T> &c1, const Color<T>  &c2, float d);
+	static Color<T> gradient(const Color<T> &c1, const Color<T>  &c2, float d);
+	static Color<T> shade(const Color<T> &c1, const Color<T>  &c2, float d);
 
-		template<class S>
-		friend Color<S> operator+(const Color<S> &c1, const Color<S> &c2);
-		template<class S>
-		friend Color<S> operator-(const Color<S> &c1, const Color<S> &c2);
+	template<class S>
+	friend Color<S> operator+(const Color<S> &c1, const Color<S> &c2);
+	template<class S>
+	friend Color<S> operator-(const Color<S> &c1, const Color<S> &c2);
 
-		template<class S>
-		friend Color<S> operator*(float l, const Color<S> &c);
-		template<class S>
-		friend Color<S> operator*(const Color<S> &c1, const Color<S> &c2);
-		template<class S>
-		friend Color<S> operator/(const Color<S> &c, float l);
+	template<class S>
+	friend Color<S> operator*(float l, const Color<S> &c);
+	template<class S>
+	friend Color<S> operator*(const Color<S> &c1, const Color<S> &c2);
+	template<class S>
+	friend Color<S> operator/(const Color<S> &c, float l);
 
-		T r;
-		T g;
-		T b;
-		T a;
-	};
+	T r;
+	T g;
+	T b;
+	T a;
+};
-}
+}  // namespace sw
 
 #include "System/Math.hpp"
 
-namespace sw
+namespace sw {
+
+template<class T>
+inline Color<T>::Color()
 {
-	template<class T>
-	inline Color<T>::Color()
-	{
-	}
-
-	template<>
-	inline Color<byte>::Color(const Color<byte> &c)
-	{
-		r = c.r;
-		g = c.g;
-		b = c.b;
-		a = c.a;
-	}
-
-	template<>
-	inline Color<byte>::Color(const Color<short> &c)
-	{
-		r = static_cast<byte>(clamp(c.r >> 4, 0, 255));
-		g = static_cast<byte>(clamp(c.g >> 4, 0, 255));
-		b = static_cast<byte>(clamp(c.b >> 4, 0, 255));
-		a = static_cast<byte>(clamp(c.a >> 4, 0, 255));
-	}
-
-	template<>
-	inline Color<byte>::Color(const Color<float> &c)
-	{
-		r = static_cast<byte>(ifloor(clamp(c.r * 256.0f, 0.0f, 255.0f)));
-		g = static_cast<byte>(ifloor(clamp(c.g * 256.0f, 0.0f, 255.0f)));
-		b = static_cast<byte>(ifloor(clamp(c.b * 256.0f, 0.0f, 255.0f)));
-		a = static_cast<byte>(ifloor(clamp(c.a * 256.0f, 0.0f, 255.0f)));
-	}
-
-	template<>
-	inline Color<short>::Color(const Color<short> &c)
-	{
-		r = c.r;
-		g = c.g;
-		b = c.b;
-		a = c.a;
-	}
-
-	template<>
-	inline Color<short>::Color(const Color<byte> &c)
-	{
-		r = c.r << 4;
-		g = c.g << 4;
-		b = c.b << 4;
-		a = c.a << 4;
-	}
-
-	template<>
-	inline Color<float>::Color(const Color<float> &c)
-	{
-		r = c.r;
-		g = c.g;
-		b = c.b;
-		a = c.a;
-	}
-
-	template<>
-	inline Color<short>::Color(const Color<float> &c)
-	{
-		r = static_cast<short>(iround(clamp(c.r * 4095.0f, -4096.0f, 4095.0f)));
-		g = static_cast<short>(iround(clamp(c.g * 4095.0f, -4096.0f, 4095.0f)));
-		b = static_cast<short>(iround(clamp(c.b * 4095.0f, -4096.0f, 4095.0f)));
-		a = static_cast<short>(iround(clamp(c.a * 4095.0f, -4096.0f, 4095.0f)));
-	}
-
-	template<>
-	inline Color<float>::Color(const Color<byte> &c)
-	{
-		r = c.r / 255.0f;
-		g = c.g / 255.0f;
-		b = c.b / 255.0f;
-		a = c.a / 255.0f;
-	}
-
-	template<>
-	inline Color<float>::Color(const Color<short> &c)
-	{
-		r = c.r / 4095.0f;
-		g = c.g / 4095.0f;
-		b = c.b / 4095.0f;
-		a = c.a / 4095.0f;
-	}
-
-	template<>
-	inline Color<float>::Color(unsigned short c)
-	{
-		r = (float)(c & 0xF800) / (float)0xF800;
-		g = (float)(c & 0x07E0) / (float)0x07E0;
-		b = (float)(c & 0x001F) / (float)0x001F;
-		a = 1;
-	}
-
-	template<>
-	inline Color<short>::Color(unsigned short c)
-	{
-		// 4.12 fixed-point format
-		r = ((c & 0xF800) >> 4) + ((c & 0xF800) >> 9) + ((c & 0xF800) >> 14);
-		g = ((c & 0x07E0) << 1) + ((c & 0x07E0) >> 5);
-		b = ((c & 0x001F) << 7) + ((c & 0x001F) << 2) + ((c & 0x001F) >> 3);
-		a = 0x1000;
-	}
-
-	template<>
-	inline Color<byte>::Color(unsigned short c)
-	{
-		r = (byte)(((c & 0xF800) >> 8) + ((c & 0xE000) >> 13));
-		g = (byte)(((c & 0x07E0) >> 3) + ((c & 0x0600) >> 9));
-		b = (byte)(((c & 0x001F) << 3) + ((c & 0x001C) >> 2));
-		a = 0xFF;
-	}
-
-	template<>
-	inline Color<float>::Color(int c)
-	{
-		const float d = 1.0f / 255.0f;
-
-		r = (float)((c & 0x00FF0000) >> 16) * d;
-		g = (float)((c & 0x0000FF00) >> 8) * d;
-		b = (float)((c & 0x000000FF) >> 0) * d;
-		a = (float)((c & 0xFF000000) >> 24) * d;
-	}
-
-	template<>
-	inline Color<short>::Color(int c)
-	{
-		// 4.12 fixed-point format
-		r = (short)((c & 0x00FF0000) >> 12);
-		g = (short)((c & 0x0000FF00) >> 4);
-		b = (short)((c & 0x000000FF) << 4);
-		a = (short)((c & 0xFF000000) >> 20);
-	}
-
-	template<>
-	inline Color<byte>::Color(int c)
-	{
-		r = (byte)((c & 0x00FF0000) >> 16);
-		g = (byte)((c & 0x0000FF00) >> 8);
-		b = (byte)((c & 0x000000FF) >> 0);
-		a = (byte)((c & 0xFF000000) >> 24);
-	}
-
-	template<>
-	inline Color<float>::Color(unsigned int c)
-	{
-		const float d = 1.0f / 255.0f;
-
-		r = (float)((c & 0x00FF0000) >> 16) * d;
-		g = (float)((c & 0x0000FF00) >> 8) * d;
-		b = (float)((c & 0x000000FF) >> 0) * d;
-		a = (float)((c & 0xFF000000) >> 24) * d;
-	}
-
-	template<>
-	inline Color<short>::Color(unsigned int c)
-	{
-		// 4.12 fixed-point format
-		r = (short)((c & 0x00FF0000) >> 12);
-		g = (short)((c & 0x0000FF00) >> 4);
-		b = (short)((c & 0x000000FF) << 4);
-		a = (short)((c & 0xFF000000) >> 20);
-	}
-
-	template<>
-	inline Color<byte>::Color(unsigned int c)
-	{
-		r = (byte)((c & 0x00FF0000) >> 16);
-		g = (byte)((c & 0x0000FF00) >> 8);
-		b = (byte)((c & 0x000000FF) >> 0);
-		a = (byte)((c & 0xFF000000) >> 24);
-	}
-
-	template<>
-	inline Color<float>::Color(unsigned long c)
-	{
-		const float d = 1.0f / 255.0f;
-
-		r = (float)((c & 0x00FF0000) >> 16) * d;
-		g = (float)((c & 0x0000FF00) >> 8) * d;
-		b = (float)((c & 0x000000FF) >> 0) * d;
-		a = (float)((c & 0xFF000000) >> 24) * d;
-	}
-
-	template<>
-	inline Color<short>::Color(unsigned long c)
-	{
-		// 4.12 fixed-point format
-		r = (short)((c & 0x00FF0000) >> 12);
-		g = (short)((c & 0x0000FF00) >> 4);
-		b = (short)((c & 0x000000FF) << 4);
-		a = (short)((c & 0xFF000000) >> 20);
-	}
-
-	template<>
-	inline Color<byte>::Color(unsigned long c)
-	{
-		r = (byte)((c & 0x00FF0000) >> 16);
-		g = (byte)((c & 0x0000FF00) >> 8);
-		b = (byte)((c & 0x000000FF) >> 0);
-		a = (byte)((c & 0xFF000000) >> 24);
-	}
-
-	template<class T>
-	inline Color<T>::Color(T r_, T g_, T b_, T a_)
-	{
-		r = r_;
-		g = g_;
-		b = b_;
-		a = a_;
-	}
-
-	template<>
-	inline Color<float>::operator unsigned int() const
-	{
-		return ((unsigned int)min(b * 255.0f, 255.0f) << 0) |
-		       ((unsigned int)min(g * 255.0f, 255.0f) << 8) |
-		       ((unsigned int)min(r * 255.0f, 255.0f) << 16) |
-		       ((unsigned int)min(a * 255.0f, 255.0f) << 24);
-	}
-
-	template<>
-	inline Color<short>::operator unsigned int() const
-	{
-		return ((unsigned int)min(b >> 4, 255) << 0) |
-		       ((unsigned int)min(g >> 4, 255) << 8) |
-		       ((unsigned int)min(r >> 4, 255) << 16) |
-		       ((unsigned int)min(a >> 4, 255) << 24);
-	}
-
-	template<>
-	inline Color<byte>::operator unsigned int() const
-	{
-		return (b << 0) +
-		       (g << 8) +
-		       (r << 16) +
-			   (a << 24);
-	}
-
-	template<class T>
-	inline T &Color<T>::operator[](int i)
-	{
-		return (&r)[i];
-	}
-
-	template<class T>
-	inline const T &Color<T>::operator[](int i) const
-	{
-		return (&r)[i];
-	}
-
-	template<class T>
-	inline Color<T> Color<T>::operator+() const
-	{
-		return *this;
-	}
-
-	template<class T>
-	inline Color<T> Color<T>::operator-() const
-	{
-		return Color(-r, -g, -b, -a);
-	}
-
-	template<class T>
-	inline Color<T> &Color<T>::operator=(const Color& c)
-	{
-		r = c.r;
-		g = c.g;
-		b = c.b;
-		a = c.a;
-
-		return *this;
-	}
-
-	template<class T>
-	inline Color<T> &Color<T>::operator+=(const Color &c)
-	{
-		r += c.r;
-		g += c.g;
-		b += c.b;
-		a += c.a;
-
-		return *this;
-	}
-
-	template<class T>
-	inline Color<T> &Color<T>::operator*=(float l)
-	{
-		*this = l * *this;
-
-		return *this;
-	}
-
-	template<class T>
-	inline Color<T> operator+(const Color<T> &c1, const Color<T> &c2)
-	{
-		return Color<T>(c1.r + c2.r,
-		                c1.g + c2.g,
-		                c1.b + c2.b,
-		                c1.a + c2.a);	
-	}
-
-	template<class T>
-	inline Color<T> operator-(const Color<T> &c1, const Color<T> &c2)
-	{
-		return Color<T>(c1.r - c2.r,
-		                c1.g - c2.g,
-		                c1.b - c2.b,
-		                c1.a - c2.a);	
-	}
-
-	template<class T>
-	inline Color<T> operator*(float l, const Color<T> &c)
-	{
-		T r = (T)(l * c.r);
-		T g = (T)(l * c.g);
-		T b = (T)(l * c.b);
-		T a = (T)(l * c.a);
-
-		return Color<T>(r, g, b, a);
-	}
-
-	template<class T>
-	inline Color<T> operator*(const Color<T> &c1, const Color<T> &c2)
-	{
-		T r = c1.r * c2.r;
-		T g = c1.g * c2.g;
-		T b = c1.b * c2.b;
-		T a = c1.a * c2.a;
-
-		return Color<T>(r, g, b, a);
-	}
-
-	template<>
-	inline Color<short> operator*(const Color<short> &c1, const Color<short> &c2)
-	{
-		short r = c1.r * c2.r >> 12;
-		short g = c1.g * c2.g >> 12;
-		short b = c1.b * c2.b >> 12;
-		short a = c1.a * c2.a >> 12;
-
-		return Color<short>(r, g, b, a);
-	}
-
-	template<>
-	inline Color<byte> operator*(const Color<byte> &c1, const Color<byte> &c2)
-	{
-		byte r = c1.r * c2.r >> 8;
-		byte g = c1.g * c2.g >> 8;
-		byte b = c1.b * c2.b >> 8;
-		byte a = c1.a * c2.a >> 8;
-
-		return Color<byte>(r, g, b, a);
-	}
-
-	template<class T>
-	inline Color<T> operator/(const Color<T> &c, float l)
-	{
-		l = 1.0f / l; 
-
-		T r = (T)(l * c.r);
-		T g = (T)(l * c.g);
-		T b = (T)(l * c.b);
-		T a = (T)(l * c.a);
-
-		return Color<T>(r, g, b, a);
-	}
-
-	template<class T>
-	inline Color<T> Color<T>::gradient(const Color<T> &c1, const Color<T> &c2, float d)
-	{
-		d = 1.0f / d; 
-
-		T r = (c2.r - c1.r) * d;
-		T g = (c2.g - c1.g) * d;
-		T b = (c2.b - c1.b) * d;
-		T a = (c2.a - c1.a) * d;
-
-		return Color<T>(r, g, b, a);
-	}
-
-	template<class T>
-	inline Color<T> Color<T>::shade(const Color<T> &c1, const Color<T>  &c2, float d)
-	{
-		T r = c1.r + (T)(d * (c2.r - c1.r));
-		T g = c1.g + (T)(d * (c2.g - c1.g));
-		T b = c1.b + (T)(d * (c2.b - c1.b));
-		T a = c1.a + (T)(d * (c2.a - c1.a));
-
-		return Color<T>(r, g, b, a);
-	}
 }
 
+template<>
+inline Color<byte>::Color(const Color<byte> &c)
+{
+	r = c.r;
+	g = c.g;
+	b = c.b;
+	a = c.a;
+}
+
+template<>
+inline Color<byte>::Color(const Color<short> &c)
+{
+	r = static_cast<byte>(clamp(c.r >> 4, 0, 255));
+	g = static_cast<byte>(clamp(c.g >> 4, 0, 255));
+	b = static_cast<byte>(clamp(c.b >> 4, 0, 255));
+	a = static_cast<byte>(clamp(c.a >> 4, 0, 255));
+}
+
+template<>
+inline Color<byte>::Color(const Color<float> &c)
+{
+	r = static_cast<byte>(ifloor(clamp(c.r * 256.0f, 0.0f, 255.0f)));
+	g = static_cast<byte>(ifloor(clamp(c.g * 256.0f, 0.0f, 255.0f)));
+	b = static_cast<byte>(ifloor(clamp(c.b * 256.0f, 0.0f, 255.0f)));
+	a = static_cast<byte>(ifloor(clamp(c.a * 256.0f, 0.0f, 255.0f)));
+}
+
+template<>
+inline Color<short>::Color(const Color<short> &c)
+{
+	r = c.r;
+	g = c.g;
+	b = c.b;
+	a = c.a;
+}
+
+template<>
+inline Color<short>::Color(const Color<byte> &c)
+{
+	r = c.r << 4;
+	g = c.g << 4;
+	b = c.b << 4;
+	a = c.a << 4;
+}
+
+template<>
+inline Color<float>::Color(const Color<float> &c)
+{
+	r = c.r;
+	g = c.g;
+	b = c.b;
+	a = c.a;
+}
+
+template<>
+inline Color<short>::Color(const Color<float> &c)
+{
+	r = static_cast<short>(iround(clamp(c.r * 4095.0f, -4096.0f, 4095.0f)));
+	g = static_cast<short>(iround(clamp(c.g * 4095.0f, -4096.0f, 4095.0f)));
+	b = static_cast<short>(iround(clamp(c.b * 4095.0f, -4096.0f, 4095.0f)));
+	a = static_cast<short>(iround(clamp(c.a * 4095.0f, -4096.0f, 4095.0f)));
+}
+
+template<>
+inline Color<float>::Color(const Color<byte> &c)
+{
+	r = c.r / 255.0f;
+	g = c.g / 255.0f;
+	b = c.b / 255.0f;
+	a = c.a / 255.0f;
+}
+
+template<>
+inline Color<float>::Color(const Color<short> &c)
+{
+	r = c.r / 4095.0f;
+	g = c.g / 4095.0f;
+	b = c.b / 4095.0f;
+	a = c.a / 4095.0f;
+}
+
+template<>
+inline Color<float>::Color(unsigned short c)
+{
+	r = (float)(c & 0xF800) / (float)0xF800;
+	g = (float)(c & 0x07E0) / (float)0x07E0;
+	b = (float)(c & 0x001F) / (float)0x001F;
+	a = 1;
+}
+
+template<>
+inline Color<short>::Color(unsigned short c)
+{
+	// 4.12 fixed-point format
+	r = ((c & 0xF800) >> 4) + ((c & 0xF800) >> 9) + ((c & 0xF800) >> 14);
+	g = ((c & 0x07E0) << 1) + ((c & 0x07E0) >> 5);
+	b = ((c & 0x001F) << 7) + ((c & 0x001F) << 2) + ((c & 0x001F) >> 3);
+	a = 0x1000;
+}
+
+template<>
+inline Color<byte>::Color(unsigned short c)
+{
+	r = (byte)(((c & 0xF800) >> 8) + ((c & 0xE000) >> 13));
+	g = (byte)(((c & 0x07E0) >> 3) + ((c & 0x0600) >> 9));
+	b = (byte)(((c & 0x001F) << 3) + ((c & 0x001C) >> 2));
+	a = 0xFF;
+}
+
+template<>
+inline Color<float>::Color(int c)
+{
+	const float d = 1.0f / 255.0f;
+
+	r = (float)((c & 0x00FF0000) >> 16) * d;
+	g = (float)((c & 0x0000FF00) >> 8) * d;
+	b = (float)((c & 0x000000FF) >> 0) * d;
+	a = (float)((c & 0xFF000000) >> 24) * d;
+}
+
+template<>
+inline Color<short>::Color(int c)
+{
+	// 4.12 fixed-point format
+	r = (short)((c & 0x00FF0000) >> 12);
+	g = (short)((c & 0x0000FF00) >> 4);
+	b = (short)((c & 0x000000FF) << 4);
+	a = (short)((c & 0xFF000000) >> 20);
+}
+
+template<>
+inline Color<byte>::Color(int c)
+{
+	r = (byte)((c & 0x00FF0000) >> 16);
+	g = (byte)((c & 0x0000FF00) >> 8);
+	b = (byte)((c & 0x000000FF) >> 0);
+	a = (byte)((c & 0xFF000000) >> 24);
+}
+
+template<>
+inline Color<float>::Color(unsigned int c)
+{
+	const float d = 1.0f / 255.0f;
+
+	r = (float)((c & 0x00FF0000) >> 16) * d;
+	g = (float)((c & 0x0000FF00) >> 8) * d;
+	b = (float)((c & 0x000000FF) >> 0) * d;
+	a = (float)((c & 0xFF000000) >> 24) * d;
+}
+
+template<>
+inline Color<short>::Color(unsigned int c)
+{
+	// 4.12 fixed-point format
+	r = (short)((c & 0x00FF0000) >> 12);
+	g = (short)((c & 0x0000FF00) >> 4);
+	b = (short)((c & 0x000000FF) << 4);
+	a = (short)((c & 0xFF000000) >> 20);
+}
+
+template<>
+inline Color<byte>::Color(unsigned int c)
+{
+	r = (byte)((c & 0x00FF0000) >> 16);
+	g = (byte)((c & 0x0000FF00) >> 8);
+	b = (byte)((c & 0x000000FF) >> 0);
+	a = (byte)((c & 0xFF000000) >> 24);
+}
+
+template<>
+inline Color<float>::Color(unsigned long c)
+{
+	const float d = 1.0f / 255.0f;
+
+	r = (float)((c & 0x00FF0000) >> 16) * d;
+	g = (float)((c & 0x0000FF00) >> 8) * d;
+	b = (float)((c & 0x000000FF) >> 0) * d;
+	a = (float)((c & 0xFF000000) >> 24) * d;
+}
+
+template<>
+inline Color<short>::Color(unsigned long c)
+{
+	// 4.12 fixed-point format
+	r = (short)((c & 0x00FF0000) >> 12);
+	g = (short)((c & 0x0000FF00) >> 4);
+	b = (short)((c & 0x000000FF) << 4);
+	a = (short)((c & 0xFF000000) >> 20);
+}
+
+template<>
+inline Color<byte>::Color(unsigned long c)
+{
+	r = (byte)((c & 0x00FF0000) >> 16);
+	g = (byte)((c & 0x0000FF00) >> 8);
+	b = (byte)((c & 0x000000FF) >> 0);
+	a = (byte)((c & 0xFF000000) >> 24);
+}
+
+template<class T>
+inline Color<T>::Color(T r_, T g_, T b_, T a_)
+{
+	r = r_;
+	g = g_;
+	b = b_;
+	a = a_;
+}
+
+template<>
+inline Color<float>::operator unsigned int() const
+{
+	return ((unsigned int)min(b * 255.0f, 255.0f) << 0) |
+	       ((unsigned int)min(g * 255.0f, 255.0f) << 8) |
+	       ((unsigned int)min(r * 255.0f, 255.0f) << 16) |
+	       ((unsigned int)min(a * 255.0f, 255.0f) << 24);
+}
+
+template<>
+inline Color<short>::operator unsigned int() const
+{
+	return ((unsigned int)min(b >> 4, 255) << 0) |
+	       ((unsigned int)min(g >> 4, 255) << 8) |
+	       ((unsigned int)min(r >> 4, 255) << 16) |
+	       ((unsigned int)min(a >> 4, 255) << 24);
+}
+
+template<>
+inline Color<byte>::operator unsigned int() const
+{
+	return (b << 0) +
+	       (g << 8) +
+	       (r << 16) +
+	       (a << 24);
+}
+
+template<class T>
+inline T &Color<T>::operator[](int i)
+{
+	return (&r)[i];
+}
+
+template<class T>
+inline const T &Color<T>::operator[](int i) const
+{
+	return (&r)[i];
+}
+
+template<class T>
+inline Color<T> Color<T>::operator+() const
+{
+	return *this;
+}
+
+template<class T>
+inline Color<T> Color<T>::operator-() const
+{
+	return Color(-r, -g, -b, -a);
+}
+
+template<class T>
+inline Color<T> &Color<T>::operator=(const Color& c)
+{
+	r = c.r;
+	g = c.g;
+	b = c.b;
+	a = c.a;
+
+	return *this;
+}
+
+template<class T>
+inline Color<T> &Color<T>::operator+=(const Color &c)
+{
+	r += c.r;
+	g += c.g;
+	b += c.b;
+	a += c.a;
+
+	return *this;
+}
+
+template<class T>
+inline Color<T> &Color<T>::operator*=(float l)
+{
+	*this = l * *this;
+
+	return *this;
+}
+
+template<class T>
+inline Color<T> operator+(const Color<T> &c1, const Color<T> &c2)
+{
+	return Color<T>(c1.r + c2.r,
+	                c1.g + c2.g,
+	                c1.b + c2.b,
+	                c1.a + c2.a);
+}
+
+template<class T>
+inline Color<T> operator-(const Color<T> &c1, const Color<T> &c2)
+{
+	return Color<T>(c1.r - c2.r,
+	                c1.g - c2.g,
+	                c1.b - c2.b,
+	                c1.a - c2.a);
+}
+
+template<class T>
+inline Color<T> operator*(float l, const Color<T> &c)
+{
+	T r = (T)(l * c.r);
+	T g = (T)(l * c.g);
+	T b = (T)(l * c.b);
+	T a = (T)(l * c.a);
+
+	return Color<T>(r, g, b, a);
+}
+
+template<class T>
+inline Color<T> operator*(const Color<T> &c1, const Color<T> &c2)
+{
+	T r = c1.r * c2.r;
+	T g = c1.g * c2.g;
+	T b = c1.b * c2.b;
+	T a = c1.a * c2.a;
+
+	return Color<T>(r, g, b, a);
+}
+
+template<>
+inline Color<short> operator*(const Color<short> &c1, const Color<short> &c2)
+{
+	short r = c1.r * c2.r >> 12;
+	short g = c1.g * c2.g >> 12;
+	short b = c1.b * c2.b >> 12;
+	short a = c1.a * c2.a >> 12;
+
+	return Color<short>(r, g, b, a);
+}
+
+template<>
+inline Color<byte> operator*(const Color<byte> &c1, const Color<byte> &c2)
+{
+	byte r = c1.r * c2.r >> 8;
+	byte g = c1.g * c2.g >> 8;
+	byte b = c1.b * c2.b >> 8;
+	byte a = c1.a * c2.a >> 8;
+
+	return Color<byte>(r, g, b, a);
+}
+
+template<class T>
+inline Color<T> operator/(const Color<T> &c, float l)
+{
+	l = 1.0f / l;
+
+	T r = (T)(l * c.r);
+	T g = (T)(l * c.g);
+	T b = (T)(l * c.b);
+	T a = (T)(l * c.a);
+
+	return Color<T>(r, g, b, a);
+}
+
+template<class T>
+inline Color<T> Color<T>::gradient(const Color<T> &c1, const Color<T> &c2, float d)
+{
+	d = 1.0f / d;
+
+	T r = (c2.r - c1.r) * d;
+	T g = (c2.g - c1.g) * d;
+	T b = (c2.b - c1.b) * d;
+	T a = (c2.a - c1.a) * d;
+
+	return Color<T>(r, g, b, a);
+}
+
+template<class T>
+inline Color<T> Color<T>::shade(const Color<T> &c1, const Color<T>  &c2, float d)
+{
+	T r = c1.r + (T)(d * (c2.r - c1.r));
+	T g = c1.g + (T)(d * (c2.g - c1.g));
+	T b = c1.b + (T)(d * (c2.b - c1.b));
+	T a = c1.a + (T)(d * (c2.a - c1.a));
+
+	return Color<T>(r, g, b, a);
+}
+
+}  // namespace sw
+
 #endif   // sw_Color_hpp
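
The byte/short/float constructors above all encode the same convention: byte channels are plain 8-bit values, while short channels hold a 4.12-style fixed-point value (the comments in the code call it out), so byte-to-short is a left shift by 4 and float-to-short scales into the 12-bit range with rounding and clamping. A small self-contained sketch of the round trip, under those assumptions (hypothetical standalone code, not the Color<T> class itself):

#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
	unsigned char byteRed = 200;

	// byte -> short: promote 8 bits to 12 fractional bits.
	short shortRed = short(byteRed << 4);   // 3200

	// float -> short: scale by 4095, round, clamp; this mirrors the
	// Color<short>::Color(const Color<float>&) conversion above.
	float floatRed = 200.0f / 255.0f;
	short fromFloat = short(std::lround(std::min(std::max(floatRed * 4095.0f, -4096.0f), 4095.0f)));   // 3212

	// short -> byte: drop the 4 extra fractional bits, clamped to [0, 255].
	unsigned char backToByte = (unsigned char)std::min(std::max(shortRed >> 4, 0), 255);   // 200 again

	printf("%d %d %d\n", shortRed, fromFloat, backToByte);
	return 0;
}

Note that the byte -> short -> byte round trip is exact, while paths through float round to the nearest representable step.
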
diff --git a/src/Device/Config.cpp b/src/Device/Config.cpp
index 6eb61ab..5a2de75 100644
--- a/src/Device/Config.cpp
+++ b/src/Device/Config.cpp
@@ -16,37 +16,38 @@
 
 #include "System/Timer.hpp"
 
-namespace sw
+namespace sw {
+
+Profiler profiler;
+
+Profiler::Profiler()
 {
-	Profiler profiler;
+	reset();
+}
 
-	Profiler::Profiler()
-	{
-		reset();
-	}
+void Profiler::reset()
+{
+	framesSec = 0;
+	framesTotal = 0;
+	FPS = 0;
+}
 
-	void Profiler::reset()
+void Profiler::nextFrame()
+{
+	static double fpsTime = sw::Timer::seconds();
+
+	double time = sw::Timer::seconds();
+	double delta = time - fpsTime;
+	framesSec++;
+
+	if(delta > 1.0)
 	{
+		FPS = framesSec / delta;
+
+		fpsTime = time;
+		framesTotal += framesSec;
 		framesSec = 0;
-		framesTotal = 0;
-		FPS = 0;
 	}
+}
 
-	void Profiler::nextFrame()
-	{
-		static double fpsTime = sw::Timer::seconds();
-
-		double time = sw::Timer::seconds();
-		double delta = time - fpsTime;
-		framesSec++;
-
-		if(delta > 1.0)
-		{
-			FPS = framesSec / delta;
-
-			fpsTime = time;
-			framesTotal += framesSec;
-			framesSec = 0;
-		}
-	}
-}
\ No newline at end of file
+}  // namespace sw
\ No newline at end of file
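
Profiler::nextFrame() above accumulates frames and refreshes the FPS value at most once per second of wall-clock time. A self-contained sketch of the same pattern, using std::chrono as a stand-in for sw::Timer (hypothetical example code, not SwiftShader's):

#include <chrono>
#include <cstdio>

double seconds()
{
	using namespace std::chrono;
	return duration<double>(steady_clock::now().time_since_epoch()).count();
}

struct FpsCounter
{
	int framesSec = 0;
	double FPS = 0.0;
	double fpsTime = seconds();

	void nextFrame()
	{
		double time = seconds();
		double delta = time - fpsTime;
		framesSec++;

		if(delta > 1.0)   // refresh at most once per second
		{
			FPS = framesSec / delta;
			fpsTime = time;
			framesSec = 0;
		}
	}
};

int main()
{
	FpsCounter counter;
	double start = seconds();

	while(seconds() - start < 2.0)   // spin through ~2 seconds of "frames"
	{
		counter.nextFrame();
	}

	printf("FPS: %.0f\n", counter.FPS);
	return 0;
}
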
diff --git a/src/Device/Config.hpp b/src/Device/Config.hpp
index e1e0235..7584f07 100644
--- a/src/Device/Config.hpp
+++ b/src/Device/Config.hpp
@@ -17,49 +17,50 @@
 
 #include "System/Types.hpp"
 
-namespace sw
+namespace sw {
+
+enum
 {
-	enum
-	{
-		PERF_PIXEL,
-		PERF_PIPE,
-		PERF_INTERP,
-		PERF_SHADER,
-		PERF_TEX,
-		PERF_ROP,
+	PERF_PIXEL,
+	PERF_PIPE,
+	PERF_INTERP,
+	PERF_SHADER,
+	PERF_TEX,
+	PERF_ROP,
 
-		PERF_TIMERS
-	};
+	PERF_TIMERS
+};
 
-	struct Profiler
-	{
-		Profiler();
+struct Profiler
+{
+	Profiler();
 
-		void reset();
-		void nextFrame();
+	void reset();
+	void nextFrame();
 
-		int framesSec;
-		int framesTotal;
-		double FPS;
-	};
+	int framesSec;
+	int framesTotal;
+	double FPS;
+};
 
-	extern Profiler profiler;
+extern Profiler profiler;
 
-	enum
-	{
-		OUTLINE_RESOLUTION = 8192,   // Maximum vertical resolution of the render target
-		MIPMAP_LEVELS = 14,
-		MAX_UNIFORM_BLOCK_SIZE = 16384,
-		MAX_CLIP_DISTANCES = 8,
-		MAX_CULL_DISTANCES = 8,
-		MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS = 64,
-		MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS = 64,
-		MIN_TEXEL_OFFSET = -8,
-		MAX_TEXEL_OFFSET = 7,
-		MAX_TEXTURE_LOD = MIPMAP_LEVELS - 2,   // Trilinear accesses lod+1
-		RENDERTARGETS = 8,
-		MAX_INTERFACE_COMPONENTS = 32 * 4,  // Must be multiple of 4 for 16-byte alignment.
-	};
-}
+enum
+{
+	OUTLINE_RESOLUTION = 8192,   // Maximum vertical resolution of the render target
+	MIPMAP_LEVELS = 14,
+	MAX_UNIFORM_BLOCK_SIZE = 16384,
+	MAX_CLIP_DISTANCES = 8,
+	MAX_CULL_DISTANCES = 8,
+	MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS = 64,
+	MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS = 64,
+	MIN_TEXEL_OFFSET = -8,
+	MAX_TEXEL_OFFSET = 7,
+	MAX_TEXTURE_LOD = MIPMAP_LEVELS - 2,   // Trilinear accesses lod+1
+	RENDERTARGETS = 8,
+	MAX_INTERFACE_COMPONENTS = 32 * 4,  // Must be multiple of 4 for 16-byte alignment.
+};
+
+}  // namespace sw
 
 #endif   // sw_Config_hpp
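
One constant above encodes a non-obvious invariant: MAX_TEXTURE_LOD is MIPMAP_LEVELS - 2 rather than MIPMAP_LEVELS - 1 because trilinear filtering samples both lod and lod+1, so the highest selectable lod must leave one valid level above it. A compile-time check of that reasoning (hypothetical sketch, not part of the change):

enum { MIPMAP_LEVELS = 14, MAX_TEXTURE_LOD = MIPMAP_LEVELS - 2 };

// Levels are numbered 0 .. MIPMAP_LEVELS - 1; trilinear reads lod and lod + 1.
static_assert(MAX_TEXTURE_LOD + 1 <= MIPMAP_LEVELS - 1, "trilinear filtering needs lod + 1 to be a valid mip level");
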
diff --git a/src/Device/Context.cpp b/src/Device/Context.cpp
index 49505c9..e41ce74 100644
--- a/src/Device/Context.cpp
+++ b/src/Device/Context.cpp
@@ -22,552 +22,553 @@
 
 #include <string.h>
 
-namespace sw
+namespace sw {
+
+Context::Context()
 {
-	Context::Context()
+	init();
+}
+
+bool Context::isDrawPoint(bool polygonModeAware) const
+{
+	switch(topology)
 	{
-		init();
-	}
-
-	bool Context::isDrawPoint(bool polygonModeAware) const
-	{
-		switch(topology)
-		{
-		case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
-			return true;
-		case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
-		case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
-			return false;
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
-			return polygonModeAware ? (polygonMode == VK_POLYGON_MODE_POINT) : false;
-		default:
-			UNIMPLEMENTED("topology %d", int(topology));
-		}
-		return false;
-	}
-
-	bool Context::isDrawLine(bool polygonModeAware) const
-	{
-		switch(topology)
-		{
-		case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
-			return false;
-		case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
-		case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
-			return true;
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
-			return polygonModeAware ? (polygonMode == VK_POLYGON_MODE_LINE) : false;
-		default:
-			UNIMPLEMENTED("topology %d", int(topology));
-		}
-		return false;
-	}
-
-	bool Context::isDrawTriangle(bool polygonModeAware) const
-	{
-		switch(topology)
-		{
-		case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
-		case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
-		case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
-			return false;
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
-			return polygonModeAware ? (polygonMode == VK_POLYGON_MODE_FILL) : true;
-		default:
-			UNIMPLEMENTED("topology %d", int(topology));
-		}
-		return false;
-	}
-
-	void Context::init()
-	{
-		for(int i = 0; i < RENDERTARGETS; ++i)
-		{
-			renderTarget[i] = nullptr;
-		}
-
-		depthBuffer = nullptr;
-		stencilBuffer = nullptr;
-
-		stencilEnable = false;
-		frontStencil = {};
-		backStencil = {};
-
-		robustBufferAccess = false;
-
-		rasterizerDiscard = false;
-
-		depthCompareMode = VK_COMPARE_OP_LESS;
-		depthBoundsTestEnable = false;
-		depthBufferEnable = false;
-		depthWriteEnable = false;
-
-		cullMode = VK_CULL_MODE_FRONT_BIT;
-		frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE;
-		provokingVertexMode = VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT;
-		lineRasterizationMode = VK_LINE_RASTERIZATION_MODE_DEFAULT_EXT;
-
-		depthBias = 0.0f;
-		slopeDepthBias = 0.0f;
-
-		for(int i = 0; i < RENDERTARGETS; i++)
-		{
-			colorWriteMask[i] = 0x0000000F;
-		}
-
-		pipelineLayout = nullptr;
-
-		pixelShader = nullptr;
-		vertexShader = nullptr;
-
-		occlusionEnabled = false;
-
-		lineWidth = 1.0f;
-
-		sampleMask = 0xFFFFFFFF;
-		alphaToCoverage = false;
-	}
-
-	bool Context::depthWriteActive() const
-	{
-		if(!depthBufferActive()) return false;
-
-		return depthWriteEnable;
-	}
-
-	bool Context::depthBufferActive() const
-	{
-		return depthBuffer && depthBufferEnable;
-	}
-
-	bool Context::stencilActive() const
-	{
-		return stencilBuffer && stencilEnable;
-	}
-
-	void Context::setBlendState(int index, BlendState state)
-	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		blendState[index] = state;
-	}
-
-	BlendState Context::getBlendState(int index) const
-	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		BlendState activeBlendState;
-		activeBlendState.alphaBlendEnable = alphaBlendActive(index);
-		activeBlendState.sourceBlendFactor = sourceBlendFactor(index);
-		activeBlendState.destBlendFactor = destBlendFactor(index);
-		activeBlendState.blendOperation = blendOperation(index);
-		activeBlendState.sourceBlendFactorAlpha = sourceBlendFactorAlpha(index);
-		activeBlendState.destBlendFactorAlpha = destBlendFactorAlpha(index);
-		activeBlendState.blendOperationAlpha = blendOperationAlpha(index);
-		return activeBlendState;
-	}
-
-	bool Context::alphaBlendActive(int index) const
-	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		if(!blendState[index].alphaBlendEnable)
-		{
-			return false;
-		}
-
-		if(!colorUsed())
-		{
-			return false;
-		}
-
-		bool colorBlend = !(blendOperation(index) == VK_BLEND_OP_SRC_EXT && sourceBlendFactor(index) == VK_BLEND_FACTOR_ONE);
-		bool alphaBlend = !(blendOperationAlpha(index) == VK_BLEND_OP_SRC_EXT && sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE);
-
-		return colorBlend || alphaBlend;
-	}
-
-	VkBlendFactor Context::sourceBlendFactor(int index) const
-	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		if(!blendState[index].alphaBlendEnable) return VK_BLEND_FACTOR_ONE;
-
-		switch(blendState[index].blendOperation)
-		{
-		case VK_BLEND_OP_ADD:
-		case VK_BLEND_OP_SUBTRACT:
-		case VK_BLEND_OP_REVERSE_SUBTRACT:
-			return blendState[index].sourceBlendFactor;
-		case VK_BLEND_OP_MIN:
-			return VK_BLEND_FACTOR_ONE;
-		case VK_BLEND_OP_MAX:
-			return VK_BLEND_FACTOR_ONE;
-		default:
-			ASSERT(false);
-		}
-
-		return blendState[index].sourceBlendFactor;
-	}
-
-	VkBlendFactor Context::destBlendFactor(int index) const
-	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		if(!blendState[index].alphaBlendEnable) return VK_BLEND_FACTOR_ONE;
-
-		switch(blendState[index].blendOperation)
-		{
-		case VK_BLEND_OP_ADD:
-		case VK_BLEND_OP_SUBTRACT:
-		case VK_BLEND_OP_REVERSE_SUBTRACT:
-			return blendState[index].destBlendFactor;
-		case VK_BLEND_OP_MIN:
-			return VK_BLEND_FACTOR_ONE;
-		case VK_BLEND_OP_MAX:
-			return VK_BLEND_FACTOR_ONE;
-		default:
-			ASSERT(false);
-		}
-
-		return blendState[index].destBlendFactor;
-	}
-
-	bool Context::allTargetsColorClamp() const
-	{
-		// TODO: remove all of this and support VkPhysicalDeviceFeatures::independentBlend instead
-		for (int i = 0; i < RENDERTARGETS; i++)
-		{
-			if (renderTarget[i] && renderTarget[i]->getFormat().isFloatFormat())
-			{
-				return false;
-			}
-		}
-
+	case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
 		return true;
+	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
+	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
+		return false;
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
+		return polygonModeAware ? (polygonMode == VK_POLYGON_MODE_POINT) : false;
+	default:
+		UNIMPLEMENTED("topology %d", int(topology));
+	}
+	return false;
+}
+
+bool Context::isDrawLine(bool polygonModeAware) const
+{
+	switch(topology)
+	{
+	case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
+		return false;
+	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
+	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
+		return true;
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
+		return polygonModeAware ? (polygonMode == VK_POLYGON_MODE_LINE) : false;
+	default:
+		UNIMPLEMENTED("topology %d", int(topology));
+	}
+	return false;
+}
+
+bool Context::isDrawTriangle(bool polygonModeAware) const
+{
+	switch(topology)
+	{
+	case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
+	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
+	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
+		return false;
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
+		return polygonModeAware ? (polygonMode == VK_POLYGON_MODE_FILL) : true;
+	default:
+		UNIMPLEMENTED("topology %d", int(topology));
+	}
+	return false;
+}
+
+void Context::init()
+{
+	for(int i = 0; i < RENDERTARGETS; ++i)
+	{
+		renderTarget[i] = nullptr;
 	}
 
-	VkBlendOp Context::blendOperation(int index) const
+	depthBuffer = nullptr;
+	stencilBuffer = nullptr;
+
+	stencilEnable = false;
+	frontStencil = {};
+	backStencil = {};
+
+	robustBufferAccess = false;
+
+	rasterizerDiscard = false;
+
+	depthCompareMode = VK_COMPARE_OP_LESS;
+	depthBoundsTestEnable = false;
+	depthBufferEnable = false;
+	depthWriteEnable = false;
+
+	cullMode = VK_CULL_MODE_FRONT_BIT;
+	frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE;
+	provokingVertexMode = VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT;
+	lineRasterizationMode = VK_LINE_RASTERIZATION_MODE_DEFAULT_EXT;
+
+	depthBias = 0.0f;
+	slopeDepthBias = 0.0f;
+
+	for(int i = 0; i < RENDERTARGETS; i++)
 	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
+		colorWriteMask[i] = 0x0000000F;
+	}
 
-		if(!blendState[index].alphaBlendEnable) return VK_BLEND_OP_SRC_EXT;
+	pipelineLayout = nullptr;
 
-		switch(blendState[index].blendOperation)
+	pixelShader = nullptr;
+	vertexShader = nullptr;
+
+	occlusionEnabled = false;
+
+	lineWidth = 1.0f;
+
+	sampleMask = 0xFFFFFFFF;
+	alphaToCoverage = false;
+}
+
+bool Context::depthWriteActive() const
+{
+	if(!depthBufferActive()) return false;
+
+	return depthWriteEnable;
+}
+
+bool Context::depthBufferActive() const
+{
+	return depthBuffer && depthBufferEnable;
+}
+
+bool Context::stencilActive() const
+{
+	return stencilBuffer && stencilEnable;
+}
+
+void Context::setBlendState(int index, BlendState state)
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	blendState[index] = state;
+}
+
+BlendState Context::getBlendState(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	BlendState activeBlendState;
+	activeBlendState.alphaBlendEnable = alphaBlendActive(index);
+	activeBlendState.sourceBlendFactor = sourceBlendFactor(index);
+	activeBlendState.destBlendFactor = destBlendFactor(index);
+	activeBlendState.blendOperation = blendOperation(index);
+	activeBlendState.sourceBlendFactorAlpha = sourceBlendFactorAlpha(index);
+	activeBlendState.destBlendFactorAlpha = destBlendFactorAlpha(index);
+	activeBlendState.blendOperationAlpha = blendOperationAlpha(index);
+	return activeBlendState;
+}
+
+bool Context::alphaBlendActive(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	if(!blendState[index].alphaBlendEnable)
+	{
+		return false;
+	}
+
+	if(!colorUsed())
+	{
+		return false;
+	}
+
+	bool colorBlend = !(blendOperation(index) == VK_BLEND_OP_SRC_EXT && sourceBlendFactor(index) == VK_BLEND_FACTOR_ONE);
+	bool alphaBlend = !(blendOperationAlpha(index) == VK_BLEND_OP_SRC_EXT && sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE);
+
+	return colorBlend || alphaBlend;
+}
+
+VkBlendFactor Context::sourceBlendFactor(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	if(!blendState[index].alphaBlendEnable) return VK_BLEND_FACTOR_ONE;
+
+	switch(blendState[index].blendOperation)
+	{
+	case VK_BLEND_OP_ADD:
+	case VK_BLEND_OP_SUBTRACT:
+	case VK_BLEND_OP_REVERSE_SUBTRACT:
+		return blendState[index].sourceBlendFactor;
+	case VK_BLEND_OP_MIN:
+		return VK_BLEND_FACTOR_ONE;
+	case VK_BLEND_OP_MAX:
+		return VK_BLEND_FACTOR_ONE;
+	default:
+		ASSERT(false);
+	}
+
+	return blendState[index].sourceBlendFactor;
+}
+
+VkBlendFactor Context::destBlendFactor(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	if(!blendState[index].alphaBlendEnable) return VK_BLEND_FACTOR_ONE;
+
+	switch(blendState[index].blendOperation)
+	{
+	case VK_BLEND_OP_ADD:
+	case VK_BLEND_OP_SUBTRACT:
+	case VK_BLEND_OP_REVERSE_SUBTRACT:
+		return blendState[index].destBlendFactor;
+	case VK_BLEND_OP_MIN:
+		return VK_BLEND_FACTOR_ONE;
+	case VK_BLEND_OP_MAX:
+		return VK_BLEND_FACTOR_ONE;
+	default:
+		ASSERT(false);
+	}
+
+	return blendState[index].destBlendFactor;
+}
+
+bool Context::allTargetsColorClamp() const
+{
+	// TODO: remove all of this and support VkPhysicalDeviceFeatures::independentBlend instead
+	for (int i = 0; i < RENDERTARGETS; i++)
+	{
+		if (renderTarget[i] && renderTarget[i]->getFormat().isFloatFormat())
 		{
-		case VK_BLEND_OP_ADD:
-			if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
-			{
-				if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_ZERO_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_DST_EXT;
-				}
-			}
-			else if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ONE)
-			{
-				if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_SRC_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_ADD;
-				}
-			}
-			else
-			{
-				if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_SRC_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_ADD;
-				}
-			}
-		case VK_BLEND_OP_SUBTRACT:
-			if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
-			{
-				return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
-			}
-			else if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ONE)
-			{
-				if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_SRC_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_SUBTRACT;
-				}
-			}
-			else
-			{
-				if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_SRC_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_SUBTRACT;
-				}
-			}
-		case VK_BLEND_OP_REVERSE_SUBTRACT:
-			if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
-			{
-				if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_ZERO_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_DST_EXT;
-				}
-			}
-			else if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ONE)
-			{
-				if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
-				{
-					return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
-				}
-				else
-				{
-					return VK_BLEND_OP_REVERSE_SUBTRACT;
-				}
-			}
-			else
-			{
-				if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
-				{
-					return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
-				}
-				else
-				{
-					return VK_BLEND_OP_REVERSE_SUBTRACT;
-				}
-			}
-		case VK_BLEND_OP_MIN:
-			return VK_BLEND_OP_MIN;
-		case VK_BLEND_OP_MAX:
-			return VK_BLEND_OP_MAX;
-		default:
-			ASSERT(false);
+			return false;
 		}
-
-		return blendState[index].blendOperation;
 	}
 
-	VkBlendFactor Context::sourceBlendFactorAlpha(int index) const
+	return true;
+}
+
+VkBlendOp Context::blendOperation(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	if(!blendState[index].alphaBlendEnable) return VK_BLEND_OP_SRC_EXT;
+
+	switch(blendState[index].blendOperation)
 	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		switch (blendState[index].blendOperationAlpha)
+	case VK_BLEND_OP_ADD:
+		if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
 		{
-		case VK_BLEND_OP_ADD:
-		case VK_BLEND_OP_SUBTRACT:
-		case VK_BLEND_OP_REVERSE_SUBTRACT:
-			return blendState[index].sourceBlendFactorAlpha;
-		case VK_BLEND_OP_MIN:
-			return VK_BLEND_FACTOR_ONE;
-		case VK_BLEND_OP_MAX:
-			return VK_BLEND_FACTOR_ONE;
-		default:
-			ASSERT(false);
-		}
-
-		return blendState[index].sourceBlendFactorAlpha;
-	}
-
-	VkBlendFactor Context::destBlendFactorAlpha(int index) const
-	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		switch (blendState[index].blendOperationAlpha)
-		{
-		case VK_BLEND_OP_ADD:
-		case VK_BLEND_OP_SUBTRACT:
-		case VK_BLEND_OP_REVERSE_SUBTRACT:
-			return blendState[index].destBlendFactorAlpha;
-		case VK_BLEND_OP_MIN:
-			return VK_BLEND_FACTOR_ONE;
-		case VK_BLEND_OP_MAX:
-			return VK_BLEND_FACTOR_ONE;
-		default:
-			ASSERT(false);
-		}
-
-		return blendState[index].destBlendFactorAlpha;
-	}
-
-	VkBlendOp Context::blendOperationAlpha(int index) const
-	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		switch (blendState[index].blendOperationAlpha)
-		{
-		case VK_BLEND_OP_ADD:
-			if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+			if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
 			{
-				if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_ZERO_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_DST_EXT;
-				}
-			}
-			else if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE)
-			{
-				if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_SRC_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_ADD;
-				}
+				return VK_BLEND_OP_ZERO_EXT;
 			}
 			else
 			{
-				if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_SRC_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_ADD;
-				}
+				return VK_BLEND_OP_DST_EXT;
 			}
-		case VK_BLEND_OP_SUBTRACT:
-			if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
-			{
-				return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
-			}
-			else if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE)
-			{
-				if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_SRC_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_SUBTRACT;
-				}
-			}
-			else
-			{
-				if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_SRC_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_SUBTRACT;
-				}
-			}
-		case VK_BLEND_OP_REVERSE_SUBTRACT:
-			if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
-			{
-				if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_ZERO_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_DST_EXT;
-				}
-			}
-			else if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE)
-			{
-				if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
-				{
-					return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
-				}
-				else
-				{
-					return VK_BLEND_OP_REVERSE_SUBTRACT;
-				}
-			}
-			else
-			{
-				if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
-				{
-					return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
-				}
-				else
-				{
-					return VK_BLEND_OP_REVERSE_SUBTRACT;
-				}
-			}
-		case VK_BLEND_OP_MIN:
-			return VK_BLEND_OP_MIN;
-		case VK_BLEND_OP_MAX:
-			return VK_BLEND_OP_MAX;
-		default:
-			ASSERT(false);
 		}
-
-		return blendState[index].blendOperationAlpha;
-	}
-
-	VkFormat Context::renderTargetInternalFormat(int index) const
-	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		if(renderTarget[index])
+		else if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ONE)
 		{
-			return renderTarget[index]->getFormat();
+			if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_SRC_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_ADD;
+			}
 		}
 		else
 		{
-			return VK_FORMAT_UNDEFINED;
-		}
-	}
-
-	bool Context::colorWriteActive() const
-	{
-		for (int i = 0; i < RENDERTARGETS; i++)
-		{
-			if (colorWriteActive(i))
+			if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
 			{
-				return true;
+				return VK_BLEND_OP_SRC_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_ADD;
 			}
 		}
-
-		return false;
+	case VK_BLEND_OP_SUBTRACT:
+		if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
+		{
+			return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
+		}
+		else if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ONE)
+		{
+			if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_SRC_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_SUBTRACT;
+			}
+		}
+		else
+		{
+			if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_SRC_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_SUBTRACT;
+			}
+		}
+	case VK_BLEND_OP_REVERSE_SUBTRACT:
+		if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
+		{
+			if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_ZERO_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_DST_EXT;
+			}
+		}
+		else if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ONE)
+		{
+			if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
+			{
+				return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
+			}
+			else
+			{
+				return VK_BLEND_OP_REVERSE_SUBTRACT;
+			}
+		}
+		else
+		{
+			if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
+			{
+				return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
+			}
+			else
+			{
+				return VK_BLEND_OP_REVERSE_SUBTRACT;
+			}
+		}
+	case VK_BLEND_OP_MIN:
+		return VK_BLEND_OP_MIN;
+	case VK_BLEND_OP_MAX:
+		return VK_BLEND_OP_MAX;
+	default:
+		ASSERT(false);
 	}
 
-	int Context::colorWriteActive(int index) const
+	return blendState[index].blendOperation;
+}
+
+VkBlendFactor Context::sourceBlendFactorAlpha(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	switch (blendState[index].blendOperationAlpha)
 	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		if(!renderTarget[index] || renderTarget[index]->getFormat() == VK_FORMAT_UNDEFINED)
-		{
-			return 0;
-		}
-
-		if(blendOperation(index) == VK_BLEND_OP_DST_EXT && destBlendFactor(index) == VK_BLEND_FACTOR_ONE &&
-		   (blendOperationAlpha(index) == VK_BLEND_OP_DST_EXT && destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE))
-		{
-			return 0;
-		}
-
-		return colorWriteMask[index];
+	case VK_BLEND_OP_ADD:
+	case VK_BLEND_OP_SUBTRACT:
+	case VK_BLEND_OP_REVERSE_SUBTRACT:
+		return blendState[index].sourceBlendFactorAlpha;
+	case VK_BLEND_OP_MIN:
+		return VK_BLEND_FACTOR_ONE;
+	case VK_BLEND_OP_MAX:
+		return VK_BLEND_FACTOR_ONE;
+	default:
+		ASSERT(false);
 	}
 
-	bool Context::colorUsed() const
+	return blendState[index].sourceBlendFactorAlpha;
+}
+
+VkBlendFactor Context::destBlendFactorAlpha(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	switch (blendState[index].blendOperationAlpha)
 	{
-		return colorWriteActive() || (pixelShader && pixelShader->getModes().ContainsKill);
+	case VK_BLEND_OP_ADD:
+	case VK_BLEND_OP_SUBTRACT:
+	case VK_BLEND_OP_REVERSE_SUBTRACT:
+		return blendState[index].destBlendFactorAlpha;
+	case VK_BLEND_OP_MIN:
+		return VK_BLEND_FACTOR_ONE;
+	case VK_BLEND_OP_MAX:
+		return VK_BLEND_FACTOR_ONE;
+	default:
+		ASSERT(false);
+	}
+
+	return blendState[index].destBlendFactorAlpha;
+}
+
+VkBlendOp Context::blendOperationAlpha(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	switch (blendState[index].blendOperationAlpha)
+	{
+	case VK_BLEND_OP_ADD:
+		if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+		{
+			if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_ZERO_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_DST_EXT;
+			}
+		}
+		else if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE)
+		{
+			if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_SRC_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_ADD;
+			}
+		}
+		else
+		{
+			if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_SRC_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_ADD;
+			}
+		}
+	case VK_BLEND_OP_SUBTRACT:
+		if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
+		{
+			return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
+		}
+		else if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE)
+		{
+			if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_SRC_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_SUBTRACT;
+			}
+		}
+		else
+		{
+			if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_SRC_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_SUBTRACT;
+			}
+		}
+	case VK_BLEND_OP_REVERSE_SUBTRACT:
+		if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+		{
+			if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_ZERO_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_DST_EXT;
+			}
+		}
+		else if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE)
+		{
+			if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
+			{
+				return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
+			}
+			else
+			{
+				return VK_BLEND_OP_REVERSE_SUBTRACT;
+			}
+		}
+		else
+		{
+			if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
+			{
+				return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
+			}
+			else
+			{
+				return VK_BLEND_OP_REVERSE_SUBTRACT;
+			}
+		}
+	case VK_BLEND_OP_MIN:
+		return VK_BLEND_OP_MIN;
+	case VK_BLEND_OP_MAX:
+		return VK_BLEND_OP_MAX;
+	default:
+		ASSERT(false);
+	}
+
+	return blendState[index].blendOperationAlpha;
+}
+
+VkFormat Context::renderTargetInternalFormat(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	if(renderTarget[index])
+	{
+		return renderTarget[index]->getFormat();
+	}
+	else
+	{
+		return VK_FORMAT_UNDEFINED;
 	}
 }
+
+bool Context::colorWriteActive() const
+{
+	for (int i = 0; i < RENDERTARGETS; i++)
+	{
+		if (colorWriteActive(i))
+		{
+			return true;
+		}
+	}
+
+	return false;
+}
+
+int Context::colorWriteActive(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	if(!renderTarget[index] || renderTarget[index]->getFormat() == VK_FORMAT_UNDEFINED)
+	{
+		return 0;
+	}
+
+	if(blendOperation(index) == VK_BLEND_OP_DST_EXT && destBlendFactor(index) == VK_BLEND_FACTOR_ONE &&
+	   (blendOperationAlpha(index) == VK_BLEND_OP_DST_EXT && destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE))
+	{
+		return 0;
+	}
+
+	return colorWriteMask[index];
+}
+
+bool Context::colorUsed() const
+{
+	return colorWriteActive() || (pixelShader && pixelShader->getModes().ContainsKill);
+}
+
+}  // namespace sw
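
blendOperation() and blendOperationAlpha() above do more than echo pipeline state: with ONE/ZERO blend factors they fold the generic Vulkan ops down to internal pass-through ops (the *_EXT values), meaning only one factored term of the blend equation survives. colorWriteActive() then uses the DST_EXT-with-factor-ONE case to skip the framebuffer write altogether. A standalone sketch of the folding rule for VK_BLEND_OP_ADD, with hypothetical stand-in enums:

#include <cstdio>

enum Factor { FACTOR_ZERO, FACTOR_ONE, FACTOR_OTHER };
enum Op { OP_ZERO, OP_SRC, OP_DST, OP_ADD };

// src*srcFactor + dst*dstFactor: when one factor is ZERO that term vanishes
// and the add degenerates to the other (still factored) term on its own.
Op foldAdd(Factor srcFactor, Factor dstFactor)
{
	if(srcFactor == FACTOR_ZERO)
	{
		return (dstFactor == FACTOR_ZERO) ? OP_ZERO : OP_DST;
	}

	return (dstFactor == FACTOR_ZERO) ? OP_SRC : OP_ADD;
}

int main()
{
	// Classic "no blending" state: srcFactor=ONE, dstFactor=ZERO writes the
	// source unmodified, so the whole blend stage can be skipped.
	printf("%d\n", foldAdd(FACTOR_ONE, FACTOR_ZERO) == OP_SRC);   // 1

	// srcFactor=ZERO, dstFactor=ONE leaves the destination untouched; paired
	// with the same folding on the alpha channel, colorWriteActive() returns 0.
	printf("%d\n", foldAdd(FACTOR_ZERO, FACTOR_ONE) == OP_DST);   // 1
	return 0;
}
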
diff --git a/src/Device/Context.hpp b/src/Device/Context.hpp
index bb21eb7..20bc089 100644
--- a/src/Device/Context.hpp
+++ b/src/Device/Context.hpp
@@ -22,137 +22,139 @@
 #include "Stream.hpp"
 #include "System/Types.hpp"
 
-namespace vk
+namespace vk {
+
+class ImageView;
+class PipelineLayout;
+
+}  // namespace vk
+
+namespace sw {
+
+class SpirvShader;
+
+struct PushConstantStorage
 {
-	class ImageView;
-	class PipelineLayout;
-}
+	unsigned char data[vk::MAX_PUSH_CONSTANT_SIZE];
+};
 
-namespace sw
+struct BlendState : Memset<BlendState>
 {
-	class SpirvShader;
+	BlendState() : Memset(this, 0) {}
 
-	struct PushConstantStorage
-	{
-		unsigned char data[vk::MAX_PUSH_CONSTANT_SIZE];
-	};
+	BlendState(bool alphaBlendEnable,
+	           VkBlendFactor sourceBlendFactor,
+	           VkBlendFactor destBlendFactor,
+	           VkBlendOp blendOperation,
+	           VkBlendFactor sourceBlendFactorAlpha,
+	           VkBlendFactor destBlendFactorAlpha,
+	           VkBlendOp blendOperationAlpha) :
+		Memset(this, 0),
+		alphaBlendEnable(alphaBlendEnable),
+		sourceBlendFactor(sourceBlendFactor),
+		destBlendFactor(destBlendFactor),
+		blendOperation(blendOperation),
+		sourceBlendFactorAlpha(sourceBlendFactorAlpha),
+		destBlendFactorAlpha(destBlendFactorAlpha),
+		blendOperationAlpha(blendOperationAlpha)
+	{}
 
-	struct BlendState : Memset<BlendState>
-	{
-		BlendState() : Memset(this, 0) {}
+	bool alphaBlendEnable;
+	VkBlendFactor sourceBlendFactor;
+	VkBlendFactor destBlendFactor;
+	VkBlendOp blendOperation;
+	VkBlendFactor sourceBlendFactorAlpha;
+	VkBlendFactor destBlendFactorAlpha;
+	VkBlendOp blendOperationAlpha;
+};
 
-		BlendState(bool alphaBlendEnable,
-		           VkBlendFactor sourceBlendFactor,
-		           VkBlendFactor destBlendFactor,
-		           VkBlendOp blendOperation,
-		           VkBlendFactor sourceBlendFactorAlpha,
-		           VkBlendFactor destBlendFactorAlpha,
-		           VkBlendOp blendOperationAlpha) :
-			Memset(this, 0),
-			alphaBlendEnable(alphaBlendEnable),
-			sourceBlendFactor(sourceBlendFactor),
-			destBlendFactor(destBlendFactor),
-			blendOperation(blendOperation),
-			sourceBlendFactorAlpha(sourceBlendFactorAlpha),
-			destBlendFactorAlpha(destBlendFactorAlpha),
-			blendOperationAlpha(blendOperationAlpha)
-		{}
+class Context
+{
+public:
+	Context();
 
-		bool alphaBlendEnable;
-		VkBlendFactor sourceBlendFactor;
-		VkBlendFactor destBlendFactor;
-		VkBlendOp blendOperation;
-		VkBlendFactor sourceBlendFactorAlpha;
-		VkBlendFactor destBlendFactorAlpha;
-		VkBlendOp blendOperationAlpha;
-	};
+	void init();
 
-	class Context
-	{
-	public:
-		Context();
+	bool isDrawPoint(bool polygonModeAware) const;
+	bool isDrawLine(bool polygonModeAware) const;
+	bool isDrawTriangle(bool polygonModeAware) const;
 
-		void init();
+	bool depthWriteActive() const;
+	bool depthBufferActive() const;
+	bool stencilActive() const;
 
-		bool isDrawPoint(bool polygonModeAware) const;
-		bool isDrawLine(bool polygonModeAware) const;
-		bool isDrawTriangle(bool polygonModeAware) const;
+	bool allTargetsColorClamp() const;
 
-		bool depthWriteActive() const;
-		bool depthBufferActive() const;
-		bool stencilActive() const;
+	void setBlendState(int index, BlendState state);
+	BlendState getBlendState(int index) const;
 
-		bool allTargetsColorClamp() const;
+	VkPrimitiveTopology topology;
+	VkProvokingVertexModeEXT provokingVertexMode;
 
-		void setBlendState(int index, BlendState state);
-		BlendState getBlendState(int index) const;
+	bool stencilEnable;
+	VkStencilOpState frontStencil;
+	VkStencilOpState backStencil;
 
-		VkPrimitiveTopology topology;
-		VkProvokingVertexModeEXT provokingVertexMode;
+	// Pixel processor states
+	VkCullModeFlags cullMode;
+	VkFrontFace frontFace;
+	VkPolygonMode polygonMode;
+	VkLineRasterizationModeEXT lineRasterizationMode;
 
-		bool stencilEnable;
-		VkStencilOpState frontStencil;
-		VkStencilOpState backStencil;
+	float depthBias;
+	float slopeDepthBias;
 
-		// Pixel processor states
-		VkCullModeFlags cullMode;
-		VkFrontFace frontFace;
-		VkPolygonMode polygonMode;
-		VkLineRasterizationModeEXT lineRasterizationMode;
+	VkFormat renderTargetInternalFormat(int index) const;
+	int colorWriteActive(int index) const;
 
-		float depthBias;
-		float slopeDepthBias;
+	vk::DescriptorSet::Bindings descriptorSets = {};
+	vk::DescriptorSet::DynamicOffsets descriptorDynamicOffsets = {};
+	Stream input[MAX_INTERFACE_COMPONENTS / 4];
+	bool robustBufferAccess;
 
-		VkFormat renderTargetInternalFormat(int index) const;
-		int colorWriteActive(int index) const;
+	vk::ImageView *renderTarget[RENDERTARGETS];
+	vk::ImageView *depthBuffer;
+	vk::ImageView *stencilBuffer;
 
-		vk::DescriptorSet::Bindings descriptorSets = {};
-		vk::DescriptorSet::DynamicOffsets descriptorDynamicOffsets = {};
-		Stream input[MAX_INTERFACE_COMPONENTS / 4];
-		bool robustBufferAccess;
+	vk::PipelineLayout const *pipelineLayout;
 
-		vk::ImageView *renderTarget[RENDERTARGETS];
-		vk::ImageView *depthBuffer;
-		vk::ImageView *stencilBuffer;
+	// Shaders
+	const SpirvShader *pixelShader;
+	const SpirvShader *vertexShader;
 
-		vk::PipelineLayout const *pipelineLayout;
+	bool occlusionEnabled;
 
-		// Shaders
-		const SpirvShader *pixelShader;
-		const SpirvShader *vertexShader;
+	// Pixel processor states
+	bool rasterizerDiscard;
+	bool depthBoundsTestEnable;
+	bool depthBufferEnable;
+	VkCompareOp depthCompareMode;
+	bool depthWriteEnable;
 
-		bool occlusionEnabled;
+	float lineWidth;
 
-		// Pixel processor states
-		bool rasterizerDiscard;
-		bool depthBoundsTestEnable;
-		bool depthBufferEnable;
-		VkCompareOp depthCompareMode;
-		bool depthWriteEnable;
+	int colorWriteMask[RENDERTARGETS];   // RGBA
+	unsigned int sampleMask;
+	unsigned int multiSampleMask;
+	int sampleCount;
+	bool alphaToCoverage;
 
-		float lineWidth;
+private:
+	bool colorWriteActive() const;
+	bool colorUsed() const;
 
-		int colorWriteMask[RENDERTARGETS];   // RGBA
-		unsigned int sampleMask;
-		unsigned int multiSampleMask;
-		int sampleCount;
-		bool alphaToCoverage;
+	bool alphaBlendActive(int index) const;
+	VkBlendFactor sourceBlendFactor(int index) const;
+	VkBlendFactor destBlendFactor(int index) const;
+	VkBlendOp blendOperation(int index) const;
 
-	private:
-		bool colorWriteActive() const;
-		bool colorUsed() const;
+	VkBlendFactor sourceBlendFactorAlpha(int index) const;
+	VkBlendFactor destBlendFactorAlpha(int index) const;
+	VkBlendOp blendOperationAlpha(int index) const;
 
-		bool alphaBlendActive(int index) const;
-		VkBlendFactor sourceBlendFactor(int index) const;
-		VkBlendFactor destBlendFactor(int index) const;
-		VkBlendOp blendOperation(int index) const;
+	BlendState blendState[RENDERTARGETS];
+};
 
-		VkBlendFactor sourceBlendFactorAlpha(int index) const;
-		VkBlendFactor destBlendFactorAlpha(int index) const;
-		VkBlendOp blendOperationAlpha(int index) const;
-
-		BlendState blendState[RENDERTARGETS];
-	};
-}
+}  // namespace sw
 
 #endif   // sw_Context_hpp
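
A note on BlendState deriving from Memset<BlendState>: the base constructor zeroes the entire object, padding bytes included, before the members are initialized, which is what makes such state structs safe to compare and hash byte-wise as cache keys (the is_memcmparable helper in LRUCache.hpp exists to statically check the trivially-copyable half of that contract). A sketch of why the zeroing matters, with a hypothetical Memset stand-in that mirrors the pattern:

#include <cstdio>
#include <cstring>

template<class T>
struct Memset
{
	// Runs before the derived class's members are initialized, so padding
	// bytes end up in a known state instead of being indeterminate.
	Memset(T *object, int value) { memset(object, value, sizeof(T)); }
};

struct Key : Memset<Key>
{
	Key(char c, int i) : Memset(this, 0), c(c), i(i) {}

	char c;   // typically followed by 3 padding bytes before the int
	int i;
};

int main()
{
	Key a('x', 7);
	Key b('x', 7);

	// Without the zeroing, the padding could differ and memcmp() could
	// report two logically equal keys as unequal.
	printf("%s\n", memcmp(&a, &b, sizeof(Key)) == 0 ? "equal" : "unequal");
	return 0;
}
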
diff --git a/src/Device/LRUCache.hpp b/src/Device/LRUCache.hpp
index a4478d1..f549769 100644
--- a/src/Device/LRUCache.hpp
+++ b/src/Device/LRUCache.hpp
@@ -20,180 +20,181 @@
 #include <type_traits>
 #include <unordered_map>
 
-namespace sw
+namespace sw {
+
+template<class Key, class Data>
+class LRUCache
 {
-	template<class Key, class Data>
-	class LRUCache
+public:
+	LRUCache(int n);
+
+	virtual ~LRUCache();
+
+	Data query(const Key &key) const;
+	virtual Data add(const Key &key, const Data &data);
+
+	int getSize() {return size;}
+	Key &getKey(int i) {return key[i];}
+
+protected:
+	int size;
+	int mask;
+	int top;
+	int fill;
+
+	Key *key;
+	Key **ref;
+	Data *data;
+};
+
+template<class Key, class Data, class Hasher = std::hash<Key>>
+class LRUConstCache : public LRUCache<Key, Data>
+{
+	using LRUBase = LRUCache<Key, Data>;
+public:
+	LRUConstCache(int n) : LRUBase(n) {}
+	~LRUConstCache() { clearConstCache(); }
+
+	Data add(const Key &key, const Data& data) override
 	{
-	public:
-		LRUCache(int n);
+		constCacheNeedsUpdate = true;
+		return LRUBase::add(key, data);
+	}
 
-		virtual ~LRUCache();
+	void updateConstCache();
+	const Data& queryConstCache(const Key &key) const;
 
-		Data query(const Key &key) const;
-		virtual Data add(const Key &key, const Data &data);
+private:
+	void clearConstCache();
+	bool constCacheNeedsUpdate = false;
+	std::unordered_map<Key, Data, Hasher> constCache;
+};
 
-		int getSize() {return size;}
-		Key &getKey(int i) {return key[i];}
-
-	protected:
-		int size;
-		int mask;
-		int top;
-		int fill;
-
-		Key *key;
-		Key **ref;
-		Data *data;
-	};
-
-	template<class Key, class Data, class Hasher = std::hash<Key>>
-	class LRUConstCache : public LRUCache<Key, Data>
-	{
-		using LRUBase = LRUCache<Key, Data>;
-	public:
-		LRUConstCache(int n) : LRUBase(n) {}
-		~LRUConstCache() { clearConstCache(); }
-
-		Data add(const Key &key, const Data& data) override
-		{
-			constCacheNeedsUpdate = true;
-			return LRUBase::add(key, data);
-		}
-
-		void updateConstCache();
-		const Data& queryConstCache(const Key &key) const;
-
-	private:
-		void clearConstCache();
-		bool constCacheNeedsUpdate = false;
-		std::unordered_map<Key, Data, Hasher> constCache;
-	};
-
-	// Traits-like helper class for checking if objects can be compared using memcmp().
-	// Useful for statically asserting if a cache key can implement operator==() with memcmp().
-	template<typename T>
-	struct is_memcmparable
-	{
-		// std::is_trivially_copyable is not available in older GCC versions.
-		#if !defined(__GNUC__) || __GNUC__ > 5
-			static const bool value = std::is_trivially_copyable<T>::value;
-		#else
-			// At least check it doesn't have virtual methods.
-			static const bool value = !std::is_polymorphic<T>::value;
-		#endif
-	};
+// Traits-like helper class for checking if objects can be compared using memcmp().
+// Useful for statically asserting if a cache key can implement operator==() with memcmp().
+template<typename T>
+struct is_memcmparable
+{
+	// std::is_trivially_copyable is not available in older GCC versions.
+	#if !defined(__GNUC__) || __GNUC__ > 5
+		static const bool value = std::is_trivially_copyable<T>::value;
+	#else
+		// At least check it doesn't have virtual methods.
+		static const bool value = !std::is_polymorphic<T>::value;
+	#endif
+};
 }
 
-namespace sw
+namespace sw {
+
+template<class Key, class Data>
+LRUCache<Key, Data>::LRUCache(int n)
 {
-	template<class Key, class Data>
-	LRUCache<Key, Data>::LRUCache(int n)
+	size = ceilPow2(n);
+	mask = size - 1;
+	top = 0;
+	fill = 0;
+
+	key = new Key[size];
+	ref = new Key*[size];
+	data = new Data[size];
+
+	for(int i = 0; i < size; i++)
 	{
-		size = ceilPow2(n);
-		mask = size - 1;
-		top = 0;
-		fill = 0;
+		ref[i] = &key[i];
+	}
+}
 
-		key = new Key[size];
-		ref = new Key*[size];
-		data = new Data[size];
+template<class Key, class Data>
+LRUCache<Key, Data>::~LRUCache()
+{
+	delete[] key;
+	key = nullptr;
 
-		for(int i = 0; i < size; i++)
+	delete[] ref;
+	ref = nullptr;
+
+	delete[] data;
+	data = nullptr;
+}
+
+template<class Key, class Data>
+Data LRUCache<Key, Data>::query(const Key &key) const
+{
+	for(int i = top; i > top - fill; i--)
+	{
+		int j = i & mask;
+
+		if(key == *ref[j])
 		{
-			ref[i] = &key[i];
+			Data hit = data[j];
+
+			if(i != top)
+			{
+				// Move one up
+				int k = (j + 1) & mask;
+
+				Data swapD = data[k];
+				data[k] = data[j];
+				data[j] = swapD;
+
+				Key *swapK = ref[k];
+				ref[k] = ref[j];
+				ref[j] = swapK;
+			}
+
+			return hit;
 		}
 	}
 
-	template<class Key, class Data>
-	LRUCache<Key, Data>::~LRUCache()
+	return {};   // Not found
+}
+
+template<class Key, class Data>
+Data LRUCache<Key, Data>::add(const Key &key, const Data &data)
+{
+	top = (top + 1) & mask;
+	fill = fill + 1 < size ? fill + 1 : size;
+
+	*ref[top] = key;
+	this->data[top] = data;
+
+	return data;
+}
+
+template<class Key, class Data, class Hasher>
+void LRUConstCache<Key, Data, Hasher>::clearConstCache()
+{
+	constCache.clear();
+}
+
+template<class Key, class Data, class Hasher>
+void LRUConstCache<Key, Data, Hasher>::updateConstCache()
+{
+	if(constCacheNeedsUpdate)
 	{
-		delete[] key;
-		key = nullptr;
+		clearConstCache();
 
-		delete[] ref;
-		ref = nullptr;
-
-		delete[] data;
-		data = nullptr;
-	}
-
-	template<class Key, class Data>
-	Data LRUCache<Key, Data>::query(const Key &key) const
-	{
-		for(int i = top; i > top - fill; i--)
+		for(int i = 0; i < LRUBase::size; i++)
 		{
-			int j = i & mask;
-
-			if(key == *ref[j])
+			if(LRUBase::data[i])
 			{
-				Data hit = data[j];
-
-				if(i != top)
-				{
-					// Move one up
-					int k = (j + 1) & mask;
-
-					Data swapD = data[k];
-					data[k] = data[j];
-					data[j] = swapD;
-
-					Key *swapK = ref[k];
-					ref[k] = ref[j];
-					ref[j] = swapK;
-				}
-
-				return hit;
+				constCache[*LRUBase::ref[i]] = LRUBase::data[i];
 			}
 		}
 
-		return {};   // Not found
-	}
-
-	template<class Key, class Data>
-	Data LRUCache<Key, Data>::add(const Key &key, const Data &data)
-	{
-		top = (top + 1) & mask;
-		fill = fill + 1 < size ? fill + 1 : size;
-
-		*ref[top] = key;
-		this->data[top] = data;
-
-		return data;
-	}
-
-	template<class Key, class Data, class Hasher>
-	void LRUConstCache<Key, Data, Hasher>::clearConstCache()
-	{
-		constCache.clear();
-	}
-
-	template<class Key, class Data, class Hasher>
-	void LRUConstCache<Key, Data, Hasher>::updateConstCache()
-	{
-		if(constCacheNeedsUpdate)
-		{
-			clearConstCache();
-
-			for(int i = 0; i < LRUBase::size; i++)
-			{
-				if(LRUBase::data[i])
-				{
-					constCache[*LRUBase::ref[i]] = LRUBase::data[i];
-				}
-			}
-
-			constCacheNeedsUpdate = false;
-		}
-	}
-
-	template<class Key, class Data, class Hasher>
-	const Data& LRUConstCache<Key, Data, Hasher>::queryConstCache(const Key &key) const
-	{
-		auto it = constCache.find(key);
-		static Data null = {};
-		return (it != constCache.end()) ? it->second : null;
+		constCacheNeedsUpdate = false;
 	}
 }
 
+template<class Key, class Data, class Hasher>
+const Data& LRUConstCache<Key, Data, Hasher>::queryConstCache(const Key &key) const
+{
+	auto it = constCache.find(key);
+	static Data null = {};
+	return (it != constCache.end()) ? it->second : null;
+}
+
+}  // namespace sw
+
 #endif   // sw_LRUCache_hpp
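
A minimal usage sketch of the cache above (the int/float instantiation and the function name are illustrative, not part of the change): the constructor capacity is rounded up to a power of two by ceilPow2(), a hit is promoted only one slot towards the top per query, and a miss returns a value-initialized Data.

#include "LRUCache.hpp"   // sketch; assumes src/Device is on the include path

void lruCacheExample()
{
	sw::LRUCache<int, float> cache(12);   // capacity rounds up to 16

	cache.add(1, 1.0f);                   // becomes the newest entry
	cache.add(2, 2.0f);                   // add() also returns the data

	float hit  = cache.query(1);          // 1.0f; the hit moves one slot up
	float miss = cache.query(3);          // {} == 0.0f, the "not found" value
	(void)hit; (void)miss;
}

LRUConstCache layers an unordered_map on top, rebuilt lazily by updateConstCache(), so queryConstCache() can look up entries without the slot shuffling that query() performs.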
diff --git a/src/Device/Matrix.cpp b/src/Device/Matrix.cpp
index f449841..006ca1b 100644
--- a/src/Device/Matrix.cpp
+++ b/src/Device/Matrix.cpp
@@ -17,386 +17,387 @@
 #include "Point.hpp"
 #include "System/Math.hpp"
 
-namespace sw
+namespace sw {
+
+Matrix Matrix::diag(float m11, float m22, float m33, float m44)
 {
-	Matrix Matrix::diag(float m11, float m22, float m33, float m44)
-	{
-		return Matrix(m11, 0,   0,   0,
-		              0,   m22, 0,   0,
-		              0,   0,   m33, 0,
-		              0,   0,   0,   m44);
-	}
-
-	Matrix::operator float*()
-	{
-		return &(*this)(1, 1);
-	}
-
-	Matrix Matrix::operator+() const
-	{
-		return *this;
-	}
-
-	Matrix Matrix::operator-() const
-	{
-		const Matrix &M = *this;
-
-		return Matrix(-M(1, 1), -M(1, 2), -M(1, 3), -M(1, 4), 
-		              -M(2, 1), -M(2, 2), -M(2, 3), -M(2, 4), 
-		              -M(3, 1), -M(3, 2), -M(3, 3), -M(3, 4), 
-		              -M(4, 1), -M(4, 2), -M(4, 3), -M(4, 4));
-	}
-
-	Matrix Matrix::operator!() const
-	{
-		const Matrix &M = *this;
-		Matrix I;
-
-		float M3344 = M(3, 3) * M(4, 4) - M(4, 3) * M(3, 4);
-		float M2344 = M(2, 3) * M(4, 4) - M(4, 3) * M(2, 4);
-		float M2334 = M(2, 3) * M(3, 4) - M(3, 3) * M(2, 4);
-		float M3244 = M(3, 2) * M(4, 4) - M(4, 2) * M(3, 4);
-		float M2244 = M(2, 2) * M(4, 4) - M(4, 2) * M(2, 4);
-		float M2234 = M(2, 2) * M(3, 4) - M(3, 2) * M(2, 4);
-		float M3243 = M(3, 2) * M(4, 3) - M(4, 2) * M(3, 3);
-		float M2243 = M(2, 2) * M(4, 3) - M(4, 2) * M(2, 3);
-		float M2233 = M(2, 2) * M(3, 3) - M(3, 2) * M(2, 3);
-		float M1344 = M(1, 3) * M(4, 4) - M(4, 3) * M(1, 4);
-		float M1334 = M(1, 3) * M(3, 4) - M(3, 3) * M(1, 4);
-		float M1244 = M(1, 2) * M(4, 4) - M(4, 2) * M(1, 4);
-		float M1234 = M(1, 2) * M(3, 4) - M(3, 2) * M(1, 4);
-		float M1243 = M(1, 2) * M(4, 3) - M(4, 2) * M(1, 3);
-		float M1233 = M(1, 2) * M(3, 3) - M(3, 2) * M(1, 3);
-		float M1324 = M(1, 3) * M(2, 4) - M(2, 3) * M(1, 4);
-		float M1224 = M(1, 2) * M(2, 4) - M(2, 2) * M(1, 4);
-		float M1223 = M(1, 2) * M(2, 3) - M(2, 2) * M(1, 3);
-
-		// Adjoint Matrix
-		I(1, 1) =  M(2, 2) * M3344 - M(3, 2) * M2344 + M(4, 2) * M2334;
-		I(2, 1) = -M(2, 1) * M3344 + M(3, 1) * M2344 - M(4, 1) * M2334;
-		I(3, 1) =  M(2, 1) * M3244 - M(3, 1) * M2244 + M(4, 1) * M2234;
-		I(4, 1) = -M(2, 1) * M3243 + M(3, 1) * M2243 - M(4, 1) * M2233;
-
-		I(1, 2) = -M(1, 2) * M3344 + M(3, 2) * M1344 - M(4, 2) * M1334;
-		I(2, 2) =  M(1, 1) * M3344 - M(3, 1) * M1344 + M(4, 1) * M1334;
-		I(3, 2) = -M(1, 1) * M3244 + M(3, 1) * M1244 - M(4, 1) * M1234;
-		I(4, 2) =  M(1, 1) * M3243 - M(3, 1) * M1243 + M(4, 1) * M1233;
-
-		I(1, 3) =  M(1, 2) * M2344 - M(2, 2) * M1344 + M(4, 2) * M1324;
-		I(2, 3) = -M(1, 1) * M2344 + M(2, 1) * M1344 - M(4, 1) * M1324;
-		I(3, 3) =  M(1, 1) * M2244 - M(2, 1) * M1244 + M(4, 1) * M1224;
-		I(4, 3) = -M(1, 1) * M2243 + M(2, 1) * M1243 - M(4, 1) * M1223;
-
-		I(1, 4) = -M(1, 2) * M2334 + M(2, 2) * M1334 - M(3, 2) * M1324;
-		I(2, 4) =  M(1, 1) * M2334 - M(2, 1) * M1334 + M(3, 1) * M1324;
-		I(3, 4) = -M(1, 1) * M2234 + M(2, 1) * M1234 - M(3, 1) * M1224;
-		I(4, 4) =  M(1, 1) * M2233 - M(2, 1) * M1233 + M(3, 1) * M1223;
-
-		// Division by determinant
-		I /= M(1, 1) * I(1, 1) +
-		     M(2, 1) * I(1, 2) +
-		     M(3, 1) * I(1, 3) +
-		     M(4, 1) * I(1, 4);
-
-		return I;
-	}
-
-	Matrix Matrix::operator~() const
-	{
-		const Matrix &M = *this;
-
-		return Matrix(M(1, 1), M(2, 1), M(3, 1), M(4, 1), 
-		              M(1, 2), M(2, 2), M(3, 2), M(4, 2), 
-		              M(1, 3), M(2, 3), M(3, 3), M(4, 3), 
-		              M(1, 4), M(2, 4), M(3, 4), M(4, 4));
-	}
-
-	Matrix &Matrix::operator+=(const Matrix &N)
-	{
-		Matrix &M = *this;
-
-		M(1, 1) += N(1, 1); M(1, 2) += N(1, 2); M(1, 3) += N(1, 3); M(1, 4) += N(1, 4);
-		M(2, 1) += N(2, 1); M(2, 2) += N(2, 2); M(2, 3) += N(2, 3); M(2, 4) += N(2, 4);
-		M(3, 1) += N(3, 1); M(3, 2) += N(3, 2); M(3, 3) += N(3, 3); M(3, 4) += N(3, 4);
-		M(4, 1) += N(4, 1); M(4, 2) += N(4, 2); M(4, 3) += N(4, 3); M(4, 4) += N(4, 4);
-
-		return M;
-	}
-
-	Matrix &Matrix::operator-=(const Matrix &N)
-	{
-		Matrix &M = *this;
-
-		M(1, 1) -= N(1, 1); M(1, 2) -= N(1, 2); M(1, 3) -= N(1, 3); M(1, 4) -= N(1, 4);
-		M(2, 1) -= N(2, 1); M(2, 2) -= N(2, 2); M(2, 3) -= N(2, 3); M(2, 4) -= N(2, 4);
-		M(3, 1) -= N(3, 1); M(3, 2) -= N(3, 2); M(3, 3) -= N(3, 3); M(3, 4) -= N(3, 4);
-		M(4, 1) -= N(4, 1); M(4, 2) -= N(4, 2); M(4, 3) -= N(4, 3); M(4, 4) -= N(4, 4);
-
-		return M;
-	}
-
-	Matrix &Matrix::operator*=(float s)
-	{
-		Matrix &M = *this;
-
-		M(1, 1) *= s; M(1, 2) *= s; M(1, 3) *= s; M(1, 4) *= s;
-		M(2, 1) *= s; M(2, 2) *= s; M(2, 3) *= s; M(2, 4) *= s;
-		M(3, 1) *= s; M(3, 2) *= s; M(3, 3) *= s; M(3, 4) *= s;
-		M(4, 1) *= s; M(4, 2) *= s; M(4, 3) *= s; M(4, 4) *= s;
-
-		return M;
-	}
-
-	Matrix &Matrix::operator*=(const Matrix &M)
-	{
-		return *this = *this * M;
-	}
-
-	Matrix &Matrix::operator/=(float s)
-	{
-		float r = 1.0f / s;
-
-		return *this *= r;
-	}
-
-	bool operator==(const Matrix &M, const Matrix &N)
-	{
-		if(M(1, 1) == N(1, 1) && M(1, 2) == N(1, 2) && M(1, 3) == N(1, 3) && M(1, 4) == N(1, 4) &&
-		   M(2, 1) == N(2, 1) && M(2, 2) == N(2, 2) && M(2, 3) == N(2, 3) && M(2, 4) == N(2, 4) &&
-		   M(3, 1) == N(3, 1) && M(3, 2) == N(3, 2) && M(3, 3) == N(3, 3) && M(3, 4) == N(3, 4) &&
-		   M(4, 1) == N(4, 1) && M(4, 2) == N(4, 2) && M(4, 3) == N(4, 3) && M(4, 4) == N(4, 4))
-			return true;
-		else
-			return false;
-	}
-
-	bool operator!=(const Matrix &M, const Matrix &N)
-	{
-		if(M(1, 1) != N(1, 1) || M(1, 2) != N(1, 2) || M(1, 3) != N(1, 3) || M(1, 4) != N(1, 4) ||
-		   M(2, 1) != N(2, 1) || M(2, 2) != N(2, 2) || M(2, 3) != N(2, 3) || M(2, 4) != N(2, 4) ||
-		   M(3, 1) != N(3, 1) || M(3, 2) != N(3, 2) || M(3, 3) != N(3, 3) || M(3, 4) != N(3, 4) ||
-		   M(4, 1) != N(4, 1) || M(4, 2) != N(4, 2) || M(4, 3) != N(4, 3) || M(4, 4) != N(4, 4))
-			return true;
-		else
-			return false;
-	}
-
-	Matrix operator+(const Matrix &M, const Matrix &N)
-	{
-		return Matrix(M(1, 1) + N(1, 1), M(1, 2) + N(1, 2), M(1, 3) + N(1, 3), M(1, 4) + N(1, 4), 
-		              M(2, 1) + N(2, 1), M(2, 2) + N(2, 2), M(2, 3) + N(2, 3), M(2, 4) + N(2, 4), 
-		              M(3, 1) + N(3, 1), M(3, 2) + N(3, 2), M(3, 3) + N(3, 3), M(3, 4) + N(3, 4), 
-		              M(4, 1) + N(4, 1), M(4, 2) + N(4, 2), M(4, 3) + N(4, 3), M(4, 4) + N(4, 4));
-	}
-
-	Matrix operator-(const Matrix &M, const Matrix &N)
-	{
-		return Matrix(M(1, 1) - N(1, 1), M(1, 2) - N(1, 2), M(1, 3) - N(1, 3), M(1, 4) - N(1, 4), 
-		              M(2, 1) - N(2, 1), M(2, 2) - N(2, 2), M(2, 3) - N(2, 3), M(2, 4) - N(2, 4), 
-		              M(3, 1) - N(3, 1), M(3, 2) - N(3, 2), M(3, 3) - N(3, 3), M(3, 4) - N(3, 4), 
-		              M(4, 1) - N(4, 1), M(4, 2) - N(4, 2), M(4, 3) - N(4, 3), M(4, 4) - N(4, 4));
-	}
-
-	Matrix operator*(float s, const Matrix &M)
-	{
-		return Matrix(s * M(1, 1), s * M(1, 2), s * M(1, 3), s * M(1, 4), 
-		              s * M(2, 1), s * M(2, 2), s * M(2, 3), s * M(2, 4), 
-		              s * M(3, 1), s * M(3, 2), s * M(3, 3), s * M(3, 4), 
-		              s * M(4, 1), s * M(4, 2), s * M(4, 3), s * M(4, 4));
-	}
-
-	Matrix operator*(const Matrix &M, float s)
-	{
-		return Matrix(M(1, 1) * s, M(1, 2) * s, M(1, 3) * s, M(1, 4) * s, 
-		              M(2, 1) * s, M(2, 2) * s, M(2, 3) * s, M(2, 4) * s, 
-		              M(3, 1) * s, M(3, 2) * s, M(3, 3) * s, M(3, 4) * s, 
-		              M(4, 1) * s, M(4, 2) * s, M(4, 3) * s, M(4, 4) * s);
-	}
-
-	Matrix operator*(const Matrix &M, const Matrix &N)
-	{
-		return Matrix(M(1, 1) * N(1, 1) + M(1, 2) * N(2, 1) + M(1, 3) * N(3, 1) + M(1, 4) * N(4, 1), M(1, 1) * N(1, 2) + M(1, 2) * N(2, 2) + M(1, 3) * N(3, 2) + M(1, 4) * N(4, 2), M(1, 1) * N(1, 3) + M(1, 2) * N(2, 3) + M(1, 3) * N(3, 3) + M(1, 4) * N(4, 3), M(1, 1) * N(1, 4) + M(1, 2) * N(2, 4) + M(1, 3) * N(3, 4) + M(1, 4) * N(4, 4), 
-		              M(2, 1) * N(1, 1) + M(2, 2) * N(2, 1) + M(2, 3) * N(3, 1) + M(2, 4) * N(4, 1), M(2, 1) * N(1, 2) + M(2, 2) * N(2, 2) + M(2, 3) * N(3, 2) + M(2, 4) * N(4, 2), M(2, 1) * N(1, 3) + M(2, 2) * N(2, 3) + M(2, 3) * N(3, 3) + M(2, 4) * N(4, 3), M(2, 1) * N(1, 4) + M(2, 2) * N(2, 4) + M(2, 3) * N(3, 4) + M(2, 4) * N(4, 4), 
-		              M(3, 1) * N(1, 1) + M(3, 2) * N(2, 1) + M(3, 3) * N(3, 1) + M(3, 4) * N(4, 1), M(3, 1) * N(1, 2) + M(3, 2) * N(2, 2) + M(3, 3) * N(3, 2) + M(3, 4) * N(4, 2), M(3, 1) * N(1, 3) + M(3, 2) * N(2, 3) + M(3, 3) * N(3, 3) + M(3, 4) * N(4, 3), M(3, 1) * N(1, 4) + M(3, 2) * N(2, 4) + M(3, 3) * N(3, 4) + M(3, 4) * N(4, 4), 
-		              M(4, 1) * N(1, 1) + M(4, 2) * N(2, 1) + M(4, 3) * N(3, 1) + M(4, 4) * N(4, 1), M(4, 1) * N(1, 2) + M(4, 2) * N(2, 2) + M(4, 3) * N(3, 2) + M(4, 4) * N(4, 2), M(4, 1) * N(1, 3) + M(4, 2) * N(2, 3) + M(4, 3) * N(3, 3) + M(4, 4) * N(4, 3), M(4, 1) * N(1, 4) + M(4, 2) * N(2, 4) + M(4, 3) * N(3, 4) + M(4, 4) * N(4, 4));
-	}
-
-	Matrix operator/(const Matrix &M, float s)
-	{
-		float r = 1.0f / s;
-
-		return M * r;
-	}
-
-	float4 Matrix::operator*(const float4 &v) const
-	{
-		const Matrix &M = *this;
-		float Mx = M(1, 1) * v.x + M(1, 2) * v.y + M(1, 3) * v.z + M(1, 4) * v.w;
-		float My = M(2, 1) * v.x + M(2, 2) * v.y + M(2, 3) * v.z + M(2, 4) * v.w;
-		float Mz = M(3, 1) * v.x + M(3, 2) * v.y + M(3, 3) * v.z + M(3, 4) * v.w;
-		float Mw = M(4, 1) * v.x + M(4, 2) * v.y + M(4, 3) * v.z + M(4, 4) * v.w;
-
-		return {Mx, My, Mz, Mw};
-	}
-
-	float Matrix::det(const Matrix &M)
-	{
-		float M3344 = M(3, 3) * M(4, 4) - M(4, 3) * M(3, 4);
-		float M2344 = M(2, 3) * M(4, 4) - M(4, 3) * M(2, 4);
-		float M2334 = M(2, 3) * M(3, 4) - M(3, 3) * M(2, 4);
-		float M1344 = M(1, 3) * M(4, 4) - M(4, 3) * M(1, 4);
-		float M1334 = M(1, 3) * M(3, 4) - M(3, 3) * M(1, 4);
-		float M1324 = M(1, 3) * M(2, 4) - M(2, 3) * M(1, 4);
-
-		return M(1, 1) * (M(2, 2) * M3344 - M(3, 2) * M2344 + M(4, 2) * M2334) -
-		       M(2, 1) * (M(1, 2) * M3344 - M(3, 2) * M1344 + M(4, 2) * M1334) +
-		       M(3, 1) * (M(1, 2) * M2344 - M(2, 2) * M1344 + M(4, 2) * M1324) -
-		       M(4, 1) * (M(1, 2) * M2334 - M(2, 2) * M1334 + M(3, 2) * M1324);
-	}
-
-	float Matrix::det(float m11)
-	{
-		return m11;
-	}
-
-	float Matrix::det(float m11, float m12, 
-	                  float m21, float m22)
-	{
-		return m11 * m22 - m12 * m21; 
-	}
-
-	float Matrix::det(float m11, float m12, float m13, 
-	                  float m21, float m22, float m23, 
-	                  float m31, float m32, float m33)
-	{
-		return m11 * (m22 * m33 - m32 * m23) -
-		       m21 * (m12 * m33 - m32 * m13) +
-		       m31 * (m12 * m23 - m22 * m13);
-	}
-
-	float Matrix::det(float m11, float m12, float m13, float m14, 
-	                  float m21, float m22, float m23, float m24, 
-	                  float m31, float m32, float m33, float m34, 
-	                  float m41, float m42, float m43, float m44)
-	{
-		float M3344 = m33 * m44 - m43 * m34;
-		float M2344 = m23 * m44 - m43 * m24;
-		float M2334 = m23 * m34 - m33 * m24;
-		float M1344 = m13 * m44 - m43 * m14;
-		float M1334 = m13 * m34 - m33 * m14;
-		float M1324 = m13 * m24 - m23 * m14;
-
-		return m11 * (m22 * M3344 - m32 * M2344 + m42 * M2334) -
-		       m21 * (m12 * M3344 - m32 * M1344 + m42 * M1334) +
-		       m31 * (m12 * M2344 - m22 * M1344 + m42 * M1324) -
-		       m41 * (m12 * M2334 - m22 * M1334 + m32 * M1324);
-	}
-
-	float Matrix::det(const Vector &v1, const Vector &v2, const Vector &v3)
-	{
-		return v1 * (v2 % v3);
-	}
-
-	float Matrix::det3(const Matrix &M)
-	{
-		return M(1, 1) * (M(2, 2) * M(3, 3) - M(3, 2) * M(2, 3)) -
-		       M(2, 1) * (M(1, 2) * M(3, 3) - M(3, 2) * M(1, 3)) +
-		       M(3, 1) * (M(1, 2) * M(2, 3) - M(2, 2) * M(1, 3));
-	}
-
-	float Matrix::tr(const Matrix &M)
-	{
-		return M(1, 1) + M(2, 2) + M(3, 3) + M(4, 4);
-	}
-
-	Matrix &Matrix::orthogonalise()
-	{
-		// NOTE: Numnerically instable, won't return exact the same result when already orhtogonal
-
-		Matrix &M = *this;
-
-		Vector v1(M(1, 1), M(2, 1), M(3, 1));
-		Vector v2(M(1, 2), M(2, 2), M(3, 2));
-		Vector v3(M(1, 3), M(2, 3), M(3, 3));
-
-		v2 -= v1 * (v1 * v2) / (v1 * v1);
-		v3 -= v1 * (v1 * v3) / (v1 * v1);
-		v3 -= v2 * (v2 * v3) / (v2 * v2);
-
-		v1 /= Vector::N(v1);
-		v2 /= Vector::N(v2);
-		v3 /= Vector::N(v3);
-
-		M(1, 1) = v1.x;  M(1, 2) = v2.x;  M(1, 3) = v3.x;
-		M(2, 1) = v1.y;  M(2, 2) = v2.y;  M(2, 3) = v3.y;
-		M(3, 1) = v1.z;  M(3, 2) = v2.z;  M(3, 3) = v3.z;
-
-		return *this;
-	}
-
-	Matrix Matrix::eulerRotate(const Vector &v)
-	{
-		float cz = cos(v.z);
-		float sz = sin(v.z);
-		float cx = cos(v.x);
-		float sx = sin(v.x);
-		float cy = cos(v.y);
-		float sy = sin(v.y);
-
-		float sxsy = sx * sy;
-		float sxcy = sx * cy;
-
-		return Matrix(cy * cz - sxsy * sz, -cy * sz - sxsy * cz, -sy * cx,
-		              cx * sz,              cx * cz,             -sx,
-		              sy * cz + sxcy * sz, -sy * sz + sxcy * cz,  cy * cx);
-	}
-
-	Matrix Matrix::eulerRotate(float x, float y, float z)
-	{
-		return eulerRotate(Vector(x, y, z));
-	}
-
-	Matrix Matrix::translate(const Vector &v)
-	{
-		return Matrix(1, 0, 0, v.x,
-		              0, 1, 0, v.y,
-		              0, 0, 1, v.z,
-		              0, 0, 0, 1);
-	}
-
-	Matrix Matrix::translate(float x, float y, float z)
-	{
-		return translate(Vector(x, y, z));
-	}
-
-	Matrix Matrix::scale(const Vector &v)
-	{
-		return Matrix(v.x, 0,   0,
-		              0,   v.y, 0,
-		              0,   0,   v.z);
-	}
-
-	Matrix Matrix::scale(float x, float y, float z)
-	{
-		return scale(Vector(x, y, z));
-	}
-
-	Matrix Matrix::lookAt(const Vector &v)
-	{
-		Vector y = v;
-		y /= Vector::N(y);
-
-		Vector x = y % Vector(0, 0, 1);
-		x /= Vector::N(x);
-
-		Vector z = x % y;
-		z /= Vector::N(z);
-
-		return ~Matrix(x, y, z);
-	}
-
-	Matrix Matrix::lookAt(float x, float y, float z)
-	{
-		return translate(Vector(x, y, z));
-	}
+	return Matrix(m11, 0,   0,   0,
+	              0,   m22, 0,   0,
+	              0,   0,   m33, 0,
+	              0,   0,   0,   m44);
 }
+
+Matrix::operator float*()
+{
+	return &(*this)(1, 1);
+}
+
+Matrix Matrix::operator+() const
+{
+	return *this;
+}
+
+Matrix Matrix::operator-() const
+{
+	const Matrix &M = *this;
+
+	return Matrix(-M(1, 1), -M(1, 2), -M(1, 3), -M(1, 4), 
+	              -M(2, 1), -M(2, 2), -M(2, 3), -M(2, 4), 
+	              -M(3, 1), -M(3, 2), -M(3, 3), -M(3, 4), 
+	              -M(4, 1), -M(4, 2), -M(4, 3), -M(4, 4));
+}
+
+Matrix Matrix::operator!() const
+{
+	const Matrix &M = *this;
+	Matrix I;
+
+	float M3344 = M(3, 3) * M(4, 4) - M(4, 3) * M(3, 4);
+	float M2344 = M(2, 3) * M(4, 4) - M(4, 3) * M(2, 4);
+	float M2334 = M(2, 3) * M(3, 4) - M(3, 3) * M(2, 4);
+	float M3244 = M(3, 2) * M(4, 4) - M(4, 2) * M(3, 4);
+	float M2244 = M(2, 2) * M(4, 4) - M(4, 2) * M(2, 4);
+	float M2234 = M(2, 2) * M(3, 4) - M(3, 2) * M(2, 4);
+	float M3243 = M(3, 2) * M(4, 3) - M(4, 2) * M(3, 3);
+	float M2243 = M(2, 2) * M(4, 3) - M(4, 2) * M(2, 3);
+	float M2233 = M(2, 2) * M(3, 3) - M(3, 2) * M(2, 3);
+	float M1344 = M(1, 3) * M(4, 4) - M(4, 3) * M(1, 4);
+	float M1334 = M(1, 3) * M(3, 4) - M(3, 3) * M(1, 4);
+	float M1244 = M(1, 2) * M(4, 4) - M(4, 2) * M(1, 4);
+	float M1234 = M(1, 2) * M(3, 4) - M(3, 2) * M(1, 4);
+	float M1243 = M(1, 2) * M(4, 3) - M(4, 2) * M(1, 3);
+	float M1233 = M(1, 2) * M(3, 3) - M(3, 2) * M(1, 3);
+	float M1324 = M(1, 3) * M(2, 4) - M(2, 3) * M(1, 4);
+	float M1224 = M(1, 2) * M(2, 4) - M(2, 2) * M(1, 4);
+	float M1223 = M(1, 2) * M(2, 3) - M(2, 2) * M(1, 3);
+
+	// Adjoint Matrix
+	I(1, 1) =  M(2, 2) * M3344 - M(3, 2) * M2344 + M(4, 2) * M2334;
+	I(2, 1) = -M(2, 1) * M3344 + M(3, 1) * M2344 - M(4, 1) * M2334;
+	I(3, 1) =  M(2, 1) * M3244 - M(3, 1) * M2244 + M(4, 1) * M2234;
+	I(4, 1) = -M(2, 1) * M3243 + M(3, 1) * M2243 - M(4, 1) * M2233;
+
+	I(1, 2) = -M(1, 2) * M3344 + M(3, 2) * M1344 - M(4, 2) * M1334;
+	I(2, 2) =  M(1, 1) * M3344 - M(3, 1) * M1344 + M(4, 1) * M1334;
+	I(3, 2) = -M(1, 1) * M3244 + M(3, 1) * M1244 - M(4, 1) * M1234;
+	I(4, 2) =  M(1, 1) * M3243 - M(3, 1) * M1243 + M(4, 1) * M1233;
+
+	I(1, 3) =  M(1, 2) * M2344 - M(2, 2) * M1344 + M(4, 2) * M1324;
+	I(2, 3) = -M(1, 1) * M2344 + M(2, 1) * M1344 - M(4, 1) * M1324;
+	I(3, 3) =  M(1, 1) * M2244 - M(2, 1) * M1244 + M(4, 1) * M1224;
+	I(4, 3) = -M(1, 1) * M2243 + M(2, 1) * M1243 - M(4, 1) * M1223;
+
+	I(1, 4) = -M(1, 2) * M2334 + M(2, 2) * M1334 - M(3, 2) * M1324;
+	I(2, 4) =  M(1, 1) * M2334 - M(2, 1) * M1334 + M(3, 1) * M1324;
+	I(3, 4) = -M(1, 1) * M2234 + M(2, 1) * M1234 - M(3, 1) * M1224;
+	I(4, 4) =  M(1, 1) * M2233 - M(2, 1) * M1233 + M(3, 1) * M1223;
+
+	// Division by determinant
+	I /= M(1, 1) * I(1, 1) +
+	     M(2, 1) * I(1, 2) +
+	     M(3, 1) * I(1, 3) +
+	     M(4, 1) * I(1, 4);
+
+	return I;
+}
+
+Matrix Matrix::operator~() const
+{
+	const Matrix &M = *this;
+
+	return Matrix(M(1, 1), M(2, 1), M(3, 1), M(4, 1), 
+	              M(1, 2), M(2, 2), M(3, 2), M(4, 2), 
+	              M(1, 3), M(2, 3), M(3, 3), M(4, 3), 
+	              M(1, 4), M(2, 4), M(3, 4), M(4, 4));
+}
+
+Matrix &Matrix::operator+=(const Matrix &N)
+{
+	Matrix &M = *this;
+
+	M(1, 1) += N(1, 1); M(1, 2) += N(1, 2); M(1, 3) += N(1, 3); M(1, 4) += N(1, 4);
+	M(2, 1) += N(2, 1); M(2, 2) += N(2, 2); M(2, 3) += N(2, 3); M(2, 4) += N(2, 4);
+	M(3, 1) += N(3, 1); M(3, 2) += N(3, 2); M(3, 3) += N(3, 3); M(3, 4) += N(3, 4);
+	M(4, 1) += N(4, 1); M(4, 2) += N(4, 2); M(4, 3) += N(4, 3); M(4, 4) += N(4, 4);
+
+	return M;
+}
+
+Matrix &Matrix::operator-=(const Matrix &N)
+{
+	Matrix &M = *this;
+
+	M(1, 1) -= N(1, 1); M(1, 2) -= N(1, 2); M(1, 3) -= N(1, 3); M(1, 4) -= N(1, 4);
+	M(2, 1) -= N(2, 1); M(2, 2) -= N(2, 2); M(2, 3) -= N(2, 3); M(2, 4) -= N(2, 4);
+	M(3, 1) -= N(3, 1); M(3, 2) -= N(3, 2); M(3, 3) -= N(3, 3); M(3, 4) -= N(3, 4);
+	M(4, 1) -= N(4, 1); M(4, 2) -= N(4, 2); M(4, 3) -= N(4, 3); M(4, 4) -= N(4, 4);
+
+	return M;
+}
+
+Matrix &Matrix::operator*=(float s)
+{
+	Matrix &M = *this;
+
+	M(1, 1) *= s; M(1, 2) *= s; M(1, 3) *= s; M(1, 4) *= s;
+	M(2, 1) *= s; M(2, 2) *= s; M(2, 3) *= s; M(2, 4) *= s;
+	M(3, 1) *= s; M(3, 2) *= s; M(3, 3) *= s; M(3, 4) *= s;
+	M(4, 1) *= s; M(4, 2) *= s; M(4, 3) *= s; M(4, 4) *= s;
+
+	return M;
+}
+
+Matrix &Matrix::operator*=(const Matrix &M)
+{
+	return *this = *this * M;
+}
+
+Matrix &Matrix::operator/=(float s)
+{
+	float r = 1.0f / s;
+
+	return *this *= r;
+}
+
+bool operator==(const Matrix &M, const Matrix &N)
+{
+	if(M(1, 1) == N(1, 1) && M(1, 2) == N(1, 2) && M(1, 3) == N(1, 3) && M(1, 4) == N(1, 4) &&
+	   M(2, 1) == N(2, 1) && M(2, 2) == N(2, 2) && M(2, 3) == N(2, 3) && M(2, 4) == N(2, 4) &&
+	   M(3, 1) == N(3, 1) && M(3, 2) == N(3, 2) && M(3, 3) == N(3, 3) && M(3, 4) == N(3, 4) &&
+	   M(4, 1) == N(4, 1) && M(4, 2) == N(4, 2) && M(4, 3) == N(4, 3) && M(4, 4) == N(4, 4))
+		return true;
+	else
+		return false;
+}
+
+bool operator!=(const Matrix &M, const Matrix &N)
+{
+	if(M(1, 1) != N(1, 1) || M(1, 2) != N(1, 2) || M(1, 3) != N(1, 3) || M(1, 4) != N(1, 4) ||
+	   M(2, 1) != N(2, 1) || M(2, 2) != N(2, 2) || M(2, 3) != N(2, 3) || M(2, 4) != N(2, 4) ||
+	   M(3, 1) != N(3, 1) || M(3, 2) != N(3, 2) || M(3, 3) != N(3, 3) || M(3, 4) != N(3, 4) ||
+	   M(4, 1) != N(4, 1) || M(4, 2) != N(4, 2) || M(4, 3) != N(4, 3) || M(4, 4) != N(4, 4))
+		return true;
+	else
+		return false;
+}
+
+Matrix operator+(const Matrix &M, const Matrix &N)
+{
+	return Matrix(M(1, 1) + N(1, 1), M(1, 2) + N(1, 2), M(1, 3) + N(1, 3), M(1, 4) + N(1, 4), 
+	              M(2, 1) + N(2, 1), M(2, 2) + N(2, 2), M(2, 3) + N(2, 3), M(2, 4) + N(2, 4), 
+	              M(3, 1) + N(3, 1), M(3, 2) + N(3, 2), M(3, 3) + N(3, 3), M(3, 4) + N(3, 4), 
+	              M(4, 1) + N(4, 1), M(4, 2) + N(4, 2), M(4, 3) + N(4, 3), M(4, 4) + N(4, 4));
+}
+
+Matrix operator-(const Matrix &M, const Matrix &N)
+{
+	return Matrix(M(1, 1) - N(1, 1), M(1, 2) - N(1, 2), M(1, 3) - N(1, 3), M(1, 4) - N(1, 4), 
+	              M(2, 1) - N(2, 1), M(2, 2) - N(2, 2), M(2, 3) - N(2, 3), M(2, 4) - N(2, 4), 
+	              M(3, 1) - N(3, 1), M(3, 2) - N(3, 2), M(3, 3) - N(3, 3), M(3, 4) - N(3, 4), 
+	              M(4, 1) - N(4, 1), M(4, 2) - N(4, 2), M(4, 3) - N(4, 3), M(4, 4) - N(4, 4));
+}
+
+Matrix operator*(float s, const Matrix &M)
+{
+	return Matrix(s * M(1, 1), s * M(1, 2), s * M(1, 3), s * M(1, 4), 
+	              s * M(2, 1), s * M(2, 2), s * M(2, 3), s * M(2, 4), 
+	              s * M(3, 1), s * M(3, 2), s * M(3, 3), s * M(3, 4), 
+	              s * M(4, 1), s * M(4, 2), s * M(4, 3), s * M(4, 4));
+}
+
+Matrix operator*(const Matrix &M, float s)
+{
+	return Matrix(M(1, 1) * s, M(1, 2) * s, M(1, 3) * s, M(1, 4) * s, 
+	              M(2, 1) * s, M(2, 2) * s, M(2, 3) * s, M(2, 4) * s, 
+	              M(3, 1) * s, M(3, 2) * s, M(3, 3) * s, M(3, 4) * s, 
+	              M(4, 1) * s, M(4, 2) * s, M(4, 3) * s, M(4, 4) * s);
+}
+
+Matrix operator*(const Matrix &M, const Matrix &N)
+{
+	return Matrix(M(1, 1) * N(1, 1) + M(1, 2) * N(2, 1) + M(1, 3) * N(3, 1) + M(1, 4) * N(4, 1), M(1, 1) * N(1, 2) + M(1, 2) * N(2, 2) + M(1, 3) * N(3, 2) + M(1, 4) * N(4, 2), M(1, 1) * N(1, 3) + M(1, 2) * N(2, 3) + M(1, 3) * N(3, 3) + M(1, 4) * N(4, 3), M(1, 1) * N(1, 4) + M(1, 2) * N(2, 4) + M(1, 3) * N(3, 4) + M(1, 4) * N(4, 4), 
+	              M(2, 1) * N(1, 1) + M(2, 2) * N(2, 1) + M(2, 3) * N(3, 1) + M(2, 4) * N(4, 1), M(2, 1) * N(1, 2) + M(2, 2) * N(2, 2) + M(2, 3) * N(3, 2) + M(2, 4) * N(4, 2), M(2, 1) * N(1, 3) + M(2, 2) * N(2, 3) + M(2, 3) * N(3, 3) + M(2, 4) * N(4, 3), M(2, 1) * N(1, 4) + M(2, 2) * N(2, 4) + M(2, 3) * N(3, 4) + M(2, 4) * N(4, 4), 
+	              M(3, 1) * N(1, 1) + M(3, 2) * N(2, 1) + M(3, 3) * N(3, 1) + M(3, 4) * N(4, 1), M(3, 1) * N(1, 2) + M(3, 2) * N(2, 2) + M(3, 3) * N(3, 2) + M(3, 4) * N(4, 2), M(3, 1) * N(1, 3) + M(3, 2) * N(2, 3) + M(3, 3) * N(3, 3) + M(3, 4) * N(4, 3), M(3, 1) * N(1, 4) + M(3, 2) * N(2, 4) + M(3, 3) * N(3, 4) + M(3, 4) * N(4, 4), 
+	              M(4, 1) * N(1, 1) + M(4, 2) * N(2, 1) + M(4, 3) * N(3, 1) + M(4, 4) * N(4, 1), M(4, 1) * N(1, 2) + M(4, 2) * N(2, 2) + M(4, 3) * N(3, 2) + M(4, 4) * N(4, 2), M(4, 1) * N(1, 3) + M(4, 2) * N(2, 3) + M(4, 3) * N(3, 3) + M(4, 4) * N(4, 3), M(4, 1) * N(1, 4) + M(4, 2) * N(2, 4) + M(4, 3) * N(3, 4) + M(4, 4) * N(4, 4));
+}
+
+Matrix operator/(const Matrix &M, float s)
+{
+	float r = 1.0f / s;
+
+	return M * r;
+}
+
+float4 Matrix::operator*(const float4 &v) const
+{
+	const Matrix &M = *this;
+	float Mx = M(1, 1) * v.x + M(1, 2) * v.y + M(1, 3) * v.z + M(1, 4) * v.w;
+	float My = M(2, 1) * v.x + M(2, 2) * v.y + M(2, 3) * v.z + M(2, 4) * v.w;
+	float Mz = M(3, 1) * v.x + M(3, 2) * v.y + M(3, 3) * v.z + M(3, 4) * v.w;
+	float Mw = M(4, 1) * v.x + M(4, 2) * v.y + M(4, 3) * v.z + M(4, 4) * v.w;
+
+	return {Mx, My, Mz, Mw};
+}
+
+float Matrix::det(const Matrix &M)
+{
+	float M3344 = M(3, 3) * M(4, 4) - M(4, 3) * M(3, 4);
+	float M2344 = M(2, 3) * M(4, 4) - M(4, 3) * M(2, 4);
+	float M2334 = M(2, 3) * M(3, 4) - M(3, 3) * M(2, 4);
+	float M1344 = M(1, 3) * M(4, 4) - M(4, 3) * M(1, 4);
+	float M1334 = M(1, 3) * M(3, 4) - M(3, 3) * M(1, 4);
+	float M1324 = M(1, 3) * M(2, 4) - M(2, 3) * M(1, 4);
+
+	return M(1, 1) * (M(2, 2) * M3344 - M(3, 2) * M2344 + M(4, 2) * M2334) -
+	       M(2, 1) * (M(1, 2) * M3344 - M(3, 2) * M1344 + M(4, 2) * M1334) +
+	       M(3, 1) * (M(1, 2) * M2344 - M(2, 2) * M1344 + M(4, 2) * M1324) -
+	       M(4, 1) * (M(1, 2) * M2334 - M(2, 2) * M1334 + M(3, 2) * M1324);
+}
+
+float Matrix::det(float m11)
+{
+	return m11;
+}
+
+float Matrix::det(float m11, float m12, 
+                  float m21, float m22)
+{
+	return m11 * m22 - m12 * m21; 
+}
+
+float Matrix::det(float m11, float m12, float m13, 
+                  float m21, float m22, float m23, 
+                  float m31, float m32, float m33)
+{
+	return m11 * (m22 * m33 - m32 * m23) -
+	       m21 * (m12 * m33 - m32 * m13) +
+	       m31 * (m12 * m23 - m22 * m13);
+}
+
+float Matrix::det(float m11, float m12, float m13, float m14, 
+                  float m21, float m22, float m23, float m24, 
+                  float m31, float m32, float m33, float m34, 
+                  float m41, float m42, float m43, float m44)
+{
+	float M3344 = m33 * m44 - m43 * m34;
+	float M2344 = m23 * m44 - m43 * m24;
+	float M2334 = m23 * m34 - m33 * m24;
+	float M1344 = m13 * m44 - m43 * m14;
+	float M1334 = m13 * m34 - m33 * m14;
+	float M1324 = m13 * m24 - m23 * m14;
+
+	return m11 * (m22 * M3344 - m32 * M2344 + m42 * M2334) -
+	       m21 * (m12 * M3344 - m32 * M1344 + m42 * M1334) +
+	       m31 * (m12 * M2344 - m22 * M1344 + m42 * M1324) -
+	       m41 * (m12 * M2334 - m22 * M1334 + m32 * M1324);
+}
+
+float Matrix::det(const Vector &v1, const Vector &v2, const Vector &v3)
+{
+	return v1 * (v2 % v3);
+}
+
+float Matrix::det3(const Matrix &M)
+{
+	return M(1, 1) * (M(2, 2) * M(3, 3) - M(3, 2) * M(2, 3)) -
+	       M(2, 1) * (M(1, 2) * M(3, 3) - M(3, 2) * M(1, 3)) +
+	       M(3, 1) * (M(1, 2) * M(2, 3) - M(2, 2) * M(1, 3));
+}
+
+float Matrix::tr(const Matrix &M)
+{
+	return M(1, 1) + M(2, 2) + M(3, 3) + M(4, 4);
+}
+
+Matrix &Matrix::orthogonalise()
+{
+	// NOTE: Numerically unstable; won't return exactly the same result when the matrix is already orthogonal
+
+	Matrix &M = *this;
+
+	Vector v1(M(1, 1), M(2, 1), M(3, 1));
+	Vector v2(M(1, 2), M(2, 2), M(3, 2));
+	Vector v3(M(1, 3), M(2, 3), M(3, 3));
+
+	v2 -= v1 * (v1 * v2) / (v1 * v1);
+	v3 -= v1 * (v1 * v3) / (v1 * v1);
+	v3 -= v2 * (v2 * v3) / (v2 * v2);
+
+	v1 /= Vector::N(v1);
+	v2 /= Vector::N(v2);
+	v3 /= Vector::N(v3);
+
+	M(1, 1) = v1.x;  M(1, 2) = v2.x;  M(1, 3) = v3.x;
+	M(2, 1) = v1.y;  M(2, 2) = v2.y;  M(2, 3) = v3.y;
+	M(3, 1) = v1.z;  M(3, 2) = v2.z;  M(3, 3) = v3.z;
+
+	return *this;
+}
+
+Matrix Matrix::eulerRotate(const Vector &v)
+{
+	float cz = cos(v.z);
+	float sz = sin(v.z);
+	float cx = cos(v.x);
+	float sx = sin(v.x);
+	float cy = cos(v.y);
+	float sy = sin(v.y);
+
+	float sxsy = sx * sy;
+	float sxcy = sx * cy;
+
+	return Matrix(cy * cz - sxsy * sz, -cy * sz - sxsy * cz, -sy * cx,
+	              cx * sz,              cx * cz,             -sx,
+	              sy * cz + sxcy * sz, -sy * sz + sxcy * cz,  cy * cx);
+}
+
+Matrix Matrix::eulerRotate(float x, float y, float z)
+{
+	return eulerRotate(Vector(x, y, z));
+}
+
+Matrix Matrix::translate(const Vector &v)
+{
+	return Matrix(1, 0, 0, v.x,
+	              0, 1, 0, v.y,
+	              0, 0, 1, v.z,
+	              0, 0, 0, 1);
+}
+
+Matrix Matrix::translate(float x, float y, float z)
+{
+	return translate(Vector(x, y, z));
+}
+
+Matrix Matrix::scale(const Vector &v)
+{
+	return Matrix(v.x, 0,   0,
+	              0,   v.y, 0,
+	              0,   0,   v.z);
+}
+
+Matrix Matrix::scale(float x, float y, float z)
+{
+	return scale(Vector(x, y, z));
+}
+
+Matrix Matrix::lookAt(const Vector &v)
+{
+	Vector y = v;
+	y /= Vector::N(y);
+
+	Vector x = y % Vector(0, 0, 1);
+	x /= Vector::N(x);
+
+	Vector z = x % y;
+	z /= Vector::N(z);
+
+	return ~Matrix(x, y, z);
+}
+
+Matrix Matrix::lookAt(float x, float y, float z)
+{
+	return lookAt(Vector(x, y, z));
+}
+
+}  // namespace sw
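
For reference, operator!() above implements the adjugate (classical adjoint) formula for the inverse,

    $M^{-1} = \frac{\operatorname{adj}(M)}{\det M}, \qquad \det M = \sum_{i=1}^{4} M(i,1)\,\operatorname{adj}(M)(1,i),$

so the final I /= ... statement is a cofactor expansion of the determinant along the first column, reusing the adjugate entries just computed. A singular matrix (det M = 0) results in a division by zero; callers are expected to pass invertible matrices.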
diff --git a/src/Device/Matrix.hpp b/src/Device/Matrix.hpp
index 41281a6..e4f5ecc 100644
--- a/src/Device/Matrix.hpp
+++ b/src/Device/Matrix.hpp
@@ -15,203 +15,204 @@
 #ifndef Matrix_hpp
 #define Matrix_hpp
 
-namespace sw
+namespace sw {
+
+struct Vector;
+struct Point;
+struct float4;
+
+struct Matrix
 {
-	struct Vector;
-	struct Point;
-	struct float4;
+	Matrix();
+	Matrix(const int i);
+	Matrix(const float m[16]);
+	Matrix(const float m[4][4]);
+	Matrix(float m11, float m12, float m13,
+	       float m21, float m22, float m23,
+	       float m31, float m32, float m33);
+	Matrix(float m11, float m12, float m13, float m14,
+	       float m21, float m22, float m23, float m24,
+	       float m31, float m32, float m33, float m34,
+	       float m41, float m42, float m43, float m44);
+	Matrix(const Vector &v1, const Vector &v2, const Vector &v3);   // Column vectors
 
-	struct Matrix
-	{
-		Matrix();
-		Matrix(const int i);
-		Matrix(const float m[16]);
-		Matrix(const float m[4][4]);
-		Matrix(float m11, float m12, float m13,
-		       float m21, float m22, float m23,
-		       float m31, float m32, float m33);
-		Matrix(float m11, float m12, float m13, float m14,
-		       float m21, float m22, float m23, float m24,
-		       float m31, float m32, float m33, float m34,
-		       float m41, float m42, float m43, float m44);
-		Matrix(const Vector &v1, const Vector &v2, const Vector &v3);   // Column vectors
+	Matrix &operator=(const Matrix &N);
 
-		Matrix &operator=(const Matrix &N);
+	// Row major order
+	float m[4][4];
 
-		// Row major order
-		float m[4][4];
+	static Matrix diag(float m11, float m22, float m33, float m44);
 
-		static Matrix diag(float m11, float m22, float m33, float m44);
+	operator float*();
 
-		operator float*();
+	Matrix operator+() const;
+	Matrix operator-() const;
 
-		Matrix operator+() const;
-		Matrix operator-() const;
+	Matrix operator!() const;   // Inverse
+	Matrix operator~() const;   // Transpose
 
-		Matrix operator!() const;   // Inverse
-		Matrix operator~() const;   // Transpose
+	Matrix &operator+=(const Matrix &N);
+	Matrix &operator-=(const Matrix &N);
+	Matrix &operator*=(float s);
+	Matrix &operator*=(const Matrix &N);
+	Matrix &operator/=(float s);
 
-		Matrix &operator+=(const Matrix &N);
-		Matrix &operator-=(const Matrix &N);
-		Matrix &operator*=(float s);
-		Matrix &operator*=(const Matrix &N);
-		Matrix &operator/=(float s);
+	float *operator[](int i);   // Access element [row][col], starting with [0][0]
+	const float *operator[](int i) const;
 
-		float *operator[](int i);   // Access element [row][col], starting with [0][0]
-		const float *operator[](int i) const;
+	float &operator()(int i, int j);   // Access element (row, col), starting with (1, 1)
+	const float &operator()(int i, int j) const;
 
-		float &operator()(int i, int j);   // Access element (row, col), starting with (1, 1)
-		const float &operator()(int i, int j) const;
+	friend bool operator==(const Matrix &M, const Matrix &N);
+	friend bool operator!=(const Matrix &M, const Matrix &N);
 
-		friend bool operator==(const Matrix &M, const Matrix &N);
-		friend bool operator!=(const Matrix &M, const Matrix &N);
+	friend Matrix operator+(const Matrix &M, const Matrix &N);
+	friend Matrix operator-(const Matrix &M, const Matrix &N);
+	friend Matrix operator*(float s, const Matrix &M);
+	friend Matrix operator*(const Matrix &M, const Matrix &N);
+	friend Matrix operator/(const Matrix &M, float s);
 
-		friend Matrix operator+(const Matrix &M, const Matrix &N);
-		friend Matrix operator-(const Matrix &M, const Matrix &N);
-		friend Matrix operator*(float s, const Matrix &M);
-		friend Matrix operator*(const Matrix &M, const Matrix &N);
-		friend Matrix operator/(const Matrix &M, float s);
+	float4 operator*(const float4 &v) const;
 
-		float4 operator*(const float4 &v) const;
+	static float det(const Matrix &M);
+	static float det(float m11);
+	static float det(float m11, float m12,
+	                 float m21, float m22);
+	static float det(float m11, float m12, float m13,
+	                 float m21, float m22, float m23,
+	                 float m31, float m32, float m33);
+	static float det(float m11, float m12, float m13, float m14,
+	                 float m21, float m22, float m23, float m24,
+	                 float m31, float m32, float m33, float m34,
+	                 float m41, float m42, float m43, float m44);
+	static float det(const Vector &v1, const Vector &v2, const Vector &v3);
+	static float det3(const Matrix &M);
 
-		static float det(const Matrix &M);
-		static float det(float m11);
-		static float det(float m11, float m12,
-		                 float m21, float m22);
-		static float det(float m11, float m12, float m13,
-		                 float m21, float m22, float m23,
-		                 float m31, float m32, float m33);
-		static float det(float m11, float m12, float m13, float m14,
-		                 float m21, float m22, float m23, float m24,
-		                 float m31, float m32, float m33, float m34,
-		                 float m41, float m42, float m43, float m44);
-		static float det(const Vector &v1, const Vector &v2, const Vector &v3);
-		static float det3(const Matrix &M);
+	static float tr(const Matrix &M);
 
-		static float tr(const Matrix &M);
+	Matrix &orthogonalise();   // Gram-Schmidt orthogonalisation of 3x3 submatrix
 
-		Matrix &orthogonalise();   // Gram-Schmidt orthogonalisation of 3x3 submatrix
+	static Matrix eulerRotate(const Vector &v);
+	static Matrix eulerRotate(float x, float y, float z);
 
-		static Matrix eulerRotate(const Vector &v);
-		static Matrix eulerRotate(float x, float y, float z);
+	static Matrix translate(const Vector &v);
+	static Matrix translate(float x, float y, float z);
 	
-		static Matrix translate(const Vector &v);
-		static Matrix translate(float x, float y, float z);
-		
-		static Matrix scale(const Vector &v);
-		static Matrix scale(float x, float y, float z);
+	static Matrix scale(const Vector &v);
+	static Matrix scale(float x, float y, float z);
 
-		static Matrix lookAt(const Vector &v);
-		static Matrix lookAt(float x, float y, float z);
-	};
+	static Matrix lookAt(const Vector &v);
+	static Matrix lookAt(float x, float y, float z);
+};
 }
 
 #include "Vector.hpp"
 
-namespace sw
+namespace sw {
+
+inline Matrix::Matrix()
 {
-	inline Matrix::Matrix()
-	{
-	}
-
-	inline Matrix::Matrix(const int i)
-	{
-		const float s = (float)i;
-
-		Matrix &M = *this;
-
-		M(1, 1) = s; M(1, 2) = 0; M(1, 3) = 0; M(1, 4) = 0;
-		M(2, 1) = 0; M(2, 2) = s; M(2, 3) = 0; M(2, 4) = 0;
-		M(3, 1) = 0; M(3, 2) = 0; M(3, 3) = s; M(3, 4) = 0;
-		M(4, 1) = 0; M(4, 2) = 0; M(4, 3) = 0; M(4, 4) = s;
-	}
-
-	inline Matrix::Matrix(const float m[16])
-	{
-		Matrix &M = *this;
-
-		M(1, 1) = m[0];  M(1, 2) = m[1];  M(1, 3) = m[2];  M(1, 4) = m[3];
-		M(2, 1) = m[4];  M(2, 2) = m[5];  M(2, 3) = m[6];  M(2, 4) = m[7];
-		M(3, 1) = m[8];  M(3, 2) = m[8];  M(3, 3) = m[10]; M(3, 4) = m[11];
-		M(4, 1) = m[12]; M(4, 2) = m[13]; M(4, 3) = m[14]; M(4, 4) = m[15];
-	}
-
-	inline Matrix::Matrix(const float m[4][4])
-	{
-		Matrix &M = *this;
-
-		M[0][0] = m[0][0];  M[0][1] = m[0][1];  M[0][2] = m[0][2];  M[0][3] = m[0][3];
-		M[1][0] = m[1][0];  M[1][1] = m[1][1];  M[1][2] = m[1][2];  M[1][3] = m[1][3];
-		M[2][0] = m[2][0];  M[2][1] = m[2][1];  M[2][2] = m[2][2];  M[2][3] = m[2][3];
-		M[3][0] = m[3][0];  M[3][1] = m[3][1];  M[3][2] = m[3][2];  M[3][3] = m[3][3];
-	}
-
-	inline Matrix::Matrix(float m11, float m12, float m13, 
-	                      float m21, float m22, float m23, 
-	                      float m31, float m32, float m33)
-	{
-		Matrix &M = *this;
-
-		M(1, 1) = m11; M(1, 2) = m12; M(1, 3) = m13; M(1, 4) = 0;
-		M(2, 1) = m21; M(2, 2) = m22; M(2, 3) = m23; M(2, 4) = 0;
-		M(3, 1) = m31; M(3, 2) = m32; M(3, 3) = m33; M(3, 4) = 0;
-		M(4, 1) = 0;   M(4, 2) = 0;   M(4, 3) = 0;   M(4, 4) = 1;
-	}
-
-	inline Matrix::Matrix(float m11, float m12, float m13, float m14, 
-	                      float m21, float m22, float m23, float m24, 
-	                      float m31, float m32, float m33, float m34, 
-	                      float m41, float m42, float m43, float m44)
-	{
-		Matrix &M = *this;
-
-		M(1, 1) = m11; M(1, 2) = m12; M(1, 3) = m13; M(1, 4) = m14;
-		M(2, 1) = m21; M(2, 2) = m22; M(2, 3) = m23; M(2, 4) = m24;
-		M(3, 1) = m31; M(3, 2) = m32; M(3, 3) = m33; M(3, 4) = m34;
-		M(4, 1) = m41; M(4, 2) = m42; M(4, 3) = m43; M(4, 4) = m44;
-	}
-
-	inline Matrix::Matrix(const Vector &v1, const Vector &v2, const Vector &v3)
-	{
-		Matrix &M = *this;
-
-		M(1, 1) = v1.x; M(1, 2) = v2.x; M(1, 3) = v3.x; M(1, 4) = 0;
-		M(2, 1) = v1.y; M(2, 2) = v2.y; M(2, 3) = v3.y; M(2, 4) = 0;
-		M(3, 1) = v1.z; M(3, 2) = v2.z; M(3, 3) = v3.z; M(3, 4) = 0;
-		M(4, 1) = 0;    M(4, 2) = 0;    M(4, 3) = 0;    M(4, 4) = 1;
-	}
-
-	inline Matrix &Matrix::operator=(const Matrix &N)
-	{
-		Matrix &M = *this;
-
-		M(1, 1) = N(1, 1); M(1, 2) = N(1, 2); M(1, 3) = N(1, 3); M(1, 4) = N(1, 4);
-		M(2, 1) = N(2, 1); M(2, 2) = N(2, 2); M(2, 3) = N(2, 3); M(2, 4) = N(2, 4);
-		M(3, 1) = N(3, 1); M(3, 2) = N(3, 2); M(3, 3) = N(3, 3); M(3, 4) = N(3, 4);
-		M(4, 1) = N(4, 1); M(4, 2) = N(4, 2); M(4, 3) = N(4, 3); M(4, 4) = N(4, 4);
-
-		return M;
-	}
-
-	inline float *Matrix::operator[](int i)
-	{
-		return m[i];
-	}
-
-	inline const float *Matrix::operator[](int i) const
-	{
-		return m[i];
-	}
-
-	inline float &Matrix::operator()(int i, int j)
-	{
-		return m[i - 1][j - 1];
-	}
-
-	inline const float &Matrix::operator()(int i, int j) const
-	{
-		return m[i - 1][j - 1];
-	}
 }
 
+inline Matrix::Matrix(const int i)
+{
+	const float s = (float)i;
+
+	Matrix &M = *this;
+
+	M(1, 1) = s; M(1, 2) = 0; M(1, 3) = 0; M(1, 4) = 0;
+	M(2, 1) = 0; M(2, 2) = s; M(2, 3) = 0; M(2, 4) = 0;
+	M(3, 1) = 0; M(3, 2) = 0; M(3, 3) = s; M(3, 4) = 0;
+	M(4, 1) = 0; M(4, 2) = 0; M(4, 3) = 0; M(4, 4) = s;
+}
+
+inline Matrix::Matrix(const float m[16])
+{
+	Matrix &M = *this;
+
+	M(1, 1) = m[0];  M(1, 2) = m[1];  M(1, 3) = m[2];  M(1, 4) = m[3];
+	M(2, 1) = m[4];  M(2, 2) = m[5];  M(2, 3) = m[6];  M(2, 4) = m[7];
+	M(3, 1) = m[8];  M(3, 2) = m[9];  M(3, 3) = m[10]; M(3, 4) = m[11];
+	M(4, 1) = m[12]; M(4, 2) = m[13]; M(4, 3) = m[14]; M(4, 4) = m[15];
+}
+
+inline Matrix::Matrix(const float m[4][4])
+{
+	Matrix &M = *this;
+
+	M[0][0] = m[0][0];  M[0][1] = m[0][1];  M[0][2] = m[0][2];  M[0][3] = m[0][3];
+	M[1][0] = m[1][0];  M[1][1] = m[1][1];  M[1][2] = m[1][2];  M[1][3] = m[1][3];
+	M[2][0] = m[2][0];  M[2][1] = m[2][1];  M[2][2] = m[2][2];  M[2][3] = m[2][3];
+	M[3][0] = m[3][0];  M[3][1] = m[3][1];  M[3][2] = m[3][2];  M[3][3] = m[3][3];
+}
+
+inline Matrix::Matrix(float m11, float m12, float m13, 
+                      float m21, float m22, float m23, 
+                      float m31, float m32, float m33)
+{
+	Matrix &M = *this;
+
+	M(1, 1) = m11; M(1, 2) = m12; M(1, 3) = m13; M(1, 4) = 0;
+	M(2, 1) = m21; M(2, 2) = m22; M(2, 3) = m23; M(2, 4) = 0;
+	M(3, 1) = m31; M(3, 2) = m32; M(3, 3) = m33; M(3, 4) = 0;
+	M(4, 1) = 0;   M(4, 2) = 0;   M(4, 3) = 0;   M(4, 4) = 1;
+}
+
+inline Matrix::Matrix(float m11, float m12, float m13, float m14, 
+                      float m21, float m22, float m23, float m24, 
+                      float m31, float m32, float m33, float m34, 
+                      float m41, float m42, float m43, float m44)
+{
+	Matrix &M = *this;
+
+	M(1, 1) = m11; M(1, 2) = m12; M(1, 3) = m13; M(1, 4) = m14;
+	M(2, 1) = m21; M(2, 2) = m22; M(2, 3) = m23; M(2, 4) = m24;
+	M(3, 1) = m31; M(3, 2) = m32; M(3, 3) = m33; M(3, 4) = m34;
+	M(4, 1) = m41; M(4, 2) = m42; M(4, 3) = m43; M(4, 4) = m44;
+}
+
+inline Matrix::Matrix(const Vector &v1, const Vector &v2, const Vector &v3)
+{
+	Matrix &M = *this;
+
+	M(1, 1) = v1.x; M(1, 2) = v2.x; M(1, 3) = v3.x; M(1, 4) = 0;
+	M(2, 1) = v1.y; M(2, 2) = v2.y; M(2, 3) = v3.y; M(2, 4) = 0;
+	M(3, 1) = v1.z; M(3, 2) = v2.z; M(3, 3) = v3.z; M(3, 4) = 0;
+	M(4, 1) = 0;    M(4, 2) = 0;    M(4, 3) = 0;    M(4, 4) = 1;
+}
+
+inline Matrix &Matrix::operator=(const Matrix &N)
+{
+	Matrix &M = *this;
+
+	M(1, 1) = N(1, 1); M(1, 2) = N(1, 2); M(1, 3) = N(1, 3); M(1, 4) = N(1, 4);
+	M(2, 1) = N(2, 1); M(2, 2) = N(2, 2); M(2, 3) = N(2, 3); M(2, 4) = N(2, 4);
+	M(3, 1) = N(3, 1); M(3, 2) = N(3, 2); M(3, 3) = N(3, 3); M(3, 4) = N(3, 4);
+	M(4, 1) = N(4, 1); M(4, 2) = N(4, 2); M(4, 3) = N(4, 3); M(4, 4) = N(4, 4);
+
+	return M;
+}
+
+inline float *Matrix::operator[](int i)
+{
+	return m[i];
+}
+
+inline const float *Matrix::operator[](int i) const
+{
+	return m[i];
+}
+
+inline float &Matrix::operator()(int i, int j)
+{
+	return m[i - 1][j - 1];
+}
+
+inline const float &Matrix::operator()(int i, int j) const
+{
+	return m[i - 1][j - 1];
+}
+
+}  // namespace sw
+
 #endif   // Matrix_hpp
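
A small sketch of the two accessors defined above (the function name is illustrative): operator[] gives zero-based row pointers while operator() uses one-based (row, column) indices, both over the same row-major storage.

#include <cassert>

void matrixIndexingExample()
{
	sw::Matrix M = sw::Matrix::diag(1.0f, 2.0f, 3.0f, 4.0f);

	float a = M(2, 2);             // 2.0f: one-based (row, column)
	float b = M[1][1];             // 2.0f: the same element, zero-based
	assert(&M(2, 2) == &M[1][1]);  // both views alias the same storage
	(void)a; (void)b;
}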
diff --git a/src/Device/Memset.hpp b/src/Device/Memset.hpp
index 8c015fa..9db5d47 100644
--- a/src/Device/Memset.hpp
+++ b/src/Device/Memset.hpp
@@ -18,35 +18,35 @@
 #include <cstring>
 #include <type_traits>
 
-namespace sw
+namespace sw {
+
+// Helper class for clearing the memory of objects at construction.
+// Useful as the first base class of cache keys which may contain padding
+// bytes or bits otherwise left uninitialized.
+template<class T>
+struct Memset
 {
-	// Helper class for clearing the memory of objects at construction.
-	// Useful as the first base class of cache keys which may contain padding
-	// bytes or bits otherwise left uninitialized.
-	template<class T>
-	struct Memset
+	Memset(T *object, int val)
 	{
-		Memset(T *object, int val)
-		{
-			static_assert(std::is_base_of<Memset<T>, T>::value, "Memset<T> must only clear the memory of a type of which it is a base class");
+		static_assert(std::is_base_of<Memset<T>, T>::value, "Memset<T> must only clear the memory of a type of which it is a base class");
 
-			// GCC 8+ warns that
-			// "‘void* memset(void*, int, size_t)’ clearing an object of non-trivial type ‘T’;
-			//  use assignment or value-initialization instead [-Werror=class-memaccess]"
-			// This is benign iff it happens before any of the base or member constructrs are called.
-			#if defined(__GNUC__) && (__GNUC__ >= 8)
-			#pragma GCC diagnostic push
-			#pragma GCC diagnostic ignored "-Wclass-memaccess"
-			#endif
+		// GCC 8+ warns that
+		// "‘void* memset(void*, int, size_t)’ clearing an object of non-trivial type ‘T’;
+		//  use assignment or value-initialization instead [-Werror=class-memaccess]"
+		// This is benign iff it happens before any of the base or member constructors are called.
+		#if defined(__GNUC__) && (__GNUC__ >= 8)
+		#pragma GCC diagnostic push
+		#pragma GCC diagnostic ignored "-Wclass-memaccess"
+		#endif
 
-			memset(object, 0, sizeof(T));
+		memset(object, 0, sizeof(T));
 
-			#if defined(__GNUC__) && (__GNUC__ >= 8)
-			#pragma GCC diagnostic pop
-			#endif
-		}
-	};
+		#if defined(__GNUC__) && (__GNUC__ >= 8)
+		#pragma GCC diagnostic pop
+		#endif
+	}
+};
 
-}
+}  // namespace sw
 
 #endif   // sw_Memset_hpp
\ No newline at end of file
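
The intended usage, sketched with a hypothetical cache key (Key and its members are made-up names; is_memcmparable comes from LRUCache.hpp): Memset<Key> must be the first base class so every byte of the object, padding included, is zeroed before the members are assigned, which is what makes a raw memcmp() comparison well-defined.

#include <cstring>

struct Key : sw::Memset<Key>
{
	Key(int width, bool srgb) : Memset(this, 0), width(width), srgb(srgb) {}

	bool operator==(const Key &other) const
	{
		static_assert(sw::is_memcmparable<Key>::value, "Cannot memcmp Key");
		return memcmp(this, &other, sizeof(Key)) == 0;
	}

	int width;
	bool srgb;   // the trailing padding bytes stay zeroed thanks to Memset
};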
diff --git a/src/Device/PixelProcessor.cpp b/src/Device/PixelProcessor.cpp
index 26731a2..a1e8df1 100644
--- a/src/Device/PixelProcessor.cpp
+++ b/src/Device/PixelProcessor.cpp
@@ -22,211 +22,212 @@
 
 #include <cstring>
 
-namespace sw
+namespace sw {
+
+uint32_t PixelProcessor::States::computeHash()
 {
-	uint32_t PixelProcessor::States::computeHash()
+	uint32_t *state = reinterpret_cast<uint32_t*>(this);
+	uint32_t hash = 0;
+
+	for(unsigned int i = 0; i < sizeof(States) / sizeof(uint32_t); i++)
 	{
-		uint32_t *state = reinterpret_cast<uint32_t*>(this);
-		uint32_t hash = 0;
-
-		for(unsigned int i = 0; i < sizeof(States) / sizeof(uint32_t); i++)
-		{
-			hash ^= state[i];
-		}
-
-		return hash;
+		hash ^= state[i];
 	}
 
-	bool PixelProcessor::State::operator==(const State &state) const
-	{
-		if(hash != state.hash)
-		{
-			return false;
-		}
-
-		static_assert(is_memcmparable<State>::value, "Cannot memcmp State");
-		return memcmp(static_cast<const States*>(this), static_cast<const States*>(&state), sizeof(States)) == 0;
-	}
-
-	PixelProcessor::PixelProcessor()
-	{
-		routineCache = nullptr;
-		setRoutineCacheSize(1024);
-	}
-
-	PixelProcessor::~PixelProcessor()
-	{
-		delete routineCache;
-		routineCache = nullptr;
-	}
-
-	void PixelProcessor::setBlendConstant(const Color<float> &blendConstant)
-	{
-		// TODO(b/140935644): Compact into generic function, cheack if clamp is required
-		factor.blendConstant4W[0][0] =
-		factor.blendConstant4W[0][1] =
-		factor.blendConstant4W[0][2] =
-		factor.blendConstant4W[0][3] = static_cast<uint16_t>(iround(65535.0f * blendConstant.r));
-
-		factor.blendConstant4W[1][0] =
-		factor.blendConstant4W[1][1] =
-		factor.blendConstant4W[1][2] =
-		factor.blendConstant4W[1][3] = static_cast<uint16_t>(iround(65535.0f * blendConstant.g));
-
-		factor.blendConstant4W[2][0] =
-		factor.blendConstant4W[2][1] =
-		factor.blendConstant4W[2][2] =
-		factor.blendConstant4W[2][3] = static_cast<uint16_t>(iround(65535.0f * blendConstant.b));
-
-		factor.blendConstant4W[3][0] =
-		factor.blendConstant4W[3][1] =
-		factor.blendConstant4W[3][2] =
-		factor.blendConstant4W[3][3] = static_cast<uint16_t>(iround(65535.0f * blendConstant.a));
-
-		factor.invBlendConstant4W[0][0] =
-		factor.invBlendConstant4W[0][1] =
-		factor.invBlendConstant4W[0][2] =
-		factor.invBlendConstant4W[0][3] = 0xFFFFu - factor.blendConstant4W[0][0];
-
-		factor.invBlendConstant4W[1][0] =
-		factor.invBlendConstant4W[1][1] =
-		factor.invBlendConstant4W[1][2] =
-		factor.invBlendConstant4W[1][3] = 0xFFFFu - factor.blendConstant4W[1][0];
-
-		factor.invBlendConstant4W[2][0] =
-		factor.invBlendConstant4W[2][1] =
-		factor.invBlendConstant4W[2][2] =
-		factor.invBlendConstant4W[2][3] = 0xFFFFu - factor.blendConstant4W[2][0];
-
-		factor.invBlendConstant4W[3][0] =
-		factor.invBlendConstant4W[3][1] =
-		factor.invBlendConstant4W[3][2] =
-		factor.invBlendConstant4W[3][3] = 0xFFFFu - factor.blendConstant4W[3][0];
-
-		factor.blendConstant4F[0][0] =
-		factor.blendConstant4F[0][1] =
-		factor.blendConstant4F[0][2] =
-		factor.blendConstant4F[0][3] = blendConstant.r;
-
-		factor.blendConstant4F[1][0] =
-		factor.blendConstant4F[1][1] =
-		factor.blendConstant4F[1][2] =
-		factor.blendConstant4F[1][3] = blendConstant.g;
-
-		factor.blendConstant4F[2][0] =
-		factor.blendConstant4F[2][1] =
-		factor.blendConstant4F[2][2] =
-		factor.blendConstant4F[2][3] = blendConstant.b;
-
-		factor.blendConstant4F[3][0] =
-		factor.blendConstant4F[3][1] =
-		factor.blendConstant4F[3][2] =
-		factor.blendConstant4F[3][3] = blendConstant.a;
-
-		factor.invBlendConstant4F[0][0] =
-		factor.invBlendConstant4F[0][1] =
-		factor.invBlendConstant4F[0][2] =
-		factor.invBlendConstant4F[0][3] = 1 - blendConstant.r;
-
-		factor.invBlendConstant4F[1][0] =
-		factor.invBlendConstant4F[1][1] =
-		factor.invBlendConstant4F[1][2] =
-		factor.invBlendConstant4F[1][3] = 1 - blendConstant.g;
-
-		factor.invBlendConstant4F[2][0] =
-		factor.invBlendConstant4F[2][1] =
-		factor.invBlendConstant4F[2][2] =
-		factor.invBlendConstant4F[2][3] = 1 - blendConstant.b;
-
-		factor.invBlendConstant4F[3][0] =
-		factor.invBlendConstant4F[3][1] =
-		factor.invBlendConstant4F[3][2] =
-		factor.invBlendConstant4F[3][3] = 1 - blendConstant.a;
-	}
-
-	void PixelProcessor::setRoutineCacheSize(int cacheSize)
-	{
-		delete routineCache;
-		routineCache = new RoutineCacheType(clamp(cacheSize, 1, 65536));
-	}
-
-	const PixelProcessor::State PixelProcessor::update(const Context* context) const
-	{
-		State state;
-
-		state.numClipDistances = context->vertexShader->getNumOutputClipDistances();
-		state.numCullDistances = context->vertexShader->getNumOutputCullDistances();
-
-		if(context->pixelShader)
-		{
-			state.shaderID = context->pixelShader->getSerialID();
-		}
-		else
-		{
-			state.shaderID = 0;
-		}
-
-		state.alphaToCoverage = context->alphaToCoverage;
-		state.depthWriteEnable = context->depthWriteActive();
-
-		if(context->stencilActive())
-		{
-			state.stencilActive = true;
-			state.frontStencil = context->frontStencil;
-			state.backStencil = context->backStencil;
-		}
-
-		if(context->depthBufferActive())
-		{
-			state.depthTestActive = true;
-			state.depthCompareMode = context->depthCompareMode;
-			state.depthFormat = context->depthBuffer->getFormat();
-		}
-
-		state.occlusionEnabled = context->occlusionEnabled;
-		state.depthClamp = (context->depthBias != 0.0f) || (context->slopeDepthBias != 0.0f);
-
-		for(int i = 0; i < RENDERTARGETS; i++)
-		{
-			state.colorWriteMask |= context->colorWriteActive(i) << (4 * i);
-			state.targetFormat[i] = context->renderTargetInternalFormat(i);
-			state.blendState[i] = context->getBlendState(i);
-		}
-
-		state.multiSample = static_cast<unsigned int>(context->sampleCount);
-		state.multiSampleMask = context->multiSampleMask;
-		state.multiSampledBresenham = (state.multiSample > 1) && context->isDrawLine(true) &&
-		                              (context->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT);
-
-		if(state.multiSample > 1 && context->pixelShader)
-		{
-			state.centroid = context->pixelShader->getModes().NeedsCentroid;
-		}
-
-		state.frontFace = context->frontFace;
-
-		state.hash = state.computeHash();
-
-		return state;
-	}
-
-	PixelProcessor::RoutineType PixelProcessor::routine(const State &state,
-		vk::PipelineLayout const *pipelineLayout,
-		SpirvShader const *pixelShader,
-		const vk::DescriptorSet::Bindings &descriptorSets)
-	{
-		auto routine = routineCache->query(state);
-
-		if(!routine)
-		{
-			QuadRasterizer *generator = new PixelProgram(state, pipelineLayout, pixelShader, descriptorSets);
-			generator->generate();
-			routine = (*generator)("PixelRoutine_%0.8X", state.shaderID);
-			delete generator;
-
-			routineCache->add(state, routine);
-		}
-
-		return routine;
-	}
+	return hash;
 }
+
+bool PixelProcessor::State::operator==(const State &state) const
+{
+	if(hash != state.hash)
+	{
+		return false;
+	}
+
+	static_assert(is_memcmparable<State>::value, "Cannot memcmp State");
+	return memcmp(static_cast<const States*>(this), static_cast<const States*>(&state), sizeof(States)) == 0;
+}
+
+PixelProcessor::PixelProcessor()
+{
+	routineCache = nullptr;
+	setRoutineCacheSize(1024);
+}
+
+PixelProcessor::~PixelProcessor()
+{
+	delete routineCache;
+	routineCache = nullptr;
+}
+
+void PixelProcessor::setBlendConstant(const Color<float> &blendConstant)
+{
+	// TODO(b/140935644): Compact into generic function, check if clamp is required
+	factor.blendConstant4W[0][0] =
+	factor.blendConstant4W[0][1] =
+	factor.blendConstant4W[0][2] =
+	factor.blendConstant4W[0][3] = static_cast<uint16_t>(iround(65535.0f * blendConstant.r));
+
+	factor.blendConstant4W[1][0] =
+	factor.blendConstant4W[1][1] =
+	factor.blendConstant4W[1][2] =
+	factor.blendConstant4W[1][3] = static_cast<uint16_t>(iround(65535.0f * blendConstant.g));
+
+	factor.blendConstant4W[2][0] =
+	factor.blendConstant4W[2][1] =
+	factor.blendConstant4W[2][2] =
+	factor.blendConstant4W[2][3] = static_cast<uint16_t>(iround(65535.0f * blendConstant.b));
+
+	factor.blendConstant4W[3][0] =
+	factor.blendConstant4W[3][1] =
+	factor.blendConstant4W[3][2] =
+	factor.blendConstant4W[3][3] = static_cast<uint16_t>(iround(65535.0f * blendConstant.a));
+
+	factor.invBlendConstant4W[0][0] =
+	factor.invBlendConstant4W[0][1] =
+	factor.invBlendConstant4W[0][2] =
+	factor.invBlendConstant4W[0][3] = 0xFFFFu - factor.blendConstant4W[0][0];
+
+	factor.invBlendConstant4W[1][0] =
+	factor.invBlendConstant4W[1][1] =
+	factor.invBlendConstant4W[1][2] =
+	factor.invBlendConstant4W[1][3] = 0xFFFFu - factor.blendConstant4W[1][0];
+
+	factor.invBlendConstant4W[2][0] =
+	factor.invBlendConstant4W[2][1] =
+	factor.invBlendConstant4W[2][2] =
+	factor.invBlendConstant4W[2][3] = 0xFFFFu - factor.blendConstant4W[2][0];
+
+	factor.invBlendConstant4W[3][0] =
+	factor.invBlendConstant4W[3][1] =
+	factor.invBlendConstant4W[3][2] =
+	factor.invBlendConstant4W[3][3] = 0xFFFFu - factor.blendConstant4W[3][0];
+
+	factor.blendConstant4F[0][0] =
+	factor.blendConstant4F[0][1] =
+	factor.blendConstant4F[0][2] =
+	factor.blendConstant4F[0][3] = blendConstant.r;
+
+	factor.blendConstant4F[1][0] =
+	factor.blendConstant4F[1][1] =
+	factor.blendConstant4F[1][2] =
+	factor.blendConstant4F[1][3] = blendConstant.g;
+
+	factor.blendConstant4F[2][0] =
+	factor.blendConstant4F[2][1] =
+	factor.blendConstant4F[2][2] =
+	factor.blendConstant4F[2][3] = blendConstant.b;
+
+	factor.blendConstant4F[3][0] =
+	factor.blendConstant4F[3][1] =
+	factor.blendConstant4F[3][2] =
+	factor.blendConstant4F[3][3] = blendConstant.a;
+
+	factor.invBlendConstant4F[0][0] =
+	factor.invBlendConstant4F[0][1] =
+	factor.invBlendConstant4F[0][2] =
+	factor.invBlendConstant4F[0][3] = 1 - blendConstant.r;
+
+	factor.invBlendConstant4F[1][0] =
+	factor.invBlendConstant4F[1][1] =
+	factor.invBlendConstant4F[1][2] =
+	factor.invBlendConstant4F[1][3] = 1 - blendConstant.g;
+
+	factor.invBlendConstant4F[2][0] =
+	factor.invBlendConstant4F[2][1] =
+	factor.invBlendConstant4F[2][2] =
+	factor.invBlendConstant4F[2][3] = 1 - blendConstant.b;
+
+	factor.invBlendConstant4F[3][0] =
+	factor.invBlendConstant4F[3][1] =
+	factor.invBlendConstant4F[3][2] =
+	factor.invBlendConstant4F[3][3] = 1 - blendConstant.a;
+}
+
+void PixelProcessor::setRoutineCacheSize(int cacheSize)
+{
+	delete routineCache;
+	routineCache = new RoutineCacheType(clamp(cacheSize, 1, 65536));
+}
+
+const PixelProcessor::State PixelProcessor::update(const Context* context) const
+{
+	State state;
+
+	state.numClipDistances = context->vertexShader->getNumOutputClipDistances();
+	state.numCullDistances = context->vertexShader->getNumOutputCullDistances();
+
+	if(context->pixelShader)
+	{
+		state.shaderID = context->pixelShader->getSerialID();
+	}
+	else
+	{
+		state.shaderID = 0;
+	}
+
+	state.alphaToCoverage = context->alphaToCoverage;
+	state.depthWriteEnable = context->depthWriteActive();
+
+	if(context->stencilActive())
+	{
+		state.stencilActive = true;
+		state.frontStencil = context->frontStencil;
+		state.backStencil = context->backStencil;
+	}
+
+	if(context->depthBufferActive())
+	{
+		state.depthTestActive = true;
+		state.depthCompareMode = context->depthCompareMode;
+		state.depthFormat = context->depthBuffer->getFormat();
+	}
+
+	state.occlusionEnabled = context->occlusionEnabled;
+	state.depthClamp = (context->depthBias != 0.0f) || (context->slopeDepthBias != 0.0f);
+
+	for(int i = 0; i < RENDERTARGETS; i++)
+	{
+		state.colorWriteMask |= context->colorWriteActive(i) << (4 * i);
+		state.targetFormat[i] = context->renderTargetInternalFormat(i);
+		state.blendState[i] = context->getBlendState(i);
+	}
+
+	state.multiSample = static_cast<unsigned int>(context->sampleCount);
+	state.multiSampleMask = context->multiSampleMask;
+	state.multiSampledBresenham = (state.multiSample > 1) && context->isDrawLine(true) &&
+	                              (context->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT);
+
+	if(state.multiSample > 1 && context->pixelShader)
+	{
+		state.centroid = context->pixelShader->getModes().NeedsCentroid;
+	}
+
+	state.frontFace = context->frontFace;
+
+	state.hash = state.computeHash();
+
+	return state;
+}
+
+PixelProcessor::RoutineType PixelProcessor::routine(const State &state,
+	vk::PipelineLayout const *pipelineLayout,
+	SpirvShader const *pixelShader,
+	const vk::DescriptorSet::Bindings &descriptorSets)
+{
+	auto routine = routineCache->query(state);
+
+	if(!routine)
+	{
+		QuadRasterizer *generator = new PixelProgram(state, pipelineLayout, pixelShader, descriptorSets);
+		generator->generate();
+		routine = (*generator)("PixelRoutine_%0.8X", state.shaderID);
+		delete generator;
+
+		routineCache->add(state, routine);
+	}
+
+	return routine;
+}
+
+}  // namespace sw
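
(Aside: the UNORM16 conversion performed by setBlendConstant above reduces to
the following; a minimal standalone sketch, where toUnorm16 is a hypothetical
helper and not part of this change.)

    #include <cmath>
    #include <cstdint>

    // A [0,1] float channel becomes a rounded 16-bit fixed-point weight;
    // its complement is 0xFFFF minus that weight, so a blend factor and
    // its inverse always sum to 0xFFFF.
    static uint16_t toUnorm16(float c)
    {
        return static_cast<uint16_t>(std::lround(65535.0f * c));
    }

    // E.g. toUnorm16(0.25f) == 16384, and 0xFFFF - 16384 == 49151.
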
diff --git a/src/Device/PixelProcessor.hpp b/src/Device/PixelProcessor.hpp
index f657a59..8bc19ab 100644
--- a/src/Device/PixelProcessor.hpp
+++ b/src/Device/PixelProcessor.hpp
@@ -20,148 +20,149 @@
 #include "Memset.hpp"
 #include "RoutineCache.hpp"
 
-namespace sw
+namespace sw {
+
+class PixelShader;
+class Rasterizer;
+struct Texture;
+struct DrawData;
+struct Primitive;
+
+using RasterizerFunction = FunctionT<void(const Primitive* primitive, int count, int cluster, int clusterCount, DrawData* draw)>;
+
+class PixelProcessor
 {
-	class PixelShader;
-	class Rasterizer;
-	struct Texture;
-	struct DrawData;
-	struct Primitive;
-
-	using RasterizerFunction = FunctionT<void(const Primitive* primitive, int count, int cluster, int clusterCount, DrawData* draw)>;
-
-	class PixelProcessor
+public:
+	struct States : Memset<States>
 	{
-	public:
-		struct States : Memset<States>
+		// Same as VkStencilOpState, but with no reference, as it's not part of the state
+		// (it doesn't require a different program to be generated)
+		struct StencilOpState
 		{
-			// Same as VkStencilOpState, but with no reference, as it's not part of the state
-			// (it doesn't require a different program to be generated)
-			struct StencilOpState
+			VkStencilOp    failOp;
+			VkStencilOp    passOp;
+			VkStencilOp    depthFailOp;
+			VkCompareOp    compareOp;
+			uint32_t       compareMask;
+			uint32_t       writeMask;
+
+			void operator=(const VkStencilOpState &rhs)
 			{
-				VkStencilOp    failOp;
-				VkStencilOp    passOp;
-				VkStencilOp    depthFailOp;
-				VkCompareOp    compareOp;
-				uint32_t       compareMask;
-				uint32_t       writeMask;
-
-				void operator=(const VkStencilOpState &rhs)
-				{
-					failOp = rhs.failOp;
-					passOp = rhs.passOp;
-					depthFailOp = rhs.depthFailOp;
-					compareOp = rhs.compareOp;
-					compareMask = rhs.compareMask;
-					writeMask = rhs.writeMask;
-				}
-			};
-
-			States() : Memset(this, 0) {}
-
-			uint32_t computeHash();
-
-			uint64_t shaderID;
-
-			unsigned int numClipDistances;
-			unsigned int numCullDistances;
-
-			VkCompareOp depthCompareMode;
-			bool depthWriteEnable;
-
-			bool stencilActive;
-			StencilOpState frontStencil;
-			StencilOpState backStencil;
-
-			bool depthTestActive;
-			bool occlusionEnabled;
-			bool perspective;
-			bool depthClamp;
-
-			BlendState blendState[RENDERTARGETS];
-
-			unsigned int colorWriteMask;
-			VkFormat targetFormat[RENDERTARGETS];
-			unsigned int multiSample;
-			unsigned int multiSampleMask;
-			bool multiSampledBresenham;
-			bool alphaToCoverage;
-			bool centroid;
-			VkFrontFace frontFace;
-			VkFormat depthFormat;
-		};
-
-		struct State : States
-		{
-			bool operator==(const State &state) const;
-
-			int colorWriteActive(int index) const
-			{
-				return (colorWriteMask >> (index * 4)) & 0xF;
-			}
-
-			uint32_t hash;
-		};
-
-		struct Stencil
-		{
-			int64_t testMaskQ;
-			int64_t referenceMaskedQ;
-			int64_t referenceMaskedSignedQ;
-			int64_t writeMaskQ;
-			int64_t invWriteMaskQ;
-			int64_t referenceQ;
-
-			void set(int reference, int testMask, int writeMask)
-			{
-				referenceQ = replicate(reference);
-				testMaskQ = replicate(testMask);
-				writeMaskQ = replicate(writeMask);
-				invWriteMaskQ = ~writeMaskQ;
-				referenceMaskedQ = referenceQ & testMaskQ;
-				referenceMaskedSignedQ = replicate(((reference & testMask) + 0x80) & 0xFF);
-			}
-
-			static int64_t replicate(int b)
-			{
-				int64_t w = b & 0xFF;
-
-				return (w << 0) | (w << 8) | (w << 16) | (w << 24) | (w << 32) | (w << 40) | (w << 48) | (w << 56);
+				failOp = rhs.failOp;
+				passOp = rhs.passOp;
+				depthFailOp = rhs.depthFailOp;
+				compareOp = rhs.compareOp;
+				compareMask = rhs.compareMask;
+				writeMask = rhs.writeMask;
 			}
 		};
 
-		struct Factor
-		{
-			word4 alphaReference4;
+		States() : Memset(this, 0) {}
 
-			word4 blendConstant4W[4];
-			float4 blendConstant4F[4];
-			word4 invBlendConstant4W[4];
-			float4 invBlendConstant4F[4];
-		};
+		uint32_t computeHash();
 
-	public:
-		using RoutineType = RasterizerFunction::RoutineType;
+		uint64_t shaderID;
 
-		PixelProcessor();
+		unsigned int numClipDistances;
+		unsigned int numCullDistances;
 
-		virtual ~PixelProcessor();
+		VkCompareOp depthCompareMode;
+		bool depthWriteEnable;
 
-		void setBlendConstant(const Color<float> &blendConstant);
+		bool stencilActive;
+		StencilOpState frontStencil;
+		StencilOpState backStencil;
 
-	protected:
-		const State update(const Context* context) const;
-		RoutineType routine(const State &state, vk::PipelineLayout const *pipelineLayout,
-		                                 SpirvShader const *pixelShader, const vk::DescriptorSet::Bindings &descriptorSets);
-		void setRoutineCacheSize(int routineCacheSize);
+		bool depthTestActive;
+		bool occlusionEnabled;
+		bool perspective;
+		bool depthClamp;
 
-		// Other semi-constants
-		Factor factor;
+		BlendState blendState[RENDERTARGETS];
 
-	private:
-		using RoutineCacheType = RoutineCacheT<State, RasterizerFunction::CFunctionType>;
-		RoutineCacheType *routineCache;
+		unsigned int colorWriteMask;
+		VkFormat targetFormat[RENDERTARGETS];
+		unsigned int multiSample;
+		unsigned int multiSampleMask;
+		bool multiSampledBresenham;
+		bool alphaToCoverage;
+		bool centroid;
+		VkFrontFace frontFace;
+		VkFormat depthFormat;
 	};
-}
+
+	struct State : States
+	{
+		bool operator==(const State &state) const;
+
+		int colorWriteActive(int index) const
+		{
+			return (colorWriteMask >> (index * 4)) & 0xF;
+		}
+
+		uint32_t hash;
+	};
+
+	struct Stencil
+	{
+		int64_t testMaskQ;
+		int64_t referenceMaskedQ;
+		int64_t referenceMaskedSignedQ;
+		int64_t writeMaskQ;
+		int64_t invWriteMaskQ;
+		int64_t referenceQ;
+
+		void set(int reference, int testMask, int writeMask)
+		{
+			referenceQ = replicate(reference);
+			testMaskQ = replicate(testMask);
+			writeMaskQ = replicate(writeMask);
+			invWriteMaskQ = ~writeMaskQ;
+			referenceMaskedQ = referenceQ & testMaskQ;
+			referenceMaskedSignedQ = replicate(((reference & testMask) + 0x80) & 0xFF);
+		}
+
+		static int64_t replicate(int b)
+		{
+			int64_t w = b & 0xFF;
+
+			return (w << 0) | (w << 8) | (w << 16) | (w << 24) | (w << 32) | (w << 40) | (w << 48) | (w << 56);
+		}
+	};
+
+	struct Factor
+	{
+		word4 alphaReference4;
+
+		word4 blendConstant4W[4];
+		float4 blendConstant4F[4];
+		word4 invBlendConstant4W[4];
+		float4 invBlendConstant4F[4];
+	};
+
+public:
+	using RoutineType = RasterizerFunction::RoutineType;
+
+	PixelProcessor();
+
+	virtual ~PixelProcessor();
+
+	void setBlendConstant(const Color<float> &blendConstant);
+
+protected:
+	const State update(const Context* context) const;
+	RoutineType routine(const State &state, vk::PipelineLayout const *pipelineLayout,
+	                    SpirvShader const *pixelShader, const vk::DescriptorSet::Bindings &descriptorSets);
+	void setRoutineCacheSize(int routineCacheSize);
+
+	// Other semi-constants
+	Factor factor;
+
+private:
+	using RoutineCacheType = RoutineCacheT<State, RasterizerFunction::CFunctionType>;
+	RoutineCacheType *routineCache;
+};
+
+}  // namespace sw
 
 #endif   // sw_PixelProcessor_hpp
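
(Aside: Stencil::replicate above splats one byte across all eight bytes of an
int64_t, so a single 64-bit mask covers eight 8-bit stencil values at once.
A minimal standalone check, not part of this change:)

    #include <cassert>
    #include <cstdint>

    static int64_t replicate(int b)
    {
        int64_t w = b & 0xFF;
        return (w << 0) | (w << 8) | (w << 16) | (w << 24) |
               (w << 32) | (w << 40) | (w << 48) | (w << 56);
    }

    int main()
    {
        // 0xA5 ends up in every byte of the 64-bit mask.
        assert(replicate(0xA5) == static_cast<int64_t>(0xA5A5A5A5A5A5A5A5ull));
        return 0;
    }
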
diff --git a/src/Device/Plane.cpp b/src/Device/Plane.cpp
index 095b7f2..8a89546 100644
--- a/src/Device/Plane.cpp
+++ b/src/Device/Plane.cpp
@@ -16,45 +16,46 @@
 
 #include "Matrix.hpp"
 
-namespace sw
+namespace sw {
+
+Plane::Plane()
 {
-	Plane::Plane()
-	{
-	}
-
-	Plane::Plane(float p_A, float p_B, float p_C, float p_D)
-	{
-		A = p_A;
-		B = p_B;
-		C = p_C;
-		D = p_D;
-	}
-
-	Plane::Plane(const float ABCD[4])
-	{
-		A = ABCD[0];
-		B = ABCD[1];
-		C = ABCD[2];
-		D = ABCD[3];
-	}
-
-	Plane operator*(const Plane &p, const Matrix &T)
-	{
-		Matrix M = !T;
-
-		return Plane(p.A * M(1, 1) + p.B * M(1, 2) + p.C * M(1, 3) + p.D * M(1, 4),
-		             p.A * M(2, 1) + p.B * M(2, 2) + p.C * M(2, 3) + p.D * M(2, 4),
-		             p.A * M(3, 1) + p.B * M(3, 2) + p.C * M(3, 3) + p.D * M(3, 4),
-		             p.A * M(4, 1) + p.B * M(4, 2) + p.C * M(4, 3) + p.D * M(4, 4));
-	}
-
-	Plane operator*(const Matrix &T, const Plane &p)
-	{
-		Matrix M = !T;
-
-		return Plane(M(1, 1) * p.A + M(2, 1) * p.B + M(3, 1) * p.C + M(4, 1) * p.D,
-		             M(1, 2) * p.A + M(2, 2) * p.B + M(3, 2) * p.C + M(4, 2) * p.D,
-		             M(1, 3) * p.A + M(2, 3) * p.B + M(3, 3) * p.C + M(4, 3) * p.D,
-		             M(1, 4) * p.A + M(2, 4) * p.B + M(3, 4) * p.C + M(4, 4) * p.D);
-	}
 }
+
+Plane::Plane(float p_A, float p_B, float p_C, float p_D)
+{
+	A = p_A;
+	B = p_B;
+	C = p_C;
+	D = p_D;
+}
+
+Plane::Plane(const float ABCD[4])
+{
+	A = ABCD[0];
+	B = ABCD[1];
+	C = ABCD[2];
+	D = ABCD[3];
+}
+
+Plane operator*(const Plane &p, const Matrix &T)
+{
+	Matrix M = !T;
+
+	return Plane(p.A * M(1, 1) + p.B * M(1, 2) + p.C * M(1, 3) + p.D * M(1, 4),
+	             p.A * M(2, 1) + p.B * M(2, 2) + p.C * M(2, 3) + p.D * M(2, 4),
+	             p.A * M(3, 1) + p.B * M(3, 2) + p.C * M(3, 3) + p.D * M(3, 4),
+	             p.A * M(4, 1) + p.B * M(4, 2) + p.C * M(4, 3) + p.D * M(4, 4));
+}
+
+Plane operator*(const Matrix &T, const Plane &p)
+{
+	Matrix M = !T;
+
+	return Plane(M(1, 1) * p.A + M(2, 1) * p.B + M(3, 1) * p.C + M(4, 1) * p.D,
+	             M(1, 2) * p.A + M(2, 2) * p.B + M(3, 2) * p.C + M(4, 2) * p.D,
+	             M(1, 3) * p.A + M(2, 3) * p.B + M(3, 3) * p.C + M(4, 3) * p.D,
+	             M(1, 4) * p.A + M(2, 4) * p.B + M(3, 4) * p.C + M(4, 4) * p.D);
+}
+
+}  // namespace sw
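
(Aside: the inverse matrix (!T) in the operators above is what keeps the plane
equation invariant: if points transform as x' = T x, the plane coefficients
must transform by the inverse, assuming operator! on sw::Matrix denotes matrix
inversion, since

    p' \cdot x' = (p \, T^{-1}) (T \, x) = p \cdot x = 0.)
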
diff --git a/src/Device/Plane.hpp b/src/Device/Plane.hpp
index 962b9ae..dcce294 100644
--- a/src/Device/Plane.hpp
+++ b/src/Device/Plane.hpp
@@ -17,24 +17,25 @@
 
 #include "Vector.hpp"
 
-namespace sw
+namespace sw {
+
+struct Matrix;
+
+struct Plane
 {
-	struct Matrix;
+	float A;
+	float B;
+	float C;
+	float D;
 
-	struct Plane
-	{
-		float A;
-		float B;
-		float C;
-		float D;
+	Plane();
+	Plane(float A, float B, float C, float D);   // Plane equation 
+	Plane(const float ABCD[4]);
 
-		Plane();
-		Plane(float A, float B, float C, float D);   // Plane equation 
-		Plane(const float ABCD[4]);
+	friend Plane operator*(const Plane &p, const Matrix &A);   // Transform plane by matrix (post-multiply)
+	friend Plane operator*(const Matrix &A, const Plane &p);   // Transform plane by matrix (pre-multiply)
+};
 
-		friend Plane operator*(const Plane &p, const Matrix &A);   // Transform plane by matrix (post-multiply)
-		friend Plane operator*(const Matrix &A, const Plane &p);   // Transform plane by matrix (pre-multiply)
-	};
-}
+}  // namespace sw
 
 #endif   // Plane_hpp
diff --git a/src/Device/Point.cpp b/src/Device/Point.cpp
index e7e33dd..a93616d 100644
--- a/src/Device/Point.cpp
+++ b/src/Device/Point.cpp
@@ -16,77 +16,78 @@
 
 #include "Matrix.hpp"
 
-namespace sw
+namespace sw {
+
+Point &Point::operator+=(const Vector &v)
 {
-	Point &Point::operator+=(const Vector &v)
-	{
-		x += v.x;
-		y += v.y;
-		z += v.z;
+	x += v.x;
+	y += v.y;
+	z += v.z;
 
-		return *this;
-	}
-
-	Point &Point::operator-=(const Vector &v)
-	{
-		x -= v.x;
-		y -= v.y;
-		z -= v.z;
-
-		return *this;
-	}
-
-	Point operator+(const Point &P, const Vector &v)
-	{
-		return Point(P.x + v.x, P.y + v.y, P.z + v.z);
-	}
-
-	Point operator-(const Point &P, const Vector &v)
-	{
-		return Point(P.x - v.x, P.y - v.y, P.z - v.z);
-	}
-
-	Vector operator-(const Point &P, const Point &Q)
-	{
-		return Vector(P.x - Q.x, P.y - Q.y, P.z - Q.z);
-	}
-
-	Point operator*(const Matrix &M, const Point &P)
-	{
-		return Point(M(1, 1) * P.x + M(1, 2) * P.y + M(1, 3) * P.z + M(1, 4),
-		             M(2, 1) * P.x + M(2, 2) * P.y + M(2, 3) * P.z + M(2, 4),
-		             M(3, 1) * P.x + M(3, 2) * P.y + M(3, 3) * P.z + M(3, 4));
-	}
-
-	Point operator*(const Point &P, const Matrix &M)
-	{
-		return Point(P.x * M(1, 1) + P.y * M(2, 1) + P.z * M(3, 1),
-		             P.x * M(1, 2) + P.y * M(2, 2) + P.z * M(3, 2),
-		             P.x * M(1, 3) + P.y * M(2, 3) + P.z * M(3, 3));
-	}
-
-	Point &operator*=(Point &P, const Matrix &M)
-	{
-		return P = P * M;
-	}
-
-	float Point::d(const Point &P) const
-	{
-		return Vector::N(*this - P);
-	}
-
-	float Point::d2(const Point &P) const
-	{
-		return Vector::N2(*this - P);
-	}
-
-	float Point::d(const Point &P, const Point &Q)
-	{
-		return Vector::N(P - Q);
-	}
-
-	float Point::d2(const Point &P, const Point &Q)
-	{
-		return Vector::N2(P - Q);
-	}
+	return *this;
 }
+
+Point &Point::operator-=(const Vector &v)
+{
+	x -= v.x;
+	y -= v.y;
+	z -= v.z;
+
+	return *this;
+}
+
+Point operator+(const Point &P, const Vector &v)
+{
+	return Point(P.x + v.x, P.y + v.y, P.z + v.z);
+}
+
+Point operator-(const Point &P, const Vector &v)
+{
+	return Point(P.x - v.x, P.y - v.y, P.z - v.z);
+}
+
+Vector operator-(const Point &P, const Point &Q)
+{
+	return Vector(P.x - Q.x, P.y - Q.y, P.z - Q.z);
+}
+
+Point operator*(const Matrix &M, const Point &P)
+{
+	return Point(M(1, 1) * P.x + M(1, 2) * P.y + M(1, 3) * P.z + M(1, 4),
+	             M(2, 1) * P.x + M(2, 2) * P.y + M(2, 3) * P.z + M(2, 4),
+	             M(3, 1) * P.x + M(3, 2) * P.y + M(3, 3) * P.z + M(3, 4));
+}
+
+Point operator*(const Point &P, const Matrix &M)
+{
+	return Point(P.x * M(1, 1) + P.y * M(2, 1) + P.z * M(3, 1),
+	             P.x * M(1, 2) + P.y * M(2, 2) + P.z * M(3, 2),
+	             P.x * M(1, 3) + P.y * M(2, 3) + P.z * M(3, 3));
+}
+
+Point &operator*=(Point &P, const Matrix &M)
+{
+	return P = P * M;
+}
+
+float Point::d(const Point &P) const
+{
+	return Vector::N(*this - P);
+}
+
+float Point::d2(const Point &P) const
+{
+	return Vector::N2(*this - P);
+}
+
+float Point::d(const Point &P, const Point &Q)
+{
+	return Vector::N(P - Q);
+}
+
+float Point::d2(const Point &P, const Point &Q)
+{
+	return Vector::N2(P - Q);
+}
+
+}  // namespace sw
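
(Aside: the squared-distance variants d2 above let callers that only compare
distances skip the square root. A hypothetical usage sketch, not part of this
change:)

    #include "Point.hpp"   // assumed include path

    // Squared distances preserve ordering, so a nearest-point search
    // needs no square root per candidate.
    int nearestIndex(const sw::Point &P, const sw::Point *points, int n)
    {
        int best = 0;
        for(int i = 1; i < n; i++)
        {
            if(sw::Point::d2(P, points[i]) < sw::Point::d2(P, points[best]))
            {
                best = i;
            }
        }
        return best;
    }
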
diff --git a/src/Device/Point.hpp b/src/Device/Point.hpp
index 85198c5..5602209 100644
--- a/src/Device/Point.hpp
+++ b/src/Device/Point.hpp
@@ -15,125 +15,127 @@
 #ifndef Point_hpp
 #define Point_hpp
 
-namespace sw
+namespace sw {
+
+struct Vector;
+struct Matrix;
+
+struct Point
 {
-	struct Vector;
-	struct Matrix;
+	Point();
+	Point(const int i);
+	Point(const Point &P);
+	Point(const Vector &v);
+	Point(float Px, float Py, float Pz);
 
-	struct Point
+	Point &operator=(const Point &P);
+
+	union
 	{
-		Point();
-		Point(const int i);
-		Point(const Point &P);
-		Point(const Vector &v);
-		Point(float Px, float Py, float Pz);
+		float p[3];
 
-		Point &operator=(const Point &P);
-
-		union
-		{
-			float p[3];
-
-			struct
-			{	
-				float x;
-				float y;
-				float z;
-			};
+		struct
+		{	
+			float x;
+			float y;
+			float z;
 		};
-
-		float &operator[](int i);
-		float &operator()(int i);
-
-		const float &operator[](int i) const;
-		const float &operator()(int i) const;
-
-		Point &operator+=(const Vector &v);
-		Point &operator-=(const Vector &v);
-
-		friend Point operator+(const Point &P, const Vector &v);
-		friend Point operator-(const Point &P, const Vector &v);
-
-		friend Vector operator-(const Point &P, const Point &Q);
-
-		friend Point operator*(const Matrix &M, const Point& P);
-		friend Point operator*(const Point &P, const Matrix &M);
-		friend Point &operator*=(Point &P, const Matrix &M);
-
-		float d(const Point &P) const;   // Distance between two points
-		float d2(const Point &P) const;   // Squared distance between two points
-
-		static float d(const Point &P, const Point &Q);   // Distance between two points
-		static float d2(const Point &P, const Point &Q);   // Squared distance between two points
 	};
-}
+
+	float &operator[](int i);
+	float &operator()(int i);
+
+	const float &operator[](int i) const;
+	const float &operator()(int i) const;
+
+	Point &operator+=(const Vector &v);
+	Point &operator-=(const Vector &v);
+
+	friend Point operator+(const Point &P, const Vector &v);
+	friend Point operator-(const Point &P, const Vector &v);
+
+	friend Vector operator-(const Point &P, const Point &Q);
+
+	friend Point operator*(const Matrix &M, const Point& P);
+	friend Point operator*(const Point &P, const Matrix &M);
+	friend Point &operator*=(Point &P, const Matrix &M);
+
+	float d(const Point &P) const;   // Distance between two points
+	float d2(const Point &P) const;   // Squared distance between two points
+
+	static float d(const Point &P, const Point &Q);   // Distance between two points
+	static float d2(const Point &P, const Point &Q);   // Squared distance between two points
+};
+
+}  // namespace sw
 
 #include "Vector.hpp"
 
-namespace sw
+namespace sw {
+
+inline Point::Point()
 {
-	inline Point::Point()
-	{
-	}
-
-	inline Point::Point(const int i)
-	{
-		const float s = (float)i;
-
-		x = s;
-		y = s;
-		z = s;
-	}
-
-	inline Point::Point(const Point &P)
-	{
-		x = P.x;
-		y = P.y;
-		z = P.z;
-	}
-
-	inline Point::Point(const Vector &v)
-	{
-		x = v.x;
-		y = v.y;
-		z = v.z;
-	}
-
-	inline Point::Point(float P_x, float P_y, float P_z)
-	{
-		x = P_x;
-		y = P_y;
-		z = P_z;
-	}
-
-	inline Point &Point::operator=(const Point &P)
-	{
-		x = P.x;
-		y = P.y;
-		z = P.z;
-
-		return *this;
-	}
-
-	inline float &Point::operator()(int i)
-	{
-		return p[i];
-	}
-
-	inline float &Point::operator[](int i)
-	{
-		return p[i];
-	}
-
-	inline const float &Point::operator()(int i) const
-	{
-		return p[i];
-	}
-
-	inline const float &Point::operator[](int i) const
-	{
-		return p[i];
-	}
 }
 
+inline Point::Point(const int i)
+{
+	const float s = (float)i;
+
+	x = s;
+	y = s;
+	z = s;
+}
+
+inline Point::Point(const Point &P)
+{
+	x = P.x;
+	y = P.y;
+	z = P.z;
+}
+
+inline Point::Point(const Vector &v)
+{
+	x = v.x;
+	y = v.y;
+	z = v.z;
+}
+
+inline Point::Point(float P_x, float P_y, float P_z)
+{
+	x = P_x;
+	y = P_y;
+	z = P_z;
+}
+
+inline Point &Point::operator=(const Point &P)
+{
+	x = P.x;
+	y = P.y;
+	z = P.z;
+
+	return *this;
+}
+
+inline float &Point::operator()(int i)
+{
+	return p[i];
+}
+
+inline float &Point::operator[](int i)
+{
+	return p[i];
+}
+
+inline const float &Point::operator()(int i) const
+{
+	return p[i];
+}
+
+inline const float &Point::operator[](int i) const
+{
+	return p[i];
+}
+
+}  // namespace sw
+
 #endif   // Point_hpp
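
(Aside: because of the anonymous union above, the named members alias the p[]
array, so indexed and named accessors read the same storage. Illustrative
only:)

    #include "Point.hpp"   // assumed include path

    float example()
    {
        sw::Point P(1.0f, 2.0f, 3.0f);
        // P.x aliases P.p[0]; both expressions read the same float.
        return P.x + P[0];   // 2.0f
    }
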
diff --git a/src/Device/Polygon.hpp b/src/Device/Polygon.hpp
index 8ee8562..5412128 100644
--- a/src/Device/Polygon.hpp
+++ b/src/Device/Polygon.hpp
@@ -17,40 +17,41 @@
 
 #include "Vertex.hpp"
 
-namespace sw
+namespace sw {
+
+struct Polygon
 {
-	struct Polygon
+	Polygon(const float4 *P0, const float4 *P1, const float4 *P2)
 	{
-		Polygon(const float4 *P0, const float4 *P1, const float4 *P2)
-		{
-			P[0][0] = P0;
-			P[0][1] = P1;
-			P[0][2] = P2;
+		P[0][0] = P0;
+		P[0][1] = P1;
+		P[0][2] = P2;
 
-			n = 3;
-			i = 0;
-			b = 0;
+		n = 3;
+		i = 0;
+		b = 0;
+	}
+
+	Polygon(const float4 *P, int n)
+	{
+		for(int i = 0; i < n; i++)
+		{
+			this->P[0][i] = &P[i];
 		}
 
-		Polygon(const float4 *P, int n)
-		{
-			for(int i = 0; i < n; i++)
-			{
-				this->P[0][i] = &P[i];
-			}
+		this->n = n;
+		this->i = 0;
+		this->b = 0;
+	}
 
-			this->n = n;
-			this->i = 0;
-			this->b = 0;
-		}
+	float4 B[16];              // Buffer for clipped vertices
+	const float4 *P[16][16];   // Pointers to clipped polygon's vertices
 
-		float4 B[16];              // Buffer for clipped vertices
-		const float4 *P[16][16];   // Pointers to clipped polygon's vertices
+	int n;   // Number of vertices
+	int i;   // Level of P to use
+	int b;   // Next available new vertex
+};
 
-		int n;   // Number of vertices
-		int i;   // Level of P to use
-		int b;   // Next available new vertex
-	};
-}
+}  // namespace sw
 
 #endif   // sw_Polygon_hpp
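
(Aside: the levelled P array above lets each clipping stage read the vertex
pointers of level i and emit level i + 1, appending any newly created vertices
to B instead of copying whole vertices. A degenerate stage that keeps every
vertex would look like this; hypothetical sketch, not SwiftShader's actual
clipper:)

    #include "Polygon.hpp"   // assumed include path

    // Pass-through "clip" stage: copies the current pointer level to the
    // next and advances i, as a real stage would after emitting its
    // (possibly clipped) vertex pointers.
    inline void passThroughStage(sw::Polygon &poly)
    {
        for(int j = 0; j < poly.n; j++)
        {
            poly.P[poly.i + 1][j] = poly.P[poly.i][j];
        }
        poly.i++;
    }
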
diff --git a/src/Device/Primitive.hpp b/src/Device/Primitive.hpp
index 45b2e42..85a9db4 100644
--- a/src/Device/Primitive.hpp
+++ b/src/Device/Primitive.hpp
@@ -20,70 +20,71 @@
 #include "Device/Config.hpp"
 #include "System/Build.hpp"
 
-namespace sw
+namespace sw {
+
+struct Triangle MEMORY_SANITIZER_ONLY(: Memset<Triangle>)
 {
-	struct Triangle MEMORY_SANITIZER_ONLY(: Memset<Triangle>)
-	{
 #if MEMORY_SANITIZER_ENABLED
-		// Memory sanitizer cannot 'see' writes from JIT'd code, and can raise
-		// false-positives when read. By clearing the struct in the constructor,
-		// we can avoid triggering these false-positives.
-		inline Triangle() : Memset<Triangle>(this, 0) {}
+	// Memory sanitizer cannot 'see' writes from JIT'd code, and can raise
+	// false-positives when read. By clearing the struct in the constructor,
+	// we can avoid triggering these false-positives.
+	inline Triangle() : Memset<Triangle>(this, 0) {}
 #endif // MEMORY_SANITIZER_ENABLED
 
-		Vertex v0;
-		Vertex v1;
-		Vertex v2;
-	};
+	Vertex v0;
+	Vertex v1;
+	Vertex v2;
+};
 
-	struct PlaneEquation   // z = A * x + B * y + C
-	{
-		float4 A;
-		float4 B;
-		float4 C;
-	};
+struct PlaneEquation   // z = A * x + B * y + C
+{
+	float4 A;
+	float4 B;
+	float4 C;
+};
 
-	struct Primitive MEMORY_SANITIZER_ONLY(: Memset<Primitive>)
-	{
+struct Primitive MEMORY_SANITIZER_ONLY(: Memset<Primitive>)
+{
 #if MEMORY_SANITIZER_ENABLED
-		// Memory sanitizer cannot 'see' writes from JIT'd code, and can raise
-		// false-positives when read. By clearing the struct in the constructor,
-		// we can avoid triggering these false-positives.
-		inline Primitive() : Memset<Primitive>(this, 0) {}
+	// Memory sanitizer cannot 'see' writes from JIT'd code, and can raise
+	// false-positives when read. By clearing the struct in the constructor,
+	// we can avoid triggering these false-positives.
+	inline Primitive() : Memset<Primitive>(this, 0) {}
 #endif // MEMORY_SANITIZER_ENABLED
 
-		int yMin;
-		int yMax;
+	int yMin;
+	int yMax;
 
-		float4 xQuad;
-		float4 yQuad;
+	float4 xQuad;
+	float4 yQuad;
 
-		float pointCoordX;
-		float pointCoordY;
+	float pointCoordX;
+	float pointCoordY;
 
-		PlaneEquation z;
-		PlaneEquation w;
-		PlaneEquation V[MAX_INTERFACE_COMPONENTS];
+	PlaneEquation z;
+	PlaneEquation w;
+	PlaneEquation V[MAX_INTERFACE_COMPONENTS];
 
-		PlaneEquation clipDistance[MAX_CLIP_DISTANCES];
-		PlaneEquation cullDistance[MAX_CULL_DISTANCES];
+	PlaneEquation clipDistance[MAX_CLIP_DISTANCES];
+	PlaneEquation cullDistance[MAX_CULL_DISTANCES];
 
-		// Masks for two-sided stencil
-		int64_t clockwiseMask;
-		int64_t invClockwiseMask;
+	// Masks for two-sided stencil
+	int64_t clockwiseMask;
+	int64_t invClockwiseMask;
 
-		struct Span
-		{
-			unsigned short left;
-			unsigned short right;
-		};
-
-		// The rasterizer adds a zero length span to the top and bottom of the polygon to allow
-		// for 2x2 pixel processing. We need an even number of spans to keep accesses aligned.
-		Span outlineUnderflow[2];
-		Span outline[OUTLINE_RESOLUTION];
-		Span outlineOverflow[2];
+	struct Span
+	{
+		unsigned short left;
+		unsigned short right;
 	};
-}
+
+	// The rasterizer adds a zero length span to the top and bottom of the polygon to allow
+	// for 2x2 pixel processing. We need an even number of spans to keep accesses aligned.
+	Span outlineUnderflow[2];
+	Span outline[OUTLINE_RESOLUTION];
+	Span outlineOverflow[2];
+};
+
+}  // namespace sw
 
 #endif   // sw_Primitive_hpp
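
(Aside: outline above stores one left/right span per scanline, and the
zero-length underflow/overflow spans make it safe to read rows y and y + 1
unconditionally when stepping in 2x2 quads. A scalar sketch of that traversal,
with simplified indexing; hypothetical, not part of this change:)

    #include <algorithm>
    #include "Primitive.hpp"   // assumed include path

    // Counts the 2x2 quads a primitive would visit, mirroring how the
    // quad rasterizer derives an even-aligned x range per row pair.
    int countQuads(const sw::Primitive &prim)
    {
        int quads = 0;
        for(int y = prim.yMin; y < prim.yMax; y += 2)
        {
            int x0 = std::min(prim.outline[y].left, prim.outline[y + 1].left) & ~1;
            int x1 = std::max(prim.outline[y].right, prim.outline[y + 1].right);
            for(int x = x0; x < x1; x += 2)
            {
                quads++;
            }
        }
        return quads;
    }
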
diff --git a/src/Device/QuadRasterizer.cpp b/src/Device/QuadRasterizer.cpp
index 0f23599..10b9e73 100644
--- a/src/Device/QuadRasterizer.cpp
+++ b/src/Device/QuadRasterizer.cpp
@@ -20,246 +20,247 @@
 #include "System/Math.hpp"
 #include "Vulkan/VkDebug.hpp"
 
-namespace sw
+namespace sw {
+
+QuadRasterizer::QuadRasterizer(const PixelProcessor::State &state, SpirvShader const *spirvShader) : state(state), spirvShader{spirvShader}
 {
-	QuadRasterizer::QuadRasterizer(const PixelProcessor::State &state, SpirvShader const *spirvShader) : state(state), spirvShader{spirvShader}
-	{
-	}
+}
 
-	QuadRasterizer::~QuadRasterizer()
-	{
-	}
+QuadRasterizer::~QuadRasterizer()
+{
+}
 
-	void QuadRasterizer::generate()
-	{
-		constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
-		occlusion = 0;
+void QuadRasterizer::generate()
+{
+	constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
+	occlusion = 0;
 
-		Do
+	Do
+	{
+		Int yMin = *Pointer<Int>(primitive + OFFSET(Primitive,yMin));
+		Int yMax = *Pointer<Int>(primitive + OFFSET(Primitive,yMax));
+
+		Int cluster2 = cluster + cluster;
+		yMin += clusterCount * 2 - 2 - cluster2;
+		yMin &= -clusterCount * 2;
+		yMin += cluster2;
+
+		If(yMin < yMax)
 		{
-			Int yMin = *Pointer<Int>(primitive + OFFSET(Primitive,yMin));
-			Int yMax = *Pointer<Int>(primitive + OFFSET(Primitive,yMax));
+			rasterize(yMin, yMax);
+		}
 
-			Int cluster2 = cluster + cluster;
-			yMin += clusterCount * 2 - 2 - cluster2;
-			yMin &= -clusterCount * 2;
-			yMin += cluster2;
+		primitive += sizeof(Primitive) * state.multiSample;
+		count--;
+	}
+	Until(count == 0)
 
-			If(yMin < yMax)
+	if(state.occlusionEnabled)
+	{
+		UInt clusterOcclusion = *Pointer<UInt>(data + OFFSET(DrawData,occlusion) + 4 * cluster);
+		clusterOcclusion += occlusion;
+		*Pointer<UInt>(data + OFFSET(DrawData,occlusion) + 4 * cluster) = clusterOcclusion;
+	}
+
+	Return();
+}
+
+void QuadRasterizer::rasterize(Int &yMin, Int &yMax)
+{
+	Pointer<Byte> cBuffer[RENDERTARGETS];
+	Pointer<Byte> zBuffer;
+	Pointer<Byte> sBuffer;
+
+	Int clusterCountLog2 = 31 - Ctlz(UInt(clusterCount), false);
+
+	for(int index = 0; index < RENDERTARGETS; index++)
+	{
+		if(state.colorWriteActive(index))
+		{
+			cBuffer[index] = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,colorBuffer[index])) + yMin * *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+		}
+	}
+
+	if(state.depthTestActive)
+	{
+		zBuffer = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,depthBuffer)) + yMin * *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+	}
+
+	if(state.stencilActive)
+	{
+		sBuffer = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,stencilBuffer)) + yMin * *Pointer<Int>(data + OFFSET(DrawData,stencilPitchB));
+	}
+
+	Int y = yMin;
+
+	Do
+	{
+		Int x0a = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->left) + (y + 0) * sizeof(Primitive::Span)));
+		Int x0b = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->left) + (y + 1) * sizeof(Primitive::Span)));
+		Int x0 = Min(x0a, x0b);
+
+		for(unsigned int q = 1; q < state.multiSample; q++)
+		{
+			x0a = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left) + (y + 0) * sizeof(Primitive::Span)));
+			x0b = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left) + (y + 1) * sizeof(Primitive::Span)));
+			x0 = Min(x0, Min(x0a, x0b));
+		}
+
+		x0 &= 0xFFFFFFFE;
+
+		Int x1a = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->right) + (y + 0) * sizeof(Primitive::Span)));
+		Int x1b = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->right) + (y + 1) * sizeof(Primitive::Span)));
+		Int x1 = Max(x1a, x1b);
+
+		for(unsigned int q = 1; q < state.multiSample; q++)
+		{
+			x1a = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right) + (y + 0) * sizeof(Primitive::Span)));
+			x1b = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right) + (y + 1) * sizeof(Primitive::Span)));
+			x1 = Max(x1, Max(x1a, x1b));
+		}
+
+		Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
+
+		if(interpolateZ())
+		{
+			for(unsigned int q = 0; q < state.multiSample; q++)
 			{
-				rasterize(yMin, yMax);
+				Float4 y = yyyy;
+
+				if(state.multiSample > 1)
+				{
+					y -= *Pointer<Float4>(constants + OFFSET(Constants,Y) + q * sizeof(float4));
+				}
+
+				Dz[q] = *Pointer<Float4>(primitive + OFFSET(Primitive,z.C), 16) + y * *Pointer<Float4>(primitive + OFFSET(Primitive,z.B), 16);
+			}
+		}
+
+		If(x0 < x1)
+		{
+			if(interpolateW())
+			{
+				Dw = *Pointer<Float4>(primitive + OFFSET(Primitive,w.C), 16) + yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive,w.B), 16);
 			}
 
-			primitive += sizeof(Primitive) * state.multiSample;
-			count--;
+			if (spirvShader)
+			{
+				for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
+				{
+					if (spirvShader->inputs[interpolant].Type == SpirvShader::ATTRIBTYPE_UNUSED)
+						continue;
+
+					Dv[interpolant] = *Pointer<Float4>(primitive + OFFSET(Primitive, V[interpolant].C), 16);
+					if (!spirvShader->inputs[interpolant].Flat)
+					{
+						Dv[interpolant] +=
+								yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, V[interpolant].B), 16);
+					}
+				}
+
+				for (unsigned int i = 0; i < state.numClipDistances; i++)
+				{
+					DclipDistance[i] = *Pointer<Float4>(primitive + OFFSET(Primitive, clipDistance[i].C), 16) +
+								yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, clipDistance[i].B), 16);
+				}
+
+				for (unsigned int i = 0; i < state.numCullDistances; i++)
+				{
+					DcullDistance[i] = *Pointer<Float4>(primitive + OFFSET(Primitive, cullDistance[i].C), 16) +
+								yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, cullDistance[i].B), 16);
+				}
+			}
+
+			Short4 xLeft[4];
+			Short4 xRight[4];
+
+			for(unsigned int q = 0; q < state.multiSample; q++)
+			{
+				xLeft[q] = *Pointer<Short4>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline) + y * sizeof(Primitive::Span));
+				xRight[q] = xLeft[q];
+
+				xLeft[q] = Swizzle(xLeft[q], 0x0022) - Short4(1, 2, 1, 2);
+				xRight[q] = Swizzle(xRight[q], 0x1133) - Short4(0, 1, 0, 1);
+			}
+
+			For(Int x = x0, x < x1, x += 2)
+			{
+				Short4 xxxx = Short4(x);
+				Int cMask[4];
+
+				for(unsigned int q = 0; q < state.multiSample; q++)
+				{
+					if (state.multiSampleMask & (1<<q))
+					{
+						unsigned int i = state.multiSampledBresenham ? 0 : q;
+						Short4 mask = CmpGT(xxxx, xLeft[i]) & CmpGT(xRight[i], xxxx);
+						cMask[q] = SignMask(PackSigned(mask, mask)) & 0x0000000F;
+					}
+					else
+					{
+						cMask[q] = 0;
+					}
+				}
+
+				quad(cBuffer, zBuffer, sBuffer, cMask, x, y);
+			}
 		}
-		Until(count == 0)
-
-		if(state.occlusionEnabled)
-		{
-			UInt clusterOcclusion = *Pointer<UInt>(data + OFFSET(DrawData,occlusion) + 4 * cluster);
-			clusterOcclusion += occlusion;
-			*Pointer<UInt>(data + OFFSET(DrawData,occlusion) + 4 * cluster) = clusterOcclusion;
-		}
-
-		Return();
-	}
-
-	void QuadRasterizer::rasterize(Int &yMin, Int &yMax)
-	{
-		Pointer<Byte> cBuffer[RENDERTARGETS];
-		Pointer<Byte> zBuffer;
-		Pointer<Byte> sBuffer;
-
-		Int clusterCountLog2 = 31 - Ctlz(UInt(clusterCount), false);
 
 		for(int index = 0; index < RENDERTARGETS; index++)
 		{
 			if(state.colorWriteActive(index))
 			{
-				cBuffer[index] = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,colorBuffer[index])) + yMin * *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+				cBuffer[index] += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])) << (1 + clusterCountLog2);   // FIXME: Precompute
 			}
 		}
 
 		if(state.depthTestActive)
 		{
-			zBuffer = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,depthBuffer)) + yMin * *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+			zBuffer += *Pointer<Int>(data + OFFSET(DrawData,depthPitchB)) << (1 + clusterCountLog2);   // FIXME: Precompute
 		}
 
 		if(state.stencilActive)
 		{
-			sBuffer = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,stencilBuffer)) + yMin * *Pointer<Int>(data + OFFSET(DrawData,stencilPitchB));
+			sBuffer += *Pointer<Int>(data + OFFSET(DrawData,stencilPitchB)) << (1 + clusterCountLog2);   // FIXME: Precompute
 		}
 
-		Int y = yMin;
-
-		Do
-		{
-			Int x0a = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->left) + (y + 0) * sizeof(Primitive::Span)));
-			Int x0b = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->left) + (y + 1) * sizeof(Primitive::Span)));
-			Int x0 = Min(x0a, x0b);
-
-			for(unsigned int q = 1; q < state.multiSample; q++)
-			{
-				x0a = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left) + (y + 0) * sizeof(Primitive::Span)));
-				x0b = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left) + (y + 1) * sizeof(Primitive::Span)));
-				x0 = Min(x0, Min(x0a, x0b));
-			}
-
-			x0 &= 0xFFFFFFFE;
-
-			Int x1a = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->right) + (y + 0) * sizeof(Primitive::Span)));
-			Int x1b = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->right) + (y + 1) * sizeof(Primitive::Span)));
-			Int x1 = Max(x1a, x1b);
-
-			for(unsigned int q = 1; q < state.multiSample; q++)
-			{
-				x1a = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right) + (y + 0) * sizeof(Primitive::Span)));
-				x1b = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right) + (y + 1) * sizeof(Primitive::Span)));
-				x1 = Max(x1, Max(x1a, x1b));
-			}
-
-			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
-
-			if(interpolateZ())
-			{
-				for(unsigned int q = 0; q < state.multiSample; q++)
-				{
-					Float4 y = yyyy;
-
-					if(state.multiSample > 1)
-					{
-						y -= *Pointer<Float4>(constants + OFFSET(Constants,Y) + q * sizeof(float4));
-					}
-
-					Dz[q] = *Pointer<Float4>(primitive + OFFSET(Primitive,z.C), 16) + y * *Pointer<Float4>(primitive + OFFSET(Primitive,z.B), 16);
-				}
-			}
-
-			If(x0 < x1)
-			{
-				if(interpolateW())
-				{
-					Dw = *Pointer<Float4>(primitive + OFFSET(Primitive,w.C), 16) + yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive,w.B), 16);
-				}
-
-				if (spirvShader)
-				{
-					for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
-					{
-						if (spirvShader->inputs[interpolant].Type == SpirvShader::ATTRIBTYPE_UNUSED)
-							continue;
-
-						Dv[interpolant] = *Pointer<Float4>(primitive + OFFSET(Primitive, V[interpolant].C), 16);
-						if (!spirvShader->inputs[interpolant].Flat)
-						{
-							Dv[interpolant] +=
-									yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, V[interpolant].B), 16);
-						}
-					}
-
-					for (unsigned int i = 0; i < state.numClipDistances; i++)
-					{
-						DclipDistance[i] = *Pointer<Float4>(primitive + OFFSET(Primitive, clipDistance[i].C), 16) +
-									yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, clipDistance[i].B), 16);
-					}
-
-					for (unsigned int i = 0; i < state.numCullDistances; i++)
-					{
-						DcullDistance[i] = *Pointer<Float4>(primitive + OFFSET(Primitive, cullDistance[i].C), 16) +
-									yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, cullDistance[i].B), 16);
-					}
-				}
-
-				Short4 xLeft[4];
-				Short4 xRight[4];
-
-				for(unsigned int q = 0; q < state.multiSample; q++)
-				{
-					xLeft[q] = *Pointer<Short4>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline) + y * sizeof(Primitive::Span));
-					xRight[q] = xLeft[q];
-
-					xLeft[q] = Swizzle(xLeft[q], 0x0022) - Short4(1, 2, 1, 2);
-					xRight[q] = Swizzle(xRight[q], 0x1133) - Short4(0, 1, 0, 1);
-				}
-
-				For(Int x = x0, x < x1, x += 2)
-				{
-					Short4 xxxx = Short4(x);
-					Int cMask[4];
-
-					for(unsigned int q = 0; q < state.multiSample; q++)
-					{
-						if (state.multiSampleMask & (1<<q))
-						{
-							unsigned int i = state.multiSampledBresenham ? 0 : q;
-							Short4 mask = CmpGT(xxxx, xLeft[i]) & CmpGT(xRight[i], xxxx);
-							cMask[q] = SignMask(PackSigned(mask, mask)) & 0x0000000F;
-						}
-						else
-						{
-							cMask[q] = 0;
-						}
-					}
-
-					quad(cBuffer, zBuffer, sBuffer, cMask, x, y);
-				}
-			}
-
-			for(int index = 0; index < RENDERTARGETS; index++)
-			{
-				if(state.colorWriteActive(index))
-				{
-					cBuffer[index] += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])) << (1 + clusterCountLog2);   // FIXME: Precompute
-				}
-			}
-
-			if(state.depthTestActive)
-			{
-				zBuffer += *Pointer<Int>(data + OFFSET(DrawData,depthPitchB)) << (1 + clusterCountLog2);   // FIXME: Precompute
-			}
-
-			if(state.stencilActive)
-			{
-				sBuffer += *Pointer<Int>(data + OFFSET(DrawData,stencilPitchB)) << (1 + clusterCountLog2);   // FIXME: Precompute
-			}
-
-			y += 2 * clusterCount;
-		}
-		Until(y >= yMax)
+		y += 2 * clusterCount;
 	}
-
-	Float4 QuadRasterizer::interpolate(Float4 &x, Float4 &D, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective, bool clamp)
-	{
-		Float4 interpolant = D;
-
-		if(!flat)
-		{
-			interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation, A), 16);
-
-			if(perspective)
-			{
-				interpolant *= rhw;
-			}
-		}
-
-		if(clamp)
-		{
-			interpolant = Min(Max(interpolant, Float4(0.0f)), Float4(1.0f));
-		}
-
-		return interpolant;
-	}
-
-	bool QuadRasterizer::interpolateZ() const
-	{
-		return state.depthTestActive || (spirvShader && spirvShader->hasBuiltinInput(spv::BuiltInFragCoord));
-	}
-
-	bool QuadRasterizer::interpolateW() const
-	{
-		// Note: could optimize cases where there is a fragment shader but it has no
-		// perspective-correct inputs, but that's vanishingly rare.
-		return spirvShader != nullptr;
-	}
+	Until(y >= yMax)
 }
+
+Float4 QuadRasterizer::interpolate(Float4 &x, Float4 &D, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective, bool clamp)
+{
+	Float4 interpolant = D;
+
+	if(!flat)
+	{
+		interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation, A), 16);
+
+		if(perspective)
+		{
+			interpolant *= rhw;
+		}
+	}
+
+	if(clamp)
+	{
+		interpolant = Min(Max(interpolant, Float4(0.0f)), Float4(1.0f));
+	}
+
+	return interpolant;
+}
+
+bool QuadRasterizer::interpolateZ() const
+{
+	return state.depthTestActive || (spirvShader && spirvShader->hasBuiltinInput(spv::BuiltInFragCoord));
+}
+
+bool QuadRasterizer::interpolateW() const
+{
+	// Note: could optimize cases where there is a fragment shader but it has no
+	// perspective-correct inputs, but that's vanishingly rare.
+	return spirvShader != nullptr;
+}
+
+}  // namespace sw
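
(Aside: interpolate() above evaluates the per-primitive plane equation. With
D pre-loaded as C + B*y once per row pair in rasterize(), a perspective-correct
input v at pixel (x, y) comes out as

    v(x, y) = (C + A \, x + B \, y) \cdot \frac{1}{w}

where 1/w (rhw) is itself linear in screen space; flat inputs skip the x and y
terms entirely, and clamped inputs are limited to [0, 1] afterwards.)
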
diff --git a/src/Device/QuadRasterizer.hpp b/src/Device/QuadRasterizer.hpp
index 6d349e7..0311d8a 100644
--- a/src/Device/QuadRasterizer.hpp
+++ b/src/Device/QuadRasterizer.hpp
@@ -20,40 +20,41 @@
 #include "Pipeline/SpirvShader.hpp"
 #include "System/Types.hpp"
 
-namespace sw
+namespace sw {
+
+class QuadRasterizer : public Rasterizer
 {
-	class QuadRasterizer : public Rasterizer
-	{
-	public:
-		QuadRasterizer(const PixelProcessor::State &state, SpirvShader const *spirvShader);
-		virtual ~QuadRasterizer();
+public:
+	QuadRasterizer(const PixelProcessor::State &state, SpirvShader const *spirvShader);
+	virtual ~QuadRasterizer();
 
-		void generate();
+	void generate();
 
-	protected:
-		Pointer<Byte> constants;
+protected:
+	Pointer<Byte> constants;
 
-		Float4 Dz[4];
-		Float4 Dw;
-		Float4 Dv[MAX_INTERFACE_COMPONENTS];
-		Float4 Df;
-		Float4 DclipDistance[MAX_CLIP_DISTANCES];
-		Float4 DcullDistance[MAX_CULL_DISTANCES];
+	Float4 Dz[4];
+	Float4 Dw;
+	Float4 Dv[MAX_INTERFACE_COMPONENTS];
+	Float4 Df;
+	Float4 DclipDistance[MAX_CLIP_DISTANCES];
+	Float4 DcullDistance[MAX_CULL_DISTANCES];
 
-		UInt occlusion;
+	UInt occlusion;
 
-		virtual void quad(Pointer<Byte> cBuffer[4], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y) = 0;
+	virtual void quad(Pointer<Byte> cBuffer[4], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y) = 0;
 
-		bool interpolateZ() const;
-		bool interpolateW() const;
-		Float4 interpolate(Float4 &x, Float4 &D, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective, bool clamp);
+	bool interpolateZ() const;
+	bool interpolateW() const;
+	Float4 interpolate(Float4 &x, Float4 &D, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective, bool clamp);
 
-		const PixelProcessor::State &state;
-		const SpirvShader *const spirvShader;
+	const PixelProcessor::State &state;
+	const SpirvShader *const spirvShader;
 
-	private:
-		void rasterize(Int &yMin, Int &yMax);
-	};
-}
+private:
+	void rasterize(Int &yMin, Int &yMax);
+};
+
+}  // namespace sw
 
 #endif   // sw_QuadRasterizer_hpp
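
(Aside: the cluster arithmetic in QuadRasterizer::generate() hands each worker
an interleaved stripe of row pairs. A scalar transcription of that alignment,
illustrative only:)

    // First row pair owned by 'cluster' at or after an unaligned yMin;
    // rows then advance by 2 * clusterCount, matching rasterize()'s
    // stride. Assumes clusterCount is a power of two.
    int firstRowForCluster(int yMin, int cluster, int clusterCount)
    {
        int cluster2 = cluster + cluster;
        yMin += clusterCount * 2 - 2 - cluster2;
        yMin &= -clusterCount * 2;
        return yMin + cluster2;
    }
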
diff --git a/src/Device/Rasterizer.hpp b/src/Device/Rasterizer.hpp
index 4e64e0e..cf229e6 100644
--- a/src/Device/Rasterizer.hpp
+++ b/src/Device/Rasterizer.hpp
@@ -19,21 +19,22 @@
 #include "PixelProcessor.hpp"
 #include "Device/Config.hpp"
 
-namespace sw
-{
-	class Rasterizer : public RasterizerFunction
-	{
-	public:
-		Rasterizer() : primitive(Arg<0>()), count(Arg<1>()), cluster(Arg<2>()), clusterCount(Arg<3>()), data(Arg<4>()) {}
-		virtual ~Rasterizer() {}
+namespace sw {
 
-	protected:
-		Pointer<Byte> primitive;
-		Int count;
-		Int cluster;
-		Int clusterCount;
-		Pointer<Byte> data;
-	};
-}
+class Rasterizer : public RasterizerFunction
+{
+public:
+	Rasterizer() : primitive(Arg<0>()), count(Arg<1>()), cluster(Arg<2>()), clusterCount(Arg<3>()), data(Arg<4>()) {}
+	virtual ~Rasterizer() {}
+
+protected:
+	Pointer<Byte> primitive;
+	Int count;
+	Int cluster;
+	Int clusterCount;
+	Pointer<Byte> data;
+};
+
+}  // namespace sw
 
 #endif   // sw_Rasterizer_hpp
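
(Aside: Rasterizer binds its routine parameters positionally via Arg<N>() to
match RasterizerFunction's signature, so the generated routine is later
invoked like a plain function. A hypothetical call site, under the assumption
that RoutineType is callable with that signature:)

    #include "PixelProcessor.hpp"   // assumed include path

    // Dispatches one cluster's share of a batch through a generated
    // routine; one argument per Arg<N>() bound above.
    void dispatch(sw::PixelProcessor::RoutineType &routine,
                  const sw::Primitive *primitives, int count,
                  int cluster, int clusterCount, sw::DrawData *draw)
    {
        routine(primitives, count, cluster, clusterCount, draw);
    }
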
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index 46bed2a..776eb13 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -43,1097 +43,832 @@
 unsigned int maxPrimitives = 1 << 21;
 #endif
 
-namespace sw
+namespace sw {
+
+template<typename T>
+inline bool setBatchIndices(unsigned int batch[128][3], VkPrimitiveTopology topology, VkProvokingVertexModeEXT provokingVertexMode, T indices, unsigned int start, unsigned int triangleCount)
 {
-	template<typename T>
-	inline bool setBatchIndices(unsigned int batch[128][3], VkPrimitiveTopology topology, VkProvokingVertexModeEXT provokingVertexMode, T indices, unsigned int start, unsigned int triangleCount)
+	bool provokeFirst = (provokingVertexMode == VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT);
+
+	switch(topology)
 	{
-		bool provokeFirst = (provokingVertexMode == VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT);
-
-		switch(topology)
+	case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
+	{
+		auto index = start;
+		auto pointBatch = &(batch[0][0]);
+		for(unsigned int i = 0; i < triangleCount; i++)
 		{
-		case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
-		{
-			auto index = start;
-			auto pointBatch = &(batch[0][0]);
-			for(unsigned int i = 0; i < triangleCount; i++)
-			{
-				*pointBatch++ = indices[index++];
-			}
-
-			// Repeat the last index to allow for SIMD width overrun.
-			index--;
-			for(unsigned int i = 0; i < 3; i++)
-			{
-				*pointBatch++ = indices[index];
-			}
-			break;
-		}
-		case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
-		{
-			auto index = 2 * start;
-			for(unsigned int i = 0; i < triangleCount; i++)
-			{
-				batch[i][0] = indices[index + (provokeFirst ? 0 : 1)];
-				batch[i][1] = indices[index + (provokeFirst ? 1 : 0)];
-				batch[i][2] = indices[index + 1];
-
-				index += 2;
-			}
-			break;
-		}
-		case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
-		{
-			auto index = start;
-			for(unsigned int i = 0; i < triangleCount; i++)
-			{
-				batch[i][0] = indices[index + (provokeFirst ? 0 : 1)];
-				batch[i][1] = indices[index + (provokeFirst ? 1 : 0)];
-				batch[i][2] = indices[index + 1];
-
-				index += 1;
-			}
-			break;
-		}
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
-		{
-			auto index = 3 * start;
-			for(unsigned int i = 0; i < triangleCount; i++)
-			{
-				batch[i][0] = indices[index + (provokeFirst ? 0 : 2)];
-				batch[i][1] = indices[index + (provokeFirst ? 1 : 0)];
-				batch[i][2] = indices[index + (provokeFirst ? 2 : 1)];
-
-				index += 3;
-			}
-			break;
-		}
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
-		{
-			auto index = start;
-			for(unsigned int i = 0; i < triangleCount; i++)
-			{
-				batch[i][0] = indices[index + (provokeFirst ? 0 : 2)];
-				batch[i][1] = indices[index + ((start + i) & 1) + (provokeFirst ? 1 : 0)];
-				batch[i][2] = indices[index + (~(start + i) & 1) + (provokeFirst ? 1 : 0)];
-
-				index += 1;
-			}
-			break;
-		}
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
-		{
-			auto index = start + 1;
-			for(unsigned int i = 0; i < triangleCount; i++)
-			{
-				batch[i][provokeFirst ? 0 : 2] = indices[index + 0];
-				batch[i][provokeFirst ? 1 : 0] = indices[index + 1];
-				batch[i][provokeFirst ? 2 : 1] = indices[0];
-
-				index += 1;
-			}
-			break;
-		}
-		default:
-			ASSERT(false);
-			return false;
+			*pointBatch++ = indices[index++];
 		}
 
-		return true;
+		// Repeat the last index to allow for SIMD width overrun.
+		index--;
+		for(unsigned int i = 0; i < 3; i++)
+		{
+			*pointBatch++ = indices[index];
+		}
+		break;
 	}
-
-	DrawCall::DrawCall()
+	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
 	{
-		data = (DrawData*)allocate(sizeof(DrawData));
-		data->constants = &constants;
+		auto index = 2 * start;
+		for(unsigned int i = 0; i < triangleCount; i++)
+		{
+			batch[i][0] = indices[index + (provokeFirst ? 0 : 1)];
+			batch[i][1] = indices[index + (provokeFirst ? 1 : 0)];
+			batch[i][2] = indices[index + 1];
+
+			index += 2;
+		}
+		break;
 	}
-
-	DrawCall::~DrawCall()
+	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
 	{
-		deallocate(data);
+		auto index = start;
+		for(unsigned int i = 0; i < triangleCount; i++)
+		{
+			batch[i][0] = indices[index + (provokeFirst ? 0 : 1)];
+			batch[i][1] = indices[index + (provokeFirst ? 1 : 0)];
+			batch[i][2] = indices[index + 1];
+
+			index += 1;
+		}
+		break;
 	}
-
-	Renderer::Renderer(vk::Device* device) : device(device)
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
 	{
-		VertexProcessor::setRoutineCacheSize(1024);
-		PixelProcessor::setRoutineCacheSize(1024);
-		SetupProcessor::setRoutineCacheSize(1024);
+		auto index = 3 * start;
+		for(unsigned int i = 0; i < triangleCount; i++)
+		{
+			batch[i][0] = indices[index + (provokeFirst ? 0 : 2)];
+			batch[i][1] = indices[index + (provokeFirst ? 1 : 0)];
+			batch[i][2] = indices[index + (provokeFirst ? 2 : 1)];
+
+			index += 3;
+		}
+		break;
 	}
-
-	Renderer::~Renderer()
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
 	{
-		drawTickets.take().wait();
+		auto index = start;
+		for(unsigned int i = 0; i < triangleCount; i++)
+		{
+			batch[i][0] = indices[index + (provokeFirst ? 0 : 2)];
+			batch[i][1] = indices[index + ((start + i) & 1) + (provokeFirst ? 1 : 0)];
+			batch[i][2] = indices[index + (~(start + i) & 1) + (provokeFirst ? 1 : 0)];
+
+			index += 1;
+		}
+		break;
 	}
-
-	// Renderer objects have to be mem aligned to the alignment provided in the class declaration
-	void* Renderer::operator new(size_t size)
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
 	{
-		ASSERT(size == sizeof(Renderer));  // This operator can't be called from a derived class
-		return vk::allocate(sizeof(Renderer), alignof(Renderer), vk::DEVICE_MEMORY, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+		auto index = start + 1;
+		for(unsigned int i = 0; i < triangleCount; i++)
+		{
+			batch[i][provokeFirst ? 0 : 2] = indices[index + 0];
+			batch[i][provokeFirst ? 1 : 0] = indices[index + 1];
+			batch[i][provokeFirst ? 2 : 1] = indices[0];
+
+			index += 1;
+		}
+		break;
 	}
-
-	void Renderer::operator delete(void* mem)
-	{
-		vk::deallocate(mem, vk::DEVICE_MEMORY);
-	}
-
-	void Renderer::draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
-			TaskEvents *events, int instanceID, int viewID, void *indexBuffer, const VkExtent3D& framebufferExtent,
-			PushConstantStorage const & pushConstants, bool update)
-	{
-		if(count == 0) { return; }
-
-		auto id = nextDrawID++;
-		MARL_SCOPED_EVENT("draw %d", id);
-
-		#ifndef NDEBUG
-		{
-			unsigned int minPrimitives = 1;
-			unsigned int maxPrimitives = 1 << 21;
-			if(count < minPrimitives || count > maxPrimitives)
-			{
-				return;
-			}
-		}
-		#endif
-
-		int ms = context->sampleCount;
-
-		if(!context->multiSampleMask)
-		{
-			return;
-		}
-
-		marl::Pool<sw::DrawCall>::Loan draw;
-		{
-			MARL_SCOPED_EVENT("drawCallPool.borrow()");
-			draw = drawCallPool.borrow();
-		}
-		draw->id = id;
-
-		if(update)
-		{
-			MARL_SCOPED_EVENT("update");
-			vertexState = VertexProcessor::update(context);
-			setupState = SetupProcessor::update(context);
-			pixelState = PixelProcessor::update(context);
-
-			vertexRoutine = VertexProcessor::routine(vertexState, context->pipelineLayout, context->vertexShader, context->descriptorSets);
-			setupRoutine = SetupProcessor::routine(setupState);
-			pixelRoutine = PixelProcessor::routine(pixelState, context->pipelineLayout, context->pixelShader, context->descriptorSets);
-		}
-
-		DrawCall::SetupFunction setupPrimitives = nullptr;
-		unsigned int numPrimitivesPerBatch = MaxBatchSize / ms;
-
-		if(context->isDrawTriangle(false))
-		{
-			switch(context->polygonMode)
-			{
-			case VK_POLYGON_MODE_FILL:
-				setupPrimitives = &DrawCall::setupSolidTriangles;
-				break;
-			case VK_POLYGON_MODE_LINE:
-				setupPrimitives = &DrawCall::setupWireframeTriangles;
-				numPrimitivesPerBatch /= 3;
-				break;
-			case VK_POLYGON_MODE_POINT:
-				setupPrimitives = &DrawCall::setupPointTriangles;
-				numPrimitivesPerBatch /= 3;
-				break;
-			default:
-				UNSUPPORTED("polygon mode: %d", int(context->polygonMode));
-				return;
-			}
-		}
-		else if(context->isDrawLine(false))
-		{
-			setupPrimitives = &DrawCall::setupLines;
-		}
-		else  // Point primitive topology
-		{
-			setupPrimitives = &DrawCall::setupPoints;
-		}
-
-		DrawData *data = draw->data;
-		draw->occlusionQuery = occlusionQuery;
-		draw->batchDataPool = &batchDataPool;
-		draw->numPrimitives = count;
-		draw->numPrimitivesPerBatch = numPrimitivesPerBatch;
-		draw->numBatches = (count + draw->numPrimitivesPerBatch - 1) / draw->numPrimitivesPerBatch;
-		draw->topology = context->topology;
-		draw->provokingVertexMode = context->provokingVertexMode;
-		draw->indexType = indexType;
-		draw->lineRasterizationMode = context->lineRasterizationMode;
-
-		draw->vertexRoutine = vertexRoutine;
-		draw->setupRoutine = setupRoutine;
-		draw->pixelRoutine = pixelRoutine;
-		draw->setupPrimitives = setupPrimitives;
-		draw->setupState = setupState;
-
-		data->descriptorSets = context->descriptorSets;
-		data->descriptorDynamicOffsets = context->descriptorDynamicOffsets;
-
-		for(int i = 0; i < MAX_INTERFACE_COMPONENTS/4; i++)
-		{
-			data->input[i] = context->input[i].buffer;
-			data->robustnessSize[i] = context->input[i].robustnessSize;
-			data->stride[i] = context->input[i].vertexStride;
-		}
-
-		data->indices = indexBuffer;
-		data->viewID = viewID;
-		data->instanceID = instanceID;
-		data->baseVertex = baseVertex;
-
-		if(pixelState.stencilActive)
-		{
-			data->stencil[0].set(context->frontStencil.reference, context->frontStencil.compareMask, context->frontStencil.writeMask);
-			data->stencil[1].set(context->backStencil.reference, context->backStencil.compareMask, context->backStencil.writeMask);
-		}
-
-		data->lineWidth = context->lineWidth;
-
-		data->factor = factor;
-
-		if(pixelState.alphaToCoverage)
-		{
-			if(ms == 4)
-			{
-				data->a2c0 = replicate(0.2f);
-				data->a2c1 = replicate(0.4f);
-				data->a2c2 = replicate(0.6f);
-				data->a2c3 = replicate(0.8f);
-			}
-			else if(ms == 2)
-			{
-				data->a2c0 = replicate(0.25f);
-				data->a2c1 = replicate(0.75f);
-			}
-			else ASSERT(false);
-		}
-
-		if(pixelState.occlusionEnabled)
-		{
-			for(int cluster = 0; cluster < MaxClusterCount; cluster++)
-			{
-				data->occlusion[cluster] = 0;
-			}
-		}
-
-		// Viewport
-		{
-			float W = 0.5f * viewport.width;
-			float H = 0.5f * viewport.height;
-			float X0 = viewport.x + W;
-			float Y0 = viewport.y + H;
-			float N = viewport.minDepth;
-			float F = viewport.maxDepth;
-			float Z = F - N;
-			constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
-
-			if(context->isDrawTriangle(false))
-			{
-				N += context->depthBias;
-			}
-
-			data->WxF = replicate(W * subPixF);
-			data->HxF = replicate(H * subPixF);
-			data->X0xF = replicate(X0 * subPixF - subPixF / 2);
-			data->Y0xF = replicate(Y0 * subPixF - subPixF / 2);
-			data->halfPixelX = replicate(0.5f / W);
-			data->halfPixelY = replicate(0.5f / H);
-			data->viewportHeight = abs(viewport.height);
-			data->slopeDepthBias = context->slopeDepthBias;
-			data->depthRange = Z;
-			data->depthNear = N;
-		}
-
-		// Target
-		{
-			for(int index = 0; index < RENDERTARGETS; index++)
-			{
-				draw->renderTarget[index] = context->renderTarget[index];
-
-				if(draw->renderTarget[index])
-				{
-					data->colorBuffer[index] = (unsigned int*)context->renderTarget[index]->getOffsetPointer({0, 0, 0}, VK_IMAGE_ASPECT_COLOR_BIT, 0, data->viewID);
-					data->colorPitchB[index] = context->renderTarget[index]->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, 0);
-					data->colorSliceB[index] = context->renderTarget[index]->slicePitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, 0);
-				}
-			}
-
-			draw->depthBuffer = context->depthBuffer;
-			draw->stencilBuffer = context->stencilBuffer;
-
-			if(draw->depthBuffer)
-			{
-				data->depthBuffer = (float*)context->depthBuffer->getOffsetPointer({0, 0, 0}, VK_IMAGE_ASPECT_DEPTH_BIT, 0, data->viewID);
-				data->depthPitchB = context->depthBuffer->rowPitchBytes(VK_IMAGE_ASPECT_DEPTH_BIT, 0);
-				data->depthSliceB = context->depthBuffer->slicePitchBytes(VK_IMAGE_ASPECT_DEPTH_BIT, 0);
-			}
-
-			if(draw->stencilBuffer)
-			{
-				data->stencilBuffer = (unsigned char*)context->stencilBuffer->getOffsetPointer({0, 0, 0}, VK_IMAGE_ASPECT_STENCIL_BIT, 0, data->viewID);
-				data->stencilPitchB = context->stencilBuffer->rowPitchBytes(VK_IMAGE_ASPECT_STENCIL_BIT, 0);
-				data->stencilSliceB = context->stencilBuffer->slicePitchBytes(VK_IMAGE_ASPECT_STENCIL_BIT, 0);
-			}
-		}
-
-		// Scissor
-		{
-			data->scissorX0 = clamp<int>(scissor.offset.x, 0, framebufferExtent.width);
-			data->scissorX1 = clamp<int>(scissor.offset.x + scissor.extent.width, 0, framebufferExtent.width);
-			data->scissorY0 = clamp<int>(scissor.offset.y, 0, framebufferExtent.height);
-			data->scissorY1 = clamp<int>(scissor.offset.y + scissor.extent.height, 0, framebufferExtent.height);
-		}
-
-		// Push constants
-		{
-			data->pushConstants = pushConstants;
-		}
-
-		draw->events = events;
-
-		DrawCall::run(draw, &drawTickets, clusterQueues);
-	}
-
-	void DrawCall::setup()
-	{
-		if(occlusionQuery != nullptr)
-		{
-			occlusionQuery->start();
-		}
-
-		if(events)
-		{
-			events->start();
-		}
-	}
-
-	void DrawCall::teardown()
-	{
-		if(events)
-		{
-			events->finish();
-			events = nullptr;
-		}
-
-		if (occlusionQuery != nullptr)
-		{
-			for(int cluster = 0; cluster < MaxClusterCount; cluster++)
-			{
-				occlusionQuery->add(data->occlusion[cluster]);
-			}
-			occlusionQuery->finish();
-		}
-
-		vertexRoutine = {};
-		setupRoutine = {};
-		pixelRoutine = {};
-	}
-
-	void DrawCall::run(const marl::Loan<DrawCall>& draw, marl::Ticket::Queue* tickets, marl::Ticket::Queue clusterQueues[MaxClusterCount])
-	{
-		draw->setup();
-
-		auto const numPrimitives = draw->numPrimitives;
-		auto const numPrimitivesPerBatch = draw->numPrimitivesPerBatch;
-		auto const numBatches = draw->numBatches;
-
-		auto ticket = tickets->take();
-		auto finally = marl::make_shared_finally([draw, ticket] {
-			MARL_SCOPED_EVENT("FINISH draw %d", draw->id);
-			draw->teardown();
-			ticket.done();
-		});
-
-		for (unsigned int batchId = 0; batchId < numBatches; batchId++)
-		{
-			auto batch = draw->batchDataPool->borrow();
-			batch->id = batchId;
-			batch->firstPrimitive = batch->id * numPrimitivesPerBatch;
-			batch->numPrimitives = std::min(batch->firstPrimitive + numPrimitivesPerBatch, numPrimitives) - batch->firstPrimitive;
-
-			for (int cluster = 0; cluster < MaxClusterCount; cluster++)
-			{
-				batch->clusterTickets[cluster] = std::move(clusterQueues[cluster].take());
-			}
-
-			marl::schedule([draw, batch, finally] {
-
-				processVertices(draw.get(), batch.get());
-
-				if (!draw->setupState.rasterizerDiscard)
-				{
-					processPrimitives(draw.get(), batch.get());
-
-					if (batch->numVisible > 0)
-					{
-						processPixels(draw, batch, finally);
-						return;
-					}
-				}
-
-				for (int cluster = 0; cluster < MaxClusterCount; cluster++)
-				{
-					batch->clusterTickets[cluster].done();
-				}
-			});
-		}
-	}
-
-	void DrawCall::processVertices(DrawCall* draw, BatchData* batch)
-	{
-		MARL_SCOPED_EVENT("VERTEX draw %d, batch %d", draw->id, batch->id);
-
-		unsigned int triangleIndices[MaxBatchSize + 1][3];  // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size.
-		{
-			MARL_SCOPED_EVENT("processPrimitiveVertices");
-			processPrimitiveVertices(
-				triangleIndices,
-				draw->data->indices,
-				draw->indexType,
-				batch->firstPrimitive,
-				batch->numPrimitives,
-				draw->topology,
-				draw->provokingVertexMode);
-		}
-
-		auto& vertexTask = batch->vertexTask;
-		vertexTask.primitiveStart = batch->firstPrimitive;
-		// We're only using batch compaction for points, not lines
-		vertexTask.vertexCount = batch->numPrimitives * ((draw->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST) ? 1 : 3);
-		if (vertexTask.vertexCache.drawCall != draw->id)
-		{
-			vertexTask.vertexCache.clear();
-			vertexTask.vertexCache.drawCall = draw->id;
-		}
-
-		draw->vertexRoutine(&batch->triangles.front().v0, &triangleIndices[0][0], &vertexTask, draw->data);
-	}
-
-	void DrawCall::processPrimitives(DrawCall* draw, BatchData* batch)
-	{
-		MARL_SCOPED_EVENT("PRIMITIVES draw %d batch %d", draw->id, batch->id);
-		auto triangles = &batch->triangles[0];
-		auto primitives = &batch->primitives[0];
-		batch->numVisible = draw->setupPrimitives(triangles, primitives, draw, batch->numPrimitives);
-	}
-
-	void DrawCall::processPixels(const marl::Loan<DrawCall>& draw, const marl::Loan<BatchData>& batch, const std::shared_ptr<marl::Finally>& finally)
-	{
-		struct Data
-		{
-			Data(const marl::Loan<DrawCall>& draw, const marl::Loan<BatchData>& batch, const std::shared_ptr<marl::Finally>& finally)
-				: draw(draw), batch(batch), finally(finally) {}
-			marl::Loan<DrawCall> draw;
-			marl::Loan<BatchData> batch;
-			std::shared_ptr<marl::Finally> finally;
-		};
-		auto data = std::make_shared<Data>(draw, batch, finally);
-		for (int cluster = 0; cluster < MaxClusterCount; cluster++)
-		{
-			batch->clusterTickets[cluster].onCall([data, cluster]
-			{
-				auto& draw = data->draw;
-				auto& batch = data->batch;
-				MARL_SCOPED_EVENT("PIXEL draw %d, batch %d, cluster %d", draw->id, batch->id, cluster);
-				draw->pixelRoutine(&batch->primitives.front(), batch->numVisible, cluster, MaxClusterCount, draw->data);
-				batch->clusterTickets[cluster].done();
-			});
-		}
-	}
-
-	void Renderer::synchronize()
-	{
-		MARL_SCOPED_EVENT("synchronize");
-		auto ticket = drawTickets.take();
-		ticket.wait();
-		device->updateSamplingRoutineConstCache();
-		ticket.done();
-	}
-
-	void DrawCall::processPrimitiveVertices(
-		unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
-		const void *primitiveIndices,
-		VkIndexType indexType,
-		unsigned int start,
-		unsigned int triangleCount,
-		VkPrimitiveTopology topology,
-		VkProvokingVertexModeEXT provokingVertexMode)
-	{
-		if(!primitiveIndices)
-		{
-			struct LinearIndex
-			{
-				unsigned int operator[](unsigned int i) { return i; }
-			};
-
-			if(!setBatchIndices(triangleIndicesOut, topology, provokingVertexMode, LinearIndex(), start, triangleCount))
-			{
-				return;
-			}
-		}
-		else
-		{
-			switch(indexType)
-			{
-			case VK_INDEX_TYPE_UINT16:
-				if(!setBatchIndices(triangleIndicesOut, topology, provokingVertexMode, static_cast<const uint16_t*>(primitiveIndices), start, triangleCount))
-				{
-					return;
-				}
-				break;
-			case VK_INDEX_TYPE_UINT32:
-				if(!setBatchIndices(triangleIndicesOut, topology, provokingVertexMode, static_cast<const uint32_t*>(primitiveIndices), start, triangleCount))
-				{
-					return;
-				}
-				break;
-			default:
-				ASSERT(false);
-				return;
-			}
-		}
-
-		// setBatchIndices() already handles the point case, which differs due to batch compaction
-		if (topology != VK_PRIMITIVE_TOPOLOGY_POINT_LIST)
-		{
-			// Repeat the last index to allow for SIMD width overrun.
-			triangleIndicesOut[triangleCount][0] = triangleIndicesOut[triangleCount - 1][2];
-			triangleIndicesOut[triangleCount][1] = triangleIndicesOut[triangleCount - 1][2];
-			triangleIndicesOut[triangleCount][2] = triangleIndicesOut[triangleCount - 1][2];
-		}
-	}
-
-	int DrawCall::setupSolidTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
-	{
-		auto &state = drawCall->setupState;
-
-		int ms = state.multiSample;
-		const DrawData *data = drawCall->data;
-		int visible = 0;
-
-		for(int i = 0; i < count; i++, triangles++)
-		{
-			Vertex &v0 = triangles->v0;
-			Vertex &v1 = triangles->v1;
-			Vertex &v2 = triangles->v2;
-
-			Polygon polygon(&v0.position, &v1.position, &v2.position);
-
-			if((v0.cullMask | v1.cullMask | v2.cullMask) == 0)
-			{
-				continue;
-			}
-
-			if((v0.clipFlags & v1.clipFlags & v2.clipFlags) != Clipper::CLIP_FINITE)
-			{
-				continue;
-			}
-
-			int clipFlagsOr = v0.clipFlags | v1.clipFlags | v2.clipFlags;
-			if(clipFlagsOr != Clipper::CLIP_FINITE)
-			{
-				if(!Clipper::Clip(polygon, clipFlagsOr, *drawCall))
-				{
-					continue;
-				}
-			}
-
-			if(drawCall->setupRoutine(primitives, triangles, &polygon, data))
-			{
-				primitives += ms;
-				visible++;
-			}
-		}
-
-		return visible;
-	}
-
-	int DrawCall::setupWireframeTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
-	{
-		auto& state = drawCall->setupState;
-
-		int ms = state.multiSample;
-		int visible = 0;
-
-		for(int i = 0; i < count; i++)
-		{
-			const Vertex &v0 = triangles[i].v0;
-			const Vertex &v1 = triangles[i].v1;
-			const Vertex &v2 = triangles[i].v2;
-
-			float d = (v0.y * v1.x - v0.x * v1.y) * v2.w +
-			          (v0.x * v2.y - v0.y * v2.x) * v1.w +
-			          (v2.x * v1.y - v1.x * v2.y) * v0.w;
-
-			bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (d > 0) : (d < 0);
-			if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
-			{
-				if(frontFacing) continue;
-			}
-			if(state.cullMode & VK_CULL_MODE_BACK_BIT)
-			{
-				if(!frontFacing) continue;
-			}
-
-			Triangle lines[3];
-			lines[0].v0 = v0;
-			lines[0].v1 = v1;
-			lines[1].v0 = v1;
-			lines[1].v1 = v2;
-			lines[2].v0 = v2;
-			lines[2].v1 = v0;
-
-			for(int i = 0; i < 3; i++)
-			{
-				if(setupLine(*primitives, lines[i], *drawCall))
-				{
-					primitives += ms;
-					visible++;
-				}
-			}
-		}
-
-		return visible;
-	}
-
-	int DrawCall::setupPointTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
-	{
-		auto& state = drawCall->setupState;
-
-		int ms = state.multiSample;
-		int visible = 0;
-
-		for(int i = 0; i < count; i++)
-		{
-			const Vertex &v0 = triangles[i].v0;
-			const Vertex &v1 = triangles[i].v1;
-			const Vertex &v2 = triangles[i].v2;
-
-			float d = (v0.y * v1.x - v0.x * v1.y) * v2.w +
-			          (v0.x * v2.y - v0.y * v2.x) * v1.w +
-			          (v2.x * v1.y - v1.x * v2.y) * v0.w;
-
-			bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (d > 0) : (d < 0);
-			if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
-			{
-				if(frontFacing) continue;
-			}
-			if(state.cullMode & VK_CULL_MODE_BACK_BIT)
-			{
-				if(!frontFacing) continue;
-			}
-
-			Triangle points[3];
-			points[0].v0 = v0;
-			points[1].v0 = v1;
-			points[2].v0 = v2;
-
-			for(int i = 0; i < 3; i++)
-			{
-				if(setupPoint(*primitives, points[i], *drawCall))
-				{
-					primitives += ms;
-					visible++;
-				}
-			}
-		}
-
-		return visible;
-	}
-
-	int DrawCall::setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
-	{
-		auto &state = drawCall->setupState;
-
-		int visible = 0;
-		int ms = state.multiSample;
-
-		for(int i = 0; i < count; i++)
-		{
-			if(setupLine(*primitives, *triangles, *drawCall))
-			{
-				primitives += ms;
-				visible++;
-			}
-
-			triangles++;
-		}
-
-		return visible;
-	}
-
-	int DrawCall::setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
-	{
-		auto &state = drawCall->setupState;
-
-		int visible = 0;
-		int ms = state.multiSample;
-
-		for(int i = 0; i < count; i++)
-		{
-			if(setupPoint(*primitives, *triangles, *drawCall))
-			{
-				primitives += ms;
-				visible++;
-			}
-
-			triangles++;
-		}
-
-		return visible;
-	}
-
-	bool DrawCall::setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
-	{
-		const DrawData &data = *draw.data;
-
-		float lineWidth = data.lineWidth;
-
-		Vertex &v0 = triangle.v0;
-		Vertex &v1 = triangle.v1;
-
-		if((v0.cullMask | v1.cullMask) == 0)
-		{
-			return false;
-		}
-
-		const float4 &P0 = v0.position;
-		const float4 &P1 = v1.position;
-
-		if(P0.w <= 0 && P1.w <= 0)
-		{
-			return false;
-		}
-
-		constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
-
-		const float W = data.WxF[0] * (1.0f / subPixF);
-		const float H = data.HxF[0] * (1.0f / subPixF);
-
-		float dx = W * (P1.x / P1.w - P0.x / P0.w);
-		float dy = H * (P1.y / P1.w - P0.y / P0.w);
-
-		if(dx == 0 && dy == 0)
-		{
-			return false;
-		}
-
-		if(draw.lineRasterizationMode != VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT)
-		{
-			// Rectangle centered on the line segment
-
-			float4 P[4];
-			int C[4];
-
-			P[0] = P0;
-			P[1] = P1;
-			P[2] = P1;
-			P[3] = P0;
-
-			float scale = lineWidth * 0.5f / sqrt(dx*dx + dy*dy);
-
-			dx *= scale;
-			dy *= scale;
-
-			float dx0h = dx * P0.w / H;
-			float dy0w = dy * P0.w / W;
-
-			float dx1h = dx * P1.w / H;
-			float dy1w = dy * P1.w / W;
-
-			P[0].x += -dy0w;
-			P[0].y += +dx0h;
-			C[0] = Clipper::ComputeClipFlags(P[0]);
-
-			P[1].x += -dy1w;
-			P[1].y += +dx1h;
-			C[1] = Clipper::ComputeClipFlags(P[1]);
-
-			P[2].x += +dy1w;
-			P[2].y += -dx1h;
-			C[2] = Clipper::ComputeClipFlags(P[2]);
-
-			P[3].x += +dy0w;
-			P[3].y += -dx0h;
-			C[3] = Clipper::ComputeClipFlags(P[3]);
-
-			if((C[0] & C[1] & C[2] & C[3]) == Clipper::CLIP_FINITE)
-			{
-				Polygon polygon(P, 4);
-
-				int clipFlagsOr = C[0] | C[1] | C[2] | C[3];
-
-				if(clipFlagsOr != Clipper::CLIP_FINITE)
-				{
-					if(!Clipper::Clip(polygon, clipFlagsOr, draw))
-					{
-						return false;
-					}
-				}
-
-				return draw.setupRoutine(&primitive, &triangle, &polygon, &data);
-			}
-		}
-		else if(false)  // TODO(b/80135519): Deprecate
-		{
-			// Connecting diamonds polygon
-			// This shape satisfies the diamond test convention, except for the exit rule part.
-			// Line segments with overlapping endpoints have duplicate fragments.
-			// The ideal algorithm requires half-open line rasterization (b/80135519).
-
-			float4 P[8];
-			int C[8];
-
-			P[0] = P0;
-			P[1] = P0;
-			P[2] = P0;
-			P[3] = P0;
-			P[4] = P1;
-			P[5] = P1;
-			P[6] = P1;
-			P[7] = P1;
-
-			float dx0 = lineWidth * 0.5f * P0.w / W;
-			float dy0 = lineWidth * 0.5f * P0.w / H;
-
-			float dx1 = lineWidth * 0.5f * P1.w / W;
-			float dy1 = lineWidth * 0.5f * P1.w / H;
-
-			P[0].x += -dx0;
-			C[0] = Clipper::ComputeClipFlags(P[0]);
-
-			P[1].y += +dy0;
-			C[1] = Clipper::ComputeClipFlags(P[1]);
-
-			P[2].x += +dx0;
-			C[2] = Clipper::ComputeClipFlags(P[2]);
-
-			P[3].y += -dy0;
-			C[3] = Clipper::ComputeClipFlags(P[3]);
-
-			P[4].x += -dx1;
-			C[4] = Clipper::ComputeClipFlags(P[4]);
-
-			P[5].y += +dy1;
-			C[5] = Clipper::ComputeClipFlags(P[5]);
-
-			P[6].x += +dx1;
-			C[6] = Clipper::ComputeClipFlags(P[6]);
-
-			P[7].y += -dy1;
-			C[7] = Clipper::ComputeClipFlags(P[7]);
-
-			if((C[0] & C[1] & C[2] & C[3] & C[4] & C[5] & C[6] & C[7]) == Clipper::CLIP_FINITE)
-			{
-				float4 L[6];
-
-				if(dx > -dy)
-				{
-					if(dx > dy)   // Right
-					{
-						L[0] = P[0];
-						L[1] = P[1];
-						L[2] = P[5];
-						L[3] = P[6];
-						L[4] = P[7];
-						L[5] = P[3];
-					}
-					else   // Down
-					{
-						L[0] = P[0];
-						L[1] = P[4];
-						L[2] = P[5];
-						L[3] = P[6];
-						L[4] = P[2];
-						L[5] = P[3];
-					}
-				}
-				else
-				{
-					if(dx > dy)   // Up
-					{
-						L[0] = P[0];
-						L[1] = P[1];
-						L[2] = P[2];
-						L[3] = P[6];
-						L[4] = P[7];
-						L[5] = P[4];
-					}
-					else   // Left
-					{
-						L[0] = P[1];
-						L[1] = P[2];
-						L[2] = P[3];
-						L[3] = P[7];
-						L[4] = P[4];
-						L[5] = P[5];
-					}
-				}
-
-				Polygon polygon(L, 6);
-
-				int clipFlagsOr = C[0] | C[1] | C[2] | C[3] | C[4] | C[5] | C[6] | C[7];
-
-				if(clipFlagsOr != Clipper::CLIP_FINITE)
-				{
-					if(!Clipper::Clip(polygon, clipFlagsOr, draw))
-					{
-						return false;
-					}
-				}
-
-				return draw.setupRoutine(&primitive, &triangle, &polygon, &data);
-			}
-		}
-		else
-		{
-			// Parallelogram approximating Bresenham line
-			// This algorithm does not satisfy the ideal diamond-exit rule, but does avoid the
-			// duplicate fragment rasterization problem and satisfies all of Vulkan's minimum
-			// requirements for Bresenham line segment rasterization.
-
-			float4 P[8];
-			P[0] = P0;
-			P[1] = P0;
-			P[2] = P0;
-			P[3] = P0;
-			P[4] = P1;
-			P[5] = P1;
-			P[6] = P1;
-			P[7] = P1;
-
-			float dx0 = lineWidth * 0.5f * P0.w / W;
-			float dy0 = lineWidth * 0.5f * P0.w / H;
-
-			float dx1 = lineWidth * 0.5f * P1.w / W;
-			float dy1 = lineWidth * 0.5f * P1.w / H;
-
-			P[0].x += -dx0;
-			P[1].y += +dy0;
-			P[2].x += +dx0;
-			P[3].y += -dy0;
-			P[4].x += -dx1;
-			P[5].y += +dy1;
-			P[6].x += +dx1;
-			P[7].y += -dy1;
-
-			float4 L[4];
-
-			if(dx > -dy)
-			{
-				if(dx > dy)   // Right
-				{
-					L[0] = P[1];
-					L[1] = P[5];
-					L[2] = P[7];
-					L[3] = P[3];
-				}
-				else   // Down
-				{
-					L[0] = P[0];
-					L[1] = P[4];
-					L[2] = P[6];
-					L[3] = P[2];
-				}
-			}
-			else
-			{
-				if(dx > dy)   // Up
-				{
-					L[0] = P[0];
-					L[1] = P[2];
-					L[2] = P[6];
-					L[3] = P[4];
-				}
-				else   // Left
-				{
-					L[0] = P[1];
-					L[1] = P[3];
-					L[2] = P[7];
-					L[3] = P[5];
-				}
-			}
-
-			int C0 = Clipper::ComputeClipFlags(L[0]);
-			int C1 = Clipper::ComputeClipFlags(L[1]);
-			int C2 = Clipper::ComputeClipFlags(L[2]);
-			int C3 = Clipper::ComputeClipFlags(L[3]);
-
-			if((C0 & C1 & C2 & C3) == Clipper::CLIP_FINITE)
-			{
-				Polygon polygon(L, 4);
-
-				int clipFlagsOr = C0 | C1 | C2 | C3;
-
-				if(clipFlagsOr != Clipper::CLIP_FINITE)
-				{
-					if(!Clipper::Clip(polygon, clipFlagsOr, draw))
-					{
-						return false;
-					}
-				}
-
-				return draw.setupRoutine(&primitive, &triangle, &polygon, &data);
-			}
-		}
-
+	default:
+		ASSERT(false);
 		return false;
 	}
 
-	bool DrawCall::setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
+	return true;
+}
+
+DrawCall::DrawCall()
+{
+	data = (DrawData*)allocate(sizeof(DrawData));
+	data->constants = &constants;
+}
+
+DrawCall::~DrawCall()
+{
+	deallocate(data);
+}
+
+Renderer::Renderer(vk::Device* device) : device(device)
+{
+	VertexProcessor::setRoutineCacheSize(1024);
+	PixelProcessor::setRoutineCacheSize(1024);
+	SetupProcessor::setRoutineCacheSize(1024);
+}
+
+Renderer::~Renderer()
+{
+	drawTickets.take().wait();
+}
+
+// Renderer objects must be memory-aligned to the alignment specified in the class declaration
+void* Renderer::operator new(size_t size)
+{
+	ASSERT(size == sizeof(Renderer));  // This operator can't be called from a derived class
+	return vk::allocate(sizeof(Renderer), alignof(Renderer), vk::DEVICE_MEMORY, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+}
+
+void Renderer::operator delete(void* mem)
+{
+	vk::deallocate(mem, vk::DEVICE_MEMORY);
+}
+
+void Renderer::draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
+		TaskEvents *events, int instanceID, int viewID, void *indexBuffer, const VkExtent3D& framebufferExtent,
+		PushConstantStorage const & pushConstants, bool update)
+{
+	if(count == 0) { return; }
+
+	auto id = nextDrawID++;
+	MARL_SCOPED_EVENT("draw %d", id);
+
+	#ifndef NDEBUG
 	{
-		const DrawData &data = *draw.data;
-
-		Vertex &v = triangle.v0;
-
-		if(v.cullMask == 0)
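+		// Debug-only sanity bounds on the primitive count; implausible draws are skipped.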
+		unsigned int minPrimitives = 1;
+		unsigned int maxPrimitives = 1 << 21;
+		if(count < minPrimitives || count > maxPrimitives)
 		{
-			return false;
+			return;
+		}
+	}
+	#endif
+
+	int ms = context->sampleCount;
+
+	if(!context->multiSampleMask)
+	{
+		return;
+	}
+
+	marl::Pool<sw::DrawCall>::Loan draw;
+	{
+		MARL_SCOPED_EVENT("drawCallPool.borrow()");
+		draw = drawCallPool.borrow();
+	}
+	draw->id = id;
+
+	if(update)
+	{
+		MARL_SCOPED_EVENT("update");
+		vertexState = VertexProcessor::update(context);
+		setupState = SetupProcessor::update(context);
+		pixelState = PixelProcessor::update(context);
+
+		vertexRoutine = VertexProcessor::routine(vertexState, context->pipelineLayout, context->vertexShader, context->descriptorSets);
+		setupRoutine = SetupProcessor::routine(setupState);
+		pixelRoutine = PixelProcessor::routine(pixelState, context->pipelineLayout, context->pixelShader, context->descriptorSets);
+	}
+
+	DrawCall::SetupFunction setupPrimitives = nullptr;
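+	// Multisampled primitives occupy 'ms' consecutive slots in the primitive batch, so fewer fit per batch.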
+	unsigned int numPrimitivesPerBatch = MaxBatchSize / ms;
+
+	if(context->isDrawTriangle(false))
+	{
+		switch(context->polygonMode)
+		{
+		case VK_POLYGON_MODE_FILL:
+			setupPrimitives = &DrawCall::setupSolidTriangles;
+			break;
+		case VK_POLYGON_MODE_LINE:
+			setupPrimitives = &DrawCall::setupWireframeTriangles;
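+			// Each triangle expands into three lines.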
+			numPrimitivesPerBatch /= 3;
+			break;
+		case VK_POLYGON_MODE_POINT:
+			setupPrimitives = &DrawCall::setupPointTriangles;
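+			// Each triangle expands into three points.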
+			numPrimitivesPerBatch /= 3;
+			break;
+		default:
+			UNSUPPORTED("polygon mode: %d", int(context->polygonMode));
+			return;
+		}
+	}
+	else if(context->isDrawLine(false))
+	{
+		setupPrimitives = &DrawCall::setupLines;
+	}
+	else  // Point primitive topology
+	{
+		setupPrimitives = &DrawCall::setupPoints;
+	}
+
+	DrawData *data = draw->data;
+	draw->occlusionQuery = occlusionQuery;
+	draw->batchDataPool = &batchDataPool;
+	draw->numPrimitives = count;
+	draw->numPrimitivesPerBatch = numPrimitivesPerBatch;
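+	// Round up so a final partial batch is still dispatched.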
+	draw->numBatches = (count + draw->numPrimitivesPerBatch - 1) / draw->numPrimitivesPerBatch;
+	draw->topology = context->topology;
+	draw->provokingVertexMode = context->provokingVertexMode;
+	draw->indexType = indexType;
+	draw->lineRasterizationMode = context->lineRasterizationMode;
+
+	draw->vertexRoutine = vertexRoutine;
+	draw->setupRoutine = setupRoutine;
+	draw->pixelRoutine = pixelRoutine;
+	draw->setupPrimitives = setupPrimitives;
+	draw->setupState = setupState;
+
+	data->descriptorSets = context->descriptorSets;
+	data->descriptorDynamicOffsets = context->descriptorDynamicOffsets;
+
+	for(int i = 0; i < MAX_INTERFACE_COMPONENTS/4; i++)
+	{
+		data->input[i] = context->input[i].buffer;
+		data->robustnessSize[i] = context->input[i].robustnessSize;
+		data->stride[i] = context->input[i].vertexStride;
+	}
+
+	data->indices = indexBuffer;
+	data->viewID = viewID;
+	data->instanceID = instanceID;
+	data->baseVertex = baseVertex;
+
+	if(pixelState.stencilActive)
+	{
+		data->stencil[0].set(context->frontStencil.reference, context->frontStencil.compareMask, context->frontStencil.writeMask);
+		data->stencil[1].set(context->backStencil.reference, context->backStencil.compareMask, context->backStencil.writeMask);
+	}
+
+	data->lineWidth = context->lineWidth;
+
+	data->factor = factor;
+
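+	// Alpha-to-coverage: per-sample alpha thresholds used to derive a coverage mask.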
+	if(pixelState.alphaToCoverage)
+	{
+		if(ms == 4)
+		{
+			data->a2c0 = replicate(0.2f);
+			data->a2c1 = replicate(0.4f);
+			data->a2c2 = replicate(0.6f);
+			data->a2c3 = replicate(0.8f);
+		}
+		else if(ms == 2)
+		{
+			data->a2c0 = replicate(0.25f);
+			data->a2c1 = replicate(0.75f);
+		}
+		else ASSERT(false);
+	}
+
+	if(pixelState.occlusionEnabled)
+	{
+		for(int cluster = 0; cluster < MaxClusterCount; cluster++)
+		{
+			data->occlusion[cluster] = 0;
+		}
+	}
+
+	// Viewport
+	{
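+		// Precompute the NDC-to-window transform in subpixel fixed-point units.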
+		float W = 0.5f * viewport.width;
+		float H = 0.5f * viewport.height;
+		float X0 = viewport.x + W;
+		float Y0 = viewport.y + H;
+		float N = viewport.minDepth;
+		float F = viewport.maxDepth;
+		float Z = F - N;
+		constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
+
+		if(context->isDrawTriangle(false))
+		{
+			N += context->depthBias;
 		}
 
-		float pSize = v.pointSize;
+		data->WxF = replicate(W * subPixF);
+		data->HxF = replicate(H * subPixF);
+		data->X0xF = replicate(X0 * subPixF - subPixF / 2);
+		data->Y0xF = replicate(Y0 * subPixF - subPixF / 2);
+		data->halfPixelX = replicate(0.5f / W);
+		data->halfPixelY = replicate(0.5f / H);
+		data->viewportHeight = abs(viewport.height);
+		data->slopeDepthBias = context->slopeDepthBias;
+		data->depthRange = Z;
+		data->depthNear = N;
+	}
 
-		pSize = clamp(pSize, 1.0f, static_cast<float>(vk::MAX_POINT_SIZE));
+	// Target
+	{
+		for(int index = 0; index < RENDERTARGETS; index++)
+		{
+			draw->renderTarget[index] = context->renderTarget[index];
+
+			if(draw->renderTarget[index])
+			{
+				data->colorBuffer[index] = (unsigned int*)context->renderTarget[index]->getOffsetPointer({0, 0, 0}, VK_IMAGE_ASPECT_COLOR_BIT, 0, data->viewID);
+				data->colorPitchB[index] = context->renderTarget[index]->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, 0);
+				data->colorSliceB[index] = context->renderTarget[index]->slicePitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, 0);
+			}
+		}
+
+		draw->depthBuffer = context->depthBuffer;
+		draw->stencilBuffer = context->stencilBuffer;
+
+		if(draw->depthBuffer)
+		{
+			data->depthBuffer = (float*)context->depthBuffer->getOffsetPointer({0, 0, 0}, VK_IMAGE_ASPECT_DEPTH_BIT, 0, data->viewID);
+			data->depthPitchB = context->depthBuffer->rowPitchBytes(VK_IMAGE_ASPECT_DEPTH_BIT, 0);
+			data->depthSliceB = context->depthBuffer->slicePitchBytes(VK_IMAGE_ASPECT_DEPTH_BIT, 0);
+		}
+
+		if(draw->stencilBuffer)
+		{
+			data->stencilBuffer = (unsigned char*)context->stencilBuffer->getOffsetPointer({0, 0, 0}, VK_IMAGE_ASPECT_STENCIL_BIT, 0, data->viewID);
+			data->stencilPitchB = context->stencilBuffer->rowPitchBytes(VK_IMAGE_ASPECT_STENCIL_BIT, 0);
+			data->stencilSliceB = context->stencilBuffer->slicePitchBytes(VK_IMAGE_ASPECT_STENCIL_BIT, 0);
+		}
+	}
+
+	// Scissor
+	{
+		data->scissorX0 = clamp<int>(scissor.offset.x, 0, framebufferExtent.width);
+		data->scissorX1 = clamp<int>(scissor.offset.x + scissor.extent.width, 0, framebufferExtent.width);
+		data->scissorY0 = clamp<int>(scissor.offset.y, 0, framebufferExtent.height);
+		data->scissorY1 = clamp<int>(scissor.offset.y + scissor.extent.height, 0, framebufferExtent.height);
+	}
+
+	// Push constants
+	{
+		data->pushConstants = pushConstants;
+	}
+
+	draw->events = events;
+
+	DrawCall::run(draw, &drawTickets, clusterQueues);
+}
+
+void DrawCall::setup()
+{
+	if(occlusionQuery != nullptr)
+	{
+		occlusionQuery->start();
+	}
+
+	if(events)
+	{
+		events->start();
+	}
+}
+
+void DrawCall::teardown()
+{
+	if(events)
+	{
+		events->finish();
+		events = nullptr;
+	}
+
+	if (occlusionQuery != nullptr)
+	{
+		for(int cluster = 0; cluster < MaxClusterCount; cluster++)
+		{
+			occlusionQuery->add(data->occlusion[cluster]);
+		}
+		occlusionQuery->finish();
+	}
+
+	vertexRoutine = {};
+	setupRoutine = {};
+	pixelRoutine = {};
+}
+
+void DrawCall::run(const marl::Loan<DrawCall>& draw, marl::Ticket::Queue* tickets, marl::Ticket::Queue clusterQueues[MaxClusterCount])
+{
+	draw->setup();
+
+	auto const numPrimitives = draw->numPrimitives;
+	auto const numPrimitivesPerBatch = draw->numPrimitivesPerBatch;
+	auto const numBatches = draw->numBatches;
+
+	auto ticket = tickets->take();
+	auto finally = marl::make_shared_finally([draw, ticket] {
+		MARL_SCOPED_EVENT("FINISH draw %d", draw->id);
+		draw->teardown();
+		ticket.done();
+	});
+
+	for (unsigned int batchId = 0; batchId < numBatches; batchId++)
+	{
+		auto batch = draw->batchDataPool->borrow();
+		batch->id = batchId;
+		batch->firstPrimitive = batch->id * numPrimitivesPerBatch;
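+		// The final batch may contain fewer primitives than a full batch.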
+		batch->numPrimitives = std::min(batch->firstPrimitive + numPrimitivesPerBatch, numPrimitives) - batch->firstPrimitive;
+
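+		// Take one ticket per cluster so this batch's pixel work stays ordered within each cluster.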
+		for (int cluster = 0; cluster < MaxClusterCount; cluster++)
+		{
+			batch->clusterTickets[cluster] = std::move(clusterQueues[cluster].take());
+		}
+
+		marl::schedule([draw, batch, finally] {
+
+			processVertices(draw.get(), batch.get());
+
+			if (!draw->setupState.rasterizerDiscard)
+			{
+				processPrimitives(draw.get(), batch.get());
+
+				if (batch->numVisible > 0)
+				{
+					processPixels(draw, batch, finally);
+					return;
+				}
+			}
+
+			for (int cluster = 0; cluster < MaxClusterCount; cluster++)
+			{
+				batch->clusterTickets[cluster].done();
+			}
+		});
+	}
+}
+
+void DrawCall::processVertices(DrawCall* draw, BatchData* batch)
+{
+	MARL_SCOPED_EVENT("VERTEX draw %d, batch %d", draw->id, batch->id);
+
+	unsigned int triangleIndices[MaxBatchSize + 1][3];  // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size.
+	{
+		MARL_SCOPED_EVENT("processPrimitiveVertices");
+		processPrimitiveVertices(
+			triangleIndices,
+			draw->data->indices,
+			draw->indexType,
+			batch->firstPrimitive,
+			batch->numPrimitives,
+			draw->topology,
+			draw->provokingVertexMode);
+	}
+
+	auto& vertexTask = batch->vertexTask;
+	vertexTask.primitiveStart = batch->firstPrimitive;
+	// We're only using batch compaction for points, not lines
+	vertexTask.vertexCount = batch->numPrimitives * ((draw->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST) ? 1 : 3);
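+	// The vertex cache only persists within one draw call; reset it when the draw ID changes.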
+	if (vertexTask.vertexCache.drawCall != draw->id)
+	{
+		vertexTask.vertexCache.clear();
+		vertexTask.vertexCache.drawCall = draw->id;
+	}
+
+	draw->vertexRoutine(&batch->triangles.front().v0, &triangleIndices[0][0], &vertexTask, draw->data);
+}
+
+void DrawCall::processPrimitives(DrawCall* draw, BatchData* batch)
+{
+	MARL_SCOPED_EVENT("PRIMITIVES draw %d batch %d", draw->id, batch->id);
+	auto triangles = &batch->triangles[0];
+	auto primitives = &batch->primitives[0];
+	batch->numVisible = draw->setupPrimitives(triangles, primitives, draw, batch->numPrimitives);
+}
+
+void DrawCall::processPixels(const marl::Loan<DrawCall>& draw, const marl::Loan<BatchData>& batch, const std::shared_ptr<marl::Finally>& finally)
+{
+	struct Data
+	{
+		Data(const marl::Loan<DrawCall>& draw, const marl::Loan<BatchData>& batch, const std::shared_ptr<marl::Finally>& finally)
+			: draw(draw), batch(batch), finally(finally) {}
+		marl::Loan<DrawCall> draw;
+		marl::Loan<BatchData> batch;
+		std::shared_ptr<marl::Finally> finally;
+	};
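+	// Keep the draw and batch loans (and the finally) alive until the last cluster callback completes.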
+	auto data = std::make_shared<Data>(draw, batch, finally);
+	for (int cluster = 0; cluster < MaxClusterCount; cluster++)
+	{
+		batch->clusterTickets[cluster].onCall([data, cluster]
+		{
+			auto& draw = data->draw;
+			auto& batch = data->batch;
+			MARL_SCOPED_EVENT("PIXEL draw %d, batch %d, cluster %d", draw->id, batch->id, cluster);
+			draw->pixelRoutine(&batch->primitives.front(), batch->numVisible, cluster, MaxClusterCount, draw->data);
+			batch->clusterTickets[cluster].done();
+		});
+	}
+}
+
+void Renderer::synchronize()
+{
+	MARL_SCOPED_EVENT("synchronize");
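+	// Taking a ticket and waiting on it drains all previously queued draw calls.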
+	auto ticket = drawTickets.take();
+	ticket.wait();
+	device->updateSamplingRoutineConstCache();
+	ticket.done();
+}
+
+void DrawCall::processPrimitiveVertices(
+	unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
+	const void *primitiveIndices,
+	VkIndexType indexType,
+	unsigned int start,
+	unsigned int triangleCount,
+	VkPrimitiveTopology topology,
+	VkProvokingVertexModeEXT provokingVertexMode)
+{
+	if(!primitiveIndices)
+	{
+		struct LinearIndex
+		{
+			unsigned int operator[](unsigned int i) { return i; }
+		};
+
+		if(!setBatchIndices(triangleIndicesOut, topology, provokingVertexMode, LinearIndex(), start, triangleCount))
+		{
+			return;
+		}
+	}
+	else
+	{
+		switch(indexType)
+		{
+		case VK_INDEX_TYPE_UINT16:
+			if(!setBatchIndices(triangleIndicesOut, topology, provokingVertexMode, static_cast<const uint16_t*>(primitiveIndices), start, triangleCount))
+			{
+				return;
+			}
+			break;
+		case VK_INDEX_TYPE_UINT32:
+			if(!setBatchIndices(triangleIndicesOut, topology, provokingVertexMode, static_cast<const uint32_t*>(primitiveIndices), start, triangleCount))
+			{
+				return;
+			}
+			break;
+		default:
+			ASSERT(false);
+			return;
+		}
+	}
+
+	// setBatchIndices() already handles the point case, which differs due to batch compaction
+	if (topology != VK_PRIMITIVE_TOPOLOGY_POINT_LIST)
+	{
+		// Repeat the last index to allow for SIMD width overrun.
+		triangleIndicesOut[triangleCount][0] = triangleIndicesOut[triangleCount - 1][2];
+		triangleIndicesOut[triangleCount][1] = triangleIndicesOut[triangleCount - 1][2];
+		triangleIndicesOut[triangleCount][2] = triangleIndicesOut[triangleCount - 1][2];
+	}
+}
+
+int DrawCall::setupSolidTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
+{
+	auto &state = drawCall->setupState;
+
+	int ms = state.multiSample;
+	const DrawData *data = drawCall->data;
+	int visible = 0;
+
+	for(int i = 0; i < count; i++, triangles++)
+	{
+		Vertex &v0 = triangles->v0;
+		Vertex &v1 = triangles->v1;
+		Vertex &v2 = triangles->v2;
+
+		Polygon polygon(&v0.position, &v1.position, &v2.position);
+
+		if((v0.cullMask | v1.cullMask | v2.cullMask) == 0)
+		{
+			continue;
+		}
+
+		if((v0.clipFlags & v1.clipFlags & v2.clipFlags) != Clipper::CLIP_FINITE)
+		{
+			continue;
+		}
+
+		int clipFlagsOr = v0.clipFlags | v1.clipFlags | v2.clipFlags;
+		if(clipFlagsOr != Clipper::CLIP_FINITE)
+		{
+			if(!Clipper::Clip(polygon, clipFlagsOr, *drawCall))
+			{
+				continue;
+			}
+		}
+
+		if(drawCall->setupRoutine(primitives, triangles, &polygon, data))
+		{
+			primitives += ms;
+			visible++;
+		}
+	}
+
+	return visible;
+}
+
+int DrawCall::setupWireframeTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
+{
+	auto& state = drawCall->setupState;
+
+	int ms = state.multiSample;
+	int visible = 0;
+
+	for(int i = 0; i < count; i++)
+	{
+		const Vertex &v0 = triangles[i].v0;
+		const Vertex &v1 = triangles[i].v1;
+		const Vertex &v2 = triangles[i].v2;
+
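+		// The sign of this determinant of the homogeneous vertex positions gives the winding order.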
+		float d = (v0.y * v1.x - v0.x * v1.y) * v2.w +
+		          (v0.x * v2.y - v0.y * v2.x) * v1.w +
+		          (v2.x * v1.y - v1.x * v2.y) * v0.w;
+
+		bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (d > 0) : (d < 0);
+		if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
+		{
+			if(frontFacing) continue;
+		}
+		if(state.cullMode & VK_CULL_MODE_BACK_BIT)
+		{
+			if(!frontFacing) continue;
+		}
+
+		Triangle lines[3];
+		lines[0].v0 = v0;
+		lines[0].v1 = v1;
+		lines[1].v0 = v1;
+		lines[1].v1 = v2;
+		lines[2].v0 = v2;
+		lines[2].v1 = v0;
+
+		for(int i = 0; i < 3; i++)
+		{
+			if(setupLine(*primitives, lines[i], *drawCall))
+			{
+				primitives += ms;
+				visible++;
+			}
+		}
+	}
+
+	return visible;
+}
+
+int DrawCall::setupPointTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
+{
+	auto& state = drawCall->setupState;
+
+	int ms = state.multiSample;
+	int visible = 0;
+
+	for(int i = 0; i < count; i++)
+	{
+		const Vertex &v0 = triangles[i].v0;
+		const Vertex &v1 = triangles[i].v1;
+		const Vertex &v2 = triangles[i].v2;
+
+		float d = (v0.y * v1.x - v0.x * v1.y) * v2.w +
+		          (v0.x * v2.y - v0.y * v2.x) * v1.w +
+		          (v2.x * v1.y - v1.x * v2.y) * v0.w;
+
+		bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (d > 0) : (d < 0);
+		if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
+		{
+			if(frontFacing) continue;
+		}
+		if(state.cullMode & VK_CULL_MODE_BACK_BIT)
+		{
+			if(!frontFacing) continue;
+		}
+
+		Triangle points[3];
+		points[0].v0 = v0;
+		points[1].v0 = v1;
+		points[2].v0 = v2;
+
+		for(int i = 0; i < 3; i++)
+		{
+			if(setupPoint(*primitives, points[i], *drawCall))
+			{
+				primitives += ms;
+				visible++;
+			}
+		}
+	}
+
+	return visible;
+}
+
+int DrawCall::setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
+{
+	auto &state = drawCall->setupState;
+
+	int visible = 0;
+	int ms = state.multiSample;
+
+	for(int i = 0; i < count; i++)
+	{
+		if(setupLine(*primitives, *triangles, *drawCall))
+		{
+			primitives += ms;
+			visible++;
+		}
+
+		triangles++;
+	}
+
+	return visible;
+}
+
+int DrawCall::setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
+{
+	auto &state = drawCall->setupState;
+
+	int visible = 0;
+	int ms = state.multiSample;
+
+	for(int i = 0; i < count; i++)
+	{
+		if(setupPoint(*primitives, *triangles, *drawCall))
+		{
+			primitives += ms;
+			visible++;
+		}
+
+		triangles++;
+	}
+
+	return visible;
+}
+
+bool DrawCall::setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
+{
+	const DrawData &data = *draw.data;
+
+	float lineWidth = data.lineWidth;
+
+	Vertex &v0 = triangle.v0;
+	Vertex &v1 = triangle.v1;
+
+	if((v0.cullMask | v1.cullMask) == 0)
+	{
+		return false;
+	}
+
+	const float4 &P0 = v0.position;
+	const float4 &P1 = v1.position;
+
+	if(P0.w <= 0 && P1.w <= 0)
+	{
+		return false;
+	}
+
+	constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
+
+	const float W = data.WxF[0] * (1.0f / subPixF);
+	const float H = data.HxF[0] * (1.0f / subPixF);
+
+	float dx = W * (P1.x / P1.w - P0.x / P0.w);
+	float dy = H * (P1.y / P1.w - P0.y / P0.w);
+
+	if(dx == 0 && dy == 0)
+	{
+		return false;
+	}
+
+	if(draw.lineRasterizationMode != VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT)
+	{
+		// Rectangle centered on the line segment
 
 		float4 P[4];
 		int C[4];
 
-		P[0] = v.position;
-		P[1] = v.position;
-		P[2] = v.position;
-		P[3] = v.position;
+		P[0] = P0;
+		P[1] = P1;
+		P[2] = P1;
+		P[3] = P0;
 
-		const float X = pSize * P[0].w * data.halfPixelX[0];
-		const float Y = pSize * P[0].w * data.halfPixelY[0];
+		float scale = lineWidth * 0.5f / sqrt(dx*dx + dy*dy);
 
-		P[0].x -= X;
-		P[0].y += Y;
+		dx *= scale;
+		dy *= scale;
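+		// (dx, dy) now has length lineWidth/2; its perpendicular (-dy, dx) offsets the rectangle corners.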
+
+		float dx0h = dx * P0.w / H;
+		float dy0w = dy * P0.w / W;
+
+		float dx1h = dx * P1.w / H;
+		float dy1w = dy * P1.w / W;
+
+		P[0].x += -dy0w;
+		P[0].y += +dx0h;
 		C[0] = Clipper::ComputeClipFlags(P[0]);
 
-		P[1].x += X;
-		P[1].y += Y;
+		P[1].x += -dy1w;
+		P[1].y += +dx1h;
 		C[1] = Clipper::ComputeClipFlags(P[1]);
 
-		P[2].x += X;
-		P[2].y -= Y;
+		P[2].x += +dy1w;
+		P[2].y += -dx1h;
 		C[2] = Clipper::ComputeClipFlags(P[2]);
 
-		P[3].x -= X;
-		P[3].y -= Y;
+		P[3].x += +dy0w;
+		P[3].y += -dx0h;
 		C[3] = Clipper::ComputeClipFlags(P[3]);
 
-		Polygon polygon(P, 4);
-
 		if((C[0] & C[1] & C[2] & C[3]) == Clipper::CLIP_FINITE)
 		{
+			Polygon polygon(P, 4);
+
 			int clipFlagsOr = C[0] | C[1] | C[2] | C[3];
 
 			if(clipFlagsOr != Clipper::CLIP_FINITE)
@@ -1144,57 +879,322 @@
 				}
 			}
 
-			triangle.v1 = triangle.v0;
-			triangle.v2 = triangle.v0;
-
-			constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
-
-			triangle.v1.projected.x += iround(subPixF * 0.5f * pSize);
-			triangle.v2.projected.y -= iround(subPixF * 0.5f * pSize) * (data.HxF[0] > 0.0f ? 1 : -1);   // Both Direct3D and OpenGL expect (0, 0) in the top-left corner
 			return draw.setupRoutine(&primitive, &triangle, &polygon, &data);
 		}
+	}
+	else if(false)  // TODO(b/80135519): Deprecate
+	{
+		// Connecting diamonds polygon
+		// This shape satisfies the diamond test convention, except for the exit rule part.
+		// Line segments with overlapping endpoints have duplicate fragments.
+		// The ideal algorithm requires half-open line rasterization (b/80135519).
 
+		float4 P[8];
+		int C[8];
+
+		P[0] = P0;
+		P[1] = P0;
+		P[2] = P0;
+		P[3] = P0;
+		P[4] = P1;
+		P[5] = P1;
+		P[6] = P1;
+		P[7] = P1;
+
+		float dx0 = lineWidth * 0.5f * P0.w / W;
+		float dy0 = lineWidth * 0.5f * P0.w / H;
+
+		float dx1 = lineWidth * 0.5f * P1.w / W;
+		float dy1 = lineWidth * 0.5f * P1.w / H;
+
+		P[0].x += -dx0;
+		C[0] = Clipper::ComputeClipFlags(P[0]);
+
+		P[1].y += +dy0;
+		C[1] = Clipper::ComputeClipFlags(P[1]);
+
+		P[2].x += +dx0;
+		C[2] = Clipper::ComputeClipFlags(P[2]);
+
+		P[3].y += -dy0;
+		C[3] = Clipper::ComputeClipFlags(P[3]);
+
+		P[4].x += -dx1;
+		C[4] = Clipper::ComputeClipFlags(P[4]);
+
+		P[5].y += +dy1;
+		C[5] = Clipper::ComputeClipFlags(P[5]);
+
+		P[6].x += +dx1;
+		C[6] = Clipper::ComputeClipFlags(P[6]);
+
+		P[7].y += -dy1;
+		C[7] = Clipper::ComputeClipFlags(P[7]);
+
+		if((C[0] & C[1] & C[2] & C[3] & C[4] & C[5] & C[6] & C[7]) == Clipper::CLIP_FINITE)
+		{
+			float4 L[6];
+
+			if(dx > -dy)
+			{
+				if(dx > dy)   // Right
+				{
+					L[0] = P[0];
+					L[1] = P[1];
+					L[2] = P[5];
+					L[3] = P[6];
+					L[4] = P[7];
+					L[5] = P[3];
+				}
+				else   // Down
+				{
+					L[0] = P[0];
+					L[1] = P[4];
+					L[2] = P[5];
+					L[3] = P[6];
+					L[4] = P[2];
+					L[5] = P[3];
+				}
+			}
+			else
+			{
+				if(dx > dy)   // Up
+				{
+					L[0] = P[0];
+					L[1] = P[1];
+					L[2] = P[2];
+					L[3] = P[6];
+					L[4] = P[7];
+					L[5] = P[4];
+				}
+				else   // Left
+				{
+					L[0] = P[1];
+					L[1] = P[2];
+					L[2] = P[3];
+					L[3] = P[7];
+					L[4] = P[4];
+					L[5] = P[5];
+				}
+			}
+
+			Polygon polygon(L, 6);
+
+			int clipFlagsOr = C[0] | C[1] | C[2] | C[3] | C[4] | C[5] | C[6] | C[7];
+
+			if(clipFlagsOr != Clipper::CLIP_FINITE)
+			{
+				if(!Clipper::Clip(polygon, clipFlagsOr, draw))
+				{
+					return false;
+				}
+			}
+
+			return draw.setupRoutine(&primitive, &triangle, &polygon, &data);
+		}
+	}
+	else
+	{
+		// Parallelogram approximating Bresenham line
+		// This algorithm does not satisfy the ideal diamond-exit rule, but does avoid the
+		// duplicate fragment rasterization problem and satisfies all of Vulkan's minimum
+		// requirements for Bresenham line segment rasterization.
+
+		float4 P[8];
+		P[0] = P0;
+		P[1] = P0;
+		P[2] = P0;
+		P[3] = P0;
+		P[4] = P1;
+		P[5] = P1;
+		P[6] = P1;
+		P[7] = P1;
+
+		float dx0 = lineWidth * 0.5f * P0.w / W;
+		float dy0 = lineWidth * 0.5f * P0.w / H;
+
+		float dx1 = lineWidth * 0.5f * P1.w / W;
+		float dy1 = lineWidth * 0.5f * P1.w / H;
+
+		P[0].x += -dx0;
+		P[1].y += +dy0;
+		P[2].x += +dx0;
+		P[3].y += -dy0;
+		P[4].x += -dx1;
+		P[5].y += +dy1;
+		P[6].x += +dx1;
+		P[7].y += -dy1;
+
+		float4 L[4];
+
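+		// Choose the corner pairs by the line's dominant direction (right, down, up, or left).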
+		if(dx > -dy)
+		{
+			if(dx > dy)   // Right
+			{
+				L[0] = P[1];
+				L[1] = P[5];
+				L[2] = P[7];
+				L[3] = P[3];
+			}
+			else   // Down
+			{
+				L[0] = P[0];
+				L[1] = P[4];
+				L[2] = P[6];
+				L[3] = P[2];
+			}
+		}
+		else
+		{
+			if(dx > dy)   // Up
+			{
+				L[0] = P[0];
+				L[1] = P[2];
+				L[2] = P[6];
+				L[3] = P[4];
+			}
+			else   // Left
+			{
+				L[0] = P[1];
+				L[1] = P[3];
+				L[2] = P[7];
+				L[3] = P[5];
+			}
+		}
+
+		int C0 = Clipper::ComputeClipFlags(L[0]);
+		int C1 = Clipper::ComputeClipFlags(L[1]);
+		int C2 = Clipper::ComputeClipFlags(L[2]);
+		int C3 = Clipper::ComputeClipFlags(L[3]);
+
+		if((C0 & C1 & C2 & C3) == Clipper::CLIP_FINITE)
+		{
+			Polygon polygon(L, 4);
+
+			int clipFlagsOr = C0 | C1 | C2 | C3;
+
+			if(clipFlagsOr != Clipper::CLIP_FINITE)
+			{
+				if(!Clipper::Clip(polygon, clipFlagsOr, draw))
+				{
+					return false;
+				}
+			}
+
+			return draw.setupRoutine(&primitive, &triangle, &polygon, &data);
+		}
+	}
+
+	return false;
+}
+
+bool DrawCall::setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
+{
+	const DrawData &data = *draw.data;
+
+	Vertex &v = triangle.v0;
+
+	if(v.cullMask == 0)
+	{
 		return false;
 	}
 
-	void Renderer::addQuery(vk::Query *query)
+	float pSize = v.pointSize;
+
+	pSize = clamp(pSize, 1.0f, static_cast<float>(vk::MAX_POINT_SIZE));
+
+	float4 P[4];
+	int C[4];
+
+	P[0] = v.position;
+	P[1] = v.position;
+	P[2] = v.position;
+	P[3] = v.position;
+
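+	// Expand the point into a screen-aligned quad spanning pSize pixels.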
+	const float X = pSize * P[0].w * data.halfPixelX[0];
+	const float Y = pSize * P[0].w * data.halfPixelY[0];
+
+	P[0].x -= X;
+	P[0].y += Y;
+	C[0] = Clipper::ComputeClipFlags(P[0]);
+
+	P[1].x += X;
+	P[1].y += Y;
+	C[1] = Clipper::ComputeClipFlags(P[1]);
+
+	P[2].x += X;
+	P[2].y -= Y;
+	C[2] = Clipper::ComputeClipFlags(P[2]);
+
+	P[3].x -= X;
+	P[3].y -= Y;
+	C[3] = Clipper::ComputeClipFlags(P[3]);
+
+	Polygon polygon(P, 4);
+
+	if((C[0] & C[1] & C[2] & C[3]) == Clipper::CLIP_FINITE)
 	{
-		ASSERT(query->getType() == VK_QUERY_TYPE_OCCLUSION);
-		ASSERT(!occlusionQuery);
+		int clipFlagsOr = C[0] | C[1] | C[2] | C[3];
 
-		occlusionQuery = query;
-	}
-
-	void Renderer::removeQuery(vk::Query *query)
-	{
-		ASSERT(query->getType() == VK_QUERY_TYPE_OCCLUSION);
-		ASSERT(occlusionQuery == query);
-
-		occlusionQuery = nullptr;
-	}
-
-	void Renderer::advanceInstanceAttributes(Stream* inputs)
-	{
-		for(uint32_t i = 0; i < vk::MAX_VERTEX_INPUT_BINDINGS; i++)
+		if(clipFlagsOr != Clipper::CLIP_FINITE)
 		{
-			auto &attrib = inputs[i];
-			if (attrib.count && attrib.instanceStride && (attrib.instanceStride < attrib.robustnessSize))
+			if(!Clipper::Clip(polygon, clipFlagsOr, draw))
 			{
-				// Under the casts: attrib.buffer += attrib.instanceStride
-				attrib.buffer = (void const *)((uintptr_t)attrib.buffer + attrib.instanceStride);
-				attrib.robustnessSize -= attrib.instanceStride;
+				return false;
 			}
 		}
+
+		triangle.v1 = triangle.v0;
+		triangle.v2 = triangle.v0;
+
+		constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
+
+		triangle.v1.projected.x += iround(subPixF * 0.5f * pSize);
+		triangle.v2.projected.y -= iround(subPixF * 0.5f * pSize) * (data.HxF[0] > 0.0f ? 1 : -1);   // Both Direct3D and OpenGL expect (0, 0) in the top-left corner
+		return draw.setupRoutine(&primitive, &triangle, &polygon, &data);
 	}
 
-	void Renderer::setViewport(const VkViewport &viewport)
-	{
-		this->viewport = viewport;
-	}
-
-	void Renderer::setScissor(const VkRect2D &scissor)
-	{
-		this->scissor = scissor;
-	}
-
+	return false;
 }
+
+void Renderer::addQuery(vk::Query *query)
+{
+	ASSERT(query->getType() == VK_QUERY_TYPE_OCCLUSION);
+	ASSERT(!occlusionQuery);
+
+	occlusionQuery = query;
+}
+
+void Renderer::removeQuery(vk::Query *query)
+{
+	ASSERT(query->getType() == VK_QUERY_TYPE_OCCLUSION);
+	ASSERT(occlusionQuery == query);
+
+	occlusionQuery = nullptr;
+}
+
+void Renderer::advanceInstanceAttributes(Stream* inputs)
+{
+	for(uint32_t i = 0; i < vk::MAX_VERTEX_INPUT_BINDINGS; i++)
+	{
+		auto &attrib = inputs[i];
+		if (attrib.count && attrib.instanceStride && (attrib.instanceStride < attrib.robustnessSize))
+		{
+			// Under the casts: attrib.buffer += attrib.instanceStride
+			attrib.buffer = (void const *)((uintptr_t)attrib.buffer + attrib.instanceStride);
+			attrib.robustnessSize -= attrib.instanceStride;
+		}
+	}
+}
+
+void Renderer::setViewport(const VkViewport &viewport)
+{
+	this->viewport = viewport;
+}
+
+void Renderer::setScissor(const VkRect2D &scissor)
+{
+	this->scissor = scissor;
+}
+
+}  // namespace sw
diff --git a/src/Device/Renderer.hpp b/src/Device/Renderer.hpp
index ac38616..1598c11 100644
--- a/src/Device/Renderer.hpp
+++ b/src/Device/Renderer.hpp
@@ -33,213 +33,214 @@
 #include <mutex>
 #include <thread>
 
-namespace vk
+namespace vk {
+
+class DescriptorSet;
+class Device;
+class Query;
+
+}  // namespace vk
+
+namespace sw {
+
+struct DrawCall;
+class PixelShader;
+class VertexShader;
+struct Task;
+class TaskEvents;
+class Resource;
+struct Constants;
+
+static constexpr int MaxBatchSize = 128;
+static constexpr int MaxBatchCount = 16;
+static constexpr int MaxClusterCount = 16;
+static constexpr int MaxDrawCount = 16;
+
+using TriangleBatch = std::array<Triangle, MaxBatchSize>;
+using PrimitiveBatch = std::array<Primitive, MaxBatchSize>;
+
+struct DrawData
 {
-	class DescriptorSet;
-	class Device;
-	class Query;
-}
+	const Constants *constants;
 
-namespace sw
+	vk::DescriptorSet::Bindings descriptorSets = {};
+	vk::DescriptorSet::DynamicOffsets descriptorDynamicOffsets = {};
+
+	const void *input[MAX_INTERFACE_COMPONENTS / 4];
+	unsigned int robustnessSize[MAX_INTERFACE_COMPONENTS / 4];
+	unsigned int stride[MAX_INTERFACE_COMPONENTS / 4];
+	const void *indices;
+
+	int instanceID;
+	int baseVertex;
+	float lineWidth;
+	int viewID;
+
+	PixelProcessor::Stencil stencil[2];   // clockwise, counterclockwise
+	PixelProcessor::Factor factor;
+	unsigned int occlusion[MaxClusterCount];   // Number of pixels passing depth test
+
+	float4 WxF;
+	float4 HxF;
+	float4 X0xF;
+	float4 Y0xF;
+	float4 halfPixelX;
+	float4 halfPixelY;
+	float viewportHeight;
+	float slopeDepthBias;
+	float depthRange;
+	float depthNear;
+
+	unsigned int *colorBuffer[RENDERTARGETS];
+	int colorPitchB[RENDERTARGETS];
+	int colorSliceB[RENDERTARGETS];
+	float *depthBuffer;
+	int depthPitchB;
+	int depthSliceB;
+	unsigned char *stencilBuffer;
+	int stencilPitchB;
+	int stencilSliceB;
+
+	int scissorX0;
+	int scissorX1;
+	int scissorY0;
+	int scissorY1;
+
+	float4 a2c0;
+	float4 a2c1;
+	float4 a2c2;
+	float4 a2c3;
+
+	PushConstantStorage pushConstants;
+};
+
+struct DrawCall
 {
-	struct DrawCall;
-	class PixelShader;
-	class VertexShader;
-	struct Task;
-	class TaskEvents;
-	class Resource;
-	struct Constants;
-
-	static constexpr int MaxBatchSize = 128;
-	static constexpr int MaxBatchCount = 16;
-	static constexpr int MaxClusterCount = 16;
-	static constexpr int MaxDrawCount = 16;
-
-	using TriangleBatch = std::array<Triangle, MaxBatchSize>;
-	using PrimitiveBatch = std::array<Primitive, MaxBatchSize>;
-
-	struct DrawData
+	struct BatchData
 	{
-		const Constants *constants;
+		using Pool = marl::BoundedPool<BatchData, MaxBatchCount, marl::PoolPolicy::Preserve>;
 
-		vk::DescriptorSet::Bindings descriptorSets = {};
-		vk::DescriptorSet::DynamicOffsets descriptorDynamicOffsets = {};
-
-		const void *input[MAX_INTERFACE_COMPONENTS / 4];
-		unsigned int robustnessSize[MAX_INTERFACE_COMPONENTS / 4];
-		unsigned int stride[MAX_INTERFACE_COMPONENTS / 4];
-		const void *indices;
-
-		int instanceID;
-		int baseVertex;
-		float lineWidth;
-		int viewID;
-
-		PixelProcessor::Stencil stencil[2];   // clockwise, counterclockwise
-		PixelProcessor::Factor factor;
-		unsigned int occlusion[MaxClusterCount];   // Number of pixels passing depth test
-
-		float4 WxF;
-		float4 HxF;
-		float4 X0xF;
-		float4 Y0xF;
-		float4 halfPixelX;
-		float4 halfPixelY;
-		float viewportHeight;
-		float slopeDepthBias;
-		float depthRange;
-		float depthNear;
-
-		unsigned int *colorBuffer[RENDERTARGETS];
-		int colorPitchB[RENDERTARGETS];
-		int colorSliceB[RENDERTARGETS];
-		float *depthBuffer;
-		int depthPitchB;
-		int depthSliceB;
-		unsigned char *stencilBuffer;
-		int stencilPitchB;
-		int stencilSliceB;
-
-		int scissorX0;
-		int scissorX1;
-		int scissorY0;
-		int scissorY1;
-
-		float4 a2c0;
-		float4 a2c1;
-		float4 a2c2;
-		float4 a2c3;
-
-		PushConstantStorage pushConstants;
-	};
-
-	struct DrawCall
-	{
-		struct BatchData
-		{
-			using Pool = marl::BoundedPool<BatchData, MaxBatchCount, marl::PoolPolicy::Preserve>;
-
-			TriangleBatch triangles;
-			PrimitiveBatch primitives;
-			VertexTask vertexTask;
-			unsigned int id;
-			unsigned int firstPrimitive;
-			unsigned int numPrimitives;
-			int numVisible;
-			marl::Ticket clusterTickets[MaxClusterCount];
-		};
-
-		using Pool = marl::BoundedPool<DrawCall, MaxDrawCount, marl::PoolPolicy::Preserve>;
-		using SetupFunction = int(*)(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
-
-		DrawCall();
-		~DrawCall();
-
-		static void run(const marl::Loan<DrawCall>& draw, marl::Ticket::Queue* tickets, marl::Ticket::Queue clusterQueues[MaxClusterCount]);
-		static void processVertices(DrawCall* draw, BatchData* batch);
-		static void processPrimitives(DrawCall* draw, BatchData* batch);
-		static void processPixels(const marl::Loan<DrawCall>& draw, const marl::Loan<BatchData>& batch, const std::shared_ptr<marl::Finally>& finally);
-		void setup();
-		void teardown();
-
-		int id;
-
-		BatchData::Pool *batchDataPool;
+		TriangleBatch triangles;
+		PrimitiveBatch primitives;
+		VertexTask vertexTask;
+		unsigned int id;
+		unsigned int firstPrimitive;
 		unsigned int numPrimitives;
-		unsigned int numPrimitivesPerBatch;
-		unsigned int numBatches;
-
-		VkPrimitiveTopology topology;
-		VkProvokingVertexModeEXT provokingVertexMode;
-		VkIndexType indexType;
-		VkLineRasterizationModeEXT lineRasterizationMode;
-
-		VertexProcessor::RoutineType vertexRoutine;
-		SetupProcessor::RoutineType setupRoutine;
-		PixelProcessor::RoutineType pixelRoutine;
-
-		SetupFunction setupPrimitives;
-		SetupProcessor::State setupState;
-
-		vk::ImageView *renderTarget[RENDERTARGETS];
-		vk::ImageView *depthBuffer;
-		vk::ImageView *stencilBuffer;
-		TaskEvents *events;
-
-		vk::Query* occlusionQuery;
-
-		DrawData *data;
-
-		static void processPrimitiveVertices(
-				unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
-				const void *primitiveIndices,
-				VkIndexType indexType,
-				unsigned int start,
-				unsigned int triangleCount,
-				VkPrimitiveTopology topology,
-				VkProvokingVertexModeEXT provokingVertexMode);
-
-		static int setupSolidTriangles(Triangle* triangles, Primitive* primitives, const DrawCall* drawCall, int count);
-		static int setupWireframeTriangles(Triangle* triangles, Primitive* primitives, const DrawCall* drawCall, int count);
-		static int setupPointTriangles(Triangle* triangles, Primitive* primitives, const DrawCall* drawCall, int count);
-		static int setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
-		static int setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
-
-		static bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
-		static bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
+		int numVisible;
+		marl::Ticket clusterTickets[MaxClusterCount];
 	};
 
-	class alignas(16) Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor
-	{
-	public:
-		Renderer(vk::Device* device);
+	using Pool = marl::BoundedPool<DrawCall, MaxDrawCount, marl::PoolPolicy::Preserve>;
+	using SetupFunction = int(*)(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
 
-		virtual ~Renderer();
+	DrawCall();
+	~DrawCall();
 
-		void* operator new(size_t size);
-		void operator delete(void* mem);
+	static void run(const marl::Loan<DrawCall>& draw, marl::Ticket::Queue* tickets, marl::Ticket::Queue clusterQueues[MaxClusterCount]);
+	static void processVertices(DrawCall* draw, BatchData* batch);
+	static void processPrimitives(DrawCall* draw, BatchData* batch);
+	static void processPixels(const marl::Loan<DrawCall>& draw, const marl::Loan<BatchData>& batch, const std::shared_ptr<marl::Finally>& finally);
+	void setup();
+	void teardown();
 
-		bool hasOcclusionQuery() const { return occlusionQuery != nullptr; }
+	int id;
 
-		void draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
-				TaskEvents *events, int instanceID, int viewID, void *indexBuffer, const VkExtent3D& framebufferExtent,
-				PushConstantStorage const & pushConstants, bool update = true);
+	BatchData::Pool *batchDataPool;
+	unsigned int numPrimitives;
+	unsigned int numPrimitivesPerBatch;
+	unsigned int numBatches;
 
-		// Viewport & Clipper
-		void setViewport(const VkViewport &viewport);
-		void setScissor(const VkRect2D &scissor);
+	VkPrimitiveTopology topology;
+	VkProvokingVertexModeEXT provokingVertexMode;
+	VkIndexType indexType;
+	VkLineRasterizationModeEXT lineRasterizationMode;
 
-		void addQuery(vk::Query *query);
-		void removeQuery(vk::Query *query);
+	VertexProcessor::RoutineType vertexRoutine;
+	SetupProcessor::RoutineType setupRoutine;
+	PixelProcessor::RoutineType pixelRoutine;
 
-		void advanceInstanceAttributes(Stream* inputs);
+	SetupFunction setupPrimitives;
+	SetupProcessor::State setupState;
 
-		void synchronize();
+	vk::ImageView *renderTarget[RENDERTARGETS];
+	vk::ImageView *depthBuffer;
+	vk::ImageView *stencilBuffer;
+	TaskEvents *events;
 
-	private:
-		VkViewport viewport;
-		VkRect2D scissor;
+	vk::Query* occlusionQuery;
 
-		DrawCall::Pool drawCallPool;
-		DrawCall::BatchData::Pool batchDataPool;
+	DrawData *data;
 
-		std::atomic<int> nextDrawID = {0};
+	static void processPrimitiveVertices(
+			unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
+			const void *primitiveIndices,
+			VkIndexType indexType,
+			unsigned int start,
+			unsigned int triangleCount,
+			VkPrimitiveTopology topology,
+			VkProvokingVertexModeEXT provokingVertexMode);
 
-		vk::Query *occlusionQuery = nullptr;
-		marl::Ticket::Queue drawTickets;
-		marl::Ticket::Queue clusterQueues[MaxClusterCount];
+	static int setupSolidTriangles(Triangle* triangles, Primitive* primitives, const DrawCall* drawCall, int count);
+	static int setupWireframeTriangles(Triangle* triangles, Primitive* primitives, const DrawCall* drawCall, int count);
+	static int setupPointTriangles(Triangle* triangles, Primitive* primitives, const DrawCall* drawCall, int count);
+	static int setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
+	static int setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
 
-		VertexProcessor::State vertexState;
-		SetupProcessor::State setupState;
-		PixelProcessor::State pixelState;
+	static bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
+	static bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
+};
 
-		VertexProcessor::RoutineType vertexRoutine;
-		SetupProcessor::RoutineType setupRoutine;
-		PixelProcessor::RoutineType pixelRoutine;
+class alignas(16) Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor
+{
+public:
+	Renderer(vk::Device* device);
 
-		vk::Device* device;
-	};
+	virtual ~Renderer();
 
-}
+	void* operator new(size_t size);
+	void operator delete(void* mem);
+
+	bool hasOcclusionQuery() const { return occlusionQuery != nullptr; }
+
+	void draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
+			TaskEvents *events, int instanceID, int viewID, void *indexBuffer, const VkExtent3D& framebufferExtent,
+			PushConstantStorage const & pushConstants, bool update = true);
+
+	// Viewport & Clipper
+	void setViewport(const VkViewport &viewport);
+	void setScissor(const VkRect2D &scissor);
+
+	void addQuery(vk::Query *query);
+	void removeQuery(vk::Query *query);
+
+	void advanceInstanceAttributes(Stream* inputs);
+
+	void synchronize();
+
+private:
+	VkViewport viewport;
+	VkRect2D scissor;
+
+	DrawCall::Pool drawCallPool;
+	DrawCall::BatchData::Pool batchDataPool;
+
+	std::atomic<int> nextDrawID = {0};
+
+	vk::Query *occlusionQuery = nullptr;
+	marl::Ticket::Queue drawTickets;
+	marl::Ticket::Queue clusterQueues[MaxClusterCount];
+
+	VertexProcessor::State vertexState;
+	SetupProcessor::State setupState;
+	PixelProcessor::State pixelState;
+
+	VertexProcessor::RoutineType vertexRoutine;
+	SetupProcessor::RoutineType setupRoutine;
+	PixelProcessor::RoutineType pixelRoutine;
+
+	vk::Device* device;
+};
+
+}  // namespace sw
 
 #endif   // sw_Renderer_hpp
diff --git a/src/Device/RoutineCache.hpp b/src/Device/RoutineCache.hpp
index b015c3b..9bde0d5 100644
--- a/src/Device/RoutineCache.hpp
+++ b/src/Device/RoutineCache.hpp
@@ -19,15 +19,16 @@
 
 #include "Reactor/Reactor.hpp"
 
-namespace sw
-{
-	using namespace rr;
+namespace sw {
 
-	template<class State>
-	using RoutineCache = LRUCache<State, std::shared_ptr<Routine>>;
+using namespace rr;
 
-	template<class State, class FunctionType>
-	using RoutineCacheT = LRUCache<State, RoutineT<FunctionType>>;
+template<class State>
+using RoutineCache = LRUCache<State, std::shared_ptr<Routine>>;
+
+template<class State, class FunctionType>
+using RoutineCacheT = LRUCache<State, RoutineT<FunctionType>>;
+
-}
+}  // namespace sw
 
 #endif   // sw_RoutineCache_hpp
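
Note: RoutineCache keys generated routines by a processor State and evicts
old entries once full. A minimal sketch of the query/add pattern the
processors below use, with a toy cache standing in for the real LRUCache
(ToyCache, its FIFO eviction, and the simplified interface are illustrative
assumptions, not SwiftShader's implementation):

	#include <cstddef>
	#include <list>
	#include <unordered_map>

	// Toy stand-in for LRUCache<State, Routine>: query() returns an empty
	// value on a miss; add() inserts and drops the oldest entry when full.
	// Eviction is FIFO here, a simplification of true LRU.
	template<class Key, class Value>
	class ToyCache
	{
	public:
		explicit ToyCache(size_t capacity) : capacity(capacity) {}

		Value query(const Key &key) const
		{
			auto it = map.find(key);
			return (it != map.end()) ? it->second : Value{};
		}

		void add(const Key &key, const Value &value)
		{
			order.push_back(key);
			map[key] = value;

			if(map.size() > capacity)  // Over capacity: evict the oldest entry
			{
				map.erase(order.front());
				order.pop_front();
			}
		}

	private:
		size_t capacity;
		std::list<Key> order;
		std::unordered_map<Key, Value> map;
	};

	// Usage, mirroring SetupProcessor::routine() further down:
	//   auto routine = cache.query(state);
	//   if(!routine) { routine = generate(state); cache.add(state, routine); }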
diff --git a/src/Device/Sampler.hpp b/src/Device/Sampler.hpp
index 0c0bd9d..a836ce2 100644
--- a/src/Device/Sampler.hpp
+++ b/src/Device/Sampler.hpp
@@ -20,100 +20,98 @@
 #include "System/Types.hpp"
 #include "Vulkan/VkFormat.h"
 
-namespace vk
+namespace vk { class Image; }
+
+namespace sw {
+
+struct Mipmap
 {
-	class Image;
-}
+	const void *buffer;
 
-namespace sw
+	short4 uHalf;
+	short4 vHalf;
+	short4 wHalf;
+	int4 width;
+	int4 height;
+	int4 depth;
+	short4 onePitchP;
+	int4 pitchP;
+	int4 sliceP;
+	int4 samplePitchP;
+	int4 sampleMax;
+};
+
+struct Texture
 {
-	struct Mipmap
-	{
-		const void *buffer;
+	Mipmap mipmap[MIPMAP_LEVELS];
 
-		short4 uHalf;
-		short4 vHalf;
-		short4 wHalf;
-		int4 width;
-		int4 height;
-		int4 depth;
-		short4 onePitchP;
-		int4 pitchP;
-		int4 sliceP;
-		int4 samplePitchP;
-		int4 sampleMax;
-	};
+	float4 widthWidthHeightHeight;
+	float4 width;
+	float4 height;
+	float4 depth;
+};
 
-	struct Texture
-	{
-		Mipmap mipmap[MIPMAP_LEVELS];
+enum FilterType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+{
+	FILTER_POINT,
+	FILTER_GATHER,
+	FILTER_MIN_POINT_MAG_LINEAR,
+	FILTER_MIN_LINEAR_MAG_POINT,
+	FILTER_LINEAR,
+	FILTER_ANISOTROPIC,
 
-		float4 widthWidthHeightHeight;
-		float4 width;
-		float4 height;
-		float4 depth;
-	};
+	FILTER_LAST = FILTER_ANISOTROPIC
+};
 
-	enum FilterType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
-	{
-		FILTER_POINT,
-		FILTER_GATHER,
-		FILTER_MIN_POINT_MAG_LINEAR,
-		FILTER_MIN_LINEAR_MAG_POINT,
-		FILTER_LINEAR,
-		FILTER_ANISOTROPIC,
+enum MipmapType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+{
+	MIPMAP_NONE,
+	MIPMAP_POINT,
+	MIPMAP_LINEAR,
 
-		FILTER_LAST = FILTER_ANISOTROPIC
-	};
+	MIPMAP_LAST = MIPMAP_LINEAR
+};
 
-	enum MipmapType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
-	{
-		MIPMAP_NONE,
-		MIPMAP_POINT,
-		MIPMAP_LINEAR,
+enum AddressingMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+{
+	ADDRESSING_UNUSED,
+	ADDRESSING_WRAP,
+	ADDRESSING_CLAMP,
+	ADDRESSING_MIRROR,
+	ADDRESSING_MIRRORONCE,
+	ADDRESSING_BORDER,     // Single color
+	ADDRESSING_SEAMLESS,   // Border of pixels
+	ADDRESSING_CUBEFACE,   // Cube face layer
+	ADDRESSING_LAYER,      // Array layer
+	ADDRESSING_TEXELFETCH,
 
-		MIPMAP_LAST = MIPMAP_LINEAR
-	};
+	ADDRESSING_LAST = ADDRESSING_TEXELFETCH
+};
 
-	enum AddressingMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
-	{
-		ADDRESSING_UNUSED,
-		ADDRESSING_WRAP,
-		ADDRESSING_CLAMP,
-		ADDRESSING_MIRROR,
-		ADDRESSING_MIRRORONCE,
-		ADDRESSING_BORDER,     // Single color
-		ADDRESSING_SEAMLESS,   // Border of pixels
-		ADDRESSING_CUBEFACE,   // Cube face layer
-		ADDRESSING_LAYER,      // Array layer
-		ADDRESSING_TEXELFETCH,
+struct Sampler
+{
+	VkImageViewType textureType;
+	vk::Format textureFormat;
+	FilterType textureFilter;
+	AddressingMode addressingModeU;
+	AddressingMode addressingModeV;
+	AddressingMode addressingModeW;
+	AddressingMode addressingModeY;
+	MipmapType mipmapFilter;
+	VkComponentMapping swizzle;
+	int gatherComponent;
+	bool highPrecisionFiltering;
+	bool compareEnable;
+	VkCompareOp compareOp;
+	VkBorderColor border;
+	bool unnormalizedCoordinates;
+	bool largeTexture;
 
-		ADDRESSING_LAST = ADDRESSING_TEXELFETCH
-	};
+	VkSamplerYcbcrModelConversion ycbcrModel;
+	bool studioSwing;    // Narrow range
+	bool swappedChroma;  // Cb/Cr components in reverse order
+};
 
-	struct Sampler
-	{
-		VkImageViewType textureType;
-		vk::Format textureFormat;
-		FilterType textureFilter;
-		AddressingMode addressingModeU;
-		AddressingMode addressingModeV;
-		AddressingMode addressingModeW;
-		AddressingMode addressingModeY;
-		MipmapType mipmapFilter;
-		VkComponentMapping swizzle;
-		int gatherComponent;
-		bool highPrecisionFiltering;
-		bool compareEnable;
-		VkCompareOp compareOp;
-		VkBorderColor border;
-		bool unnormalizedCoordinates;
-		bool largeTexture;
-
-		VkSamplerYcbcrModelConversion ycbcrModel;
-		bool studioSwing;    // Narrow range
-		bool swappedChroma;  // Cb/Cr components in reverse order
-	};
-}
+}  // namespace sw
 
 #endif   // sw_Sampler_hpp
diff --git a/src/Device/SetupProcessor.cpp b/src/Device/SetupProcessor.cpp
index 69371ab..df55e1a 100644
--- a/src/Device/SetupProcessor.cpp
+++ b/src/Device/SetupProcessor.cpp
@@ -25,98 +25,99 @@
 
 #include <cstring>
 
-namespace sw
+namespace sw {
+
+uint32_t SetupProcessor::States::computeHash()
 {
-	uint32_t SetupProcessor::States::computeHash()
+	uint32_t *state = reinterpret_cast<uint32_t*>(this);
+	uint32_t hash = 0;
+
+	for(unsigned int i = 0; i < sizeof(States) / sizeof(uint32_t); i++)
 	{
-		uint32_t *state = reinterpret_cast<uint32_t*>(this);
-		uint32_t hash = 0;
-
-		for(unsigned int i = 0; i < sizeof(States) / sizeof(uint32_t); i++)
-		{
-			hash ^= state[i];
-		}
-
-		return hash;
+		hash ^= state[i];
 	}
 
-	bool SetupProcessor::State::operator==(const State &state) const
-	{
-		if(hash != state.hash)
-		{
-			return false;
-		}
-
-		static_assert(is_memcmparable<State>::value, "Cannot memcmp States");
-		return memcmp(static_cast<const States*>(this), static_cast<const States*>(&state), sizeof(States)) == 0;
-	}
-
-	SetupProcessor::SetupProcessor()
-	{
-		routineCache = nullptr;
-		setRoutineCacheSize(1024);
-	}
-
-	SetupProcessor::~SetupProcessor()
-	{
-		delete routineCache;
-		routineCache = nullptr;
-	}
-
-	SetupProcessor::State SetupProcessor::update(const sw::Context* context) const
-	{
-		State state;
-
-		bool vPosZW = (context->pixelShader && context->pixelShader->hasBuiltinInput(spv::BuiltInFragCoord));
-
-		state.isDrawPoint = context->isDrawPoint(true);
-		state.isDrawLine = context->isDrawLine(true);
-		state.isDrawTriangle = context->isDrawTriangle(true);
-		state.applySlopeDepthBias = context->isDrawTriangle(false) && (context->slopeDepthBias != 0.0f);
-		state.interpolateZ = context->depthBufferActive() || vPosZW;
-		state.interpolateW = context->pixelShader != nullptr;
-		state.frontFace = context->frontFace;
-		state.cullMode = context->cullMode;
-
-		state.multiSample = context->sampleCount;
-		state.rasterizerDiscard = context->rasterizerDiscard;
-
-		state.numClipDistances = context->vertexShader->getNumOutputClipDistances();
-		state.numCullDistances = context->vertexShader->getNumOutputCullDistances();
-
-		if (context->pixelShader)
-		{
-			for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
-			{
-				state.gradient[interpolant] = context->pixelShader->inputs[interpolant];
-			}
-		}
-
-		state.hash = state.computeHash();
-
-		return state;
-	}
-
-	SetupProcessor::RoutineType SetupProcessor::routine(const State &state)
-	{
-		auto routine = routineCache->query(state);
-
-		if(!routine)
-		{
-			SetupRoutine *generator = new SetupRoutine(state);
-			generator->generate();
-			routine = generator->getRoutine();
-			delete generator;
-
-			routineCache->add(state, routine);
-		}
-
-		return routine;
-	}
-
-	void SetupProcessor::setRoutineCacheSize(int cacheSize)
-	{
-		delete routineCache;
-		routineCache = new RoutineCacheType(clamp(cacheSize, 1, 65536));
-	}
+	return hash;
 }
+
+bool SetupProcessor::State::operator==(const State &state) const
+{
+	if(hash != state.hash)
+	{
+		return false;
+	}
+
+	static_assert(is_memcmparable<State>::value, "Cannot memcmp States");
+	return memcmp(static_cast<const States*>(this), static_cast<const States*>(&state), sizeof(States)) == 0;
+}
+
+SetupProcessor::SetupProcessor()
+{
+	routineCache = nullptr;
+	setRoutineCacheSize(1024);
+}
+
+SetupProcessor::~SetupProcessor()
+{
+	delete routineCache;
+	routineCache = nullptr;
+}
+
+SetupProcessor::State SetupProcessor::update(const sw::Context* context) const
+{
+	State state;
+
+	bool vPosZW = (context->pixelShader && context->pixelShader->hasBuiltinInput(spv::BuiltInFragCoord));
+
+	state.isDrawPoint = context->isDrawPoint(true);
+	state.isDrawLine = context->isDrawLine(true);
+	state.isDrawTriangle = context->isDrawTriangle(true);
+	state.applySlopeDepthBias = context->isDrawTriangle(false) && (context->slopeDepthBias != 0.0f);
+	state.interpolateZ = context->depthBufferActive() || vPosZW;
+	state.interpolateW = context->pixelShader != nullptr;
+	state.frontFace = context->frontFace;
+	state.cullMode = context->cullMode;
+
+	state.multiSample = context->sampleCount;
+	state.rasterizerDiscard = context->rasterizerDiscard;
+
+	state.numClipDistances = context->vertexShader->getNumOutputClipDistances();
+	state.numCullDistances = context->vertexShader->getNumOutputCullDistances();
+
+	if (context->pixelShader)
+	{
+		for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
+		{
+			state.gradient[interpolant] = context->pixelShader->inputs[interpolant];
+		}
+	}
+
+	state.hash = state.computeHash();
+
+	return state;
+}
+
+SetupProcessor::RoutineType SetupProcessor::routine(const State &state)
+{
+	auto routine = routineCache->query(state);
+
+	if(!routine)
+	{
+		SetupRoutine *generator = new SetupRoutine(state);
+		generator->generate();
+		routine = generator->getRoutine();
+		delete generator;
+
+		routineCache->add(state, routine);
+	}
+
+	return routine;
+}
+
+void SetupProcessor::setRoutineCacheSize(int cacheSize)
+{
+	delete routineCache;
+	routineCache = new RoutineCacheType(clamp(cacheSize, 1, 65536));
+}
+
+}  // namespace sw
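
Note: computeHash() above XOR-folds the state one 32-bit word at a time, and
operator== uses that hash as a cheap early-out before the byte-wise memcmp.
A self-contained sketch of the same hash-then-compare technique (ToyStates
and ToyState are illustrative; the real code guarantees a padding-free
comparison by zero-filling the whole struct through its Memset<> base):

	#include <cstddef>
	#include <cstdint>
	#include <cstring>

	// ToyStates has no padding bytes, so memcmp equality is sound.
	struct ToyStates
	{
		uint32_t mode = 0;
		uint32_t flags = 0;

		uint32_t computeHash() const
		{
			const uint32_t *words = reinterpret_cast<const uint32_t*>(this);
			uint32_t hash = 0;

			for(size_t i = 0; i < sizeof(ToyStates) / sizeof(uint32_t); i++)
			{
				hash ^= words[i];  // XOR-fold every 32-bit word of the state
			}

			return hash;
		}
	};

	struct ToyState : ToyStates
	{
		uint32_t hash = 0;

		bool operator==(const ToyState &other) const
		{
			if(hash != other.hash)
			{
				return false;  // Different hashes: the states cannot be equal
			}

			// Hashes match (or collide); fall back to the full comparison.
			return memcmp(static_cast<const ToyStates*>(this),
			              static_cast<const ToyStates*>(&other),
			              sizeof(ToyStates)) == 0;
		}
	};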
diff --git a/src/Device/SetupProcessor.hpp b/src/Device/SetupProcessor.hpp
index 683c93c..4b6d5a8 100644
--- a/src/Device/SetupProcessor.hpp
+++ b/src/Device/SetupProcessor.hpp
@@ -21,65 +21,66 @@
 #include "RoutineCache.hpp"
 #include "System/Types.hpp"
 
-namespace sw
+namespace sw {
+
+struct Primitive;
+struct Triangle;
+struct Polygon;
+struct Vertex;
+struct DrawCall;
+struct DrawData;
+
+using SetupFunction = FunctionT<int(Primitive* primitive, const Triangle* triangle, const Polygon* polygon, const DrawData* draw)>;
+
+class SetupProcessor
 {
-	struct Primitive;
-	struct Triangle;
-	struct Polygon;
-	struct Vertex;
-	struct DrawCall;
-	struct DrawData;
-
-	using SetupFunction = FunctionT<int(Primitive* primitive, const Triangle* triangle, const Polygon* polygon, const DrawData* draw)>;
-
-	class SetupProcessor
+public:
+	struct States : Memset<States>
 	{
-	public:
-		struct States : Memset<States>
-		{
-			States() : Memset(this, 0) {}
+		States() : Memset(this, 0) {}
 
-			uint32_t computeHash();
+		uint32_t computeHash();
 
-			bool isDrawPoint               : 1;
-			bool isDrawLine                : 1;
-			bool isDrawTriangle            : 1;
-			bool applySlopeDepthBias       : 1;
-			bool interpolateZ              : 1;
-			bool interpolateW              : 1;
-			VkFrontFace frontFace          : BITS(VK_FRONT_FACE_MAX_ENUM);
-			VkCullModeFlags cullMode       : BITS(VK_CULL_MODE_FLAG_BITS_MAX_ENUM);
-			unsigned int multiSample       : 3;   // 1, 2 or 4
-			bool rasterizerDiscard         : 1;
-			unsigned int numClipDistances  : 4; // [0 - 8]
-			unsigned int numCullDistances  : 4; // [0 - 8]
+		bool isDrawPoint               : 1;
+		bool isDrawLine                : 1;
+		bool isDrawTriangle            : 1;
+		bool applySlopeDepthBias       : 1;
+		bool interpolateZ              : 1;
+		bool interpolateW              : 1;
+		VkFrontFace frontFace          : BITS(VK_FRONT_FACE_MAX_ENUM);
+		VkCullModeFlags cullMode       : BITS(VK_CULL_MODE_FLAG_BITS_MAX_ENUM);
+		unsigned int multiSample       : 3;   // 1, 2 or 4
+		bool rasterizerDiscard         : 1;
+		unsigned int numClipDistances  : 4; // [0 - 8]
+		unsigned int numCullDistances  : 4; // [0 - 8]
 
-			SpirvShader::InterfaceComponent gradient[MAX_INTERFACE_COMPONENTS];
-		};
-
-		struct State : States
-		{
-			bool operator==(const State &states) const;
-
-			uint32_t hash;
-		};
-
-		using RoutineType = SetupFunction::RoutineType;
-
-		SetupProcessor();
-
-		~SetupProcessor();
-
-	protected:
-		State update(const sw::Context* context) const;
-		RoutineType routine(const State &state);
-
-		void setRoutineCacheSize(int cacheSize);
-
-	private:
-		using RoutineCacheType = RoutineCacheT<State, SetupFunction::CFunctionType>;
-		RoutineCacheType *routineCache;
+		SpirvShader::InterfaceComponent gradient[MAX_INTERFACE_COMPONENTS];
 	};
-}
+
+	struct State : States
+	{
+		bool operator==(const State &states) const;
+
+		uint32_t hash;
+	};
+
+	using RoutineType = SetupFunction::RoutineType;
+
+	SetupProcessor();
+
+	~SetupProcessor();
+
+protected:
+	State update(const sw::Context* context) const;
+	RoutineType routine(const State &state);
+
+	void setRoutineCacheSize(int cacheSize);
+
+private:
+	using RoutineCacheType = RoutineCacheT<State, SetupFunction::CFunctionType>;
+	RoutineCacheType *routineCache;
+};
+
+}  // namespace sw
 
 #endif   // sw_SetupProcessor_hpp
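
Note: the States bitfields above are sized with BITS(...) so enum-typed
fields occupy only as many bits as their largest value requires, keeping the
struct compact for the word-wise hash. Assuming BITS(x) evaluates to the bit
count needed to represent values up to x (bitsRequired below is an
illustrative stand-in, not the real macro):

	#include <cstdint>

	// Illustrative stand-in for BITS(): bits needed for values in [0, x].
	constexpr uint32_t bitsRequired(uint64_t x)
	{
		uint32_t n = 0;

		while(x > 0)
		{
			n++;
			x >>= 1;
		}

		return n;
	}

	static_assert(bitsRequired(1) == 1, "a 0/1 flag needs one bit");
	static_assert(bitsRequired(4) == 3, "values 0..4 need three bits");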
diff --git a/src/Device/Stream.hpp b/src/Device/Stream.hpp
index b6bb56c..f83d97a 100644
--- a/src/Device/Stream.hpp
+++ b/src/Device/Stream.hpp
@@ -17,37 +17,38 @@
 
 #include "System/Types.hpp"
 
-namespace sw
+namespace sw {
+
+enum StreamType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
 {
-	enum StreamType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
-	{
-		STREAMTYPE_COLOR,     // 4 normalized unsigned bytes, ZYXW order
-		STREAMTYPE_FLOAT,     // Normalization ignored
-		STREAMTYPE_BYTE,
-		STREAMTYPE_SBYTE,
-		STREAMTYPE_SHORT,
-		STREAMTYPE_USHORT,
-		STREAMTYPE_INT,
-		STREAMTYPE_UINT,
-		STREAMTYPE_HALF,      // Normalization ignored
-		STREAMTYPE_2_10_10_10_INT,
-		STREAMTYPE_2_10_10_10_UINT,
+	STREAMTYPE_COLOR,     // 4 normalized unsigned bytes, ZYXW order
+	STREAMTYPE_FLOAT,     // Normalization ignored
+	STREAMTYPE_BYTE,
+	STREAMTYPE_SBYTE,
+	STREAMTYPE_SHORT,
+	STREAMTYPE_USHORT,
+	STREAMTYPE_INT,
+	STREAMTYPE_UINT,
+	STREAMTYPE_HALF,      // Normalization ignored
+	STREAMTYPE_2_10_10_10_INT,
+	STREAMTYPE_2_10_10_10_UINT,
 
-		STREAMTYPE_LAST = STREAMTYPE_2_10_10_10_UINT
-	};
+	STREAMTYPE_LAST = STREAMTYPE_2_10_10_10_UINT
+};
 
-	struct Stream
-	{
-		const void *buffer = nullptr;
-		unsigned int robustnessSize = 0;
-		unsigned int vertexStride = 0;
-		unsigned int instanceStride = 0;
-		StreamType type = STREAMTYPE_FLOAT;
-		unsigned char count = 0;
-		bool normalized = false;
-		unsigned int offset = 0;
-		unsigned int binding = 0;
-	};
-}
+struct Stream
+{
+	const void *buffer = nullptr;
+	unsigned int robustnessSize = 0;
+	unsigned int vertexStride = 0;
+	unsigned int instanceStride = 0;
+	StreamType type = STREAMTYPE_FLOAT;
+	unsigned char count = 0;
+	bool normalized = false;
+	unsigned int offset = 0;
+	unsigned int binding = 0;
+};
+
+}  // namespace sw
 
 #endif   // sw_Stream_hpp
diff --git a/src/Device/Triangle.hpp b/src/Device/Triangle.hpp
index 8a91fab..7cb4055 100644
--- a/src/Device/Triangle.hpp
+++ b/src/Device/Triangle.hpp
@@ -17,14 +17,15 @@
 
 #include "Vertex.hpp"
 
-namespace sw
+namespace sw {
+
+struct Triangle
 {
-	struct Triangle
-	{
-		Vertex V0;
-		Vertex V1;
-		Vertex V2;
-	};
-}
+	Vertex V0;
+	Vertex V1;
+	Vertex V2;
+};
+
+}  // namespace sw
 
 #endif   // sw_Triangle_hpp
diff --git a/src/Device/Vector.cpp b/src/Device/Vector.cpp
index b58f15e..511b51f 100644
--- a/src/Device/Vector.cpp
+++ b/src/Device/Vector.cpp
@@ -17,159 +17,160 @@
 #include "Matrix.hpp"
 #include "System/Math.hpp"
 
-namespace sw
+namespace sw {
+
+Vector Vector::operator+() const
 {
-	Vector Vector::operator+() const
-	{
-		return *this;
-	}
-
-	Vector Vector::operator-() const
-	{
-		return Vector(-x, -y, -z);
-	}
-
-	Vector &Vector::operator+=(const Vector &v)
-	{
-		x += v.x;
-		y += v.y;
-		z += v.z;
-
-		return *this;
-	}
-
-	Vector &Vector::operator-=(const Vector &v)
-	{
-		x -= v.x;
-		y -= v.y;
-		z -= v.z;
-
-		return *this;
-	}
-
-	Vector &Vector::operator*=(float s)
-	{
-		x *= s;
-		y *= s;
-		z *= s;
-
-		return *this;
-	}
-
-	Vector &Vector::operator/=(float s)
-	{
-		float r = 1.0f / s;
-
-		return *this *= r;
-	}
-
-	bool operator==(const Vector &U, const Vector &v)
-	{
-		if(U.x == v.x && U.y == v.y && U.z == v.z)
-			return true;
-		else
-			return false;
-	}
-
-	bool operator!=(const Vector &U, const Vector &v)
-	{
-		if(U.x != v.x || U.y != v.y || U.z != v.z)
-			return true;
-		else
-			return false;
-	}
-
-	bool operator>(const Vector &u, const Vector &v)
-	{
-		if((u^2) > (v^2))
-			return true;
-		else
-			return false;
-	}
-
-	bool operator<(const Vector &u, const Vector &v)
-	{
-		if((u^2) < (v^2))
-			return true;
-		else
-			return false;
-	}
-
-	Vector operator+(const Vector &u, const Vector &v)
-	{
-		return Vector(u.x + v.x, u.y + v.y, u.z + v.z);
-	}
-
-	Vector operator-(const Vector &u, const Vector &v)
-	{
-		return Vector(u.x - v.x, u.y - v.y, u.z - v.z);
-	}
-
-	float operator*(const Vector &u, const Vector &v)
-	{
-		return u.x * v.x + u.y * v.y + u.z * v.z;
-	}
-
-	Vector operator*(float s, const Vector &v)
-	{
-		return Vector(s * v.x, s * v.y, s * v.z);
-	}
-
-	Vector operator*(const Vector &v, float s)
-	{
-		return Vector(v.x * s, v.y * s, v.z * s);
-	}
-
-	Vector operator/(const Vector &v, float s)
-	{
-		float r = 1.0f / s;
-
-		return Vector(v.x * r, v.y * r, v.z * r);
-	}
-
-	float operator^(const Vector &u, const Vector &v)
-	{
-		return acos(u / Vector::N(u) * v / Vector::N(v));
-	}
-
-	Vector operator%(const Vector &u, const Vector &v)
-	{
-		return Vector(u.y * v.z - u.z * v.y, u.z * v.x - u.x * v.z, u.x * v.y - u.y * v.x);
-	}
-
-	Vector operator*(const Matrix &M, const Vector &v)
-	{
-		return Vector(M(1, 1) * v.x + M(1, 2) * v.y + M(1, 3) * v.z,
-		              M(2, 1) * v.x + M(2, 2) * v.y + M(2, 3) * v.z,
-		              M(3, 1) * v.x + M(3, 2) * v.y + M(3, 3) * v.z);
-	}
-
-	Vector operator*(const Vector &v, const Matrix &M)
-	{
-		return Vector(v.x * M(1, 1) + v.y * M(2, 1) + v.z * M(3, 1) + M(4, 1),
-		              v.x * M(1, 2) + v.y * M(2, 2) + v.z * M(3, 2) + M(4, 2),
-		              v.x * M(1, 3) + v.y * M(2, 3) + v.z * M(3, 3) + M(4, 3));
-	}
-
-	Vector &operator*=(Vector &v, const Matrix &M)
-	{
-		return v = v * M;
-	}
-
-	float Vector::N(const Vector &v)
-	{
-		return sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
-	}
-
-	float Vector::N2(const Vector &v)
-	{
-		return v.x*v.x + v.y*v.y + v.z*v.z;
-	}
-
-	Vector lerp(const Vector &u, const Vector &v, float t)
-	{
-		return Vector(u.x + t * (v.x - u.x),
-		              u.y + t * (v.y - u.y),
-		              u.z + t * (v.z - u.z));
-	}
+	return *this;
 }
+
+Vector Vector::operator-() const
+{
+	return Vector(-x, -y, -z);
+}
+
+Vector &Vector::operator+=(const Vector &v)
+{
+	x += v.x;
+	y += v.y;
+	z += v.z;
+
+	return *this;
+}
+
+Vector &Vector::operator-=(const Vector &v)
+{
+	x -= v.x;
+	y -= v.y;
+	z -= v.z;
+
+	return *this;
+}
+
+Vector &Vector::operator*=(float s)
+{
+	x *= s;
+	y *= s;
+	z *= s;
+
+	return *this;
+}
+
+Vector &Vector::operator/=(float s)
+{
+	float r = 1.0f / s;
+
+	return *this *= r;
+}
+
+bool operator==(const Vector &U, const Vector &v)
+{
+	if(U.x == v.x && U.y == v.y && U.z == v.z)
+		return true;
+	else
+		return false;
+}
+
+bool operator!=(const Vector &U, const Vector &v)
+{
+	if(U.x != v.x || U.y != v.y || U.z != v.z)
+		return true;
+	else
+		return false;
+}
+
+bool operator>(const Vector &u, const Vector &v)
+{
+	if((u^2) > (v^2))
+		return true;
+	else
+		return false;
+}
+
+bool operator<(const Vector &u, const Vector &v)
+{
+	if((u^2) < (v^2))
+		return true;
+	else
+		return false;
+}
+
+Vector operator+(const Vector &u, const Vector &v)
+{
+	return Vector(u.x + v.x, u.y + v.y, u.z + v.z);
+}
+
+Vector operator-(const Vector &u, const Vector &v)
+{
+	return Vector(u.x - v.x, u.y - v.y, u.z - v.z);
+}
+
+float operator*(const Vector &u, const Vector &v)
+{
+	return u.x * v.x + u.y * v.y + u.z * v.z;
+}
+
+Vector operator*(float s, const Vector &v)
+{
+	return Vector(s * v.x, s * v.y, s * v.z);
+}
+
+Vector operator*(const Vector &v, float s)
+{
+	return Vector(v.x * s, v.y * s, v.z * s);
+}
+
+Vector operator/(const Vector &v, float s)
+{
+	float r = 1.0f / s;
+
+	return Vector(v.x * r, v.y * r, v.z * r);
+}
+
+float operator^(const Vector &u, const Vector &v)
+{
+	return acos(u / Vector::N(u) * v / Vector::N(v));
+}
+
+Vector operator%(const Vector &u, const Vector &v)
+{
+	return Vector(u.y * v.z - u.z * v.y, u.z * v.x - u.x * v.z, u.x * v.y - u.y * v.x);
+}
+
+Vector operator*(const Matrix &M, const Vector &v)
+{
+	return Vector(M(1, 1) * v.x + M(1, 2) * v.y + M(1, 3) * v.z,
+	              M(2, 1) * v.x + M(2, 2) * v.y + M(2, 3) * v.z,
+	              M(3, 1) * v.x + M(3, 2) * v.y + M(3, 3) * v.z);
+}
+
+Vector operator*(const Vector &v, const Matrix &M)
+{
+	return Vector(v.x * M(1, 1) + v.y * M(2, 1) + v.z * M(3, 1) + M(4, 1),
+	              v.x * M(1, 2) + v.y * M(2, 2) + v.z * M(3, 2) + M(4, 2),
+	              v.x * M(1, 3) + v.y * M(2, 3) + v.z * M(3, 3) + M(4, 3));
+}
+
+Vector &operator*=(Vector &v, const Matrix &M)
+{
+	return v = v * M;
+}
+
+float Vector::N(const Vector &v)
+{
+	return sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
+}
+
+float Vector::N2(const Vector &v)
+{
+	return v.x*v.x + v.y*v.y + v.z*v.z;
+}
+
+Vector lerp(const Vector &u, const Vector &v, float t)
+{
+	return Vector(u.x + t * (v.x - u.x),
+	              u.y + t * (v.y - u.y),
+	              u.z + t * (v.z - u.z));
+}
+
+}  // namespace sw
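
Note: Vector overloads several operators with meanings given by the comments
in Vector.hpp ('*' is the dot product, '%' the cross product, '^' the angle
between vectors, and N() the Euclidean norm). A short usage sketch, assuming
the sw::Vector definitions in this file:

	void example()
	{
		sw::Vector u(1.0f, 0.0f, 0.0f);
		sw::Vector v(0.0f, 1.0f, 0.0f);

		float dot = u * v;                // 0.0f: perpendicular vectors
		sw::Vector cross = u % v;         // (0, 0, 1): right-handed cross product
		float angle = u ^ v;              // pi/2 radians: acos of the normalized dot
		float length = sw::Vector::N(u);  // 1.0f: Euclidean norm
	}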
diff --git a/src/Device/Vector.hpp b/src/Device/Vector.hpp
index e7f261d..0df6f5a 100644
--- a/src/Device/Vector.hpp
+++ b/src/Device/Vector.hpp
@@ -15,139 +15,143 @@
 #ifndef Vector_hpp
 #define Vector_hpp
 
-namespace sw
+namespace sw {
+
+struct Point;
+struct Matrix;
+struct Plane;
+
+struct Vector
 {
-	struct Point;
-	struct Matrix;
-	struct Plane;
+	Vector();
+	Vector(const int i);
+	Vector(const Vector &v);
+	Vector(const Point &p);
+	Vector(float v_x, float v_y, float v_z);
 
-	struct Vector
+	Vector &operator=(const Vector &v);
+
+	union
 	{
-		Vector();
-		Vector(const int i);
-		Vector(const Vector &v);
-		Vector(const Point &p);
-		Vector(float v_x, float v_y, float v_z);
+		float v[3];
 
-		Vector &operator=(const Vector &v);
-
-		union
+		struct
 		{
-			float v[3];
-
-			struct
-			{
-				float x;
-				float y;
-				float z;
-			};
+			float x;
+			float y;
+			float z;
 		};
-
-		float &operator[](int i);
-		float &operator()(int i);
-
-		const float &operator[](int i) const;
-		const float &operator()(int i) const;
-
-		Vector operator+() const;
-		Vector operator-() const;
-
-		Vector &operator+=(const Vector &v);
-		Vector &operator-=(const Vector &v);
-		Vector &operator*=(float s);
-		Vector &operator/=(float s);
-
-		friend bool operator==(const Vector &u, const Vector &v);
-		friend bool operator!=(const Vector &u, const Vector &v);
-
-		friend Vector operator+(const Vector &u, const Vector &v);
-		friend Vector operator-(const Vector &u, const Vector &v);
-		friend float operator*(const Vector &u, const Vector &v);   // Dot product
-		friend Vector operator*(float s, const Vector &v);
-		friend Vector operator*(const Vector &v, float s);
-		friend Vector operator/(const Vector &v, float s);
-		friend float operator^(const Vector &u, const Vector &v);   // Angle between vectors
-		friend Vector operator%(const Vector &u, const Vector &v);   // Cross product
-
-		friend Vector operator*(const Matrix &M, const Vector& v);
-		friend Vector operator*(const Vector &v, const Matrix &M);
-		friend Vector &operator*=(Vector &v, const Matrix &M);
-
-		static float N(const Vector &v);   // Norm
-		static float N2(const Vector &v);   // Squared norm
-
-		static Vector mirror(const Vector &v, const Plane &p);
-		static Vector reflect(const Vector &v, const Plane &p);
-		static Vector lerp(const Vector &u, const Vector &v, float t);
 	};
-}
+
+	float &operator[](int i);
+	float &operator()(int i);
+
+	const float &operator[](int i) const;
+	const float &operator()(int i) const;
+
+	Vector operator+() const;
+	Vector operator-() const;
+
+	Vector &operator+=(const Vector &v);
+	Vector &operator-=(const Vector &v);
+	Vector &operator*=(float s);
+	Vector &operator/=(float s);
+
+	friend bool operator==(const Vector &u, const Vector &v);
+	friend bool operator!=(const Vector &u, const Vector &v);
+
+	friend Vector operator+(const Vector &u, const Vector &v);
+	friend Vector operator-(const Vector &u, const Vector &v);
+	friend float operator*(const Vector &u, const Vector &v);   // Dot product
+	friend Vector operator*(float s, const Vector &v);
+	friend Vector operator*(const Vector &v, float s);
+	friend Vector operator/(const Vector &v, float s);
+	friend float operator^(const Vector &u, const Vector &v);   // Angle between vectors
+	friend Vector operator%(const Vector &u, const Vector &v);   // Cross product
+
+	friend Vector operator*(const Matrix &M, const Vector& v);
+	friend Vector operator*(const Vector &v, const Matrix &M);
+	friend Vector &operator*=(Vector &v, const Matrix &M);
+
+	static float N(const Vector &v);   // Norm
+	static float N2(const Vector &v);   // Squared norm
+
+	static Vector mirror(const Vector &v, const Plane &p);
+	static Vector reflect(const Vector &v, const Plane &p);
+	static Vector lerp(const Vector &u, const Vector &v, float t);
+};
+
+}  // namespace sw
+
+/* Inline implementation */
 
 #include "Point.hpp"
 
-namespace sw
+namespace sw {
+
+inline Vector::Vector()
 {
-	inline Vector::Vector()
-	{
-	}
-
-	inline Vector::Vector(const int i)
-	{
-		const float s = (float)i;
-
-		x = s;
-		y = s;
-		z = s;
-	}
-
-	inline Vector::Vector(const Vector &v)
-	{
-		x = v.x;
-		y = v.y;
-		z = v.z;
-	}
-
-	inline Vector::Vector(const Point &P)
-	{
-		x = P.x;
-		y = P.y;
-		z = P.z;
-	}
-
-	inline Vector::Vector(float v_x, float v_y, float v_z)
-	{
-		x = v_x;
-		y = v_y;
-		z = v_z;
-	}
-
-	inline Vector &Vector::operator=(const Vector &v)
-	{
-		x = v.x;
-		y = v.y;
-		z = v.z;
-
-		return *this;
-	}
-
-	inline float &Vector::operator()(int i)
-	{
-		return v[i];
-	}
-
-	inline float &Vector::operator[](int i)
-	{
-		return v[i];
-	}
-
-	inline const float &Vector::operator()(int i) const
-	{
-		return v[i];
-	}
-
-	inline const float &Vector::operator[](int i) const
-	{
-		return v[i];
-	}
 }
 
+inline Vector::Vector(const int i)
+{
+	const float s = (float)i;
+
+	x = s;
+	y = s;
+	z = s;
+}
+
+inline Vector::Vector(const Vector &v)
+{
+	x = v.x;
+	y = v.y;
+	z = v.z;
+}
+
+inline Vector::Vector(const Point &P)
+{
+	x = P.x;
+	y = P.y;
+	z = P.z;
+}
+
+inline Vector::Vector(float v_x, float v_y, float v_z)
+{
+	x = v_x;
+	y = v_y;
+	z = v_z;
+}
+
+inline Vector &Vector::operator=(const Vector &v)
+{
+	x = v.x;
+	y = v.y;
+	z = v.z;
+
+	return *this;
+}
+
+inline float &Vector::operator()(int i)
+{
+	return v[i];
+}
+
+inline float &Vector::operator[](int i)
+{
+	return v[i];
+}
+
+inline const float &Vector::operator()(int i) const
+{
+	return v[i];
+}
+
+inline const float &Vector::operator[](int i) const
+{
+	return v[i];
+}
+
+}  // namespace sw
+
 #endif   // Vector_hpp
diff --git a/src/Device/Vertex.hpp b/src/Device/Vertex.hpp
index 050a925..63af666 100644
--- a/src/Device/Vertex.hpp
+++ b/src/Device/Vertex.hpp
@@ -19,42 +19,43 @@
 #include "System/Types.hpp"
 #include "Device/Config.hpp"
 
-namespace sw
+namespace sw {
+
+ALIGN(16, struct Vertex
 {
-	ALIGN(16, struct Vertex
+	union
 	{
-		union
+		struct
 		{
-			struct
-			{
-				float x;
-				float y;
-				float z;
-				float w;
-			};
-
-			float4 position;
-		};
-
-		float pointSize;
-
-		int clipFlags;
-		int cullMask;
-		float clipDistance[MAX_CLIP_DISTANCES];
-		float cullDistance[MAX_CLIP_DISTANCES];
-
-		alignas(16) struct
-		{
-			int x;
-			int y;
+			float x;
+			float y;
 			float z;
 			float w;
-		} projected;
+		};
 
-		alignas(16) float v[MAX_INTERFACE_COMPONENTS];
-	});
+		float4 position;
+	};
 
-	static_assert((sizeof(Vertex) & 0x0000000F) == 0, "Vertex size not a multiple of 16 bytes (alignment requirement)");
-}
+	float pointSize;
+
+	int clipFlags;
+	int cullMask;
+	float clipDistance[MAX_CLIP_DISTANCES];
+	float cullDistance[MAX_CLIP_DISTANCES];
+
+	alignas(16) struct
+	{
+		int x;
+		int y;
+		float z;
+		float w;
+	} projected;
+
+	alignas(16) float v[MAX_INTERFACE_COMPONENTS];
+});
+
+static_assert((sizeof(Vertex) & 0x0000000F) == 0, "Vertex size not a multiple of 16 bytes (alignment requirement)");
+
+}  // namespace sw
 
 #endif   // Vertex_hpp
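
Note: the static_assert above tests divisibility by 16 with a bit mask; for
a power-of-two divisor, (n & (16 - 1)) == 0 holds exactly when n % 16 == 0,
since the low four bits are the remainder. A self-contained illustration
(Example is hypothetical):

	struct alignas(16) Example
	{
		float data[4];
	};

	static_assert((sizeof(Example) & 0x0000000F) == 0, "mask form");
	static_assert(sizeof(Example) % 16 == 0, "equivalent modulo form");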
diff --git a/src/Device/VertexProcessor.cpp b/src/Device/VertexProcessor.cpp
index c6b96e1..e77b2f7 100644
--- a/src/Device/VertexProcessor.cpp
+++ b/src/Device/VertexProcessor.cpp
@@ -21,124 +21,125 @@
 
 #include <cstring>
 
-namespace sw
+namespace sw {
+
+void VertexCache::clear()
 {
-	void VertexCache::clear()
+	for(uint32_t i = 0; i < SIZE; i++)
 	{
-		for(uint32_t i = 0; i < SIZE; i++)
-		{
-			tag[i] = 0xFFFFFFFF;
-		}
-	}
-
-	uint32_t VertexProcessor::States::computeHash()
-	{
-		uint32_t *state = reinterpret_cast<uint32_t*>(this);
-		uint32_t hash = 0;
-
-		for(unsigned int i = 0; i < sizeof(States) / sizeof(uint32_t); i++)
-		{
-			hash ^= state[i];
-		}
-
-		return hash;
-	}
-
-	unsigned int VertexProcessor::States::Input::bytesPerAttrib() const
-	{
-		switch(type)
-		{
-		case STREAMTYPE_FLOAT:
-		case STREAMTYPE_INT:
-		case STREAMTYPE_UINT:
-			return count * sizeof(uint32_t);
-		case STREAMTYPE_HALF:
-		case STREAMTYPE_SHORT:
-		case STREAMTYPE_USHORT:
-			return count * sizeof(uint16_t);
-		case STREAMTYPE_BYTE:
-		case STREAMTYPE_SBYTE:
-			return count * sizeof(uint8_t);
-		case STREAMTYPE_COLOR:
-		case STREAMTYPE_2_10_10_10_INT:
-		case STREAMTYPE_2_10_10_10_UINT:
-			return sizeof(int);
-		default:
-			UNSUPPORTED("stream.type %d", int(type));
-		}
-
-		return 0;
-	}
-
-	bool VertexProcessor::State::operator==(const State &state) const
-	{
-		if(hash != state.hash)
-		{
-			return false;
-		}
-
-		static_assert(is_memcmparable<State>::value, "Cannot memcmp States");
-		return memcmp(static_cast<const States*>(this), static_cast<const States*>(&state), sizeof(States)) == 0;
-	}
-
-	VertexProcessor::VertexProcessor()
-	{
-		routineCache = nullptr;
-		setRoutineCacheSize(1024);
-	}
-
-	VertexProcessor::~VertexProcessor()
-	{
-		delete routineCache;
-		routineCache = nullptr;
-	}
-
-	void VertexProcessor::setRoutineCacheSize(int cacheSize)
-	{
-		delete routineCache;
-		routineCache = new RoutineCacheType(clamp(cacheSize, 1, 65536));
-	}
-
-	const VertexProcessor::State VertexProcessor::update(const sw::Context* context)
-	{
-		State state;
-
-		state.shaderID = context->vertexShader->getSerialID();
-		state.robustBufferAccess = context->robustBufferAccess;
-		state.isPoint = context->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
-
-		for(int i = 0; i < MAX_INTERFACE_COMPONENTS / 4; i++)
-		{
-			state.input[i].type = context->input[i].type;
-			state.input[i].count = context->input[i].count;
-			state.input[i].normalized = context->input[i].normalized;
-			// TODO: get rid of attribType -- just keep the VK format all the way through, this fully determines
-			// how to handle the attribute.
-			state.input[i].attribType = context->vertexShader->inputs[i*4].Type;
-		}
-
-		state.hash = state.computeHash();
-
-		return state;
-	}
-
-	VertexProcessor::RoutineType VertexProcessor::routine(const State &state,
-	                                                      vk::PipelineLayout const *pipelineLayout,
-	                                                      SpirvShader const *vertexShader,
-	                                                      const vk::DescriptorSet::Bindings &descriptorSets)
-	{
-		auto routine = routineCache->query(state);
-
-		if(!routine)   // Create one
-		{
-			VertexRoutine *generator = new VertexProgram(state, pipelineLayout, vertexShader, descriptorSets);
-			generator->generate();
-			routine = (*generator)("VertexRoutine_%0.8X", state.shaderID);
-			delete generator;
-
-			routineCache->add(state, routine);
-		}
-
-		return routine;
+		tag[i] = 0xFFFFFFFF;
 	}
 }
+
+uint32_t VertexProcessor::States::computeHash()
+{
+	uint32_t *state = reinterpret_cast<uint32_t*>(this);
+	uint32_t hash = 0;
+
+	for(unsigned int i = 0; i < sizeof(States) / sizeof(uint32_t); i++)
+	{
+		hash ^= state[i];
+	}
+
+	return hash;
+}
+
+unsigned int VertexProcessor::States::Input::bytesPerAttrib() const
+{
+	switch(type)
+	{
+	case STREAMTYPE_FLOAT:
+	case STREAMTYPE_INT:
+	case STREAMTYPE_UINT:
+		return count * sizeof(uint32_t);
+	case STREAMTYPE_HALF:
+	case STREAMTYPE_SHORT:
+	case STREAMTYPE_USHORT:
+		return count * sizeof(uint16_t);
+	case STREAMTYPE_BYTE:
+	case STREAMTYPE_SBYTE:
+		return count * sizeof(uint8_t);
+	case STREAMTYPE_COLOR:
+	case STREAMTYPE_2_10_10_10_INT:
+	case STREAMTYPE_2_10_10_10_UINT:
+		return sizeof(int);
+	default:
+		UNSUPPORTED("stream.type %d", int(type));
+	}
+
+	return 0;
+}
+
+bool VertexProcessor::State::operator==(const State &state) const
+{
+	if(hash != state.hash)
+	{
+		return false;
+	}
+
+	static_assert(is_memcmparable<State>::value, "Cannot memcmp States");
+	return memcmp(static_cast<const States*>(this), static_cast<const States*>(&state), sizeof(States)) == 0;
+}
+
+VertexProcessor::VertexProcessor()
+{
+	routineCache = nullptr;
+	setRoutineCacheSize(1024);
+}
+
+VertexProcessor::~VertexProcessor()
+{
+	delete routineCache;
+	routineCache = nullptr;
+}
+
+void VertexProcessor::setRoutineCacheSize(int cacheSize)
+{
+	delete routineCache;
+	routineCache = new RoutineCacheType(clamp(cacheSize, 1, 65536));
+}
+
+const VertexProcessor::State VertexProcessor::update(const sw::Context* context)
+{
+	State state;
+
+	state.shaderID = context->vertexShader->getSerialID();
+	state.robustBufferAccess = context->robustBufferAccess;
+	state.isPoint = context->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
+
+	for(int i = 0; i < MAX_INTERFACE_COMPONENTS / 4; i++)
+	{
+		state.input[i].type = context->input[i].type;
+		state.input[i].count = context->input[i].count;
+		state.input[i].normalized = context->input[i].normalized;
+		// TODO: get rid of attribType -- just keep the VK format all the way through, this fully determines
+		// how to handle the attribute.
+		state.input[i].attribType = context->vertexShader->inputs[i*4].Type;
+	}
+
+	state.hash = state.computeHash();
+
+	return state;
+}
+
+VertexProcessor::RoutineType VertexProcessor::routine(const State &state,
+                                                      vk::PipelineLayout const *pipelineLayout,
+                                                      SpirvShader const *vertexShader,
+                                                      const vk::DescriptorSet::Bindings &descriptorSets)
+{
+	auto routine = routineCache->query(state);
+
+	if(!routine)   // Create one
+	{
+		VertexRoutine *generator = new VertexProgram(state, pipelineLayout, vertexShader, descriptorSets);
+		generator->generate();
+		routine = (*generator)("VertexRoutine_%0.8X", state.shaderID);
+		delete generator;
+
+		routineCache->add(state, routine);
+	}
+
+	return routine;
+}
+
+}  // namespace sw
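
Note: bytesPerAttrib() above maps a stream's component type and count to the
attribute's size in bytes. A worked example, using the StreamType values
from Stream.hpp:

	// A four-component float attribute:
	//   type = STREAMTYPE_FLOAT, count = 4  ->  4 * sizeof(uint32_t) = 16 bytes
	// A packed 2_10_10_10 attribute occupies a single 32-bit word regardless
	// of count:
	//   type = STREAMTYPE_2_10_10_10_INT    ->  sizeof(int) = 4 bytes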
diff --git a/src/Device/VertexProcessor.hpp b/src/Device/VertexProcessor.hpp
index 62a65d1..c94e82e 100644
--- a/src/Device/VertexProcessor.hpp
+++ b/src/Device/VertexProcessor.hpp
@@ -22,91 +22,92 @@
 #include "Vertex.hpp"
 #include "Pipeline/SpirvShader.hpp"
 
-namespace sw
+namespace sw {
+
+struct DrawData;
+
+// Basic direct mapped vertex cache.
+struct VertexCache
 {
-	struct DrawData;
+	static constexpr uint32_t SIZE = 64;  // TODO: Variable size?
+	static constexpr uint32_t TAG_MASK = SIZE - 1;  // Size must be power of 2.
 
-	// Basic direct mapped vertex cache.
-	struct VertexCache
+	void clear();
+
+	Vertex vertex[SIZE];
+	uint32_t tag[SIZE];
+
+	// Identifier of the draw call for the cache data. If this cache is
+	// used with a different draw call, then the cache should be invalidated
+	// before use.
+	int drawCall = -1;
+};
+
+struct VertexTask
+{
+	unsigned int vertexCount;
+	unsigned int primitiveStart;
+	VertexCache vertexCache;
+};
+
+using VertexRoutineFunction = FunctionT<void(Vertex* output, unsigned int* batch, VertexTask* vertextask, DrawData* draw)>;
+
+class VertexProcessor
+{
+public:
+	struct States : Memset<States>
 	{
-		static constexpr uint32_t SIZE = 64;  // TODO: Variable size?
-		static constexpr uint32_t TAG_MASK = SIZE - 1;  // Size must be power of 2.
+		States() : Memset(this, 0) {}
 
-		void clear();
+		uint32_t computeHash();
 
-		Vertex vertex[SIZE];
-		uint32_t tag[SIZE];
+		uint64_t shaderID;
 
-		// Identifier of the draw call for the cache data. If this cache is
-		// used with a different draw call, then the cache should be invalidated
-		// before use.
-		int drawCall = -1;
-	};
-
-	struct VertexTask
-	{
-		unsigned int vertexCount;
-		unsigned int primitiveStart;
-		VertexCache vertexCache;
-	};
-
-	using VertexRoutineFunction = FunctionT<void(Vertex* output, unsigned int* batch, VertexTask* vertextask, DrawData* draw)>;
-
-	class VertexProcessor
-	{
-	public:
-		struct States : Memset<States>
+		struct Input
 		{
-			States() : Memset(this, 0) {}
-
-			uint32_t computeHash();
-
-			uint64_t shaderID;
-
-			struct Input
+			operator bool() const   // Returns true if stream contains data
 			{
-				operator bool() const   // Returns true if stream contains data
-				{
-					return count != 0;
-				}
+				return count != 0;
+			}
 
-				unsigned int bytesPerAttrib() const;
+			unsigned int bytesPerAttrib() const;
 
-				StreamType type    : BITS(STREAMTYPE_LAST);
-				unsigned int count : 3;
-				bool normalized    : 1;
-				unsigned int attribType : BITS(SpirvShader::ATTRIBTYPE_LAST);
-			};
-
-			Input input[MAX_INTERFACE_COMPONENTS / 4];
-			bool robustBufferAccess : 1;
-			bool isPoint : 1;
+			StreamType type    : BITS(STREAMTYPE_LAST);
+			unsigned int count : 3;
+			bool normalized    : 1;
+			unsigned int attribType : BITS(SpirvShader::ATTRIBTYPE_LAST);
 		};
 
-		struct State : States
-		{
-			bool operator==(const State &state) const;
-
-			uint32_t hash;
-		};
-
-		using RoutineType = VertexRoutineFunction::RoutineType;
-
-		VertexProcessor();
-
-		virtual ~VertexProcessor();
-
-	protected:
-		const State update(const sw::Context* context);
-		RoutineType routine(const State &state, vk::PipelineLayout const *pipelineLayout,
-		                                 SpirvShader const *vertexShader, const vk::DescriptorSet::Bindings &descriptorSets);
-
-		void setRoutineCacheSize(int cacheSize);
-
-	private:
-		using RoutineCacheType = RoutineCacheT<State, VertexRoutineFunction::CFunctionType>;
-		RoutineCacheType *routineCache;
+		Input input[MAX_INTERFACE_COMPONENTS / 4];
+		bool robustBufferAccess : 1;
+		bool isPoint : 1;
 	};
-}
+
+	struct State : States
+	{
+		bool operator==(const State &state) const;
+
+		uint32_t hash;
+	};
+
+	using RoutineType = VertexRoutineFunction::RoutineType;
+
+	VertexProcessor();
+
+	virtual ~VertexProcessor();
+
+protected:
+	const State update(const sw::Context* context);
+	RoutineType routine(const State &state, vk::PipelineLayout const *pipelineLayout,
+	                    SpirvShader const *vertexShader, const vk::DescriptorSet::Bindings &descriptorSets);
+
+	void setRoutineCacheSize(int cacheSize);
+
+private:
+	using RoutineCacheType = RoutineCacheT<State, VertexRoutineFunction::CFunctionType>;
+	RoutineCacheType *routineCache;
+};
+
+}  // namespace sw
 
 #endif   // sw_VertexProcessor_hpp
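
Note: VertexCache above is a direct-mapped cache. Because SIZE is a power of
two, TAG_MASK turns a vertex index into a slot with a single AND, and tag[]
records which index currently owns each slot; clear() fills tag[] with
0xFFFFFFFF so a fresh cache never hits. A sketch of the probe (the lookup
function is hypothetical; the real probe is generated inside the vertex
routine):

	bool lookup(const sw::VertexCache &cache, uint32_t vertexIndex, sw::Vertex &out)
	{
		uint32_t slot = vertexIndex & sw::VertexCache::TAG_MASK;  // index % SIZE

		if(cache.tag[slot] == vertexIndex)  // Tag hit: the slot holds this vertex
		{
			out = cache.vertex[slot];
			return true;
		}

		return false;  // Miss: caller shades the vertex and refills the slot
	}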
diff --git a/src/Pipeline/ComputeProgram.cpp b/src/Pipeline/ComputeProgram.cpp
index d0f57be..dd45d17 100644
--- a/src/Pipeline/ComputeProgram.cpp
+++ b/src/Pipeline/ComputeProgram.cpp
@@ -24,268 +24,269 @@
 
 #include <queue>
 
-namespace
+namespace {
+
+enum { X, Y, Z };
+
+}  // anonymous namespace
+
+namespace sw {
+
+ComputeProgram::ComputeProgram(SpirvShader const *shader, vk::PipelineLayout const *pipelineLayout, const vk::DescriptorSet::Bindings &descriptorSets)
+	: shader(shader),
+	  pipelineLayout(pipelineLayout),
+	  descriptorSets(descriptorSets)
 {
-	enum { X, Y, Z };
-} // anonymous namespace
+}
 
-namespace sw
+ComputeProgram::~ComputeProgram()
 {
-	ComputeProgram::ComputeProgram(SpirvShader const *shader, vk::PipelineLayout const *pipelineLayout, const vk::DescriptorSet::Bindings &descriptorSets)
-		: shader(shader),
-		  pipelineLayout(pipelineLayout),
-		  descriptorSets(descriptorSets)
+}
+
+void ComputeProgram::generate()
+{
+	MARL_SCOPED_EVENT("ComputeProgram::generate");
+
+	SpirvRoutine routine(pipelineLayout);
+	shader->emitProlog(&routine);
+	emit(&routine);
+	shader->emitEpilog(&routine);
+}
+
+void ComputeProgram::setWorkgroupBuiltins(Pointer<Byte> data, SpirvRoutine* routine, Int workgroupID[3])
+{
+	routine->setInputBuiltin(shader, spv::BuiltInNumWorkgroups, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
 	{
-	}
-
-	ComputeProgram::~ComputeProgram()
-	{
-	}
-
-	void ComputeProgram::generate()
-	{
-		MARL_SCOPED_EVENT("ComputeProgram::generate");
-
-		SpirvRoutine routine(pipelineLayout);
-		shader->emitProlog(&routine);
-		emit(&routine);
-		shader->emitEpilog(&routine);
-	}
-
-	void ComputeProgram::setWorkgroupBuiltins(Pointer<Byte> data, SpirvRoutine* routine, Int workgroupID[3])
-	{
-		routine->setInputBuiltin(shader, spv::BuiltInNumWorkgroups, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+		auto numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
+		for (uint32_t component = 0; component < builtin.SizeInComponents; component++)
 		{
-			auto numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
-			for (uint32_t component = 0; component < builtin.SizeInComponents; component++)
-			{
-				value[builtin.FirstComponent + component] =
-					As<SIMD::Float>(SIMD::Int(Extract(numWorkgroups, component)));
-			}
-		});
-
-		routine->setInputBuiltin(shader, spv::BuiltInWorkgroupId, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			for (uint32_t component = 0; component < builtin.SizeInComponents; component++)
-			{
-				value[builtin.FirstComponent + component] =
-					As<SIMD::Float>(SIMD::Int(workgroupID[component]));
-			}
-		});
-
-		routine->setInputBuiltin(shader, spv::BuiltInWorkgroupSize, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			auto workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));
-			for (uint32_t component = 0; component < builtin.SizeInComponents; component++)
-			{
-				value[builtin.FirstComponent + component] =
-					As<SIMD::Float>(SIMD::Int(Extract(workgroupSize, component)));
-			}
-		});
-
-		routine->setInputBuiltin(shader, spv::BuiltInNumSubgroups, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			ASSERT(builtin.SizeInComponents == 1);
-			auto subgroupsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, subgroupsPerWorkgroup));
-			value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(subgroupsPerWorkgroup));
-		});
-
-		routine->setInputBuiltin(shader, spv::BuiltInSubgroupSize, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			ASSERT(builtin.SizeInComponents == 1);
-			auto invocationsPerSubgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerSubgroup));
-			value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(invocationsPerSubgroup));
-		});
-
-		routine->setImmutableInputBuiltins(shader);
-	}
-
-	void ComputeProgram::setSubgroupBuiltins(Pointer<Byte> data, SpirvRoutine* routine, Int workgroupID[3], SIMD::Int localInvocationIndex, Int subgroupIndex)
-	{
-		Int4 numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
-		Int4 workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));
-
-		// TODO: Fix Int4 swizzles so we can just use workgroupSize.x, workgroupSize.y.
-		Int workgroupSizeX = Extract(workgroupSize, X);
-		Int workgroupSizeY = Extract(workgroupSize, Y);
-
-		SIMD::Int localInvocationID[3];
-		{
-			SIMD::Int idx = localInvocationIndex;
-			localInvocationID[Z] = idx / SIMD::Int(workgroupSizeX * workgroupSizeY);
-			idx -= localInvocationID[Z] * SIMD::Int(workgroupSizeX * workgroupSizeY); // modulo
-			localInvocationID[Y] = idx / SIMD::Int(workgroupSizeX);
-			idx -= localInvocationID[Y] * SIMD::Int(workgroupSizeX); // modulo
-			localInvocationID[X] = idx;
+			value[builtin.FirstComponent + component] =
+				As<SIMD::Float>(SIMD::Int(Extract(numWorkgroups, component)));
 		}
+	});
 
-		routine->setInputBuiltin(shader, spv::BuiltInLocalInvocationIndex, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			ASSERT(builtin.SizeInComponents == 1);
-			value[builtin.FirstComponent] = As<SIMD::Float>(localInvocationIndex);
-		});
-
-		routine->setInputBuiltin(shader, spv::BuiltInSubgroupId, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			ASSERT(builtin.SizeInComponents == 1);
-			value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(subgroupIndex));
-		});
-
-		routine->setInputBuiltin(shader, spv::BuiltInLocalInvocationId, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			for (uint32_t component = 0; component < builtin.SizeInComponents; component++)
-			{
-				value[builtin.FirstComponent + component] =
-					As<SIMD::Float>(localInvocationID[component]);
-			}
-		});
-
-		routine->setInputBuiltin(shader, spv::BuiltInGlobalInvocationId, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			SIMD::Int wgID = 0;
-			wgID = Insert(wgID, workgroupID[X], X);
-			wgID = Insert(wgID, workgroupID[Y], Y);
-			wgID = Insert(wgID, workgroupID[Z], Z);
-			auto localBase = workgroupSize * wgID;
-			for (uint32_t component = 0; component < builtin.SizeInComponents; component++)
-			{
-				auto globalInvocationID = SIMD::Int(Extract(localBase, component)) + localInvocationID[component];
-				value[builtin.FirstComponent + component] = As<SIMD::Float>(globalInvocationID);
-			}
-		});
-	}
-
-	void ComputeProgram::emit(SpirvRoutine* routine)
+	routine->setInputBuiltin(shader, spv::BuiltInWorkgroupId, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
 	{
-		Pointer<Byte> data = Arg<0>();
-		Int workgroupX = Arg<1>();
-		Int workgroupY = Arg<2>();
-		Int workgroupZ = Arg<3>();
-		Pointer<Byte> workgroupMemory = Arg<4>();
-		Int firstSubgroup = Arg<5>();
-		Int subgroupCount = Arg<6>();
-
-		routine->descriptorSets = data + OFFSET(Data, descriptorSets);
-		routine->descriptorDynamicOffsets = data + OFFSET(Data, descriptorDynamicOffsets);
-		routine->pushConstants = data + OFFSET(Data, pushConstants);
-		routine->constants = *Pointer<Pointer<Byte>>(data + OFFSET(Data, constants));
-		routine->workgroupMemory = workgroupMemory;
-
-		Int invocationsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerWorkgroup));
-
-		Int workgroupID[3] = {workgroupX, workgroupY, workgroupZ};
-		setWorkgroupBuiltins(data, routine, workgroupID);
-
-		For(Int i = 0, i < subgroupCount, i++)
+		for (uint32_t component = 0; component < builtin.SizeInComponents; component++)
 		{
-			auto subgroupIndex = firstSubgroup + i;
-
-			// TODO: Replace SIMD::Int(0, 1, 2, 3) with SIMD-width equivalent
-			auto localInvocationIndex = SIMD::Int(subgroupIndex * SIMD::Width) + SIMD::Int(0, 1, 2, 3);
-
-			// Disable lanes where (invocationIDs >= invocationsPerWorkgroup)
-			auto activeLaneMask = CmpLT(localInvocationIndex, SIMD::Int(invocationsPerWorkgroup));
-
-			setSubgroupBuiltins(data, routine, workgroupID, localInvocationIndex, subgroupIndex);
-
-			shader->emit(routine, activeLaneMask, activeLaneMask, descriptorSets);
+			value[builtin.FirstComponent + component] =
+				As<SIMD::Float>(SIMD::Int(workgroupID[component]));
 		}
+	});
+
+	routine->setInputBuiltin(shader, spv::BuiltInWorkgroupSize, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		auto workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));
+		for (uint32_t component = 0; component < builtin.SizeInComponents; component++)
+		{
+			value[builtin.FirstComponent + component] =
+				As<SIMD::Float>(SIMD::Int(Extract(workgroupSize, component)));
+		}
+	});
+
+	routine->setInputBuiltin(shader, spv::BuiltInNumSubgroups, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		ASSERT(builtin.SizeInComponents == 1);
+		auto subgroupsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, subgroupsPerWorkgroup));
+		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(subgroupsPerWorkgroup));
+	});
+
+	routine->setInputBuiltin(shader, spv::BuiltInSubgroupSize, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		ASSERT(builtin.SizeInComponents == 1);
+		auto invocationsPerSubgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerSubgroup));
+		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(invocationsPerSubgroup));
+	});
+
+	routine->setImmutableInputBuiltins(shader);
+}
+
+void ComputeProgram::setSubgroupBuiltins(Pointer<Byte> data, SpirvRoutine* routine, Int workgroupID[3], SIMD::Int localInvocationIndex, Int subgroupIndex)
+{
+	Int4 numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
+	Int4 workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));
+
+	// TODO: Fix Int4 swizzles so we can just use workgroupSize.x, workgroupSize.y.
+	Int workgroupSizeX = Extract(workgroupSize, X);
+	Int workgroupSizeY = Extract(workgroupSize, Y);
+
+	SIMD::Int localInvocationID[3];
+	{
+		SIMD::Int idx = localInvocationIndex;
+		localInvocationID[Z] = idx / SIMD::Int(workgroupSizeX * workgroupSizeY);
+		idx -= localInvocationID[Z] * SIMD::Int(workgroupSizeX * workgroupSizeY); // modulo
+		localInvocationID[Y] = idx / SIMD::Int(workgroupSizeX);
+		idx -= localInvocationID[Y] * SIMD::Int(workgroupSizeX); // modulo
+		localInvocationID[X] = idx;
 	}
 
-	void ComputeProgram::run(
-		vk::DescriptorSet::Bindings const &descriptorSets,
-		vk::DescriptorSet::DynamicOffsets const &descriptorDynamicOffsets,
-		PushConstantStorage const &pushConstants,
-		uint32_t baseGroupX, uint32_t baseGroupY, uint32_t baseGroupZ,
-		uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
+	routine->setInputBuiltin(shader, spv::BuiltInLocalInvocationIndex, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
 	{
-		auto &modes = shader->getModes();
+		ASSERT(builtin.SizeInComponents == 1);
+		value[builtin.FirstComponent] = As<SIMD::Float>(localInvocationIndex);
+	});
 
-		auto invocationsPerSubgroup = SIMD::Width;
-		auto invocationsPerWorkgroup = modes.WorkgroupSizeX * modes.WorkgroupSizeY * modes.WorkgroupSizeZ;
-		auto subgroupsPerWorkgroup = (invocationsPerWorkgroup + invocationsPerSubgroup - 1) / invocationsPerSubgroup;
+	routine->setInputBuiltin(shader, spv::BuiltInSubgroupId, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		ASSERT(builtin.SizeInComponents == 1);
+		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(subgroupIndex));
+	});
 
-		Data data;
-		data.descriptorSets = descriptorSets;
-		data.descriptorDynamicOffsets = descriptorDynamicOffsets;
-		data.numWorkgroups[X] = groupCountX;
-		data.numWorkgroups[Y] = groupCountY;
-		data.numWorkgroups[Z] = groupCountZ;
-		data.numWorkgroups[3] = 0;
-		data.workgroupSize[X] = modes.WorkgroupSizeX;
-		data.workgroupSize[Y] = modes.WorkgroupSizeY;
-		data.workgroupSize[Z] = modes.WorkgroupSizeZ;
-		data.workgroupSize[3] = 0;
-		data.invocationsPerSubgroup = invocationsPerSubgroup;
-		data.invocationsPerWorkgroup = invocationsPerWorkgroup;
-		data.subgroupsPerWorkgroup = subgroupsPerWorkgroup;
-		data.pushConstants = pushConstants;
-		data.constants = &sw::constants;
-
-		marl::WaitGroup wg;
-		const uint32_t batchCount = 16;
-
-		auto groupCount = groupCountX * groupCountY * groupCountZ;
-
-		for (uint32_t batchID = 0; batchID < batchCount && batchID < groupCount; batchID++)
+	routine->setInputBuiltin(shader, spv::BuiltInLocalInvocationId, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		for (uint32_t component = 0; component < builtin.SizeInComponents; component++)
 		{
-			wg.add(1);
-			marl::schedule([=, &data]
-			{
-				defer(wg.done());
-				std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());
+			value[builtin.FirstComponent + component] =
+				As<SIMD::Float>(localInvocationID[component]);
+		}
+	});
 
-				for (uint32_t groupIndex = batchID; groupIndex < groupCount; groupIndex += batchCount)
+	routine->setInputBuiltin(shader, spv::BuiltInGlobalInvocationId, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		SIMD::Int wgID = 0;
+		wgID = Insert(wgID, workgroupID[X], X);
+		wgID = Insert(wgID, workgroupID[Y], Y);
+		wgID = Insert(wgID, workgroupID[Z], Z);
+		auto localBase = workgroupSize * wgID;
+		for (uint32_t component = 0; component < builtin.SizeInComponents; component++)
+		{
+			auto globalInvocationID = SIMD::Int(Extract(localBase, component)) + localInvocationID[component];
+			value[builtin.FirstComponent + component] = As<SIMD::Float>(globalInvocationID);
+		}
+	});
+}
+
+void ComputeProgram::emit(SpirvRoutine* routine)
+{
+	Pointer<Byte> data = Arg<0>();
+	Int workgroupX = Arg<1>();
+	Int workgroupY = Arg<2>();
+	Int workgroupZ = Arg<3>();
+	Pointer<Byte> workgroupMemory = Arg<4>();
+	Int firstSubgroup = Arg<5>();
+	Int subgroupCount = Arg<6>();
+
+	routine->descriptorSets = data + OFFSET(Data, descriptorSets);
+	routine->descriptorDynamicOffsets = data + OFFSET(Data, descriptorDynamicOffsets);
+	routine->pushConstants = data + OFFSET(Data, pushConstants);
+	routine->constants = *Pointer<Pointer<Byte>>(data + OFFSET(Data, constants));
+	routine->workgroupMemory = workgroupMemory;
+
+	Int invocationsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerWorkgroup));
+
+	Int workgroupID[3] = {workgroupX, workgroupY, workgroupZ};
+	setWorkgroupBuiltins(data, routine, workgroupID);
+
+	For(Int i = 0, i < subgroupCount, i++)
+	{
+		auto subgroupIndex = firstSubgroup + i;
+
+		// TODO: Replace SIMD::Int(0, 1, 2, 3) with SIMD-width equivalent
+		auto localInvocationIndex = SIMD::Int(subgroupIndex * SIMD::Width) + SIMD::Int(0, 1, 2, 3);
+
+		// Disable lanes where (invocationIDs >= invocationsPerWorkgroup)
+		auto activeLaneMask = CmpLT(localInvocationIndex, SIMD::Int(invocationsPerWorkgroup));
+
+		setSubgroupBuiltins(data, routine, workgroupID, localInvocationIndex, subgroupIndex);
+
+		shader->emit(routine, activeLaneMask, activeLaneMask, descriptorSets);
+	}
+}
+
+void ComputeProgram::run(
+	vk::DescriptorSet::Bindings const &descriptorSets,
+	vk::DescriptorSet::DynamicOffsets const &descriptorDynamicOffsets,
+	PushConstantStorage const &pushConstants,
+	uint32_t baseGroupX, uint32_t baseGroupY, uint32_t baseGroupZ,
+	uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
+{
+	auto &modes = shader->getModes();
+
+	auto invocationsPerSubgroup = SIMD::Width;
+	auto invocationsPerWorkgroup = modes.WorkgroupSizeX * modes.WorkgroupSizeY * modes.WorkgroupSizeZ;
+	auto subgroupsPerWorkgroup = (invocationsPerWorkgroup + invocationsPerSubgroup - 1) / invocationsPerSubgroup;
+
+	Data data;
+	data.descriptorSets = descriptorSets;
+	data.descriptorDynamicOffsets = descriptorDynamicOffsets;
+	data.numWorkgroups[X] = groupCountX;
+	data.numWorkgroups[Y] = groupCountY;
+	data.numWorkgroups[Z] = groupCountZ;
+	data.numWorkgroups[3] = 0;
+	data.workgroupSize[X] = modes.WorkgroupSizeX;
+	data.workgroupSize[Y] = modes.WorkgroupSizeY;
+	data.workgroupSize[Z] = modes.WorkgroupSizeZ;
+	data.workgroupSize[3] = 0;
+	data.invocationsPerSubgroup = invocationsPerSubgroup;
+	data.invocationsPerWorkgroup = invocationsPerWorkgroup;
+	data.subgroupsPerWorkgroup = subgroupsPerWorkgroup;
+	data.pushConstants = pushConstants;
+	data.constants = &sw::constants;
+
+	marl::WaitGroup wg;
+	const uint32_t batchCount = 16;
+
+	auto groupCount = groupCountX * groupCountY * groupCountZ;
+
+	for (uint32_t batchID = 0; batchID < batchCount && batchID < groupCount; batchID++)
+	{
+		wg.add(1);
+		marl::schedule([=, &data]
+		{
+			defer(wg.done());
+			std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());
+
+			for (uint32_t groupIndex = batchID; groupIndex < groupCount; groupIndex += batchCount)
+			{
+				auto modulo = groupIndex;
+				auto groupOffsetZ = modulo / (groupCountX * groupCountY);
+				modulo -= groupOffsetZ * (groupCountX * groupCountY);
+				auto groupOffsetY = modulo / groupCountX;
+				modulo -= groupOffsetY * groupCountX;
+				auto groupOffsetX = modulo;
+
+				auto groupZ = baseGroupZ + groupOffsetZ;
+				auto groupY = baseGroupY + groupOffsetY;
+				auto groupX = baseGroupX + groupOffsetX;
+				MARL_SCOPED_EVENT("groupX: %d, groupY: %d, groupZ: %d", groupX, groupY, groupZ);
+
+				using Coroutine = std::unique_ptr<rr::Stream<SpirvShader::YieldResult>>;
+				std::queue<Coroutine> coroutines;
+
+				if (modes.ContainsControlBarriers)
 				{
-					auto modulo = groupIndex;
-					auto groupOffsetZ = modulo / (groupCountX * groupCountY);
-					modulo -= groupOffsetZ * (groupCountX * groupCountY);
-					auto groupOffsetY = modulo / groupCountX;
-					modulo -= groupOffsetY * groupCountX;
-					auto groupOffsetX = modulo;
-
-					auto groupZ = baseGroupZ + groupOffsetZ;
-					auto groupY = baseGroupY + groupOffsetY;
-					auto groupX = baseGroupX + groupOffsetX;
-					MARL_SCOPED_EVENT("groupX: %d, groupY: %d, groupZ: %d", groupX, groupY, groupZ);
-
-					using Coroutine = std::unique_ptr<rr::Stream<SpirvShader::YieldResult>>;
-					std::queue<Coroutine> coroutines;
-
-					if (modes.ContainsControlBarriers)
+					// Make a function call per subgroup so each subgroup
+					// can yield, bringing all subgroups to the barrier
+					// together.
+					for(int subgroupIndex = 0; subgroupIndex < subgroupsPerWorkgroup; subgroupIndex++)
 					{
-						// Make a function call per subgroup so each subgroup
-						// can yield, bringing all subgroups to the barrier
-						// together.
-						for(int subgroupIndex = 0; subgroupIndex < subgroupsPerWorkgroup; subgroupIndex++)
-						{
-							auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), subgroupIndex, 1);
-							coroutines.push(std::move(coroutine));
-						}
-					}
-					else
-					{
-						auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), 0, subgroupsPerWorkgroup);
+						auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), subgroupIndex, 1);
 						coroutines.push(std::move(coroutine));
 					}
+				}
+				else
+				{
+					auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), 0, subgroupsPerWorkgroup);
+					coroutines.push(std::move(coroutine));
+				}
 
-					while (coroutines.size() > 0)
+				while (coroutines.size() > 0)
+				{
+					auto coroutine = std::move(coroutines.front());
+					coroutines.pop();
+
+					SpirvShader::YieldResult result;
+					if (coroutine->await(result))
 					{
-						auto coroutine = std::move(coroutines.front());
-						coroutines.pop();
-
-						SpirvShader::YieldResult result;
-						if (coroutine->await(result))
-						{
-							// TODO: Consider result (when the enum is more than 1 entry).
-							coroutines.push(std::move(coroutine));
-						}
+						// TODO: Consider result (when the enum is more than 1 entry).
+						coroutines.push(std::move(coroutine));
 					}
 				}
-			});
-		}
-
-		wg.wait();
+			}
+		});
 	}
 
-} // namespace sw
+	wg.wait();
+}
+
+}  // namespace sw
diff --git a/src/Pipeline/ComputeProgram.hpp b/src/Pipeline/ComputeProgram.hpp
index 3a04ed2..75f0cf4 100644
--- a/src/Pipeline/ComputeProgram.hpp
+++ b/src/Pipeline/ComputeProgram.hpp
@@ -23,68 +23,64 @@
 
 #include <functional>
 
-namespace vk
+namespace vk { class PipelineLayout; }
+
+namespace sw {
+
+using namespace rr;
+
+class DescriptorSetsLayout;
+struct Constants;
+
+// ComputeProgram builds a SPIR-V compute shader.
+class ComputeProgram : public Coroutine<SpirvShader::YieldResult(
+		void* data,
+		int32_t workgroupX,
+		int32_t workgroupY,
+		int32_t workgroupZ,
+		void* workgroupMemory,
+		int32_t firstSubgroup,
+		int32_t subgroupCount)>
 {
-	class PipelineLayout;
-} // namespace vk
+public:
+	ComputeProgram(SpirvShader const *spirvShader, vk::PipelineLayout const *pipelineLayout, const vk::DescriptorSet::Bindings &descriptorSets);
 
-namespace sw
-{
+	virtual ~ComputeProgram();
 
-	using namespace rr;
+	// generate builds the shader program.
+	void generate();
 
-	class DescriptorSetsLayout;
-	struct Constants;
+	// run executes the compute shader routine for all workgroups.
+	void run(
+		vk::DescriptorSet::Bindings const &descriptorSetBindings,
+		vk::DescriptorSet::DynamicOffsets const &descriptorDynamicOffsets,
+		PushConstantStorage const &pushConstants,
+		uint32_t baseGroupX, uint32_t baseGroupY, uint32_t baseGroupZ,
+		uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ);
 
-	// ComputeProgram builds a SPIR-V compute shader.
-	class ComputeProgram : public Coroutine<SpirvShader::YieldResult(
-			void* data,
-			int32_t workgroupX,
-			int32_t workgroupY,
-			int32_t workgroupZ,
-			void* workgroupMemory,
-			int32_t firstSubgroup,
-			int32_t subgroupCount)>
+protected:
+	void emit(SpirvRoutine* routine);
+	void setWorkgroupBuiltins(Pointer<Byte> data, SpirvRoutine* routine, Int workgroupID[3]);
+	void setSubgroupBuiltins(Pointer<Byte> data, SpirvRoutine* routine, Int workgroupID[3], SIMD::Int localInvocationIndex, Int subgroupIndex);
+
+	struct Data
 	{
-	public:
-		ComputeProgram(SpirvShader const *spirvShader, vk::PipelineLayout const *pipelineLayout, const vk::DescriptorSet::Bindings &descriptorSets);
-
-		virtual ~ComputeProgram();
-
-		// generate builds the shader program.
-		void generate();
-
-		// run executes the compute shader routine for all workgroups.
-		void run(
-			vk::DescriptorSet::Bindings const &descriptorSetBindings,
-			vk::DescriptorSet::DynamicOffsets const &descriptorDynamicOffsets,
-			PushConstantStorage const &pushConstants,
-			uint32_t baseGroupX, uint32_t baseGroupY, uint32_t baseGroupZ,
-			uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ);
-
-	protected:
-		void emit(SpirvRoutine* routine);
-		void setWorkgroupBuiltins(Pointer<Byte> data, SpirvRoutine* routine, Int workgroupID[3]);
-		void setSubgroupBuiltins(Pointer<Byte> data, SpirvRoutine* routine, Int workgroupID[3], SIMD::Int localInvocationIndex, Int subgroupIndex);
-
-		struct Data
-		{
-			vk::DescriptorSet::Bindings descriptorSets;
-			vk::DescriptorSet::DynamicOffsets descriptorDynamicOffsets;
-			uint4 numWorkgroups; // [x, y, z, 0]
-			uint4 workgroupSize; // [x, y, z, 0]
-			uint32_t invocationsPerSubgroup; // SPIR-V: "SubgroupSize"
-			uint32_t subgroupsPerWorkgroup; // SPIR-V: "NumSubgroups"
-			uint32_t invocationsPerWorkgroup; // Total number of invocations per workgroup.
-			PushConstantStorage pushConstants;
-			const Constants *constants;
-		};
-
-		SpirvShader const * const shader;
-		vk::PipelineLayout const * const pipelineLayout;
-		const vk::DescriptorSet::Bindings &descriptorSets;
+		vk::DescriptorSet::Bindings descriptorSets;
+		vk::DescriptorSet::DynamicOffsets descriptorDynamicOffsets;
+		uint4 numWorkgroups; // [x, y, z, 0]
+		uint4 workgroupSize; // [x, y, z, 0]
+		uint32_t invocationsPerSubgroup; // SPIR-V: "SubgroupSize"
+		uint32_t subgroupsPerWorkgroup; // SPIR-V: "NumSubgroups"
+		uint32_t invocationsPerWorkgroup; // Total number of invocations per workgroup.
+		PushConstantStorage pushConstants;
+		const Constants *constants;
 	};
 
-} // namespace sw
+	SpirvShader const * const shader;
+	vk::PipelineLayout const * const pipelineLayout;
+	const vk::DescriptorSet::Bindings &descriptorSets;
+};
+
+}  // namespace sw
 
 #endif   // sw_ComputeProgram_hpp
diff --git a/src/Pipeline/Constants.cpp b/src/Pipeline/Constants.cpp
index d8595b5..eba5eb8 100644
--- a/src/Pipeline/Constants.cpp
+++ b/src/Pipeline/Constants.cpp
@@ -19,359 +19,360 @@
 
 #include <cstring>
 
-namespace sw
+namespace sw {
+
+Constants constants;
+
+Constants::Constants()
 {
-	Constants constants;
-
-	Constants::Constants()
+	static const unsigned int transposeBit0[16] =
 	{
-		static const unsigned int transposeBit0[16] =
+		0x00000000,
+		0x00000001,
+		0x00000010,
+		0x00000011,
+		0x00000100,
+		0x00000101,
+		0x00000110,
+		0x00000111,
+		0x00001000,
+		0x00001001,
+		0x00001010,
+		0x00001011,
+		0x00001100,
+		0x00001101,
+		0x00001110,
+		0x00001111
+	};
+
+	static const unsigned int transposeBit1[16] =
+	{
+		0x00000000,
+		0x00000002,
+		0x00000020,
+		0x00000022,
+		0x00000200,
+		0x00000202,
+		0x00000220,
+		0x00000222,
+		0x00002000,
+		0x00002002,
+		0x00002020,
+		0x00002022,
+		0x00002200,
+		0x00002202,
+		0x00002220,
+		0x00002222
+	};
+
+	static const unsigned int transposeBit2[16] =
+	{
+		0x00000000,
+		0x00000004,
+		0x00000040,
+		0x00000044,
+		0x00000400,
+		0x00000404,
+		0x00000440,
+		0x00000444,
+		0x00004000,
+		0x00004004,
+		0x00004040,
+		0x00004044,
+		0x00004400,
+		0x00004404,
+		0x00004440,
+		0x00004444
+	};
+
+	memcpy(&this->transposeBit0, transposeBit0, sizeof(transposeBit0));
+	memcpy(&this->transposeBit1, transposeBit1, sizeof(transposeBit1));
+	memcpy(&this->transposeBit2, transposeBit2, sizeof(transposeBit2));
+
+	static const ushort4 cWeight[17] =
+	{
+		{0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF},   // 0xFFFF / 1  = 0xFFFF
+		{0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF},   // 0xFFFF / 1  = 0xFFFF
+		{0x8000, 0x8000, 0x8000, 0x8000},   // 0xFFFF / 2  = 0x8000
+		{0x5555, 0x5555, 0x5555, 0x5555},   // 0xFFFF / 3  = 0x5555
+		{0x4000, 0x4000, 0x4000, 0x4000},   // 0xFFFF / 4  = 0x4000
+		{0x3333, 0x3333, 0x3333, 0x3333},   // 0xFFFF / 5  = 0x3333
+		{0x2AAA, 0x2AAA, 0x2AAA, 0x2AAA},   // 0xFFFF / 6  = 0x2AAA
+		{0x2492, 0x2492, 0x2492, 0x2492},   // 0xFFFF / 7  = 0x2492
+		{0x2000, 0x2000, 0x2000, 0x2000},   // 0xFFFF / 8  = 0x2000
+		{0x1C71, 0x1C71, 0x1C71, 0x1C71},   // 0xFFFF / 9  = 0x1C71
+		{0x1999, 0x1999, 0x1999, 0x1999},   // 0xFFFF / 10 = 0x1999
+		{0x1745, 0x1745, 0x1745, 0x1745},   // 0xFFFF / 11 = 0x1745
+		{0x1555, 0x1555, 0x1555, 0x1555},   // 0xFFFF / 12 = 0x1555
+		{0x13B1, 0x13B1, 0x13B1, 0x13B1},   // 0xFFFF / 13 = 0x13B1
+		{0x1249, 0x1249, 0x1249, 0x1249},   // 0xFFFF / 14 = 0x1249
+		{0x1111, 0x1111, 0x1111, 0x1111},   // 0xFFFF / 15 = 0x1111
+		{0x1000, 0x1000, 0x1000, 0x1000},   // 0xFFFF / 16 = 0x1000
+	};
+
+	static const float4 uvWeight[17] =
+	{
+		{1.0f / 1.0f,  1.0f / 1.0f,  1.0f / 1.0f,  1.0f / 1.0f},
+		{1.0f / 1.0f,  1.0f / 1.0f,  1.0f / 1.0f,  1.0f / 1.0f},
+		{1.0f / 2.0f,  1.0f / 2.0f,  1.0f / 2.0f,  1.0f / 2.0f},
+		{1.0f / 3.0f,  1.0f / 3.0f,  1.0f / 3.0f,  1.0f / 3.0f},
+		{1.0f / 4.0f,  1.0f / 4.0f,  1.0f / 4.0f,  1.0f / 4.0f},
+		{1.0f / 5.0f,  1.0f / 5.0f,  1.0f / 5.0f,  1.0f / 5.0f},
+		{1.0f / 6.0f,  1.0f / 6.0f,  1.0f / 6.0f,  1.0f / 6.0f},
+		{1.0f / 7.0f,  1.0f / 7.0f,  1.0f / 7.0f,  1.0f / 7.0f},
+		{1.0f / 8.0f,  1.0f / 8.0f,  1.0f / 8.0f,  1.0f / 8.0f},
+		{1.0f / 9.0f,  1.0f / 9.0f,  1.0f / 9.0f,  1.0f / 9.0f},
+		{1.0f / 10.0f, 1.0f / 10.0f, 1.0f / 10.0f, 1.0f / 10.0f},
+		{1.0f / 11.0f, 1.0f / 11.0f, 1.0f / 11.0f, 1.0f / 11.0f},
+		{1.0f / 12.0f, 1.0f / 12.0f, 1.0f / 12.0f, 1.0f / 12.0f},
+		{1.0f / 13.0f, 1.0f / 13.0f, 1.0f / 13.0f, 1.0f / 13.0f},
+		{1.0f / 14.0f, 1.0f / 14.0f, 1.0f / 14.0f, 1.0f / 14.0f},
+		{1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f},
+		{1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f},
+	};
+
+	static const float4 uvStart[17] =
+	{
+		{-0.0f / 2.0f,   -0.0f / 2.0f,   -0.0f / 2.0f,   -0.0f / 2.0f},
+		{-0.0f / 2.0f,   -0.0f / 2.0f,   -0.0f / 2.0f,   -0.0f / 2.0f},
+		{-1.0f / 4.0f,   -1.0f / 4.0f,   -1.0f / 4.0f,   -1.0f / 4.0f},
+		{-2.0f / 6.0f,   -2.0f / 6.0f,   -2.0f / 6.0f,   -2.0f / 6.0f},
+		{-3.0f / 8.0f,   -3.0f / 8.0f,   -3.0f / 8.0f,   -3.0f / 8.0f},
+		{-4.0f / 10.0f,  -4.0f / 10.0f,  -4.0f / 10.0f,  -4.0f / 10.0f},
+		{-5.0f / 12.0f,  -5.0f / 12.0f,  -5.0f / 12.0f,  -5.0f / 12.0f},
+		{-6.0f / 14.0f,  -6.0f / 14.0f,  -6.0f / 14.0f,  -6.0f / 14.0f},
+		{-7.0f / 16.0f,  -7.0f / 16.0f,  -7.0f / 16.0f,  -7.0f / 16.0f},
+		{-8.0f / 18.0f,  -8.0f / 18.0f,  -8.0f / 18.0f,  -8.0f / 18.0f},
+		{-9.0f / 20.0f,  -9.0f / 20.0f,  -9.0f / 20.0f,  -9.0f / 20.0f},
+		{-10.0f / 22.0f, -10.0f / 22.0f, -10.0f / 22.0f, -10.0f / 22.0f},
+		{-11.0f / 24.0f, -11.0f / 24.0f, -11.0f / 24.0f, -11.0f / 24.0f},
+		{-12.0f / 26.0f, -12.0f / 26.0f, -12.0f / 26.0f, -12.0f / 26.0f},
+		{-13.0f / 28.0f, -13.0f / 28.0f, -13.0f / 28.0f, -13.0f / 28.0f},
+		{-14.0f / 30.0f, -14.0f / 30.0f, -14.0f / 30.0f, -14.0f / 30.0f},
+		{-15.0f / 32.0f, -15.0f / 32.0f, -15.0f / 32.0f, -15.0f / 32.0f},
+	};
+
+	memcpy(&this->cWeight, cWeight, sizeof(cWeight));
+	memcpy(&this->uvWeight, uvWeight, sizeof(uvWeight));
+	memcpy(&this->uvStart, uvStart, sizeof(uvStart));
+
+	static const unsigned int occlusionCount[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
+
+	memcpy(&this->occlusionCount, &occlusionCount, sizeof(occlusionCount));
+
+	for(int i = 0; i < 16; i++)
+	{
+		maskB4Q[i][0] = -(i >> 0 & 1);
+		maskB4Q[i][1] = -(i >> 1 & 1);
+		maskB4Q[i][2] = -(i >> 2 & 1);
+		maskB4Q[i][3] = -(i >> 3 & 1);
+		maskB4Q[i][4] = -(i >> 0 & 1);
+		maskB4Q[i][5] = -(i >> 1 & 1);
+		maskB4Q[i][6] = -(i >> 2 & 1);
+		maskB4Q[i][7] = -(i >> 3 & 1);
+
+		invMaskB4Q[i][0] = ~maskB4Q[i][0];
+		invMaskB4Q[i][1] = ~maskB4Q[i][1];
+		invMaskB4Q[i][2] = ~maskB4Q[i][2];
+		invMaskB4Q[i][3] = ~maskB4Q[i][3];
+		invMaskB4Q[i][4] = ~maskB4Q[i][4];
+		invMaskB4Q[i][5] = ~maskB4Q[i][5];
+		invMaskB4Q[i][6] = ~maskB4Q[i][6];
+		invMaskB4Q[i][7] = ~maskB4Q[i][7];
+
+		maskW4Q[i][0] = -(i >> 0 & 1);
+		maskW4Q[i][1] = -(i >> 1 & 1);
+		maskW4Q[i][2] = -(i >> 2 & 1);
+		maskW4Q[i][3] = -(i >> 3 & 1);
+
+		invMaskW4Q[i][0] = ~maskW4Q[i][0];
+		invMaskW4Q[i][1] = ~maskW4Q[i][1];
+		invMaskW4Q[i][2] = ~maskW4Q[i][2];
+		invMaskW4Q[i][3] = ~maskW4Q[i][3];
+
+		maskD4X[i][0] = -(i >> 0 & 1);
+		maskD4X[i][1] = -(i >> 1 & 1);
+		maskD4X[i][2] = -(i >> 2 & 1);
+		maskD4X[i][3] = -(i >> 3 & 1);
+
+		invMaskD4X[i][0] = ~maskD4X[i][0];
+		invMaskD4X[i][1] = ~maskD4X[i][1];
+		invMaskD4X[i][2] = ~maskD4X[i][2];
+		invMaskD4X[i][3] = ~maskD4X[i][3];
+
+		maskQ0Q[i] = -(i >> 0 & 1);
+		maskQ1Q[i] = -(i >> 1 & 1);
+		maskQ2Q[i] = -(i >> 2 & 1);
+		maskQ3Q[i] = -(i >> 3 & 1);
+
+		invMaskQ0Q[i] = ~maskQ0Q[i];
+		invMaskQ1Q[i] = ~maskQ1Q[i];
+		invMaskQ2Q[i] = ~maskQ2Q[i];
+		invMaskQ3Q[i] = ~maskQ3Q[i];
+
+		maskX0X[i][0] = maskX0X[i][1] = maskX0X[i][2] = maskX0X[i][3] = -(i >> 0 & 1);
+		maskX1X[i][0] = maskX1X[i][1] = maskX1X[i][2] = maskX1X[i][3] = -(i >> 1 & 1);
+		maskX2X[i][0] = maskX2X[i][1] = maskX2X[i][2] = maskX2X[i][3] = -(i >> 2 & 1);
+		maskX3X[i][0] = maskX3X[i][1] = maskX3X[i][2] = maskX3X[i][3] = -(i >> 3 & 1);
+
+		invMaskX0X[i][0] = invMaskX0X[i][1] = invMaskX0X[i][2] = invMaskX0X[i][3] = ~maskX0X[i][0];
+		invMaskX1X[i][0] = invMaskX1X[i][1] = invMaskX1X[i][2] = invMaskX1X[i][3] = ~maskX1X[i][0];
+		invMaskX2X[i][0] = invMaskX2X[i][1] = invMaskX2X[i][2] = invMaskX2X[i][3] = ~maskX2X[i][0];
+		invMaskX3X[i][0] = invMaskX3X[i][1] = invMaskX3X[i][2] = invMaskX3X[i][3] = ~maskX3X[i][0];
+
+		maskD01Q[i][0] = -(i >> 0 & 1);
+		maskD01Q[i][1] = -(i >> 1 & 1);
+		maskD23Q[i][0] = -(i >> 2 & 1);
+		maskD23Q[i][1] = -(i >> 3 & 1);
+
+		invMaskD01Q[i][0] = ~maskD01Q[i][0];
+		invMaskD01Q[i][1] = ~maskD01Q[i][1];
+		invMaskD23Q[i][0] = ~maskD23Q[i][0];
+		invMaskD23Q[i][1] = ~maskD23Q[i][1];
+
+		maskQ01X[i][0] = -(i >> 0 & 1);
+		maskQ01X[i][1] = -(i >> 1 & 1);
+		maskQ23X[i][0] = -(i >> 2 & 1);
+		maskQ23X[i][1] = -(i >> 3 & 1);
+
+		invMaskQ01X[i][0] = ~maskQ01X[i][0];
+		invMaskQ01X[i][1] = ~maskQ01X[i][1];
+		invMaskQ23X[i][0] = ~maskQ23X[i][0];
+		invMaskQ23X[i][1] = ~maskQ23X[i][1];
+	}
+
+	for(int i = 0; i < 8; i++)
+	{
+		mask565Q[i][0] =
+		mask565Q[i][1] =
+		mask565Q[i][2] =
+		mask565Q[i][3] = (i & 0x1 ? 0x001F : 0) | (i & 0x2 ? 0x07E0 : 0) | (i & 0x4 ? 0xF800 : 0);
+	}
+
+	for (int i = 0; i < 16; i++)
+	{
+		mask5551Q[i][0] =
+		mask5551Q[i][1] =
+		mask5551Q[i][2] =
+		mask5551Q[i][3] = (i & 0x1 ? 0x001F : 0) | (i & 0x2 ? 0x03E0 : 0) | (i & 0x4 ? 0x7C00 : 0) | (i & 8 ? 0x8000 : 0);
+	}
+
+	for(int i = 0; i < 4; i++)
+	{
+		maskW01Q[i][0] =  -(i >> 0 & 1);
+		maskW01Q[i][1] =  -(i >> 1 & 1);
+		maskW01Q[i][2] =  -(i >> 0 & 1);
+		maskW01Q[i][3] =  -(i >> 1 & 1);
+
+		maskD01X[i][0] =  -(i >> 0 & 1);
+		maskD01X[i][1] =  -(i >> 1 & 1);
+		maskD01X[i][2] =  -(i >> 0 & 1);
+		maskD01X[i][3] =  -(i >> 1 & 1);
+	}
+
+	for (int i = 0; i < 16; i++)
+	{
+		mask10Q[i][0] = mask10Q[i][1] =
+				(i & 0x1 ? 0x3FF : 0) |
+				(i & 0x2 ? 0xFFC00 : 0) |
+				(i & 0x4 ? 0x3FF00000 : 0) |
+				(i & 0x8 ? 0xC0000000 : 0);
+	}
+
+	for(int i = 0; i < 256; i++)
+	{
+		sRGBtoLinear8_16[i] = (unsigned short)(sw::sRGBtoLinear((float)i / 0xFF) * 0xFFFF + 0.5f);
+	}
+
+	for(int i = 0; i < 0x1000; i++)
+	{
+		linearToSRGB12_16[i] = (unsigned short)(clamp(sw::linearToSRGB((float)i / 0x0FFF) * 0xFFFF + 0.5f, 0.0f, (float)0xFFFF));
+		sRGBtoLinear12_16[i] = (unsigned short)(clamp(sw::sRGBtoLinear((float)i / 0x0FFF) * 0xFFFF + 0.5f, 0.0f, (float)0xFFFF));
+	}
+
+	constexpr float4 X[4] = {
+		sw::replicate(SampleLocationsX[0]),
+		sw::replicate(SampleLocationsX[1]),
+		sw::replicate(SampleLocationsX[2]),
+		sw::replicate(SampleLocationsX[3]),
+	};
+
+	constexpr float4 Y[4] = {
+		sw::replicate(SampleLocationsY[0]),
+		sw::replicate(SampleLocationsY[1]),
+		sw::replicate(SampleLocationsY[2]),
+		sw::replicate(SampleLocationsY[3]),
+	};
+
+	for(int q = 0; q < 4; q++)
+	{
+		for(int c = 0; c < 16; c++)
 		{
-			0x00000000,
-			0x00000001,
-			0x00000010,
-			0x00000011,
-			0x00000100,
-			0x00000101,
-			0x00000110,
-			0x00000111,
-			0x00001000,
-			0x00001001,
-			0x00001010,
-			0x00001011,
-			0x00001100,
-			0x00001101,
-			0x00001110,
-			0x00001111
-		};
-
-		static const unsigned int transposeBit1[16] =
-		{
-			0x00000000,
-			0x00000002,
-			0x00000020,
-			0x00000022,
-			0x00000200,
-			0x00000202,
-			0x00000220,
-			0x00000222,
-			0x00002000,
-			0x00002002,
-			0x00002020,
-			0x00002022,
-			0x00002200,
-			0x00002202,
-			0x00002220,
-			0x00002222
-		};
-
-		static const unsigned int transposeBit2[16] =
-		{
-			0x00000000,
-			0x00000004,
-			0x00000040,
-			0x00000044,
-			0x00000400,
-			0x00000404,
-			0x00000440,
-			0x00000444,
-			0x00004000,
-			0x00004004,
-			0x00004040,
-			0x00004044,
-			0x00004400,
-			0x00004404,
-			0x00004440,
-			0x00004444
-		};
-
-		memcpy(&this->transposeBit0, transposeBit0, sizeof(transposeBit0));
-		memcpy(&this->transposeBit1, transposeBit1, sizeof(transposeBit1));
-		memcpy(&this->transposeBit2, transposeBit2, sizeof(transposeBit2));
-
-		static const ushort4 cWeight[17] =
-		{
-			{0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF},   // 0xFFFF / 1  = 0xFFFF
-			{0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF},   // 0xFFFF / 1  = 0xFFFF
-			{0x8000, 0x8000, 0x8000, 0x8000},   // 0xFFFF / 2  = 0x8000
-			{0x5555, 0x5555, 0x5555, 0x5555},   // 0xFFFF / 3  = 0x5555
-			{0x4000, 0x4000, 0x4000, 0x4000},   // 0xFFFF / 4  = 0x4000
-			{0x3333, 0x3333, 0x3333, 0x3333},   // 0xFFFF / 5  = 0x3333
-			{0x2AAA, 0x2AAA, 0x2AAA, 0x2AAA},   // 0xFFFF / 6  = 0x2AAA
-			{0x2492, 0x2492, 0x2492, 0x2492},   // 0xFFFF / 7  = 0x2492
-			{0x2000, 0x2000, 0x2000, 0x2000},   // 0xFFFF / 8  = 0x2000
-			{0x1C71, 0x1C71, 0x1C71, 0x1C71},   // 0xFFFF / 9  = 0x1C71
-			{0x1999, 0x1999, 0x1999, 0x1999},   // 0xFFFF / 10 = 0x1999
-			{0x1745, 0x1745, 0x1745, 0x1745},   // 0xFFFF / 11 = 0x1745
-			{0x1555, 0x1555, 0x1555, 0x1555},   // 0xFFFF / 12 = 0x1555
-			{0x13B1, 0x13B1, 0x13B1, 0x13B1},   // 0xFFFF / 13 = 0x13B1
-			{0x1249, 0x1249, 0x1249, 0x1249},   // 0xFFFF / 14 = 0x1249
-			{0x1111, 0x1111, 0x1111, 0x1111},   // 0xFFFF / 15 = 0x1111
-			{0x1000, 0x1000, 0x1000, 0x1000},   // 0xFFFF / 16 = 0x1000
-		};
-
-		static const float4 uvWeight[17] =
-		{
-			{1.0f / 1.0f,  1.0f / 1.0f,  1.0f / 1.0f,  1.0f / 1.0f},
-			{1.0f / 1.0f,  1.0f / 1.0f,  1.0f / 1.0f,  1.0f / 1.0f},
-			{1.0f / 2.0f,  1.0f / 2.0f,  1.0f / 2.0f,  1.0f / 2.0f},
-			{1.0f / 3.0f,  1.0f / 3.0f,  1.0f / 3.0f,  1.0f / 3.0f},
-			{1.0f / 4.0f,  1.0f / 4.0f,  1.0f / 4.0f,  1.0f / 4.0f},
-			{1.0f / 5.0f,  1.0f / 5.0f,  1.0f / 5.0f,  1.0f / 5.0f},
-			{1.0f / 6.0f,  1.0f / 6.0f,  1.0f / 6.0f,  1.0f / 6.0f},
-			{1.0f / 7.0f,  1.0f / 7.0f,  1.0f / 7.0f,  1.0f / 7.0f},
-			{1.0f / 8.0f,  1.0f / 8.0f,  1.0f / 8.0f,  1.0f / 8.0f},
-			{1.0f / 9.0f,  1.0f / 9.0f,  1.0f / 9.0f,  1.0f / 9.0f},
-			{1.0f / 10.0f, 1.0f / 10.0f, 1.0f / 10.0f, 1.0f / 10.0f},
-			{1.0f / 11.0f, 1.0f / 11.0f, 1.0f / 11.0f, 1.0f / 11.0f},
-			{1.0f / 12.0f, 1.0f / 12.0f, 1.0f / 12.0f, 1.0f / 12.0f},
-			{1.0f / 13.0f, 1.0f / 13.0f, 1.0f / 13.0f, 1.0f / 13.0f},
-			{1.0f / 14.0f, 1.0f / 14.0f, 1.0f / 14.0f, 1.0f / 14.0f},
-			{1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f},
-			{1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f},
-		};
-
-		static const float4 uvStart[17] =
-		{
-			{-0.0f / 2.0f,   -0.0f / 2.0f,   -0.0f / 2.0f,   -0.0f / 2.0f},
-			{-0.0f / 2.0f,   -0.0f / 2.0f,   -0.0f / 2.0f,   -0.0f / 2.0f},
-			{-1.0f / 4.0f,   -1.0f / 4.0f,   -1.0f / 4.0f,   -1.0f / 4.0f},
-			{-2.0f / 6.0f,   -2.0f / 6.0f,   -2.0f / 6.0f,   -2.0f / 6.0f},
-			{-3.0f / 8.0f,   -3.0f / 8.0f,   -3.0f / 8.0f,   -3.0f / 8.0f},
-			{-4.0f / 10.0f,  -4.0f / 10.0f,  -4.0f / 10.0f,  -4.0f / 10.0f},
-			{-5.0f / 12.0f,  -5.0f / 12.0f,  -5.0f / 12.0f,  -5.0f / 12.0f},
-			{-6.0f / 14.0f,  -6.0f / 14.0f,  -6.0f / 14.0f,  -6.0f / 14.0f},
-			{-7.0f / 16.0f,  -7.0f / 16.0f,  -7.0f / 16.0f,  -7.0f / 16.0f},
-			{-8.0f / 18.0f,  -8.0f / 18.0f,  -8.0f / 18.0f,  -8.0f / 18.0f},
-			{-9.0f / 20.0f,  -9.0f / 20.0f,  -9.0f / 20.0f,  -9.0f / 20.0f},
-			{-10.0f / 22.0f, -10.0f / 22.0f, -10.0f / 22.0f, -10.0f / 22.0f},
-			{-11.0f / 24.0f, -11.0f / 24.0f, -11.0f / 24.0f, -11.0f / 24.0f},
-			{-12.0f / 26.0f, -12.0f / 26.0f, -12.0f / 26.0f, -12.0f / 26.0f},
-			{-13.0f / 28.0f, -13.0f / 28.0f, -13.0f / 28.0f, -13.0f / 28.0f},
-			{-14.0f / 30.0f, -14.0f / 30.0f, -14.0f / 30.0f, -14.0f / 30.0f},
-			{-15.0f / 32.0f, -15.0f / 32.0f, -15.0f / 32.0f, -15.0f / 32.0f},
-		};
-
-		memcpy(&this->cWeight, cWeight, sizeof(cWeight));
-		memcpy(&this->uvWeight, uvWeight, sizeof(uvWeight));
-		memcpy(&this->uvStart, uvStart, sizeof(uvStart));
-
-		static const unsigned int occlusionCount[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
-
-		memcpy(&this->occlusionCount, &occlusionCount, sizeof(occlusionCount));
-
-		for(int i = 0; i < 16; i++)
-		{
-			maskB4Q[i][0] = -(i >> 0 & 1);
-			maskB4Q[i][1] = -(i >> 1 & 1);
-			maskB4Q[i][2] = -(i >> 2 & 1);
-			maskB4Q[i][3] = -(i >> 3 & 1);
-			maskB4Q[i][4] = -(i >> 0 & 1);
-			maskB4Q[i][5] = -(i >> 1 & 1);
-			maskB4Q[i][6] = -(i >> 2 & 1);
-			maskB4Q[i][7] = -(i >> 3 & 1);
-
-			invMaskB4Q[i][0] = ~maskB4Q[i][0];
-			invMaskB4Q[i][1] = ~maskB4Q[i][1];
-			invMaskB4Q[i][2] = ~maskB4Q[i][2];
-			invMaskB4Q[i][3] = ~maskB4Q[i][3];
-			invMaskB4Q[i][4] = ~maskB4Q[i][4];
-			invMaskB4Q[i][5] = ~maskB4Q[i][5];
-			invMaskB4Q[i][6] = ~maskB4Q[i][6];
-			invMaskB4Q[i][7] = ~maskB4Q[i][7];
-
-			maskW4Q[i][0] = -(i >> 0 & 1);
-			maskW4Q[i][1] = -(i >> 1 & 1);
-			maskW4Q[i][2] = -(i >> 2 & 1);
-			maskW4Q[i][3] = -(i >> 3 & 1);
-
-			invMaskW4Q[i][0] = ~maskW4Q[i][0];
-			invMaskW4Q[i][1] = ~maskW4Q[i][1];
-			invMaskW4Q[i][2] = ~maskW4Q[i][2];
-			invMaskW4Q[i][3] = ~maskW4Q[i][3];
-
-			maskD4X[i][0] = -(i >> 0 & 1);
-			maskD4X[i][1] = -(i >> 1 & 1);
-			maskD4X[i][2] = -(i >> 2 & 1);
-			maskD4X[i][3] = -(i >> 3 & 1);
-
-			invMaskD4X[i][0] = ~maskD4X[i][0];
-			invMaskD4X[i][1] = ~maskD4X[i][1];
-			invMaskD4X[i][2] = ~maskD4X[i][2];
-			invMaskD4X[i][3] = ~maskD4X[i][3];
-
-			maskQ0Q[i] = -(i >> 0 & 1);
-			maskQ1Q[i] = -(i >> 1 & 1);
-			maskQ2Q[i] = -(i >> 2 & 1);
-			maskQ3Q[i] = -(i >> 3 & 1);
-
-			invMaskQ0Q[i] = ~maskQ0Q[i];
-			invMaskQ1Q[i] = ~maskQ1Q[i];
-			invMaskQ2Q[i] = ~maskQ2Q[i];
-			invMaskQ3Q[i] = ~maskQ3Q[i];
-
-			maskX0X[i][0] = maskX0X[i][1] = maskX0X[i][2] = maskX0X[i][3] = -(i >> 0 & 1);
-			maskX1X[i][0] = maskX1X[i][1] = maskX1X[i][2] = maskX1X[i][3] = -(i >> 1 & 1);
-			maskX2X[i][0] = maskX2X[i][1] = maskX2X[i][2] = maskX2X[i][3] = -(i >> 2 & 1);
-			maskX3X[i][0] = maskX3X[i][1] = maskX3X[i][2] = maskX3X[i][3] = -(i >> 3 & 1);
-
-			invMaskX0X[i][0] = invMaskX0X[i][1] = invMaskX0X[i][2] = invMaskX0X[i][3] = ~maskX0X[i][0];
-			invMaskX1X[i][0] = invMaskX1X[i][1] = invMaskX1X[i][2] = invMaskX1X[i][3] = ~maskX1X[i][0];
-			invMaskX2X[i][0] = invMaskX2X[i][1] = invMaskX2X[i][2] = invMaskX2X[i][3] = ~maskX2X[i][0];
-			invMaskX3X[i][0] = invMaskX3X[i][1] = invMaskX3X[i][2] = invMaskX3X[i][3] = ~maskX3X[i][0];
-
-			maskD01Q[i][0] = -(i >> 0 & 1);
-			maskD01Q[i][1] = -(i >> 1 & 1);
-			maskD23Q[i][0] = -(i >> 2 & 1);
-			maskD23Q[i][1] = -(i >> 3 & 1);
-
-			invMaskD01Q[i][0] = ~maskD01Q[i][0];
-			invMaskD01Q[i][1] = ~maskD01Q[i][1];
-			invMaskD23Q[i][0] = ~maskD23Q[i][0];
-			invMaskD23Q[i][1] = ~maskD23Q[i][1];
-
-			maskQ01X[i][0] = -(i >> 0 & 1);
-			maskQ01X[i][1] = -(i >> 1 & 1);
-			maskQ23X[i][0] = -(i >> 2 & 1);
-			maskQ23X[i][1] = -(i >> 3 & 1);
-
-			invMaskQ01X[i][0] = ~maskQ01X[i][0];
-			invMaskQ01X[i][1] = ~maskQ01X[i][1];
-			invMaskQ23X[i][0] = ~maskQ23X[i][0];
-			invMaskQ23X[i][1] = ~maskQ23X[i][1];
-		}
-
-		for(int i = 0; i < 8; i++)
-		{
-			mask565Q[i][0] =
-			mask565Q[i][1] =
-			mask565Q[i][2] =
-			mask565Q[i][3] = (i & 0x1 ? 0x001F : 0) | (i & 0x2 ? 0x07E0 : 0) | (i & 0x4 ? 0xF800 : 0);
-		}
-
-		for (int i = 0; i < 16; i++)
-		{
-			mask5551Q[i][0] =
-			mask5551Q[i][1] =
-			mask5551Q[i][2] =
-			mask5551Q[i][3] = (i & 0x1 ? 0x001F : 0) | (i & 0x2 ? 0x03E0 : 0) | (i & 0x4 ? 0x7C00 : 0) | (i & 8 ? 0x8000 : 0);
-		}
-
-		for(int i = 0; i < 4; i++)
-		{
-			maskW01Q[i][0] =  -(i >> 0 & 1);
-			maskW01Q[i][1] =  -(i >> 1 & 1);
-			maskW01Q[i][2] =  -(i >> 0 & 1);
-			maskW01Q[i][3] =  -(i >> 1 & 1);
-
-			maskD01X[i][0] =  -(i >> 0 & 1);
-			maskD01X[i][1] =  -(i >> 1 & 1);
-			maskD01X[i][2] =  -(i >> 0 & 1);
-			maskD01X[i][3] =  -(i >> 1 & 1);
-		}
-
-		for (int i = 0; i < 16; i++)
-		{
-			mask10Q[i][0] = mask10Q[i][1] =
-					(i & 0x1 ? 0x3FF : 0) |
-					(i & 0x2 ? 0xFFC00 : 0) |
-					(i & 0x4 ? 0x3FF00000 : 0) |
-					(i & 0x8 ? 0xC0000000 : 0);
-		}
-
-		for(int i = 0; i < 256; i++)
-		{
-			sRGBtoLinear8_16[i] = (unsigned short)(sw::sRGBtoLinear((float)i / 0xFF) * 0xFFFF + 0.5f);
-		}
-
-		for(int i = 0; i < 0x1000; i++)
-		{
-			linearToSRGB12_16[i] = (unsigned short)(clamp(sw::linearToSRGB((float)i / 0x0FFF) * 0xFFFF + 0.5f, 0.0f, (float)0xFFFF));
-			sRGBtoLinear12_16[i] = (unsigned short)(clamp(sw::sRGBtoLinear((float)i / 0x0FFF) * 0xFFFF + 0.5f, 0.0f, (float)0xFFFF));
-		}
-
-		constexpr float4 X[4] = {
-			sw::replicate(SampleLocationsX[0]),
-			sw::replicate(SampleLocationsX[1]),
-			sw::replicate(SampleLocationsX[2]),
-			sw::replicate(SampleLocationsX[3]),
-		};
-
-		constexpr float4 Y[4] = {
-			sw::replicate(SampleLocationsY[0]),
-			sw::replicate(SampleLocationsY[1]),
-			sw::replicate(SampleLocationsY[2]),
-			sw::replicate(SampleLocationsY[3]),
-		};
-
-		for(int q = 0; q < 4; q++)
-		{
-			for(int c = 0; c < 16; c++)
+			for(int i = 0; i < 4; i++)
 			{
-				for(int i = 0; i < 4; i++)
-				{
-					// Reorder sample points for centroid computation
-					const float Xs[4] = { X[1][0], X[2][0], X[0][0], X[3][0] };
-					const float Ys[4] = { Y[1][0], Y[2][0], Y[0][0], Y[3][0] };
+				// Reorder sample points for centroid computation
+				const float Xs[4] = { X[1][0], X[2][0], X[0][0], X[3][0] };
+				const float Ys[4] = { Y[1][0], Y[2][0], Y[0][0], Y[3][0] };
 
-					sampleX[q][c][i] = c & (1 << i) ? Xs[q] : 0.0f;
-					sampleY[q][c][i] = c & (1 << i) ? Ys[q] : 0.0f;
-					weight[c][i] = c & (1 << i) ? 1.0f : 0.0f;
-				}
+				sampleX[q][c][i] = c & (1 << i) ? Xs[q] : 0.0f;
+				sampleY[q][c][i] = c & (1 << i) ? Ys[q] : 0.0f;
+				weight[c][i] = c & (1 << i) ? 1.0f : 0.0f;
 			}
 		}
-
-		constexpr auto subPixB = vk::SUBPIXEL_PRECISION_BITS;
-
-		// Reorder sample points for fragment offset computation
-		const int Xf[4] = { toFixedPoint(X[2][0], subPixB), toFixedPoint(X[1][0], subPixB), toFixedPoint(X[3][0], subPixB), toFixedPoint(X[0][0], subPixB) };
-		const int Yf[4] = { toFixedPoint(Y[2][0], subPixB), toFixedPoint(Y[1][0], subPixB), toFixedPoint(Y[3][0], subPixB), toFixedPoint(Y[0][0], subPixB) };
-
-		memcpy(&this->Xf, &Xf, sizeof(Xf));
-		memcpy(&this->Yf, &Yf, sizeof(Yf));
-
-		memcpy(&this->X, &X, sizeof(X));
-		memcpy(&this->Y, &Y, sizeof(Y));
-
-		const dword maxX[16] = {0x00000000, 0x00000001, 0x00000100, 0x00000101, 0x00010000, 0x00010001, 0x00010100, 0x00010101, 0x01000000, 0x01000001, 0x01000100, 0x01000101, 0x01010000, 0x01010001, 0x01010100, 0x01010101};
-		const dword maxY[16] = {0x00000000, 0x00000002, 0x00000200, 0x00000202, 0x00020000, 0x00020002, 0x00020200, 0x00020202, 0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02020000, 0x02020002, 0x02020200, 0x02020202};
-		const dword maxZ[16] = {0x00000000, 0x00000004, 0x00000400, 0x00000404, 0x00040000, 0x00040004, 0x00040400, 0x00040404, 0x04000000, 0x04000004, 0x04000400, 0x04000404, 0x04040000, 0x04040004, 0x04040400, 0x04040404};
-		const dword minX[16] = {0x00000000, 0x00000008, 0x00000800, 0x00000808, 0x00080000, 0x00080008, 0x00080800, 0x00080808, 0x08000000, 0x08000008, 0x08000800, 0x08000808, 0x08080000, 0x08080008, 0x08080800, 0x08080808};
-		const dword minY[16] = {0x00000000, 0x00000010, 0x00001000, 0x00001010, 0x00100000, 0x00100010, 0x00101000, 0x00101010, 0x10000000, 0x10000010, 0x10001000, 0x10001010, 0x10100000, 0x10100010, 0x10101000, 0x10101010};
-		const dword minZ[16] = {0x00000000, 0x00000020, 0x00002000, 0x00002020, 0x00200000, 0x00200020, 0x00202000, 0x00202020, 0x20000000, 0x20000020, 0x20002000, 0x20002020, 0x20200000, 0x20200020, 0x20202000, 0x20202020};
-		const dword fini[16] = {0x00000000, 0x00000080, 0x00008000, 0x00008080, 0x00800000, 0x00800080, 0x00808000, 0x00808080, 0x80000000, 0x80000080, 0x80008000, 0x80008080, 0x80800000, 0x80800080, 0x80808000, 0x80808080};
-
-		memcpy(&this->maxX, &maxX, sizeof(maxX));
-		memcpy(&this->maxY, &maxY, sizeof(maxY));
-		memcpy(&this->maxZ, &maxZ, sizeof(maxZ));
-		memcpy(&this->minX, &minX, sizeof(minX));
-		memcpy(&this->minY, &minY, sizeof(minY));
-		memcpy(&this->minZ, &minZ, sizeof(minZ));
-		memcpy(&this->fini, &fini, sizeof(fini));
-
-		static const dword4 maxPos = {0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFE};
-
-		memcpy(&this->maxPos, &maxPos, sizeof(maxPos));
-
-		static const float4 unscaleByte = {1.0f / 0xFF, 1.0f / 0xFF, 1.0f / 0xFF, 1.0f / 0xFF};
-		static const float4 unscaleSByte = {1.0f / 0x7F, 1.0f / 0x7F, 1.0f / 0x7F, 1.0f / 0x7F};
-		static const float4 unscaleShort = {1.0f / 0x7FFF, 1.0f / 0x7FFF, 1.0f / 0x7FFF, 1.0f / 0x7FFF};
-		static const float4 unscaleUShort = {1.0f / 0xFFFF, 1.0f / 0xFFFF, 1.0f / 0xFFFF, 1.0f / 0xFFFF};
-		static const float4 unscaleInt = {1.0f / 0x7FFFFFFF, 1.0f / 0x7FFFFFFF, 1.0f / 0x7FFFFFFF, 1.0f / 0x7FFFFFFF};
-		static const float4 unscaleUInt = {1.0f / 0xFFFFFFFF, 1.0f / 0xFFFFFFFF, 1.0f / 0xFFFFFFFF, 1.0f / 0xFFFFFFFF};
-		static const float4 unscaleFixed = {1.0f / 0x00010000, 1.0f / 0x00010000, 1.0f / 0x00010000, 1.0f / 0x00010000};
-
-		memcpy(&this->unscaleByte, &unscaleByte, sizeof(unscaleByte));
-		memcpy(&this->unscaleSByte, &unscaleSByte, sizeof(unscaleSByte));
-		memcpy(&this->unscaleShort, &unscaleShort, sizeof(unscaleShort));
-		memcpy(&this->unscaleUShort, &unscaleUShort, sizeof(unscaleUShort));
-		memcpy(&this->unscaleInt, &unscaleInt, sizeof(unscaleInt));
-		memcpy(&this->unscaleUInt, &unscaleUInt, sizeof(unscaleUInt));
-		memcpy(&this->unscaleFixed, &unscaleFixed, sizeof(unscaleFixed));
-
-		for(int i = 0; i <= 0xFFFF; i++)
-		{
-			half2float[i] = (float)reinterpret_cast<half&>(i);
-		}
 	}
-}
\ No newline at end of file
+
+	constexpr auto subPixB = vk::SUBPIXEL_PRECISION_BITS;
+
+	// Reorder sample points for fragment offset computation
+	const int Xf[4] = { toFixedPoint(X[2][0], subPixB), toFixedPoint(X[1][0], subPixB), toFixedPoint(X[3][0], subPixB), toFixedPoint(X[0][0], subPixB) };
+	const int Yf[4] = { toFixedPoint(Y[2][0], subPixB), toFixedPoint(Y[1][0], subPixB), toFixedPoint(Y[3][0], subPixB), toFixedPoint(Y[0][0], subPixB) };
+
+	memcpy(&this->Xf, &Xf, sizeof(Xf));
+	memcpy(&this->Yf, &Yf, sizeof(Yf));
+
+	memcpy(&this->X, &X, sizeof(X));
+	memcpy(&this->Y, &Y, sizeof(Y));
+
+	const dword maxX[16] = {0x00000000, 0x00000001, 0x00000100, 0x00000101, 0x00010000, 0x00010001, 0x00010100, 0x00010101, 0x01000000, 0x01000001, 0x01000100, 0x01000101, 0x01010000, 0x01010001, 0x01010100, 0x01010101};
+	const dword maxY[16] = {0x00000000, 0x00000002, 0x00000200, 0x00000202, 0x00020000, 0x00020002, 0x00020200, 0x00020202, 0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02020000, 0x02020002, 0x02020200, 0x02020202};
+	const dword maxZ[16] = {0x00000000, 0x00000004, 0x00000400, 0x00000404, 0x00040000, 0x00040004, 0x00040400, 0x00040404, 0x04000000, 0x04000004, 0x04000400, 0x04000404, 0x04040000, 0x04040004, 0x04040400, 0x04040404};
+	const dword minX[16] = {0x00000000, 0x00000008, 0x00000800, 0x00000808, 0x00080000, 0x00080008, 0x00080800, 0x00080808, 0x08000000, 0x08000008, 0x08000800, 0x08000808, 0x08080000, 0x08080008, 0x08080800, 0x08080808};
+	const dword minY[16] = {0x00000000, 0x00000010, 0x00001000, 0x00001010, 0x00100000, 0x00100010, 0x00101000, 0x00101010, 0x10000000, 0x10000010, 0x10001000, 0x10001010, 0x10100000, 0x10100010, 0x10101000, 0x10101010};
+	const dword minZ[16] = {0x00000000, 0x00000020, 0x00002000, 0x00002020, 0x00200000, 0x00200020, 0x00202000, 0x00202020, 0x20000000, 0x20000020, 0x20002000, 0x20002020, 0x20200000, 0x20200020, 0x20202000, 0x20202020};
+	const dword fini[16] = {0x00000000, 0x00000080, 0x00008000, 0x00008080, 0x00800000, 0x00800080, 0x00808000, 0x00808080, 0x80000000, 0x80000080, 0x80008000, 0x80008080, 0x80800000, 0x80800080, 0x80808000, 0x80808080};
+
+	memcpy(&this->maxX, &maxX, sizeof(maxX));
+	memcpy(&this->maxY, &maxY, sizeof(maxY));
+	memcpy(&this->maxZ, &maxZ, sizeof(maxZ));
+	memcpy(&this->minX, &minX, sizeof(minX));
+	memcpy(&this->minY, &minY, sizeof(minY));
+	memcpy(&this->minZ, &minZ, sizeof(minZ));
+	memcpy(&this->fini, &fini, sizeof(fini));
+
+	static const dword4 maxPos = {0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFE};
+
+	memcpy(&this->maxPos, &maxPos, sizeof(maxPos));
+
+	static const float4 unscaleByte = {1.0f / 0xFF, 1.0f / 0xFF, 1.0f / 0xFF, 1.0f / 0xFF};
+	static const float4 unscaleSByte = {1.0f / 0x7F, 1.0f / 0x7F, 1.0f / 0x7F, 1.0f / 0x7F};
+	static const float4 unscaleShort = {1.0f / 0x7FFF, 1.0f / 0x7FFF, 1.0f / 0x7FFF, 1.0f / 0x7FFF};
+	static const float4 unscaleUShort = {1.0f / 0xFFFF, 1.0f / 0xFFFF, 1.0f / 0xFFFF, 1.0f / 0xFFFF};
+	static const float4 unscaleInt = {1.0f / 0x7FFFFFFF, 1.0f / 0x7FFFFFFF, 1.0f / 0x7FFFFFFF, 1.0f / 0x7FFFFFFF};
+	static const float4 unscaleUInt = {1.0f / 0xFFFFFFFF, 1.0f / 0xFFFFFFFF, 1.0f / 0xFFFFFFFF, 1.0f / 0xFFFFFFFF};
+	static const float4 unscaleFixed = {1.0f / 0x00010000, 1.0f / 0x00010000, 1.0f / 0x00010000, 1.0f / 0x00010000};
+
+	memcpy(&this->unscaleByte, &unscaleByte, sizeof(unscaleByte));
+	memcpy(&this->unscaleSByte, &unscaleSByte, sizeof(unscaleSByte));
+	memcpy(&this->unscaleShort, &unscaleShort, sizeof(unscaleShort));
+	memcpy(&this->unscaleUShort, &unscaleUShort, sizeof(unscaleUShort));
+	memcpy(&this->unscaleInt, &unscaleInt, sizeof(unscaleInt));
+	memcpy(&this->unscaleUInt, &unscaleUInt, sizeof(unscaleUInt));
+	memcpy(&this->unscaleFixed, &unscaleFixed, sizeof(unscaleFixed));
+
+	for(int i = 0; i <= 0xFFFF; i++)
+	{
+		half2float[i] = (float)reinterpret_cast<half&>(i);
+	}
+}
+
+}  // namespace sw
\ No newline at end of file
diff --git a/src/Pipeline/Constants.hpp b/src/Pipeline/Constants.hpp
index 58c8e10..484682b 100644
--- a/src/Pipeline/Constants.hpp
+++ b/src/Pipeline/Constants.hpp
@@ -19,127 +19,128 @@
 #include "System/Math.hpp"
 #include "Vulkan/VkConfig.h"
 
-namespace sw
+namespace sw {
+
+struct Constants
 {
-	struct Constants
-	{
-		Constants();
+	Constants();
 
-		unsigned int transposeBit0[16];
-		unsigned int transposeBit1[16];
-		unsigned int transposeBit2[16];
+	unsigned int transposeBit0[16];
+	unsigned int transposeBit1[16];
+	unsigned int transposeBit2[16];
 
-		ushort4 cWeight[17];
-		float4 uvWeight[17];
-		float4 uvStart[17];
+	ushort4 cWeight[17];
+	float4 uvWeight[17];
+	float4 uvStart[17];
 
-		unsigned int occlusionCount[16];
+	unsigned int occlusionCount[16];
 
-		byte8 maskB4Q[16];
-		byte8 invMaskB4Q[16];
-		word4 maskW4Q[16];
-		word4 invMaskW4Q[16];
-		dword4 maskD4X[16];
-		dword4 invMaskD4X[16];
-		qword maskQ0Q[16];
-		qword maskQ1Q[16];
-		qword maskQ2Q[16];
-		qword maskQ3Q[16];
-		qword invMaskQ0Q[16];
-		qword invMaskQ1Q[16];
-		qword invMaskQ2Q[16];
-		qword invMaskQ3Q[16];
-		dword4 maskX0X[16];
-		dword4 maskX1X[16];
-		dword4 maskX2X[16];
-		dword4 maskX3X[16];
-		dword4 invMaskX0X[16];
-		dword4 invMaskX1X[16];
-		dword4 invMaskX2X[16];
-		dword4 invMaskX3X[16];
-		dword2 maskD01Q[16];
-		dword2 maskD23Q[16];
-		dword2 invMaskD01Q[16];
-		dword2 invMaskD23Q[16];
-		qword2 maskQ01X[16];
-		qword2 maskQ23X[16];
-		qword2 invMaskQ01X[16];
-		qword2 invMaskQ23X[16];
-		word4 maskW01Q[4];
-		dword4 maskD01X[4];
-		word4 mask565Q[8];
-		dword2 mask10Q[16];		// 4 bit writemask -> A2B10G10R10 bit patterns, replicated 2x
-		word4 mask5551Q[16];	// 4 bit writemask -> A1R5G5B5 bit patterns, replicated 4x
+	byte8 maskB4Q[16];
+	byte8 invMaskB4Q[16];
+	word4 maskW4Q[16];
+	word4 invMaskW4Q[16];
+	dword4 maskD4X[16];
+	dword4 invMaskD4X[16];
+	qword maskQ0Q[16];
+	qword maskQ1Q[16];
+	qword maskQ2Q[16];
+	qword maskQ3Q[16];
+	qword invMaskQ0Q[16];
+	qword invMaskQ1Q[16];
+	qword invMaskQ2Q[16];
+	qword invMaskQ3Q[16];
+	dword4 maskX0X[16];
+	dword4 maskX1X[16];
+	dword4 maskX2X[16];
+	dword4 maskX3X[16];
+	dword4 invMaskX0X[16];
+	dword4 invMaskX1X[16];
+	dword4 invMaskX2X[16];
+	dword4 invMaskX3X[16];
+	dword2 maskD01Q[16];
+	dword2 maskD23Q[16];
+	dword2 invMaskD01Q[16];
+	dword2 invMaskD23Q[16];
+	qword2 maskQ01X[16];
+	qword2 maskQ23X[16];
+	qword2 invMaskQ01X[16];
+	qword2 invMaskQ23X[16];
+	word4 maskW01Q[4];
+	dword4 maskD01X[4];
+	word4 mask565Q[8];
+	dword2 mask10Q[16];		// 4 bit writemask -> A2B10G10R10 bit patterns, replicated 2x
+	word4 mask5551Q[16];	// 4 bit writemask -> A1R5G5B5 bit patterns, replicated 4x
 
-		unsigned short sRGBtoLinear8_16[256];
+	unsigned short sRGBtoLinear8_16[256];
 
-		unsigned short linearToSRGB12_16[4096];
-		unsigned short sRGBtoLinear12_16[4096];
+	unsigned short linearToSRGB12_16[4096];
+	unsigned short sRGBtoLinear12_16[4096];
 
-		// Centroid parameters
-		float4 sampleX[4][16];
-		float4 sampleY[4][16];
-		float4 weight[16];
+	// Centroid parameters
+	float4 sampleX[4][16];
+	float4 sampleY[4][16];
+	float4 weight[16];
 
-		// Fragment offsets
-		int Xf[4];
-		int Yf[4];
+	// Fragment offsets
+	int Xf[4];
+	int Yf[4];
 
-		float4 X[4];
-		float4 Y[4];
+	float4 X[4];
+	float4 Y[4];
 
-		// VK_SAMPLE_COUNT_4_BIT
-		// https://www.khronos.org/registry/vulkan/specs/1.1/html/vkspec.html#primsrast-multisampling
-		static constexpr float VkSampleLocations4[][2] = {
-			{0.375, 0.125},
-			{0.875, 0.375},
-			{0.125, 0.625},
-			{0.625, 0.875},
-		};
-
-		// Vulkan spec sample positions are relative to 0,0 in top left corner, with Y+ going down.
-		// Convert to our space, with 0,0 in center, and Y+ going up.
-		static constexpr float SampleLocationsX[4] = {
-			VkSampleLocations4[0][0] - 0.5f,
-			VkSampleLocations4[1][0] - 0.5f,
-			VkSampleLocations4[2][0] - 0.5f,
-			VkSampleLocations4[3][0] - 0.5f,
-		};
-
-		static constexpr float SampleLocationsY[4] = {
-			-(VkSampleLocations4[0][1] - 0.5f),
-			-(VkSampleLocations4[1][1] - 0.5f),
-			-(VkSampleLocations4[2][1] - 0.5f),
-			-(VkSampleLocations4[3][1] - 0.5f),
-		};
-
-		// Compute the yMin and yMax multisample offsets so that they are just
-		// large enough (+/- max range - epsilon) to include sample points
-		static constexpr int yMinMultiSampleOffset = sw::toFixedPoint(1, vk::SUBPIXEL_PRECISION_BITS) - sw::toFixedPoint(sw::max(SampleLocationsY[0], SampleLocationsY[1], SampleLocationsY[2], SampleLocationsY[3]), vk::SUBPIXEL_PRECISION_BITS) - 1;
-		static constexpr int yMaxMultiSampleOffset = sw::toFixedPoint(1, vk::SUBPIXEL_PRECISION_BITS) + sw::toFixedPoint(sw::max(SampleLocationsY[0], SampleLocationsY[1], SampleLocationsY[2], SampleLocationsY[3]), vk::SUBPIXEL_PRECISION_BITS) - 1;
-
-		dword maxX[16];
-		dword maxY[16];
-		dword maxZ[16];
-		dword minX[16];
-		dword minY[16];
-		dword minZ[16];
-		dword fini[16];
-
-		dword4 maxPos;
-
-		float4 unscaleByte;
-		float4 unscaleSByte;
-		float4 unscaleShort;
-		float4 unscaleUShort;
-		float4 unscaleInt;
-		float4 unscaleUInt;
-		float4 unscaleFixed;
-
-		float half2float[65536];
+	// VK_SAMPLE_COUNT_4_BIT
+	// https://www.khronos.org/registry/vulkan/specs/1.1/html/vkspec.html#primsrast-multisampling
+	static constexpr float VkSampleLocations4[][2] = {
+		{0.375, 0.125},
+		{0.875, 0.375},
+		{0.125, 0.625},
+		{0.625, 0.875},
 	};
 
-	extern Constants constants;
-}
+	// Vulkan spec sample positions are relative to 0,0 in top left corner, with Y+ going down.
+	// Convert to our space, with 0,0 in center, and Y+ going up.
+	static constexpr float SampleLocationsX[4] = {
+		VkSampleLocations4[0][0] - 0.5f,
+		VkSampleLocations4[1][0] - 0.5f,
+		VkSampleLocations4[2][0] - 0.5f,
+		VkSampleLocations4[3][0] - 0.5f,
+	};
+
+	static constexpr float SampleLocationsY[4] = {
+		-(VkSampleLocations4[0][1] - 0.5f),
+		-(VkSampleLocations4[1][1] - 0.5f),
+		-(VkSampleLocations4[2][1] - 0.5f),
+		-(VkSampleLocations4[3][1] - 0.5f),
+	};
+
+	// Compute the yMin and yMax multisample offsets so that they are just
+	// large enough (+/- max range - epsilon) to include sample points
+	static constexpr int yMinMultiSampleOffset = sw::toFixedPoint(1, vk::SUBPIXEL_PRECISION_BITS) - sw::toFixedPoint(sw::max(SampleLocationsY[0], SampleLocationsY[1], SampleLocationsY[2], SampleLocationsY[3]), vk::SUBPIXEL_PRECISION_BITS) - 1;
+	static constexpr int yMaxMultiSampleOffset = sw::toFixedPoint(1, vk::SUBPIXEL_PRECISION_BITS) + sw::toFixedPoint(sw::max(SampleLocationsY[0], SampleLocationsY[1], SampleLocationsY[2], SampleLocationsY[3]), vk::SUBPIXEL_PRECISION_BITS) - 1;
+
+	dword maxX[16];
+	dword maxY[16];
+	dword maxZ[16];
+	dword minX[16];
+	dword minY[16];
+	dword minZ[16];
+	dword fini[16];
+
+	dword4 maxPos;
+
+	float4 unscaleByte;
+	float4 unscaleSByte;
+	float4 unscaleShort;
+	float4 unscaleUShort;
+	float4 unscaleInt;
+	float4 unscaleUInt;
+	float4 unscaleFixed;
+
+	float half2float[65536];
+};
+
+extern Constants constants;
+
+}  // namespace sw
 
 #endif   // sw_Constants_hpp
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index a8a9d5a..59f9cbb 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -18,344 +18,345 @@
 #include "Device/Primitive.hpp"
 #include "Device/Renderer.hpp"
 
-namespace sw
+namespace sw {
+
+// Union all cMask and return it as 4 booleans
+Int4 PixelProgram::maskAny(Int cMask[4]) const
 {
-	// Union all cMask and return it as 4 booleans
-	Int4 PixelProgram::maskAny(Int cMask[4]) const
+	// See if at least 1 sample is used
+	Int maskUnion = cMask[0];
+	for(auto i = 1u; i < state.multiSample; i++)
 	{
-		// See if at least 1 sample is used
-		Int maskUnion = cMask[0];
-		for(auto i = 1u; i < state.multiSample; i++)
-		{
-			maskUnion |= cMask[i];
-		}
+		maskUnion |= cMask[i];
+	}
 
-		// Convert to 4 booleans
+	// Convert to 4 booleans
+	Int4 laneBits = Int4(1, 2, 4, 8);
+	Int4 laneShiftsToMSB = Int4(31, 30, 29, 28);
+	Int4 mask(maskUnion);
+	mask = ((mask & laneBits) << laneShiftsToMSB) >> Int4(31);
+	return mask;
+}
+
+// Union all cMask/sMask/zMask and return it as 4 booleans
+Int4 PixelProgram::maskAny(Int cMask[4], Int sMask[4], Int zMask[4]) const
+{
+	// See if at least 1 sample is used
+	Int maskUnion = cMask[0] & sMask[0] & zMask[0];
+	for(auto i = 1u; i < state.multiSample; i++)
+	{
+		maskUnion |= (cMask[i] & sMask[i] & zMask[i]);
+	}
+
+	// Convert to 4 booleans
+	Int4 laneBits = Int4(1, 2, 4, 8);
+	Int4 laneShiftsToMSB = Int4(31, 30, 29, 28);
+	Int4 mask(maskUnion);
+	mask = ((mask & laneBits) << laneShiftsToMSB) >> Int4(31);
+	return mask;
+}
+
+void PixelProgram::setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w, Int cMask[4])
+{
+	routine.setImmutableInputBuiltins(spirvShader);
+
+	routine.setInputBuiltin(spirvShader, spv::BuiltInViewIndex, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		assert(builtin.SizeInComponents == 1);
+		value[builtin.FirstComponent] = As<Float4>(Int4((*Pointer<Int>(data + OFFSET(DrawData, viewID)))));
+	});
+
+	routine.setInputBuiltin(spirvShader, spv::BuiltInFragCoord, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		assert(builtin.SizeInComponents == 4);
+		value[builtin.FirstComponent+0] = SIMD::Float(Float(x)) + SIMD::Float(0.5f, 1.5f, 0.5f, 1.5f);
+		value[builtin.FirstComponent+1] = SIMD::Float(Float(y)) + SIMD::Float(0.5f, 0.5f, 1.5f, 1.5f);
+		value[builtin.FirstComponent+2] = z[0];	// sample 0
+		value[builtin.FirstComponent+3] = w;
+	});
+
+	routine.setInputBuiltin(spirvShader, spv::BuiltInPointCoord, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		assert(builtin.SizeInComponents == 2);
+		value[builtin.FirstComponent+0] = SIMD::Float(0.5f, 1.5f, 0.5f, 1.5f) +
+			SIMD::Float(Float(x) - (*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordX))));
+		value[builtin.FirstComponent+1] = SIMD::Float(0.5f, 0.5f, 1.5f, 1.5f) +
+			SIMD::Float(Float(y) - (*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordY))));
+	});
+
+	routine.setInputBuiltin(spirvShader, spv::BuiltInSubgroupSize, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		assert(builtin.SizeInComponents == 1);
+		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(SIMD::Width));
+	});
+
+	routine.setInputBuiltin(spirvShader, spv::BuiltInHelperInvocation, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		assert(builtin.SizeInComponents == 1);
+		value[builtin.FirstComponent] = As<SIMD::Float>(~maskAny(cMask));
+	});
+
+	routine.windowSpacePosition[0] = x + SIMD::Int(0,1,0,1);
+	routine.windowSpacePosition[1] = y + SIMD::Int(0,0,1,1);
+	routine.viewID = *Pointer<Int>(data + OFFSET(DrawData, viewID));
+}
+
+void PixelProgram::applyShader(Int cMask[4], Int sMask[4], Int zMask[4])
+{
+	routine.descriptorSets = data + OFFSET(DrawData, descriptorSets);
+	routine.descriptorDynamicOffsets = data + OFFSET(DrawData, descriptorDynamicOffsets);
+	routine.pushConstants = data + OFFSET(DrawData, pushConstants);
+	routine.constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, constants));
+
+	auto it = spirvShader->inputBuiltins.find(spv::BuiltInFrontFacing);
+	if (it != spirvShader->inputBuiltins.end())
+	{
+		ASSERT(it->second.SizeInComponents == 1);
+		auto frontFacing = Int4(*Pointer<Int>(primitive + OFFSET(Primitive, clockwiseMask)));
+		routine.getVariable(it->second.Id)[it->second.FirstComponent] = As<Float4>(frontFacing);
+	}
+
+	it = spirvShader->inputBuiltins.find(spv::BuiltInSampleMask);
+	if (it != spirvShader->inputBuiltins.end())
+	{
+		static_assert(SIMD::Width == 4, "Expects SIMD width to be 4");
 		Int4 laneBits = Int4(1, 2, 4, 8);
-		Int4 laneShiftsToMSB = Int4(31, 30, 29, 28);
-		Int4 mask(maskUnion);
-		mask = ((mask & laneBits) << laneShiftsToMSB) >> Int4(31);
-		return mask;
+
+		Int4 inputSampleMask = Int4(1) & CmpNEQ(Int4(cMask[0]) & laneBits, Int4(0));
+		for (auto i = 1u; i < state.multiSample; i++)
+		{
+			inputSampleMask |= Int4(1 << i) & CmpNEQ(Int4(cMask[i]) & laneBits, Int4(0));
+		}
+
+		routine.getVariable(it->second.Id)[it->second.FirstComponent] = As<Float4>(inputSampleMask);
+		// Sample mask input is an array, as the spec contemplates MSAA levels higher than 32.
+		// Fill any non-zero indices with 0.
+		for (auto i = 1u; i < it->second.SizeInComponents; i++)
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = Float4(0);
 	}
 
-	// Union all cMask/sMask/zMask and return it as 4 booleans
-	Int4 PixelProgram::maskAny(Int cMask[4], Int sMask[4], Int zMask[4]) const
-	{
-		// See if at least 1 sample is used
-		Int maskUnion = cMask[0] & sMask[0] & zMask[0];
-		for(auto i = 1u; i < state.multiSample; i++)
-		{
-			maskUnion |= (cMask[i] & sMask[i] & zMask[i]);
-		}
+	// Note: all lanes initially active to facilitate derivatives etc. Actual coverage is
+	// handled separately, through the cMask.
+	auto activeLaneMask = SIMD::Int(0xFFFFFFFF);
+	auto storesAndAtomicsMask = maskAny(cMask, sMask, zMask);
+	routine.killMask = 0;
 
-		// Convert to 4 booleans
-		Int4 laneBits = Int4(1, 2, 4, 8);
-		Int4 laneShiftsToMSB = Int4(31, 30, 29, 28);
-		Int4 mask(maskUnion);
-		mask = ((mask & laneBits) << laneShiftsToMSB) >> Int4(31);
-		return mask;
+	spirvShader->emit(&routine, activeLaneMask, storesAndAtomicsMask, descriptorSets);
+	spirvShader->emitEpilog(&routine);
+
+	for(int i = 0; i < RENDERTARGETS; i++)
+	{
+		c[i].x = routine.outputs[i * 4];
+		c[i].y = routine.outputs[i * 4 + 1];
+		c[i].z = routine.outputs[i * 4 + 2];
+		c[i].w = routine.outputs[i * 4 + 3];
 	}
 
-	void PixelProgram::setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w, Int cMask[4])
+	clampColor(c);
+
+	if(spirvShader->getModes().ContainsKill)
 	{
-		routine.setImmutableInputBuiltins(spirvShader);
-
-		routine.setInputBuiltin(spirvShader, spv::BuiltInViewIndex, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+		for (auto i = 0u; i < state.multiSample; i++)
 		{
-			assert(builtin.SizeInComponents == 1);
-			value[builtin.FirstComponent] = As<Float4>(Int4((*Pointer<Int>(data + OFFSET(DrawData, viewID)))));
-		});
-
-		routine.setInputBuiltin(spirvShader, spv::BuiltInFragCoord, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			assert(builtin.SizeInComponents == 4);
-			value[builtin.FirstComponent+0] = SIMD::Float(Float(x)) + SIMD::Float(0.5f, 1.5f, 0.5f, 1.5f);
-			value[builtin.FirstComponent+1] = SIMD::Float(Float(y)) + SIMD::Float(0.5f, 0.5f, 1.5f, 1.5f);
-			value[builtin.FirstComponent+2] = z[0];	// sample 0
-			value[builtin.FirstComponent+3] = w;
-		});
-
-		routine.setInputBuiltin(spirvShader, spv::BuiltInPointCoord, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			assert(builtin.SizeInComponents == 2);
-			value[builtin.FirstComponent+0] = SIMD::Float(0.5f, 1.5f, 0.5f, 1.5f) +
-				SIMD::Float(Float(x) - (*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordX))));
-			value[builtin.FirstComponent+1] = SIMD::Float(0.5f, 0.5f, 1.5f, 1.5f) +
-				SIMD::Float(Float(y) - (*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordY))));
-		});
-
-		routine.setInputBuiltin(spirvShader, spv::BuiltInSubgroupSize, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			assert(builtin.SizeInComponents == 1);
-			value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(SIMD::Width));
-		});
-
-		routine.setInputBuiltin(spirvShader, spv::BuiltInHelperInvocation, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			assert(builtin.SizeInComponents == 1);
-			value[builtin.FirstComponent] = As<SIMD::Float>(~maskAny(cMask));
-		});
-
-		routine.windowSpacePosition[0] = x + SIMD::Int(0,1,0,1);
-		routine.windowSpacePosition[1] = y + SIMD::Int(0,0,1,1);
-		routine.viewID = *Pointer<Int>(data + OFFSET(DrawData, viewID));
-	}
-
-	void PixelProgram::applyShader(Int cMask[4], Int sMask[4], Int zMask[4])
-	{
-		routine.descriptorSets = data + OFFSET(DrawData, descriptorSets);
-		routine.descriptorDynamicOffsets = data + OFFSET(DrawData, descriptorDynamicOffsets);
-		routine.pushConstants = data + OFFSET(DrawData, pushConstants);
-		routine.constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, constants));
-
-		auto it = spirvShader->inputBuiltins.find(spv::BuiltInFrontFacing);
-		if (it != spirvShader->inputBuiltins.end())
-		{
-			ASSERT(it->second.SizeInComponents == 1);
-			auto frontFacing = Int4(*Pointer<Int>(primitive + OFFSET(Primitive, clockwiseMask)));
-			routine.getVariable(it->second.Id)[it->second.FirstComponent] = As<Float4>(frontFacing);
-		}
-
-		it = spirvShader->inputBuiltins.find(spv::BuiltInSampleMask);
-		if (it != spirvShader->inputBuiltins.end())
-		{
-			static_assert(SIMD::Width == 4, "Expects SIMD width to be 4");
-			Int4 laneBits = Int4(1, 2, 4, 8);
-
-			Int4 inputSampleMask = Int4(1) & CmpNEQ(Int4(cMask[0]) & laneBits, Int4(0));
-			for (auto i = 1u; i < state.multiSample; i++)
-			{
-				inputSampleMask |= Int4(1 << i) & CmpNEQ(Int4(cMask[i]) & laneBits, Int4(0));
-			}
-
-			routine.getVariable(it->second.Id)[it->second.FirstComponent] = As<Float4>(inputSampleMask);
-			// Sample mask input is an array, as the spec contemplates MSAA levels higher than 32.
-			// Fill any non-zero indices with 0.
-			for (auto i = 1u; i < it->second.SizeInComponents; i++)
-				routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = Float4(0);
-		}
-
-		// Note: all lanes initially active to facilitate derivatives etc. Actual coverage is
-		// handled separately, through the cMask.
-		auto activeLaneMask = SIMD::Int(0xFFFFFFFF);
-		auto storesAndAtomicsMask = maskAny(cMask, sMask, zMask);
-		routine.killMask = 0;
-
-		spirvShader->emit(&routine, activeLaneMask, storesAndAtomicsMask, descriptorSets);
-		spirvShader->emitEpilog(&routine);
-
-		for(int i = 0; i < RENDERTARGETS; i++)
-		{
-			c[i].x = routine.outputs[i * 4];
-			c[i].y = routine.outputs[i * 4 + 1];
-			c[i].z = routine.outputs[i * 4 + 2];
-			c[i].w = routine.outputs[i * 4 + 3];
-		}
-
-		clampColor(c);
-
-		if(spirvShader->getModes().ContainsKill)
-		{
-			for (auto i = 0u; i < state.multiSample; i++)
-			{
-				cMask[i] &= ~routine.killMask;
-			}
-		}
-
-		it = spirvShader->outputBuiltins.find(spv::BuiltInSampleMask);
-		if (it != spirvShader->outputBuiltins.end())
-		{
-			auto outputSampleMask = As<SIMD::Int>(routine.getVariable(it->second.Id)[it->second.FirstComponent]);
-
-			for (auto i = 0u; i < state.multiSample; i++)
-			{
-				cMask[i] &= SignMask(CmpNEQ(outputSampleMask & SIMD::Int(1<<i), SIMD::Int(0)));
-			}
-		}
-
-		it = spirvShader->outputBuiltins.find(spv::BuiltInFragDepth);
-		if (it != spirvShader->outputBuiltins.end())
-		{
-			oDepth = Min(Max(routine.getVariable(it->second.Id)[it->second.FirstComponent], Float4(0.0f)), Float4(1.0f));
+			cMask[i] &= ~routine.killMask;
 		}
 	}
 
-	Bool PixelProgram::alphaTest(Int cMask[4])
+	it = spirvShader->outputBuiltins.find(spv::BuiltInSampleMask);
+	if (it != spirvShader->outputBuiltins.end())
 	{
-		if(!state.alphaToCoverage)
+		auto outputSampleMask = As<SIMD::Int>(routine.getVariable(it->second.Id)[it->second.FirstComponent]);
+
+		for (auto i = 0u; i < state.multiSample; i++)
 		{
-			return true;
-		}
-
-		alphaToCoverage(cMask, c[0].w);
-
-		Int pass = cMask[0];
-
-		for(unsigned int q = 1; q < state.multiSample; q++)
-		{
-			pass = pass | cMask[q];
-		}
-
-		return pass != 0x0;
-	}
-
-	void PixelProgram::rasterOperation(Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4])
-	{
-		for(int index = 0; index < RENDERTARGETS; index++)
-		{
-			if(!state.colorWriteActive(index))
-			{
-				continue;
-			}
-
-			auto format = state.targetFormat[index];
-			switch(format)
-			{
-			case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
-			case VK_FORMAT_R5G6B5_UNORM_PACK16:
-			case VK_FORMAT_B8G8R8A8_UNORM:
-			case VK_FORMAT_B8G8R8A8_SRGB:
-			case VK_FORMAT_R8G8B8A8_UNORM:
-			case VK_FORMAT_R8G8B8A8_SRGB:
-			case VK_FORMAT_R8G8_UNORM:
-			case VK_FORMAT_R8_UNORM:
-			case VK_FORMAT_R16G16_UNORM:
-			case VK_FORMAT_R16G16B16A16_UNORM:
-			case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
-			case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
-			case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
-				for(unsigned int q = 0; q < state.multiSample; q++)
-				{
-					if(state.multiSampleMask & (1 << q))
-					{
-						Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
-						Vector4s color;
-
-						color.x = convertFixed16(c[index].x, false);
-						color.y = convertFixed16(c[index].y, false);
-						color.z = convertFixed16(c[index].z, false);
-						color.w = convertFixed16(c[index].w, false);
-
-						alphaBlend(index, buffer, color, x);
-						writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
-					}
-				}
-				break;
-			case VK_FORMAT_R16_SFLOAT:
-			case VK_FORMAT_R16G16_SFLOAT:
-			case VK_FORMAT_R16G16B16A16_SFLOAT:
-			case VK_FORMAT_R32_SFLOAT:
-			case VK_FORMAT_R32G32_SFLOAT:
-			case VK_FORMAT_R32G32B32A32_SFLOAT:
-			case VK_FORMAT_R32_SINT:
-			case VK_FORMAT_R32G32_SINT:
-			case VK_FORMAT_R32G32B32A32_SINT:
-			case VK_FORMAT_R32_UINT:
-			case VK_FORMAT_R32G32_UINT:
-			case VK_FORMAT_R32G32B32A32_UINT:
-			case VK_FORMAT_R16_SINT:
-			case VK_FORMAT_R16G16_SINT:
-			case VK_FORMAT_R16G16B16A16_SINT:
-			case VK_FORMAT_R16_UINT:
-			case VK_FORMAT_R16G16_UINT:
-			case VK_FORMAT_R16G16B16A16_UINT:
-			case VK_FORMAT_R8_SINT:
-			case VK_FORMAT_R8G8_SINT:
-			case VK_FORMAT_R8G8B8A8_SINT:
-			case VK_FORMAT_R8_UINT:
-			case VK_FORMAT_R8G8_UINT:
-			case VK_FORMAT_R8G8B8A8_UINT:
-			case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-			case VK_FORMAT_A8B8G8R8_SINT_PACK32:
-			case VK_FORMAT_A2B10G10R10_UINT_PACK32:
-				for(unsigned int q = 0; q < state.multiSample; q++)
-				{
-					if(state.multiSampleMask & (1 << q))
-					{
-						Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
-						Vector4f color = c[index];
-
-						alphaBlend(index, buffer, color, x);
-						writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
-					}
-				}
-				break;
-			default:
-				UNIMPLEMENTED("VkFormat: %d", int(format));
-			}
+			cMask[i] &= SignMask(CmpNEQ(outputSampleMask & SIMD::Int(1<<i), SIMD::Int(0)));
 		}
 	}
 
-	void PixelProgram::clampColor(Vector4f oC[RENDERTARGETS])
+	it = spirvShader->outputBuiltins.find(spv::BuiltInFragDepth);
+	if (it != spirvShader->outputBuiltins.end())
 	{
-		for(int index = 0; index < RENDERTARGETS; index++)
-		{
-			if(!state.colorWriteActive(index) && !(index == 0 && state.alphaToCoverage))
-			{
-				continue;
-			}
-
-			switch(state.targetFormat[index])
-			{
-			case VK_FORMAT_UNDEFINED:
-				break;
-			case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
-			case VK_FORMAT_R5G6B5_UNORM_PACK16:
-			case VK_FORMAT_B8G8R8A8_UNORM:
-			case VK_FORMAT_B8G8R8A8_SRGB:
-			case VK_FORMAT_R8G8B8A8_UNORM:
-			case VK_FORMAT_R8G8B8A8_SRGB:
-			case VK_FORMAT_R8G8_UNORM:
-			case VK_FORMAT_R8_UNORM:
-			case VK_FORMAT_R16G16_UNORM:
-			case VK_FORMAT_R16G16B16A16_UNORM:
-			case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
-			case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
-			case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
-				oC[index].x = Max(oC[index].x, Float4(0.0f)); oC[index].x = Min(oC[index].x, Float4(1.0f));
-				oC[index].y = Max(oC[index].y, Float4(0.0f)); oC[index].y = Min(oC[index].y, Float4(1.0f));
-				oC[index].z = Max(oC[index].z, Float4(0.0f)); oC[index].z = Min(oC[index].z, Float4(1.0f));
-				oC[index].w = Max(oC[index].w, Float4(0.0f)); oC[index].w = Min(oC[index].w, Float4(1.0f));
-				break;
-			case VK_FORMAT_R32_SFLOAT:
-			case VK_FORMAT_R32G32_SFLOAT:
-			case VK_FORMAT_R32G32B32A32_SFLOAT:
-			case VK_FORMAT_R32_SINT:
-			case VK_FORMAT_R32G32_SINT:
-			case VK_FORMAT_R32G32B32A32_SINT:
-			case VK_FORMAT_R32_UINT:
-			case VK_FORMAT_R32G32_UINT:
-			case VK_FORMAT_R32G32B32A32_UINT:
-			case VK_FORMAT_R16_SFLOAT:
-			case VK_FORMAT_R16G16_SFLOAT:
-			case VK_FORMAT_R16G16B16A16_SFLOAT:
-			case VK_FORMAT_R16_SINT:
-			case VK_FORMAT_R16G16_SINT:
-			case VK_FORMAT_R16G16B16A16_SINT:
-			case VK_FORMAT_R16_UINT:
-			case VK_FORMAT_R16G16_UINT:
-			case VK_FORMAT_R16G16B16A16_UINT:
-			case VK_FORMAT_R8_SINT:
-			case VK_FORMAT_R8G8_SINT:
-			case VK_FORMAT_R8G8B8A8_SINT:
-			case VK_FORMAT_R8_UINT:
-			case VK_FORMAT_R8G8_UINT:
-			case VK_FORMAT_R8G8B8A8_UINT:
-			case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-			case VK_FORMAT_A8B8G8R8_SINT_PACK32:
-			case VK_FORMAT_A2B10G10R10_UINT_PACK32:
-				break;
-			default:
-				UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
-			}
-		}
-	}
-
-	Float4 PixelProgram::linearToSRGB(const Float4 &x)   // Approximates x^(1.0/2.2)
-	{
-		Float4 sqrtx = Rcp_pp(RcpSqrt_pp(x));
-		Float4 sRGB = sqrtx * Float4(1.14f) - x * Float4(0.14f);
-
-		return Min(Max(sRGB, Float4(0.0f)), Float4(1.0f));
+		oDepth = Min(Max(routine.getVariable(it->second.Id)[it->second.FirstComponent], Float4(0.0f)), Float4(1.0f));
 	}
 }
+
+Bool PixelProgram::alphaTest(Int cMask[4])
+{
+	if(!state.alphaToCoverage)
+	{
+		return true;
+	}
+
+	alphaToCoverage(cMask, c[0].w);
+
+	Int pass = cMask[0];
+
+	for(unsigned int q = 1; q < state.multiSample; q++)
+	{
+		pass = pass | cMask[q];
+	}
+
+	return pass != 0x0;
+}
+
+void PixelProgram::rasterOperation(Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4])
+{
+	for(int index = 0; index < RENDERTARGETS; index++)
+	{
+		if(!state.colorWriteActive(index))
+		{
+			continue;
+		}
+
+		auto format = state.targetFormat[index];
+		switch(format)
+		{
+		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+		case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		case VK_FORMAT_B8G8R8A8_UNORM:
+		case VK_FORMAT_B8G8R8A8_SRGB:
+		case VK_FORMAT_R8G8B8A8_UNORM:
+		case VK_FORMAT_R8G8B8A8_SRGB:
+		case VK_FORMAT_R8G8_UNORM:
+		case VK_FORMAT_R8_UNORM:
+		case VK_FORMAT_R16G16_UNORM:
+		case VK_FORMAT_R16G16B16A16_UNORM:
+		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+			for(unsigned int q = 0; q < state.multiSample; q++)
+			{
+				if(state.multiSampleMask & (1 << q))
+				{
+					Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
+					Vector4s color;
+
+					color.x = convertFixed16(c[index].x, false);
+					color.y = convertFixed16(c[index].y, false);
+					color.z = convertFixed16(c[index].z, false);
+					color.w = convertFixed16(c[index].w, false);
+
+					alphaBlend(index, buffer, color, x);
+					writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
+				}
+			}
+			break;
+		case VK_FORMAT_R16_SFLOAT:
+		case VK_FORMAT_R16G16_SFLOAT:
+		case VK_FORMAT_R16G16B16A16_SFLOAT:
+		case VK_FORMAT_R32_SFLOAT:
+		case VK_FORMAT_R32G32_SFLOAT:
+		case VK_FORMAT_R32G32B32A32_SFLOAT:
+		case VK_FORMAT_R32_SINT:
+		case VK_FORMAT_R32G32_SINT:
+		case VK_FORMAT_R32G32B32A32_SINT:
+		case VK_FORMAT_R32_UINT:
+		case VK_FORMAT_R32G32_UINT:
+		case VK_FORMAT_R32G32B32A32_UINT:
+		case VK_FORMAT_R16_SINT:
+		case VK_FORMAT_R16G16_SINT:
+		case VK_FORMAT_R16G16B16A16_SINT:
+		case VK_FORMAT_R16_UINT:
+		case VK_FORMAT_R16G16_UINT:
+		case VK_FORMAT_R16G16B16A16_UINT:
+		case VK_FORMAT_R8_SINT:
+		case VK_FORMAT_R8G8_SINT:
+		case VK_FORMAT_R8G8B8A8_SINT:
+		case VK_FORMAT_R8_UINT:
+		case VK_FORMAT_R8G8_UINT:
+		case VK_FORMAT_R8G8B8A8_UINT:
+		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
+		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
+			for(unsigned int q = 0; q < state.multiSample; q++)
+			{
+				if(state.multiSampleMask & (1 << q))
+				{
+					Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
+					Vector4f color = c[index];
+
+					alphaBlend(index, buffer, color, x);
+					writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
+				}
+			}
+			break;
+		default:
+			UNIMPLEMENTED("VkFormat: %d", int(format));
+		}
+	}
+}
+
+void PixelProgram::clampColor(Vector4f oC[RENDERTARGETS])
+{
+	for(int index = 0; index < RENDERTARGETS; index++)
+	{
+		if(!state.colorWriteActive(index) && !(index == 0 && state.alphaToCoverage))
+		{
+			continue;
+		}
+
+		switch(state.targetFormat[index])
+		{
+		case VK_FORMAT_UNDEFINED:
+			break;
+		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+		case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		case VK_FORMAT_B8G8R8A8_UNORM:
+		case VK_FORMAT_B8G8R8A8_SRGB:
+		case VK_FORMAT_R8G8B8A8_UNORM:
+		case VK_FORMAT_R8G8B8A8_SRGB:
+		case VK_FORMAT_R8G8_UNORM:
+		case VK_FORMAT_R8_UNORM:
+		case VK_FORMAT_R16G16_UNORM:
+		case VK_FORMAT_R16G16B16A16_UNORM:
+		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+			oC[index].x = Max(oC[index].x, Float4(0.0f)); oC[index].x = Min(oC[index].x, Float4(1.0f));
+			oC[index].y = Max(oC[index].y, Float4(0.0f)); oC[index].y = Min(oC[index].y, Float4(1.0f));
+			oC[index].z = Max(oC[index].z, Float4(0.0f)); oC[index].z = Min(oC[index].z, Float4(1.0f));
+			oC[index].w = Max(oC[index].w, Float4(0.0f)); oC[index].w = Min(oC[index].w, Float4(1.0f));
+			break;
+		case VK_FORMAT_R32_SFLOAT:
+		case VK_FORMAT_R32G32_SFLOAT:
+		case VK_FORMAT_R32G32B32A32_SFLOAT:
+		case VK_FORMAT_R32_SINT:
+		case VK_FORMAT_R32G32_SINT:
+		case VK_FORMAT_R32G32B32A32_SINT:
+		case VK_FORMAT_R32_UINT:
+		case VK_FORMAT_R32G32_UINT:
+		case VK_FORMAT_R32G32B32A32_UINT:
+		case VK_FORMAT_R16_SFLOAT:
+		case VK_FORMAT_R16G16_SFLOAT:
+		case VK_FORMAT_R16G16B16A16_SFLOAT:
+		case VK_FORMAT_R16_SINT:
+		case VK_FORMAT_R16G16_SINT:
+		case VK_FORMAT_R16G16B16A16_SINT:
+		case VK_FORMAT_R16_UINT:
+		case VK_FORMAT_R16G16_UINT:
+		case VK_FORMAT_R16G16B16A16_UINT:
+		case VK_FORMAT_R8_SINT:
+		case VK_FORMAT_R8G8_SINT:
+		case VK_FORMAT_R8G8B8A8_SINT:
+		case VK_FORMAT_R8_UINT:
+		case VK_FORMAT_R8G8_UINT:
+		case VK_FORMAT_R8G8B8A8_UINT:
+		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
+		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
+			break;
+		default:
+			UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
+		}
+	}
+}
+
+Float4 PixelProgram::linearToSRGB(const Float4 &x)   // Approximates x^(1.0/2.2)
+{
+	Float4 sqrtx = Rcp_pp(RcpSqrt_pp(x));
+	Float4 sRGB = sqrtx * Float4(1.14f) - x * Float4(0.14f);
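+	// Why this approximates x^(1.0/2.2) (explanatory note, not in the original):
+	// Rcp_pp(RcpSqrt_pp(x)) computes roughly 1/(1/sqrt(x)) = x^0.5 using fast
+	// reciprocal instructions. The blend 1.14*x^0.5 - 0.14*x is exact at x = 0 and
+	// x = 1, and since sqrt(x) >= x on [0, 1], overweighting the square root lifts
+	// the curve toward x^(1/2.2) in between.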
+
+	return Min(Max(sRGB, Float4(0.0f)), Float4(1.0f));
+}
+
+}  // namespace sw
diff --git a/src/Pipeline/PixelProgram.hpp b/src/Pipeline/PixelProgram.hpp
index 3555aee..7888115 100644
--- a/src/Pipeline/PixelProgram.hpp
+++ b/src/Pipeline/PixelProgram.hpp
@@ -17,39 +17,40 @@
 
 #include "PixelRoutine.hpp"
 
-namespace sw
+namespace sw {
+
+class PixelProgram : public PixelRoutine
 {
-	class PixelProgram : public PixelRoutine
+public:
+	PixelProgram(
+			const PixelProcessor::State &state,
+			vk::PipelineLayout const *pipelineLayout,
+			SpirvShader const *spirvShader,
+			const vk::DescriptorSet::Bindings &descriptorSets) :
+		PixelRoutine(state, pipelineLayout, spirvShader, descriptorSets)
 	{
-	public:
-		PixelProgram(
-				const PixelProcessor::State &state,
-				vk::PipelineLayout const *pipelineLayout,
-				SpirvShader const *spirvShader,
-				const vk::DescriptorSet::Bindings &descriptorSets) :
-			PixelRoutine(state, pipelineLayout, spirvShader, descriptorSets)
-		{
-		}
+	}
 
-		virtual ~PixelProgram() {}
+	virtual ~PixelProgram() {}
 
-	protected:
-		virtual void setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w, Int cMask[4]);
-		virtual void applyShader(Int cMask[4], Int sMask[4], Int zMask[4]);
-		virtual Bool alphaTest(Int cMask[4]);
-		virtual void rasterOperation(Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4]);
+protected:
+	virtual void setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w, Int cMask[4]);
+	virtual void applyShader(Int cMask[4], Int sMask[4], Int zMask[4]);
+	virtual Bool alphaTest(Int cMask[4]);
+	virtual void rasterOperation(Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4]);
 
-	private:
-		// Color outputs
-		Vector4f c[RENDERTARGETS];
+private:
+	// Color outputs
+	Vector4f c[RENDERTARGETS];
 
-		// Raster operations
-		void clampColor(Vector4f oC[RENDERTARGETS]);
+	// Raster operations
+	void clampColor(Vector4f oC[RENDERTARGETS]);
 
-		Int4 maskAny(Int cMask[4]) const;
-		Int4 maskAny(Int cMask[4], Int sMask[4], Int zMask[4]) const;
-		Float4 linearToSRGB(const Float4 &x);
-	};
-}
+	Int4 maskAny(Int cMask[4]) const;
+	Int4 maskAny(Int cMask[4], Int sMask[4], Int zMask[4]) const;
+	Float4 linearToSRGB(const Float4 &x);
+};
+
+}  // namespace sw
 
 #endif
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 6fe6e74..13a9e0c 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -22,2623 +22,2624 @@
 #include "Vulkan/VkDebug.hpp"
 #include "Vulkan/VkPipelineLayout.hpp"
 
-namespace sw
+namespace sw {
+
+PixelRoutine::PixelRoutine(
+		const PixelProcessor::State &state,
+		vk::PipelineLayout const *pipelineLayout,
+		SpirvShader const *spirvShader,
+		const vk::DescriptorSet::Bindings &descriptorSets)
+	: QuadRasterizer(state, spirvShader),
+	  routine(pipelineLayout),
+	  descriptorSets(descriptorSets)
 {
-	PixelRoutine::PixelRoutine(
-			const PixelProcessor::State &state,
-			vk::PipelineLayout const *pipelineLayout,
-			SpirvShader const *spirvShader,
-			const vk::DescriptorSet::Bindings &descriptorSets)
-		: QuadRasterizer(state, spirvShader),
-		  routine(pipelineLayout),
-		  descriptorSets(descriptorSets)
+	if (spirvShader)
 	{
-		if (spirvShader)
-		{
-			spirvShader->emitProlog(&routine);
+		spirvShader->emitProlog(&routine);
 
-			// Clearing inputs to 0 is not demanded by the spec,
-			// but it makes the undefined behavior deterministic.
-			for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
+		// Clearing inputs to 0 is not demanded by the spec,
+		// but it makes the undefined behavior deterministic.
+		for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
+		{
+			routine.inputs[i] = Float4(0.0f);
+		}
+	}
+}
+
+PixelRoutine::~PixelRoutine()
+{
+}
+
+void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
+{
+	// TODO: consider shader which modifies sample mask in general
+	const bool earlyDepthTest = !spirvShader || (spirvShader->getModes().EarlyFragmentTests && !spirvShader->getModes().DepthReplacing && !state.alphaToCoverage);
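+	// Rationale (explanatory note, not in the original): the depth test may run
+	// before shading only when the shader cannot change its outcome afterwards; a
+	// depth-replacing shader rewrites Z and alpha-to-coverage derives coverage from
+	// the shaded alpha, so either one forces the late depth test path.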
+
+	Int zMask[4];   // Depth mask
+	Int sMask[4];   // Stencil mask
+
+	for(unsigned int q = 0; q < state.multiSample; q++)
+	{
+		zMask[q] = cMask[q];
+		sMask[q] = cMask[q];
+	}
+
+	for(unsigned int q = 0; q < state.multiSample; q++)
+	{
+		stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
+	}
+
+	Float4 f;
+	Float4 rhwCentroid;
+
+	Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
+
+	if(interpolateZ())
+	{
+		for(unsigned int q = 0; q < state.multiSample; q++)
+		{
+			Float4 x = xxxx;
+
+			if(state.multiSample > 1)
 			{
-				routine.inputs[i] = Float4(0.0f);
+				x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
 			}
+
+			z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false, state.depthClamp);
 		}
 	}
 
-	PixelRoutine::~PixelRoutine()
+	Bool depthPass = false;
+
+	if(earlyDepthTest)
 	{
+		for(unsigned int q = 0; q < state.multiSample; q++)
+		{
+			depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
+		}
 	}
 
-	void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
+	If(depthPass || Bool(!earlyDepthTest))
 	{
-		// TODO: consider shader which modifies sample mask in general
-		const bool earlyDepthTest = !spirvShader || (spirvShader->getModes().EarlyFragmentTests && !spirvShader->getModes().DepthReplacing && !state.alphaToCoverage);
+		Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
 
-		Int zMask[4];   // Depth mask
-		Int sMask[4];   // Stencil mask
+		// Centroid locations
+		Float4 XXXX = Float4(0.0f);
+		Float4 YYYY = Float4(0.0f);
 
-		for(unsigned int q = 0; q < state.multiSample; q++)
+		if(state.centroid)
 		{
-			zMask[q] = cMask[q];
-			sMask[q] = cMask[q];
-		}
+			Float4 WWWW(1.0e-9f);
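+			// Explanatory note (reading of the code, not in the original): each table
+			// lookup, indexed by sample q's 4-bit coverage mask, returns per pixel lane
+			// that sample's offset (or weight) when covered and zero otherwise. Summing
+			// over all samples and dividing by the accumulated weight (seeded with
+			// 1.0e-9 to avoid 0/0 for uncovered pixels) gives the average covered
+			// sample position, which is then added to the pixel center below.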
 
-		for(unsigned int q = 0; q < state.multiSample; q++)
-		{
-			stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
-		}
-
-		Float4 f;
-		Float4 rhwCentroid;
-
-		Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
-
-		if(interpolateZ())
-		{
 			for(unsigned int q = 0; q < state.multiSample; q++)
 			{
-				Float4 x = xxxx;
-
-				if(state.multiSample > 1)
-				{
-					x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
-				}
-
-				z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false, state.depthClamp);
+				XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
+				YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
+				WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
 			}
+
+			WWWW = Rcp_pp(WWWW);
+			XXXX *= WWWW;
+			YYYY *= WWWW;
+
+			XXXX += xxxx;
+			YYYY += yyyy;
 		}
 
-		Bool depthPass = false;
-
-		if(earlyDepthTest)
+		if(interpolateW())
 		{
-			for(unsigned int q = 0; q < state.multiSample; q++)
-			{
-				depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
-			}
-		}
-
-		If(depthPass || Bool(!earlyDepthTest))
-		{
-			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
-
-			// Centroid locations
-			Float4 XXXX = Float4(0.0f);
-			Float4 YYYY = Float4(0.0f);
+			w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false, false);
+			rhw = reciprocal(w, false, false, true);
 
 			if(state.centroid)
 			{
-				Float4 WWWW(1.0e-9f);
-
-				for(unsigned int q = 0; q < state.multiSample; q++)
-				{
-					XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
-					YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
-					WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
-				}
-
-				WWWW = Rcp_pp(WWWW);
-				XXXX *= WWWW;
-				YYYY *= WWWW;
-
-				XXXX += xxxx;
-				YYYY += yyyy;
+				rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
 			}
+		}
 
-			if(interpolateW())
+		if (spirvShader)
+		{
+			for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
 			{
-				w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false, false);
-				rhw = reciprocal(w, false, false, true);
-
-				if(state.centroid)
+				auto const &input = spirvShader->inputs[interpolant];
+				if (input.Type != SpirvShader::ATTRIBTYPE_UNUSED)
 				{
-					rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
+					if (input.Centroid && state.multiSample > 1)
+					{
+						routine.inputs[interpolant] =
+								interpolateCentroid(XXXX, YYYY, rhwCentroid,
+													primitive + OFFSET(Primitive, V[interpolant]),
+													input.Flat, !input.NoPerspective);
+					}
+					else
+					{
+						routine.inputs[interpolant] =
+								interpolate(xxxx, Dv[interpolant], rhw,
+											primitive + OFFSET(Primitive, V[interpolant]),
+											input.Flat, !input.NoPerspective, false);
+					}
 				}
 			}
 
-			if (spirvShader)
+			setBuiltins(x, y, z, w, cMask);
+
+			for (uint32_t i = 0; i < state.numClipDistances; i++)
 			{
-				for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
+				auto distance = interpolate(xxxx, DclipDistance[i], rhw,
+											primitive + OFFSET(Primitive, clipDistance[i]),
+											false, true, false);
+
+				auto clipMask = SignMask(CmpGE(distance, SIMD::Float(0)));
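+				// Explanatory note (not in the original): a fragment passes user
+				// clipping only if every clip distance is >= 0, so lanes with a
+				// negative distance are removed from every sample's coverage mask.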
+				for (auto ms = 0u; ms < state.multiSample; ms++)
 				{
-					auto const &input = spirvShader->inputs[interpolant];
-					if (input.Type != SpirvShader::ATTRIBTYPE_UNUSED)
-					{
-						if (input.Centroid && state.multiSample > 1)
-						{
-							routine.inputs[interpolant] =
-									interpolateCentroid(XXXX, YYYY, rhwCentroid,
-														primitive + OFFSET(Primitive, V[interpolant]),
-														input.Flat, !input.NoPerspective);
-						}
-						else
-						{
-							routine.inputs[interpolant] =
-									interpolate(xxxx, Dv[interpolant], rhw,
-												primitive + OFFSET(Primitive, V[interpolant]),
-												input.Flat, !input.NoPerspective, false);
-						}
-					}
+					// TODO: Fragments discarded by clipping do not exist at
+					// all -- they should not be counted in queries or have
+					// their Z/S effects performed when early fragment tests
+					// are enabled.
+					cMask[ms] &= clipMask;
 				}
 
-				setBuiltins(x, y, z, w, cMask);
-
-				for (uint32_t i = 0; i < state.numClipDistances; i++)
+				if (spirvShader->getUsedCapabilities().ClipDistance)
 				{
-					auto distance = interpolate(xxxx, DclipDistance[i], rhw,
-												primitive + OFFSET(Primitive, clipDistance[i]),
-												false, true, false);
-
-					auto clipMask = SignMask(CmpGE(distance, SIMD::Float(0)));
-					for (auto ms = 0u; ms < state.multiSample; ms++)
-					{
-						// TODO: Fragments discarded by clipping do not exist at
-						// all -- they should not be counted in queries or have
-						// their Z/S effects performed when early fragment tests
-						// are enabled.
-						cMask[ms] &= clipMask;
-					}
-
-					if (spirvShader->getUsedCapabilities().ClipDistance)
-					{
-						auto it = spirvShader->inputBuiltins.find(spv::BuiltInClipDistance);
-						if(it != spirvShader->inputBuiltins.end())
-						{
-							if (i < it->second.SizeInComponents)
-							{
-								routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = distance;
-							}
-						}
-					}
-				}
-
-				if (spirvShader->getUsedCapabilities().CullDistance)
-				{
-					auto it = spirvShader->inputBuiltins.find(spv::BuiltInCullDistance);
+					auto it = spirvShader->inputBuiltins.find(spv::BuiltInClipDistance);
 					if(it != spirvShader->inputBuiltins.end())
 					{
-						for (uint32_t i = 0; i < state.numCullDistances; i++)
+						if (i < it->second.SizeInComponents)
 						{
-							if (i < it->second.SizeInComponents)
-							{
-								routine.getVariable(it->second.Id)[it->second.FirstComponent + i] =
-										interpolate(xxxx, DcullDistance[i], rhw,
-													primitive + OFFSET(Primitive, cullDistance[i]),
-													false, true, false);
-							}
+							routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = distance;
 						}
 					}
 				}
 			}
 
-			Bool alphaPass = true;
-
-			if (spirvShader)
+			if (spirvShader->getUsedCapabilities().CullDistance)
 			{
-				bool earlyFragTests = (spirvShader && spirvShader->getModes().EarlyFragmentTests);
-				applyShader(cMask, earlyFragTests ? sMask : cMask, earlyDepthTest ? zMask : cMask);
+				auto it = spirvShader->inputBuiltins.find(spv::BuiltInCullDistance);
+				if(it != spirvShader->inputBuiltins.end())
+				{
+					for (uint32_t i = 0; i < state.numCullDistances; i++)
+					{
+						if (i < it->second.SizeInComponents)
+						{
+							routine.getVariable(it->second.Id)[it->second.FirstComponent + i] =
+									interpolate(xxxx, DcullDistance[i], rhw,
+												primitive + OFFSET(Primitive, cullDistance[i]),
+												false, true, false);
+						}
+					}
+				}
 			}
+		}
 
-			alphaPass = alphaTest(cMask);
+		Bool alphaPass = true;
 
-			if((spirvShader && spirvShader->getModes().ContainsKill) || state.alphaToCoverage)
+		if (spirvShader)
+		{
+			bool earlyFragTests = (spirvShader && spirvShader->getModes().EarlyFragmentTests);
+			applyShader(cMask, earlyFragTests ? sMask : cMask, earlyDepthTest ? zMask : cMask);
+		}
+
+		alphaPass = alphaTest(cMask);
+
+		if((spirvShader && spirvShader->getModes().ContainsKill) || state.alphaToCoverage)
+		{
+			for(unsigned int q = 0; q < state.multiSample; q++)
+			{
+				zMask[q] &= cMask[q];
+				sMask[q] &= cMask[q];
+			}
+		}
+
+		If(alphaPass)
+		{
+			if(!earlyDepthTest)
 			{
 				for(unsigned int q = 0; q < state.multiSample; q++)
 				{
-					zMask[q] &= cMask[q];
-					sMask[q] &= cMask[q];
+					depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
 				}
 			}
 
-			If(alphaPass)
+			If(depthPass || Bool(earlyDepthTest))
 			{
-				if(!earlyDepthTest)
+				for(unsigned int q = 0; q < state.multiSample; q++)
 				{
-					for(unsigned int q = 0; q < state.multiSample; q++)
+					if(state.multiSampleMask & (1 << q))
 					{
-						depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
-					}
-				}
+						writeDepth(zBuffer, q, x, z[q], zMask[q]);
 
-				If(depthPass || Bool(earlyDepthTest))
-				{
-					for(unsigned int q = 0; q < state.multiSample; q++)
-					{
-						if(state.multiSampleMask & (1 << q))
+						if(state.occlusionEnabled)
 						{
-							writeDepth(zBuffer, q, x, z[q], zMask[q]);
-
-							if(state.occlusionEnabled)
-							{
-								occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
-							}
+							occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
 						}
 					}
-
-					rasterOperation(cBuffer, x, sMask, zMask, cMask);
 				}
-			}
-		}
 
-		for(unsigned int q = 0; q < state.multiSample; q++)
-		{
-			if(state.multiSampleMask & (1 << q))
-			{
-				writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
+				rasterOperation(cBuffer, x, sMask, zMask, cMask);
 			}
 		}
 	}
 
-	Float4 PixelRoutine::interpolateCentroid(const Float4 &x, const Float4 &y, const Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
+	for(unsigned int q = 0; q < state.multiSample; q++)
 	{
-		Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
-
-		if(!flat)
+		if(state.multiSampleMask & (1 << q))
 		{
-			interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
-			               y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
-
-			if(perspective)
-			{
-				interpolant *= rhw;
-			}
+			writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
 		}
-
-		return interpolant;
 	}
+}
 
-	void PixelRoutine::stencilTest(const Pointer<Byte> &sBuffer, int q, const Int &x, Int &sMask, const Int &cMask)
+Float4 PixelRoutine::interpolateCentroid(const Float4 &x, const Float4 &y, const Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
+{
+	Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
+
+	if(!flat)
 	{
-		if(!state.stencilActive)
+		interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
+		               y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
+
+		if(perspective)
 		{
-			return;
-		}
-
-		// (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
-
-		Pointer<Byte> buffer = sBuffer + x;
-
-		if(q > 0)
-		{
-			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
-		}
-
-		Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
-		Byte8 value = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
-		value = value | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
-		Byte8 valueBack = value;
-
-		if(state.frontStencil.compareMask != 0xff)
-		{
-			value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
-		}
-
-		stencilTest(value, state.frontStencil.compareOp, false);
-
-		if(state.backStencil.compareMask != 0xff)
-		{
-			valueBack &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
-		}
-
-		stencilTest(valueBack, state.backStencil.compareOp, true);
-
-		value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
-		valueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
-		value |= valueBack;
-
-		sMask = SignMask(value) & cMask;
-	}
-
-	void PixelRoutine::stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack)
-	{
-		Byte8 equal;
-
-		switch(stencilCompareMode)
-		{
-		case VK_COMPARE_OP_ALWAYS:
-			value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-			break;
-		case VK_COMPARE_OP_NEVER:
-			value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-			break;
-		case VK_COMPARE_OP_LESS:			// a < b ~ b > a
-			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedSignedQ)));
-			break;
-		case VK_COMPARE_OP_EQUAL:
-			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedQ)));
-			break;
-		case VK_COMPARE_OP_NOT_EQUAL:		// a != b ~ !(a == b)
-			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedQ)));
-			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-			break;
-		case VK_COMPARE_OP_LESS_OR_EQUAL:	// a <= b ~ (b > a) || (a == b)
-			equal = value;
-			equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedQ)));
-			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedSignedQ)));
-			value |= equal;
-			break;
-		case VK_COMPARE_OP_GREATER:		// a > b
-			equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedSignedQ));
-			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-			equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
-			value = equal;
-			break;
-		case VK_COMPARE_OP_GREATER_OR_EQUAL:	// a >= b ~ !(a < b) ~ !(b > a)
-			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedSignedQ)));
-			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-			break;
-		default:
-			UNIMPLEMENTED("VkCompareOp: %d", int(stencilCompareMode));
+			interpolant *= rhw;
 		}
 	}
 
-	Bool PixelRoutine::depthTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
+	return interpolant;
+}
+
+void PixelRoutine::stencilTest(const Pointer<Byte> &sBuffer, int q, const Int &x, Int &sMask, const Int &cMask)
+{
+	if(!state.stencilActive)
 	{
-		Float4 Z = z;
-
-		if(spirvShader && spirvShader->getModes().DepthReplacing)
-		{
-			Z = oDepth;
-		}
-
-		Pointer<Byte> buffer = zBuffer + 4 * x;
-		Int pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
-
-		if(q > 0)
-		{
-			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
-		}
-
-		Float4 zValue;
-
-		if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
-		{
-			// FIXME: Properly optimizes?
-			zValue.xy = *Pointer<Float4>(buffer);
-			zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
-		}
-
-		Int4 zTest;
-
-		switch(state.depthCompareMode)
-		{
-		case VK_COMPARE_OP_ALWAYS:
-			// Optimized
-			break;
-		case VK_COMPARE_OP_NEVER:
-			// Optimized
-			break;
-		case VK_COMPARE_OP_EQUAL:
-			zTest = CmpEQ(zValue, Z);
-			break;
-		case VK_COMPARE_OP_NOT_EQUAL:
-			zTest = CmpNEQ(zValue, Z);
-			break;
-		case VK_COMPARE_OP_LESS:
-			zTest = CmpNLE(zValue, Z);
-			break;
-		case VK_COMPARE_OP_GREATER_OR_EQUAL:
-			zTest = CmpLE(zValue, Z);
-			break;
-		case VK_COMPARE_OP_LESS_OR_EQUAL:
-			zTest = CmpNLT(zValue, Z);
-			break;
-		case VK_COMPARE_OP_GREATER:
-			zTest = CmpLT(zValue, Z);
-			break;
-		default:
-			UNIMPLEMENTED("VkCompareOp: %d", int(state.depthCompareMode));
-		}
-
-		switch(state.depthCompareMode)
-		{
-		case VK_COMPARE_OP_ALWAYS:
-			zMask = cMask;
-			break;
-		case VK_COMPARE_OP_NEVER:
-			zMask = 0x0;
-			break;
-		default:
-			zMask = SignMask(zTest) & cMask;
-			break;
-		}
-
-		if(state.stencilActive)
-		{
-			zMask &= sMask;
-		}
-
-		return zMask != 0;
+		return;
 	}
 
-	Bool PixelRoutine::depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
+	// (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
+
+	Pointer<Byte> buffer = sBuffer + x;
+
+	if(q > 0)
 	{
-		Short4 Z = convertFixed16(z, true);
-
-		if(spirvShader && spirvShader->getModes().DepthReplacing)
-		{
-			Z = convertFixed16(oDepth, true);
-		}
-
-		Pointer<Byte> buffer = zBuffer + 2 * x;
-		Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
-
-		if(q > 0)
-		{
-			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
-		}
-
-		Short4 zValue;
-
-		if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
-		{
-			// FIXME: Properly optimizes?
-			zValue = *Pointer<Short4>(buffer) & Short4(-1, -1, 0, 0);
-			zValue = zValue | (*Pointer<Short4>(buffer + pitch - 4) & Short4(0, 0, -1, -1));
-		}
-
-		Int4 zTest;
-
-		// Bias values to make unsigned compares out of Reactor's (due to SSE's) signed compares only
-		zValue = zValue - Short4(0x8000u);
-		Z = Z - Short4(0x8000u);
-
-		switch(state.depthCompareMode)
-		{
-		case VK_COMPARE_OP_ALWAYS:
-			// Optimized
-			break;
-		case VK_COMPARE_OP_NEVER:
-			// Optimized
-			break;
-		case VK_COMPARE_OP_EQUAL:
-			zTest = Int4(CmpEQ(zValue, Z));
-			break;
-		case VK_COMPARE_OP_NOT_EQUAL:
-			zTest = ~Int4(CmpEQ(zValue, Z));
-			break;
-		case VK_COMPARE_OP_LESS:
-			zTest = Int4(CmpGT(zValue, Z));
-			break;
-		case VK_COMPARE_OP_GREATER_OR_EQUAL:
-			zTest = ~Int4(CmpGT(zValue, Z));
-			break;
-		case VK_COMPARE_OP_LESS_OR_EQUAL:
-			zTest = ~Int4(CmpGT(Z, zValue));
-			break;
-		case VK_COMPARE_OP_GREATER:
-			zTest = Int4(CmpGT(Z, zValue));
-			break;
-		default:
-			UNIMPLEMENTED("VkCompareOp: %d", int(state.depthCompareMode));
-		}
-
-		switch(state.depthCompareMode)
-		{
-		case VK_COMPARE_OP_ALWAYS:
-			zMask = cMask;
-			break;
-		case VK_COMPARE_OP_NEVER:
-			zMask = 0x0;
-			break;
-		default:
-			zMask = SignMask(zTest) & cMask;
-			break;
-		}
-
-		if(state.stencilActive)
-		{
-			zMask &= sMask;
-		}
-
-		return zMask != 0;
+		buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
 	}
 
-	Bool PixelRoutine::depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
-	{
-		if(!state.depthTestActive)
-		{
-			return true;
-		}
+	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
+	Byte8 value = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
+	value = value | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
+	Byte8 valueBack = value;
 
-		if (state.depthFormat == VK_FORMAT_D16_UNORM)
-			return depthTest16(zBuffer, q, x, z, sMask, zMask, cMask);
-		else
-			return depthTest32F(zBuffer, q, x, z, sMask, zMask, cMask);
+	if(state.frontStencil.compareMask != 0xff)
+	{
+		value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
 	}
 
-	void PixelRoutine::alphaToCoverage(Int cMask[4], const Float4 &alpha)
+	stencilTest(value, state.frontStencil.compareOp, false);
+
+	if(state.backStencil.compareMask != 0xff)
 	{
-		Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
-		Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
-		Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
-		Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
-
-		Int aMask0 = SignMask(coverage0);
-		Int aMask1 = SignMask(coverage1);
-		Int aMask2 = SignMask(coverage2);
-		Int aMask3 = SignMask(coverage3);
-
-		cMask[0] &= aMask0;
-		cMask[1] &= aMask1;
-		cMask[2] &= aMask2;
-		cMask[3] &= aMask3;
+		valueBack &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
 	}
 
-	void PixelRoutine::writeDepth32F(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
+	stencilTest(valueBack, state.backStencil.compareOp, true);
+
+	value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
+	valueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
+	value |= valueBack;
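+	// Explanatory note (assumption from the mask names): clockwiseMask selects the
+	// lanes where the primitive is front-facing and invClockwiseMask those where it
+	// is back-facing, merging the front and back stencil results per lane.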
+
+	sMask = SignMask(value) & cMask;
+}
+
+void PixelRoutine::stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack)
+{
+	Byte8 equal;
+
+	switch(stencilCompareMode)
 	{
-		Float4 Z = z;
+	case VK_COMPARE_OP_ALWAYS:
+		value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+		break;
+	case VK_COMPARE_OP_NEVER:
+		value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+		break;
+	case VK_COMPARE_OP_LESS:			// a < b ~ b > a
+		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+		value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedSignedQ)));
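+		// Note (added explanation): adding 0x80 biases the unsigned stencil bytes so
+		// that the signed byte compare reproduces the unsigned ordering; the
+		// reference is presumably pre-biased the same way in referenceMaskedSignedQ.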
+		break;
+	case VK_COMPARE_OP_EQUAL:
+		value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedQ)));
+		break;
+	case VK_COMPARE_OP_NOT_EQUAL:		// a != b ~ !(a == b)
+		value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedQ)));
+		value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+		break;
+	case VK_COMPARE_OP_LESS_OR_EQUAL:	// a <= b ~ (b > a) || (a == b)
+		equal = value;
+		equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedQ)));
+		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+		value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedSignedQ)));
+		value |= equal;
+		break;
+	case VK_COMPARE_OP_GREATER:		// a > b
+		equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedSignedQ));
+		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+		equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
+		value = equal;
+		break;
+	case VK_COMPARE_OP_GREATER_OR_EQUAL:	// a >= b ~ !(a < b) ~ !(b > a)
+		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+		value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedSignedQ)));
+		value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+		break;
+	default:
+		UNIMPLEMENTED("VkCompareOp: %d", int(stencilCompareMode));
+	}
+}
 
-		if(spirvShader && spirvShader->getModes().DepthReplacing)
-		{
-			Z = oDepth;
-		}
+Bool PixelRoutine::depthTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
+{
+	Float4 Z = z;
 
-		Pointer<Byte> buffer = zBuffer + 4 * x;
-		Int pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+	if(spirvShader && spirvShader->getModes().DepthReplacing)
+	{
+		Z = oDepth;
+	}
 
-		if(q > 0)
-		{
-			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
-		}
+	Pointer<Byte> buffer = zBuffer + 4 * x;
+	Int pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
 
-		Float4 zValue;
+	if(q > 0)
+	{
+		buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
+	}
 
-		if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
-		{
-			// FIXME: Properly optimizes?
-			zValue.xy = *Pointer<Float4>(buffer);
-			zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
-		}
+	Float4 zValue;
 
-		Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
-		zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
-		Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
-
+	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
+	{
 		// FIXME: Properly optimizes?
-		*Pointer<Float2>(buffer) = Float2(Z.xy);
-		*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
+		zValue.xy = *Pointer<Float4>(buffer);
+		zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
 	}
 
-	void PixelRoutine::writeDepth16(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
+	Int4 zTest;
+
+	switch(state.depthCompareMode)
 	{
-		Short4 Z = As<Short4>(convertFixed16(z, true));
+	case VK_COMPARE_OP_ALWAYS:
+		// Optimized
+		break;
+	case VK_COMPARE_OP_NEVER:
+		// Optimized
+		break;
+	case VK_COMPARE_OP_EQUAL:
+		zTest = CmpEQ(zValue, Z);
+		break;
+	case VK_COMPARE_OP_NOT_EQUAL:
+		zTest = CmpNEQ(zValue, Z);
+		break;
+	case VK_COMPARE_OP_LESS:
+		zTest = CmpNLE(zValue, Z);
+		break;
+	case VK_COMPARE_OP_GREATER_OR_EQUAL:
+		zTest = CmpLE(zValue, Z);
+		break;
+	case VK_COMPARE_OP_LESS_OR_EQUAL:
+		zTest = CmpNLT(zValue, Z);
+		break;
+	case VK_COMPARE_OP_GREATER:
+		zTest = CmpLT(zValue, Z);
+		break;
+	default:
+		UNIMPLEMENTED("VkCompareOp: %d", int(state.depthCompareMode));
+	}
 
-		if(spirvShader && spirvShader->getModes().DepthReplacing)
-		{
-			Z = As<Short4>(convertFixed16(oDepth, true));
-		}
+	switch(state.depthCompareMode)
+	{
+	case VK_COMPARE_OP_ALWAYS:
+		zMask = cMask;
+		break;
+	case VK_COMPARE_OP_NEVER:
+		zMask = 0x0;
+		break;
+	default:
+		zMask = SignMask(zTest) & cMask;
+		break;
+	}
 
-		Pointer<Byte> buffer = zBuffer + 2 * x;
-		Int pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+	if(state.stencilActive)
+	{
+		zMask &= sMask;
+	}
 
-		if(q > 0)
-		{
-			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
-		}
+	return zMask != 0;
+}
 
-		Short4 zValue;
+Bool PixelRoutine::depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
+{
+	Short4 Z = convertFixed16(z, true);
 
-		if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
-		{
-			// FIXME: Properly optimizes?
-			zValue = *Pointer<Short4>(buffer) & Short4(-1, -1, 0, 0);
-			zValue = zValue | (*Pointer<Short4>(buffer + pitch - 4) & Short4(0, 0, -1, -1));
-		}
+	if(spirvShader && spirvShader->getModes().DepthReplacing)
+	{
+		Z = convertFixed16(oDepth, true);
+	}
 
-		Z = Z & *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q) + zMask * 8, 8);
-		zValue = zValue & *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q) + zMask * 8, 8);
-		Z = Z | zValue;
+	Pointer<Byte> buffer = zBuffer + 2 * x;
+	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
 
+	if(q > 0)
+	{
+		buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
+	}
+
+	Short4 zValue;
+
+	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
+	{
 		// FIXME: Properly optimizes?
-		*Pointer<Short>(buffer) = Extract(Z, 0);
-		*Pointer<Short>(buffer+2) = Extract(Z, 1);
-		*Pointer<Short>(buffer+pitch) = Extract(Z, 2);
-		*Pointer<Short>(buffer+pitch+2) = Extract(Z, 3);
+		zValue = *Pointer<Short4>(buffer) & Short4(-1, -1, 0, 0);
+		zValue = zValue | (*Pointer<Short4>(buffer + pitch - 4) & Short4(0, 0, -1, -1));
 	}
 
-	void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
+	Int4 zTest;
+
+	// Bias values to make unsigned compares out of Reactor's (due to SSE's) signed compares only
+	zValue = zValue - Short4(0x8000u);
+	Z = Z - Short4(0x8000u);
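+	// Worked example (illustration, not in the original): a signed 16-bit compare
+	// of unsigned 0x0001 and 0xFFFF sees 1 > -1, the wrong order. After subtracting
+	// 0x8000 they become 0x8001 (-32767) and 0x7FFF (+32767), and the signed
+	// compare matches the unsigned ordering.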
+
+	switch(state.depthCompareMode)
 	{
-		if(!state.depthWriteEnable)
+	case VK_COMPARE_OP_ALWAYS:
+		// Optimized
+		break;
+	case VK_COMPARE_OP_NEVER:
+		// Optimized
+		break;
+	case VK_COMPARE_OP_EQUAL:
+		zTest = Int4(CmpEQ(zValue, Z));
+		break;
+	case VK_COMPARE_OP_NOT_EQUAL:
+		zTest = ~Int4(CmpEQ(zValue, Z));
+		break;
+	case VK_COMPARE_OP_LESS:
+		zTest = Int4(CmpGT(zValue, Z));
+		break;
+	case VK_COMPARE_OP_GREATER_OR_EQUAL:
+		zTest = ~Int4(CmpGT(zValue, Z));
+		break;
+	case VK_COMPARE_OP_LESS_OR_EQUAL:
+		zTest = ~Int4(CmpGT(Z, zValue));
+		break;
+	case VK_COMPARE_OP_GREATER:
+		zTest = Int4(CmpGT(Z, zValue));
+		break;
+	default:
+		UNIMPLEMENTED("VkCompareOp: %d", int(state.depthCompareMode));
+	}
+
+	switch(state.depthCompareMode)
+	{
+	case VK_COMPARE_OP_ALWAYS:
+		zMask = cMask;
+		break;
+	case VK_COMPARE_OP_NEVER:
+		zMask = 0x0;
+		break;
+	default:
+		zMask = SignMask(zTest) & cMask;
+		break;
+	}
+
+	if(state.stencilActive)
+	{
+		zMask &= sMask;
+	}
+
+	return zMask != 0;
+}
+
+Bool PixelRoutine::depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
+{
+	if(!state.depthTestActive)
+	{
+		return true;
+	}
+
+	if (state.depthFormat == VK_FORMAT_D16_UNORM)
+		return depthTest16(zBuffer, q, x, z, sMask, zMask, cMask);
+	else
+		return depthTest32F(zBuffer, q, x, z, sMask, zMask, cMask);
+}
+
+void PixelRoutine::alphaToCoverage(Int cMask[4], const Float4 &alpha)
+{
+	Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
+	Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
+	Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
+	Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
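+	// Explanatory note (reading of the code, not in the original): a2c0..a2c3 appear
+	// to hold one alpha threshold per sample and pixel lane, so each CmpNLT yields
+	// all-ones lanes where alpha >= threshold; SignMask packs that into a 4-bit mask
+	// which is ANDed into the corresponding sample's coverage below.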
+
+	Int aMask0 = SignMask(coverage0);
+	Int aMask1 = SignMask(coverage1);
+	Int aMask2 = SignMask(coverage2);
+	Int aMask3 = SignMask(coverage3);
+
+	cMask[0] &= aMask0;
+	cMask[1] &= aMask1;
+	cMask[2] &= aMask2;
+	cMask[3] &= aMask3;
+}
+
+void PixelRoutine::writeDepth32F(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
+{
+	Float4 Z = z;
+
+	if(spirvShader && spirvShader->getModes().DepthReplacing)
+	{
+		Z = oDepth;
+	}
+
+	Pointer<Byte> buffer = zBuffer + 4 * x;
+	Int pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+
+	if(q > 0)
+	{
+		buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
+	}
+
+	Float4 zValue;
+
+	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
+	{
+		// FIXME: Properly optimizes?
+		zValue.xy = *Pointer<Float4>(buffer);
+		zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
+	}
+
+	Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
+	zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
+	Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
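+	// Explanatory note (not in the original): maskD4X is a table of per-lane bit
+	// masks indexed by the 4-bit zMask and invMaskD4X its complement, so this
+	// AND/AND/OR sequence is a branchless select: new depth where the test passed,
+	// existing buffer contents elsewhere.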
+
+	// FIXME: Properly optimizes?
+	*Pointer<Float2>(buffer) = Float2(Z.xy);
+	*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
+}
+
+void PixelRoutine::writeDepth16(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
+{
+	Short4 Z = As<Short4>(convertFixed16(z, true));
+
+	if(spirvShader && spirvShader->getModes().DepthReplacing)
+	{
+		Z = As<Short4>(convertFixed16(oDepth, true));
+	}
+
+	Pointer<Byte> buffer = zBuffer + 2 * x;
+	Int pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+
+	if(q > 0)
+	{
+		buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
+	}
+
+	Short4 zValue;
+
+	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
+	{
+		// FIXME: Properly optimizes?
+		zValue = *Pointer<Short4>(buffer) & Short4(-1, -1, 0, 0);
+		zValue = zValue | (*Pointer<Short4>(buffer + pitch - 4) & Short4(0, 0, -1, -1));
+	}
+
+	Z = Z & *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q) + zMask * 8, 8);
+	zValue = zValue & *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q) + zMask * 8, 8);
+	Z = Z | zValue;
+
+	// FIXME: Properly optimizes?
+	*Pointer<Short>(buffer) = Extract(Z, 0);
+	*Pointer<Short>(buffer+2) = Extract(Z, 1);
+	*Pointer<Short>(buffer+pitch) = Extract(Z, 2);
+	*Pointer<Short>(buffer+pitch+2) = Extract(Z, 3);
+}
+
+void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
+{
+	if(!state.depthWriteEnable)
+	{
+		return;
+	}
+
+	if (state.depthFormat == VK_FORMAT_D16_UNORM)
+		writeDepth16(zBuffer, q, x, z, zMask);
+	else
+		writeDepth32F(zBuffer, q, x, z, zMask);
+}
+
+void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, const Int &x, const Int &sMask, const Int &zMask, const Int &cMask)
+{
+	if(!state.stencilActive)
+	{
+		return;
+	}
+
+	if(state.frontStencil.passOp == VK_STENCIL_OP_KEEP && state.frontStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.frontStencil.failOp == VK_STENCIL_OP_KEEP)
+	{
+		if(state.backStencil.passOp == VK_STENCIL_OP_KEEP && state.backStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.backStencil.failOp == VK_STENCIL_OP_KEEP)
 		{
 			return;
 		}
-
-		if (state.depthFormat == VK_FORMAT_D16_UNORM)
-			writeDepth16(zBuffer, q, x, z, zMask);
-		else
-			writeDepth32F(zBuffer, q, x, z, zMask);
 	}
 
-	void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, const Int &x, const Int &sMask, const Int &zMask, const Int &cMask)
+	if((state.frontStencil.writeMask == 0) && (state.backStencil.writeMask == 0))
 	{
-		if(!state.stencilActive)
-		{
-			return;
-		}
-
-		if(state.frontStencil.passOp == VK_STENCIL_OP_KEEP && state.frontStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.frontStencil.failOp == VK_STENCIL_OP_KEEP)
-		{
-			if(state.backStencil.passOp == VK_STENCIL_OP_KEEP && state.backStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.backStencil.failOp == VK_STENCIL_OP_KEEP)
-			{
-				return;
-			}
-		}
-
-		if((state.frontStencil.writeMask == 0) && (state.backStencil.writeMask == 0))
-		{
-			return;
-		}
-
-		Pointer<Byte> buffer = sBuffer + x;
-
-		if(q > 0)
-		{
-			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
-		}
-
-		Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
-		Byte8 bufferValue = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
-		bufferValue = bufferValue | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
-		Byte8 newValue;
-		stencilOperation(newValue, bufferValue, state.frontStencil, false, zMask, sMask);
-
-		if((state.frontStencil.writeMask & 0xFF) != 0xFF) // Assume 8-bit stencil buffer
-		{
-			Byte8 maskedValue = bufferValue;
-			newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
-			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
-			newValue |= maskedValue;
-		}
-
-		Byte8 newValueBack;
-
-		stencilOperation(newValueBack, bufferValue, state.backStencil, true, zMask, sMask);
-
-		if((state.backStencil.writeMask & 0xFF) != 0xFF) // Assume 8-bit stencil buffer
-		{
-			Byte8 maskedValue = bufferValue;
-			newValueBack &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
-			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
-			newValueBack |= maskedValue;
-		}
-
-		newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
-		newValueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
-		newValue |= newValueBack;
-
-		newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
-		bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
-		newValue |= bufferValue;
-
-		*Pointer<Short>(buffer) = Extract(As<Short4>(newValue), 0);
-		*Pointer<Short>(buffer + pitch) = Extract(As<Short4>(newValue), 1);
+		return;
 	}
 
-	void PixelRoutine::stencilOperation(Byte8 &newValue, const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask)
+	Pointer<Byte> buffer = sBuffer + x;
+
+	if(q > 0)
 	{
-		Byte8 &pass = newValue;
-		Byte8 fail;
-		Byte8 zFail;
-
-		stencilOperation(pass, bufferValue, ops.passOp, isBack);
-
-		if(ops.depthFailOp != ops.passOp)
-		{
-			stencilOperation(zFail, bufferValue, ops.depthFailOp, isBack);
-		}
-
-		if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
-		{
-			stencilOperation(fail, bufferValue, ops.failOp, isBack);
-		}
-
-		if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
-		{
-			if(state.depthTestActive && ops.depthFailOp != ops.passOp)   // zMask valid and values not the same
-			{
-				pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
-				zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
-				pass |= zFail;
-			}
-
-			pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
-			fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
-			pass |= fail;
-		}
+		buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
 	}
 
-	void PixelRoutine::stencilOperation(Byte8 &output, const Byte8 &bufferValue, VkStencilOp operation, bool isBack)
+	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
+	Byte8 bufferValue = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
+	bufferValue = bufferValue | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
+	Byte8 newValue;
+	stencilOperation(newValue, bufferValue, state.frontStencil, false, zMask, sMask);
+
+	if((state.frontStencil.writeMask & 0xFF) != 0xFF) // Assume 8-bit stencil buffer
 	{
-		switch(operation)
-		{
-		case VK_STENCIL_OP_KEEP:
-			output = bufferValue;
-			break;
-		case VK_STENCIL_OP_ZERO:
-			output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-			break;
-		case VK_STENCIL_OP_REPLACE:
-			output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[isBack].referenceQ));
-			break;
-		case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
-			output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
-			break;
-		case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
-			output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
-			break;
-		case VK_STENCIL_OP_INVERT:
-			output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-			break;
-		case VK_STENCIL_OP_INCREMENT_AND_WRAP:
-			output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
-			break;
-		case VK_STENCIL_OP_DECREMENT_AND_WRAP:
-			output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
-			break;
-		default:
-			UNIMPLEMENTED("VkStencilOp: %d", int(operation));
-		}
+		Byte8 maskedValue = bufferValue;
+		newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
+		maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
+		newValue |= maskedValue;
 	}
 
-	void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, VkBlendFactor blendFactorActive)
+	Byte8 newValueBack;
+
+	stencilOperation(newValueBack, bufferValue, state.backStencil, true, zMask, sMask);
+
+	if((state.backStencil.writeMask & 0xFF) != 0xFF) // Assume 8-bit stencil buffer
 	{
-		switch(blendFactorActive)
-		{
-		case VK_BLEND_FACTOR_ZERO:
-			// Optimized
-			break;
-		case VK_BLEND_FACTOR_ONE:
-			// Optimized
-			break;
-		case VK_BLEND_FACTOR_SRC_COLOR:
-			blendFactor.x = current.x;
-			blendFactor.y = current.y;
-			blendFactor.z = current.z;
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
-			blendFactor.x = Short4(0xFFFFu) - current.x;
-			blendFactor.y = Short4(0xFFFFu) - current.y;
-			blendFactor.z = Short4(0xFFFFu) - current.z;
-			break;
-		case VK_BLEND_FACTOR_DST_COLOR:
-			blendFactor.x = pixel.x;
-			blendFactor.y = pixel.y;
-			blendFactor.z = pixel.z;
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
-			blendFactor.x = Short4(0xFFFFu) - pixel.x;
-			blendFactor.y = Short4(0xFFFFu) - pixel.y;
-			blendFactor.z = Short4(0xFFFFu) - pixel.z;
-			break;
-		case VK_BLEND_FACTOR_SRC_ALPHA:
-			blendFactor.x = current.w;
-			blendFactor.y = current.w;
-			blendFactor.z = current.w;
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
-			blendFactor.x = Short4(0xFFFFu) - current.w;
-			blendFactor.y = Short4(0xFFFFu) - current.w;
-			blendFactor.z = Short4(0xFFFFu) - current.w;
-			break;
-		case VK_BLEND_FACTOR_DST_ALPHA:
-			blendFactor.x = pixel.w;
-			blendFactor.y = pixel.w;
-			blendFactor.z = pixel.w;
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
-			blendFactor.x = Short4(0xFFFFu) - pixel.w;
-			blendFactor.y = Short4(0xFFFFu) - pixel.w;
-			blendFactor.z = Short4(0xFFFFu) - pixel.w;
-			break;
-		case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
-			blendFactor.x = Short4(0xFFFFu) - pixel.w;
-			blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
-			blendFactor.y = blendFactor.x;
-			blendFactor.z = blendFactor.x;
-			break;
-		case VK_BLEND_FACTOR_CONSTANT_COLOR:
-			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
-			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
-			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
-			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
-			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
-			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
-			break;
-		case VK_BLEND_FACTOR_CONSTANT_ALPHA:
-			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
-			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
-			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
-			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
-			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
-			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
-			break;
-		default:
-			UNIMPLEMENTED("VkBlendFactor: %d", int(blendFactorActive));
-		}
+		Byte8 maskedValue = bufferValue;
+		newValueBack &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
+		maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
+		newValueBack |= maskedValue;
 	}
 
-	void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, VkBlendFactor blendFactorAlphaActive)
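+	// Select the front-facing or back-facing result per pixel using the primitive's facing masks.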
+	newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
+	newValueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
+	newValue |= newValueBack;
+
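+	// Branchless select between the new and old stencil bytes using the coverage mask:
+	// result = (new & mask) | (old & ~mask).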
+	newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
+	bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
+	newValue |= bufferValue;
+
+	*Pointer<Short>(buffer) = Extract(As<Short4>(newValue), 0);
+	*Pointer<Short>(buffer + pitch) = Extract(As<Short4>(newValue), 1);
+}
+
+void PixelRoutine::stencilOperation(Byte8 &newValue, const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask)
+{
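+	// 'pass' aliases the output, so the final merged result is written directly into newValue.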
+	Byte8 &pass = newValue;
+	Byte8 fail;
+	Byte8 zFail;
+
+	stencilOperation(pass, bufferValue, ops.passOp, isBack);
+
+	if(ops.depthFailOp != ops.passOp)
 	{
-		switch(blendFactorAlphaActive)
-		{
-		case VK_BLEND_FACTOR_ZERO:
-			// Optimized
-			break;
-		case VK_BLEND_FACTOR_ONE:
-			// Optimized
-			break;
-		case VK_BLEND_FACTOR_SRC_COLOR:
-			blendFactor.w = current.w;
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
-			blendFactor.w = Short4(0xFFFFu) - current.w;
-			break;
-		case VK_BLEND_FACTOR_DST_COLOR:
-			blendFactor.w = pixel.w;
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
-			blendFactor.w = Short4(0xFFFFu) - pixel.w;
-			break;
-		case VK_BLEND_FACTOR_SRC_ALPHA:
-			blendFactor.w = current.w;
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
-			blendFactor.w = Short4(0xFFFFu) - current.w;
-			break;
-		case VK_BLEND_FACTOR_DST_ALPHA:
-			blendFactor.w = pixel.w;
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
-			blendFactor.w = Short4(0xFFFFu) - pixel.w;
-			break;
-		case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
-			blendFactor.w = Short4(0xFFFFu);
-			break;
-		case VK_BLEND_FACTOR_CONSTANT_COLOR:
-		case VK_BLEND_FACTOR_CONSTANT_ALPHA:
-			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
-		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
-			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
-			break;
-		default:
-			UNIMPLEMENTED("VkBlendFactor: %d", int(blendFactorAlphaActive));
-		}
+		stencilOperation(zFail, bufferValue, ops.depthFailOp, isBack);
 	}
 
-	bool PixelRoutine::isSRGB(int index) const
+	if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
 	{
-		return vk::Format(state.targetFormat[index]).isSRGBformat();
+		stencilOperation(fail, bufferValue, ops.failOp, isBack);
 	}
 
-	void PixelRoutine::readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel)
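+	// Same condition as above: now merge fail and zFail into 'pass' using sMask and zMask.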
+	if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
 	{
-		Short4 c01;
-		Short4 c23;
-		Pointer<Byte> buffer;
-		Pointer<Byte> buffer2;
-
-		switch(state.targetFormat[index])
+		if(state.depthTestActive && ops.depthFailOp != ops.passOp)   // zMask is valid and the pass/zFail values differ
 		{
-		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
-			buffer = cBuffer + 2 * x;
-			buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-			c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
-
-			pixel.x = (c01 & Short4(0x7C00u)) << 1;
-			pixel.y = (c01 & Short4(0x03E0u)) << 6;
-			pixel.z = (c01 & Short4(0x001Fu)) << 11;
-			pixel.w = (c01 & Short4(0x8000u)) >> 15;
-
-			// Expand to 16 bit range
-			pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
-			pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
-			pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
-			pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
-			pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
-			pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
-			break;
-		case VK_FORMAT_R5G6B5_UNORM_PACK16:
-			buffer = cBuffer + 2 * x;
-			buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-			c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
-
-			pixel.x = c01 & Short4(0xF800u);
-			pixel.y = (c01 & Short4(0x07E0u)) << 5;
-			pixel.z = (c01 & Short4(0x001Fu)) << 11;
-			pixel.w = Short4(0xFFFFu);
-
-			// Expand to 16 bit range
-			pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
-			pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
-			pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
-			pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
-			pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
-			pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
-			break;
-		case VK_FORMAT_B8G8R8A8_UNORM:
-		case VK_FORMAT_B8G8R8A8_SRGB:
-			buffer = cBuffer + 4 * x;
-			c01 = *Pointer<Short4>(buffer);
-			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-			c23 = *Pointer<Short4>(buffer);
-			pixel.z = c01;
-			pixel.y = c01;
-			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
-			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
-			pixel.x = pixel.z;
-			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
-			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
-			pixel.y = pixel.z;
-			pixel.w = pixel.x;
-			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
-			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
-			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
-			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
-			break;
-		case VK_FORMAT_R8G8B8A8_UNORM:
-		case VK_FORMAT_R8G8B8A8_SRGB:
-			buffer = cBuffer + 4 * x;
-			c01 = *Pointer<Short4>(buffer);
-			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-			c23 = *Pointer<Short4>(buffer);
-			pixel.z = c01;
-			pixel.y = c01;
-			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
-			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
-			pixel.x = pixel.z;
-			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
-			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
-			pixel.y = pixel.z;
-			pixel.w = pixel.x;
-			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
-			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
-			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
-			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
-			break;
-		case VK_FORMAT_R8_UNORM:
-			buffer = cBuffer + 1 * x;
-			pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
-			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-			pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
-			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
-			pixel.y = Short4(0x0000);
-			pixel.z = Short4(0x0000);
-			pixel.w = Short4(0xFFFFu);
-			break;
-		case VK_FORMAT_R8G8_UNORM:
-			buffer = cBuffer + 2 * x;
-			c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
-			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-			c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
-			pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
-			pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
-			pixel.z = Short4(0x0000u);
-			pixel.w = Short4(0xFFFFu);
-			break;
-		case VK_FORMAT_R16G16B16A16_UNORM:
-			buffer = cBuffer;
-			pixel.x = *Pointer<Short4>(buffer + 8 * x);
-			pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
-			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-			pixel.z = *Pointer<Short4>(buffer + 8 * x);
-			pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
-			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
-			break;
-		case VK_FORMAT_R16G16_UNORM:
-			buffer = cBuffer;
-			pixel.x = *Pointer<Short4>(buffer + 4 * x);
-			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-			pixel.y = *Pointer<Short4>(buffer + 4 * x);
-			pixel.z = pixel.x;
-			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
-			pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
-			pixel.y = pixel.z;
-			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
-			pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
-			pixel.z = Short4(0xFFFFu);
-			pixel.w = Short4(0xFFFFu);
-			break;
-		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
-		{
-			buffer = cBuffer;
-			Int4 v = Int4(0);
-			v = Insert(v, *Pointer<Int>(buffer + 4 * x), 0);
-			v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 1);
-			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-			v = Insert(v, *Pointer<Int>(buffer + 4 * x), 2);
-			v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 3);
-
-			pixel.x = Short4(v << 6) & Short4(0xFFC0u);
-			pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
-			pixel.z = Short4(v >> 14) & Short4(0xFFC0u);
-			pixel.w = Short4(v >> 16) & Short4(0xC000u);
-		} break;
-		default:
-			UNIMPLEMENTED("VkFormat %d", state.targetFormat[index]);
+			pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
+			zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
+			pass |= zFail;
 		}
 
-		if(isSRGB(index))
-		{
-			sRGBtoLinear16_12_16(pixel);
-		}
+		pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
+		fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
+		pass |= fail;
+	}
+}
+
+void PixelRoutine::stencilOperation(Byte8 &output, const Byte8 &bufferValue, VkStencilOp operation, bool isBack)
+{
+	switch(operation)
+	{
+	case VK_STENCIL_OP_KEEP:
+		output = bufferValue;
+		break;
+	case VK_STENCIL_OP_ZERO:
+		output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+		break;
+	case VK_STENCIL_OP_REPLACE:
+		output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[isBack].referenceQ));
+		break;
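+	// The CLAMP ops use saturating byte arithmetic, while the WRAP ops wrap modulo 256.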
+	case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
+		output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
+		break;
+	case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
+		output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
+		break;
+	case VK_STENCIL_OP_INVERT:
+		output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+		break;
+	case VK_STENCIL_OP_INCREMENT_AND_WRAP:
+		output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
+		break;
+	case VK_STENCIL_OP_DECREMENT_AND_WRAP:
+		output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
+		break;
+	default:
+		UNIMPLEMENTED("VkStencilOp: %d", int(operation));
+	}
+}
+
+void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, VkBlendFactor blendFactorActive)
+{
+	switch(blendFactorActive)
+	{
+	case VK_BLEND_FACTOR_ZERO:
+		// Optimized: no factor value is needed; this case never reaches the multiply in alphaBlend()
+		break;
+	case VK_BLEND_FACTOR_ONE:
+		// Optimized: no factor value is needed; this case never reaches the multiply in alphaBlend()
+		break;
+	case VK_BLEND_FACTOR_SRC_COLOR:
+		blendFactor.x = current.x;
+		blendFactor.y = current.y;
+		blendFactor.z = current.z;
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
+		blendFactor.x = Short4(0xFFFFu) - current.x;
+		blendFactor.y = Short4(0xFFFFu) - current.y;
+		blendFactor.z = Short4(0xFFFFu) - current.z;
+		break;
+	case VK_BLEND_FACTOR_DST_COLOR:
+		blendFactor.x = pixel.x;
+		blendFactor.y = pixel.y;
+		blendFactor.z = pixel.z;
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
+		blendFactor.x = Short4(0xFFFFu) - pixel.x;
+		blendFactor.y = Short4(0xFFFFu) - pixel.y;
+		blendFactor.z = Short4(0xFFFFu) - pixel.z;
+		break;
+	case VK_BLEND_FACTOR_SRC_ALPHA:
+		blendFactor.x = current.w;
+		blendFactor.y = current.w;
+		blendFactor.z = current.w;
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
+		blendFactor.x = Short4(0xFFFFu) - current.w;
+		blendFactor.y = Short4(0xFFFFu) - current.w;
+		blendFactor.z = Short4(0xFFFFu) - current.w;
+		break;
+	case VK_BLEND_FACTOR_DST_ALPHA:
+		blendFactor.x = pixel.w;
+		blendFactor.y = pixel.w;
+		blendFactor.z = pixel.w;
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
+		blendFactor.x = Short4(0xFFFFu) - pixel.w;
+		blendFactor.y = Short4(0xFFFFu) - pixel.w;
+		blendFactor.z = Short4(0xFFFFu) - pixel.w;
+		break;
+	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
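+		// f = min(As, 1 - Ad), computed in 16-bit unsigned normalized arithmetic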
+		blendFactor.x = Short4(0xFFFFu) - pixel.w;
+		blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
+		blendFactor.y = blendFactor.x;
+		blendFactor.z = blendFactor.x;
+		break;
+	case VK_BLEND_FACTOR_CONSTANT_COLOR:
+		blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
+		blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
+		blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
+		blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
+		blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
+		blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
+		break;
+	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
+		blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
+		blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
+		blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
+		blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+		blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+		blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+		break;
+	default:
+		UNIMPLEMENTED("VkBlendFactor: %d", int(blendFactorActive));
+	}
+}
+
+void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, VkBlendFactor blendFactorAlphaActive)
+{
+	switch(blendFactorAlphaActive)
+	{
+	case VK_BLEND_FACTOR_ZERO:
+		// Optimized: no factor value is needed; this case never reaches the multiply in alphaBlend()
+		break;
+	case VK_BLEND_FACTOR_ONE:
+		// Optimized: no factor value is needed; this case never reaches the multiply in alphaBlend()
+		break;
+	case VK_BLEND_FACTOR_SRC_COLOR:
+		blendFactor.w = current.w;
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
+		blendFactor.w = Short4(0xFFFFu) - current.w;
+		break;
+	case VK_BLEND_FACTOR_DST_COLOR:
+		blendFactor.w = pixel.w;
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
+		blendFactor.w = Short4(0xFFFFu) - pixel.w;
+		break;
+	case VK_BLEND_FACTOR_SRC_ALPHA:
+		blendFactor.w = current.w;
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
+		blendFactor.w = Short4(0xFFFFu) - current.w;
+		break;
+	case VK_BLEND_FACTOR_DST_ALPHA:
+		blendFactor.w = pixel.w;
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
+		blendFactor.w = Short4(0xFFFFu) - pixel.w;
+		break;
+	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
+		blendFactor.w = Short4(0xFFFFu);
+		break;
+	case VK_BLEND_FACTOR_CONSTANT_COLOR:
+	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
+		blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
+	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
+		blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+		break;
+	default:
+		UNIMPLEMENTED("VkBlendFactor: %d", int(blendFactorAlphaActive));
+	}
+}
+
+bool PixelRoutine::isSRGB(int index) const
+{
+	return vk::Format(state.targetFormat[index]).isSRGBformat();
+}
+
+void PixelRoutine::readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel)
+{
+	Short4 c01;
+	Short4 c23;
+	Pointer<Byte> buffer;
+	Pointer<Byte> buffer2;
+
+	switch(state.targetFormat[index])
+	{
+	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+		buffer = cBuffer + 2 * x;
+		buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
+
+		pixel.x = (c01 & Short4(0x7C00u)) << 1;
+		pixel.y = (c01 & Short4(0x03E0u)) << 6;
+		pixel.z = (c01 & Short4(0x001Fu)) << 11;
+		pixel.w = (c01 & Short4(0x8000u)) >> 15;
+
+		// Expand to the full 16-bit range by replicating the most significant bits into the lower bits
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
+		break;
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		buffer = cBuffer + 2 * x;
+		buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
+
+		pixel.x = c01 & Short4(0xF800u);
+		pixel.y = (c01 & Short4(0x07E0u)) << 5;
+		pixel.z = (c01 & Short4(0x001Fu)) << 11;
+		pixel.w = Short4(0xFFFFu);
+
+		// Expand to the full 16-bit range by replicating the most significant bits into the lower bits
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
+		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
+		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
+		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
+		break;
+	case VK_FORMAT_B8G8R8A8_UNORM:
+	case VK_FORMAT_B8G8R8A8_SRGB:
+		buffer = cBuffer + 4 * x;
+		c01 = *Pointer<Short4>(buffer);
+		buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+		c23 = *Pointer<Short4>(buffer);
+		pixel.z = c01;
+		pixel.y = c01;
+		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
+		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
+		pixel.x = pixel.z;
+		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
+		pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
+		pixel.y = pixel.z;
+		pixel.w = pixel.x;
+		pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
+		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
+		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
+		pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
+		break;
+	case VK_FORMAT_R8G8B8A8_UNORM:
+	case VK_FORMAT_R8G8B8A8_SRGB:
+		buffer = cBuffer + 4 * x;
+		c01 = *Pointer<Short4>(buffer);
+		buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+		c23 = *Pointer<Short4>(buffer);
+		pixel.z = c01;
+		pixel.y = c01;
+		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
+		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
+		pixel.x = pixel.z;
+		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
+		pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
+		pixel.y = pixel.z;
+		pixel.w = pixel.x;
+		pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
+		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
+		pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
+		pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
+		break;
+	case VK_FORMAT_R8_UNORM:
+		buffer = cBuffer + 1 * x;
+		pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
+		buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+		pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
+		pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
+		pixel.y = Short4(0x0000);
+		pixel.z = Short4(0x0000);
+		pixel.w = Short4(0xFFFFu);
+		break;
+	case VK_FORMAT_R8G8_UNORM:
+		buffer = cBuffer + 2 * x;
+		c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
+		buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+		c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
+		pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
+		pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
+		pixel.z = Short4(0x0000u);
+		pixel.w = Short4(0xFFFFu);
+		break;
+	case VK_FORMAT_R16G16B16A16_UNORM:
+		buffer = cBuffer;
+		pixel.x = *Pointer<Short4>(buffer + 8 * x);
+		pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
+		buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+		pixel.z = *Pointer<Short4>(buffer + 8 * x);
+		pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
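+		// Transpose four RGBA pixels into four per-channel vectors (AoS to SoA)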
+		transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
+		break;
+	case VK_FORMAT_R16G16_UNORM:
+		buffer = cBuffer;
+		pixel.x = *Pointer<Short4>(buffer + 4 * x);
+		buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+		pixel.y = *Pointer<Short4>(buffer + 4 * x);
+		pixel.z = pixel.x;
+		pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
+		pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
+		pixel.y = pixel.z;
+		pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
+		pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
+		pixel.z = Short4(0xFFFFu);
+		pixel.w = Short4(0xFFFFu);
+		break;
+	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+	{
+		buffer = cBuffer;
+		Int4 v = Int4(0);
+		v = Insert(v, *Pointer<Int>(buffer + 4 * x), 0);
+		v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 1);
+		buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+		v = Insert(v, *Pointer<Int>(buffer + 4 * x), 2);
+		v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 3);
+
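+		// Shift each 10-bit field (2-bit alpha) into the top bits of its 16-bit channel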
+		pixel.x = Short4(v << 6) & Short4(0xFFC0u);
+		pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
+		pixel.z = Short4(v >> 14) & Short4(0xFFC0u);
+		pixel.w = Short4(v >> 16) & Short4(0xC000u);
+	} break;
+	default:
+		UNIMPLEMENTED("VkFormat %d", state.targetFormat[index]);
 	}
 
-	void PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, Vector4s &current, const Int &x)
+	if(isSRGB(index))
 	{
-		if(!state.blendState[index].alphaBlendEnable)
-		{
-			return;
-		}
+		sRGBtoLinear16_12_16(pixel);
+	}
+}
 
-		Vector4s pixel;
-		readPixel(index, cBuffer, x, pixel);
-
-		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
-		Vector4s sourceFactor;
-		Vector4s destFactor;
-
-		blendFactor(sourceFactor, current, pixel, state.blendState[index].sourceBlendFactor);
-		blendFactor(destFactor, current, pixel, state.blendState[index].destBlendFactor);
-
-		if(state.blendState[index].sourceBlendFactor != VK_BLEND_FACTOR_ONE && state.blendState[index].sourceBlendFactor != VK_BLEND_FACTOR_ZERO)
-		{
-			current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
-			current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
-			current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
-		}
-
-		if(state.blendState[index].destBlendFactor != VK_BLEND_FACTOR_ONE && state.blendState[index].destBlendFactor != VK_BLEND_FACTOR_ZERO)
-		{
-			pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
-			pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
-			pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
-		}
-
-		switch(state.blendState[index].blendOperation)
-		{
-		case VK_BLEND_OP_ADD:
-			current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
-			current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
-			current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
-			break;
-		case VK_BLEND_OP_SUBTRACT:
-			current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
-			current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
-			current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
-			break;
-		case VK_BLEND_OP_REVERSE_SUBTRACT:
-			current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
-			current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
-			current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
-			break;
-		case VK_BLEND_OP_MIN:
-			current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
-			current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
-			current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
-			break;
-		case VK_BLEND_OP_MAX:
-			current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
-			current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
-			current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
-			break;
-		case VK_BLEND_OP_SRC_EXT:
-			// No operation
-			break;
-		case VK_BLEND_OP_DST_EXT:
-			current.x = pixel.x;
-			current.y = pixel.y;
-			current.z = pixel.z;
-			break;
-		case VK_BLEND_OP_ZERO_EXT:
-			current.x = Short4(0x0000);
-			current.y = Short4(0x0000);
-			current.z = Short4(0x0000);
-			break;
-		default:
-			UNIMPLEMENTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
-		}
-
-		blendFactorAlpha(sourceFactor, current, pixel, state.blendState[index].sourceBlendFactorAlpha);
-		blendFactorAlpha(destFactor, current, pixel, state.blendState[index].destBlendFactorAlpha);
-
-		if(state.blendState[index].sourceBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.blendState[index].sourceBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
-		{
-			current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
-		}
-
-		if(state.blendState[index].destBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.blendState[index].destBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
-		{
-			pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
-		}
-
-		switch(state.blendState[index].blendOperationAlpha)
-		{
-		case VK_BLEND_OP_ADD:
-			current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
-			break;
-		case VK_BLEND_OP_SUBTRACT:
-			current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
-			break;
-		case VK_BLEND_OP_REVERSE_SUBTRACT:
-			current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
-			break;
-		case VK_BLEND_OP_MIN:
-			current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
-			break;
-		case VK_BLEND_OP_MAX:
-			current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
-			break;
-		case VK_BLEND_OP_SRC_EXT:
-			// No operation
-			break;
-		case VK_BLEND_OP_DST_EXT:
-			current.w = pixel.w;
-			break;
-		case VK_BLEND_OP_ZERO_EXT:
-			current.w = Short4(0x0000);
-			break;
-		default:
-			UNIMPLEMENTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
-		}
+void PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, Vector4s &current, const Int &x)
+{
+	if(!state.blendState[index].alphaBlendEnable)
+	{
+		return;
 	}
 
-	void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &current, const Int &sMask, const Int &zMask, const Int &cMask)
+	Vector4s pixel;
+	readPixel(index, cBuffer, x, pixel);
+
+	// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
+	Vector4s sourceFactor;
+	Vector4s destFactor;
+
+	blendFactor(sourceFactor, current, pixel, state.blendState[index].sourceBlendFactor);
+	blendFactor(destFactor, current, pixel, state.blendState[index].destBlendFactor);
+
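+	// Colors are 16-bit UNORM here: MulHigh(a, b) == (a * b) >> 16, a close approximation of a * b / 0xFFFF.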
+	if(state.blendState[index].sourceBlendFactor != VK_BLEND_FACTOR_ONE && state.blendState[index].sourceBlendFactor != VK_BLEND_FACTOR_ZERO)
 	{
-		if(isSRGB(index))
+		current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
+		current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
+		current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
+	}
+
+	if(state.blendState[index].destBlendFactor != VK_BLEND_FACTOR_ONE && state.blendState[index].destBlendFactor != VK_BLEND_FACTOR_ZERO)
+	{
+		pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
+		pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
+		pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
+	}
+
+	switch(state.blendState[index].blendOperation)
+	{
+	case VK_BLEND_OP_ADD:
+		current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
+		current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
+		current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
+		break;
+	case VK_BLEND_OP_SUBTRACT:
+		current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
+		current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
+		current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
+		break;
+	case VK_BLEND_OP_REVERSE_SUBTRACT:
+		current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
+		current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
+		current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
+		break;
+	case VK_BLEND_OP_MIN:
+		current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
+		current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
+		current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
+		break;
+	case VK_BLEND_OP_MAX:
+		current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
+		current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
+		current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
+		break;
+	case VK_BLEND_OP_SRC_EXT:
+		// No operation: current already holds the source color
+		break;
+	case VK_BLEND_OP_DST_EXT:
+		current.x = pixel.x;
+		current.y = pixel.y;
+		current.z = pixel.z;
+		break;
+	case VK_BLEND_OP_ZERO_EXT:
+		current.x = Short4(0x0000);
+		current.y = Short4(0x0000);
+		current.z = Short4(0x0000);
+		break;
+	default:
+		UNIMPLEMENTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
+	}
+
+	blendFactorAlpha(sourceFactor, current, pixel, state.blendState[index].sourceBlendFactorAlpha);
+	blendFactorAlpha(destFactor, current, pixel, state.blendState[index].destBlendFactorAlpha);
+
+	if(state.blendState[index].sourceBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.blendState[index].sourceBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
+	{
+		current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
+	}
+
+	if(state.blendState[index].destBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.blendState[index].destBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
+	{
+		pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
+	}
+
+	switch(state.blendState[index].blendOperationAlpha)
+	{
+	case VK_BLEND_OP_ADD:
+		current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
+		break;
+	case VK_BLEND_OP_SUBTRACT:
+		current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
+		break;
+	case VK_BLEND_OP_REVERSE_SUBTRACT:
+		current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
+		break;
+	case VK_BLEND_OP_MIN:
+		current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
+		break;
+	case VK_BLEND_OP_MAX:
+		current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
+		break;
+	case VK_BLEND_OP_SRC_EXT:
+		// No operation: current.w already holds the source alpha
+		break;
+	case VK_BLEND_OP_DST_EXT:
+		current.w = pixel.w;
+		break;
+	case VK_BLEND_OP_ZERO_EXT:
+		current.w = Short4(0x0000);
+		break;
+	default:
+		UNIMPLEMENTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
+	}
+}
+
+void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &current, const Int &sMask, const Int &zMask, const Int &cMask)
+{
+	if(isSRGB(index))
+	{
+		linearToSRGB16_12_16(current);
+	}
+
+	switch(state.targetFormat[index])
+	{
+	case VK_FORMAT_B8G8R8A8_UNORM:
+	case VK_FORMAT_B8G8R8A8_SRGB:
+	case VK_FORMAT_R8G8B8A8_UNORM:
+	case VK_FORMAT_R8G8B8A8_SRGB:
+	case VK_FORMAT_R8G8_UNORM:
+	case VK_FORMAT_R8_UNORM:
+	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
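+		// Prepare rounding for the later 16-bit to 8-bit conversion: x - (x >> 8) maps 0xFFFF to 0xFF00,
+		// and adding 0x80 rounds to nearest when the top byte is extracted.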
+		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
+		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
+		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
+		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
+		break;
+	default:
+		break;
+	}
+
+	int rgbaWriteMask = state.colorWriteActive(index);
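+	// Swap the R and B bits of the write mask for BGRA formats; the G and A bits stay in place.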
+	int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
+
+	switch(state.targetFormat[index])
+	{
+	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
 		{
-			linearToSRGB16_12_16(current);
+			current.w = current.w & Short4(0x8000u);
+			current.x = As<UShort4>(current.x & Short4(0xF800)) >> 1;
+			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 6;
+			current.z = As<UShort4>(current.z & Short4(0xF800)) >> 11;
+
+			current.x = current.x | current.y | current.z | current.w;
 		}
-
-		switch(state.targetFormat[index])
+		break;
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
 		{
-		case VK_FORMAT_B8G8R8A8_UNORM:
-		case VK_FORMAT_B8G8R8A8_SRGB:
-		case VK_FORMAT_R8G8B8A8_UNORM:
-		case VK_FORMAT_R8G8B8A8_SRGB:
-		case VK_FORMAT_R8G8_UNORM:
-		case VK_FORMAT_R8_UNORM:
-		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
-		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
-			current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
-			current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
-			current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
-			current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
-			break;
-		default:
-			break;
+			current.x = current.x & Short4(0xF800u);
+			current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
+			current.z = As<UShort4>(current.z) >> 11;
+
+			current.x = current.x | current.y | current.z;
 		}
-
-		int rgbaWriteMask = state.colorWriteActive(index);
-		int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
-
-		switch(state.targetFormat[index])
+		break;
+	case VK_FORMAT_B8G8R8A8_UNORM:
+	case VK_FORMAT_B8G8R8A8_SRGB:
+		if(rgbaWriteMask == 0x7)
 		{
-		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
-			{
-				current.w = current.w & Short4(0x8000u);
-				current.x = As<UShort4>(current.x & Short4(0xF800)) >> 1;
-				current.y = As<UShort4>(current.y & Short4(0xF800)) >> 6;
-				current.z = As<UShort4>(current.z & Short4(0xF800)) >> 11;
-
-				current.x = current.x | current.y | current.z | current.w;
-			}
-			break;
-		case VK_FORMAT_R5G6B5_UNORM_PACK16:
-			{
-				current.x = current.x & Short4(0xF800u);
-				current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
-				current.z = As<UShort4>(current.z) >> 11;
-
-				current.x = current.x | current.y | current.z;
-			}
-			break;
-		case VK_FORMAT_B8G8R8A8_UNORM:
-		case VK_FORMAT_B8G8R8A8_SRGB:
-			if(rgbaWriteMask == 0x7)
-			{
-				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
-				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
-				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
-
-				current.z = As<Short4>(PackUnsigned(current.z, current.x));
-				current.y = As<Short4>(PackUnsigned(current.y, current.y));
-
-				current.x = current.z;
-				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
-				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
-				current.y = current.z;
-				current.z = As<Short4>(UnpackLow(current.z, current.x));
-				current.y = As<Short4>(UnpackHigh(current.y, current.x));
-			}
-			else
-			{
-				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
-				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
-				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
-				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
-
-				current.z = As<Short4>(PackUnsigned(current.z, current.x));
-				current.y = As<Short4>(PackUnsigned(current.y, current.w));
-
-				current.x = current.z;
-				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
-				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
-				current.y = current.z;
-				current.z = As<Short4>(UnpackLow(current.z, current.x));
-				current.y = As<Short4>(UnpackHigh(current.y, current.x));
-			}
-			break;
-		case VK_FORMAT_R8G8B8A8_UNORM:
-		case VK_FORMAT_R8G8B8A8_SRGB:
-		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
-		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
-			if(rgbaWriteMask == 0x7)
-			{
-				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
-				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
-				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
-
-				current.z = As<Short4>(PackUnsigned(current.x, current.z));
-				current.y = As<Short4>(PackUnsigned(current.y, current.y));
-
-				current.x = current.z;
-				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
-				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
-				current.y = current.z;
-				current.z = As<Short4>(UnpackLow(current.z, current.x));
-				current.y = As<Short4>(UnpackHigh(current.y, current.x));
-			}
-			else
-			{
-				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
-				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
-				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
-				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
-
-				current.z = As<Short4>(PackUnsigned(current.x, current.z));
-				current.y = As<Short4>(PackUnsigned(current.y, current.w));
-
-				current.x = current.z;
-				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
-				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
-				current.y = current.z;
-				current.z = As<Short4>(UnpackLow(current.z, current.x));
-				current.y = As<Short4>(UnpackHigh(current.y, current.x));
-			}
-			break;
-		case VK_FORMAT_R8G8_UNORM:
 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
-			current.x = As<Short4>(PackUnsigned(current.x, current.x));
+			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+
+			current.z = As<Short4>(PackUnsigned(current.z, current.x));
 			current.y = As<Short4>(PackUnsigned(current.y, current.y));
-			current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
-			break;
-		case VK_FORMAT_R8_UNORM:
-			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
-			current.x = As<Short4>(PackUnsigned(current.x, current.x));
-			break;
-		case VK_FORMAT_R16G16_UNORM:
-			current.z = current.x;
-			current.x = As<Short4>(UnpackLow(current.x, current.y));
-			current.z = As<Short4>(UnpackHigh(current.z, current.y));
+
+			current.x = current.z;
+			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
 			current.y = current.z;
-			break;
-		case VK_FORMAT_R16G16B16A16_UNORM:
-			transpose4x4(current.x, current.y, current.z, current.w);
-			break;
-		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
-		{
-			auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
-			auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
-			auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
-			auto a = (Int4(current.w) >> 14) & Int4(0x3);
-			Int4 packed = (a << 30) | (b << 20) | (g << 10) | r;
-			auto c02 = As<Int2>(Int4(packed.xzzz)); // TODO: auto c02 = packed.xz;
-			auto c13 = As<Int2>(Int4(packed.ywww)); // TODO: auto c13 = packed.yw;
-			current.x = UnpackLow(c02, c13);
-			current.y = UnpackHigh(c02, c13);
-			break;
-		}
-		default:
-			UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
-		}
-
-		Short4 c01 = current.z;
-		Short4 c23 = current.y;
-
-		Int xMask;   // Combination of all masks
-
-		if(state.depthTestActive)
-		{
-			xMask = zMask;
+			current.z = As<Short4>(UnpackLow(current.z, current.x));
+			current.y = As<Short4>(UnpackHigh(current.y, current.x));
 		}
 		else
 		{
-			xMask = cMask;
+			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
+
+			current.z = As<Short4>(PackUnsigned(current.z, current.x));
+			current.y = As<Short4>(PackUnsigned(current.y, current.w));
+
+			current.x = current.z;
+			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+			current.y = current.z;
+			current.z = As<Short4>(UnpackLow(current.z, current.x));
+			current.y = As<Short4>(UnpackHigh(current.y, current.x));
 		}
-
-		if(state.stencilActive)
+		break;
+	case VK_FORMAT_R8G8B8A8_UNORM:
+	case VK_FORMAT_R8G8B8A8_SRGB:
+	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+		if(rgbaWriteMask == 0x7)
 		{
-			xMask &= sMask;
+			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+
+			current.z = As<Short4>(PackUnsigned(current.x, current.z));
+			current.y = As<Short4>(PackUnsigned(current.y, current.y));
+
+			current.x = current.z;
+			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+			current.y = current.z;
+			current.z = As<Short4>(UnpackLow(current.z, current.x));
+			current.y = As<Short4>(UnpackHigh(current.y, current.x));
 		}
-
-		switch(state.targetFormat[index])
+		else
 		{
-		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
+
+			current.z = As<Short4>(PackUnsigned(current.x, current.z));
+			current.y = As<Short4>(PackUnsigned(current.y, current.w));
+
+			current.x = current.z;
+			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+			current.y = current.z;
+			current.z = As<Short4>(UnpackLow(current.z, current.x));
+			current.y = As<Short4>(UnpackHigh(current.y, current.x));
+		}
+		break;
+	case VK_FORMAT_R8G8_UNORM:
+		current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+		current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+		current.x = As<Short4>(PackUnsigned(current.x, current.x));
+		current.y = As<Short4>(PackUnsigned(current.y, current.y));
+		current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
+		break;
+	case VK_FORMAT_R8_UNORM:
+		current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+		current.x = As<Short4>(PackUnsigned(current.x, current.x));
+		break;
+	case VK_FORMAT_R16G16_UNORM:
+		current.z = current.x;
+		current.x = As<Short4>(UnpackLow(current.x, current.y));
+		current.z = As<Short4>(UnpackHigh(current.z, current.y));
+		current.y = current.z;
+		break;
+	case VK_FORMAT_R16G16B16A16_UNORM:
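+		// Transpose back from per-channel vectors to four RGBA pixels (SoA to AoS)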
+		transpose4x4(current.x, current.y, current.z, current.w);
+		break;
+	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+	{
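+		// Truncate each channel to 10 bits (alpha to 2 bits) and pack four pixels into 32-bit words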
+		auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
+		auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
+		auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
+		auto a = (Int4(current.w) >> 14) & Int4(0x3);
+		Int4 packed = (a << 30) | (b << 20) | (g << 10) | r;
+		auto c02 = As<Int2>(Int4(packed.xzzz)); // TODO: auto c02 = packed.xz;
+		auto c13 = As<Int2>(Int4(packed.ywww)); // TODO: auto c13 = packed.yw;
+		current.x = UnpackLow(c02, c13);
+		current.y = UnpackHigh(c02, c13);
+		break;
+	}
+	default:
+		UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
+	}
+
+	Short4 c01 = current.z;
+	Short4 c23 = current.y;
+
+	Int xMask;   // Combination of all masks
+
+	if(state.depthTestActive)
+	{
+		xMask = zMask;
+	}
+	else
+	{
+		xMask = cMask;
+	}
+
+	if(state.stencilActive)
+	{
+		xMask &= sMask;
+	}
+
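+	// Each format below performs a masked read-modify-write: load the destination pixels, select
+	// between new and old values per pixel with xMask (and per channel with the write mask), then store.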
+	switch(state.targetFormat[index])
+	{
+	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+		{
+			Pointer<Byte> buffer = cBuffer + 2 * x;
+			Int value = *Pointer<Int>(buffer);
+
+			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants,mask5551Q[bgraWriteMask & 0xF][0]));
+
+			Int c01 = Extract(As<Int2>(current.x), 0);
+			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
+			if(bgraWriteMask != 0x0000000F)
 			{
-				Pointer<Byte> buffer = cBuffer + 2 * x;
-				Int value = *Pointer<Int>(buffer);
-
-				Int channelMask = *Pointer<Int>(constants + OFFSET(Constants,mask5551Q[bgraWriteMask & 0xF][0]));
-
-				Int c01 = Extract(As<Int2>(current.x), 0);
-				Int mask01 = *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
-				if(bgraWriteMask != 0x0000000F)
-				{
-					mask01 &= channelMask;
-				}
-				*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
-
-				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
-				value = *Pointer<Int>(buffer);
-
-				Int c23 = Extract(As<Int2>(current.x), 1);
-				Int mask23 = *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
-				if(bgraWriteMask != 0x0000000F)
-				{
-					mask23 &= channelMask;
-				}
-				*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+				mask01 &= channelMask;
 			}
-			break;
-		case VK_FORMAT_R5G6B5_UNORM_PACK16:
+			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+			value = *Pointer<Int>(buffer);
+
+			Int c23 = Extract(As<Int2>(current.x), 1);
+			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
+			if(bgraWriteMask != 0x0000000F)
 			{
-				Pointer<Byte> buffer = cBuffer + 2 * x;
-				Int value = *Pointer<Int>(buffer);
-
-				Int channelMask = *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
-
-				Int c01 = Extract(As<Int2>(current.x), 0);
-				Int mask01 = *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
-				if((bgraWriteMask & 0x00000007) != 0x00000007)
-				{
-					mask01 &= channelMask;
-				}
-				*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
-
-				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
-				value = *Pointer<Int>(buffer);
-
-				Int c23 = Extract(As<Int2>(current.x), 1);
-				Int mask23 = *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
-				if((bgraWriteMask & 0x00000007) != 0x00000007)
-				{
-					mask23 &= channelMask;
-				}
-				*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+				mask23 &= channelMask;
 			}
-			break;
-		case VK_FORMAT_B8G8R8A8_UNORM:
-		case VK_FORMAT_B8G8R8A8_SRGB:
+			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		{
+			Pointer<Byte> buffer = cBuffer + 2 * x;
+			Int value = *Pointer<Int>(buffer);
+
+			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
+
+			Int c01 = Extract(As<Int2>(current.x), 0);
+			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
+			if((bgraWriteMask & 0x00000007) != 0x00000007)
 			{
-				Pointer<Byte> buffer = cBuffer + x * 4;
+				mask01 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+			value = *Pointer<Int>(buffer);
+
+			Int c23 = Extract(As<Int2>(current.x), 1);
+			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
+			if((bgraWriteMask & 0x00000007) != 0x00000007)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_B8G8R8A8_UNORM:
+	case VK_FORMAT_B8G8R8A8_SRGB:
+		{
+			Pointer<Byte> buffer = cBuffer + x * 4;
+			Short4 value = *Pointer<Short4>(buffer);
+			Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
+
+			Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+			if(bgraWriteMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+			value = *Pointer<Short4>(buffer);
+
+			Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+			if(bgraWriteMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_R8G8B8A8_UNORM:
+	case VK_FORMAT_R8G8B8A8_SRGB:
+	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+		{
+			Pointer<Byte> buffer = cBuffer + x * 4;
+			Short4 value = *Pointer<Short4>(buffer);
+			Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
+
+			Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+			if(rgbaWriteMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+			value = *Pointer<Short4>(buffer);
+
+			Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+			if(rgbaWriteMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_R8G8_UNORM:
+		if((rgbaWriteMask & 0x00000003) != 0x0)
+		{
+			Pointer<Byte> buffer = cBuffer + 2 * x;
+			Int2 value;
+			value = Insert(value, *Pointer<Int>(buffer), 0);
+			Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
+
+			Int2 packedCol = As<Int2>(current.x);
+
+			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
+			if((rgbaWriteMask & 0x3) != 0x3)
+			{
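+				// 5 * mask replicates the 2-bit RG mask into a 4-bit pattern (0b01 -> 0b0101, 0b10 -> 0b1010)
+				// so it can index the per-byte mask table.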
+				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
+				UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
+				mergedMask &= rgbaMask;
+			}
+
+			packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
+
+			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
+			*Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
+		}
+		break;
+	case VK_FORMAT_R8_UNORM:
+		if(rgbaWriteMask & 0x00000001)
+		{
+			Pointer<Byte> buffer = cBuffer + 1 * x;
+			Short4 value;
+			value = Insert(value, *Pointer<Short>(buffer), 0);
+			Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
+
+			current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
+			value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
+			current.x |= value;
+
+			*Pointer<Short>(buffer) = Extract(current.x, 0);
+			*Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
+		}
+		break;
+	case VK_FORMAT_R16G16_UNORM:
+		{
+			Pointer<Byte> buffer = cBuffer + 4 * x;
+
+			Short4 value = *Pointer<Short4>(buffer);
+
+			if((rgbaWriteMask & 0x00000003) != 0x00000003)
+			{
+				Short4 masked = value;
+				current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
+				masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
+				current.x |= masked;
+			}
+
+			current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+			value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
+			current.x |= value;
+			*Pointer<Short4>(buffer) = current.x;
+
+			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+
+			value = *Pointer<Short4>(buffer);
+
+			if((rgbaWriteMask & 0x00000003) != 0x00000003)
+			{
+				Short4 masked = value;
+				current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
+				masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
+				current.y |= masked;
+			}
+
+			current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+			value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
+			current.y |= value;
+			*Pointer<Short4>(buffer) = current.y;
+		}
+		break;
+	case VK_FORMAT_R16G16B16A16_UNORM:
+		{
+			Pointer<Byte> buffer = cBuffer + 8 * x;
+
+			{
 				Short4 value = *Pointer<Short4>(buffer);
-				Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
 
-				Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
-				if(bgraWriteMask != 0x0000000F)
-				{
-					mask01 &= channelMask;
-				}
-				*Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
-
-				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
-				value = *Pointer<Short4>(buffer);
-
-				Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
-				if(bgraWriteMask != 0x0000000F)
-				{
-					mask23 &= channelMask;
-				}
-				*Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
-			}
-			break;
-		case VK_FORMAT_R8G8B8A8_UNORM:
-		case VK_FORMAT_R8G8B8A8_SRGB:
-		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
-		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
-			{
-				Pointer<Byte> buffer = cBuffer + x * 4;
-				Short4 value = *Pointer<Short4>(buffer);
-				Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
-
-				Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
 				if(rgbaWriteMask != 0x0000000F)
 				{
-					mask01 &= channelMask;
-				}
-				*Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
-
-				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
-				value = *Pointer<Short4>(buffer);
-
-				Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
-				if(rgbaWriteMask != 0x0000000F)
-				{
-					mask23 &= channelMask;
-				}
-				*Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
-			}
-			break;
-		case VK_FORMAT_R8G8_UNORM:
-			if((rgbaWriteMask & 0x00000003) != 0x0)
-			{
-				Pointer<Byte> buffer = cBuffer + 2 * x;
-				Int2 value;
-				value = Insert(value, *Pointer<Int>(buffer), 0);
-				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-				value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
-
-				Int2 packedCol = As<Int2>(current.x);
-
-				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
-				if((rgbaWriteMask & 0x3) != 0x3)
-				{
-					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
-					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
-					mergedMask &= rgbaMask;
-				}
-
-				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
-
-				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
-				*Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
-			}
-			break;
-		case VK_FORMAT_R8_UNORM:
-			if(rgbaWriteMask & 0x00000001)
-			{
-				Pointer<Byte> buffer = cBuffer + 1 * x;
-				Short4 value;
-				value = Insert(value, *Pointer<Short>(buffer), 0);
-				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
-
-				current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
-				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
-				current.x |= value;
-
-				*Pointer<Short>(buffer) = Extract(current.x, 0);
-				*Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
-			}
-			break;
-		case VK_FORMAT_R16G16_UNORM:
-			{
-				Pointer<Byte> buffer = cBuffer + 4 * x;
-
-				Short4 value = *Pointer<Short4>(buffer);
-
-				if((rgbaWriteMask & 0x00000003) != 0x00000003)
-				{
 					Short4 masked = value;
-					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
-					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
+					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
 					current.x |= masked;
 				}
 
-				current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
-				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
+				current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
 				current.x |= value;
 				*Pointer<Short4>(buffer) = current.x;
+			}
 
-				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+			{
+				Short4 value = *Pointer<Short4>(buffer + 8);
 
-				value = *Pointer<Short4>(buffer);
-
-				if((rgbaWriteMask & 0x00000003) != 0x00000003)
+				if(rgbaWriteMask != 0x0000000F)
 				{
 					Short4 masked = value;
-					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
-					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
+					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
 					current.y |= masked;
 				}
 
-				current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
-				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
+				current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
 				current.y |= value;
-				*Pointer<Short4>(buffer) = current.y;
+				*Pointer<Short4>(buffer + 8) = current.y;
 			}
-			break;
-		case VK_FORMAT_R16G16B16A16_UNORM:
+
+			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+
 			{
-				Pointer<Byte> buffer = cBuffer + 8 * x;
+				Short4 value = *Pointer<Short4>(buffer);
 
+				if(rgbaWriteMask != 0x0000000F)
 				{
-					Short4 value = *Pointer<Short4>(buffer);
-
-					if(rgbaWriteMask != 0x0000000F)
-					{
-						Short4 masked = value;
-						current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
-						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
-						current.x |= masked;
-					}
-
-					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
-					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
-					current.x |= value;
-					*Pointer<Short4>(buffer) = current.x;
+					Short4 masked = value;
+					current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+					current.z |= masked;
 				}
 
-				{
-					Short4 value = *Pointer<Short4>(buffer + 8);
-
-					if(rgbaWriteMask != 0x0000000F)
-					{
-						Short4 masked = value;
-						current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
-						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
-						current.y |= masked;
-					}
-
-					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
-					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
-					current.y |= value;
-					*Pointer<Short4>(buffer + 8) = current.y;
-				}
-
-				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
-
-				{
-					Short4 value = *Pointer<Short4>(buffer);
-
-					if(rgbaWriteMask != 0x0000000F)
-					{
-						Short4 masked = value;
-						current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
-						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
-						current.z |= masked;
-					}
-
-					current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
-					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
-					current.z |= value;
-					*Pointer<Short4>(buffer) = current.z;
-				}
-
-				{
-					Short4 value = *Pointer<Short4>(buffer + 8);
-
-					if(rgbaWriteMask != 0x0000000F)
-					{
-						Short4 masked = value;
-						current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
-						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
-						current.w |= masked;
-					}
-
-					current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
-					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
-					current.w |= value;
-					*Pointer<Short4>(buffer + 8) = current.w;
-				}
+				current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
+				current.z |= value;
+				*Pointer<Short4>(buffer) = current.z;
 			}
-			break;
-			case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+
 			{
-				Pointer<Byte> buffer = cBuffer + 4 * x;
+				Short4 value = *Pointer<Short4>(buffer + 8);
 
-				buffer = cBuffer + 4 * x;
-				Int2 value = *Pointer<Int2>(buffer, 16);
-				Int2 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
-				if (rgbaWriteMask != 0xF)
+				if(rgbaWriteMask != 0x0000000F)
 				{
-					mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
+					Short4 masked = value;
+					current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+					current.w |= masked;
 				}
-				*Pointer<Int2>(buffer) = (As<Int2>(current.x) & mergedMask) | (value & ~mergedMask);
 
-				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-
-				value = *Pointer<Int2>(buffer, 16);
-				mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
-				if (rgbaWriteMask != 0xF)
-				{
-					mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
-				}
-				*Pointer<Int2>(buffer) = (As<Int2>(current.y) & mergedMask) | (value & ~mergedMask);
+				current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
+				current.w |= value;
+				*Pointer<Short4>(buffer + 8) = current.w;
 			}
-			break;
-		default:
-			UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
 		}
+		break;
+	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+		{
+			Pointer<Byte> buffer = cBuffer + 4 * x;
+
+			Int2 value = *Pointer<Int2>(buffer, 16);
+			Int2 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
+			if(rgbaWriteMask != 0xF)
+			{
+				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
+			}
+			*Pointer<Int2>(buffer) = (As<Int2>(current.x) & mergedMask) | (value & ~mergedMask);
+
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+			value = *Pointer<Int2>(buffer, 16);
+			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
+			if(rgbaWriteMask != 0xF)
+			{
+				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
+			}
+			*Pointer<Int2>(buffer) = (As<Int2>(current.y) & mergedMask) | (value & ~mergedMask);
+		}
+		break;
+	default:
+		UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
+	}
+}
+
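+// Computes the RGB blend factors selected by the pipeline state, following the
+// Vulkan blend factor table. For example, VK_BLEND_FACTOR_SRC_ALPHA replicates
+// the source alpha (oC.w) into all three color channels.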
+void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorActive)
+{
+	switch(blendFactorActive)
+	{
+	case VK_BLEND_FACTOR_ZERO:
+		blendFactor.x = Float4(0);
+		blendFactor.y = Float4(0);
+		blendFactor.z = Float4(0);
+		break;
+	case VK_BLEND_FACTOR_ONE:
+		blendFactor.x = Float4(1);
+		blendFactor.y = Float4(1);
+		blendFactor.z = Float4(1);
+		break;
+	case VK_BLEND_FACTOR_SRC_COLOR:
+		blendFactor.x = oC.x;
+		blendFactor.y = oC.y;
+		blendFactor.z = oC.z;
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
+		blendFactor.x = Float4(1.0f) - oC.x;
+		blendFactor.y = Float4(1.0f) - oC.y;
+		blendFactor.z = Float4(1.0f) - oC.z;
+		break;
+	case VK_BLEND_FACTOR_DST_COLOR:
+		blendFactor.x = pixel.x;
+		blendFactor.y = pixel.y;
+		blendFactor.z = pixel.z;
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
+		blendFactor.x = Float4(1.0f) - pixel.x;
+		blendFactor.y = Float4(1.0f) - pixel.y;
+		blendFactor.z = Float4(1.0f) - pixel.z;
+		break;
+	case VK_BLEND_FACTOR_SRC_ALPHA:
+		blendFactor.x = oC.w;
+		blendFactor.y = oC.w;
+		blendFactor.z = oC.w;
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
+		blendFactor.x = Float4(1.0f) - oC.w;
+		blendFactor.y = Float4(1.0f) - oC.w;
+		blendFactor.z = Float4(1.0f) - oC.w;
+		break;
+	case VK_BLEND_FACTOR_DST_ALPHA:
+		blendFactor.x = pixel.w;
+		blendFactor.y = pixel.w;
+		blendFactor.z = pixel.w;
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
+		blendFactor.x = Float4(1.0f) - pixel.w;
+		blendFactor.y = Float4(1.0f) - pixel.w;
+		blendFactor.z = Float4(1.0f) - pixel.w;
+		break;
+	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
+		blendFactor.x = Float4(1.0f) - pixel.w;
+		blendFactor.x = Min(blendFactor.x, oC.w);
+		blendFactor.y = blendFactor.x;
+		blendFactor.z = blendFactor.x;
+		break;
+	case VK_BLEND_FACTOR_CONSTANT_COLOR:
+		blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
+		blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
+		blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
+		break;
+	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
+		blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
+		blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
+		blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
+		blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
+		blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
+		blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
+		blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
+		blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
+		blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
+		break;
+	default:
+		UNIMPLEMENTED("VkBlendFactor: %d", int(blendFactorActive));
+	}
+}
+
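+// Computes the alpha-channel blend factor. Only blendFactor.w is written; note
+// that VK_BLEND_FACTOR_SRC_ALPHA_SATURATE is defined to be 1.0 for alpha.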
+void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorAlphaActive)
+{
+	switch(blendFactorAlphaActive)
+	{
+	case VK_BLEND_FACTOR_ZERO:
+		blendFactor.w = Float4(0);
+		break;
+	case VK_BLEND_FACTOR_ONE:
+		blendFactor.w = Float4(1);
+		break;
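+	// For the alpha channel the *_COLOR factors reduce to the corresponding
+	// alpha component.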
+	case VK_BLEND_FACTOR_SRC_COLOR:
+		blendFactor.w = oC.w;
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
+		blendFactor.w = Float4(1.0f) - oC.w;
+		break;
+	case VK_BLEND_FACTOR_DST_COLOR:
+		blendFactor.w = pixel.w;
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
+		blendFactor.w = Float4(1.0f) - pixel.w;
+		break;
+	case VK_BLEND_FACTOR_SRC_ALPHA:
+		blendFactor.w = oC.w;
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
+		blendFactor.w = Float4(1.0f) - oC.w;
+		break;
+	case VK_BLEND_FACTOR_DST_ALPHA:
+		blendFactor.w = pixel.w;
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
+		blendFactor.w = Float4(1.0f) - pixel.w;
+		break;
+	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
+		blendFactor.w = Float4(1.0f);
+		break;
+	case VK_BLEND_FACTOR_CONSTANT_COLOR:
+	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
+		blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
+		break;
+	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
+	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
+		blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
+		break;
+	default:
+		UNIMPLEMENTED("VkBlendFactor: %d", int(blendFactorAlphaActive));
+	}
+}
+
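+// Applies fixed-function blending for floating-point and non-normalized integer
+// attachments: load the destination pixel, scale source and destination by their
+// blend factors, then combine them with the configured blend operations.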
+void PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, Vector4f &oC, const Int &x)
+{
+	if(!state.blendState[index].alphaBlendEnable)
+	{
+		return;
 	}
 
-	void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorActive)
-	{
-		switch(blendFactorActive)
-		{
-		case VK_BLEND_FACTOR_ZERO:
-			blendFactor.x = Float4(0);
-			blendFactor.y = Float4(0);
-			blendFactor.z = Float4(0);
-			break;
-		case VK_BLEND_FACTOR_ONE:
-			blendFactor.x = Float4(1);
-			blendFactor.y = Float4(1);
-			blendFactor.z = Float4(1);
-			break;
-		case VK_BLEND_FACTOR_SRC_COLOR:
-			blendFactor.x = oC.x;
-			blendFactor.y = oC.y;
-			blendFactor.z = oC.z;
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
-			blendFactor.x = Float4(1.0f) - oC.x;
-			blendFactor.y = Float4(1.0f) - oC.y;
-			blendFactor.z = Float4(1.0f) - oC.z;
-			break;
-		case VK_BLEND_FACTOR_DST_COLOR:
-			blendFactor.x = pixel.x;
-			blendFactor.y = pixel.y;
-			blendFactor.z = pixel.z;
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
-			blendFactor.x = Float4(1.0f) - pixel.x;
-			blendFactor.y = Float4(1.0f) - pixel.y;
-			blendFactor.z = Float4(1.0f) - pixel.z;
-			break;
-		case VK_BLEND_FACTOR_SRC_ALPHA:
-			blendFactor.x = oC.w;
-			blendFactor.y = oC.w;
-			blendFactor.z = oC.w;
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
-			blendFactor.x = Float4(1.0f) - oC.w;
-			blendFactor.y = Float4(1.0f) - oC.w;
-			blendFactor.z = Float4(1.0f) - oC.w;
-			break;
-		case VK_BLEND_FACTOR_DST_ALPHA:
-			blendFactor.x = pixel.w;
-			blendFactor.y = pixel.w;
-			blendFactor.z = pixel.w;
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
-			blendFactor.x = Float4(1.0f) - pixel.w;
-			blendFactor.y = Float4(1.0f) - pixel.w;
-			blendFactor.z = Float4(1.0f) - pixel.w;
-			break;
-		case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
-			blendFactor.x = Float4(1.0f) - pixel.w;
-			blendFactor.x = Min(blendFactor.x, oC.w);
-			blendFactor.y = blendFactor.x;
-			blendFactor.z = blendFactor.x;
-			break;
-		case VK_BLEND_FACTOR_CONSTANT_COLOR:
-			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
-			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
-			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
-			break;
-		case VK_BLEND_FACTOR_CONSTANT_ALPHA:
-			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
-			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
-			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
-			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
-			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
-			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
-			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
-			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
-			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
-			break;
+	Pointer<Byte> buffer;
 
-		default:
-			UNIMPLEMENTED("VkBlendFactor: %d", int(blendFactorActive));
-		}
+	// 'pixel' holds the color values of the four destination texels.
+	// Note: Despite the type being Vector4f, the colors may be stored as
+	// integers. Half-floats are stored as full 32-bit floats.
+	// Formats that are neither float nor fixed-point are not alpha blended.
+	Vector4f pixel;
+
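+	// 'one' holds the encoding of the maximum channel value used to fill in
+	// missing components: 1.0f for float formats, or the largest representable
+	// (unsigned or signed positive) integer bit pattern for non-normalized
+	// integer formats.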
+	Float4 one;
+	vk::Format format(state.targetFormat[index]);
+	if(format.isFloatFormat())
+	{
+		one = Float4(1.0f);
+	}
+	else if(format.isNonNormalizedInteger())
+	{
+		one = As<Float4>(format.isUnsignedComponent(0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
 	}
 
-	void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorAlphaActive)
+	switch(state.targetFormat[index])
 	{
-		switch(blendFactorAlphaActive)
-		{
-		case VK_BLEND_FACTOR_ZERO:
-			blendFactor.w = Float4(0);
-			break;
-		case VK_BLEND_FACTOR_ONE:
-			blendFactor.w = Float4(1);
-			break;
-		case VK_BLEND_FACTOR_SRC_COLOR:
-			blendFactor.w = oC.w;
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
-			blendFactor.w = Float4(1.0f) - oC.w;
-			break;
-		case VK_BLEND_FACTOR_DST_COLOR:
-			blendFactor.w = pixel.w;
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
-			blendFactor.w = Float4(1.0f) - pixel.w;
-			break;
-		case VK_BLEND_FACTOR_SRC_ALPHA:
-			blendFactor.w = oC.w;
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
-			blendFactor.w = Float4(1.0f) - oC.w;
-			break;
-		case VK_BLEND_FACTOR_DST_ALPHA:
-			blendFactor.w = pixel.w;
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
-			blendFactor.w = Float4(1.0f) - pixel.w;
-			break;
-		case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
-			blendFactor.w = Float4(1.0f);
-			break;
-		case VK_BLEND_FACTOR_CONSTANT_COLOR:
-		case VK_BLEND_FACTOR_CONSTANT_ALPHA:
-			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
-			break;
-		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
-		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
-			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
-			break;
-		default:
-			UNIMPLEMENTED("VkBlendFactor: %d", int(blendFactorAlphaActive));
-		}
+	case VK_FORMAT_R32_SINT:
+	case VK_FORMAT_R32_UINT:
+	case VK_FORMAT_R32_SFLOAT:
+		buffer = cBuffer;
+		// FIXME: movlps
+		pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
+		pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
+		buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+		// FIXME: movhps
+		pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
+		pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
+		pixel.y = pixel.z = pixel.w = one;
+		break;
+	case VK_FORMAT_R32G32_SINT:
+	case VK_FORMAT_R32G32_UINT:
+	case VK_FORMAT_R32G32_SFLOAT:
+		buffer = cBuffer;
+		pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
+		buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+		pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
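+		// Deinterleave the two RG pairs per row: gather the four R values into
+		// pixel.x and the four G values into pixel.y.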
+		pixel.z = pixel.x;
+		pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x0202);
+		pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0x1313);
+		pixel.y = pixel.z;
+		pixel.z = pixel.w = one;
+		break;
+	case VK_FORMAT_R32G32B32A32_SFLOAT:
+	case VK_FORMAT_R32G32B32A32_SINT:
+	case VK_FORMAT_R32G32B32A32_UINT:
+		buffer = cBuffer;
+		pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
+		pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
+		buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+		pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
+		pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
+		transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
+		break;
+	case VK_FORMAT_R16_SFLOAT:
+		buffer = cBuffer;
+		pixel.x.x = Float(*Pointer<Half>(buffer + 2 * x + 0));
+		pixel.x.y = Float(*Pointer<Half>(buffer + 2 * x + 2));
+		buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+		pixel.x.z = Float(*Pointer<Half>(buffer + 2 * x + 0));
+		pixel.x.w = Float(*Pointer<Half>(buffer + 2 * x + 2));
+		pixel.y = pixel.z = pixel.w = one;
+		break;
+	case VK_FORMAT_R16G16_SFLOAT:
+		buffer = cBuffer;
+		pixel.x.x = Float(*Pointer<Half>(buffer + 4 * x + 0));
+		pixel.y.x = Float(*Pointer<Half>(buffer + 4 * x + 2));
+		pixel.x.y = Float(*Pointer<Half>(buffer + 4 * x + 4));
+		pixel.y.y = Float(*Pointer<Half>(buffer + 4 * x + 6));
+		buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+		pixel.x.z = Float(*Pointer<Half>(buffer + 4 * x + 0));
+		pixel.y.z = Float(*Pointer<Half>(buffer + 4 * x + 2));
+		pixel.x.w = Float(*Pointer<Half>(buffer + 4 * x + 4));
+		pixel.y.w = Float(*Pointer<Half>(buffer + 4 * x + 6));
+		pixel.z = pixel.w = one;
+		break;
+	case VK_FORMAT_R16G16B16A16_SFLOAT:
+		buffer = cBuffer;
+		pixel.x.x = Float(*Pointer<Half>(buffer + 8 * x + 0x0));
+		pixel.y.x = Float(*Pointer<Half>(buffer + 8 * x + 0x2));
+		pixel.z.x = Float(*Pointer<Half>(buffer + 8 * x + 0x4));
+		pixel.w.x = Float(*Pointer<Half>(buffer + 8 * x + 0x6));
+		pixel.x.y = Float(*Pointer<Half>(buffer + 8 * x + 0x8));
+		pixel.y.y = Float(*Pointer<Half>(buffer + 8 * x + 0xa));
+		pixel.z.y = Float(*Pointer<Half>(buffer + 8 * x + 0xc));
+		pixel.w.y = Float(*Pointer<Half>(buffer + 8 * x + 0xe));
+		buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+		pixel.x.z = Float(*Pointer<Half>(buffer + 8 * x + 0x0));
+		pixel.y.z = Float(*Pointer<Half>(buffer + 8 * x + 0x2));
+		pixel.z.z = Float(*Pointer<Half>(buffer + 8 * x + 0x4));
+		pixel.w.z = Float(*Pointer<Half>(buffer + 8 * x + 0x6));
+		pixel.x.w = Float(*Pointer<Half>(buffer + 8 * x + 0x8));
+		pixel.y.w = Float(*Pointer<Half>(buffer + 8 * x + 0xa));
+		pixel.z.w = Float(*Pointer<Half>(buffer + 8 * x + 0xc));
+		pixel.w.w = Float(*Pointer<Half>(buffer + 8 * x + 0xe));
+		break;
+	default:
+		UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
 	}
 
-	void PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, Vector4f &oC, const Int &x)
+	// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
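+	// For example, VK_BLEND_OP_ADD with the classic SRC_ALPHA /
+	// ONE_MINUS_SRC_ALPHA factors yields
+	// oC.rgb = oC.rgb * oC.w + pixel.rgb * (1.0 - oC.w).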
+	Vector4f sourceFactor;
+	Vector4f destFactor;
+
+	blendFactor(sourceFactor, oC, pixel, state.blendState[index].sourceBlendFactor);
+	blendFactor(destFactor, oC, pixel, state.blendState[index].destBlendFactor);
+
+	oC.x *= sourceFactor.x;
+	oC.y *= sourceFactor.y;
+	oC.z *= sourceFactor.z;
+
+	pixel.x *= destFactor.x;
+	pixel.y *= destFactor.y;
+	pixel.z *= destFactor.z;
+
+	switch(state.blendState[index].blendOperation)
 	{
-		if(!state.blendState[index].alphaBlendEnable)
+	case VK_BLEND_OP_ADD:
+		oC.x += pixel.x;
+		oC.y += pixel.y;
+		oC.z += pixel.z;
+		break;
+	case VK_BLEND_OP_SUBTRACT:
+		oC.x -= pixel.x;
+		oC.y -= pixel.y;
+		oC.z -= pixel.z;
+		break;
+	case VK_BLEND_OP_REVERSE_SUBTRACT:
+		oC.x = pixel.x - oC.x;
+		oC.y = pixel.y - oC.y;
+		oC.z = pixel.z - oC.z;
+		break;
+	case VK_BLEND_OP_MIN:
+		oC.x = Min(oC.x, pixel.x);
+		oC.y = Min(oC.y, pixel.y);
+		oC.z = Min(oC.z, pixel.z);
+		break;
+	case VK_BLEND_OP_MAX:
+		oC.x = Max(oC.x, pixel.x);
+		oC.y = Max(oC.y, pixel.y);
+		oC.z = Max(oC.z, pixel.z);
+		break;
+	case VK_BLEND_OP_SRC_EXT:
+		// No operation
+		break;
+	case VK_BLEND_OP_DST_EXT:
+		oC.x = pixel.x;
+		oC.y = pixel.y;
+		oC.z = pixel.z;
+		break;
+	case VK_BLEND_OP_ZERO_EXT:
+		oC.x = Float4(0.0f);
+		oC.y = Float4(0.0f);
+		oC.z = Float4(0.0f);
+		break;
+	default:
+		UNIMPLEMENTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
+	}
+
+	blendFactorAlpha(sourceFactor, oC, pixel, state.blendState[index].sourceBlendFactorAlpha);
+	blendFactorAlpha(destFactor, oC, pixel, state.blendState[index].destBlendFactorAlpha);
+
+	oC.w *= sourceFactor.w;
+	pixel.w *= destFactor.w;
+
+	switch(state.blendState[index].blendOperationAlpha)
+	{
+	case VK_BLEND_OP_ADD:
+		oC.w += pixel.w;
+		break;
+	case VK_BLEND_OP_SUBTRACT:
+		oC.w -= pixel.w;
+		break;
+	case VK_BLEND_OP_REVERSE_SUBTRACT:
+		pixel.w -= oC.w;
+		oC.w = pixel.w;
+		break;
+	case VK_BLEND_OP_MIN:
+		oC.w = Min(oC.w, pixel.w);
+		break;
+	case VK_BLEND_OP_MAX:
+		oC.w = Max(oC.w, pixel.w);
+		break;
+	case VK_BLEND_OP_SRC_EXT:
+		// No operation
+		break;
+	case VK_BLEND_OP_DST_EXT:
+		oC.w = pixel.w;
+		break;
+	case VK_BLEND_OP_ZERO_EXT:
+		oC.w = Float4(0.0f);
+		break;
+	default:
+		UNIMPLEMENTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
+	}
+}
+
+void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &oC, const Int &sMask, const Int &zMask, const Int &cMask)
+{
+	switch(state.targetFormat[index])
+	{
+	case VK_FORMAT_R16_SFLOAT:
+	case VK_FORMAT_R32_SFLOAT:
+	case VK_FORMAT_R32_SINT:
+	case VK_FORMAT_R32_UINT:
+	case VK_FORMAT_R16_SINT:
+	case VK_FORMAT_R16_UINT:
+	case VK_FORMAT_R8_SINT:
+	case VK_FORMAT_R8_UINT:
+	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
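+		// Single-channel formats need no repacking: oC.x already holds the
+		// component for all four pixels.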
+		break;
+	case VK_FORMAT_R16G16_SFLOAT:
+	case VK_FORMAT_R32G32_SFLOAT:
+	case VK_FORMAT_R32G32_SINT:
+	case VK_FORMAT_R32G32_UINT:
+	case VK_FORMAT_R16G16_SINT:
+	case VK_FORMAT_R16G16_UINT:
+	case VK_FORMAT_R8G8_SINT:
+	case VK_FORMAT_R8G8_UINT:
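+		// Interleave the R and G channel vectors into per-pixel RG pairs, two
+		// pixels per vector.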
+		oC.z = oC.x;
+		oC.x = UnpackLow(oC.x, oC.y);
+		oC.z = UnpackHigh(oC.z, oC.y);
+		oC.y = oC.z;
+		break;
+	case VK_FORMAT_R16G16B16A16_SFLOAT:
+	case VK_FORMAT_R32G32B32A32_SFLOAT:
+	case VK_FORMAT_R32G32B32A32_SINT:
+	case VK_FORMAT_R32G32B32A32_UINT:
+	case VK_FORMAT_R16G16B16A16_SINT:
+	case VK_FORMAT_R16G16B16A16_UINT:
+	case VK_FORMAT_R8G8B8A8_SINT:
+	case VK_FORMAT_R8G8B8A8_UINT:
+	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
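+		// Convert from one-vector-per-channel to one-vector-per-pixel layout.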
+		transpose4x4(oC.x, oC.y, oC.z, oC.w);
+		break;
+	default:
+		UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
+	}
+
+	int rgbaWriteMask = state.colorWriteActive(index);
+
+	Int xMask;   // Combination of all masks
+
+	if(state.depthTestActive)
+	{
+		xMask = zMask;
+	}
+	else
+	{
+		xMask = cMask;
+	}
+
+	if(state.stencilActive)
+	{
+		xMask &= sMask;
+	}
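+	// xMask has one bit per pixel in the quad; writes to masked-out pixels are
+	// suppressed below by merging with the existing framebuffer contents.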
+
+	auto targetFormat = state.targetFormat[index];
+
+	Pointer<Byte> buffer;
+	Float4 value;
+
+	switch(targetFormat)
+	{
+	case VK_FORMAT_R32_SFLOAT:
+	case VK_FORMAT_R32_SINT:
+	case VK_FORMAT_R32_UINT:
+		if(rgbaWriteMask & 0x00000001)
 		{
-			return;
-		}
+			buffer = cBuffer + 4 * x;
 
-		Pointer<Byte> buffer;
-
-		// pixel holds four texel color values.
-		// Note: Despite the type being Vector4f, the colors may be stored as
-		// integers. Half-floats are stored as full 32-bit floats.
-		// Non-float and non-fixed point formats are not alpha blended.
-		Vector4f pixel;
-
-		Vector4s color;
-		Short4 c01;
-		Short4 c23;
-
-		Float4 one;
-		vk::Format format(state.targetFormat[index]);
-		if(format.isFloatFormat())
-		{
-			one = Float4(1.0f);
-		}
-		else if(format.isNonNormalizedInteger())
-		{
-			one = As<Float4>(format.isUnsignedComponent(0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
-		}
-
-		switch(state.targetFormat[index])
-		{
-		case VK_FORMAT_R32_SINT:
-		case VK_FORMAT_R32_UINT:
-		case VK_FORMAT_R32_SFLOAT:
-			buffer = cBuffer;
 			// FIXME: movlps
-			pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
-			pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
+			value.x = *Pointer<Float>(buffer + 0);
+			value.y = *Pointer<Float>(buffer + 4);
+
 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+
 			// FIXME: movhps
-			pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
-			pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
-			pixel.y = pixel.z = pixel.w = one;
-			break;
-		case VK_FORMAT_R32G32_SINT:
-		case VK_FORMAT_R32G32_UINT:
-		case VK_FORMAT_R32G32_SFLOAT:
-			buffer = cBuffer;
-			pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
-			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
-			pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
-			pixel.z = pixel.x;
-			pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x0202);
-			pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0x1313);
-			pixel.y = pixel.z;
-			pixel.z = pixel.w = one;
-			break;
-		case VK_FORMAT_R32G32B32A32_SFLOAT:
-		case VK_FORMAT_R32G32B32A32_SINT:
-		case VK_FORMAT_R32G32B32A32_UINT:
-			buffer = cBuffer;
-			pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
-			pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
-			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
-			pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
-			pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
-			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
-			break;
-		case VK_FORMAT_R16_SFLOAT:
-			buffer = cBuffer;
-			pixel.x.x = Float(*Pointer<Half>(buffer + 2 * x + 0));
-			pixel.x.y = Float(*Pointer<Half>(buffer + 2 * x + 2));
-			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
-			pixel.x.z = Float(*Pointer<Half>(buffer + 2 * x + 0));
-			pixel.x.w = Float(*Pointer<Half>(buffer + 2 * x + 2));
-			pixel.y = pixel.z = pixel.w = one;
-			break;
-		case VK_FORMAT_R16G16_SFLOAT:
-			buffer = cBuffer;
-			pixel.x.x = Float(*Pointer<Half>(buffer + 4 * x + 0));
-			pixel.y.x = Float(*Pointer<Half>(buffer + 4 * x + 2));
-			pixel.x.y = Float(*Pointer<Half>(buffer + 4 * x + 4));
-			pixel.y.y = Float(*Pointer<Half>(buffer + 4 * x + 6));
-			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
-			pixel.x.z = Float(*Pointer<Half>(buffer + 4 * x + 0));
-			pixel.y.z = Float(*Pointer<Half>(buffer + 4 * x + 2));
-			pixel.x.w = Float(*Pointer<Half>(buffer + 4 * x + 4));
-			pixel.y.w = Float(*Pointer<Half>(buffer + 4 * x + 6));
-			pixel.z = pixel.w = one;
-			break;
-		case VK_FORMAT_R16G16B16A16_SFLOAT:
-			buffer = cBuffer;
-			pixel.x.x = Float(*Pointer<Half>(buffer + 8 * x + 0x0));
-			pixel.y.x = Float(*Pointer<Half>(buffer + 8 * x + 0x2));
-			pixel.z.x = Float(*Pointer<Half>(buffer + 8 * x + 0x4));
-			pixel.w.x = Float(*Pointer<Half>(buffer + 8 * x + 0x6));
-			pixel.x.y = Float(*Pointer<Half>(buffer + 8 * x + 0x8));
-			pixel.y.y = Float(*Pointer<Half>(buffer + 8 * x + 0xa));
-			pixel.z.y = Float(*Pointer<Half>(buffer + 8 * x + 0xc));
-			pixel.w.y = Float(*Pointer<Half>(buffer + 8 * x + 0xe));
-			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
-			pixel.x.z = Float(*Pointer<Half>(buffer + 8 * x + 0x0));
-			pixel.y.z = Float(*Pointer<Half>(buffer + 8 * x + 0x2));
-			pixel.z.z = Float(*Pointer<Half>(buffer + 8 * x + 0x4));
-			pixel.w.z = Float(*Pointer<Half>(buffer + 8 * x + 0x6));
-			pixel.x.w = Float(*Pointer<Half>(buffer + 8 * x + 0x8));
-			pixel.y.w = Float(*Pointer<Half>(buffer + 8 * x + 0xa));
-			pixel.z.w = Float(*Pointer<Half>(buffer + 8 * x + 0xc));
-			pixel.w.w = Float(*Pointer<Half>(buffer + 8 * x + 0xe));
-			break;
-		default:
-			UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
+			value.z = *Pointer<Float>(buffer + 0);
+			value.w = *Pointer<Float>(buffer + 4);
+
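+			// Masked write: keep oC.x where xMask selects the pixel, and the
+			// existing framebuffer value elsewhere.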
+			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
+			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
+			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
+
+			// FIXME: movhps
+			*Pointer<Float>(buffer + 0) = oC.x.z;
+			*Pointer<Float>(buffer + 4) = oC.x.w;
+
+			buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+
+			// FIXME: movlps
+			*Pointer<Float>(buffer + 0) = oC.x.x;
+			*Pointer<Float>(buffer + 4) = oC.x.y;
 		}
-
-		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
-		Vector4f sourceFactor;
-		Vector4f destFactor;
-
-		blendFactor(sourceFactor, oC, pixel, state.blendState[index].sourceBlendFactor);
-		blendFactor(destFactor, oC, pixel, state.blendState[index].destBlendFactor);
-
-		oC.x *= sourceFactor.x;
-		oC.y *= sourceFactor.y;
-		oC.z *= sourceFactor.z;
-
-		pixel.x *= destFactor.x;
-		pixel.y *= destFactor.y;
-		pixel.z *= destFactor.z;
-
-		switch(state.blendState[index].blendOperation)
+		break;
+	case VK_FORMAT_R16_SFLOAT:
+		if(rgbaWriteMask & 0x00000001)
 		{
-		case VK_BLEND_OP_ADD:
-			oC.x += pixel.x;
-			oC.y += pixel.y;
-			oC.z += pixel.z;
-			break;
-		case VK_BLEND_OP_SUBTRACT:
-			oC.x -= pixel.x;
-			oC.y -= pixel.y;
-			oC.z -= pixel.z;
-			break;
-		case VK_BLEND_OP_REVERSE_SUBTRACT:
-			oC.x = pixel.x - oC.x;
-			oC.y = pixel.y - oC.y;
-			oC.z = pixel.z - oC.z;
-			break;
-		case VK_BLEND_OP_MIN:
-			oC.x = Min(oC.x, pixel.x);
-			oC.y = Min(oC.y, pixel.y);
-			oC.z = Min(oC.z, pixel.z);
-			break;
-		case VK_BLEND_OP_MAX:
-			oC.x = Max(oC.x, pixel.x);
-			oC.y = Max(oC.y, pixel.y);
-			oC.z = Max(oC.z, pixel.z);
-			break;
-		case VK_BLEND_OP_SRC_EXT:
-			// No operation
-			break;
-		case VK_BLEND_OP_DST_EXT:
-			oC.x = pixel.x;
-			oC.y = pixel.y;
-			oC.z = pixel.z;
-			break;
-		case VK_BLEND_OP_ZERO_EXT:
-			oC.x = Float4(0.0f);
-			oC.y = Float4(0.0f);
-			oC.z = Float4(0.0f);
-			break;
-		default:
-			UNIMPLEMENTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
+			buffer = cBuffer + 2 * x;
+
+			value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 0);
+			value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 1);
+
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+			value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 2);
+			value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 3);
+
+			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
+			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
+			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
+
+			*Pointer<Half>(buffer + 0) = Half(oC.x.z);
+			*Pointer<Half>(buffer + 2) = Half(oC.x.w);
+
+			buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+			*Pointer<Half>(buffer + 0) = Half(oC.x.x);
+			*Pointer<Half>(buffer + 2) = Half(oC.x.y);
 		}
-
-		blendFactorAlpha(sourceFactor, oC, pixel, state.blendState[index].sourceBlendFactorAlpha);
-		blendFactorAlpha(destFactor, oC, pixel, state.blendState[index].destBlendFactorAlpha);
-
-		oC.w *= sourceFactor.w;
-		pixel.w *= destFactor.w;
-
-		switch(state.blendState[index].blendOperationAlpha)
+		break;
+	case VK_FORMAT_R16_SINT:
+	case VK_FORMAT_R16_UINT:
+		if(rgbaWriteMask & 0x00000001)
 		{
-		case VK_BLEND_OP_ADD:
-			oC.w += pixel.w;
-			break;
-		case VK_BLEND_OP_SUBTRACT:
-			oC.w -= pixel.w;
-			break;
-		case VK_BLEND_OP_REVERSE_SUBTRACT:
-			pixel.w -= oC.w;
-			oC.w = pixel.w;
-			break;
-		case VK_BLEND_OP_MIN:
-			oC.w = Min(oC.w, pixel.w);
-			break;
-		case VK_BLEND_OP_MAX:
-			oC.w = Max(oC.w, pixel.w);
-			break;
-		case VK_BLEND_OP_SRC_EXT:
-			// No operation
-			break;
-		case VK_BLEND_OP_DST_EXT:
-			oC.w = pixel.w;
-			break;
-		case VK_BLEND_OP_ZERO_EXT:
-			oC.w = Float4(0.0f);
-			break;
-		default:
-			UNIMPLEMENTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
-		}
-	}
+			buffer = cBuffer + 2 * x;
 
-	void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &oC, const Int &sMask, const Int &zMask, const Int &cMask)
-	{
-		switch(state.targetFormat[index])
-		{
-		case VK_FORMAT_R16_SFLOAT:
-		case VK_FORMAT_R32_SFLOAT:
-		case VK_FORMAT_R32_SINT:
-		case VK_FORMAT_R32_UINT:
-		case VK_FORMAT_R16_SINT:
-		case VK_FORMAT_R16_UINT:
-		case VK_FORMAT_R8_SINT:
-		case VK_FORMAT_R8_UINT:
-		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
-			break;
-		case VK_FORMAT_R16G16_SFLOAT:
-		case VK_FORMAT_R32G32_SFLOAT:
-		case VK_FORMAT_R32G32_SINT:
-		case VK_FORMAT_R32G32_UINT:
-		case VK_FORMAT_R16G16_SINT:
-		case VK_FORMAT_R16G16_UINT:
-		case VK_FORMAT_R8G8_SINT:
-		case VK_FORMAT_R8G8_UINT:
-			oC.z = oC.x;
-			oC.x = UnpackLow(oC.x, oC.y);
-			oC.z = UnpackHigh(oC.z, oC.y);
-			oC.y = oC.z;
-			break;
-		case VK_FORMAT_R16G16B16A16_SFLOAT:
-		case VK_FORMAT_R32G32B32A32_SFLOAT:
-		case VK_FORMAT_R32G32B32A32_SINT:
-		case VK_FORMAT_R32G32B32A32_UINT:
-		case VK_FORMAT_R16G16B16A16_SINT:
-		case VK_FORMAT_R16G16B16A16_UINT:
-		case VK_FORMAT_R8G8B8A8_SINT:
-		case VK_FORMAT_R8G8B8A8_UINT:
-		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
-			transpose4x4(oC.x, oC.y, oC.z, oC.w);
-			break;
-		default:
-			UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
-		}
+			UShort4 xyzw;
+			xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
 
-		int rgbaWriteMask = state.colorWriteActive(index);
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
 
-		Int xMask;   // Combination of all masks
+			xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
+			value = As<Float4>(Int4(xyzw));
 
-		if(state.depthTestActive)
-		{
-			xMask = zMask;
-		}
-		else
-		{
-			xMask = cMask;
-		}
+			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
+			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
+			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
 
-		if(state.stencilActive)
-		{
-			xMask &= sMask;
-		}
-
-		auto targetFormat = state.targetFormat[index];
-
-		Pointer<Byte> buffer;
-		Float4 value;
-
-		switch(targetFormat)
-		{
-		case VK_FORMAT_R32_SFLOAT:
-		case VK_FORMAT_R32_SINT:
-		case VK_FORMAT_R32_UINT:
-			if(rgbaWriteMask & 0x00000001)
+			if(targetFormat == VK_FORMAT_R16_SINT)
 			{
-				buffer = cBuffer + 4 * x;
-
-				// FIXME: movlps
-				value.x = *Pointer<Float>(buffer + 0);
-				value.y = *Pointer<Float>(buffer + 4);
-
-				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
-
-				// FIXME: movhps
-				value.z = *Pointer<Float>(buffer + 0);
-				value.w = *Pointer<Float>(buffer + 4);
-
-				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
-				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
-				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
-
-				// FIXME: movhps
-				*Pointer<Float>(buffer + 0) = oC.x.z;
-				*Pointer<Float>(buffer + 4) = oC.x.w;
-
-				buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
-
-				// FIXME: movlps
-				*Pointer<Float>(buffer + 0) = oC.x.x;
-				*Pointer<Float>(buffer + 4) = oC.x.y;
-			}
-			break;
-		case VK_FORMAT_R16_SFLOAT:
-			if(rgbaWriteMask & 0x00000001)
-			{
-				buffer = cBuffer + 2 * x;
-
-				value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 0);
-				value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 1);
-
-				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-
-				value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 2);
-				value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 3);
-
-				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
-				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
-				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
-
-				*Pointer<Half>(buffer + 0) = Half(oC.x.z);
-				*Pointer<Half>(buffer + 2) = Half(oC.x.w);
+				Float component = oC.x.z;
+				*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
+				component = oC.x.w;
+				*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
 
 				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
 
-				*Pointer<Half>(buffer + 0) = Half(oC.x.x);
-				*Pointer<Half>(buffer + 2) = Half(oC.x.y);
+				component = oC.x.x;
+				*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
+				component = oC.x.y;
+				*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
 			}
-			break;
-		case VK_FORMAT_R16_SINT:
-		case VK_FORMAT_R16_UINT:
-			if(rgbaWriteMask & 0x00000001)
+			else // VK_FORMAT_R16_UINT
 			{
-				buffer = cBuffer + 2 * x;
+				Float component = oC.x.z;
+				*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
+				component = oC.x.w;
+				*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
 
-				UShort4 xyzw;
-				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
-
-				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-
-				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
-				value = As<Float4>(Int4(xyzw));
-
-				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
-				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
-				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
-
-				if(targetFormat == VK_FORMAT_R16_SINT)
-				{
-					Float component = oC.x.z;
-					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
-					component = oC.x.w;
-					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
-
-					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-
-					component = oC.x.x;
-					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
-					component = oC.x.y;
-					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
-				}
-				else // VK_FORMAT_R16_UINT
-				{
-					Float component = oC.x.z;
-					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
-					component = oC.x.w;
-					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
-
-					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-
-					component = oC.x.x;
-					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
-					component = oC.x.y;
-					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
-				}
-			}
-			break;
-		case VK_FORMAT_R8_SINT:
-		case VK_FORMAT_R8_UINT:
-			if(rgbaWriteMask & 0x00000001)
-			{
-				buffer = cBuffer + x;
-
-				UInt xyzw, packedCol;
-
-				xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
-				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-				xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
-
-				Short4 tmpCol = Short4(As<Int4>(oC.x));
-				if(targetFormat == VK_FORMAT_R8_SINT)
-				{
-					tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
-				}
-				else
-				{
-					tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
-				}
-				packedCol = Extract(As<Int2>(tmpCol), 0);
-
-				packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
-				            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
-
-				*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
 				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-				*Pointer<UShort>(buffer) = UShort(packedCol);
+
+				component = oC.x.x;
+				*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
+				component = oC.x.y;
+				*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
 			}
-			break;
-		case VK_FORMAT_R32G32_SFLOAT:
-		case VK_FORMAT_R32G32_SINT:
-		case VK_FORMAT_R32G32_UINT:
-			buffer = cBuffer + 8 * x;
+		}
+		break;
+	case VK_FORMAT_R8_SINT:
+	case VK_FORMAT_R8_UINT:
+		if(rgbaWriteMask & 0x00000001)
+		{
+			buffer = cBuffer + x;
 
-			value = *Pointer<Float4>(buffer);
+			UInt xyzw, packedCol;
 
-			if((rgbaWriteMask & 0x00000003) != 0x00000003)
+			xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
+
+			Short4 tmpCol = Short4(As<Int4>(oC.x));
+			if(targetFormat == VK_FORMAT_R8_SINT)
+			{
+				tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
+			}
+			else
+			{
+				tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
+			}
+			packedCol = Extract(As<Int2>(tmpCol), 0);
+
+			packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
+			            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
+
+			*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
+			buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			*Pointer<UShort>(buffer) = UShort(packedCol);
+		}
+		break;
+	case VK_FORMAT_R32G32_SFLOAT:
+	case VK_FORMAT_R32G32_SINT:
+	case VK_FORMAT_R32G32_UINT:
+		buffer = cBuffer + 8 * x;
+
+		value = *Pointer<Float4>(buffer);
+
+		if((rgbaWriteMask & 0x00000003) != 0x00000003)
+		{
+			Float4 masked = value;
+			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
+			masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
+			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
+		}
+
+		oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
+		value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
+		oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
+		*Pointer<Float4>(buffer) = oC.x;
+
+		buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+
+		value = *Pointer<Float4>(buffer);
+
+		if((rgbaWriteMask & 0x00000003) != 0x00000003)
+		{
+			Float4 masked;
+
+			masked = value;
+			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
+			masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
+			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
+		}
+
+		oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
+		value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
+		oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
+		*Pointer<Float4>(buffer) = oC.y;
+		break;
+	case VK_FORMAT_R16G16_SFLOAT:
+		if((rgbaWriteMask & 0x00000003) != 0x0)
+		{
+			buffer = cBuffer + 4 * x;
+
+			UInt2 rgbaMask;
+			UInt2 packedCol;
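+			// Pack two RG texels per 32-bit lane as half-floats: G in the high
+			// 16 bits, R in the low 16 bits.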
+			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.y))) << 16) | UInt(As<UShort>(Half(oC.x.x))), 0);
+			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.w))) << 16) | UInt(As<UShort>(Half(oC.x.z))), 1);
+
+			UShort4 value = *Pointer<UShort4>(buffer);
+			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
+			if((rgbaWriteMask & 0x3) != 0x3)
+			{
+				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
+				rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
+				mergedMask &= rgbaMask;
+			}
+			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
+
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.y))) << 16) | UInt(As<UShort>(Half(oC.y.x))), 0);
+			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.w))) << 16) | UInt(As<UShort>(Half(oC.y.z))), 1);
+			value = *Pointer<UShort4>(buffer);
+			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
+			if((rgbaWriteMask & 0x3) != 0x3)
+			{
+				mergedMask &= rgbaMask;
+			}
+			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
+		}
+		break;
+	case VK_FORMAT_R16G16_SINT:
+	case VK_FORMAT_R16G16_UINT:
+		if((rgbaWriteMask & 0x00000003) != 0x0)
+		{
+			buffer = cBuffer + 4 * x;
+
+			UInt2 rgbaMask;
+			UShort4 packedCol = UShort4(As<Int4>(oC.x));
+			UShort4 value = *Pointer<UShort4>(buffer);
+			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
+			if((rgbaWriteMask & 0x3) != 0x3)
+			{
+				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
+				rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
+				mergedMask &= rgbaMask;
+			}
+			*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
+
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+			packedCol = UShort4(As<Int4>(oC.y));
+			value = *Pointer<UShort4>(buffer);
+			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
+			if((rgbaWriteMask & 0x3) != 0x3)
+			{
+				mergedMask &= rgbaMask;
+			}
+			*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
+		}
+		break;
+	case VK_FORMAT_R8G8_SINT:
+	case VK_FORMAT_R8G8_UINT:
+		if((rgbaWriteMask & 0x00000003) != 0x0)
+		{
+			buffer = cBuffer + 2 * x;
+
+			Int2 xyzw, packedCol;
+
+			xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
+
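+			// PackSigned/PackUnsigned saturate each 16-bit value to 8 bits,
+			// producing the four interleaved RG pixel pairs.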
+			if(targetFormat == VK_FORMAT_R8G8_SINT)
+			{
+				packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
+			}
+			else
+			{
+				packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
+			}
+
+			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
+			if((rgbaWriteMask & 0x3) != 0x3)
+			{
+				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
+				UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
+				mergedMask &= rgbaMask;
+			}
+
+			packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
+
+			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
+			buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
+		}
+		break;
+	case VK_FORMAT_R32G32B32A32_SFLOAT:
+	case VK_FORMAT_R32G32B32A32_SINT:
+	case VK_FORMAT_R32G32B32A32_UINT:
+		buffer = cBuffer + 16 * x;
+
+		{
+			value = *Pointer<Float4>(buffer, 16);
+
+			if(rgbaWriteMask != 0x0000000F)
 			{
 				Float4 masked = value;
-				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
-				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
+				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
 			}
 
-			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
-			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
+			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
+			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
 			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
-			*Pointer<Float4>(buffer) = oC.x;
+			*Pointer<Float4>(buffer, 16) = oC.x;
+		}
 
-			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+		{
+			value = *Pointer<Float4>(buffer + 16, 16);
 
-			value = *Pointer<Float4>(buffer);
-
-			if((rgbaWriteMask & 0x00000003) != 0x00000003)
+			if(rgbaWriteMask != 0x0000000F)
 			{
-				Float4 masked;
-
-				masked = value;
-				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
-				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
+				Float4 masked = value;
+				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
 				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
 			}
 
-			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
-			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
+			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
+			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
 			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
-			*Pointer<Float4>(buffer) = oC.y;
-			break;
-		case VK_FORMAT_R16G16_SFLOAT:
-			if((rgbaWriteMask & 0x00000003) != 0x0)
-			{
-				buffer = cBuffer + 4 * x;
-
-				UInt2 rgbaMask;
-				UInt2 packedCol;
-				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.y))) << 16) | UInt(As<UShort>(Half(oC.x.x))), 0);
-				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.w))) << 16) | UInt(As<UShort>(Half(oC.x.z))), 1);
-
-				UShort4 value = *Pointer<UShort4>(buffer);
-				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
-				if((rgbaWriteMask & 0x3) != 0x3)
-				{
-					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
-					rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
-					mergedMask &= rgbaMask;
-				}
-				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
-
-				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-
-				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.y))) << 16) | UInt(As<UShort>(Half(oC.y.x))), 0);
-				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.w))) << 16) | UInt(As<UShort>(Half(oC.y.z))), 1);
-				value = *Pointer<UShort4>(buffer);
-				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
-				if((rgbaWriteMask & 0x3) != 0x3)
-				{
-					mergedMask &= rgbaMask;
-				}
-				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
-			}
-			break;
-		case VK_FORMAT_R16G16_SINT:
-		case VK_FORMAT_R16G16_UINT:
-			if((rgbaWriteMask & 0x00000003) != 0x0)
-			{
-				buffer = cBuffer + 4 * x;
-
-				UInt2 rgbaMask;
-				UShort4 packedCol = UShort4(As<Int4>(oC.x));
-				UShort4 value = *Pointer<UShort4>(buffer);
-				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
-				if((rgbaWriteMask & 0x3) != 0x3)
-				{
-					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
-					rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
-					mergedMask &= rgbaMask;
-				}
-				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
-
-				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-
-				packedCol = UShort4(As<Int4>(oC.y));
-				value = *Pointer<UShort4>(buffer);
-				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
-				if((rgbaWriteMask & 0x3) != 0x3)
-				{
-					mergedMask &= rgbaMask;
-				}
-				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
-			}
-			break;
-		case VK_FORMAT_R8G8_SINT:
-		case VK_FORMAT_R8G8_UINT:
-			if((rgbaWriteMask & 0x00000003) != 0x0)
-			{
-				buffer = cBuffer + 2 * x;
-
-				Int2 xyzw, packedCol;
-
-				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
-				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
-
-				if(targetFormat == VK_FORMAT_R8G8_SINT)
-				{
-					packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
-				}
-				else
-				{
-					packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
-				}
-
-				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
-				if((rgbaWriteMask & 0x3) != 0x3)
-				{
-					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
-					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
-					mergedMask &= rgbaMask;
-				}
-
-				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
-
-				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
-				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
-			}
-			break;
-		case VK_FORMAT_R32G32B32A32_SFLOAT:
-		case VK_FORMAT_R32G32B32A32_SINT:
-		case VK_FORMAT_R32G32B32A32_UINT:
-			buffer = cBuffer + 16 * x;
-
-			{
-				value = *Pointer<Float4>(buffer, 16);
-
-				if(rgbaWriteMask != 0x0000000F)
-				{
-					Float4 masked = value;
-					oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
-					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
-					oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
-				}
-
-				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
-				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
-				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
-				*Pointer<Float4>(buffer, 16) = oC.x;
-			}
-
-			{
-				value = *Pointer<Float4>(buffer + 16, 16);
-
-				if(rgbaWriteMask != 0x0000000F)
-				{
-					Float4 masked = value;
-					oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
-					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
-					oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
-				}
-
-				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
-				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
-				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
-				*Pointer<Float4>(buffer + 16, 16) = oC.y;
-			}
-
-			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
-
-			{
-				value = *Pointer<Float4>(buffer, 16);
-
-				if(rgbaWriteMask != 0x0000000F)
-				{
-					Float4 masked = value;
-					oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
-					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
-					oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
-				}
-
-				oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
-				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
-				oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
-				*Pointer<Float4>(buffer, 16) = oC.z;
-			}
-
-			{
-				value = *Pointer<Float4>(buffer + 16, 16);
-
-				if(rgbaWriteMask != 0x0000000F)
-				{
-					Float4 masked = value;
-					oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
-					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
-					oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
-				}
-
-				oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
-				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
-				oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
-				*Pointer<Float4>(buffer + 16, 16) = oC.w;
-			}
-			break;
-		case VK_FORMAT_R16G16B16A16_SFLOAT:
-			if((rgbaWriteMask & 0x0000000F) != 0x0)
-			{
-				buffer = cBuffer + 8 * x;
-
-				UInt4 rgbaMask;
-				UInt4 value = *Pointer<UInt4>(buffer);
-				UInt4 packedCol;
-				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.y))) << 16) | UInt(As<UShort>(Half(oC.x.x))), 0);
-				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.w))) << 16) | UInt(As<UShort>(Half(oC.x.z))), 1);
-				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.y))) << 16) | UInt(As<UShort>(Half(oC.y.x))), 2);
-				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.w))) << 16) | UInt(As<UShort>(Half(oC.y.z))), 3);
-				UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
-				if((rgbaWriteMask & 0xF) != 0xF)
-				{
-					UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
-					rgbaMask = UInt4(tmpMask, tmpMask);
-					mergedMask &= rgbaMask;
-				}
-				*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
-
-				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-
-				value = *Pointer<UInt4>(buffer);
-				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.z.y))) << 16) | UInt(As<UShort>(Half(oC.z.x))), 0);
-				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.z.w))) << 16) | UInt(As<UShort>(Half(oC.z.z))), 1);
-				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.w.y))) << 16) | UInt(As<UShort>(Half(oC.w.x))), 2);
-				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.w.w))) << 16) | UInt(As<UShort>(Half(oC.w.z))), 3);
-				mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
-				if((rgbaWriteMask & 0xF) != 0xF)
-				{
-					mergedMask &= rgbaMask;
-				}
-				*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
-			}
-			break;
-		case VK_FORMAT_R16G16B16A16_SINT:
-		case VK_FORMAT_R16G16B16A16_UINT:
-			if((rgbaWriteMask & 0x0000000F) != 0x0)
-			{
-				buffer = cBuffer + 8 * x;
-
-				UInt4 rgbaMask;
-				UShort8 value = *Pointer<UShort8>(buffer);
-				UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
-				UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
-				if((rgbaWriteMask & 0xF) != 0xF)
-				{
-					UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
-					rgbaMask = UInt4(tmpMask, tmpMask);
-					mergedMask &= rgbaMask;
-				}
-				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
-
-				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-
-				value = *Pointer<UShort8>(buffer);
-				packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
-				mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
-				if((rgbaWriteMask & 0xF) != 0xF)
-				{
-					mergedMask &= rgbaMask;
-				}
-				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
-			}
-			break;
-		case VK_FORMAT_R8G8B8A8_SINT:
-		case VK_FORMAT_R8G8B8A8_UINT:
-		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
-			if((rgbaWriteMask & 0x0000000F) != 0x0)
-			{
-				UInt2 value, packedCol, mergedMask;
-
-				buffer = cBuffer + 4 * x;
-
-				bool isSigned = targetFormat == VK_FORMAT_R8G8B8A8_SINT || targetFormat == VK_FORMAT_A8B8G8R8_SINT_PACK32;
-
-				if(isSigned)
-				{
-					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
-				}
-				else
-				{
-					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
-				}
-				value = *Pointer<UInt2>(buffer, 16);
-				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
-				if(rgbaWriteMask != 0xF)
-				{
-					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
-				}
-				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
-
-				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-
-				if(isSigned)
-				{
-					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
-				}
-				else
-				{
-					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
-				}
-				value = *Pointer<UInt2>(buffer, 16);
-				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
-				if(rgbaWriteMask != 0xF)
-				{
-					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
-				}
-				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
-			}
-			break;
-		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
-			if ((rgbaWriteMask & 0x0000000F) != 0x0)
-			{
-				Int2 mergedMask, packedCol, value;
-				Int4 packed = ((As<Int4>(oC.w) & Int4(0x3)) << 30) |
-						((As<Int4>(oC.z) & Int4(0x3ff)) << 20) |
-						((As<Int4>(oC.y) & Int4(0x3ff)) << 10) |
-						((As<Int4>(oC.x) & Int4(0x3ff)));
-
-				buffer = cBuffer + 4 * x;
-				value = *Pointer<Int2>(buffer, 16);
-				mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
-				if (rgbaWriteMask != 0xF)
-				{
-					mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
-				}
-				*Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
-
-				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
-
-				value = *Pointer<Int2>(buffer, 16);
-				mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
-				if (rgbaWriteMask != 0xF)
-				{
-					mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
-				}
-				*Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
-			}
-			break;
-		default:
-			UNIMPLEMENTED("VkFormat: %d", int(targetFormat));
+			*Pointer<Float4>(buffer + 16, 16) = oC.y;
 		}
-	}
 
-	UShort4 PixelRoutine::convertFixed16(const Float4 &cf, bool saturate)
-	{
-		return UShort4(cf * Float4(0xFFFF), saturate);
-	}
+		buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
 
-	void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
-	{
-		Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
+		{
+			value = *Pointer<Float4>(buffer, 16);
 
-		c.x = As<UShort4>(c.x) >> 4;
-		c.y = As<UShort4>(c.y) >> 4;
-		c.z = As<UShort4>(c.z) >> 4;
+			if(rgbaWriteMask != 0x0000000F)
+			{
+				Float4 masked = value;
+				oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
+				oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
+			}
 
-		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
-		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
-		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
-		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
+			oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
+			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
+			oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
+			*Pointer<Float4>(buffer, 16) = oC.z;
+		}
 
-		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
-		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
-		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
-		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
+		{
+			value = *Pointer<Float4>(buffer + 16, 16);
 
-		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
-		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
-		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
-		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
-	}
+			if(rgbaWriteMask != 0x0000000F)
+			{
+				Float4 masked = value;
+				oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
+				oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
+			}
 
-	void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
-	{
-		c.x = As<UShort4>(c.x) >> 4;
-		c.y = As<UShort4>(c.y) >> 4;
-		c.z = As<UShort4>(c.z) >> 4;
+			oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
+			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
+			oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
+			*Pointer<Float4>(buffer + 16, 16) = oC.w;
+		}
+		break;
+	case VK_FORMAT_R16G16B16A16_SFLOAT:
+		if((rgbaWriteMask & 0x0000000F) != 0x0)
+		{
+			buffer = cBuffer + 8 * x;
 
-		linearToSRGB12_16(c);
-	}
+			UInt4 rgbaMask;
+			UInt4 value = *Pointer<UInt4>(buffer);
+			UInt4 packedCol;
+			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.y))) << 16) | UInt(As<UShort>(Half(oC.x.x))), 0);
+			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.w))) << 16) | UInt(As<UShort>(Half(oC.x.z))), 1);
+			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.y))) << 16) | UInt(As<UShort>(Half(oC.y.x))), 2);
+			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.w))) << 16) | UInt(As<UShort>(Half(oC.y.z))), 3);
+			UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
+			if((rgbaWriteMask & 0xF) != 0xF)
+			{
+				UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
+				rgbaMask = UInt4(tmpMask, tmpMask);
+				mergedMask &= rgbaMask;
+			}
+			*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
 
-	void PixelRoutine::linearToSRGB12_16(Vector4s &c)
-	{
-		Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
 
-		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
-		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
-		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
-		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
+			value = *Pointer<UInt4>(buffer);
+			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.z.y))) << 16) | UInt(As<UShort>(Half(oC.z.x))), 0);
+			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.z.w))) << 16) | UInt(As<UShort>(Half(oC.z.z))), 1);
+			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.w.y))) << 16) | UInt(As<UShort>(Half(oC.w.x))), 2);
+			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.w.w))) << 16) | UInt(As<UShort>(Half(oC.w.z))), 3);
+			mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
+			if((rgbaWriteMask & 0xF) != 0xF)
+			{
+				mergedMask &= rgbaMask;
+			}
+			*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
+		}
+		break;
+	case VK_FORMAT_R16G16B16A16_SINT:
+	case VK_FORMAT_R16G16B16A16_UINT:
+		if((rgbaWriteMask & 0x0000000F) != 0x0)
+		{
+			buffer = cBuffer + 8 * x;
 
-		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
-		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
-		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
-		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
+			UInt4 rgbaMask;
+			UShort8 value = *Pointer<UShort8>(buffer);
+			UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
+			UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
+			if((rgbaWriteMask & 0xF) != 0xF)
+			{
+				UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
+				rgbaMask = UInt4(tmpMask, tmpMask);
+				mergedMask &= rgbaMask;
+			}
+			*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
 
-		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
-		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
-		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
-		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
-	}
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
 
-	Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)   // Approximates x^2.2
-	{
-		Float4 linear = x * x;
-		linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
+			value = *Pointer<UShort8>(buffer);
+			packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
+			mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
+			if((rgbaWriteMask & 0xF) != 0xF)
+			{
+				mergedMask &= rgbaMask;
+			}
+			*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
+		}
+		break;
+	case VK_FORMAT_R8G8B8A8_SINT:
+	case VK_FORMAT_R8G8B8A8_UINT:
+	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
+		if((rgbaWriteMask & 0x0000000F) != 0x0)
+		{
+			UInt2 value, packedCol, mergedMask;
 
-		return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
+			buffer = cBuffer + 4 * x;
+
+			bool isSigned = targetFormat == VK_FORMAT_R8G8B8A8_SINT || targetFormat == VK_FORMAT_A8B8G8R8_SINT_PACK32;
+
+			if(isSigned)
+			{
+				packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
+			}
+			else
+			{
+				packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
+			}
+			value = *Pointer<UInt2>(buffer, 16);
+			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
+			if(rgbaWriteMask != 0xF)
+			{
+				mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
+			}
+			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
+
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+			if(isSigned)
+			{
+				packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
+			}
+			else
+			{
+				packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
+			}
+			value = *Pointer<UInt2>(buffer, 16);
+			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
+			if(rgbaWriteMask != 0xF)
+			{
+				mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
+			}
+			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
+		}
+		break;
+	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
+		if ((rgbaWriteMask & 0x0000000F) != 0x0)
+		{
+			Int2 mergedMask, packedCol, value;
+			Int4 packed = ((As<Int4>(oC.w) & Int4(0x3)) << 30) |
+					((As<Int4>(oC.z) & Int4(0x3ff)) << 20) |
+					((As<Int4>(oC.y) & Int4(0x3ff)) << 10) |
+					((As<Int4>(oC.x) & Int4(0x3ff)));
+
+			buffer = cBuffer + 4 * x;
+			value = *Pointer<Int2>(buffer, 16);
+			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
+			if (rgbaWriteMask != 0xF)
+			{
+				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
+			}
+			*Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
+
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+			value = *Pointer<Int2>(buffer, 16);
+			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
+			if (rgbaWriteMask != 0xF)
+			{
+				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
+			}
+			*Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
+		}
+		break;
+	default:
+		UNIMPLEMENTED("VkFormat: %d", int(targetFormat));
 	}
 }
+
+UShort4 PixelRoutine::convertFixed16(const Float4 &cf, bool saturate)
+{
+	return UShort4(cf * Float4(0xFFFF), saturate);
+}
+
+void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
+{
+	Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
+
+	c.x = As<UShort4>(c.x) >> 4;
+	c.y = As<UShort4>(c.y) >> 4;
+	c.z = As<UShort4>(c.z) >> 4;
+
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
+
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
+
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
+}
+
+void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
+{
+	c.x = As<UShort4>(c.x) >> 4;
+	c.y = As<UShort4>(c.y) >> 4;
+	c.z = As<UShort4>(c.z) >> 4;
+
+	linearToSRGB12_16(c);
+}
+
+void PixelRoutine::linearToSRGB12_16(Vector4s &c)
+{
+	Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
+
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
+
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
+
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
+}
+
+Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)   // Approximates x^2.2
+{
+	Float4 linear = x * x;
+	linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
+
+	return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
+}
+
+}  // namespace sw
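
For reference, the arithmetic behind PixelRoutine::sRGBtoLinear above reduces to the polynomial 0.73*x^2 + 0.27*x^3, clamped to [0, 1], as an approximation of x^2.2. A standalone scalar sketch follows (illustration only, not part of this change; sRGBtoLinearApprox and the sample points are made up for the example):

#include <algorithm>
#include <cmath>
#include <cstdio>

// Approximates x^2.2 with 0.73*x^2 + 0.27*x^3, clamped to [0, 1],
// mirroring the vectorized Reactor code in PixelRoutine::sRGBtoLinear.
float sRGBtoLinearApprox(float x)
{
	float linear = 0.73f * x * x + 0.27f * x * x * x;
	return std::min(std::max(linear, 0.0f), 1.0f);
}

int main()
{
	// Compare the approximation against the exact power at a few points.
	for(float x : { 0.25f, 0.5f, 0.75f })
	{
		std::printf("x = %.2f  approx = %.4f  exact = %.4f\n",
		            x, sRGBtoLinearApprox(x), std::pow(x, 2.2f));
	}

	return 0;
}

At x = 0.5 this gives roughly 0.2163 versus the exact 0.2176, illustrating how close the cubic stays to the true curve.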
diff --git a/src/Pipeline/PixelRoutine.hpp b/src/Pipeline/PixelRoutine.hpp
index f638bd6..678d780 100644
--- a/src/Pipeline/PixelRoutine.hpp
+++ b/src/Pipeline/PixelRoutine.hpp
@@ -17,79 +17,80 @@
 
 #include "Device/QuadRasterizer.hpp"
 
-namespace sw
+namespace sw {
+
+class PixelShader;
+class SamplerCore;
+
+class PixelRoutine : public sw::QuadRasterizer
 {
-	class PixelShader;
-	class SamplerCore;
+public:
+	PixelRoutine(const PixelProcessor::State &state,
+		vk::PipelineLayout const *pipelineLayout,
+		SpirvShader const *spirvShader,
+		const vk::DescriptorSet::Bindings &descriptorSets);
 
-	class PixelRoutine : public sw::QuadRasterizer
-	{
-	public:
-		PixelRoutine(const PixelProcessor::State &state,
-			vk::PipelineLayout const *pipelineLayout,
-			SpirvShader const *spirvShader,
-			const vk::DescriptorSet::Bindings &descriptorSets);
+	virtual ~PixelRoutine();
 
-		virtual ~PixelRoutine();
+protected:
+	Float4 z[4]; // Multisampled z
+	Float4 w;    // Used as is
+	Float4 rhw;  // Reciprocal w
 
-	protected:
-		Float4 z[4]; // Multisampled z
-		Float4 w;    // Used as is
-		Float4 rhw;  // Reciprocal w
+	SpirvRoutine routine;
+	const vk::DescriptorSet::Bindings &descriptorSets;
 
-		SpirvRoutine routine;
-		const vk::DescriptorSet::Bindings &descriptorSets;
+	// Depth output
+	Float4 oDepth;
 
-		// Depth output
-		Float4 oDepth;
+	virtual void setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w, Int cMask[4]) = 0;
+	virtual void applyShader(Int cMask[4], Int sMask[4], Int zMask[4]) = 0;
+	virtual Bool alphaTest(Int cMask[4]) = 0;
+	virtual void rasterOperation(Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4]) = 0;
 
-		virtual void setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w, Int cMask[4]) = 0;
-		virtual void applyShader(Int cMask[4], Int sMask[4], Int zMask[4]) = 0;
-		virtual Bool alphaTest(Int cMask[4]) = 0;
-		virtual void rasterOperation(Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4]) = 0;
+	void quad(Pointer<Byte> cBuffer[4], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y) override;
 
-		void quad(Pointer<Byte> cBuffer[4], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y) override;
+	void alphaTest(Int &aMask, const Short4 &alpha);
+	void alphaToCoverage(Int cMask[4], const Float4 &alpha);
 
-		void alphaTest(Int &aMask, const Short4 &alpha);
-		void alphaToCoverage(Int cMask[4], const Float4 &alpha);
+	// Raster operations
+	void alphaBlend(int index, const Pointer<Byte> &cBuffer, Vector4s &current, const Int &x);
+	void writeColor(int index, const Pointer<Byte> &cBuffer, const Int& x, Vector4f& oC, const Int& sMask, const Int& zMask, const Int& cMask);
+	void alphaBlend(int index, const Pointer<Byte> &cBuffer, Vector4f &oC, const Int &x);
+	void writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &current, const Int &sMask, const Int &zMask, const Int &cMask);
 
-		// Raster operations
-		void alphaBlend(int index, const Pointer<Byte> &cBuffer, Vector4s &current, const Int &x);
-		void writeColor(int index, const Pointer<Byte> &cBuffer, const Int& x, Vector4f& oC, const Int& sMask, const Int& zMask, const Int& cMask);
-		void alphaBlend(int index, const Pointer<Byte> &cBuffer, Vector4f &oC, const Int &x);
-		void writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &current, const Int &sMask, const Int &zMask, const Int &cMask);
+	bool isSRGB(int index) const;
+	UShort4 convertFixed16(const Float4 &cf, bool saturate = true);
+	void linearToSRGB12_16(Vector4s &c);
 
-		bool isSRGB(int index) const;
-		UShort4 convertFixed16(const Float4 &cf, bool saturate = true);
-		void linearToSRGB12_16(Vector4s &c);
+private:
+	Float4 interpolateCentroid(const Float4 &x, const Float4 &y, const Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective);
+	void stencilTest(const Pointer<Byte> &sBuffer, int q, const Int &x, Int &sMask, const Int &cMask);
+	void stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack);
+	void stencilOperation(Byte8 &newValue, const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask);
+	void stencilOperation(Byte8 &output, const Byte8 &bufferValue, VkStencilOp operation, bool isBack);
+	Bool depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask);
 
-	private:
-		Float4 interpolateCentroid(const Float4 &x, const Float4 &y, const Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective);
-		void stencilTest(const Pointer<Byte> &sBuffer, int q, const Int &x, Int &sMask, const Int &cMask);
-		void stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack);
-		void stencilOperation(Byte8 &newValue, const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask);
-		void stencilOperation(Byte8 &output, const Byte8 &bufferValue, VkStencilOp operation, bool isBack);
-		Bool depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask);
+	// Raster operations
+	void blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, VkBlendFactor blendFactorActive);
+	void blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, VkBlendFactor blendFactorAlphaActive);
+	void readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel);
+	void blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorActive);
+	void blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorAlphaActive);
+	void writeStencil(Pointer<Byte> &sBuffer, int q, const Int &x, const Int &sMask, const Int &zMask, const Int &cMask);
+	void writeDepth(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask);
 
-		// Raster operations
-		void blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, VkBlendFactor blendFactorActive);
-		void blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, VkBlendFactor blendFactorAlphaActive);
-		void readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel);
-		void blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorActive);
-		void blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorAlphaActive);
-		void writeStencil(Pointer<Byte> &sBuffer, int q, const Int &x, const Int &sMask, const Int &zMask, const Int &cMask);
-		void writeDepth(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask);
+	void sRGBtoLinear16_12_16(Vector4s &c);
+	void linearToSRGB16_12_16(Vector4s &c);
+	Float4 sRGBtoLinear(const Float4 &x);
 
-		void sRGBtoLinear16_12_16(Vector4s &c);
-		void linearToSRGB16_12_16(Vector4s &c);
-		Float4 sRGBtoLinear(const Float4 &x);
+	Bool depthTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask);
+	Bool depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask);
 
-		Bool depthTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask);
-		Bool depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask);
+	void writeDepth32F(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask);
+	void writeDepth16(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask);
+};
 
-		void writeDepth32F(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask);
-		void writeDepth16(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask);
-	};
-}
+}  // namespace sw
 
 #endif   // sw_PixelRoutine_hpp
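
The writeColor cases in PixelRoutine.cpp above all funnel through the same branchless read-modify-write idiom: mergedMask combines the per-pixel coverage mask with the per-component rgbaWriteMask, and the store keeps new bits where the mask is set while preserving framebuffer bits elsewhere. A minimal scalar sketch of that select, assuming a hypothetical maskedWrite helper (not SwiftShader API):

#include <cstdint>
#include <cstdio>

// Bitwise select: take bits of 'src' where 'mask' is set, keep 'dst' elsewhere.
// This mirrors the (packedCol & mergedMask) | (value & ~mergedMask) stores above.
uint32_t maskedWrite(uint32_t dst, uint32_t src, uint32_t mask)
{
	return (src & mask) | (dst & ~mask);
}

int main()
{
	uint32_t framebuffer = 0xAABBCCDDu;  // existing pixel data
	uint32_t shaded      = 0x11223344u;  // new packed color
	uint32_t mask        = 0x00FF00FFu;  // e.g. two of the four byte lanes enabled

	std::printf("%08X\n", (unsigned)maskedWrite(framebuffer, shaded, mask));  // AA22CC44
	return 0;
}

Doing the merge with AND/OR instead of a per-component branch lets one generated code path handle any combination of write mask and coverage.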
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index 9c2b06b..0178b84 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -21,222 +21,169 @@
 
 #include <limits>
 
-namespace
+namespace {
+
+void applySwizzle(VkComponentSwizzle swizzle, sw::Float4& f, const sw::Vector4f& c, bool integer)
 {
-	void applySwizzle(VkComponentSwizzle swizzle, sw::Float4& f, const sw::Vector4f& c, bool integer)
+	switch(swizzle)
 	{
-		switch(swizzle)
+	case VK_COMPONENT_SWIZZLE_R:    f = c.x; break;
+	case VK_COMPONENT_SWIZZLE_G:    f = c.y; break;
+	case VK_COMPONENT_SWIZZLE_B:    f = c.z; break;
+	case VK_COMPONENT_SWIZZLE_A:    f = c.w; break;
+	case VK_COMPONENT_SWIZZLE_ZERO: f = sw::Float4(0.0f, 0.0f, 0.0f, 0.0f); break;
+	case VK_COMPONENT_SWIZZLE_ONE:
+		if (integer)
 		{
-		case VK_COMPONENT_SWIZZLE_R:    f = c.x; break;
-		case VK_COMPONENT_SWIZZLE_G:    f = c.y; break;
-		case VK_COMPONENT_SWIZZLE_B:    f = c.z; break;
-		case VK_COMPONENT_SWIZZLE_A:    f = c.w; break;
-		case VK_COMPONENT_SWIZZLE_ZERO: f = sw::Float4(0.0f, 0.0f, 0.0f, 0.0f); break;
-		case VK_COMPONENT_SWIZZLE_ONE:
-			if (integer)
-			{
-				f = rr::As<sw::Float4>(sw::Int4(1, 1, 1, 1));
-			}
-			else
-			{
-				f = sw::Float4(1.0f, 1.0f, 1.0f, 1.0f);
-			}
-			break;
-		default: ASSERT(false);
+			f = rr::As<sw::Float4>(sw::Int4(1, 1, 1, 1));
 		}
+		else
+		{
+			f = sw::Float4(1.0f, 1.0f, 1.0f, 1.0f);
+		}
+		break;
+	default: ASSERT(false);
 	}
 }
 
-namespace sw
+}  // anonymous namespace
+
+namespace sw {
+
+SamplerCore::SamplerCore(Pointer<Byte> &constants, const Sampler &state) : constants(constants), state(state)
 {
-	SamplerCore::SamplerCore(Pointer<Byte> &constants, const Sampler &state) : constants(constants), state(state)
+}
+
+Vector4f SamplerCore::sampleTexture(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float4 uvw[4], Float4 &q, Float &&lodOrBias, Float4 &dsx, Float4 &dsy, Vector4f &offset, Int4& sampleId, SamplerFunction function)
+{
+	Vector4f c;
+
+	Float4 uuuu = uvw[0];
+	Float4 vvvv = uvw[1];
+	Float4 wwww = uvw[2];
+	Float4 cubeArrayCoord = uvw[3];
+	Float4 qqqq = q;
+
+	Float lod;
+	Float anisotropy;
+	Float4 uDelta;
+	Float4 vDelta;
+	Float4 M;  // Major axis
+
+	if(isCube())
 	{
+		Int4 face = cubeFace(uuuu, vvvv, uvw[0], uvw[1], uvw[2], M);
+		wwww = As<Float4>(face);
 	}
 
-	Vector4f SamplerCore::sampleTexture(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float4 uvw[4], Float4 &q, Float &&lodOrBias, Float4 &dsx, Float4 &dsy, Vector4f &offset, Int4& sampleId, SamplerFunction function)
+	if(function == Implicit || function == Bias || function == Grad || function == Query)
 	{
-		Vector4f c;
-
-		Float4 uuuu = uvw[0];
-		Float4 vvvv = uvw[1];
-		Float4 wwww = uvw[2];
-		Float4 cubeArrayCoord = uvw[3];
-		Float4 qqqq = q;
-
-		Float lod;
-		Float anisotropy;
-		Float4 uDelta;
-		Float4 vDelta;
-		Float4 M;  // Major axis
-
-		if(isCube())
+		if(state.textureType != VK_IMAGE_VIEW_TYPE_3D)
 		{
-			Int4 face = cubeFace(uuuu, vvvv, uvw[0], uvw[1], uvw[2], M);
-			wwww = As<Float4>(face);
-		}
-
-		if(function == Implicit || function == Bias || function == Grad || function == Query)
-		{
-			if(state.textureType != VK_IMAGE_VIEW_TYPE_3D)
+			if(!isCube())
 			{
-				if(!isCube())
-				{
-					computeLod(texture, sampler, lod, anisotropy, uDelta, vDelta, uuuu, vvvv, dsx, dsy, function);
-				}
-				else
-				{
-					computeLodCube(texture, sampler, lod, uvw[0], uvw[1], uvw[2], dsx, dsy, M, function);
-				}
+				computeLod(texture, sampler, lod, anisotropy, uDelta, vDelta, uuuu, vvvv, dsx, dsy, function);
 			}
 			else
 			{
-				computeLod3D(texture, sampler, lod, uuuu, vvvv, wwww, dsx, dsy, function);
+				computeLodCube(texture, sampler, lod, uvw[0], uvw[1], uvw[2], dsx, dsy, M, function);
 			}
+		}
+		else
+		{
+			computeLod3D(texture, sampler, lod, uuuu, vvvv, wwww, dsx, dsy, function);
+		}
 
-			Float bias = *Pointer<Float>(sampler + OFFSET(vk::Sampler, mipLodBias));
+		Float bias = *Pointer<Float>(sampler + OFFSET(vk::Sampler, mipLodBias));
 
-			if(function == Bias)
+		if(function == Bias)
+		{
+			// Add SPIR-V Bias operand to the sampler provided bias and clamp to maxSamplerLodBias limit.
+			bias = Min(Max(bias + lodOrBias, -vk::MAX_SAMPLER_LOD_BIAS), vk::MAX_SAMPLER_LOD_BIAS);
+		}
+
+		lod += bias;
+	}
+	else if(function == Lod)
+	{
+		// Vulkan 1.1: "The absolute value of mipLodBias must be less than or equal to VkPhysicalDeviceLimits::maxSamplerLodBias"
+		// Hence no explicit clamping to maxSamplerLodBias is required in this case.
+		lod = lodOrBias + *Pointer<Float>(sampler + OFFSET(vk::Sampler, mipLodBias));
+	}
+	else if(function == Fetch)
+	{
+		// TODO: Eliminate int-float-int conversion.
+		lod = Float(As<Int>(lodOrBias));
+	}
+	else if(function == Base || function == Gather)
+	{
+		lod = Float(0);
+	}
+	else UNREACHABLE("Sampler function %d", int(function));
+
+	if(function != Base && function != Fetch && function != Gather)
+	{
+		if(function == Query)
+		{
+			c.y = Float4(lod);  // Unclamped LOD.
+		}
+
+		lod = Max(lod, *Pointer<Float>(sampler + OFFSET(vk::Sampler, minLod)));
+		lod = Min(lod, *Pointer<Float>(sampler + OFFSET(vk::Sampler, maxLod)));
+
+		if(function == Query)
+		{
+			if(state.mipmapFilter == MIPMAP_POINT)
 			{
-				// Add SPIR-V Bias operand to the sampler provided bias and clamp to maxSamplerLodBias limit.
-				bias = Min(Max(bias + lodOrBias, -vk::MAX_SAMPLER_LOD_BIAS), vk::MAX_SAMPLER_LOD_BIAS);
+				lod = Round(lod);  // TODO: Preferred formula is ceil(lod + 0.5) - 1
 			}
 
-			lod += bias;
+			c.x = lod;
+		//	c.y contains unclamped LOD.
+
+			return c;
 		}
-		else if(function == Lod)
+	}
+
+	bool force32BitFiltering = state.highPrecisionFiltering && !isYcbcrFormat() && (state.textureFilter != FILTER_POINT);
+	bool seamlessCube = (state.addressingModeU == ADDRESSING_SEAMLESS);
+	bool use32BitFiltering = hasFloatTexture() || hasUnnormalizedIntegerTexture() || force32BitFiltering ||
+	                         seamlessCube || state.unnormalizedCoordinates || state.compareEnable || state.largeTexture ||
+	                         borderModeActive() || (function == Gather);
+
+	if(use32BitFiltering)
+	{
+		c = sampleFloatFilter(texture, uuuu, vvvv, wwww, qqqq, offset, cubeArrayCoord, sampleId, lod, anisotropy, uDelta, vDelta, function);
+
+		if (!hasFloatTexture() && !hasUnnormalizedIntegerTexture() && !state.compareEnable)
 		{
-			// Vulkan 1.1: "The absolute value of mipLodBias must be less than or equal to VkPhysicalDeviceLimits::maxSamplerLodBias"
-			// Hence no explicit clamping to maxSamplerLodBias is required in this case.
-			lod = lodOrBias + *Pointer<Float>(sampler + OFFSET(vk::Sampler, mipLodBias));
-		}
-		else if(function == Fetch)
-		{
-			// TODO: Eliminate int-float-int conversion.
-			lod = Float(As<Int>(lodOrBias));
-		}
-		else if(function == Base || function == Gather)
-		{
-			lod = Float(0);
-		}
-		else UNREACHABLE("Sampler function %d", int(function));
-
-		if(function != Base && function != Fetch && function != Gather)
-		{
-			if(function == Query)
-			{
-				c.y = Float4(lod);  // Unclamped LOD.
-			}
-
-			lod = Max(lod, *Pointer<Float>(sampler + OFFSET(vk::Sampler, minLod)));
-			lod = Min(lod, *Pointer<Float>(sampler + OFFSET(vk::Sampler, maxLod)));
-
-			if(function == Query)
-			{
-				if(state.mipmapFilter == MIPMAP_POINT)
-				{
-					lod = Round(lod);  // TODO: Preferred formula is ceil(lod + 0.5) - 1
-				}
-
-				c.x = lod;
-			//	c.y contains unclamped LOD.
-
-				return c;
-			}
-		}
-
-		bool force32BitFiltering = state.highPrecisionFiltering && !isYcbcrFormat() && (state.textureFilter != FILTER_POINT);
-		bool seamlessCube = (state.addressingModeU == ADDRESSING_SEAMLESS);
-		bool use32BitFiltering = hasFloatTexture() || hasUnnormalizedIntegerTexture() || force32BitFiltering ||
-		                         seamlessCube || state.unnormalizedCoordinates || state.compareEnable || state.largeTexture ||
-		                         borderModeActive() || (function == Gather);
-
-		if(use32BitFiltering)
-		{
-			c = sampleFloatFilter(texture, uuuu, vvvv, wwww, qqqq, offset, cubeArrayCoord, sampleId, lod, anisotropy, uDelta, vDelta, function);
-
-			if (!hasFloatTexture() && !hasUnnormalizedIntegerTexture() && !state.compareEnable)
-			{
-				switch (state.textureFormat)
-				{
-				case VK_FORMAT_R5G6B5_UNORM_PACK16:
-					c.x *= Float4(1.0f / 0xF800);
-					c.y *= Float4(1.0f / 0xFC00);
-					c.z *= Float4(1.0f / 0xF800);
-					break;
-				case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
-					c.x *= Float4(1.0f / 0xF000);
-					c.y *= Float4(1.0f / 0xF000);
-					c.z *= Float4(1.0f / 0xF000);
-					c.w *= Float4(1.0f / 0xF000);
-					break;
-				case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
-					c.x *= Float4(1.0f / 0xF800);
-					c.y *= Float4(1.0f / 0xF800);
-					c.z *= Float4(1.0f / 0xF800);
-					c.w *= Float4(1.0f / 0x8000);
-					break;
-				case VK_FORMAT_R8_SNORM:
-				case VK_FORMAT_R8G8_SNORM:
-				case VK_FORMAT_R8G8B8A8_SNORM:
-				case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
-					c.x *= Float4(1.0f / 0x7F00);
-					c.y *= Float4(1.0f / 0x7F00);
-					c.z *= Float4(1.0f / 0x7F00);
-					c.w *= Float4(1.0f / 0x7F00);
-					break;
-				case VK_FORMAT_R8_UNORM:
-				case VK_FORMAT_R8G8_UNORM:
-				case VK_FORMAT_R8G8B8A8_UNORM:
-				case VK_FORMAT_B8G8R8A8_UNORM:
-				case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
-				case VK_FORMAT_B8G8R8A8_SRGB:
-				case VK_FORMAT_R8G8B8A8_SRGB:
-				case VK_FORMAT_R8_SRGB:
-				case VK_FORMAT_R8G8_SRGB:
-					c.x *= Float4(1.0f / 0xFF00u);
-					c.y *= Float4(1.0f / 0xFF00u);
-					c.z *= Float4(1.0f / 0xFF00u);
-					c.w *= Float4(1.0f / 0xFF00u);
-					break;
-				default:
-					for (int component = 0; component < textureComponentCount(); component++)
-					{
-						c[component] *= Float4(hasUnsignedTextureComponent(component) ? 1.0f / 0xFFFF : 1.0f / 0x7FFF);
-					}
-				}
-			}
-		}
-		else  // 16-bit filtering.
-		{
-			Vector4s cs = sampleFilter(texture, uuuu, vvvv, wwww, offset, cubeArrayCoord, sampleId, lod, anisotropy, uDelta, vDelta, function);
-
 			switch (state.textureFormat)
 			{
 			case VK_FORMAT_R5G6B5_UNORM_PACK16:
-				c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF800);
-				c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xFC00);
-				c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF800);
+				c.x *= Float4(1.0f / 0xF800);
+				c.y *= Float4(1.0f / 0xFC00);
+				c.z *= Float4(1.0f / 0xF800);
 				break;
 			case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
-				c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF000);
-				c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xF000);
-				c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF000);
-				c.w = Float4(As<UShort4>(cs.w)) * Float4(1.0f / 0xF000);
+				c.x *= Float4(1.0f / 0xF000);
+				c.y *= Float4(1.0f / 0xF000);
+				c.z *= Float4(1.0f / 0xF000);
+				c.w *= Float4(1.0f / 0xF000);
 				break;
 			case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
-				c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF800);
-				c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xF800);
-				c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF800);
-				c.w = Float4(As<UShort4>(cs.w)) * Float4(1.0f / 0x8000);
+				c.x *= Float4(1.0f / 0xF800);
+				c.y *= Float4(1.0f / 0xF800);
+				c.z *= Float4(1.0f / 0xF800);
+				c.w *= Float4(1.0f / 0x8000);
 				break;
 			case VK_FORMAT_R8_SNORM:
 			case VK_FORMAT_R8G8_SNORM:
 			case VK_FORMAT_R8G8B8A8_SNORM:
 			case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
-				c.x = Float4(cs.x) * Float4(1.0f / 0x7F00);
-				c.y = Float4(cs.y) * Float4(1.0f / 0x7F00);
-				c.z = Float4(cs.z) * Float4(1.0f / 0x7F00);
-				c.w = Float4(cs.w) * Float4(1.0f / 0x7F00);
+				c.x *= Float4(1.0f / 0x7F00);
+				c.y *= Float4(1.0f / 0x7F00);
+				c.z *= Float4(1.0f / 0x7F00);
+				c.w *= Float4(1.0f / 0x7F00);
 				break;
 			case VK_FORMAT_R8_UNORM:
 			case VK_FORMAT_R8G8_UNORM:
@@ -247,1177 +194,1322 @@
 			case VK_FORMAT_R8G8B8A8_SRGB:
 			case VK_FORMAT_R8_SRGB:
 			case VK_FORMAT_R8G8_SRGB:
-				c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xFF00u);
-				c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xFF00u);
-				c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xFF00u);
-				c.w = Float4(As<UShort4>(cs.w)) * Float4(1.0f / 0xFF00u);
+				c.x *= Float4(1.0f / 0xFF00u);
+				c.y *= Float4(1.0f / 0xFF00u);
+				c.z *= Float4(1.0f / 0xFF00u);
+				c.w *= Float4(1.0f / 0xFF00u);
 				break;
 			default:
-				for(int component = 0; component < textureComponentCount(); component++)
+				for (int component = 0; component < textureComponentCount(); component++)
 				{
-					if(hasUnsignedTextureComponent(component))
-					{
-						convertUnsigned16(c[component], cs[component]);
-					}
-					else
-					{
-						convertSigned15(c[component], cs[component]);
-					}
+					c[component] *= Float4(hasUnsignedTextureComponent(component) ? 1.0f / 0xFFFF : 1.0f / 0x7FFF);
 				}
 			}
 		}
+	}
+	else  // 16-bit filtering.
+	{
+		Vector4s cs = sampleFilter(texture, uuuu, vvvv, wwww, offset, cubeArrayCoord, sampleId, lod, anisotropy, uDelta, vDelta, function);
 
-		if(state.textureFilter != FILTER_GATHER)
+		switch (state.textureFormat)
 		{
-			if((state.swizzle.r != VK_COMPONENT_SWIZZLE_R) ||
-			   (state.swizzle.g != VK_COMPONENT_SWIZZLE_G) ||
-			   (state.swizzle.b != VK_COMPONENT_SWIZZLE_B) ||
-			   (state.swizzle.a != VK_COMPONENT_SWIZZLE_A))
+		case VK_FORMAT_R5G6B5_UNORM_PACK16:
+			c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF800);
+			c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xFC00);
+			c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF800);
+			break;
+		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+			c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF000);
+			c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xF000);
+			c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF000);
+			c.w = Float4(As<UShort4>(cs.w)) * Float4(1.0f / 0xF000);
+			break;
+		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+			c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF800);
+			c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xF800);
+			c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF800);
+			c.w = Float4(As<UShort4>(cs.w)) * Float4(1.0f / 0x8000);
+			break;
+		case VK_FORMAT_R8_SNORM:
+		case VK_FORMAT_R8G8_SNORM:
+		case VK_FORMAT_R8G8B8A8_SNORM:
+		case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
+			c.x = Float4(cs.x) * Float4(1.0f / 0x7F00);
+			c.y = Float4(cs.y) * Float4(1.0f / 0x7F00);
+			c.z = Float4(cs.z) * Float4(1.0f / 0x7F00);
+			c.w = Float4(cs.w) * Float4(1.0f / 0x7F00);
+			break;
+		case VK_FORMAT_R8_UNORM:
+		case VK_FORMAT_R8G8_UNORM:
+		case VK_FORMAT_R8G8B8A8_UNORM:
+		case VK_FORMAT_B8G8R8A8_UNORM:
+		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+		case VK_FORMAT_B8G8R8A8_SRGB:
+		case VK_FORMAT_R8G8B8A8_SRGB:
+		case VK_FORMAT_R8_SRGB:
+		case VK_FORMAT_R8G8_SRGB:
+			c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xFF00u);
+			c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xFF00u);
+			c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xFF00u);
+			c.w = Float4(As<UShort4>(cs.w)) * Float4(1.0f / 0xFF00u);
+			break;
+		default:
+			for(int component = 0; component < textureComponentCount(); component++)
 			{
-				const Vector4f col(c);
-				bool integer = hasUnnormalizedIntegerTexture();
-				applySwizzle(state.swizzle.r, c.x, col, integer);
-				applySwizzle(state.swizzle.g, c.y, col, integer);
-				applySwizzle(state.swizzle.b, c.z, col, integer);
-				applySwizzle(state.swizzle.a, c.w, col, integer);
+				if(hasUnsignedTextureComponent(component))
+				{
+					convertUnsigned16(c[component], cs[component]);
+				}
+				else
+				{
+					convertSigned15(c[component], cs[component]);
+				}
+			}
+		}
+	}
+
+	if(state.textureFilter != FILTER_GATHER)
+	{
+		if((state.swizzle.r != VK_COMPONENT_SWIZZLE_R) ||
+		   (state.swizzle.g != VK_COMPONENT_SWIZZLE_G) ||
+		   (state.swizzle.b != VK_COMPONENT_SWIZZLE_B) ||
+		   (state.swizzle.a != VK_COMPONENT_SWIZZLE_A))
+		{
+			const Vector4f col(c);
+			bool integer = hasUnnormalizedIntegerTexture();
+			applySwizzle(state.swizzle.r, c.x, col, integer);
+			applySwizzle(state.swizzle.g, c.y, col, integer);
+			applySwizzle(state.swizzle.b, c.z, col, integer);
+			applySwizzle(state.swizzle.a, c.w, col, integer);
+		}
+	}
+	else  // Gather
+	{
+		VkComponentSwizzle swizzle = gatherSwizzle();
+
+		// R/G/B/A swizzles affect the component collected from each texel earlier.
+		// Handle the ZERO and ONE cases here because we don't need to know the format.
+
+		if(swizzle == VK_COMPONENT_SWIZZLE_ZERO)
+		{
+			c.x = c.y = c.z = c.w = Float4(0);
+		}
+		else if(swizzle == VK_COMPONENT_SWIZZLE_ONE)
+		{
+			bool integer = hasUnnormalizedIntegerTexture();
+			c.x = c.y = c.z = c.w = integer ? As<Float4>(Int4(1)) : RValue<Float4>(Float4(1.0f));
+		}
+	}
+
+	return c;
+}
+
+Short4 SamplerCore::offsetSample(Short4 &uvw, Pointer<Byte> &mipmap, int halfOffset, bool wrap, int count, Float &lod)
+{
+	Short4 offset = *Pointer<Short4>(mipmap + halfOffset);
+
+	if(state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
+	{
+		offset &= Short4(CmpNLE(Float4(lod), Float4(0.0f)));
+	}
+	else if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR)
+	{
+		offset &= Short4(CmpLE(Float4(lod), Float4(0.0f)));
+	}
+
+	if(wrap)
+	{
+		switch(count)
+		{
+		case -1: return uvw - offset;
+		case  0: return uvw;
+		case +1: return uvw + offset;
+		case  2: return uvw + offset + offset;
+		}
+	}
+	else   // Clamp or mirror
+	{
+		switch(count)
+		{
+		case -1: return SubSat(As<UShort4>(uvw), As<UShort4>(offset));
+		case  0: return uvw;
+		case +1: return AddSat(As<UShort4>(uvw), As<UShort4>(offset));
+		case  2: return AddSat(AddSat(As<UShort4>(uvw), As<UShort4>(offset)), As<UShort4>(offset));
+		}
+	}
+
+	return uvw;
+}
+
+Vector4s SamplerCore::sampleFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, SamplerFunction function)
+{
+	Vector4s c = sampleAniso(texture, u, v, w, offset, cubeArrayCoord, sampleId, lod, anisotropy, uDelta, vDelta, false, function);
+
+	if(function == Fetch)
+	{
+		return c;
+	}
+
+	if(state.mipmapFilter == MIPMAP_LINEAR)
+	{
+		Vector4s cc = sampleAniso(texture, u, v, w, offset, cubeArrayCoord, sampleId, lod, anisotropy, uDelta, vDelta, true, function);
+
+		lod *= Float(1 << 16);
+
+		UShort4 utri = UShort4(Float4(lod));   // FIXME: Optimize
+		Short4 stri = utri >> 1;   // FIXME: Optimize
+
+		if(hasUnsignedTextureComponent(0)) cc.x = MulHigh(As<UShort4>(cc.x), utri); else cc.x = MulHigh(cc.x, stri);
+		if(hasUnsignedTextureComponent(1)) cc.y = MulHigh(As<UShort4>(cc.y), utri); else cc.y = MulHigh(cc.y, stri);
+		if(hasUnsignedTextureComponent(2)) cc.z = MulHigh(As<UShort4>(cc.z), utri); else cc.z = MulHigh(cc.z, stri);
+		if(hasUnsignedTextureComponent(3)) cc.w = MulHigh(As<UShort4>(cc.w), utri); else cc.w = MulHigh(cc.w, stri);
+
+		utri = ~utri;
+		stri = Short4(0x7FFF) - stri;
+
+		if(hasUnsignedTextureComponent(0)) c.x = MulHigh(As<UShort4>(c.x), utri); else c.x = MulHigh(c.x, stri);
+		if(hasUnsignedTextureComponent(1)) c.y = MulHigh(As<UShort4>(c.y), utri); else c.y = MulHigh(c.y, stri);
+		if(hasUnsignedTextureComponent(2)) c.z = MulHigh(As<UShort4>(c.z), utri); else c.z = MulHigh(c.z, stri);
+		if(hasUnsignedTextureComponent(3)) c.w = MulHigh(As<UShort4>(c.w), utri); else c.w = MulHigh(c.w, stri);
+
+		c.x += cc.x;
+		c.y += cc.y;
+		c.z += cc.z;
+		c.w += cc.w;
+
+		if(!hasUnsignedTextureComponent(0)) c.x += c.x;
+		if(!hasUnsignedTextureComponent(1)) c.y += c.y;
+		if(!hasUnsignedTextureComponent(2)) c.z += c.z;
+		if(!hasUnsignedTextureComponent(3)) c.w += c.w;
+	}
+
+	return c;
+}
+
+Vector4s SamplerCore::sampleAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD, SamplerFunction function)
+{
+	Vector4s c;
+
+	if(state.textureFilter != FILTER_ANISOTROPIC || function == Lod || function == Fetch)
+	{
+		c = sampleQuad(texture, u, v, w, offset, cubeArrayCoord, sampleId, lod, secondLOD, function);
+	}
+	else
+	{
+		Int a = RoundInt(anisotropy);
+
+		Vector4s cSum;
+
+		cSum.x = Short4(0);
+		cSum.y = Short4(0);
+		cSum.z = Short4(0);
+		cSum.w = Short4(0);
+
+		Float4 A = *Pointer<Float4>(constants + OFFSET(Constants,uvWeight) + 16 * a);
+		Float4 B = *Pointer<Float4>(constants + OFFSET(Constants,uvStart) + 16 * a);
+		UShort4 cw = *Pointer<UShort4>(constants + OFFSET(Constants,cWeight) + 8 * a);
+		Short4 sw = Short4(cw >> 1);
+
+		Float4 du = uDelta;
+		Float4 dv = vDelta;
+
+		Float4 u0 = u + B * du;
+		Float4 v0 = v + B * dv;
+
+		du *= A;
+		dv *= A;
+
+		Int i = 0;
+
+		Do
+		{
+			c = sampleQuad(texture, u0, v0, w, offset, cubeArrayCoord, sampleId, lod, secondLOD, function);
+
+			u0 += du;
+			v0 += dv;
+
+			if(hasUnsignedTextureComponent(0)) cSum.x += As<Short4>(MulHigh(As<UShort4>(c.x), cw)); else cSum.x += MulHigh(c.x, sw);
+			if(hasUnsignedTextureComponent(1)) cSum.y += As<Short4>(MulHigh(As<UShort4>(c.y), cw)); else cSum.y += MulHigh(c.y, sw);
+			if(hasUnsignedTextureComponent(2)) cSum.z += As<Short4>(MulHigh(As<UShort4>(c.z), cw)); else cSum.z += MulHigh(c.z, sw);
+			if(hasUnsignedTextureComponent(3)) cSum.w += As<Short4>(MulHigh(As<UShort4>(c.w), cw)); else cSum.w += MulHigh(c.w, sw);
+
+			i++;
+		}
+		Until(i >= a)
+
+		if(hasUnsignedTextureComponent(0)) c.x = cSum.x; else c.x = AddSat(cSum.x, cSum.x);
+		if(hasUnsignedTextureComponent(1)) c.y = cSum.y; else c.y = AddSat(cSum.y, cSum.y);
+		if(hasUnsignedTextureComponent(2)) c.z = cSum.z; else c.z = AddSat(cSum.z, cSum.z);
+		if(hasUnsignedTextureComponent(3)) c.w = cSum.w; else c.w = AddSat(cSum.w, cSum.w);
+	}
+
+	return c;
+}
+
+Vector4s SamplerCore::sampleQuad(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function)
+{
+	if(state.textureType != VK_IMAGE_VIEW_TYPE_3D)
+	{
+		return sampleQuad2D(texture, u, v, w, offset, cubeArrayCoord, sampleId, lod, secondLOD, function);
+	}
+	else
+	{
+		return sample3D(texture, u, v, w, offset, cubeArrayCoord, sampleId, lod, secondLOD, function);
+	}
+}
+
+Vector4s SamplerCore::sampleQuad2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function)
+{
+	Vector4s c;
+
+	int componentCount = textureComponentCount();
+	bool gather = (state.textureFilter == FILTER_GATHER);
+
+	Pointer<Byte> mipmap;
+	Pointer<Byte> buffer;
+	selectMipmap(texture, mipmap, buffer, lod, secondLOD);
+
+	bool texelFetch = (function == Fetch);
+
+	Short4 uuuu = texelFetch ? Short4(As<Int4>(u)) : address(u, state.addressingModeU, mipmap);
+	Short4 vvvv = texelFetch ? Short4(As<Int4>(v)) : address(v, state.addressingModeV, mipmap);
+	Short4 wwww = texelFetch ? Short4(As<Int4>(w)) : address(w, state.addressingModeW, mipmap);
+
+	Short4 cubeArrayId(0);
+	if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
+	{
+		cubeArrayId = address(cubeArrayCoord, state.addressingModeY, mipmap);
+	}
+
+	if(state.textureFilter == FILTER_POINT || texelFetch)
+	{
+		c = sampleTexel(uuuu, vvvv, wwww, offset, mipmap, cubeArrayId, sampleId, buffer, function);
+	}
+	else
+	{
+		Short4 uuuu0 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, -1, lod);
+		Short4 vvvv0 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, -1, lod);
+		Short4 uuuu1 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, +1, lod);
+		Short4 vvvv1 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, +1, lod);
+
+		Vector4s c00 = sampleTexel(uuuu0, vvvv0, wwww, offset, mipmap, cubeArrayId, sampleId, buffer, function);
+		Vector4s c10 = sampleTexel(uuuu1, vvvv0, wwww, offset, mipmap, cubeArrayId, sampleId, buffer, function);
+		Vector4s c01 = sampleTexel(uuuu0, vvvv1, wwww, offset, mipmap, cubeArrayId, sampleId, buffer, function);
+		Vector4s c11 = sampleTexel(uuuu1, vvvv1, wwww, offset, mipmap, cubeArrayId, sampleId, buffer, function);
+
+		if(!gather)   // Blend
+		{
+			// Fractions
+			UShort4 f0u = As<UShort4>(uuuu0) * UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap,width)));
+			UShort4 f0v = As<UShort4>(vvvv0) * UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap,height)));
+
+			UShort4 f1u = ~f0u;
+			UShort4 f1v = ~f0v;
+
+			UShort4 f0u0v = MulHigh(f0u, f0v);
+			UShort4 f1u0v = MulHigh(f1u, f0v);
+			UShort4 f0u1v = MulHigh(f0u, f1v);
+			UShort4 f1u1v = MulHigh(f1u, f1v);
+
+			// Signed fractions
+			Short4 f1u1vs;
+			Short4 f0u1vs;
+			Short4 f1u0vs;
+			Short4 f0u0vs;
+
+			if(!hasUnsignedTextureComponent(0) || !hasUnsignedTextureComponent(1) || !hasUnsignedTextureComponent(2) || !hasUnsignedTextureComponent(3))
+			{
+				f1u1vs = f1u1v >> 1;
+				f0u1vs = f0u1v >> 1;
+				f1u0vs = f1u0v >> 1;
+				f0u0vs = f0u0v >> 1;
+			}
+
+			// Bilinear interpolation
+			if(componentCount >= 1)
+			{
+				if(has16bitTextureComponents() && hasUnsignedTextureComponent(0))
+				{
+					c00.x = As<UShort4>(c00.x) - MulHigh(As<UShort4>(c00.x), f0u) + MulHigh(As<UShort4>(c10.x), f0u);
+					c01.x = As<UShort4>(c01.x) - MulHigh(As<UShort4>(c01.x), f0u) + MulHigh(As<UShort4>(c11.x), f0u);
+					c.x   = As<UShort4>(c00.x) - MulHigh(As<UShort4>(c00.x), f0v) + MulHigh(As<UShort4>(c01.x), f0v);
+				}
+				else
+				{
+					if(hasUnsignedTextureComponent(0))
+					{
+						c00.x = MulHigh(As<UShort4>(c00.x), f1u1v);
+						c10.x = MulHigh(As<UShort4>(c10.x), f0u1v);
+						c01.x = MulHigh(As<UShort4>(c01.x), f1u0v);
+						c11.x = MulHigh(As<UShort4>(c11.x), f0u0v);
+					}
+					else
+					{
+						c00.x = MulHigh(c00.x, f1u1vs);
+						c10.x = MulHigh(c10.x, f0u1vs);
+						c01.x = MulHigh(c01.x, f1u0vs);
+						c11.x = MulHigh(c11.x, f0u0vs);
+					}
+
+					c.x = (c00.x + c10.x) + (c01.x + c11.x);
+					if(!hasUnsignedTextureComponent(0)) c.x = AddSat(c.x, c.x);   // Correct for signed fractions
+				}
+			}
+
+			if(componentCount >= 2)
+			{
+				if(has16bitTextureComponents() && hasUnsignedTextureComponent(1))
+				{
+					c00.y = As<UShort4>(c00.y) - MulHigh(As<UShort4>(c00.y), f0u) + MulHigh(As<UShort4>(c10.y), f0u);
+					c01.y = As<UShort4>(c01.y) - MulHigh(As<UShort4>(c01.y), f0u) + MulHigh(As<UShort4>(c11.y), f0u);
+					c.y   = As<UShort4>(c00.y) - MulHigh(As<UShort4>(c00.y), f0v) + MulHigh(As<UShort4>(c01.y), f0v);
+				}
+				else
+				{
+					if(hasUnsignedTextureComponent(1))
+					{
+						c00.y = MulHigh(As<UShort4>(c00.y), f1u1v);
+						c10.y = MulHigh(As<UShort4>(c10.y), f0u1v);
+						c01.y = MulHigh(As<UShort4>(c01.y), f1u0v);
+						c11.y = MulHigh(As<UShort4>(c11.y), f0u0v);
+					}
+					else
+					{
+						c00.y = MulHigh(c00.y, f1u1vs);
+						c10.y = MulHigh(c10.y, f0u1vs);
+						c01.y = MulHigh(c01.y, f1u0vs);
+						c11.y = MulHigh(c11.y, f0u0vs);
+					}
+
+					c.y = (c00.y + c10.y) + (c01.y + c11.y);
+					if(!hasUnsignedTextureComponent(1)) c.y = AddSat(c.y, c.y);   // Correct for signed fractions
+				}
+			}
+
+			if(componentCount >= 3)
+			{
+				if(has16bitTextureComponents() && hasUnsignedTextureComponent(2))
+				{
+					c00.z = As<UShort4>(c00.z) - MulHigh(As<UShort4>(c00.z), f0u) + MulHigh(As<UShort4>(c10.z), f0u);
+					c01.z = As<UShort4>(c01.z) - MulHigh(As<UShort4>(c01.z), f0u) + MulHigh(As<UShort4>(c11.z), f0u);
+					c.z   = As<UShort4>(c00.z) - MulHigh(As<UShort4>(c00.z), f0v) + MulHigh(As<UShort4>(c01.z), f0v);
+				}
+				else
+				{
+					if(hasUnsignedTextureComponent(2))
+					{
+						c00.z = MulHigh(As<UShort4>(c00.z), f1u1v);
+						c10.z = MulHigh(As<UShort4>(c10.z), f0u1v);
+						c01.z = MulHigh(As<UShort4>(c01.z), f1u0v);
+						c11.z = MulHigh(As<UShort4>(c11.z), f0u0v);
+					}
+					else
+					{
+						c00.z = MulHigh(c00.z, f1u1vs);
+						c10.z = MulHigh(c10.z, f0u1vs);
+						c01.z = MulHigh(c01.z, f1u0vs);
+						c11.z = MulHigh(c11.z, f0u0vs);
+					}
+
+					c.z = (c00.z + c10.z) + (c01.z + c11.z);
+					if(!hasUnsignedTextureComponent(2)) c.z = AddSat(c.z, c.z);   // Correct for signed fractions
+				}
+			}
+
+			if(componentCount >= 4)
+			{
+				if(has16bitTextureComponents() && hasUnsignedTextureComponent(3))
+				{
+					c00.w = As<UShort4>(c00.w) - MulHigh(As<UShort4>(c00.w), f0u) + MulHigh(As<UShort4>(c10.w), f0u);
+					c01.w = As<UShort4>(c01.w) - MulHigh(As<UShort4>(c01.w), f0u) + MulHigh(As<UShort4>(c11.w), f0u);
+					c.w   = As<UShort4>(c00.w) - MulHigh(As<UShort4>(c00.w), f0v) + MulHigh(As<UShort4>(c01.w), f0v);
+				}
+				else
+				{
+					if(hasUnsignedTextureComponent(3))
+					{
+						c00.w = MulHigh(As<UShort4>(c00.w), f1u1v);
+						c10.w = MulHigh(As<UShort4>(c10.w), f0u1v);
+						c01.w = MulHigh(As<UShort4>(c01.w), f1u0v);
+						c11.w = MulHigh(As<UShort4>(c11.w), f0u0v);
+					}
+					else
+					{
+						c00.w = MulHigh(c00.w, f1u1vs);
+						c10.w = MulHigh(c10.w, f0u1vs);
+						c01.w = MulHigh(c01.w, f1u0vs);
+						c11.w = MulHigh(c11.w, f0u0vs);
+					}
+
+					c.w = (c00.w + c10.w) + (c01.w + c11.w);
+					if(!hasUnsignedTextureComponent(3)) c.w = AddSat(c.w, c.w);   // Correct for signed fractions
+				}
 			}
 		}
 		else  // Gather
 		{
 			VkComponentSwizzle swizzle = gatherSwizzle();
-
-			// R/G/B/A swizzles affect the component collected from each texel earlier.
-			// Handle the ZERO and ONE cases here because we don't need to know the format.
-
-			if(swizzle == VK_COMPONENT_SWIZZLE_ZERO)
+			switch(swizzle)
 			{
-				c.x = c.y = c.z = c.w = Float4(0);
+			case VK_COMPONENT_SWIZZLE_ZERO:
+			case VK_COMPONENT_SWIZZLE_ONE:
+				// Handled at the final component swizzle.
+				break;
+			default:
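+				// Per the Vulkan spec, gather returns the four texels in
+				// (i0,j1), (i1,j1), (i1,j0), (i0,j0) order.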
+				c.x = c01[swizzle - VK_COMPONENT_SWIZZLE_R];
+				c.y = c11[swizzle - VK_COMPONENT_SWIZZLE_R];
+				c.z = c10[swizzle - VK_COMPONENT_SWIZZLE_R];
+				c.w = c00[swizzle - VK_COMPONENT_SWIZZLE_R];
+				break;
 			}
-			else if(swizzle == VK_COMPONENT_SWIZZLE_ONE)
+		}
+	}
+
+	return c;
+}
+
+Vector4s SamplerCore::sample3D(Pointer<Byte> &texture, Float4 &u_, Float4 &v_, Float4 &w_, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function)
+{
+	Vector4s c_;
+
+	int componentCount = textureComponentCount();
+
+	Pointer<Byte> mipmap;
+	Pointer<Byte> buffer;
+	selectMipmap(texture, mipmap, buffer, lod, secondLOD);
+
+	bool texelFetch = (function == Fetch);
+
+	Short4 uuuu = texelFetch ? Short4(As<Int4>(u_)) : address(u_, state.addressingModeU, mipmap);
+	Short4 vvvv = texelFetch ? Short4(As<Int4>(v_)) : address(v_, state.addressingModeV, mipmap);
+	Short4 wwww = texelFetch ? Short4(As<Int4>(w_)) : address(w_, state.addressingModeW, mipmap);
+
+	Short4 cubeArrayId(0);
+	if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
+	{
+		cubeArrayId = address(cubeArrayCoord, state.addressingModeY, mipmap);
+	}
+
+	if(state.textureFilter == FILTER_POINT || texelFetch)
+	{
+		c_ = sampleTexel(uuuu, vvvv, wwww, offset, mipmap, cubeArrayId, sampleId, buffer, function);
+	}
+	else
+	{
+		Vector4s c[2][2][2];
+
+		Short4 u[2][2][2];
+		Short4 v[2][2][2];
+		Short4 s[2][2][2];
+
+		for(int i = 0; i < 2; i++)
+		{
+			for(int j = 0; j < 2; j++)
 			{
-				bool integer = hasUnnormalizedIntegerTexture();
-				c.x = c.y = c.z = c.w = integer ? As<Float4>(Int4(1)) : RValue<Float4>(Float4(1.0f));
+				for(int k = 0; k < 2; k++)
+				{
+					u[i][j][k] = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, i * 2 - 1, lod);
+					v[i][j][k] = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, j * 2 - 1, lod);
+					s[i][j][k] = offsetSample(wwww, mipmap, OFFSET(Mipmap,wHalf), state.addressingModeW == ADDRESSING_WRAP, k * 2 - 1, lod);
+				}
 			}
 		}
 
+		// Fractions
+		UShort4 f0u = As<UShort4>(u[0][0][0]) * UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap,width)));
+		UShort4 f0v = As<UShort4>(v[0][0][0]) * UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap,height)));
+		UShort4 f0s = As<UShort4>(s[0][0][0]) * UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap,depth)));
+
+		UShort4 f1u = ~f0u;
+		UShort4 f1v = ~f0v;
+		UShort4 f1s = ~f0s;
+
+		UShort4 f[2][2][2];
+		Short4 fs[2][2][2];
+
+		f[1][1][1] = MulHigh(f1u, f1v);
+		f[0][1][1] = MulHigh(f0u, f1v);
+		f[1][0][1] = MulHigh(f1u, f0v);
+		f[0][0][1] = MulHigh(f0u, f0v);
+		f[1][1][0] = MulHigh(f1u, f1v);
+		f[0][1][0] = MulHigh(f0u, f1v);
+		f[1][0][0] = MulHigh(f1u, f0v);
+		f[0][0][0] = MulHigh(f0u, f0v);
+
+		f[1][1][1] = MulHigh(f[1][1][1], f1s);
+		f[0][1][1] = MulHigh(f[0][1][1], f1s);
+		f[1][0][1] = MulHigh(f[1][0][1], f1s);
+		f[0][0][1] = MulHigh(f[0][0][1], f1s);
+		f[1][1][0] = MulHigh(f[1][1][0], f0s);
+		f[0][1][0] = MulHigh(f[0][1][0], f0s);
+		f[1][0][0] = MulHigh(f[1][0][0], f0s);
+		f[0][0][0] = MulHigh(f[0][0][0], f0s);
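+		// f[i][j][k] is the trilinear weight of the texel at the opposite corner,
+		// so each sample c[i][j][k] below is scaled by f[1 - i][1 - j][1 - k].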
+
+		// Signed fractions
+		if(!hasUnsignedTextureComponent(0) || !hasUnsignedTextureComponent(1) || !hasUnsignedTextureComponent(2) || !hasUnsignedTextureComponent(3))
+		{
+			fs[0][0][0] = f[0][0][0] >> 1;
+			fs[0][0][1] = f[0][0][1] >> 1;
+			fs[0][1][0] = f[0][1][0] >> 1;
+			fs[0][1][1] = f[0][1][1] >> 1;
+			fs[1][0][0] = f[1][0][0] >> 1;
+			fs[1][0][1] = f[1][0][1] >> 1;
+			fs[1][1][0] = f[1][1][0] >> 1;
+			fs[1][1][1] = f[1][1][1] >> 1;
+		}
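+		// As in the 2D path, the halved signed weights are compensated for with
+		// AddSat at the end.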
+
+		for(int i = 0; i < 2; i++)
+		{
+			for(int j = 0; j < 2; j++)
+			{
+				for(int k = 0; k < 2; k++)
+				{
+					c[i][j][k] = sampleTexel(u[i][j][k], v[i][j][k], s[i][j][k], offset, mipmap, cubeArrayId, sampleId, buffer, function);
+
+					if(componentCount >= 1) { if(hasUnsignedTextureComponent(0)) c[i][j][k].x = MulHigh(As<UShort4>(c[i][j][k].x), f[1 - i][1 - j][1 - k]); else c[i][j][k].x = MulHigh(c[i][j][k].x, fs[1 - i][1 - j][1 - k]); }
+					if(componentCount >= 2) { if(hasUnsignedTextureComponent(1)) c[i][j][k].y = MulHigh(As<UShort4>(c[i][j][k].y), f[1 - i][1 - j][1 - k]); else c[i][j][k].y = MulHigh(c[i][j][k].y, fs[1 - i][1 - j][1 - k]); }
+					if(componentCount >= 3) { if(hasUnsignedTextureComponent(2)) c[i][j][k].z = MulHigh(As<UShort4>(c[i][j][k].z), f[1 - i][1 - j][1 - k]); else c[i][j][k].z = MulHigh(c[i][j][k].z, fs[1 - i][1 - j][1 - k]); }
+					if(componentCount >= 4) { if(hasUnsignedTextureComponent(3)) c[i][j][k].w = MulHigh(As<UShort4>(c[i][j][k].w), f[1 - i][1 - j][1 - k]); else c[i][j][k].w = MulHigh(c[i][j][k].w, fs[1 - i][1 - j][1 - k]); }
+
+					if(i != 0 || j != 0 || k != 0)
+					{
+						if(componentCount >= 1) c[0][0][0].x += c[i][j][k].x;
+						if(componentCount >= 2) c[0][0][0].y += c[i][j][k].y;
+						if(componentCount >= 3) c[0][0][0].z += c[i][j][k].z;
+						if(componentCount >= 4) c[0][0][0].w += c[i][j][k].w;
+					}
+				}
+			}
+		}
+
+		if(componentCount >= 1) c_.x = c[0][0][0].x;
+		if(componentCount >= 2) c_.y = c[0][0][0].y;
+		if(componentCount >= 3) c_.z = c[0][0][0].z;
+		if(componentCount >= 4) c_.w = c[0][0][0].w;
+
+		// Correct for signed fractions
+		if(componentCount >= 1) if(!hasUnsignedTextureComponent(0)) c_.x = AddSat(c_.x, c_.x);
+		if(componentCount >= 2) if(!hasUnsignedTextureComponent(1)) c_.y = AddSat(c_.y, c_.y);
+		if(componentCount >= 3) if(!hasUnsignedTextureComponent(2)) c_.z = AddSat(c_.z, c_.z);
+		if(componentCount >= 4) if(!hasUnsignedTextureComponent(3)) c_.w = AddSat(c_.w, c_.w);
+	}
+
+	return c_;
+}
+
+Vector4f SamplerCore::sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, SamplerFunction function)
+{
+	Vector4f c = sampleFloatAniso(texture, u, v, w, q, offset, cubeArrayCoord, sampleId, lod, anisotropy, uDelta, vDelta, false, function);
+
+	if(function == Fetch)
+	{
 		return c;
 	}
 
-	Short4 SamplerCore::offsetSample(Short4 &uvw, Pointer<Byte> &mipmap, int halfOffset, bool wrap, int count, Float &lod)
+	if(state.mipmapFilter == MIPMAP_LINEAR)
 	{
-		Short4 offset = *Pointer<Short4>(mipmap + halfOffset);
+		Vector4f cc = sampleFloatAniso(texture, u, v, w, q, offset, cubeArrayCoord, sampleId, lod, anisotropy, uDelta, vDelta, true, function);
 
-		if(state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
-		{
-			offset &= Short4(CmpNLE(Float4(lod), Float4(0.0f)));
-		}
-		else if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR)
-		{
-			offset &= Short4(CmpLE(Float4(lod), Float4(0.0f)));
-		}
+		Float4 lod4 = Float4(Frac(lod));
 
-		if(wrap)
+		c.x = (cc.x - c.x) * lod4 + c.x;
+		c.y = (cc.y - c.y) * lod4 + c.y;
+		c.z = (cc.z - c.z) * lod4 + c.z;
+		c.w = (cc.w - c.w) * lod4 + c.w;
+	}
+
+	return c;
+}
+
+Vector4f SamplerCore::sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD, SamplerFunction function)
+{
+	Vector4f c;
+
+	if(state.textureFilter != FILTER_ANISOTROPIC || function == Lod || function == Fetch)
+	{
+		c = sampleFloat(texture, u, v, w, q, offset, cubeArrayCoord, sampleId, lod, secondLOD, function);
+	}
+	else
+	{
+		Int a = RoundInt(anisotropy);
+
+		Vector4f cSum;
+
+		cSum.x = Float4(0.0f);
+		cSum.y = Float4(0.0f);
+		cSum.z = Float4(0.0f);
+		cSum.w = Float4(0.0f);
+
+		Float4 A = *Pointer<Float4>(constants + OFFSET(Constants,uvWeight) + 16 * a);
+		Float4 B = *Pointer<Float4>(constants + OFFSET(Constants,uvStart) + 16 * a);
+
+		Float4 du = uDelta;
+		Float4 dv = vDelta;
+
+		Float4 u0 = u + B * du;
+		Float4 v0 = v + B * dv;
+
+		du *= A;
+		dv *= A;
+
+		Int i = 0;
+
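+		// Walk along the anisotropy axis, accumulating samples weighted by A.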
+		Do
 		{
-			switch(count)
+			c = sampleFloat(texture, u0, v0, w, q, offset, cubeArrayCoord, sampleId, lod, secondLOD, function);
+
+			u0 += du;
+			v0 += dv;
+
+			cSum.x += c.x * A;
+			cSum.y += c.y * A;
+			cSum.z += c.z * A;
+			cSum.w += c.w * A;
+
+			i++;
+		}
+		Until(i >= a)
+
+		c.x = cSum.x;
+		c.y = cSum.y;
+		c.z = cSum.z;
+		c.w = cSum.w;
+	}
+
+	return c;
+}
+
+Vector4f SamplerCore::sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function)
+{
+	if(state.textureType != VK_IMAGE_VIEW_TYPE_3D)
+	{
+		return sampleFloat2D(texture, u, v, w, q, offset, cubeArrayCoord, sampleId, lod, secondLOD, function);
+	}
+	else
+	{
+		return sampleFloat3D(texture, u, v, w, offset, cubeArrayCoord, sampleId, lod, secondLOD, function);
+	}
+}
+
+Vector4f SamplerCore::sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function)
+{
+	Vector4f c;
+
+	int componentCount = textureComponentCount();
+	bool gather = (state.textureFilter == FILTER_GATHER);
+
+	Pointer<Byte> mipmap;
+	Pointer<Byte> buffer;
+	selectMipmap(texture, mipmap, buffer, lod, secondLOD);
+
+	Int4 x0, x1, y0, y1, z0;
+	Float4 fu, fv, fw;
+	Int4 filter = computeFilterOffset(lod);
+	address(u, x0, x1, fu, mipmap, offset.x, filter, OFFSET(Mipmap, width), state.addressingModeU, function);
+	address(v, y0, y1, fv, mipmap, offset.y, filter, OFFSET(Mipmap, height), state.addressingModeV, function);
+	address(w, z0, z0, fw, mipmap, offset.z, filter, OFFSET(Mipmap, depth), state.addressingModeW, function);
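+	// 2D sampling has no second slice, so z0 is passed for both Z outputs.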
+
+	Int4 cubeArrayId(0);
+	if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
+	{
+		address(cubeArrayCoord, cubeArrayId, cubeArrayId, fw, mipmap, offset.w, filter, OFFSET(Mipmap, depth), state.addressingModeY, function);
+	}
+
+	Int4 pitchP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, pitchP), 16);
+	y0 *= pitchP;
+	if(state.addressingModeW != ADDRESSING_UNUSED)
+	{
+		z0 *= *Pointer<Int4>(mipmap + OFFSET(Mipmap, sliceP), 16);
+	}
+
+	if(state.textureFilter == FILTER_POINT || (function == Fetch))
+	{
+		c = sampleTexel(x0, y0, z0, q, mipmap, cubeArrayId, sampleId, buffer, function);
+	}
+	else
+	{
+		y1 *= pitchP;
+
+		Vector4f c00 = sampleTexel(x0, y0, z0, q, mipmap, cubeArrayId, sampleId, buffer, function);
+		Vector4f c10 = sampleTexel(x1, y0, z0, q, mipmap, cubeArrayId, sampleId, buffer, function);
+		Vector4f c01 = sampleTexel(x0, y1, z0, q, mipmap, cubeArrayId, sampleId, buffer, function);
+		Vector4f c11 = sampleTexel(x1, y1, z0, q, mipmap, cubeArrayId, sampleId, buffer, function);
+
+		if(!gather)   // Blend
+		{
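+			// Linear interpolation: lerp(a, b, f) = a + f * (b - a).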
+			if(componentCount >= 1) c00.x = c00.x + fu * (c10.x - c00.x);
+			if(componentCount >= 2) c00.y = c00.y + fu * (c10.y - c00.y);
+			if(componentCount >= 3) c00.z = c00.z + fu * (c10.z - c00.z);
+			if(componentCount >= 4) c00.w = c00.w + fu * (c10.w - c00.w);
+
+			if(componentCount >= 1) c01.x = c01.x + fu * (c11.x - c01.x);
+			if(componentCount >= 2) c01.y = c01.y + fu * (c11.y - c01.y);
+			if(componentCount >= 3) c01.z = c01.z + fu * (c11.z - c01.z);
+			if(componentCount >= 4) c01.w = c01.w + fu * (c11.w - c01.w);
+
+			if(componentCount >= 1) c.x = c00.x + fv * (c01.x - c00.x);
+			if(componentCount >= 2) c.y = c00.y + fv * (c01.y - c00.y);
+			if(componentCount >= 3) c.z = c00.z + fv * (c01.z - c00.z);
+			if(componentCount >= 4) c.w = c00.w + fv * (c01.w - c00.w);
+		}
+		else  // Gather
+		{
+			VkComponentSwizzle swizzle = gatherSwizzle();
+			switch(swizzle)
 			{
-			case -1: return uvw - offset;
-			case  0: return uvw;
-			case +1: return uvw + offset;
-			case  2: return uvw + offset + offset;
+			case VK_COMPONENT_SWIZZLE_ZERO:
+			case VK_COMPONENT_SWIZZLE_ONE:
+				// Handled at the final component swizzle.
+				break;
+			default:
+				c.x = c01[swizzle - VK_COMPONENT_SWIZZLE_R];
+				c.y = c11[swizzle - VK_COMPONENT_SWIZZLE_R];
+				c.z = c10[swizzle - VK_COMPONENT_SWIZZLE_R];
+				c.w = c00[swizzle - VK_COMPONENT_SWIZZLE_R];
+				break;
 			}
 		}
-		else   // Clamp or mirror
+	}
+
+	return c;
+}
+
+Vector4f SamplerCore::sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function)
+{
+	Vector4f c;
+
+	int componentCount = textureComponentCount();
+
+	Pointer<Byte> mipmap;
+	Pointer<Byte> buffer;
+	selectMipmap(texture, mipmap, buffer, lod, secondLOD);
+
+	Int4 x0, x1, y0, y1, z0, z1;
+	Float4 fu, fv, fw;
+	Int4 filter = computeFilterOffset(lod);
+	address(u, x0, x1, fu, mipmap, offset.x, filter, OFFSET(Mipmap, width), state.addressingModeU, function);
+	address(v, y0, y1, fv, mipmap, offset.y, filter, OFFSET(Mipmap, height), state.addressingModeV, function);
+	address(w, z0, z1, fw, mipmap, offset.z, filter, OFFSET(Mipmap, depth), state.addressingModeW, function);
+
+	Int4 cubeArrayId(0);
+	if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
+	{
+		address(cubeArrayCoord, cubeArrayId, cubeArrayId, fw, mipmap, offset.w, filter, OFFSET(Mipmap, depth), state.addressingModeY, function);
+	}
+
+	Int4 pitchP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, pitchP), 16);
+	Int4 sliceP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, sliceP), 16);
+	y0 *= pitchP;
+	z0 *= sliceP;
+
+	if(state.textureFilter == FILTER_POINT || (function == Fetch))
+	{
+		c = sampleTexel(x0, y0, z0, w, mipmap, cubeArrayId, sampleId, buffer, function);
+	}
+	else
+	{
+		y1 *= pitchP;
+		z1 *= sliceP;
+
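+		// Corner naming: c<x><y><z>; e.g. c101 is the texel at (x1, y0, z1).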
+		Vector4f c000 = sampleTexel(x0, y0, z0, w, mipmap, cubeArrayId, sampleId, buffer, function);
+		Vector4f c100 = sampleTexel(x1, y0, z0, w, mipmap, cubeArrayId, sampleId, buffer, function);
+		Vector4f c010 = sampleTexel(x0, y1, z0, w, mipmap, cubeArrayId, sampleId, buffer, function);
+		Vector4f c110 = sampleTexel(x1, y1, z0, w, mipmap, cubeArrayId, sampleId, buffer, function);
+		Vector4f c001 = sampleTexel(x0, y0, z1, w, mipmap, cubeArrayId, sampleId, buffer, function);
+		Vector4f c101 = sampleTexel(x1, y0, z1, w, mipmap, cubeArrayId, sampleId, buffer, function);
+		Vector4f c011 = sampleTexel(x0, y1, z1, w, mipmap, cubeArrayId, sampleId, buffer, function);
+		Vector4f c111 = sampleTexel(x1, y1, z1, w, mipmap, cubeArrayId, sampleId, buffer, function);
+
+		// Blend first slice
+		if(componentCount >= 1) c000.x = c000.x + fu * (c100.x - c000.x);
+		if(componentCount >= 2) c000.y = c000.y + fu * (c100.y - c000.y);
+		if(componentCount >= 3) c000.z = c000.z + fu * (c100.z - c000.z);
+		if(componentCount >= 4) c000.w = c000.w + fu * (c100.w - c000.w);
+
+		if(componentCount >= 1) c010.x = c010.x + fu * (c110.x - c010.x);
+		if(componentCount >= 2) c010.y = c010.y + fu * (c110.y - c010.y);
+		if(componentCount >= 3) c010.z = c010.z + fu * (c110.z - c010.z);
+		if(componentCount >= 4) c010.w = c010.w + fu * (c110.w - c010.w);
+
+		if(componentCount >= 1) c000.x = c000.x + fv * (c010.x - c000.x);
+		if(componentCount >= 2) c000.y = c000.y + fv * (c010.y - c000.y);
+		if(componentCount >= 3) c000.z = c000.z + fv * (c010.z - c000.z);
+		if(componentCount >= 4) c000.w = c000.w + fv * (c010.w - c000.w);
+
+		// Blend second slice
+		if(componentCount >= 1) c001.x = c001.x + fu * (c101.x - c001.x);
+		if(componentCount >= 2) c001.y = c001.y + fu * (c101.y - c001.y);
+		if(componentCount >= 3) c001.z = c001.z + fu * (c101.z - c001.z);
+		if(componentCount >= 4) c001.w = c001.w + fu * (c101.w - c001.w);
+
+		if(componentCount >= 1) c011.x = c011.x + fu * (c111.x - c011.x);
+		if(componentCount >= 2) c011.y = c011.y + fu * (c111.y - c011.y);
+		if(componentCount >= 3) c011.z = c011.z + fu * (c111.z - c011.z);
+		if(componentCount >= 4) c011.w = c011.w + fu * (c111.w - c011.w);
+
+		if(componentCount >= 1) c001.x = c001.x + fv * (c011.x - c001.x);
+		if(componentCount >= 2) c001.y = c001.y + fv * (c011.y - c001.y);
+		if(componentCount >= 3) c001.z = c001.z + fv * (c011.z - c001.z);
+		if(componentCount >= 4) c001.w = c001.w + fv * (c011.w - c001.w);
+
+		// Blend slices
+		if(componentCount >= 1) c.x = c000.x + fw * (c001.x - c000.x);
+		if(componentCount >= 2) c.y = c000.y + fw * (c001.y - c000.y);
+		if(componentCount >= 3) c.z = c000.z + fw * (c001.z - c000.z);
+		if(componentCount >= 4) c.w = c000.w + fw * (c001.w - c000.w);
+	}
+
+	return c;
+}
+
+Float SamplerCore::log2sqrt(Float lod)
+{
+	// log2(sqrt(lod))                               // Equals 0.25 * log2(lod^2).
+	lod *= lod;                                      // Squaring doubles the exponent and produces an extra bit of precision.
+	lod = Float(As<Int>(lod)) - Float(0x3F800000);   // Interpret as integer and subtract the exponent bias.
+	lod *= As<Float>(Int(0x33000000));               // Scale by 0.25 * 2^-23 (mantissa length).
+
+	return lod;
+}
+
+Float SamplerCore::log2(Float lod)
+{
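+	// log2(lod)                                     // Equals 0.5 * log2(lod^2).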
+	lod *= lod;                                      // Squaring doubles the exponent and produces an extra bit of precision.
+	lod = Float(As<Int>(lod)) - Float(0x3F800000);   // Interpret as integer and subtract the exponent bias.
+	lod *= As<Float>(Int(0x33800000));               // Scale by 0.5 * 2^-23 (mantissa length).
+
+	return lod;
+}
+
+void SamplerCore::computeLod(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, Float4 &dsx, Float4 &dsy, SamplerFunction function)
+{
+	Float4 duvdxy;
+
+	if(function != Grad)   // Implicit
+	{
+		duvdxy = Float4(uuuu.yz, vvvv.yz) - Float4(uuuu.xx, vvvv.xx);
+	}
+	else
+	{
+		Float4 dudxy = Float4(dsx.xx, dsy.xx);
+		Float4 dvdxy = Float4(dsx.yy, dsy.yy);
+
+		duvdxy = Float4(dudxy.xz, dvdxy.xz);
+	}
+
+	// Scale by texture dimensions.
+	Float4 dUVdxy = duvdxy * *Pointer<Float4>(texture + OFFSET(Texture, widthWidthHeightHeight));
+
+	Float4 dUV2dxy = dUVdxy * dUVdxy;
+	Float4 dUV2 = dUV2dxy.xy + dUV2dxy.zw;
+
+	lod = Max(Float(dUV2.x), Float(dUV2.y));   // Squared length of major axis
+
+	if(state.textureFilter == FILTER_ANISOTROPIC)
+	{
+		Float det = Abs(Float(dUVdxy.x) * Float(dUVdxy.w) - Float(dUVdxy.y) * Float(dUVdxy.z));
+
+		Float4 dudx = duvdxy.xxxx;
+		Float4 dudy = duvdxy.yyyy;
+		Float4 dvdx = duvdxy.zzzz;
+		Float4 dvdy = duvdxy.wwww;
+
+		Int4 mask = As<Int4>(CmpNLT(dUV2.x, dUV2.y));
+		uDelta = As<Float4>((As<Int4>(dudx) & mask) | ((As<Int4>(dudy) & ~mask)));
+		vDelta = As<Float4>((As<Int4>(dvdx) & mask) | ((As<Int4>(dvdy) & ~mask)));
+
+		anisotropy = lod * Rcp_pp(det);
+		anisotropy = Min(anisotropy, *Pointer<Float>(sampler + OFFSET(vk::Sampler,maxAnisotropy)));
+
+		lod *= Rcp_pp(anisotropy * anisotropy);
+	}
+
+	lod = log2sqrt(lod);   // log2(sqrt(lod))
+}
+
+void SamplerCore::computeLodCube(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float4 &dsx, Float4 &dsy, Float4 &M, SamplerFunction function)
+{
+	Float4 dudxy, dvdxy, dsdxy;
+
+	if(function != Grad)  // Implicit
+	{
+		Float4 U = u * M;
+		Float4 V = v * M;
+		Float4 W = w * M;
+
+		dudxy = Abs(U - U.xxxx);
+		dvdxy = Abs(V - V.xxxx);
+		dsdxy = Abs(W - W.xxxx);
+	}
+	else
+	{
+		dudxy = Float4(dsx.xx, dsy.xx);
+		dvdxy = Float4(dsx.yy, dsy.yy);
+		dsdxy = Float4(dsx.zz, dsy.zz);
+
+		dudxy = Abs(dudxy * Float4(M.x));
+		dvdxy = Abs(dvdxy * Float4(M.x));
+		dsdxy = Abs(dsdxy * Float4(M.x));
+	}
+
+	// Compute the largest Manhattan distance in two dimensions.
+	// This takes the footprint across adjacent faces into account.
+	Float4 duvdxy = dudxy + dvdxy;
+	Float4 dusdxy = dudxy + dsdxy;
+	Float4 dvsdxy = dvdxy + dsdxy;
+
+	dudxy = Max(Max(duvdxy, dusdxy), dvsdxy);
+
+	lod = Max(Float(dudxy.y), Float(dudxy.z));   // FIXME: Max(dudxy.y, dudxy.z);
+
+	// Scale by texture dimension.
+	lod *= *Pointer<Float>(texture + OFFSET(Texture,width));
+
+	lod = log2(lod);
+}
+
+void SamplerCore::computeLod3D(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float4 &uuuu, Float4 &vvvv, Float4 &wwww, Float4 &dsx, Float4 &dsy, SamplerFunction function)
+{
+	Float4 dudxy, dvdxy, dsdxy;
+
+	if(function != Grad)   // Implicit
+	{
+		dudxy = uuuu - uuuu.xxxx;
+		dvdxy = vvvv - vvvv.xxxx;
+		dsdxy = wwww - wwww.xxxx;
+	}
+	else
+	{
+		dudxy = Float4(dsx.xx, dsy.xx);
+		dvdxy = Float4(dsx.yy, dsy.yy);
+		dsdxy = Float4(dsx.zz, dsy.zz);
+	}
+
+	// Scale by texture dimensions.
+	dudxy *= *Pointer<Float4>(texture + OFFSET(Texture, width));
+	dvdxy *= *Pointer<Float4>(texture + OFFSET(Texture, height));
+	dsdxy *= *Pointer<Float4>(texture + OFFSET(Texture, depth));
+
+	dudxy *= dudxy;
+	dvdxy *= dvdxy;
+	dsdxy *= dsdxy;
+
+	dudxy += dvdxy;
+	dudxy += dsdxy;
+
+	lod = Max(Float(dudxy.y), Float(dudxy.z));   // FIXME: Max(dudxy.y, dudxy.z);
+
+	lod = log2sqrt(lod);   // log2(sqrt(lod))
+}
+
+Int4 SamplerCore::cubeFace(Float4 &U, Float4 &V, Float4 &x, Float4 &y, Float4 &z, Float4 &M)
+{
+	// TODO: Comply with Vulkan recommendation:
+	// Vulkan 1.1: "The rules should have as the first rule that rz wins over ry and rx, and the second rule that ry wins over rx."
+
+	Int4 xn = CmpLT(x, Float4(0.0f));   // x < 0
+	Int4 yn = CmpLT(y, Float4(0.0f));   // y < 0
+	Int4 zn = CmpLT(z, Float4(0.0f));   // z < 0
+
+	Float4 absX = Abs(x);
+	Float4 absY = Abs(y);
+	Float4 absZ = Abs(z);
+
+	Int4 xy = CmpNLE(absX, absY);   // abs(x) > abs(y)
+	Int4 yz = CmpNLE(absY, absZ);   // abs(y) > abs(z)
+	Int4 zx = CmpNLE(absZ, absX);   // abs(z) > abs(x)
+	Int4 xMajor = xy & ~zx;   // abs(x) > abs(y) && abs(x) > abs(z)
+	Int4 yMajor = yz & ~xy;   // abs(y) > abs(z) && abs(y) > abs(x)
+	Int4 zMajor = zx & ~yz;   // abs(z) > abs(x) && abs(z) > abs(y)
+
+	// FACE_POSITIVE_X = 000b
+	// FACE_NEGATIVE_X = 001b
+	// FACE_POSITIVE_Y = 010b
+	// FACE_NEGATIVE_Y = 011b
+	// FACE_POSITIVE_Z = 100b
+	// FACE_NEGATIVE_Z = 101b
+
+	Int yAxis = SignMask(yMajor);
+	Int zAxis = SignMask(zMajor);
+
+	Int4 n = ((xn & xMajor) | (yn & yMajor) | (zn & zMajor)) & Int4(0x80000000);
+	Int negative = SignMask(n);
+
+	Int faces = *Pointer<Int>(constants + OFFSET(Constants,transposeBit0) + negative * 4);
+	faces |= *Pointer<Int>(constants + OFFSET(Constants,transposeBit1) + yAxis * 4);
+	faces |= *Pointer<Int>(constants + OFFSET(Constants,transposeBit2) + zAxis * 4);
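+	// 'faces' packs the four lanes' 3-bit face indices at 4-bit intervals.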
+
+	Int4 face;
+	face.x = faces & 0x7;
+	face.y = (faces >> 4)  & 0x7;
+	face.z = (faces >> 8)  & 0x7;
+	face.w = (faces >> 12) & 0x7;
+
+	M = Max(Max(absX, absY), Max(absZ, Float4(std::numeric_limits<float>::min())));
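+	// Clamping to the smallest normal float avoids division by zero in reciprocal(M).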
+
+	// U = xMajor ? (n ^ -z) : ((zMajor & n) ^ x)
+	U = As<Float4>((xMajor & (n ^ As<Int4>(-z))) | (~xMajor & ((zMajor & n) ^ As<Int4>(x))));
+
+	// V = !yMajor ? -y : (n ^ z)
+	V = As<Float4>((~yMajor & As<Int4>(-y)) | (yMajor & (n ^ As<Int4>(z))));
+
+	M = reciprocal(M) * Float4(0.5f);
+	U = U * M + Float4(0.5f);
+	V = V * M + Float4(0.5f);
+
+	return face;
+}
+
+Short4 SamplerCore::applyOffset(Short4 &uvw, Float4 &offset, const Int4 &whd, AddressingMode mode)
+{
+	Int4 tmp = Int4(As<UShort4>(uvw));
+	tmp = tmp + As<Int4>(offset);
+
+	switch(mode)
+	{
+	case AddressingMode::ADDRESSING_WRAP:
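+		// Bias by a multiple of whd so the dividend is non-negative; C++ '%' is
+		// not a true modulo for negative operands.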
+		tmp = (tmp + whd * Int4(-MIN_TEXEL_OFFSET)) % whd;
+		break;
+	case AddressingMode::ADDRESSING_CLAMP:
+	case AddressingMode::ADDRESSING_MIRROR:
+	case AddressingMode::ADDRESSING_MIRRORONCE:
+	case AddressingMode::ADDRESSING_BORDER: // FIXME: Implement and test ADDRESSING_MIRROR, ADDRESSING_MIRRORONCE, ADDRESSING_BORDER
+		tmp = Min(Max(tmp, Int4(0)), whd - Int4(1));
+		break;
+	case ADDRESSING_TEXELFETCH:
+		break;
+	case AddressingMode::ADDRESSING_SEAMLESS:
+		ASSERT(false);   // Cube sampling doesn't support offset.
+	default:
+		ASSERT(false);
+	}
+
+	return As<Short4>(UShort4(tmp));
+}
+
+void SamplerCore::computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, Vector4f &offset, const Pointer<Byte> &mipmap, const Short4& cubeArrayId, const Int4& sampleId, SamplerFunction function)
+{
+	bool texelFetch = (function == Fetch);
+	bool hasOffset = (function.offset != 0);
+
+	if(!texelFetch)
+	{
+		uuuu = MulHigh(As<UShort4>(uuuu), UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, width))));
+		vvvv = MulHigh(As<UShort4>(vvvv), UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, height))));
+	}
+
+	if(hasOffset)
+	{
+		uuuu = applyOffset(uuuu, offset.x, *Pointer<Int4>(mipmap + OFFSET(Mipmap, width)),
+		                   texelFetch ? ADDRESSING_TEXELFETCH : state.addressingModeU);
+		vvvv = applyOffset(vvvv, offset.y, *Pointer<Int4>(mipmap + OFFSET(Mipmap, height)),
+		                   texelFetch ? ADDRESSING_TEXELFETCH : state.addressingModeV);
+	}
+
+	Short4 uuu2 = uuuu;
+	uuuu = As<Short4>(UnpackLow(uuuu, vvvv));
+	uuu2 = As<Short4>(UnpackHigh(uuu2, vvvv));
+	uuuu = As<Short4>(MulAdd(uuuu, *Pointer<Short4>(mipmap + OFFSET(Mipmap,onePitchP))));
+	uuu2 = As<Short4>(MulAdd(uuu2, *Pointer<Short4>(mipmap + OFFSET(Mipmap,onePitchP))));
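+	// onePitchP packs (1, pitchP) pairs, so MulAdd yields v * pitchP + u per texel.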
+
+	if(hasThirdCoordinate())
+	{
+		if(state.textureType == VK_IMAGE_VIEW_TYPE_3D)
 		{
-			switch(count)
+			if(!texelFetch)
 			{
-			case -1: return SubSat(As<UShort4>(uvw), As<UShort4>(offset));
-			case  0: return uvw;
-			case +1: return AddSat(As<UShort4>(uvw), As<UShort4>(offset));
-			case  2: return AddSat(AddSat(As<UShort4>(uvw), As<UShort4>(offset)), As<UShort4>(offset));
+				wwww = MulHigh(As<UShort4>(wwww), UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, depth))));
+			}
+
+			if(hasOffset)
+			{
+				wwww = applyOffset(wwww, offset.z, *Pointer<Int4>(mipmap + OFFSET(Mipmap, depth)),
+				                   texelFetch ? ADDRESSING_TEXELFETCH : state.addressingModeW);
 			}
 		}
 
-		return uvw;
+		UInt4 uv(As<UInt2>(uuuu), As<UInt2>(uuu2));
+		uv += As<UInt4>(Int4(As<UShort4>(wwww))) * *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP));
+
+		index[0] = Extract(As<Int4>(uv), 0);
+		index[1] = Extract(As<Int4>(uv), 1);
+		index[2] = Extract(As<Int4>(uv), 2);
+		index[3] = Extract(As<Int4>(uv), 3);
+	}
+	else
+	{
+		index[0] = Extract(As<Int2>(uuuu), 0);
+		index[1] = Extract(As<Int2>(uuuu), 1);
+		index[2] = Extract(As<Int2>(uuu2), 0);
+		index[3] = Extract(As<Int2>(uuu2), 1);
 	}
 
-	Vector4s SamplerCore::sampleFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, SamplerFunction function)
+	if(texelFetch)
 	{
-		Vector4s c = sampleAniso(texture, u, v, w, offset, cubeArrayCoord, sampleId, lod, anisotropy, uDelta, vDelta, false, function);
-
-		if(function == Fetch)
-		{
-			return c;
-		}
-
-		if(state.mipmapFilter == MIPMAP_LINEAR)
-		{
-			Vector4s cc = sampleAniso(texture, u, v, w, offset, cubeArrayCoord, sampleId, lod, anisotropy, uDelta, vDelta, true, function);
-
-			lod *= Float(1 << 16);
-
-			UShort4 utri = UShort4(Float4(lod));   // FIXME: Optimize
-			Short4 stri = utri >> 1;   // FIXME: Optimize
-
-			if(hasUnsignedTextureComponent(0)) cc.x = MulHigh(As<UShort4>(cc.x), utri); else cc.x = MulHigh(cc.x, stri);
-			if(hasUnsignedTextureComponent(1)) cc.y = MulHigh(As<UShort4>(cc.y), utri); else cc.y = MulHigh(cc.y, stri);
-			if(hasUnsignedTextureComponent(2)) cc.z = MulHigh(As<UShort4>(cc.z), utri); else cc.z = MulHigh(cc.z, stri);
-			if(hasUnsignedTextureComponent(3)) cc.w = MulHigh(As<UShort4>(cc.w), utri); else cc.w = MulHigh(cc.w, stri);
-
-			utri = ~utri;
-			stri = Short4(0x7FFF) - stri;
-
-			if(hasUnsignedTextureComponent(0)) c.x = MulHigh(As<UShort4>(c.x), utri); else c.x = MulHigh(c.x, stri);
-			if(hasUnsignedTextureComponent(1)) c.y = MulHigh(As<UShort4>(c.y), utri); else c.y = MulHigh(c.y, stri);
-			if(hasUnsignedTextureComponent(2)) c.z = MulHigh(As<UShort4>(c.z), utri); else c.z = MulHigh(c.z, stri);
-			if(hasUnsignedTextureComponent(3)) c.w = MulHigh(As<UShort4>(c.w), utri); else c.w = MulHigh(c.w, stri);
-
-			c.x += cc.x;
-			c.y += cc.y;
-			c.z += cc.z;
-			c.w += cc.w;
-
-			if(!hasUnsignedTextureComponent(0)) c.x += c.x;
-			if(!hasUnsignedTextureComponent(1)) c.y += c.y;
-			if(!hasUnsignedTextureComponent(2)) c.z += c.z;
-			if(!hasUnsignedTextureComponent(3)) c.w += c.w;
-		}
-
-		return c;
-	}
-
-	Vector4s SamplerCore::sampleAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD, SamplerFunction function)
-	{
-		Vector4s c;
-
-		if(state.textureFilter != FILTER_ANISOTROPIC || function == Lod || function == Fetch)
-		{
-			c = sampleQuad(texture, u, v, w, offset, cubeArrayCoord, sampleId, lod, secondLOD, function);
-		}
-		else
-		{
-			Int a = RoundInt(anisotropy);
-
-			Vector4s cSum;
-
-			cSum.x = Short4(0);
-			cSum.y = Short4(0);
-			cSum.z = Short4(0);
-			cSum.w = Short4(0);
-
-			Float4 A = *Pointer<Float4>(constants + OFFSET(Constants,uvWeight) + 16 * a);
-			Float4 B = *Pointer<Float4>(constants + OFFSET(Constants,uvStart) + 16 * a);
-			UShort4 cw = *Pointer<UShort4>(constants + OFFSET(Constants,cWeight) + 8 * a);
-			Short4 sw = Short4(cw >> 1);
-
-			Float4 du = uDelta;
-			Float4 dv = vDelta;
-
-			Float4 u0 = u + B * du;
-			Float4 v0 = v + B * dv;
-
-			du *= A;
-			dv *= A;
-
-			Int i = 0;
-
-			Do
-			{
-				c = sampleQuad(texture, u0, v0, w, offset, cubeArrayCoord, sampleId, lod, secondLOD, function);
-
-				u0 += du;
-				v0 += dv;
-
-				if(hasUnsignedTextureComponent(0)) cSum.x += As<Short4>(MulHigh(As<UShort4>(c.x), cw)); else cSum.x += MulHigh(c.x, sw);
-				if(hasUnsignedTextureComponent(1)) cSum.y += As<Short4>(MulHigh(As<UShort4>(c.y), cw)); else cSum.y += MulHigh(c.y, sw);
-				if(hasUnsignedTextureComponent(2)) cSum.z += As<Short4>(MulHigh(As<UShort4>(c.z), cw)); else cSum.z += MulHigh(c.z, sw);
-				if(hasUnsignedTextureComponent(3)) cSum.w += As<Short4>(MulHigh(As<UShort4>(c.w), cw)); else cSum.w += MulHigh(c.w, sw);
-
-				i++;
-			}
-			Until(i >= a)
-
-			if(hasUnsignedTextureComponent(0)) c.x = cSum.x; else c.x = AddSat(cSum.x, cSum.x);
-			if(hasUnsignedTextureComponent(1)) c.y = cSum.y; else c.y = AddSat(cSum.y, cSum.y);
-			if(hasUnsignedTextureComponent(2)) c.z = cSum.z; else c.z = AddSat(cSum.z, cSum.z);
-			if(hasUnsignedTextureComponent(3)) c.w = cSum.w; else c.w = AddSat(cSum.w, cSum.w);
-		}
-
-		return c;
-	}
-
-	Vector4s SamplerCore::sampleQuad(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function)
-	{
-		if(state.textureType != VK_IMAGE_VIEW_TYPE_3D)
-		{
-			return sampleQuad2D(texture, u, v, w, offset, cubeArrayCoord, sampleId, lod, secondLOD, function);
-		}
-		else
-		{
-			return sample3D(texture, u, v, w, offset, cubeArrayCoord, sampleId, lod, secondLOD, function);
-		}
-	}
-
-	Vector4s SamplerCore::sampleQuad2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function)
-	{
-		Vector4s c;
-
-		int componentCount = textureComponentCount();
-		bool gather = (state.textureFilter == FILTER_GATHER);
-
-		Pointer<Byte> mipmap;
-		Pointer<Byte> buffer;
-		selectMipmap(texture, mipmap, buffer, lod, secondLOD);
-
-		bool texelFetch = (function == Fetch);
-
-		Short4 uuuu = texelFetch ? Short4(As<Int4>(u)) : address(u, state.addressingModeU, mipmap);
-		Short4 vvvv = texelFetch ? Short4(As<Int4>(v)) : address(v, state.addressingModeV, mipmap);
-		Short4 wwww = texelFetch ? Short4(As<Int4>(w)) : address(w, state.addressingModeW, mipmap);
-
-		Short4 cubeArrayId(0);
-		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
-		{
-			cubeArrayId = address(cubeArrayCoord, state.addressingModeY, mipmap);
-		}
-
-		if(state.textureFilter == FILTER_POINT || texelFetch)
-		{
-			c = sampleTexel(uuuu, vvvv, wwww, offset, mipmap, cubeArrayId, sampleId, buffer, function);
-		}
-		else
-		{
-			Short4 uuuu0 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, -1, lod);
-			Short4 vvvv0 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, -1, lod);
-			Short4 uuuu1 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, +1, lod);
-			Short4 vvvv1 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, +1, lod);
-
-			Vector4s c00 = sampleTexel(uuuu0, vvvv0, wwww, offset, mipmap, cubeArrayId, sampleId, buffer, function);
-			Vector4s c10 = sampleTexel(uuuu1, vvvv0, wwww, offset, mipmap, cubeArrayId, sampleId, buffer, function);
-			Vector4s c01 = sampleTexel(uuuu0, vvvv1, wwww, offset, mipmap, cubeArrayId, sampleId, buffer, function);
-			Vector4s c11 = sampleTexel(uuuu1, vvvv1, wwww, offset, mipmap, cubeArrayId, sampleId, buffer, function);
-
-			if(!gather)   // Blend
-			{
-				// Fractions
-				UShort4 f0u = As<UShort4>(uuuu0) * UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap,width)));
-				UShort4 f0v = As<UShort4>(vvvv0) * UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap,height)));
-
-				UShort4 f1u = ~f0u;
-				UShort4 f1v = ~f0v;
-
-				UShort4 f0u0v = MulHigh(f0u, f0v);
-				UShort4 f1u0v = MulHigh(f1u, f0v);
-				UShort4 f0u1v = MulHigh(f0u, f1v);
-				UShort4 f1u1v = MulHigh(f1u, f1v);
-
-				// Signed fractions
-				Short4 f1u1vs;
-				Short4 f0u1vs;
-				Short4 f1u0vs;
-				Short4 f0u0vs;
-
-				if(!hasUnsignedTextureComponent(0) || !hasUnsignedTextureComponent(1) || !hasUnsignedTextureComponent(2) || !hasUnsignedTextureComponent(3))
-				{
-					f1u1vs = f1u1v >> 1;
-					f0u1vs = f0u1v >> 1;
-					f1u0vs = f1u0v >> 1;
-					f0u0vs = f0u0v >> 1;
-				}
-
-				// Bilinear interpolation
-				if(componentCount >= 1)
-				{
-					if(has16bitTextureComponents() && hasUnsignedTextureComponent(0))
-					{
-						c00.x = As<UShort4>(c00.x) - MulHigh(As<UShort4>(c00.x), f0u) + MulHigh(As<UShort4>(c10.x), f0u);
-						c01.x = As<UShort4>(c01.x) - MulHigh(As<UShort4>(c01.x), f0u) + MulHigh(As<UShort4>(c11.x), f0u);
-						c.x   = As<UShort4>(c00.x) - MulHigh(As<UShort4>(c00.x), f0v) + MulHigh(As<UShort4>(c01.x), f0v);
-					}
-					else
-					{
-						if(hasUnsignedTextureComponent(0))
-						{
-							c00.x = MulHigh(As<UShort4>(c00.x), f1u1v);
-							c10.x = MulHigh(As<UShort4>(c10.x), f0u1v);
-							c01.x = MulHigh(As<UShort4>(c01.x), f1u0v);
-							c11.x = MulHigh(As<UShort4>(c11.x), f0u0v);
-						}
-						else
-						{
-							c00.x = MulHigh(c00.x, f1u1vs);
-							c10.x = MulHigh(c10.x, f0u1vs);
-							c01.x = MulHigh(c01.x, f1u0vs);
-							c11.x = MulHigh(c11.x, f0u0vs);
-						}
-
-						c.x = (c00.x + c10.x) + (c01.x + c11.x);
-						if(!hasUnsignedTextureComponent(0)) c.x = AddSat(c.x, c.x);   // Correct for signed fractions
-					}
-				}
-
-				if(componentCount >= 2)
-				{
-					if(has16bitTextureComponents() && hasUnsignedTextureComponent(1))
-					{
-						c00.y = As<UShort4>(c00.y) - MulHigh(As<UShort4>(c00.y), f0u) + MulHigh(As<UShort4>(c10.y), f0u);
-						c01.y = As<UShort4>(c01.y) - MulHigh(As<UShort4>(c01.y), f0u) + MulHigh(As<UShort4>(c11.y), f0u);
-						c.y   = As<UShort4>(c00.y) - MulHigh(As<UShort4>(c00.y), f0v) + MulHigh(As<UShort4>(c01.y), f0v);
-					}
-					else
-					{
-						if(hasUnsignedTextureComponent(1))
-						{
-							c00.y = MulHigh(As<UShort4>(c00.y), f1u1v);
-							c10.y = MulHigh(As<UShort4>(c10.y), f0u1v);
-							c01.y = MulHigh(As<UShort4>(c01.y), f1u0v);
-							c11.y = MulHigh(As<UShort4>(c11.y), f0u0v);
-						}
-						else
-						{
-							c00.y = MulHigh(c00.y, f1u1vs);
-							c10.y = MulHigh(c10.y, f0u1vs);
-							c01.y = MulHigh(c01.y, f1u0vs);
-							c11.y = MulHigh(c11.y, f0u0vs);
-						}
-
-						c.y = (c00.y + c10.y) + (c01.y + c11.y);
-						if(!hasUnsignedTextureComponent(1)) c.y = AddSat(c.y, c.y);   // Correct for signed fractions
-					}
-				}
-
-				if(componentCount >= 3)
-				{
-					if(has16bitTextureComponents() && hasUnsignedTextureComponent(2))
-					{
-						c00.z = As<UShort4>(c00.z) - MulHigh(As<UShort4>(c00.z), f0u) + MulHigh(As<UShort4>(c10.z), f0u);
-						c01.z = As<UShort4>(c01.z) - MulHigh(As<UShort4>(c01.z), f0u) + MulHigh(As<UShort4>(c11.z), f0u);
-						c.z   = As<UShort4>(c00.z) - MulHigh(As<UShort4>(c00.z), f0v) + MulHigh(As<UShort4>(c01.z), f0v);
-					}
-					else
-					{
-						if(hasUnsignedTextureComponent(2))
-						{
-							c00.z = MulHigh(As<UShort4>(c00.z), f1u1v);
-							c10.z = MulHigh(As<UShort4>(c10.z), f0u1v);
-							c01.z = MulHigh(As<UShort4>(c01.z), f1u0v);
-							c11.z = MulHigh(As<UShort4>(c11.z), f0u0v);
-						}
-						else
-						{
-							c00.z = MulHigh(c00.z, f1u1vs);
-							c10.z = MulHigh(c10.z, f0u1vs);
-							c01.z = MulHigh(c01.z, f1u0vs);
-							c11.z = MulHigh(c11.z, f0u0vs);
-						}
-
-						c.z = (c00.z + c10.z) + (c01.z + c11.z);
-						if(!hasUnsignedTextureComponent(2)) c.z = AddSat(c.z, c.z);   // Correct for signed fractions
-					}
-				}
-
-				if(componentCount >= 4)
-				{
-					if(has16bitTextureComponents() && hasUnsignedTextureComponent(3))
-					{
-						c00.w = As<UShort4>(c00.w) - MulHigh(As<UShort4>(c00.w), f0u) + MulHigh(As<UShort4>(c10.w), f0u);
-						c01.w = As<UShort4>(c01.w) - MulHigh(As<UShort4>(c01.w), f0u) + MulHigh(As<UShort4>(c11.w), f0u);
-						c.w  = As<UShort4>(c00.w) - MulHigh(As<UShort4>(c00.w), f0v) + MulHigh(As<UShort4>(c01.w), f0v);
-					}
-					else
-					{
-						if(hasUnsignedTextureComponent(3))
-						{
-							c00.w = MulHigh(As<UShort4>(c00.w), f1u1v);
-							c10.w = MulHigh(As<UShort4>(c10.w), f0u1v);
-							c01.w = MulHigh(As<UShort4>(c01.w), f1u0v);
-							c11.w = MulHigh(As<UShort4>(c11.w), f0u0v);
-						}
-						else
-						{
-							c00.w = MulHigh(c00.w, f1u1vs);
-							c10.w = MulHigh(c10.w, f0u1vs);
-							c01.w = MulHigh(c01.w, f1u0vs);
-							c11.w = MulHigh(c11.w, f0u0vs);
-						}
-
-						c.w = (c00.w + c10.w) + (c01.w + c11.w);
-						if(!hasUnsignedTextureComponent(3)) c.w = AddSat(c.w, c.w);   // Correct for signed fractions
-					}
-				}
-			}
-			else  // Gather
-			{
-				VkComponentSwizzle swizzle = gatherSwizzle();
-				switch(swizzle)
-				{
-				case VK_COMPONENT_SWIZZLE_ZERO:
-				case VK_COMPONENT_SWIZZLE_ONE:
-					// Handled at the final component swizzle.
-					break;
-				default:
-					c.x = c01[swizzle - VK_COMPONENT_SWIZZLE_R];
-					c.y = c11[swizzle - VK_COMPONENT_SWIZZLE_R];
-					c.z = c10[swizzle - VK_COMPONENT_SWIZZLE_R];
-					c.w = c00[swizzle - VK_COMPONENT_SWIZZLE_R];
-					break;
-				}
-			}
-		}
-
-		return c;
-	}
-
-	Vector4s SamplerCore::sample3D(Pointer<Byte> &texture, Float4 &u_, Float4 &v_, Float4 &w_, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function)
-	{
-		Vector4s c_;
-
-		int componentCount = textureComponentCount();
-
-		Pointer<Byte> mipmap;
-		Pointer<Byte> buffer;
-		selectMipmap(texture, mipmap, buffer, lod, secondLOD);
-
-		bool texelFetch = (function == Fetch);
-
-		Short4 uuuu = texelFetch ? Short4(As<Int4>(u_)) : address(u_, state.addressingModeU, mipmap);
-		Short4 vvvv = texelFetch ? Short4(As<Int4>(v_)) : address(v_, state.addressingModeV, mipmap);
-		Short4 wwww = texelFetch ? Short4(As<Int4>(w_)) : address(w_, state.addressingModeW, mipmap);
-
-		Short4 cubeArrayId(0);
-		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
-		{
-			cubeArrayId = address(cubeArrayCoord, state.addressingModeY, mipmap);
-		}
-
-		if(state.textureFilter == FILTER_POINT || texelFetch)
-		{
-			c_ = sampleTexel(uuuu, vvvv, wwww, offset, mipmap, cubeArrayId, sampleId, buffer, function);
-		}
-		else
-		{
-			Vector4s c[2][2][2];
-
-			Short4 u[2][2][2];
-			Short4 v[2][2][2];
-			Short4 s[2][2][2];
-
-			for(int i = 0; i < 2; i++)
-			{
-				for(int j = 0; j < 2; j++)
-				{
-					for(int k = 0; k < 2; k++)
-					{
-						u[i][j][k] = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, i * 2 - 1, lod);
-						v[i][j][k] = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, j * 2 - 1, lod);
-						s[i][j][k] = offsetSample(wwww, mipmap, OFFSET(Mipmap,wHalf), state.addressingModeW == ADDRESSING_WRAP, k * 2 - 1, lod);
-					}
-				}
-			}
-
-			// Fractions
-			UShort4 f0u = As<UShort4>(u[0][0][0]) * UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap,width)));
-			UShort4 f0v = As<UShort4>(v[0][0][0]) * UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap,height)));
-			UShort4 f0s = As<UShort4>(s[0][0][0]) * UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap,depth)));
-
-			UShort4 f1u = ~f0u;
-			UShort4 f1v = ~f0v;
-			UShort4 f1s = ~f0s;
-
-			UShort4 f[2][2][2];
-			Short4 fs[2][2][2];
-
-			f[1][1][1] = MulHigh(f1u, f1v);
-			f[0][1][1] = MulHigh(f0u, f1v);
-			f[1][0][1] = MulHigh(f1u, f0v);
-			f[0][0][1] = MulHigh(f0u, f0v);
-			f[1][1][0] = MulHigh(f1u, f1v);
-			f[0][1][0] = MulHigh(f0u, f1v);
-			f[1][0][0] = MulHigh(f1u, f0v);
-			f[0][0][0] = MulHigh(f0u, f0v);
-
-			f[1][1][1] = MulHigh(f[1][1][1], f1s);
-			f[0][1][1] = MulHigh(f[0][1][1], f1s);
-			f[1][0][1] = MulHigh(f[1][0][1], f1s);
-			f[0][0][1] = MulHigh(f[0][0][1], f1s);
-			f[1][1][0] = MulHigh(f[1][1][0], f0s);
-			f[0][1][0] = MulHigh(f[0][1][0], f0s);
-			f[1][0][0] = MulHigh(f[1][0][0], f0s);
-			f[0][0][0] = MulHigh(f[0][0][0], f0s);
-
-			// Signed fractions
-			if(!hasUnsignedTextureComponent(0) || !hasUnsignedTextureComponent(1) || !hasUnsignedTextureComponent(2) || !hasUnsignedTextureComponent(3))
-			{
-				fs[0][0][0] = f[0][0][0] >> 1;
-				fs[0][0][1] = f[0][0][1] >> 1;
-				fs[0][1][0] = f[0][1][0] >> 1;
-				fs[0][1][1] = f[0][1][1] >> 1;
-				fs[1][0][0] = f[1][0][0] >> 1;
-				fs[1][0][1] = f[1][0][1] >> 1;
-				fs[1][1][0] = f[1][1][0] >> 1;
-				fs[1][1][1] = f[1][1][1] >> 1;
-			}
-
-			for(int i = 0; i < 2; i++)
-			{
-				for(int j = 0; j < 2; j++)
-				{
-					for(int k = 0; k < 2; k++)
-					{
-						c[i][j][k] = sampleTexel(u[i][j][k], v[i][j][k], s[i][j][k], offset, mipmap, cubeArrayId, sampleId, buffer, function);
-
-						if(componentCount >= 1) { if(hasUnsignedTextureComponent(0)) c[i][j][k].x = MulHigh(As<UShort4>(c[i][j][k].x), f[1 - i][1 - j][1 - k]); else c[i][j][k].x = MulHigh(c[i][j][k].x, fs[1 - i][1 - j][1 - k]); }
-						if(componentCount >= 2) { if(hasUnsignedTextureComponent(1)) c[i][j][k].y = MulHigh(As<UShort4>(c[i][j][k].y), f[1 - i][1 - j][1 - k]); else c[i][j][k].y = MulHigh(c[i][j][k].y, fs[1 - i][1 - j][1 - k]); }
-						if(componentCount >= 3) { if(hasUnsignedTextureComponent(2)) c[i][j][k].z = MulHigh(As<UShort4>(c[i][j][k].z), f[1 - i][1 - j][1 - k]); else c[i][j][k].z = MulHigh(c[i][j][k].z, fs[1 - i][1 - j][1 - k]); }
-						if(componentCount >= 4) { if(hasUnsignedTextureComponent(3)) c[i][j][k].w = MulHigh(As<UShort4>(c[i][j][k].w), f[1 - i][1 - j][1 - k]); else c[i][j][k].w = MulHigh(c[i][j][k].w, fs[1 - i][1 - j][1 - k]); }
-
-						if(i != 0 || j != 0 || k != 0)
-						{
-							if(componentCount >= 1) c[0][0][0].x += c[i][j][k].x;
-							if(componentCount >= 2) c[0][0][0].y += c[i][j][k].y;
-							if(componentCount >= 3) c[0][0][0].z += c[i][j][k].z;
-							if(componentCount >= 4) c[0][0][0].w += c[i][j][k].w;
-						}
-					}
-				}
-			}
-
-			if(componentCount >= 1) c_.x = c[0][0][0].x;
-			if(componentCount >= 2) c_.y = c[0][0][0].y;
-			if(componentCount >= 3) c_.z = c[0][0][0].z;
-			if(componentCount >= 4) c_.w = c[0][0][0].w;
-
-			// Correct for signed fractions
-			if(componentCount >= 1) if(!hasUnsignedTextureComponent(0)) c_.x = AddSat(c_.x, c_.x);
-			if(componentCount >= 2) if(!hasUnsignedTextureComponent(1)) c_.y = AddSat(c_.y, c_.y);
-			if(componentCount >= 3) if(!hasUnsignedTextureComponent(2)) c_.z = AddSat(c_.z, c_.z);
-			if(componentCount >= 4) if(!hasUnsignedTextureComponent(3)) c_.w = AddSat(c_.w, c_.w);
-		}
-
-		return c_;
-	}
-
-	Vector4f SamplerCore::sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, SamplerFunction function)
-	{
-		Vector4f c = sampleFloatAniso(texture, u, v, w, q, offset, cubeArrayCoord, sampleId, lod, anisotropy, uDelta, vDelta, false, function);
-
-		if(function == Fetch)
-		{
-			return c;
-		}
-
-		if(state.mipmapFilter == MIPMAP_LINEAR)
-		{
-			Vector4f cc = sampleFloatAniso(texture, u, v, w, q, offset, cubeArrayCoord, sampleId, lod, anisotropy, uDelta, vDelta, true, function);
-
-			Float4 lod4 = Float4(Frac(lod));
-
-			c.x = (cc.x - c.x) * lod4 + c.x;
-			c.y = (cc.y - c.y) * lod4 + c.y;
-			c.z = (cc.z - c.z) * lod4 + c.z;
-			c.w = (cc.w - c.w) * lod4 + c.w;
-		}
-
-		return c;
-	}
-
-	Vector4f SamplerCore::sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD, SamplerFunction function)
-	{
-		Vector4f c;
-
-		if(state.textureFilter != FILTER_ANISOTROPIC || function == Lod || function == Fetch)
-		{
-			c = sampleFloat(texture, u, v, w, q, offset, cubeArrayCoord, sampleId, lod, secondLOD, function);
-		}
-		else
-		{
-			Int a = RoundInt(anisotropy);
-
-			Vector4f cSum;
-
-			cSum.x = Float4(0.0f);
-			cSum.y = Float4(0.0f);
-			cSum.z = Float4(0.0f);
-			cSum.w = Float4(0.0f);
-
-			Float4 A = *Pointer<Float4>(constants + OFFSET(Constants,uvWeight) + 16 * a);
-			Float4 B = *Pointer<Float4>(constants + OFFSET(Constants,uvStart) + 16 * a);
-
-			Float4 du = uDelta;
-			Float4 dv = vDelta;
-
-			Float4 u0 = u + B * du;
-			Float4 v0 = v + B * dv;
-
-			du *= A;
-			dv *= A;
-
-			Int i = 0;
-
-			Do
-			{
-				c = sampleFloat(texture, u0, v0, w, q, offset, cubeArrayCoord, sampleId, lod, secondLOD, function);
-
-				u0 += du;
-				v0 += dv;
-
-				cSum.x += c.x * A;
-				cSum.y += c.y * A;
-				cSum.z += c.z * A;
-				cSum.w += c.w * A;
-
-				i++;
-			}
-			Until(i >= a)
-
-			c.x = cSum.x;
-			c.y = cSum.y;
-			c.z = cSum.z;
-			c.w = cSum.w;
-		}
-
-		return c;
-	}
-
-	Vector4f SamplerCore::sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function)
-	{
-		if(state.textureType != VK_IMAGE_VIEW_TYPE_3D)
-		{
-			return sampleFloat2D(texture, u, v, w, q, offset, cubeArrayCoord, sampleId, lod, secondLOD, function);
-		}
-		else
-		{
-			return sampleFloat3D(texture, u, v, w, offset, cubeArrayCoord, sampleId, lod, secondLOD, function);
-		}
-	}
-
-	Vector4f SamplerCore::sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function)
-	{
-		Vector4f c;
-
-		int componentCount = textureComponentCount();
-		bool gather = (state.textureFilter == FILTER_GATHER);
-
-		Pointer<Byte> mipmap;
-		Pointer<Byte> buffer;
-		selectMipmap(texture, mipmap, buffer, lod, secondLOD);
-
-		Int4 x0, x1, y0, y1, z0;
-		Float4 fu, fv, fw;
-		Int4 filter = computeFilterOffset(lod);
-		address(u, x0, x1, fu, mipmap, offset.x, filter, OFFSET(Mipmap, width), state.addressingModeU, function);
-		address(v, y0, y1, fv, mipmap, offset.y, filter, OFFSET(Mipmap, height), state.addressingModeV, function);
-		address(w, z0, z0, fw, mipmap, offset.z, filter, OFFSET(Mipmap, depth), state.addressingModeW, function);
-
-		Int4 cubeArrayId(0);
-		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
-		{
-			address(cubeArrayCoord, cubeArrayId, cubeArrayId, fw, mipmap, offset.w, filter, OFFSET(Mipmap, depth), state.addressingModeY, function);
-		}
-
-		Int4 pitchP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, pitchP), 16);
-		y0 *= pitchP;
-		if(state.addressingModeW != ADDRESSING_UNUSED)
-		{
-			z0 *= *Pointer<Int4>(mipmap + OFFSET(Mipmap, sliceP), 16);
-		}
-
-		if(state.textureFilter == FILTER_POINT || (function == Fetch))
-		{
-			c = sampleTexel(x0, y0, z0, q, mipmap, cubeArrayId, sampleId, buffer, function);
-		}
-		else
-		{
-			y1 *= pitchP;
-
-			Vector4f c00 = sampleTexel(x0, y0, z0, q, mipmap, cubeArrayId, sampleId, buffer, function);
-			Vector4f c10 = sampleTexel(x1, y0, z0, q, mipmap, cubeArrayId, sampleId, buffer, function);
-			Vector4f c01 = sampleTexel(x0, y1, z0, q, mipmap, cubeArrayId, sampleId, buffer, function);
-			Vector4f c11 = sampleTexel(x1, y1, z0, q, mipmap, cubeArrayId, sampleId, buffer, function);
-
-			if(!gather)   // Blend
-			{
-				if(componentCount >= 1) c00.x = c00.x + fu * (c10.x - c00.x);
-				if(componentCount >= 2) c00.y = c00.y + fu * (c10.y - c00.y);
-				if(componentCount >= 3) c00.z = c00.z + fu * (c10.z - c00.z);
-				if(componentCount >= 4) c00.w = c00.w + fu * (c10.w - c00.w);
-
-				if(componentCount >= 1) c01.x = c01.x + fu * (c11.x - c01.x);
-				if(componentCount >= 2) c01.y = c01.y + fu * (c11.y - c01.y);
-				if(componentCount >= 3) c01.z = c01.z + fu * (c11.z - c01.z);
-				if(componentCount >= 4) c01.w = c01.w + fu * (c11.w - c01.w);
-
-				if(componentCount >= 1) c.x = c00.x + fv * (c01.x - c00.x);
-				if(componentCount >= 2) c.y = c00.y + fv * (c01.y - c00.y);
-				if(componentCount >= 3) c.z = c00.z + fv * (c01.z - c00.z);
-				if(componentCount >= 4) c.w = c00.w + fv * (c01.w - c00.w);
-			}
-			else  // Gather
-			{
-				VkComponentSwizzle swizzle = gatherSwizzle();
-				switch(swizzle)
-				{
-				case VK_COMPONENT_SWIZZLE_ZERO:
-				case VK_COMPONENT_SWIZZLE_ONE:
-					// Handled at the final component swizzle.
-					break;
-				default:
-					c.x = c01[swizzle - VK_COMPONENT_SWIZZLE_R];
-					c.y = c11[swizzle - VK_COMPONENT_SWIZZLE_R];
-					c.z = c10[swizzle - VK_COMPONENT_SWIZZLE_R];
-					c.w = c00[swizzle - VK_COMPONENT_SWIZZLE_R];
-					break;
-				}
-			}
-		}
-
-		return c;
-	}
-
-	Vector4f SamplerCore::sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function)
-	{
-		Vector4f c;
-
-		int componentCount = textureComponentCount();
-
-		Pointer<Byte> mipmap;
-		Pointer<Byte> buffer;
-		selectMipmap(texture, mipmap, buffer, lod, secondLOD);
-
-		Int4 x0, x1, y0, y1, z0, z1;
-		Float4 fu, fv, fw;
-		Int4 filter = computeFilterOffset(lod);
-		address(u, x0, x1, fu, mipmap, offset.x, filter, OFFSET(Mipmap, width), state.addressingModeU, function);
-		address(v, y0, y1, fv, mipmap, offset.y, filter, OFFSET(Mipmap, height), state.addressingModeV, function);
-		address(w, z0, z1, fw, mipmap, offset.z, filter, OFFSET(Mipmap, depth), state.addressingModeW, function);
-
-		Int4 cubeArrayId(0);
-		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
-		{
-			address(cubeArrayCoord, cubeArrayId, cubeArrayId, fw, mipmap, offset.w, filter, OFFSET(Mipmap, depth), state.addressingModeY, function);
-		}
-
-		Int4 pitchP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, pitchP), 16);
-		Int4 sliceP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, sliceP), 16);
-		y0 *= pitchP;
-		z0 *= sliceP;
-
-		if(state.textureFilter == FILTER_POINT || (function == Fetch))
-		{
-			c = sampleTexel(x0, y0, z0, w, mipmap, cubeArrayId, sampleId, buffer, function);
-		}
-		else
-		{
-			y1 *= pitchP;
-			z1 *= sliceP;
-
-			Vector4f c000 = sampleTexel(x0, y0, z0, w, mipmap, cubeArrayId, sampleId, buffer, function);
-			Vector4f c100 = sampleTexel(x1, y0, z0, w, mipmap, cubeArrayId, sampleId, buffer, function);
-			Vector4f c010 = sampleTexel(x0, y1, z0, w, mipmap, cubeArrayId, sampleId, buffer, function);
-			Vector4f c110 = sampleTexel(x1, y1, z0, w, mipmap, cubeArrayId, sampleId, buffer, function);
-			Vector4f c001 = sampleTexel(x0, y0, z1, w, mipmap, cubeArrayId, sampleId, buffer, function);
-			Vector4f c101 = sampleTexel(x1, y0, z1, w, mipmap, cubeArrayId, sampleId, buffer, function);
-			Vector4f c011 = sampleTexel(x0, y1, z1, w, mipmap, cubeArrayId, sampleId, buffer, function);
-			Vector4f c111 = sampleTexel(x1, y1, z1, w, mipmap, cubeArrayId, sampleId, buffer, function);
-
-			// Blend first slice
-			if(componentCount >= 1) c000.x = c000.x + fu * (c100.x - c000.x);
-			if(componentCount >= 2) c000.y = c000.y + fu * (c100.y - c000.y);
-			if(componentCount >= 3) c000.z = c000.z + fu * (c100.z - c000.z);
-			if(componentCount >= 4) c000.w = c000.w + fu * (c100.w - c000.w);
-
-			if(componentCount >= 1) c010.x = c010.x + fu * (c110.x - c010.x);
-			if(componentCount >= 2) c010.y = c010.y + fu * (c110.y - c010.y);
-			if(componentCount >= 3) c010.z = c010.z + fu * (c110.z - c010.z);
-			if(componentCount >= 4) c010.w = c010.w + fu * (c110.w - c010.w);
-
-			if(componentCount >= 1) c000.x = c000.x + fv * (c010.x - c000.x);
-			if(componentCount >= 2) c000.y = c000.y + fv * (c010.y - c000.y);
-			if(componentCount >= 3) c000.z = c000.z + fv * (c010.z - c000.z);
-			if(componentCount >= 4) c000.w = c000.w + fv * (c010.w - c000.w);
-
-			// Blend second slice
-			if(componentCount >= 1) c001.x = c001.x + fu * (c101.x - c001.x);
-			if(componentCount >= 2) c001.y = c001.y + fu * (c101.y - c001.y);
-			if(componentCount >= 3) c001.z = c001.z + fu * (c101.z - c001.z);
-			if(componentCount >= 4) c001.w = c001.w + fu * (c101.w - c001.w);
-
-			if(componentCount >= 1) c011.x = c011.x + fu * (c111.x - c011.x);
-			if(componentCount >= 2) c011.y = c011.y + fu * (c111.y - c011.y);
-			if(componentCount >= 3) c011.z = c011.z + fu * (c111.z - c011.z);
-			if(componentCount >= 4) c011.w = c011.w + fu * (c111.w - c011.w);
-
-			if(componentCount >= 1) c001.x = c001.x + fv * (c011.x - c001.x);
-			if(componentCount >= 2) c001.y = c001.y + fv * (c011.y - c001.y);
-			if(componentCount >= 3) c001.z = c001.z + fv * (c011.z - c001.z);
-			if(componentCount >= 4) c001.w = c001.w + fv * (c011.w - c001.w);
-
-			// Blend slices
-			if(componentCount >= 1) c.x = c000.x + fw * (c001.x - c000.x);
-			if(componentCount >= 2) c.y = c000.y + fw * (c001.y - c000.y);
-			if(componentCount >= 3) c.z = c000.z + fw * (c001.z - c000.z);
-			if(componentCount >= 4) c.w = c000.w + fw * (c001.w - c000.w);
-		}
-
-		return c;
-	}
-
-	Float SamplerCore::log2sqrt(Float lod)
-	{
-		// log2(sqrt(lod))                               // Equals 0.25 * log2(lod^2).
-		lod *= lod;                                      // Squaring doubles the exponent and produces an extra bit of precision.
-		lod = Float(As<Int>(lod)) - Float(0x3F800000);   // Interpret as integer and subtract the exponent bias.
-		lod *= As<Float>(Int(0x33000000));               // Scale by 0.25 * 2^-23 (mantissa length).
-
-		return lod;
-	}
-
-	Float SamplerCore::log2(Float lod)
-	{
-		lod *= lod;                                      // Squaring doubles the exponent and produces an extra bit of precision.
-		lod = Float(As<Int>(lod)) - Float(0x3F800000);   // Interpret as integer and subtract the exponent bias.
-		lod *= As<Float>(Int(0x33800000));               // Scale by 0.5 * 2^-23 (mantissa length).
-
-		return lod;
-	}
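log2sqrt() and log2() rely on the same IEEE-754 bit trick: reinterpreting a positive float's bits as an integer yields approximately (log2(x) + 127) * 2^23, so subtracting the bias term 0x3F800000 and scaling by 2^-23 approximates log2(x). Squaring the input first doubles the exponent, gaining a bit of precision, which the 0.25/0.5 scale factors undo. A standalone sketch with plain float and int32_t instead of Reactor types, assuming x > 0:

#include <cstdint>
#include <cstring>

static float approxLog2(float x)
{
	x *= x;  // Squaring doubles the exponent and produces an extra bit of precision.
	int32_t bits;
	std::memcpy(&bits, &x, sizeof(bits));  // Interpret as integer.
	// Subtract the exponent bias and scale by 0.5 * 2^-23 (mantissa length).
	return (float(bits) - float(0x3F800000)) * (0.5f / float(1 << 23));
}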
-
-	void SamplerCore::computeLod(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, Float4 &dsx, Float4 &dsy, SamplerFunction function)
-	{
-		Float4 duvdxy;
-
-		if(function != Grad)   // Implicit
-		{
-			duvdxy = Float4(uuuu.yz, vvvv.yz) - Float4(uuuu.xx, vvvv.xx);
-		}
-		else
-		{
-			Float4 dudxy = Float4(dsx.xx, dsy.xx);
-			Float4 dvdxy = Float4(dsx.yy, dsy.yy);
-
-			duvdxy = Float4(dudxy.xz, dvdxy.xz);
-		}
-
-		// Scale by texture dimensions.
-		Float4 dUVdxy = duvdxy * *Pointer<Float4>(texture + OFFSET(Texture, widthWidthHeightHeight));
-
-		Float4 dUV2dxy = dUVdxy * dUVdxy;
-		Float4 dUV2 = dUV2dxy.xy + dUV2dxy.zw;
-
-		lod = Max(Float(dUV2.x), Float(dUV2.y));   // Square length of major axis
-
-		if(state.textureFilter == FILTER_ANISOTROPIC)
-		{
-			Float det = Abs(Float(dUVdxy.x) * Float(dUVdxy.w) - Float(dUVdxy.y) * Float(dUVdxy.z));
-
-			Float4 dudx = duvdxy.xxxx;
-			Float4 dudy = duvdxy.yyyy;
-			Float4 dvdx = duvdxy.zzzz;
-			Float4 dvdy = duvdxy.wwww;
-
-			Int4 mask = As<Int4>(CmpNLT(dUV2.x, dUV2.y));
-			uDelta = As<Float4>((As<Int4>(dudx) & mask) | ((As<Int4>(dudy) & ~mask)));
-			vDelta = As<Float4>((As<Int4>(dvdx) & mask) | ((As<Int4>(dvdy) & ~mask)));
-
-			anisotropy = lod * Rcp_pp(det);
-			anisotropy = Min(anisotropy, *Pointer<Float>(sampler + OFFSET(vk::Sampler,maxAnisotropy)));
-
-			lod *= Rcp_pp(anisotropy * anisotropy);
-		}
-
-		lod = log2sqrt(lod);   // log2(sqrt(lod))
-	}
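In the anisotropic path above, lod holds the squared length of the major axis and det the area spanned by the two gradient vectors, so lod * Rcp_pp(det) estimates the footprint's aspect ratio; dividing the squared major length by the squared (clamped) ratio leaves roughly the squared minor-axis length, which log2sqrt() then turns into the mip level. A scalar sketch of that arithmetic, with plain floats and exact division in place of Rcp_pp:

#include <algorithm>

// Returns the squared minor-axis length used for mip selection, and the
// clamped anisotropy ratio. Assumes area > 0.
static float anisotropicLod(float majorSq, float area, float maxAnisotropy,
                            float &anisotropy)
{
	anisotropy = std::min(majorSq / area, maxAnisotropy);
	return majorSq / (anisotropy * anisotropy);
}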
-
-	void SamplerCore::computeLodCube(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float4 &dsx, Float4 &dsy, Float4 &M, SamplerFunction function)
-	{
-		Float4 dudxy, dvdxy, dsdxy;
-
-		if(function != Grad)  // Implicit
-		{
-			Float4 U = u * M;
-			Float4 V = v * M;
-			Float4 W = w * M;
-
-			dudxy = Abs(U - U.xxxx);
-			dvdxy = Abs(V - V.xxxx);
-			dsdxy = Abs(W - W.xxxx);
-		}
-		else
-		{
-			dudxy = Float4(dsx.xx, dsy.xx);
-			dvdxy = Float4(dsx.yy, dsy.yy);
-			dsdxy = Float4(dsx.zz, dsy.zz);
-
-			dudxy = Abs(dudxy * Float4(M.x));
-			dvdxy = Abs(dvdxy * Float4(M.x));
-			dsdxy = Abs(dsdxy * Float4(M.x));
-		}
-
-		// Compute the largest Manhattan distance in two dimensions.
-		// This takes the footprint across adjacent faces into account.
-		Float4 duvdxy = dudxy + dvdxy;
-		Float4 dusdxy = dudxy + dsdxy;
-		Float4 dvsdxy = dvdxy + dsdxy;
-
-		dudxy = Max(Max(duvdxy, dusdxy), dvsdxy);
-
-		lod = Max(Float(dudxy.y), Float(dudxy.z));   // FIXME: Max(dudxy.y, dudxy.z);
-
-		// Scale by texture dimension.
-		lod *= *Pointer<Float>(texture + OFFSET(Texture,width));
-
-		lod = log2(lod);
-	}
-
-	void SamplerCore::computeLod3D(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float4 &uuuu, Float4 &vvvv, Float4 &wwww, Float4 &dsx, Float4 &dsy, SamplerFunction function)
-	{
-		Float4 dudxy, dvdxy, dsdxy;
-
-		if(function != Grad)   // Implicit
-		{
-			dudxy = uuuu - uuuu.xxxx;
-			dvdxy = vvvv - vvvv.xxxx;
-			dsdxy = wwww - wwww.xxxx;
-		}
-		else
-		{
-			dudxy = Float4(dsx.xx, dsy.xx);
-			dvdxy = Float4(dsx.yy, dsy.yy);
-			dsdxy = Float4(dsx.zz, dsy.zz);
-		}
-
-		// Scale by texture dimensions.
-		dudxy *= *Pointer<Float4>(texture + OFFSET(Texture, width));
-		dvdxy *= *Pointer<Float4>(texture + OFFSET(Texture, height));
-		dsdxy *= *Pointer<Float4>(texture + OFFSET(Texture, depth));
-
-		dudxy *= dudxy;
-		dvdxy *= dvdxy;
-		dsdxy *= dsdxy;
-
-		dudxy += dvdxy;
-		dudxy += dsdxy;
-
-		lod = Max(Float(dudxy.y), Float(dudxy.z));   // FIXME: Max(dudxy.y, dudxy.z);
-
-		lod = log2sqrt(lod);   // log2(sqrt(lod))
-	}
-
-	Int4 SamplerCore::cubeFace(Float4 &U, Float4 &V, Float4 &x, Float4 &y, Float4 &z, Float4 &M)
-	{
-		// TODO: Comply with Vulkan recommendation:
-		// Vulkan 1.1: "The rules should have as the first rule that rz wins over ry and rx, and the second rule that ry wins over rx."
-
-		Int4 xn = CmpLT(x, Float4(0.0f));   // x < 0
-		Int4 yn = CmpLT(y, Float4(0.0f));   // y < 0
-		Int4 zn = CmpLT(z, Float4(0.0f));   // z < 0
-
-		Float4 absX = Abs(x);
-		Float4 absY = Abs(y);
-		Float4 absZ = Abs(z);
-
-		Int4 xy = CmpNLE(absX, absY);   // abs(x) > abs(y)
-		Int4 yz = CmpNLE(absY, absZ);   // abs(y) > abs(z)
-		Int4 zx = CmpNLE(absZ, absX);   // abs(z) > abs(x)
-		Int4 xMajor = xy & ~zx;   // abs(x) > abs(y) && abs(x) > abs(z)
-		Int4 yMajor = yz & ~xy;   // abs(y) > abs(z) && abs(y) > abs(x)
-		Int4 zMajor = zx & ~yz;   // abs(z) > abs(x) && abs(z) > abs(y)
-
-		// FACE_POSITIVE_X = 000b
-		// FACE_NEGATIVE_X = 001b
-		// FACE_POSITIVE_Y = 010b
-		// FACE_NEGATIVE_Y = 011b
-		// FACE_POSITIVE_Z = 100b
-		// FACE_NEGATIVE_Z = 101b
-
-		Int yAxis = SignMask(yMajor);
-		Int zAxis = SignMask(zMajor);
-
-		Int4 n = ((xn & xMajor) | (yn & yMajor) | (zn & zMajor)) & Int4(0x80000000);
-		Int negative = SignMask(n);
-
-		Int faces = *Pointer<Int>(constants + OFFSET(Constants,transposeBit0) + negative * 4);
-		faces |= *Pointer<Int>(constants + OFFSET(Constants,transposeBit1) + yAxis * 4);
-		faces |= *Pointer<Int>(constants + OFFSET(Constants,transposeBit2) + zAxis * 4);
-
-		Int4 face;
-		face.x = faces & 0x7;
-		face.y = (faces >> 4)  & 0x7;
-		face.z = (faces >> 8)  & 0x7;
-		face.w = (faces >> 12) & 0x7;
-
-		M = Max(Max(absX, absY), Max(absZ, Float4(std::numeric_limits<float>::min())));
-
-		// U = xMajor ? (neg ^ -z) : ((zMajor & neg) ^ x)
-		U = As<Float4>((xMajor & (n ^ As<Int4>(-z))) | (~xMajor & ((zMajor & n) ^ As<Int4>(x))));
-
-		// V = !yMajor ? -y : (n ^ z)
-		V = As<Float4>((~yMajor & As<Int4>(-y)) | (yMajor & (n ^ As<Int4>(z))));
-
-		M = reciprocal(M) * Float4(0.5f);
-		U = U * M + Float4(0.5f);
-		V = V * M + Float4(0.5f);
-
-		return face;
-	}
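A scalar sketch of the same face selection and projection, using plain floats and branches instead of the sign masks and Constants::transposeBit* lookup tables (tie-breaking between equal-magnitude axes may differ slightly from the vector code, as the TODO above notes):

#include <cmath>

static int cubeFaceScalar(float x, float y, float z, float &u, float &v)
{
	float ax = std::abs(x), ay = std::abs(y), az = std::abs(z);
	float sc, tc, ma;
	int face;

	if(ax >= ay && ax >= az)  // X major
	{
		face = (x < 0) ? 1 : 0;            // FACE_POSITIVE_X / FACE_NEGATIVE_X
		sc = (x < 0) ? z : -z;             // U = xMajor ? (n ^ -z) : ...
		tc = -y;
		ma = ax;
	}
	else if(ay >= az)  // Y major
	{
		face = (y < 0) ? 3 : 2;
		sc = x;
		tc = (y < 0) ? -z : z;             // V = yMajor ? (n ^ z) : -y
		ma = ay;
	}
	else  // Z major
	{
		face = (z < 0) ? 5 : 4;
		sc = (z < 0) ? -x : x;
		tc = -y;
		ma = az;
	}

	u = 0.5f * sc / ma + 0.5f;
	v = 0.5f * tc / ma + 0.5f;

	return face;
}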
-
-	Short4 SamplerCore::applyOffset(Short4 &uvw, Float4 &offset, const Int4 &whd, AddressingMode mode)
-	{
-		Int4 tmp = Int4(As<UShort4>(uvw));
-		tmp = tmp + As<Int4>(offset);
-
-		switch(mode)
-		{
-		case AddressingMode::ADDRESSING_WRAP:
-			tmp = (tmp + whd * Int4(-MIN_TEXEL_OFFSET)) % whd;
-			break;
-		case AddressingMode::ADDRESSING_CLAMP:
-		case AddressingMode::ADDRESSING_MIRROR:
-		case AddressingMode::ADDRESSING_MIRRORONCE:
-		case AddressingMode::ADDRESSING_BORDER: // FIXME: Implement and test ADDRESSING_MIRROR, ADDRESSING_MIRRORONCE, ADDRESSING_BORDER
-			tmp = Min(Max(tmp, Int4(0)), whd - Int4(1));
-			break;
-		case ADDRESSING_TEXELFETCH:
-			break;
-		case AddressingMode::ADDRESSING_SEAMLESS:
-			ASSERT(false);   // Cube sampling doesn't support offset.
-		default:
-			ASSERT(false);
-		}
-
-		return As<Short4>(UShort4(tmp));
-	}
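The ADDRESSING_WRAP case biases by whd * -MIN_TEXEL_OFFSET before the modulo so the left operand stays non-negative and C++'s truncating % behaves as a true wrap. A scalar sketch, assuming minTexelOffset is the (negative) lower bound on texel offsets:

// Wrap a texel coordinate plus offset into [0, size), avoiding a negative
// left operand for the truncating % operator.
static int wrapCoord(int coord, int offset, int size, int minTexelOffset)
{
	return (coord + offset + size * -minTexelOffset) % size;
}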
-
-	void SamplerCore::computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, Vector4f &offset, const Pointer<Byte> &mipmap, const Short4& cubeArrayId, const Int4& sampleId, SamplerFunction function)
-	{
-		bool texelFetch = (function == Fetch);
-		bool hasOffset = (function.offset != 0);
-
-		if(!texelFetch)
-		{
-			uuuu = MulHigh(As<UShort4>(uuuu), UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, width))));
-			vvvv = MulHigh(As<UShort4>(vvvv), UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, height))));
-		}
-
-		if(hasOffset)
-		{
-			uuuu = applyOffset(uuuu, offset.x, *Pointer<Int4>(mipmap + OFFSET(Mipmap, width)),
-			                   texelFetch ? ADDRESSING_TEXELFETCH : state.addressingModeU);
-			vvvv = applyOffset(vvvv, offset.y, *Pointer<Int4>(mipmap + OFFSET(Mipmap, height)),
-			                   texelFetch ? ADDRESSING_TEXELFETCH : state.addressingModeV);
-		}
-
-		Short4 uuu2 = uuuu;
-		uuuu = As<Short4>(UnpackLow(uuuu, vvvv));
-		uuu2 = As<Short4>(UnpackHigh(uuu2, vvvv));
-		uuuu = As<Short4>(MulAdd(uuuu, *Pointer<Short4>(mipmap + OFFSET(Mipmap,onePitchP))));
-		uuu2 = As<Short4>(MulAdd(uuu2, *Pointer<Short4>(mipmap + OFFSET(Mipmap,onePitchP))));
-
+		Int size = *Pointer<Int>(mipmap + OFFSET(Mipmap, sliceP));
 		if(hasThirdCoordinate())
 		{
-			if(state.textureType == VK_IMAGE_VIEW_TYPE_3D)
-			{
-				if(!texelFetch)
-				{
-					wwww = MulHigh(As<UShort4>(wwww), UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, depth))));
-				}
-
-				if(hasOffset)
-				{
-					wwww = applyOffset(wwww, offset.z, *Pointer<Int4>(mipmap + OFFSET(Mipmap, depth)),
-					                   texelFetch ? ADDRESSING_TEXELFETCH : state.addressingModeW);
-				}
-			}
-
-			UInt4 uv(As<UInt2>(uuuu), As<UInt2>(uuu2));
-			uv += As<UInt4>(Int4(As<UShort4>(wwww))) * *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP));
-
-			index[0] = Extract(As<Int4>(uv), 0);
-			index[1] = Extract(As<Int4>(uv), 1);
-			index[2] = Extract(As<Int4>(uv), 2);
-			index[3] = Extract(As<Int4>(uv), 3);
+			size *= *Pointer<Int>(mipmap + OFFSET(Mipmap, depth));
 		}
-		else
-		{
-			index[0] = Extract(As<Int2>(uuuu), 0);
-			index[1] = Extract(As<Int2>(uuuu), 1);
-			index[2] = Extract(As<Int2>(uuu2), 0);
-			index[3] = Extract(As<Int2>(uuu2), 1);
-		}
-
-		if(texelFetch)
-		{
-			Int size = *Pointer<Int>(mipmap + OFFSET(Mipmap, sliceP));
-			if(hasThirdCoordinate())
-			{
-				size *= *Pointer<Int>(mipmap + OFFSET(Mipmap, depth));
-			}
-			UInt min = 0;
-			UInt max = size - 1;
-
-			for(int i = 0; i < 4; i++)
-			{
-				index[i] = Min(Max(index[i], min), max);
-			}
-		}
-
-		if(function.sample)
-		{
-			UInt4 sampleOffset = Min(As<UInt4>(sampleId), *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sampleMax), 16)) *
-			                     *Pointer<UInt4>(mipmap + OFFSET(Mipmap, samplePitchP), 16);
-			for(int i = 0; i < 4; i++)
-			{
-				index[i] += Extract(sampleOffset, i);
-			}
-		}
-
-		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
-		{
-			UInt4 cubeLayerOffset = As<UInt4>(cubeArrayId) * *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP)) * UInt4(6);
-			for(int i = 0; i < 4; i++)
-			{
-				index[i] += Extract(cubeLayerOffset, i);
-			}
-		}
-	}
-
-	void SamplerCore::computeIndices(UInt index[4], Int4 uuuu, Int4 vvvv, Int4 wwww, Int4 valid, const Pointer<Byte> &mipmap, const Int4& cubeArrayId, const Int4& sampleId, SamplerFunction function)
-	{
-		UInt4 indices = uuuu + vvvv;
-
-		if(state.addressingModeW != ADDRESSING_UNUSED)
-		{
-			indices += As<UInt4>(wwww);
-		}
-
-		if(borderModeActive())
-		{
-			// Texels out of range are still sampled before being replaced
-			// with the border color, so sample them at linear index 0.
-			indices &= As<UInt4>(valid);
-		}
-
-		if(function.sample)
-		{
-			indices += Min(As<UInt4>(sampleId), *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sampleMax), 16)) *
-			           *Pointer<UInt4>(mipmap + OFFSET(Mipmap, samplePitchP), 16);
-		}
-
-		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
-		{
-			indices += As<UInt4>(cubeArrayId) * *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP)) * UInt4(6);
-		}
+		UInt min = 0;
+		UInt max = size - 1;
 
 		for(int i = 0; i < 4; i++)
 		{
-			index[i] = Extract(As<Int4>(indices), i);
+			index[i] = Min(Max(index[i], min), max);
 		}
 	}
 
-	Vector4s SamplerCore::sampleTexel(UInt index[4], Pointer<Byte> buffer)
+	if(function.sample)
 	{
-		Vector4s c;
-
-		if(has16bitTextureFormat())
+		UInt4 sampleOffset = Min(As<UInt4>(sampleId), *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sampleMax), 16)) *
+		                     *Pointer<UInt4>(mipmap + OFFSET(Mipmap, samplePitchP), 16);
+		for(int i = 0; i < 4; i++)
 		{
+			index[i] += Extract(sampleOffset, i);
+		}
+	}
+
+	if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
+	{
+		UInt4 cubeLayerOffset = As<UInt4>(cubeArrayId) * *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP)) * UInt4(6);
+		for(int i = 0; i < 4; i++)
+		{
+			index[i] += Extract(cubeLayerOffset, i);
+		}
+	}
+}
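In computeIndices() above, non-fetch coordinates arrive as unsigned 0.16 fixed point, so MulHigh(u, width) is floor(u * width), and MulAdd with onePitchP then forms x + y * pitch per lane. A scalar sketch with plain integers (names are illustrative):

#include <cstdint>

// Linear texel index from 0.16 fixed-point coordinates; pitchP is the row
// pitch in texels, matching Mipmap::pitchP.
static uint32_t texelIndex(uint16_t u16, uint16_t v16,
                           uint32_t width, uint32_t height, uint32_t pitchP)
{
	uint32_t x = (uint32_t(u16) * width) >> 16;   // MulHigh equivalent
	uint32_t y = (uint32_t(v16) * height) >> 16;
	return x + y * pitchP;
}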
+
+void SamplerCore::computeIndices(UInt index[4], Int4 uuuu, Int4 vvvv, Int4 wwww, Int4 valid, const Pointer<Byte> &mipmap, const Int4& cubeArrayId, const Int4& sampleId, SamplerFunction function)
+{
+	UInt4 indices = uuuu + vvvv;
+
+	if(state.addressingModeW != ADDRESSING_UNUSED)
+	{
+		indices += As<UInt4>(wwww);
+	}
+
+	if(borderModeActive())
+	{
+		// Texels out of range are still sampled before being replaced
+		// with the border color, so sample them at linear index 0.
+		indices &= As<UInt4>(valid);
+	}
+
+	if(function.sample)
+	{
+		indices += Min(As<UInt4>(sampleId), *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sampleMax), 16)) *
+		           *Pointer<UInt4>(mipmap + OFFSET(Mipmap, samplePitchP), 16);
+	}
+
+	if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
+	{
+		indices += As<UInt4>(cubeArrayId) * *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP)) * UInt4(6);
+	}
+
+	for(int i = 0; i < 4; i++)
+	{
+		index[i] = Extract(As<Int4>(indices), i);
+	}
+}
+
+Vector4s SamplerCore::sampleTexel(UInt index[4], Pointer<Byte> buffer)
+{
+	Vector4s c;
+
+	if(has16bitTextureFormat())
+	{
+		c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
+		c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
+		c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
+		c.x = Insert(c.x, Pointer<Short>(buffer)[index[3]], 3);
+
+		switch(state.textureFormat)
+		{
+		case VK_FORMAT_R5G6B5_UNORM_PACK16:
+			c.z = (c.x & Short4(0x001Fu)) << 11;
+			c.y = (c.x & Short4(0x07E0u)) << 5;
+			c.x = (c.x & Short4(0xF800u));
+			break;
+		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+			c.w = (c.x << 12) & Short4(0xF000u);
+			c.z = (c.x) & Short4(0xF000u);
+			c.y = (c.x << 4) & Short4(0xF000u);
+			c.x = (c.x << 8) & Short4(0xF000u);
+			break;
+		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+			c.w = (c.x) & Short4(0x8000u);
+			c.z = (c.x << 11) & Short4(0xF800u);
+			c.y = (c.x << 6) & Short4(0xF800u);
+			c.x = (c.x << 1) & Short4(0xF800u);
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+	else if(has8bitTextureComponents())
+	{
+		switch(textureComponentCount())
+		{
+		case 4:
+			{
+				Byte4 c0 = Pointer<Byte4>(buffer)[index[0]];
+				Byte4 c1 = Pointer<Byte4>(buffer)[index[1]];
+				Byte4 c2 = Pointer<Byte4>(buffer)[index[2]];
+				Byte4 c3 = Pointer<Byte4>(buffer)[index[3]];
+				c.x = Unpack(c0, c1);
+				c.y = Unpack(c2, c3);
+
+				switch(state.textureFormat)
+				{
+				case VK_FORMAT_B8G8R8A8_UNORM:
+				case VK_FORMAT_B8G8R8A8_SRGB:
+					c.z = As<Short4>(UnpackLow(c.x, c.y));
+					c.x = As<Short4>(UnpackHigh(c.x, c.y));
+					c.y = c.z;
+					c.w = c.x;
+					c.z = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.z));
+					c.y = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.y));
+					c.x = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.x));
+					c.w = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.w));
+					break;
+				case VK_FORMAT_R8G8B8A8_UNORM:
+				case VK_FORMAT_R8G8B8A8_SINT:
+				case VK_FORMAT_R8G8B8A8_SNORM:
+				case VK_FORMAT_R8G8B8A8_SRGB:
+					c.z = As<Short4>(UnpackHigh(c.x, c.y));
+					c.x = As<Short4>(UnpackLow(c.x, c.y));
+					c.y = c.x;
+					c.w = c.z;
+					c.x = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.x));
+					c.y = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.y));
+					c.z = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.z));
+					c.w = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.w));
+					// Propagate sign bit
+					if(state.textureFormat == VK_FORMAT_R8G8B8A8_SINT)
+					{
+						c.x >>= 8;
+						c.y >>= 8;
+						c.z >>= 8;
+						c.w >>= 8;
+					}
+					break;
+				case VK_FORMAT_R8G8B8A8_UINT:
+					c.z = As<Short4>(UnpackHigh(c.x, c.y));
+					c.x = As<Short4>(UnpackLow(c.x, c.y));
+					c.y = c.x;
+					c.w = c.z;
+					c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(Short4(0)));
+					c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(Short4(0)));
+					c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(Short4(0)));
+					c.w = UnpackHigh(As<Byte8>(c.w), As<Byte8>(Short4(0)));
+					break;
+				default:
+					ASSERT(false);
+				}
+			}
+			break;
+		case 2:
 			c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
 			c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
 			c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
@@ -1425,1070 +1517,980 @@
 
 			switch(state.textureFormat)
 			{
-			case VK_FORMAT_R5G6B5_UNORM_PACK16:
-				c.z = (c.x & Short4(0x001Fu)) << 11;
-				c.y = (c.x & Short4(0x07E0u)) << 5;
-				c.x = (c.x & Short4(0xF800u));
+			case VK_FORMAT_R8G8_UNORM:
+			case VK_FORMAT_R8G8_SNORM:
+			case VK_FORMAT_R8G8_SRGB:
+				c.y = (c.x & Short4(0xFF00u));
+				c.x = (c.x << 8);
 				break;
-			case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
-				c.w = (c.x << 12) & Short4(0xF000u);
-				c.z = (c.x) & Short4(0xF000u);
-				c.y = (c.x << 4) & Short4(0xF000u);
-				c.x = (c.x << 8) & Short4(0xF000u);
+			case VK_FORMAT_R8G8_SINT:
+				c.y = c.x >> 8;
+				c.x = (c.x << 8) >> 8; // Propagate sign bit
 				break;
-			case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
-				c.w = (c.x) & Short4(0x8000u);
-				c.z = (c.x << 11) & Short4(0xF800u);
-				c.y = (c.x << 6) & Short4(0xF800u);
-				c.x = (c.x << 1) & Short4(0xF800u);
+			case VK_FORMAT_R8G8_UINT:
+				c.y = As<Short4>(As<UShort4>(c.x) >> 8);
+				c.x &= Short4(0x00FFu);
 				break;
 			default:
 				ASSERT(false);
 			}
-		}
-		else if(has8bitTextureComponents())
-		{
-			switch(textureComponentCount())
+			break;
+		case 1:
 			{
-			case 4:
-				{
-					Byte4 c0 = Pointer<Byte4>(buffer)[index[0]];
-					Byte4 c1 = Pointer<Byte4>(buffer)[index[1]];
-					Byte4 c2 = Pointer<Byte4>(buffer)[index[2]];
-					Byte4 c3 = Pointer<Byte4>(buffer)[index[3]];
-					c.x = Unpack(c0, c1);
-					c.y = Unpack(c2, c3);
-
-					switch(state.textureFormat)
-					{
-					case VK_FORMAT_B8G8R8A8_UNORM:
-					case VK_FORMAT_B8G8R8A8_SRGB:
-						c.z = As<Short4>(UnpackLow(c.x, c.y));
-						c.x = As<Short4>(UnpackHigh(c.x, c.y));
-						c.y = c.z;
-						c.w = c.x;
-						c.z = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.z));
-						c.y = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.y));
-						c.x = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.x));
-						c.w = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.w));
-						break;
-					case VK_FORMAT_R8G8B8A8_UNORM:
-					case VK_FORMAT_R8G8B8A8_SINT:
-					case VK_FORMAT_R8G8B8A8_SNORM:
-					case VK_FORMAT_R8G8B8A8_SRGB:
-						c.z = As<Short4>(UnpackHigh(c.x, c.y));
-						c.x = As<Short4>(UnpackLow(c.x, c.y));
-						c.y = c.x;
-						c.w = c.z;
-						c.x = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.x));
-						c.y = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.y));
-						c.z = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.z));
-						c.w = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.w));
-						// Propagate sign bit
-						if(state.textureFormat == VK_FORMAT_R8G8B8A8_SINT)
-						{
-							c.x >>= 8;
-							c.y >>= 8;
-							c.z >>= 8;
-							c.w >>= 8;
-						}
-						break;
-					case VK_FORMAT_R8G8B8A8_UINT:
-						c.z = As<Short4>(UnpackHigh(c.x, c.y));
-						c.x = As<Short4>(UnpackLow(c.x, c.y));
-						c.y = c.x;
-						c.w = c.z;
-						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(Short4(0)));
-						c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(Short4(0)));
-						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(Short4(0)));
-						c.w = UnpackHigh(As<Byte8>(c.w), As<Byte8>(Short4(0)));
-						break;
-					default:
-						ASSERT(false);
-					}
-				}
-				break;
-			case 2:
-				c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
-				c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
-				c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
-				c.x = Insert(c.x, Pointer<Short>(buffer)[index[3]], 3);
+				Int c0 = Int(*Pointer<Byte>(buffer + index[0]));
+				Int c1 = Int(*Pointer<Byte>(buffer + index[1]));
+				Int c2 = Int(*Pointer<Byte>(buffer + index[2]));
+				Int c3 = Int(*Pointer<Byte>(buffer + index[3]));
+				c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
 
 				switch(state.textureFormat)
 				{
-				case VK_FORMAT_R8G8_UNORM:
-				case VK_FORMAT_R8G8_SNORM:
-				case VK_FORMAT_R8G8_SRGB:
-					c.y = (c.x & Short4(0xFF00u));
-					c.x = (c.x << 8);
+				case VK_FORMAT_R8_SINT:
+				case VK_FORMAT_R8_UINT:
+				case VK_FORMAT_S8_UINT:
+					{
+						Int zero(0);
+						c.x = Unpack(As<Byte4>(c0), As<Byte4>(zero));
+						// Propagate sign bit
+						if(state.textureFormat == VK_FORMAT_R8_SINT)
+						{
+							c.x = (c.x << 8) >> 8;
+						}
+					}
 					break;
-				case VK_FORMAT_R8G8_SINT:
-					c.y = c.x >> 8;
-					c.x = (c.x << 8) >> 8; // Propagate sign bit
-					break;
-				case VK_FORMAT_R8G8_UINT:
-					c.y = As<Short4>(As<UShort4>(c.x) >> 8);
-					c.x &= Short4(0x00FFu);
+				case VK_FORMAT_R8_SNORM:
+				case VK_FORMAT_R8_UNORM:
+				case VK_FORMAT_R8_SRGB:
+					// TODO: avoid populating the low bits at all.
+					c.x = Unpack(As<Byte4>(c0));
+					c.x &= Short4(0xFF00u);
 					break;
 				default:
-					ASSERT(false);
-				}
-				break;
-			case 1:
-				{
-					Int c0 = Int(*Pointer<Byte>(buffer + index[0]));
-					Int c1 = Int(*Pointer<Byte>(buffer + index[1]));
-					Int c2 = Int(*Pointer<Byte>(buffer + index[2]));
-					Int c3 = Int(*Pointer<Byte>(buffer + index[3]));
-					c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
-
-					switch(state.textureFormat)
-					{
-					case VK_FORMAT_R8_SINT:
-					case VK_FORMAT_R8_UINT:
-					case VK_FORMAT_S8_UINT:
-						{
-							Int zero(0);
-							c.x = Unpack(As<Byte4>(c0), As<Byte4>(zero));
-							// Propagate sign bit
-							if(state.textureFormat == VK_FORMAT_R8_SINT)
-							{
-								c.x = (c.x << 8) >> 8;
-							}
-						}
-						break;
-					case VK_FORMAT_R8_SNORM:
-					case VK_FORMAT_R8_UNORM:
-					case VK_FORMAT_R8_SRGB:
-						// TODO: avoid populating the low bits at all.
-						c.x = Unpack(As<Byte4>(c0));
-						c.x &= Short4(0xFF00u);
-						break;
-					default:
-						c.x = Unpack(As<Byte4>(c0));
-						break;
-					}
-				}
-				break;
-			default:
-				ASSERT(false);
-			}
-		}
-		else if(has16bitTextureComponents())
-		{
-			switch(textureComponentCount())
-			{
-			case 4:
-				c.x = Pointer<Short4>(buffer)[index[0]];
-				c.y = Pointer<Short4>(buffer)[index[1]];
-				c.z = Pointer<Short4>(buffer)[index[2]];
-				c.w = Pointer<Short4>(buffer)[index[3]];
-				transpose4x4(c.x, c.y, c.z, c.w);
-				break;
-			case 3:
-				c.x = Pointer<Short4>(buffer)[index[0]];
-				c.y = Pointer<Short4>(buffer)[index[1]];
-				c.z = Pointer<Short4>(buffer)[index[2]];
-				c.w = Pointer<Short4>(buffer)[index[3]];
-				transpose4x3(c.x, c.y, c.z, c.w);
-				break;
-			case 2:
-				c.x = *Pointer<Short4>(buffer + 4 * index[0]);
-				c.x = As<Short4>(UnpackLow(c.x, *Pointer<Short4>(buffer + 4 * index[1])));
-				c.z = *Pointer<Short4>(buffer + 4 * index[2]);
-				c.z = As<Short4>(UnpackLow(c.z, *Pointer<Short4>(buffer + 4 * index[3])));
-				c.y = c.x;
-				c.x = UnpackLow(As<Int2>(c.x), As<Int2>(c.z));
-				c.y = UnpackHigh(As<Int2>(c.y), As<Int2>(c.z));
-				break;
-			case 1:
-				c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
-				c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
-				c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
-				c.x = Insert(c.x, Pointer<Short>(buffer)[index[3]], 3);
-				break;
-			default:
-				ASSERT(false);
-			}
-		}
-		else if(state.textureFormat == VK_FORMAT_A2B10G10R10_UNORM_PACK32)
-		{
-			Int4 cc;
-			cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
-			cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
-			cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
-			cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);
-
-			// Shift each 10-bit field left 6, and replicate the 6 high bits into the bottom 6.
-			c.x = Short4(((cc << 6) & Int4(0xFFC0)) | ((cc >> 4) & Int4(0x3F)));
-			c.y = Short4(((cc >> 4) & Int4(0xFFC0)) | ((cc >> 14) & Int4(0x3F)));
-			c.z = Short4(((cc >> 14) & Int4(0xFFC0)) | ((cc >> 24) & Int4(0x3F)));
-			c.w = Short4(((cc >> 16) & Int4(0xC000)));
-
-			// Replicate the 2-bit alpha component all the way down.
-			c.w |= (c.w >> 8) & Short4(0xc0);
-			c.w |= (c.w >> 4) & Short4(0x0c0c);
-			c.w |= (c.w >> 2) & Short4(0x3333);
-		}
-		else if(state.textureFormat == VK_FORMAT_A2B10G10R10_UINT_PACK32)
-		{
-			Int4 cc;
-			cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
-			cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
-			cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
-			cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);
-
-			c.x = Short4(((cc) & Int4(0x3FF)));
-			c.y = Short4(((cc >> 10) & Int4(0x3FF)));
-			c.z = Short4(((cc >> 20) & Int4(0x3FF)));
-			c.w = Short4(((cc >> 30) & Int4(0x3)));
-		}
-		else ASSERT(false);
-
-		if (state.textureFormat.isSRGBformat())
-		{
-			for(int i = 0; i < textureComponentCount(); i++)
-			{
-				if(isRGBComponent(i))
-				{
-					sRGBtoLinear16_8_16(c[i]);
+					c.x = Unpack(As<Byte4>(c0));
+					break;
 				}
 			}
-		}
-
-		return c;
-	}
-
-	Vector4s SamplerCore::sampleTexel(Short4 &uuuu, Short4 &vvvv, Short4 &wwww, Vector4f &offset, Pointer<Byte> &mipmap, const Short4& cubeArrayId, const Int4& sampleId, Pointer<Byte> buffer, SamplerFunction function)
-	{
-		Vector4s c;
-
-		UInt index[4];
-		computeIndices(index, uuuu, vvvv, wwww, offset, mipmap, cubeArrayId, sampleId, function);
-
-		if(isYcbcrFormat())
-		{
-			// Pointers to the planes of YCbCr images are stored in consecutive mipmap levels.
-			Pointer<Byte> bufferY = buffer;  // *Pointer<Pointer<Byte>>(mipmap + 0 * sizeof(Mipmap) + OFFSET(Mipmap, buffer));
-			Pointer<Byte> bufferU = *Pointer<Pointer<Byte>>(mipmap + 1 * sizeof(Mipmap) + OFFSET(Mipmap, buffer));  // U/V for 2-plane interleaved formats.
-			Pointer<Byte> bufferV = *Pointer<Pointer<Byte>>(mipmap + 2 * sizeof(Mipmap) + OFFSET(Mipmap, buffer));
-
-			// Luminance
-			Int c0 = Int(bufferY[index[0]]);
-			Int c1 = Int(bufferY[index[1]]);
-			Int c2 = Int(bufferY[index[2]]);
-			Int c3 = Int(bufferY[index[3]]);
-			c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
-			UShort4 Y = As<UShort4>(Unpack(As<Byte4>(c0)));
-
-			UShort4 Cb, Cr;
-
-			// Chroma
-			{
-				computeIndices(index, uuuu, vvvv, wwww, offset, mipmap + sizeof(Mipmap), cubeArrayId, sampleId, function);
-				UShort4 U, V;
-
-				if(state.textureFormat == VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM)
-				{
-					c0 = Int(bufferU[index[0]]);
-					c1 = Int(bufferU[index[1]]);
-					c2 = Int(bufferU[index[2]]);
-					c3 = Int(bufferU[index[3]]);
-					c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
-					U = As<UShort4>(Unpack(As<Byte4>(c0)));
-
-					c0 = Int(bufferV[index[0]]);
-					c1 = Int(bufferV[index[1]]);
-					c2 = Int(bufferV[index[2]]);
-					c3 = Int(bufferV[index[3]]);
-					c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
-					V = As<UShort4>(Unpack(As<Byte4>(c0)));
-				}
-				else if(state.textureFormat == VK_FORMAT_G8_B8R8_2PLANE_420_UNORM)
-				{
-					Short4 UV;
-					UV = Insert(UV, Pointer<Short>(bufferU)[index[0]], 0);  // TODO: Insert(UShort4, UShort)
-					UV = Insert(UV, Pointer<Short>(bufferU)[index[1]], 1);
-					UV = Insert(UV, Pointer<Short>(bufferU)[index[2]], 2);
-					UV = Insert(UV, Pointer<Short>(bufferU)[index[3]], 3);
-					U = (UV & Short4(0x00FFu)) | (UV << 8);
-					V = (UV & Short4(0xFF00u)) | As<Short4>(As<UShort4>(UV) >> 8);
-				}
-				else UNSUPPORTED("state.textureFormat %d", (int)state.textureFormat);
-
-				if(!state.swappedChroma)
-				{
-					Cb = U;
-					Cr = V;
-				}
-				else
-				{
-					Cb = V;
-					Cr = U;
-				}
-			}
-
-			if(state.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY)
-			{
-				// YCbCr formats are treated as signed 15-bit.
-				c.x = Cr >> 1;
-				c.y = Y  >> 1;
-				c.z = Cb >> 1;
-			}
-			else
-			{
-				// Scaling and bias for studio-swing range: Y = [16 .. 235], U/V = [16 .. 240]
-				// Scale down by 0x0101 to normalize the 8.8 samples, and up by 0x7FFF for signed 15-bit output.
-				float yOffset  = static_cast<float>(state.studioSwing ? 16 * 0x0101 : 0);
-				float uvOffset = static_cast<float>(128 * 0x0101);
-				float yFactor  = static_cast<float>(0x7FFF) / static_cast<float>(state.studioSwing ? 219 * 0x0101 : 255 * 0x0101);
-				float uvFactor = static_cast<float>(0x7FFF) / static_cast<float>(state.studioSwing ? 224 * 0x0101 : 255 * 0x0101);
-
-				Float4 y = (Float4(Y)  - Float4(yOffset))  * Float4(yFactor);
-				Float4 u = (Float4(Cb) - Float4(uvOffset)) * Float4(uvFactor);
-				Float4 v = (Float4(Cr) - Float4(uvOffset)) * Float4(uvFactor);
-
-				if(state.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_IDENTITY)
-				{
-					c.x = Short4(v);
-					c.y = Short4(y);
-					c.z = Short4(u);
-				}
-				else
-				{
-					// Generic YCbCr to RGB transformation:
-					// R = Y                               +           2 * (1 - Kr) * Cr
-					// G = Y - 2 * Kb * (1 - Kb) / Kg * Cb - 2 * Kr * (1 - Kr) / Kg * Cr
-					// B = Y +           2 * (1 - Kb) * Cb
-
-					float Kb = 0.114f;
-					float Kr = 0.299f;
-
-					switch(state.ycbcrModel)
-					{
-					case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_709:
-						Kb = 0.0722f;
-						Kr = 0.2126f;
-						break;
-					case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601:
-						Kb = 0.114f;
-						Kr = 0.299f;
-						break;
-					case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_2020:
-						Kb = 0.0593f;
-						Kr = 0.2627f;
-						break;
-					default:
-						UNSUPPORTED("ycbcrModel %d", int(state.ycbcrModel));
-					}
-
-					const float Kg = 1.0f - Kr - Kb;
-
-					const float Rr = 2 * (1 - Kr);
-					const float Gb = -2 * Kb * (1 - Kb) / Kg;
-					const float Gr = -2 * Kr * (1 - Kr) / Kg;
-					const float Bb = 2 * (1 - Kb);
-
-					Float4 r = y                  + Float4(Rr) * v;
-					Float4 g = y + Float4(Gb) * u + Float4(Gr) * v;
-					Float4 b = y + Float4(Bb) * u                 ;
-
-					c.x = Short4(r);
-					c.y = Short4(g);
-					c.z = Short4(b);
-				}
-			}
-		}
-		else
-		{
-			return sampleTexel(index, buffer);
-		}
-
-		return c;
-	}
-
-	Vector4f SamplerCore::sampleTexel(Int4 &uuuu, Int4 &vvvv, Int4 &wwww, Float4 &z, Pointer<Byte> &mipmap, const Int4& cubeArrayId, const Int4& sampleId, Pointer<Byte> buffer, SamplerFunction function)
-	{
-		Int4 valid;
-
-		if(borderModeActive())
-		{
-			// Valid texels have positive coordinates.
-			Int4 negative = Int4(0);
-			if(state.addressingModeU == ADDRESSING_BORDER) negative |= uuuu;
-			if(state.addressingModeV == ADDRESSING_BORDER) negative |= vvvv;
-			if(state.addressingModeW == ADDRESSING_BORDER) negative |= wwww;
-			valid = CmpNLT(negative, Int4(0));
-		}
-
-		UInt index[4];
-		UInt4 t0, t1, t2, t3;
-		computeIndices(index, uuuu, vvvv, wwww, valid, mipmap, cubeArrayId, sampleId, function);
-
-		Vector4f c;
-
-		if(hasFloatTexture() || has32bitIntegerTextureComponents())
-		{
-			switch (state.textureFormat)
-			{
-			case VK_FORMAT_R16_SFLOAT:
-				t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 2));
-				t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 2));
-				t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 2));
-				t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 2));
-
-				c.x.x = Extract(As<Float4>(halfToFloatBits(t0)), 0);
-				c.x.y = Extract(As<Float4>(halfToFloatBits(t1)), 0);
-				c.x.z = Extract(As<Float4>(halfToFloatBits(t2)), 0);
-				c.x.w = Extract(As<Float4>(halfToFloatBits(t3)), 0);
-				break;
-			case VK_FORMAT_R16G16_SFLOAT:
-				t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 4));
-				t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 4));
-				t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 4));
-				t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 4));
-
-				// FIXME: shuffles
-				c.x = As<Float4>(halfToFloatBits(t0));
-				c.y = As<Float4>(halfToFloatBits(t1));
-				c.z = As<Float4>(halfToFloatBits(t2));
-				c.w = As<Float4>(halfToFloatBits(t3));
-				transpose4x4(c.x, c.y, c.z, c.w);
-				break;
-			case VK_FORMAT_R16G16B16A16_SFLOAT:
-				t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 8));
-				t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 8));
-				t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 8));
-				t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 8));
-
-				c.x = As<Float4>(halfToFloatBits(t0));
-				c.y = As<Float4>(halfToFloatBits(t1));
-				c.z = As<Float4>(halfToFloatBits(t2));
-				c.w = As<Float4>(halfToFloatBits(t3));
-				transpose4x4(c.x, c.y, c.z, c.w);
-				break;
-			case VK_FORMAT_R32_SFLOAT:
-			case VK_FORMAT_R32_SINT:
-			case VK_FORMAT_R32_UINT:
-			case VK_FORMAT_D32_SFLOAT:
-				// FIXME: Optimal shuffling?
-				c.x.x = *Pointer<Float>(buffer + index[0] * 4);
-				c.x.y = *Pointer<Float>(buffer + index[1] * 4);
-				c.x.z = *Pointer<Float>(buffer + index[2] * 4);
-				c.x.w = *Pointer<Float>(buffer + index[3] * 4);
-				break;
-			case VK_FORMAT_R32G32_SFLOAT:
-			case VK_FORMAT_R32G32_SINT:
-			case VK_FORMAT_R32G32_UINT:
-				// FIXME: Optimal shuffling?
-				c.x.xy = *Pointer<Float4>(buffer + index[0] * 8);
-				c.x.zw = *Pointer<Float4>(buffer + index[1] * 8 - 8);
-				c.z.xy = *Pointer<Float4>(buffer + index[2] * 8);
-				c.z.zw = *Pointer<Float4>(buffer + index[3] * 8 - 8);
-				c.y = c.x;
-				c.x = Float4(c.x.xz, c.z.xz);
-				c.y = Float4(c.y.yw, c.z.yw);
-				break;
-			case VK_FORMAT_R32G32B32_SFLOAT:
-			case VK_FORMAT_R32G32B32_SINT:
-			case VK_FORMAT_R32G32B32_UINT:
-				c.x = *Pointer<Float4>(buffer + index[0] * 16, 16);
-				c.y = *Pointer<Float4>(buffer + index[1] * 16, 16);
-				c.z = *Pointer<Float4>(buffer + index[2] * 16, 16);
-				c.w = *Pointer<Float4>(buffer + index[3] * 16, 16);
-				transpose4x3(c.x, c.y, c.z, c.w);
-				break;
-			case VK_FORMAT_R32G32B32A32_SFLOAT:
-			case VK_FORMAT_R32G32B32A32_SINT:
-			case VK_FORMAT_R32G32B32A32_UINT:
-				c.x = *Pointer<Float4>(buffer + index[0] * 16, 16);
-				c.y = *Pointer<Float4>(buffer + index[1] * 16, 16);
-				c.z = *Pointer<Float4>(buffer + index[2] * 16, 16);
-				c.w = *Pointer<Float4>(buffer + index[3] * 16, 16);
-				transpose4x4(c.x, c.y, c.z, c.w);
-				break;
-			case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
-			{
-				Float4 t;		// TODO: add Insert(UInt4, RValue<UInt>)
-				t.x = *Pointer<Float>(buffer + index[0] * 4);
-				t.y = *Pointer<Float>(buffer + index[1] * 4);
-				t.z = *Pointer<Float>(buffer + index[2] * 4);
-				t.w = *Pointer<Float>(buffer + index[3] * 4);
-				t0 = As<UInt4>(t);
-				c.w = Float4(UInt4(1) << ((t0 >> 27) & UInt4(0x1F))) * Float4(1.0f / (1 << 24));
-				c.x = Float4((t0) & UInt4(0x1FF)) * c.w;
-				c.y = Float4((t0 >> 9) & UInt4(0x1FF)) * c.w;
-				c.z = Float4((t0 >> 18) & UInt4(0x1FF)) * c.w;
-				break;
-			}
-			case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
-			{
-				Float4 t;		// TODO: add Insert(UInt4, RValue<UInt>)
-				t.x = *Pointer<Float>(buffer + index[0] * 4);
-				t.y = *Pointer<Float>(buffer + index[1] * 4);
-				t.z = *Pointer<Float>(buffer + index[2] * 4);
-				t.w = *Pointer<Float>(buffer + index[3] * 4);
-				t0 = As<UInt4>(t);
-				c.x = As<Float4>(halfToFloatBits((t0 << 4) & UInt4(0x7FF0)));
-				c.y = As<Float4>(halfToFloatBits((t0 >> 7) & UInt4(0x7FF0)));
-				c.z = As<Float4>(halfToFloatBits((t0 >> 17) & UInt4(0x7FE0)));
-				break;
-			}
-			default:
-				UNIMPLEMENTED("Format %d", VkFormat(state.textureFormat));
-			}
-		}
-		else
-		{
-			ASSERT(!isYcbcrFormat());
-
-			Vector4s cs = sampleTexel(index, buffer);
-
-			bool isInteger = state.textureFormat.isNonNormalizedInteger();
-			int componentCount = textureComponentCount();
-			for(int n = 0; n < componentCount; n++)
-			{
-				if(hasUnsignedTextureComponent(n))
-				{
-					if(isInteger)
-					{
-						c[n] = As<Float4>(Int4(As<UShort4>(cs[n])));
-					}
-					else
-					{
-						c[n] = Float4(As<UShort4>(cs[n]));
-					}
-				}
-				else
-				{
-					if(isInteger)
-					{
-						c[n] = As<Float4>(Int4(cs[n]));
-					}
-					else
-					{
-						c[n] = Float4(cs[n]);
-					}
-				}
-			}
-		}
-
-		if(state.compareEnable)
-		{
-			Float4 ref = z;
-
-			if(!hasFloatTexture())
-			{
-				// D16_UNORM: clamp reference, normalize texel value
-				ref = Min(Max(ref, Float4(0.0f)), Float4(1.0f));
-				c.x = c.x * Float4(1.0f / 0xFFFF);
-			}
-
-			Int4 boolean;
-
-			switch(state.compareOp)
-			{
-			case VK_COMPARE_OP_LESS_OR_EQUAL:    boolean = CmpLE(ref, c.x);  break;
-			case VK_COMPARE_OP_GREATER_OR_EQUAL: boolean = CmpNLT(ref, c.x); break;
-			case VK_COMPARE_OP_LESS:             boolean = CmpLT(ref, c.x);  break;
-			case VK_COMPARE_OP_GREATER:          boolean = CmpNLE(ref, c.x); break;
-			case VK_COMPARE_OP_EQUAL:            boolean = CmpEQ(ref, c.x);  break;
-			case VK_COMPARE_OP_NOT_EQUAL:        boolean = CmpNEQ(ref, c.x); break;
-			case VK_COMPARE_OP_ALWAYS:           boolean = Int4(-1);         break;
-			case VK_COMPARE_OP_NEVER:            boolean = Int4(0);          break;
-			default:                   ASSERT(false);
-			}
-
-			c.x = As<Float4>(boolean & As<Int4>(Float4(1.0f)));
-			c.y = Float4(0.0f);
-			c.z = Float4(0.0f);
-			c.w = Float4(1.0f);
-		}
-
-		if(borderModeActive())
-		{
-			c = replaceBorderTexel(c, valid);
-		}
-
-		return c;
-	}
-
-	Vector4f SamplerCore::replaceBorderTexel(const Vector4f &c, Int4 valid)
-	{
-		Int4 borderRGB;
-		Int4 borderA;
-
-		bool scaled = !hasFloatTexture() && !hasUnnormalizedIntegerTexture() && !state.compareEnable;
-		bool sign = !hasUnsignedTextureComponent(0);
-		Int4 float_one = scaled ? As<Int4>(Float4(static_cast<float>(sign ? 0x7FFF : 0xFFFF))) : As<Int4>(Float4(1.0f));
-
-		switch(state.border)
-		{
-		case VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK:
-		case VK_BORDER_COLOR_INT_TRANSPARENT_BLACK:
-			borderRGB = Int4(0);
-			borderA = Int4(0);
-			break;
-		case VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK:
-			borderRGB = Int4(0);
-			borderA = float_one;
-			break;
-		case VK_BORDER_COLOR_INT_OPAQUE_BLACK:
-			borderRGB = Int4(0);
-			borderA = Int4(1);
-			break;
-		case VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE:
-			borderRGB = float_one;
-			borderA = float_one;
-			break;
-		case VK_BORDER_COLOR_INT_OPAQUE_WHITE:
-			borderRGB = Int4(1);
-			borderA = Int4(1);
 			break;
 		default:
-			UNIMPLEMENTED("sint/uint/sfloat border: %u", state.border);
+			ASSERT(false);
 		}
+	}
+	else if(has16bitTextureComponents())
+	{
+		switch(textureComponentCount())
+		{
+		case 4:
+			c.x = Pointer<Short4>(buffer)[index[0]];
+			c.y = Pointer<Short4>(buffer)[index[1]];
+			c.z = Pointer<Short4>(buffer)[index[2]];
+			c.w = Pointer<Short4>(buffer)[index[3]];
+			transpose4x4(c.x, c.y, c.z, c.w);
+			break;
+		case 3:
+			c.x = Pointer<Short4>(buffer)[index[0]];
+			c.y = Pointer<Short4>(buffer)[index[1]];
+			c.z = Pointer<Short4>(buffer)[index[2]];
+			c.w = Pointer<Short4>(buffer)[index[3]];
+			transpose4x3(c.x, c.y, c.z, c.w);
+			break;
+		case 2:
+			c.x = *Pointer<Short4>(buffer + 4 * index[0]);
+			c.x = As<Short4>(UnpackLow(c.x, *Pointer<Short4>(buffer + 4 * index[1])));
+			c.z = *Pointer<Short4>(buffer + 4 * index[2]);
+			c.z = As<Short4>(UnpackLow(c.z, *Pointer<Short4>(buffer + 4 * index[3])));
+			c.y = c.x;
+			c.x = UnpackLow(As<Int2>(c.x), As<Int2>(c.z));
+			c.y = UnpackHigh(As<Int2>(c.y), As<Int2>(c.z));
+			break;
+		case 1:
+			c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
+			c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
+			c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
+			c.x = Insert(c.x, Pointer<Short>(buffer)[index[3]], 3);
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+	else if(state.textureFormat == VK_FORMAT_A2B10G10R10_UNORM_PACK32)
+	{
+		Int4 cc;
+		cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
+		cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
+		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
+		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);
 
-		Vector4f out;
-		out.x = As<Float4>((valid & As<Int4>(c.x)) | (~valid & borderRGB));
-		out.y = As<Float4>((valid & As<Int4>(c.y)) | (~valid & borderRGB));
-		out.z = As<Float4>((valid & As<Int4>(c.z)) | (~valid & borderRGB));
-		out.w = As<Float4>((valid & As<Int4>(c.w)) | (~valid & borderA));
+		// Shift each 10-bit field left 6, and replicate the 6 high bits into the bottom 6.
+		c.x = Short4(((cc << 6) & Int4(0xFFC0)) | ((cc >> 4) & Int4(0x3F)));
+		c.y = Short4(((cc >> 4) & Int4(0xFFC0)) | ((cc >> 14) & Int4(0x3F)));
+		c.z = Short4(((cc >> 14) & Int4(0xFFC0)) | ((cc >> 24) & Int4(0x3F)));
+		c.w = Short4(((cc >> 16) & Int4(0xC000)));
 
-		return out;
+		// Replicate the 2-bit alpha component all the way down.
+		c.w |= (c.w >> 8) & Short4(0xc0);
+		c.w |= (c.w >> 4) & Short4(0x0c0c);
+		c.w |= (c.w >> 2) & Short4(0x3333);
+	}
+	else if(state.textureFormat == VK_FORMAT_A2B10G10R10_UINT_PACK32)
+	{
+		Int4 cc;
+		cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
+		cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
+		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
+		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);
+
+		c.x = Short4(((cc) & Int4(0x3FF)));
+		c.y = Short4(((cc >> 10) & Int4(0x3FF)));
+		c.z = Short4(((cc >> 20) & Int4(0x3FF)));
+		c.w = Short4(((cc >> 30) & Int4(0x3)));
+	}
+	else ASSERT(false);
+
+	if (state.textureFormat.isSRGBformat())
+	{
+		for(int i = 0; i < textureComponentCount(); i++)
+		{
+			if(isRGBComponent(i))
+			{
+				sRGBtoLinear16_8_16(c[i]);
+			}
+		}
 	}
 
-	void SamplerCore::selectMipmap(const Pointer<Byte> &texture, Pointer<Byte> &mipmap, Pointer<Byte> &buffer, const Float &lod, bool secondLOD)
-	{
-		Pointer<Byte> mipmap0 = texture + OFFSET(Texture, mipmap[0]);
+	return c;
+}
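The packed-format cases above expand UNORM fields by bit replication, so an all-zeros field maps to 0x0000 and an all-ones field to 0xFFFF, as UNORM expansion requires. A scalar sketch for the A2B10G10R10 case, with plain integers:

#include <cstdint>

// Expand a 10-bit field at the given bit position to 16 bits: shift it into
// the top and replicate its 6 high bits into the 6 low bits.
static uint16_t expand10To16(uint32_t packed, int shift)
{
	uint32_t field = (packed >> shift) & 0x3FF;
	return uint16_t((field << 6) | (field >> 4));  // 0x3FF -> 0xFFFF
}

// The 2-bit alpha is replicated all the way down the same way.
static uint16_t expand2To16(uint32_t packed)
{
	uint32_t a = (packed >> 30) & 0x3;  // 2-bit alpha
	a |= a << 2;                        // 2 -> 4 bits
	a |= a << 4;                        // 4 -> 8 bits
	return uint16_t(a | (a << 8));      // 8 -> 16 bits
}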
 
-		if(state.mipmapFilter == MIPMAP_NONE)
+Vector4s SamplerCore::sampleTexel(Short4 &uuuu, Short4 &vvvv, Short4 &wwww, Vector4f &offset, Pointer<Byte> &mipmap, const Short4& cubeArrayId, const Int4& sampleId, Pointer<Byte> buffer, SamplerFunction function)
+{
+	Vector4s c;
+
+	UInt index[4];
+	computeIndices(index, uuuu, vvvv, wwww, offset, mipmap, cubeArrayId, sampleId, function);
+
+	if(isYcbcrFormat())
+	{
+		// Pointers to the planes of YCbCr images are stored in consecutive mipmap levels.
+		Pointer<Byte> bufferY = buffer;  // *Pointer<Pointer<Byte>>(mipmap + 0 * sizeof(Mipmap) + OFFSET(Mipmap, buffer));
+		Pointer<Byte> bufferU = *Pointer<Pointer<Byte>>(mipmap + 1 * sizeof(Mipmap) + OFFSET(Mipmap, buffer));  // U/V for 2-plane interleaved formats.
+		Pointer<Byte> bufferV = *Pointer<Pointer<Byte>>(mipmap + 2 * sizeof(Mipmap) + OFFSET(Mipmap, buffer));
+
+		// Luminance
+		Int c0 = Int(bufferY[index[0]]);
+		Int c1 = Int(bufferY[index[1]]);
+		Int c2 = Int(bufferY[index[2]]);
+		Int c3 = Int(bufferY[index[3]]);
+		c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
+		UShort4 Y = As<UShort4>(Unpack(As<Byte4>(c0)));
+
+		UShort4 Cb, Cr;
+
+		// Chroma
 		{
-			mipmap = mipmap0;
+			computeIndices(index, uuuu, vvvv, wwww, offset, mipmap + sizeof(Mipmap), cubeArrayId, sampleId, function);
+			UShort4 U, V;
+
+			if(state.textureFormat == VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM)
+			{
+				c0 = Int(bufferU[index[0]]);
+				c1 = Int(bufferU[index[1]]);
+				c2 = Int(bufferU[index[2]]);
+				c3 = Int(bufferU[index[3]]);
+				c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
+				U = As<UShort4>(Unpack(As<Byte4>(c0)));
+
+				c0 = Int(bufferV[index[0]]);
+				c1 = Int(bufferV[index[1]]);
+				c2 = Int(bufferV[index[2]]);
+				c3 = Int(bufferV[index[3]]);
+				c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
+				V = As<UShort4>(Unpack(As<Byte4>(c0)));
+			}
+			else if(state.textureFormat == VK_FORMAT_G8_B8R8_2PLANE_420_UNORM)
+			{
+				Short4 UV;
+				UV = Insert(UV, Pointer<Short>(bufferU)[index[0]], 0);  // TODO: Insert(UShort4, UShort)
+				UV = Insert(UV, Pointer<Short>(bufferU)[index[1]], 1);
+				UV = Insert(UV, Pointer<Short>(bufferU)[index[2]], 2);
+				UV = Insert(UV, Pointer<Short>(bufferU)[index[3]], 3);
+				U = (UV & Short4(0x00FFu)) | (UV << 8);
+				V = (UV & Short4(0xFF00u)) | As<Short4>(As<UShort4>(UV) >> 8);
+			}
+			else UNSUPPORTED("state.textureFormat %d", (int)state.textureFormat);
+
+			if(!state.swappedChroma)
+			{
+				Cb = U;
+				Cr = V;
+			}
+			else
+			{
+				Cb = V;
+				Cr = U;
+			}
+		}
+
+		if(state.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY)
+		{
+			// YCbCr formats are treated as signed 15-bit.
+			c.x = Cr >> 1;
+			c.y = Y  >> 1;
+			c.z = Cb >> 1;
 		}
 		else
 		{
-			Int ilod;
+			// Scaling and bias for studio-swing range: Y = [16 .. 235], U/V = [16 .. 240]
+			// Scale down by 0x0101 to normalize the 8.8 samples, and up by 0x7FFF for signed 15-bit output.
+			float yOffset  = static_cast<float>(state.studioSwing ? 16 * 0x0101 : 0);
+			float uvOffset = static_cast<float>(128 * 0x0101);
+			float yFactor  = static_cast<float>(0x7FFF) / static_cast<float>(state.studioSwing ? 219 * 0x0101 : 255 * 0x0101);
+			float uvFactor = static_cast<float>(0x7FFF) / static_cast<float>(state.studioSwing ? 224 * 0x0101 : 255 * 0x0101);
 
-			if(state.mipmapFilter == MIPMAP_POINT)
+			Float4 y = (Float4(Y)  - Float4(yOffset))  * Float4(yFactor);
+			Float4 u = (Float4(Cb) - Float4(uvOffset)) * Float4(uvFactor);
+			Float4 v = (Float4(Cr) - Float4(uvOffset)) * Float4(uvFactor);
+
+			if(state.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_IDENTITY)
 			{
-				// TODO: Preferred formula is ceil(lod + 0.5) - 1
-				ilod = RoundInt(lod);
+				c.x = Short4(v);
+				c.y = Short4(y);
+				c.z = Short4(u);
 			}
-			else   // MIPMAP_LINEAR
+			else
 			{
-				ilod = Int(lod);
-			}
+				// Generic YCbCr to RGB transformation:
+				// R = Y                               +           2 * (1 - Kr) * Cr
+				// G = Y - 2 * Kb * (1 - Kb) / Kg * Cb - 2 * Kr * (1 - Kr) / Kg * Cr
+				// B = Y +           2 * (1 - Kb) * Cb
 
-			mipmap = mipmap0 + ilod * sizeof(Mipmap) + secondLOD * sizeof(Mipmap);
-		}
+				float Kb = 0.114f;
+				float Kr = 0.299f;
 
-		buffer = *Pointer<Pointer<Byte>>(mipmap + OFFSET(Mipmap, buffer));
-	}
-
-	Int4 SamplerCore::computeFilterOffset(Float &lod)
-	{
-		if(state.textureFilter == FILTER_POINT)
-		{
-			return Int4(0);
-		}
-		else if(state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
-		{
-			return CmpNLE(Float4(lod), Float4(0.0f));
-		}
-		else if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR)
-		{
-			return CmpLE(Float4(lod), Float4(0.0f));
-		}
-
-		return Int4(~0);
-	}
-
-	Short4 SamplerCore::address(const Float4 &uw, AddressingMode addressingMode, Pointer<Byte> &mipmap)
-	{
-		if(addressingMode == ADDRESSING_UNUSED)
-		{
-			return Short4();
-		}
-		else if(addressingMode == ADDRESSING_LAYER)
-		{
-			Int4 dim = *Pointer<Int4>(mipmap + OFFSET(Mipmap, depth));
-			// For cube maps, the layer argument is per cube, each of which has 6 layers
-			if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
-			{
-				dim = dim / Int4(6);
-			}
-
-			return Short4(Min(Max(RoundInt(uw), Int4(0)), dim - Int4(1)));
-		}
-		else if(addressingMode == ADDRESSING_CLAMP || addressingMode == ADDRESSING_BORDER)
-		{
-			Float4 clamp = Min(Max(uw, Float4(0.0f)), Float4(65535.0f / 65536.0f));
-
-			return Short4(Int4(clamp * Float4(1 << 16)));
-		}
-		else if(addressingMode == ADDRESSING_MIRROR)
-		{
-			Int4 convert = Int4(uw * Float4(1 << 16));
-			Int4 mirror = (convert << 15) >> 31;
-
-			convert ^= mirror;
-
-			return Short4(convert);
-		}
-		else if(addressingMode == ADDRESSING_MIRRORONCE)
-		{
-			// Absolute value
-			Int4 convert = Int4(Abs(uw * Float4(1 << 16)));
-
-			// Clamp
-			convert -= Int4(0x00008000, 0x00008000, 0x00008000, 0x00008000);
-			convert = As<Int4>(PackSigned(convert, convert));
-
-			return As<Short4>(Int2(convert)) + Short4(0x8000u);
-		}
-		else   // Wrap
-		{
-			return Short4(Int4(uw * Float4(1 << 16)));
-		}
-	}
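The ADDRESSING_MIRROR branch above works in 16.16 fixed point: bit 16 is the integer part's lowest bit, so (convert << 15) >> 31 smears it into an all-ones mask on odd periods, and the XOR then bitwise-NOTs the coordinate, which after truncation to the 16 fraction bits is the mirrored position. A scalar sketch, assuming two's-complement arithmetic shifts as Reactor's Int provides:

#include <cstdint>

static uint16_t mirrorAddress(float u)
{
	int32_t convert = int32_t(u * 65536.0f);  // 16.16 fixed point
	int32_t mirror = (convert << 15) >> 31;   // -1 on odd periods, 0 on even
	convert ^= mirror;                        // ~x == -x - 1: reflect the fraction
	return uint16_t(convert);                 // Keep the 16 fraction bits
}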
-
-	// TODO: Eliminate when the gather + mirror addressing case is handled by mirroring the footprint.
-	static Int4 mirror(Int4 n)
-	{
-		auto positive = CmpNLT(n, Int4(0));
-		return (positive & n) | (~positive & (-(Int4(1) + n)));
-	}
-
-	static Int4 mod(Int4 n, Int4 d)
-	{
-		auto x = n % d;
-		auto positive = CmpNLT(x, Int4(0));
-		return (positive & x) | (~positive & (x + d));
-	}
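Scalar equivalents of the two branch-free helpers above, with plain ints: mod() folds a remainder into [0, d), and mirror() reflects negative values so -1 maps to 0, -2 to 1, and so on:

static int modScalar(int n, int d)
{
	int x = n % d;
	return (x >= 0) ? x : x + d;  // (positive & x) | (~positive & (x + d))
}

static int mirrorScalar(int n)
{
	return (n >= 0) ? n : -(1 + n);  // (positive & n) | (~positive & (-(1 + n)))
}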
-
-	void SamplerCore::address(const Float4 &uvw, Int4 &xyz0, Int4 &xyz1, Float4 &f, Pointer<Byte> &mipmap, Float4 &texOffset, Int4 &filter, int whd, AddressingMode addressingMode, SamplerFunction function)
-	{
-		if(addressingMode == ADDRESSING_UNUSED)
-		{
-			return;
-		}
-
-		Int4 dim = *Pointer<Int4>(mipmap + whd, 16);
-		Int4 maxXYZ = dim - Int4(1);
-
-		if(function == Fetch)
-		{
-			xyz0 = Min(Max(((function.offset != 0) && (addressingMode != ADDRESSING_LAYER)) ? As<Int4>(uvw) + As<Int4>(texOffset) : As<Int4>(uvw), Int4(0)), maxXYZ);
-		}
-		else if(addressingMode == ADDRESSING_LAYER)   // Note: Offset does not apply to array layers
-		{
-			// For cube maps, the layer argument is per cube, each of which has 6 layers
-			if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
-			{
-				dim = dim / Int4(6);
-			}
-
-			xyz0 = Min(Max(RoundInt(uvw), Int4(0)), dim - Int4(1));
-		}
-		else if(addressingMode == ADDRESSING_CUBEFACE)
-		{
-			xyz0 = As<Int4>(uvw);
-		}
-		else
-		{
-			const int halfBits = 0x3EFFFFFF;   // Value just under 0.5f
-			const int oneBits  = 0x3F7FFFFF;   // Value just under 1.0f
-			const int twoBits  = 0x3FFFFFFF;   // Value just under 2.0f
-
-			bool pointFilter = state.textureFilter == FILTER_POINT ||
-			                   state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR ||
-			                   state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT;
-
-			Float4 coord = uvw;
-
-			if(state.unnormalizedCoordinates)
-			{
-				switch(addressingMode)
+				switch(state.ycbcrModel)
 				{
-				case ADDRESSING_CLAMP:
-					coord = Min(Max(coord, Float4(0.0f)), Float4(dim) * As<Float4>(Int4(oneBits)));
+				case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_709:
+					Kb = 0.0722f;
+					Kr = 0.2126f;
 					break;
-				case ADDRESSING_BORDER:
-					// Don't map to a valid range here.
+				case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601:
+					Kb = 0.114f;
+					Kr = 0.299f;
+					break;
+				case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_2020:
+					Kb = 0.0593f;
+					Kr = 0.2627f;
 					break;
 				default:
-					// If unnormalizedCoordinates is VK_TRUE, addressModeU and addressModeV must each be
-					// either VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE or VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER
-					UNREACHABLE("addressingMode %d", int(addressingMode));
-					break;
+					UNSUPPORTED("ycbcrModel %d", int(state.ycbcrModel));
 				}
+
+				const float Kg = 1.0f - Kr - Kb;
+
+				const float Rr = 2 * (1 - Kr);
+				const float Gb = -2 * Kb * (1 - Kb) / Kg;
+				const float Gr = -2 * Kr * (1 - Kr) / Kg;
+				const float Bb = 2 * (1 - Kb);
+
+				Float4 r = y                  + Float4(Rr) * v;
+				Float4 g = y + Float4(Gb) * u + Float4(Gr) * v;
+				Float4 b = y + Float4(Bb) * u                 ;
+
+				c.x = Short4(r);
+				c.y = Short4(g);
+				c.z = Short4(b);
 			}
-			else if(state.textureFilter == FILTER_GATHER && addressingMode == ADDRESSING_MIRROR)
+		}
+	}
+	else
+	{
+		return sampleTexel(index, buffer);
+	}
+
+	return c;
+}
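The non-identity models above implement the generic YCbCr-to-RGB transformation quoted in the comment, with Kr/Kb chosen per standard and optional studio-swing normalization. A scalar sketch for BT.601 with 8-bit studio-swing inputs (the real code works on 8.8 fixed-point samples scaled to signed 15-bit):

// Assumes narrow-range 8-bit input: Y in [16, 235], Cb/Cr in [16, 240].
static void ycbcrToRgb601(float Y, float Cb, float Cr,
                          float &r, float &g, float &b)
{
	const float Kb = 0.114f;
	const float Kr = 0.299f;
	const float Kg = 1.0f - Kr - Kb;

	float y = (Y - 16.0f) / 219.0f;    // Normalize studio-swing luma to [0, 1]
	float u = (Cb - 128.0f) / 224.0f;  // ...and chroma to [-0.5, 0.5]
	float v = (Cr - 128.0f) / 224.0f;

	r = y + 2.0f * (1.0f - Kr) * v;
	g = y - 2.0f * Kb * (1.0f - Kb) / Kg * u - 2.0f * Kr * (1.0f - Kr) / Kg * v;
	b = y + 2.0f * (1.0f - Kb) * u;
}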
+
+Vector4f SamplerCore::sampleTexel(Int4 &uuuu, Int4 &vvvv, Int4 &wwww, Float4 &z, Pointer<Byte> &mipmap, const Int4& cubeArrayId, const Int4& sampleId, Pointer<Byte> buffer, SamplerFunction function)
+{
+	Int4 valid;
+
+	if(borderModeActive())
+	{
+		// Valid texels have positive coordinates.
+		Int4 negative = Int4(0);
+		if(state.addressingModeU == ADDRESSING_BORDER) negative |= uuuu;
+		if(state.addressingModeV == ADDRESSING_BORDER) negative |= vvvv;
+		if(state.addressingModeW == ADDRESSING_BORDER) negative |= wwww;
+		valid = CmpNLT(negative, Int4(0));
+	}
+
+	UInt index[4];
+	UInt4 t0, t1, t2, t3;
+	computeIndices(index, uuuu, vvvv, wwww, valid, mipmap, cubeArrayId, sampleId, function);
+
+	Vector4f c;
+
+	if(hasFloatTexture() || has32bitIntegerTextureComponents())
+	{
+		switch (state.textureFormat)
+		{
+		case VK_FORMAT_R16_SFLOAT:
+			t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 2));
+			t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 2));
+			t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 2));
+			t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 2));
+
+			c.x.x = Extract(As<Float4>(halfToFloatBits(t0)), 0);
+			c.x.y = Extract(As<Float4>(halfToFloatBits(t1)), 0);
+			c.x.z = Extract(As<Float4>(halfToFloatBits(t2)), 0);
+			c.x.w = Extract(As<Float4>(halfToFloatBits(t3)), 0);
+			break;
+		case VK_FORMAT_R16G16_SFLOAT:
+			t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 4));
+			t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 4));
+			t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 4));
+			t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 4));
+
+			// FIXME: shuffles
+			c.x = As<Float4>(halfToFloatBits(t0));
+			c.y = As<Float4>(halfToFloatBits(t1));
+			c.z = As<Float4>(halfToFloatBits(t2));
+			c.w = As<Float4>(halfToFloatBits(t3));
+			transpose4x4(c.x, c.y, c.z, c.w);
+			break;
+		case VK_FORMAT_R16G16B16A16_SFLOAT:
+			t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 8));
+			t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 8));
+			t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 8));
+			t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 8));
+
+			c.x = As<Float4>(halfToFloatBits(t0));
+			c.y = As<Float4>(halfToFloatBits(t1));
+			c.z = As<Float4>(halfToFloatBits(t2));
+			c.w = As<Float4>(halfToFloatBits(t3));
+			transpose4x4(c.x, c.y, c.z, c.w);
+			break;
+		case VK_FORMAT_R32_SFLOAT:
+		case VK_FORMAT_R32_SINT:
+		case VK_FORMAT_R32_UINT:
+		case VK_FORMAT_D32_SFLOAT:
+			// FIXME: Optimal shuffling?
+			c.x.x = *Pointer<Float>(buffer + index[0] * 4);
+			c.x.y = *Pointer<Float>(buffer + index[1] * 4);
+			c.x.z = *Pointer<Float>(buffer + index[2] * 4);
+			c.x.w = *Pointer<Float>(buffer + index[3] * 4);
+			break;
+		case VK_FORMAT_R32G32_SFLOAT:
+		case VK_FORMAT_R32G32_SINT:
+		case VK_FORMAT_R32G32_UINT:
+			// FIXME: Optimal shuffling?
+			c.x.xy = *Pointer<Float4>(buffer + index[0] * 8);
+			c.x.zw = *Pointer<Float4>(buffer + index[1] * 8 - 8);
+			c.z.xy = *Pointer<Float4>(buffer + index[2] * 8);
+			c.z.zw = *Pointer<Float4>(buffer + index[3] * 8 - 8);
+			c.y = c.x;
+			c.x = Float4(c.x.xz, c.z.xz);
+			c.y = Float4(c.y.yw, c.z.yw);
+			break;
+		case VK_FORMAT_R32G32B32_SFLOAT:
+		case VK_FORMAT_R32G32B32_SINT:
+		case VK_FORMAT_R32G32B32_UINT:
+			c.x = *Pointer<Float4>(buffer + index[0] * 16, 16);
+			c.y = *Pointer<Float4>(buffer + index[1] * 16, 16);
+			c.z = *Pointer<Float4>(buffer + index[2] * 16, 16);
+			c.w = *Pointer<Float4>(buffer + index[3] * 16, 16);
+			transpose4x3(c.x, c.y, c.z, c.w);
+			break;
+		case VK_FORMAT_R32G32B32A32_SFLOAT:
+		case VK_FORMAT_R32G32B32A32_SINT:
+		case VK_FORMAT_R32G32B32A32_UINT:
+			c.x = *Pointer<Float4>(buffer + index[0] * 16, 16);
+			c.y = *Pointer<Float4>(buffer + index[1] * 16, 16);
+			c.z = *Pointer<Float4>(buffer + index[2] * 16, 16);
+			c.w = *Pointer<Float4>(buffer + index[3] * 16, 16);
+			transpose4x4(c.x, c.y, c.z, c.w);
+			break;
+		case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
+		{
+			Float4 t;		// TODO: add Insert(UInt4, RValue<UInt>)
+			t.x = *Pointer<Float>(buffer + index[0] * 4);
+			t.y = *Pointer<Float>(buffer + index[1] * 4);
+			t.z = *Pointer<Float>(buffer + index[2] * 4);
+			t.w = *Pointer<Float>(buffer + index[3] * 4);
+			t0 = As<UInt4>(t);
+			c.w = Float4(UInt4(1) << ((t0 >> 27) & UInt4(0x1F))) * Float4(1.0f / (1 << 24));
+			c.x = Float4((t0) & UInt4(0x1FF)) * c.w;
+			c.y = Float4((t0 >> 9) & UInt4(0x1FF)) * c.w;
+			c.z = Float4((t0 >> 18) & UInt4(0x1FF)) * c.w;
+			break;
+		}
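		// E5B9G9R9 packs one shared 5-bit exponent and three 9-bit mantissas:
		// c.w above is 2^exponent / 2^24 (9 mantissa bits plus the format's
		// exponent bias of 15), and each channel is its 9-bit mantissa times
		// that shared scale.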
+		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+		{
+			Float4 t;		// TODO: add Insert(UInt4, RValue<UInt>)
+			t.x = *Pointer<Float>(buffer + index[0] * 4);
+			t.y = *Pointer<Float>(buffer + index[1] * 4);
+			t.z = *Pointer<Float>(buffer + index[2] * 4);
+			t.w = *Pointer<Float>(buffer + index[3] * 4);
+			t0 = As<UInt4>(t);
+			c.x = As<Float4>(halfToFloatBits((t0 << 4) & UInt4(0x7FF0)));
+			c.y = As<Float4>(halfToFloatBits((t0 >> 7) & UInt4(0x7FF0)));
+			c.z = As<Float4>(halfToFloatBits((t0 >> 17) & UInt4(0x7FE0)));
+			break;
+		}
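		// The B10G11R11 shifts align each packed field with the half-float
		// layout (1:5:10): R occupies bits 0-10 as 5:6 exponent:mantissa, so
		// << 4 places its exponent at bits 14-10 and its mantissa at bits
		// 9-4, and halfToFloatBits() then widens the result to full float.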
+		default:
+			UNIMPLEMENTED("Format %d", VkFormat(state.textureFormat));
+		}
+	}
+	else
+	{
+		ASSERT(!isYcbcrFormat());
+
+		Vector4s cs = sampleTexel(index, buffer);
+
+		bool isInteger = state.textureFormat.isNonNormalizedInteger();
+		int componentCount = textureComponentCount();
+		for(int n = 0; n < componentCount; n++)
+		{
+			if(hasUnsignedTextureComponent(n))
 			{
-				// Gather requires the 'footprint' of the texels from which a component is taken, to also mirror around.
-				// Therefore we can't just compute one texel's location and find the other ones at +1 offsets from it.
-				// Here we handle that case separately by doing the mirroring per texel coordinate.
-				// TODO: Mirror the footprint by adjusting the sign of the 0.5f and 1 offsets.
-
-				coord = coord * Float4(dim);
-				coord -= Float4(0.5f);
-				Float4 floor = Floor(coord);
-				xyz0 = Int4(floor);
-
-				if(function.offset != 0)
+				if(isInteger)
 				{
-					xyz0 += As<Int4>(texOffset);
-				}
-
-				xyz1 = xyz0 + Int4(1);
-
-				xyz0 = (maxXYZ) - mirror(mod(xyz0, Int4(2) * dim) - dim);
-				xyz1 = (maxXYZ) - mirror(mod(xyz1, Int4(2) * dim) - dim);
-
-				return;
-			}
-			else
-			{
-				if(function.offset == 0)
-				{
-					switch(addressingMode)
-					{
-					case ADDRESSING_CLAMP:
-					case ADDRESSING_SEAMLESS:
-						// Linear filtering of cube doesn't require clamping because the coordinates
-						// are already in [0, 1] range and numerical imprecision is tolerated.
-						if(addressingMode != ADDRESSING_SEAMLESS || pointFilter)
-						{
-							Float4 one = As<Float4>(Int4(oneBits));
-							coord = Min(Max(coord, Float4(0.0f)), one);
-						}
-						break;
-					case ADDRESSING_MIRROR:
-						{
-							Float4 half = As<Float4>(Int4(halfBits));
-							Float4 one = As<Float4>(Int4(oneBits));
-							Float4 two = As<Float4>(Int4(twoBits));
-							coord = one - Abs(two * Frac(coord * half) - one);
-						}
-						break;
-					case ADDRESSING_MIRRORONCE:
-						{
-							Float4 half = As<Float4>(Int4(halfBits));
-							Float4 one = As<Float4>(Int4(oneBits));
-							Float4 two = As<Float4>(Int4(twoBits));
-							coord = one - Abs(two * Frac(Min(Max(coord, -one), two) * half) - one);
-						}
-						break;
-					case ADDRESSING_BORDER:
-						// Don't map to a valid range here.
-						break;
-					default:   // Wrap
-						coord = Frac(coord);
-						break;
-					}
-				}
-
-				coord = coord * Float4(dim);
-			}
-
-			if(state.textureFilter == FILTER_POINT)
-			{
-				if(addressingMode == ADDRESSING_BORDER || function.offset != 0)
-				{
-					xyz0 = Int4(Floor(coord));
-				}
-				else  // Can't have negative coordinates, so floor() is redundant when casting to int.
-				{
-					xyz0 = Int4(coord);
-				}
-			}
-			else
-			{
-				if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR ||
-				   state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
-				{
-					coord -= As<Float4>(As<Int4>(Float4(0.5f)) & filter);
+					c[n] = As<Float4>(Int4(As<UShort4>(cs[n])));
 				}
 				else
 				{
-					coord -= Float4(0.5f);
+					c[n] = Float4(As<UShort4>(cs[n]));
 				}
-
-				Float4 floor = Floor(coord);
-				xyz0 = Int4(floor);
-				f = coord - floor;
 			}
+			else
+			{
+				if(isInteger)
+				{
+					c[n] = As<Float4>(Int4(cs[n]));
+				}
+				else
+				{
+					c[n] = Float4(cs[n]);
+				}
+			}
+		}
+	}
+
+	if(state.compareEnable)
+	{
+		Float4 ref = z;
+
+		if(!hasFloatTexture())
+		{
+			// D16_UNORM: clamp reference, normalize texel value
+			ref = Min(Max(ref, Float4(0.0f)), Float4(1.0f));
+			c.x = c.x * Float4(1.0f / 0xFFFF);
+		}
+
+		Int4 boolean;
+
+		switch(state.compareOp)
+		{
+		case VK_COMPARE_OP_LESS_OR_EQUAL:    boolean = CmpLE(ref, c.x);  break;
+		case VK_COMPARE_OP_GREATER_OR_EQUAL: boolean = CmpNLT(ref, c.x); break;
+		case VK_COMPARE_OP_LESS:             boolean = CmpLT(ref, c.x);  break;
+		case VK_COMPARE_OP_GREATER:          boolean = CmpNLE(ref, c.x); break;
+		case VK_COMPARE_OP_EQUAL:            boolean = CmpEQ(ref, c.x);  break;
+		case VK_COMPARE_OP_NOT_EQUAL:        boolean = CmpNEQ(ref, c.x); break;
+		case VK_COMPARE_OP_ALWAYS:           boolean = Int4(-1);         break;
+		default:                             ASSERT(false);
+		case VK_COMPARE_OP_NEVER:            boolean = Int4(0);          break;
+		}
+
+		c.x = As<Float4>(boolean & As<Int4>(Float4(1.0f)));
+		c.y = Float4(0.0f);
+		c.z = Float4(0.0f);
+		c.w = Float4(1.0f);
+	}
+
+	if(borderModeActive())
+	{
+		c = replaceBorderTexel(c, valid);
+	}
+
+	return c;
+}
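
For reference, the VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 case in the switch above unpacks a shared-exponent format: a 5-bit exponent in the top bits scales three 9-bit mantissas. A scalar sketch of the same decode (illustrative only, not part of this change):

	#include <cstdint>

	// Scalar equivalent of the vectorized E5B9G9R9 decode: exponent bias 15
	// plus 9 mantissa bits gives the combined 2^exponent / 2^24 scale.
	void decodeE5B9G9R9(uint32_t packed, float rgb[3])
	{
		uint32_t exponent = (packed >> 27) & 0x1F;
		float scale = float(1u << exponent) * (1.0f / (1 << 24));
		rgb[0] = float( packed        & 0x1FF) * scale;
		rgb[1] = float((packed >> 9)  & 0x1FF) * scale;
		rgb[2] = float((packed >> 18) & 0x1FF) * scale;
	}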
+
+Vector4f SamplerCore::replaceBorderTexel(const Vector4f &c, Int4 valid)
+{
+	Int4 borderRGB;
+	Int4 borderA;
+
+	bool scaled = !hasFloatTexture() && !hasUnnormalizedIntegerTexture() && !state.compareEnable;
+	bool sign = !hasUnsignedTextureComponent(0);
+	Int4 float_one = scaled ? As<Int4>(Float4(static_cast<float>(sign ? 0x7FFF : 0xFFFF))) : As<Int4>(Float4(1.0f));
+
+	switch(state.border)
+	{
+	case VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK:
+	case VK_BORDER_COLOR_INT_TRANSPARENT_BLACK:
+		borderRGB = Int4(0);
+		borderA = Int4(0);
+		break;
+	case VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK:
+		borderRGB = Int4(0);
+		borderA = float_one;
+		break;
+	case VK_BORDER_COLOR_INT_OPAQUE_BLACK:
+		borderRGB = Int4(0);
+		borderA = Int4(1);
+		break;
+	case VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE:
+		borderRGB = float_one;
+		borderA = float_one;
+		break;
+	case VK_BORDER_COLOR_INT_OPAQUE_WHITE:
+		borderRGB = Int4(1);
+		borderA = Int4(1);
+		break;
+	default:
+		UNIMPLEMENTED("sint/uint/sfloat border: %u", state.border);
+	}
+
+	Vector4f out;
+	out.x = As<Float4>((valid & As<Int4>(c.x)) | (~valid & borderRGB));
+	out.y = As<Float4>((valid & As<Int4>(c.y)) | (~valid & borderRGB));
+	out.z = As<Float4>((valid & As<Int4>(c.z)) | (~valid & borderRGB));
+	out.w = As<Float4>((valid & As<Int4>(c.w)) | (~valid & borderA));
+
+	return out;
+}
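
replaceBorderTexel() relies on the standard branchless select idiom: 'valid' is an all-ones or all-zeros mask per lane, so (valid & a) | (~valid & b) picks a or b without branching. A scalar sketch of the idea:

	// Branchless select: 'mask' is ~0 where the texel is valid and 0 where
	// the border color must be substituted.
	inline int selectBits(int mask, int ifValid, int ifBorder)
	{
		return (mask & ifValid) | (~mask & ifBorder);
	}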
+
+void SamplerCore::selectMipmap(const Pointer<Byte> &texture, Pointer<Byte> &mipmap, Pointer<Byte> &buffer, const Float &lod, bool secondLOD)
+{
+	Pointer<Byte> mipmap0 = texture + OFFSET(Texture, mipmap[0]);
+
+	if(state.mipmapFilter == MIPMAP_NONE)
+	{
+		mipmap = mipmap0;
+	}
+	else
+	{
+		Int ilod;
+
+		if(state.mipmapFilter == MIPMAP_POINT)
+		{
+			// TODO: Preferred formula is ceil(lod + 0.5) - 1
+			ilod = RoundInt(lod);
+		}
+		else   // MIPMAP_LINEAR
+		{
+			ilod = Int(lod);
+		}
+
+		mipmap = mipmap0 + ilod * sizeof(Mipmap) + secondLOD * sizeof(Mipmap);
+	}
+
+	buffer = *Pointer<Pointer<Byte>>(mipmap + OFFSET(Mipmap, buffer));
+}
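
selectMipmap() indexes a flat array of Mipmap descriptors, taking the next level when sampling the second LOD for trilinear blending. A plain-C++ sketch of the equivalent pointer arithmetic (names illustrative):

	// Equivalent of 'mipmap0 + ilod * sizeof(Mipmap) + secondLOD * sizeof(Mipmap)'.
	const Mipmap *selectLevel(const Mipmap levels[], int ilod, bool secondLOD)
	{
		return &levels[ilod + (secondLOD ? 1 : 0)];
	}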
+
+Int4 SamplerCore::computeFilterOffset(Float &lod)
+{
+	if(state.textureFilter == FILTER_POINT)
+	{
+		return Int4(0);
+	}
+	else if(state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
+	{
+		return CmpNLE(Float4(lod), Float4(0.0f));
+	}
+	else if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR)
+	{
+		return CmpLE(Float4(lod), Float4(0.0f));
+	}
+
+	return Int4(~0);
+}
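
computeFilterOffset() returns a per-lane mask rather than a count: all zeros for point sampling, all ones for linear, and an lod-dependent mask for the mixed min/mag modes. Since an all-ones mask is -1 in two's complement, the later 'xyz1 = xyz0 - filter' yields the second tap only when linear filtering is active. A scalar sketch of that trick:

	// filterMask is 0 (point) or -1 (linear).
	int secondTap(int xyz0, int filterMask)
	{
		return xyz0 - filterMask;   // xyz0 for point, xyz0 + 1 for linear
	}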
+
+Short4 SamplerCore::address(const Float4 &uw, AddressingMode addressingMode, Pointer<Byte> &mipmap)
+{
+	if(addressingMode == ADDRESSING_UNUSED)
+	{
+		return Short4();
+	}
+	else if(addressingMode == ADDRESSING_LAYER)
+	{
+		Int4 dim = *Pointer<Int4>(mipmap + OFFSET(Mipmap, depth));
+		// For cube maps, the layer argument is per cube, each of which has 6 layers
+		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
+		{
+			dim = dim / Int4(6);
+		}
+
+		return Short4(Min(Max(RoundInt(uw), Int4(0)), dim - Int4(1)));
+	}
+	else if(addressingMode == ADDRESSING_CLAMP || addressingMode == ADDRESSING_BORDER)
+	{
+		Float4 clamp = Min(Max(uw, Float4(0.0f)), Float4(65535.0f / 65536.0f));
+
+		return Short4(Int4(clamp * Float4(1 << 16)));
+	}
+	else if(addressingMode == ADDRESSING_MIRROR)
+	{
+		Int4 convert = Int4(uw * Float4(1 << 16));
+		Int4 mirror = (convert << 15) >> 31;
+
+		convert ^= mirror;
+
+		return Short4(convert);
+	}
+	else if(addressingMode == ADDRESSING_MIRRORONCE)
+	{
+		// Absolute value
+		Int4 convert = Int4(Abs(uw * Float4(1 << 16)));
+
+		// Clamp
+		convert -= Int4(0x00008000, 0x00008000, 0x00008000, 0x00008000);
+		convert = As<Int4>(PackSigned(convert, convert));
+
+		return As<Short4>(Int2(convert)) + Short4(0x8000u);
+	}
+	else   // Wrap
+	{
+		return Short4(Int4(uw * Float4(1 << 16)));
+	}
+}
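
The Short4 overload of address() above converts normalized coordinates to 16-bit fixed point, where wrap addressing is free: truncating to 16 bits is exactly coordinate mod 1.0. A scalar sketch, assuming the same 0.16 representation:

	#include <cstdint>

	// Wrap addressing in 0.16 fixed point; the narrowing conversion
	// implements the modulo, including for negative coordinates.
	uint16_t wrapFixedPoint(float u)
	{
		return uint16_t(int32_t(u * 65536.0f));
	}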
+
+// TODO: Eliminate when the gather + mirror addressing case is handled by mirroring the footprint.
+static Int4 mirror(Int4 n)
+{
+	auto positive = CmpNLT(n, Int4(0));
+	return (positive & n) | (~positive & (-(Int4(1) + n)));
+}
+
+static Int4 mod(Int4 n, Int4 d)
+{
+	auto x = n % d;
+	auto positive = CmpNLT(x, Int4(0));
+	return (positive & x) | (~positive & (x + d));
+}
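
mirror() and mod() above are branchless SIMD forms of coordinate reflection and floored modulo. Their scalar logic, for reference:

	int mirrorScalar(int n)       // Reflect negatives: -1 -> 0, -2 -> 1, ...
	{
		return n >= 0 ? n : -(1 + n);
	}

	int modScalar(int n, int d)   // Floored modulo: result always in [0, d)
	{
		int x = n % d;
		return x >= 0 ? x : x + d;
	}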
+
+void SamplerCore::address(const Float4 &uvw, Int4 &xyz0, Int4 &xyz1, Float4 &f, Pointer<Byte> &mipmap, Float4 &texOffset, Int4 &filter, int whd, AddressingMode addressingMode, SamplerFunction function)
+{
+	if(addressingMode == ADDRESSING_UNUSED)
+	{
+		return;
+	}
+
+	Int4 dim = *Pointer<Int4>(mipmap + whd, 16);
+	Int4 maxXYZ = dim - Int4(1);
+
+	if(function == Fetch)
+	{
+		xyz0 = Min(Max(((function.offset != 0) && (addressingMode != ADDRESSING_LAYER)) ? As<Int4>(uvw) + As<Int4>(texOffset) : As<Int4>(uvw), Int4(0)), maxXYZ);
+	}
+	else if(addressingMode == ADDRESSING_LAYER)   // Note: Offset does not apply to array layers
+	{
+		// For cube maps, the layer argument is per cube, each of which has 6 layers
+		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
+		{
+			dim = dim / Int4(6);
+		}
+
+		xyz0 = Min(Max(RoundInt(uvw), Int4(0)), dim - Int4(1));
+	}
+	else if(addressingMode == ADDRESSING_CUBEFACE)
+	{
+		xyz0 = As<Int4>(uvw);
+	}
+	else
+	{
+		const int halfBits = 0x3EFFFFFF;   // Value just under 0.5f
+		const int oneBits  = 0x3F7FFFFF;   // Value just under 1.0f
+		const int twoBits  = 0x3FFFFFFF;   // Value just under 2.0f
+
+		bool pointFilter = state.textureFilter == FILTER_POINT ||
+		                   state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR ||
+		                   state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT;
+
+		Float4 coord = uvw;
+
+		if(state.unnormalizedCoordinates)
+		{
+			switch(addressingMode)
+			{
+			case ADDRESSING_CLAMP:
+				coord = Min(Max(coord, Float4(0.0f)), Float4(dim) * As<Float4>(Int4(oneBits)));
+				break;
+			case ADDRESSING_BORDER:
+				// Don't map to a valid range here.
+				break;
+			default:
+				// If unnormalizedCoordinates is VK_TRUE, addressModeU and addressModeV must each be
+				// either VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE or VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER
+				UNREACHABLE("addressingMode %d", int(addressingMode));
+				break;
+			}
+		}
+		else if(state.textureFilter == FILTER_GATHER && addressingMode == ADDRESSING_MIRROR)
+		{
+			// Gather requires the 'footprint' of texels from which a component is taken to also mirror around the edge.
+			// Therefore we can't just compute one texel's location and obtain the others at +1 offsets from it.
+			// Here we handle that case separately by mirroring each texel coordinate individually.
+			// TODO: Mirror the footprint by adjusting the sign of the 0.5f and 1 offsets.
+
+			coord = coord * Float4(dim);
+			coord -= Float4(0.5f);
+			Float4 floor = Floor(coord);
+			xyz0 = Int4(floor);
 
 			if(function.offset != 0)
 			{
 				xyz0 += As<Int4>(texOffset);
 			}
 
-			if(addressingMode == ADDRESSING_SEAMLESS)  // Adjust for border.
-			{
-				xyz0 += Int4(1);
-			}
+			xyz1 = xyz0 + Int4(1);
 
-			xyz1 = xyz0 - filter;   // Increment
+			xyz0 = (maxXYZ) - mirror(mod(xyz0, Int4(2) * dim) - dim);
+			xyz1 = (maxXYZ) - mirror(mod(xyz1, Int4(2) * dim) - dim);
 
-			if(addressingMode == ADDRESSING_BORDER)
-			{
-				// Replace the coordinates with -1 if they're out of range.
-				Int4 border0 = CmpLT(xyz0, Int4(0)) | CmpNLT(xyz0, dim);
-				Int4 border1 = CmpLT(xyz1, Int4(0)) | CmpNLT(xyz1, dim);
-				xyz0 |= border0;
-				xyz1 |= border1;
-			}
-			else if(function.offset != 0)
+			return;
+		}
+		else
+		{
+			if(function.offset == 0)
 			{
 				switch(addressingMode)
 				{
-				case ADDRESSING_SEAMLESS:
-					UNREACHABLE("addressingMode %d", int(addressingMode));  // Cube sampling doesn't support offset.
-				case ADDRESSING_MIRROR:
-				case ADDRESSING_MIRRORONCE:
-					// TODO: Implement ADDRESSING_MIRROR and ADDRESSING_MIRRORONCE.
-					// Fall through to Clamp.
 				case ADDRESSING_CLAMP:
-					xyz0 = Min(Max(xyz0, Int4(0)), maxXYZ);
-					xyz1 = Min(Max(xyz1, Int4(0)), maxXYZ);
-					break;
-				default:   // Wrap
-					xyz0 = mod(xyz0, dim);
-					xyz1 = mod(xyz1, dim);
-					break;
-				}
-			}
-			else if(state.textureFilter != FILTER_POINT)
-			{
-				switch(addressingMode)
-				{
 				case ADDRESSING_SEAMLESS:
-					break;
-				case ADDRESSING_MIRROR:
-				case ADDRESSING_MIRRORONCE:
-				case ADDRESSING_CLAMP:
-					xyz0 = Max(xyz0, Int4(0));
-					xyz1 = Min(xyz1, maxXYZ);
-					break;
-				default:   // Wrap
+					// Linear filtering of cube maps doesn't require clamping because the
+					// coordinates are already in the [0, 1] range and numerical imprecision is tolerated.
+					if(addressingMode != ADDRESSING_SEAMLESS || pointFilter)
 					{
-						Int4 under = CmpLT(xyz0, Int4(0));
-						xyz0 = (under & maxXYZ) | (~under & xyz0);   // xyz < 0 ? dim - 1 : xyz   // TODO: IfThenElse()
-
-						Int4 nover = CmpLT(xyz1, dim);
-						xyz1 = nover & xyz1;   // xyz >= dim ? 0 : xyz
+						Float4 one = As<Float4>(Int4(oneBits));
+						coord = Min(Max(coord, Float4(0.0f)), one);
 					}
 					break;
+				case ADDRESSING_MIRROR:
+					{
+						Float4 half = As<Float4>(Int4(halfBits));
+						Float4 one = As<Float4>(Int4(oneBits));
+						Float4 two = As<Float4>(Int4(twoBits));
+						coord = one - Abs(two * Frac(coord * half) - one);
+					}
+					break;
+				case ADDRESSING_MIRRORONCE:
+					{
+						Float4 half = As<Float4>(Int4(halfBits));
+						Float4 one = As<Float4>(Int4(oneBits));
+						Float4 two = As<Float4>(Int4(twoBits));
+						coord = one - Abs(two * Frac(Min(Max(coord, -one), two) * half) - one);
+					}
+					break;
+				case ADDRESSING_BORDER:
+					// Don't map to a valid range here.
+					break;
+				default:   // Wrap
+					coord = Frac(coord);
+					break;
 				}
 			}
+
+			coord = coord * Float4(dim);
 		}
-	}
 
-	void SamplerCore::convertSigned15(Float4 &cf, Short4 &cs)
-	{
-		cf = Float4(cs) * Float4(1.0f / 0x7FFF);
-	}
-
-	void SamplerCore::convertUnsigned16(Float4 &cf, Short4 &cs)
-	{
-		cf = Float4(As<UShort4>(cs)) * Float4(1.0f / 0xFFFF);
-	}
-
-	void SamplerCore::sRGBtoLinear16_8_16(Short4 &c)
-	{
-		c = As<UShort4>(c) >> 8;
-
-		Pointer<Byte> LUT = Pointer<Byte>(constants + OFFSET(Constants,sRGBtoLinear8_16));
-
-		c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 0))), 0);
-		c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 1))), 1);
-		c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 2))), 2);
-		c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 3))), 3);
-	}
-
-	bool SamplerCore::hasFloatTexture() const
-	{
-		return state.textureFormat.isFloatFormat();
-	}
-
-	bool SamplerCore::hasUnnormalizedIntegerTexture() const
-	{
-		return state.textureFormat.isNonNormalizedInteger();
-	}
-
-	bool SamplerCore::hasUnsignedTextureComponent(int component) const
-	{
-		return state.textureFormat.isUnsignedComponent(component);
-	}
-
-	int SamplerCore::textureComponentCount() const
-	{
-		return state.textureFormat.componentCount();
-	}
-
-	bool SamplerCore::hasThirdCoordinate() const
-	{
-		return (state.textureType == VK_IMAGE_VIEW_TYPE_3D) ||
-		       (state.textureType == VK_IMAGE_VIEW_TYPE_2D_ARRAY) ||
-		       (state.textureType == VK_IMAGE_VIEW_TYPE_1D_ARRAY);  // Treated as 2D texture with second coordinate 0. TODO(b/134669567)
-	}
-
-	bool SamplerCore::has16bitTextureFormat() const
-	{
-		return state.textureFormat.has16bitTextureFormat();
-	}
-
-	bool SamplerCore::has8bitTextureComponents() const
-	{
-		return state.textureFormat.has8bitTextureComponents();
-	}
-
-	bool SamplerCore::has16bitTextureComponents() const
-	{
-		return state.textureFormat.has16bitTextureComponents();
-	}
-
-	bool SamplerCore::has32bitIntegerTextureComponents() const
-	{
-		return state.textureFormat.has32bitIntegerTextureComponents();
-	}
-
-	bool SamplerCore::isYcbcrFormat() const
-	{
-		return state.textureFormat.isYcbcrFormat();
-	}
-
-	bool SamplerCore::isRGBComponent(int component) const
-	{
-		return state.textureFormat.isRGBComponent(component);
-	}
-
-	bool SamplerCore::borderModeActive() const
-	{
-		return state.addressingModeU == ADDRESSING_BORDER ||
-		       state.addressingModeV == ADDRESSING_BORDER ||
-		       state.addressingModeW == ADDRESSING_BORDER;
-	}
-
-	bool SamplerCore::isCube() const
-	{
-		return state.textureType == VK_IMAGE_VIEW_TYPE_CUBE ||
-		       state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY;
-	}
-
-	VkComponentSwizzle SamplerCore::gatherSwizzle() const
-	{
-		switch(state.gatherComponent)
+		if(state.textureFilter == FILTER_POINT)
 		{
-		case 0: return state.swizzle.r;
-		case 1: return state.swizzle.g;
-		case 2: return state.swizzle.b;
-		case 3: return state.swizzle.a;
-		default:
-			UNREACHABLE("Invalid component");
-			return VK_COMPONENT_SWIZZLE_R;
+			if(addressingMode == ADDRESSING_BORDER || function.offset != 0)
+			{
+				xyz0 = Int4(Floor(coord));
+			}
+			else  // Can't have negative coordinates, so floor() is redundant when casting to int.
+			{
+				xyz0 = Int4(coord);
+			}
+		}
+		else
+		{
+			if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR ||
+			   state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
+			{
+				coord -= As<Float4>(As<Int4>(Float4(0.5f)) & filter);
+			}
+			else
+			{
+				coord -= Float4(0.5f);
+			}
+
+			Float4 floor = Floor(coord);
+			xyz0 = Int4(floor);
+			f = coord - floor;
+		}
+
+		if(function.offset != 0)
+		{
+			xyz0 += As<Int4>(texOffset);
+		}
+
+		if(addressingMode == ADDRESSING_SEAMLESS)  // Adjust for border.
+		{
+			xyz0 += Int4(1);
+		}
+
+		xyz1 = xyz0 - filter;   // Increment
+
+		if(addressingMode == ADDRESSING_BORDER)
+		{
+			// Replace the coordinates with -1 if they're out of range.
+			Int4 border0 = CmpLT(xyz0, Int4(0)) | CmpNLT(xyz0, dim);
+			Int4 border1 = CmpLT(xyz1, Int4(0)) | CmpNLT(xyz1, dim);
+			xyz0 |= border0;
+			xyz1 |= border1;
+		}
+		else if(function.offset != 0)
+		{
+			switch(addressingMode)
+			{
+			case ADDRESSING_SEAMLESS:
+				UNREACHABLE("addressingMode %d", int(addressingMode));  // Cube sampling doesn't support offset.
+			case ADDRESSING_MIRROR:
+			case ADDRESSING_MIRRORONCE:
+				// TODO: Implement ADDRESSING_MIRROR and ADDRESSING_MIRRORONCE.
+				// Fall through to Clamp.
+			case ADDRESSING_CLAMP:
+				xyz0 = Min(Max(xyz0, Int4(0)), maxXYZ);
+				xyz1 = Min(Max(xyz1, Int4(0)), maxXYZ);
+				break;
+			default:   // Wrap
+				xyz0 = mod(xyz0, dim);
+				xyz1 = mod(xyz1, dim);
+				break;
+			}
+		}
+		else if(state.textureFilter != FILTER_POINT)
+		{
+			switch(addressingMode)
+			{
+			case ADDRESSING_SEAMLESS:
+				break;
+			case ADDRESSING_MIRROR:
+			case ADDRESSING_MIRRORONCE:
+			case ADDRESSING_CLAMP:
+				xyz0 = Max(xyz0, Int4(0));
+				xyz1 = Min(xyz1, maxXYZ);
+				break;
+			default:   // Wrap
+				{
+					Int4 under = CmpLT(xyz0, Int4(0));
+					xyz0 = (under & maxXYZ) | (~under & xyz0);   // xyz < 0 ? dim - 1 : xyz   // TODO: IfThenElse()
+
+					Int4 nover = CmpLT(xyz1, dim);
+					xyz1 = nover & xyz1;   // xyz >= dim ? 0 : xyz
+				}
+				break;
+			}
 		}
 	}
 }
+
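
The gather-with-mirror special case in address() folds each texel coordinate onto the valid range independently, since the 2x2 footprint itself has to reflect at the edges. A scalar sketch of the folding applied to xyz0 and xyz1 above:

	// Fold an unbounded texel coordinate onto [0, dim) under mirror addressing.
	int mirrorTexel(int x, int dim)
	{
		int m = x % (2 * dim);
		if(m < 0) m += 2 * dim;   // floored modulo into [0, 2*dim)
		m -= dim;                 // now in [-dim, dim)
		if(m < 0) m = -(1 + m);   // reflect negatives: -1 -> 0, -dim -> dim-1
		return (dim - 1) - m;
	}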
+void SamplerCore::convertSigned15(Float4 &cf, Short4 &cs)
+{
+	cf = Float4(cs) * Float4(1.0f / 0x7FFF);
+}
+
+void SamplerCore::convertUnsigned16(Float4 &cf, Short4 &cs)
+{
+	cf = Float4(As<UShort4>(cs)) * Float4(1.0f / 0xFFFF);
+}
+
+void SamplerCore::sRGBtoLinear16_8_16(Short4 &c)
+{
+	c = As<UShort4>(c) >> 8;
+
+	Pointer<Byte> LUT = Pointer<Byte>(constants + OFFSET(Constants,sRGBtoLinear8_16));
+
+	c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 0))), 0);
+	c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 1))), 1);
+	c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 2))), 2);
+	c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 3))), 3);
+}
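
sRGBtoLinear16_8_16() reduces each 16-bit channel to an 8-bit index and replaces it through a 256-entry table holding linearized values at 16-bit precision. A sketch of how such a table could be generated (illustrative; the real table is baked into Constants):

	#include <cmath>
	#include <cstdint>

	// Hypothetical construction of an sRGB -> linear table with 8-bit
	// indices and 16-bit results, matching the lookup above.
	void buildSRGBtoLinear8_16(uint16_t table[256])
	{
		for(int i = 0; i < 256; i++)
		{
			float c = i / 255.0f;
			float linear = (c <= 0.04045f) ? c / 12.92f
			                               : std::pow((c + 0.055f) / 1.055f, 2.4f);
			table[i] = uint16_t(linear * 0xFFFF + 0.5f);
		}
	}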
+
+bool SamplerCore::hasFloatTexture() const
+{
+	return state.textureFormat.isFloatFormat();
+}
+
+bool SamplerCore::hasUnnormalizedIntegerTexture() const
+{
+	return state.textureFormat.isNonNormalizedInteger();
+}
+
+bool SamplerCore::hasUnsignedTextureComponent(int component) const
+{
+	return state.textureFormat.isUnsignedComponent(component);
+}
+
+int SamplerCore::textureComponentCount() const
+{
+	return state.textureFormat.componentCount();
+}
+
+bool SamplerCore::hasThirdCoordinate() const
+{
+	return (state.textureType == VK_IMAGE_VIEW_TYPE_3D) ||
+	       (state.textureType == VK_IMAGE_VIEW_TYPE_2D_ARRAY) ||
+	       (state.textureType == VK_IMAGE_VIEW_TYPE_1D_ARRAY);  // Treated as 2D texture with second coordinate 0. TODO(b/134669567)
+}
+
+bool SamplerCore::has16bitTextureFormat() const
+{
+	return state.textureFormat.has16bitTextureFormat();
+}
+
+bool SamplerCore::has8bitTextureComponents() const
+{
+	return state.textureFormat.has8bitTextureComponents();
+}
+
+bool SamplerCore::has16bitTextureComponents() const
+{
+	return state.textureFormat.has16bitTextureComponents();
+}
+
+bool SamplerCore::has32bitIntegerTextureComponents() const
+{
+	return state.textureFormat.has32bitIntegerTextureComponents();
+}
+
+bool SamplerCore::isYcbcrFormat() const
+{
+	return state.textureFormat.isYcbcrFormat();
+}
+
+bool SamplerCore::isRGBComponent(int component) const
+{
+	return state.textureFormat.isRGBComponent(component);
+}
+
+bool SamplerCore::borderModeActive() const
+{
+	return state.addressingModeU == ADDRESSING_BORDER ||
+	       state.addressingModeV == ADDRESSING_BORDER ||
+	       state.addressingModeW == ADDRESSING_BORDER;
+}
+
+bool SamplerCore::isCube() const
+{
+	return state.textureType == VK_IMAGE_VIEW_TYPE_CUBE ||
+	       state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY;
+}
+
+VkComponentSwizzle SamplerCore::gatherSwizzle() const
+{
+	switch(state.gatherComponent)
+	{
+	case 0: return state.swizzle.r;
+	case 1: return state.swizzle.g;
+	case 2: return state.swizzle.b;
+	case 3: return state.swizzle.a;
+	default:
+		UNREACHABLE("Invalid component");
+		return VK_COMPONENT_SWIZZLE_R;
+	}
+}
+
+}  // namespace sw
diff --git a/src/Pipeline/SamplerCore.hpp b/src/Pipeline/SamplerCore.hpp
index fc0f314..396a9f6 100644
--- a/src/Pipeline/SamplerCore.hpp
+++ b/src/Pipeline/SamplerCore.hpp
@@ -24,114 +24,117 @@
 #undef None  // b/127920555
 #endif
 
-namespace sw
+namespace sw {
+
+using namespace rr;
+
+enum SamplerMethod : uint32_t
 {
-	using namespace rr;
+	Implicit,  // Compute gradients (pixel shader only).
+	Bias,      // Compute gradients and add provided bias.
+	Lod,       // Use provided LOD.
+	Grad,      // Use provided gradients.
+	Fetch,     // Use provided integer coordinates.
+	Base,      // Sample base level.
+	Query,     // Return implicit LOD.
+	Gather,    // Return one channel of each texel in footprint.
+	SAMPLER_METHOD_LAST = Gather,
+};
 
-	enum SamplerMethod : uint32_t
-	{
-		Implicit,  // Compute gradients (pixel shader only).
-		Bias,      // Compute gradients and add provided bias.
-		Lod,       // Use provided LOD.
-		Grad,      // Use provided gradients.
-		Fetch,     // Use provided integer coordinates.
-		Base,      // Sample base level.
-		Query,     // Return implicit LOD.
-		Gather,    // Return one channel of each texel in footprint.
-		SAMPLER_METHOD_LAST = Gather,
-	};
+// TODO(b/129523279): Eliminate and use SpirvShader::ImageInstruction instead.
+struct SamplerFunction
+{
+	SamplerFunction(SamplerMethod method, bool offset = false, bool sample = false)
+		: method(method), offset(offset), sample(sample)
+	{}
 
-	// TODO(b/129523279): Eliminate and use SpirvShader::ImageInstruction instead.
-	struct SamplerFunction
-	{
-		SamplerFunction(SamplerMethod method, bool offset = false, bool sample = false)
-			: method(method), offset(offset), sample(sample)
-		{}
+	operator SamplerMethod() { return method; }
 
-		operator SamplerMethod() { return method; }
+	const SamplerMethod method;
+	const bool offset;
+	const bool sample;
+};
 
-		const SamplerMethod method;
-		const bool offset;
-		const bool sample;
- 	};
+class SamplerCore
+{
+public:
+	SamplerCore(Pointer<Byte> &constants, const Sampler &state);
 
-	class SamplerCore
-	{
-	public:
-		SamplerCore(Pointer<Byte> &constants, const Sampler &state);
+	Vector4f sampleTexture(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float4 uvw[4], Float4 &q, Float &&lodOrBias, Float4 &dsx, Float4 &dsy, Vector4f &offset, Int4& sampleId, SamplerFunction function);
 
-		Vector4f sampleTexture(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float4 uvw[4], Float4 &q, Float &&lodOrBias, Float4 &dsx, Float4 &dsy, Vector4f &offset, Int4& sampleId, SamplerFunction function);
+private:
+	Short4 offsetSample(Short4 &uvw, Pointer<Byte> &mipmap, int halfOffset, bool wrap, int count, Float &lod);
+	Vector4s sampleFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, SamplerFunction function);
+	Vector4s sampleAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD, SamplerFunction function);
+	Vector4s sampleQuad(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function);
+	Vector4s sampleQuad2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function);
+	Vector4s sample3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function);
+	Vector4f sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, SamplerFunction function);
+	Vector4f sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD, SamplerFunction function);
+	Vector4f sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function);
+	Vector4f sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function);
+	Vector4f sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function);
+	Float log2sqrt(Float lod);
+	Float log2(Float lod);
+	void computeLod(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &u, Float4 &v, Float4 &dsx, Float4 &dsy, SamplerFunction function);
+	void computeLodCube(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float4 &dsx, Float4 &dsy, Float4 &M, SamplerFunction function);
+	void computeLod3D(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float4 &dsx, Float4 &dsy, SamplerFunction function);
+	Int4 cubeFace(Float4 &U, Float4 &V, Float4 &x, Float4 &y, Float4 &z, Float4 &M);
+	Short4 applyOffset(Short4 &uvw, Float4 &offset, const Int4 &whd, AddressingMode mode);
+	void computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, Vector4f &offset, const Pointer<Byte> &mipmap, const Short4& cubeArrayId, const Int4& sampleId, SamplerFunction function);
+	void computeIndices(UInt index[4], Int4 uuuu, Int4 vvvv, Int4 wwww, Int4 valid, const Pointer<Byte> &mipmap, const Int4& cubeArrayId, const Int4& sampleId, SamplerFunction function);
+	Vector4s sampleTexel(Short4 &u, Short4 &v, Short4 &s, Vector4f &offset, Pointer<Byte> &mipmap, const Short4& cubeArrayId, const Int4& sampleId, Pointer<Byte> buffer, SamplerFunction function);
+	Vector4s sampleTexel(UInt index[4], Pointer<Byte> buffer);
+	Vector4f sampleTexel(Int4 &u, Int4 &v, Int4 &s, Float4 &z, Pointer<Byte> &mipmap, const Int4& cubeArrayId, const Int4& sampleId, Pointer<Byte> buffer, SamplerFunction function);
+	Vector4f replaceBorderTexel(const Vector4f &c, Int4 valid);
+	void selectMipmap(const Pointer<Byte> &texture, Pointer<Byte> &mipmap, Pointer<Byte> &buffer, const Float &lod, bool secondLOD);
+	Short4 address(const Float4 &uw, AddressingMode addressingMode, Pointer<Byte>& mipmap);
+	void address(const Float4 &uw, Int4& xyz0, Int4& xyz1, Float4& f, Pointer<Byte>& mipmap, Float4 &texOffset, Int4 &filter, int whd, AddressingMode addressingMode, SamplerFunction function);
+	Int4 computeFilterOffset(Float &lod);
 
-	private:
-		Short4 offsetSample(Short4 &uvw, Pointer<Byte> &mipmap, int halfOffset, bool wrap, int count, Float &lod);
-		Vector4s sampleFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, SamplerFunction function);
-		Vector4s sampleAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD, SamplerFunction function);
-		Vector4s sampleQuad(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function);
-		Vector4s sampleQuad2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function);
-		Vector4s sample3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function);
-		Vector4f sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, SamplerFunction function);
-		Vector4f sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD, SamplerFunction function);
-		Vector4f sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function);
-		Vector4f sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function);
-		Vector4f sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, const Float4& cubeArrayCoord, const Int4& sampleId, Float &lod, bool secondLOD, SamplerFunction function);
-		Float log2sqrt(Float lod);
-		Float log2(Float lod);
-		void computeLod(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &u, Float4 &v, Float4 &dsx, Float4 &dsy, SamplerFunction function);
-		void computeLodCube(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float4 &dsx, Float4 &dsy, Float4 &M, SamplerFunction function);
-		void computeLod3D(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float4 &dsx, Float4 &dsy, SamplerFunction function);
-		Int4 cubeFace(Float4 &U, Float4 &V, Float4 &x, Float4 &y, Float4 &z, Float4 &M);
-		Short4 applyOffset(Short4 &uvw, Float4 &offset, const Int4 &whd, AddressingMode mode);
-		void computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, Vector4f &offset, const Pointer<Byte> &mipmap, const Short4& cubeArrayId, const Int4& sampleId, SamplerFunction function);
-		void computeIndices(UInt index[4], Int4 uuuu, Int4 vvvv, Int4 wwww, Int4 valid, const Pointer<Byte> &mipmap, const Int4& cubeArrayId, const Int4& sampleId, SamplerFunction function);
-		Vector4s sampleTexel(Short4 &u, Short4 &v, Short4 &s, Vector4f &offset, Pointer<Byte> &mipmap, const Short4& cubeArrayId, const Int4& sampleId, Pointer<Byte> buffer, SamplerFunction function);
-		Vector4s sampleTexel(UInt index[4], Pointer<Byte> buffer);
-		Vector4f sampleTexel(Int4 &u, Int4 &v, Int4 &s, Float4 &z, Pointer<Byte> &mipmap, const Int4& cubeArrayId, const Int4& sampleId, Pointer<Byte> buffer, SamplerFunction function);
-		Vector4f replaceBorderTexel(const Vector4f &c, Int4 valid);
-		void selectMipmap(const Pointer<Byte> &texture, Pointer<Byte> &mipmap, Pointer<Byte> &buffer, const Float &lod, bool secondLOD);
-		Short4 address(const Float4 &uw, AddressingMode addressingMode, Pointer<Byte>& mipmap);
-		void address(const Float4 &uw, Int4& xyz0, Int4& xyz1, Float4& f, Pointer<Byte>& mipmap, Float4 &texOffset, Int4 &filter, int whd, AddressingMode addressingMode, SamplerFunction function);
-		Int4 computeFilterOffset(Float &lod);
+	void convertSigned15(Float4 &cf, Short4 &ci);
+	void convertUnsigned16(Float4 &cf, Short4 &ci);
+	void sRGBtoLinear16_8_16(Short4 &c);
 
-		void convertSigned15(Float4 &cf, Short4 &ci);
-		void convertUnsigned16(Float4 &cf, Short4 &ci);
-		void sRGBtoLinear16_8_16(Short4 &c);
+	bool hasFloatTexture() const;
+	bool hasUnnormalizedIntegerTexture() const;
+	bool hasUnsignedTextureComponent(int component) const;
+	int textureComponentCount() const;
+	bool hasThirdCoordinate() const;
+	bool has16bitTextureFormat() const;
+	bool has8bitTextureComponents() const;
+	bool has16bitTextureComponents() const;
+	bool has32bitIntegerTextureComponents() const;
+	bool isYcbcrFormat() const;
+	bool isRGBComponent(int component) const;
+	bool borderModeActive() const;
+	bool isCube() const;
+	VkComponentSwizzle gatherSwizzle() const;
 
-		bool hasFloatTexture() const;
-		bool hasUnnormalizedIntegerTexture() const;
-		bool hasUnsignedTextureComponent(int component) const;
-		int textureComponentCount() const;
-		bool hasThirdCoordinate() const;
-		bool has16bitTextureFormat() const;
-		bool has8bitTextureComponents() const;
-		bool has16bitTextureComponents() const;
-		bool has32bitIntegerTextureComponents() const;
-		bool isYcbcrFormat() const;
-		bool isRGBComponent(int component) const;
-		bool borderModeActive() const;
-		bool isCube() const;
-		VkComponentSwizzle gatherSwizzle() const;
+	Pointer<Byte> &constants;
+	const Sampler &state;
+};
 
-		Pointer<Byte> &constants;
-		const Sampler &state;
-	};
-}
+}  // namespace sw
 
 #ifdef ENABLE_RR_PRINT
 namespace rr {
-	template <> struct PrintValue::Ty<sw::SamplerFunction>
-	{
-		static std::string fmt(const sw::SamplerFunction& v)
-		{
-			return std::string("SamplerFunction[") +
-				"method: " + std::to_string(v.method) +
-				", offset: " + std::to_string(v.offset) +
-				", sample: " + std::to_string(v.sample) +
-				"]";
-		}
 
-		static std::vector<rr::Value*> val(const sw::SamplerFunction& v) { return {}; }
-	};
-}
+template <> struct PrintValue::Ty<sw::SamplerFunction>
+{
+	static std::string fmt(const sw::SamplerFunction& v)
+	{
+		return std::string("SamplerFunction[") +
+			"method: " + std::to_string(v.method) +
+			", offset: " + std::to_string(v.offset) +
+			", sample: " + std::to_string(v.sample) +
+			"]";
+	}
+
+	static std::vector<rr::Value*> val(const sw::SamplerFunction& v) { return {}; }
+};
+
+}  // namespace rr
 #endif // ENABLE_RR_PRINT
 
 #endif   // sw_SamplerCore_hpp
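
As a usage illustration of the header above (hypothetical call site), a SamplerFunction is built once per sampling instruction and compares implicitly against SamplerMethod:

	sw::SamplerFunction function(sw::Fetch, /*offset=*/true);

	if(function == sw::Fetch)
	{
		// texelFetch path: integer coordinates, no filtering.
	}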
diff --git a/src/Pipeline/SetupRoutine.cpp b/src/Pipeline/SetupRoutine.cpp
index dbe9feb..c9e643d 100644
--- a/src/Pipeline/SetupRoutine.cpp
+++ b/src/Pipeline/SetupRoutine.cpp
@@ -21,613 +21,614 @@
 #include "Device/Renderer.hpp"
 #include "Reactor/Reactor.hpp"
 
-namespace sw
+namespace sw {
+
+SetupRoutine::SetupRoutine(const SetupProcessor::State &state) : state(state)
 {
-	SetupRoutine::SetupRoutine(const SetupProcessor::State &state) : state(state)
-	{
-	}
+}
 
-	SetupRoutine::~SetupRoutine()
-	{
-	}
+SetupRoutine::~SetupRoutine()
+{
+}
 
-	void SetupRoutine::generate()
+void SetupRoutine::generate()
+{
+	SetupFunction function;
 	{
-		SetupFunction function;
+		Pointer<Byte> primitive(function.Arg<0>());
+		Pointer<Byte> tri(function.Arg<1>());
+		Pointer<Byte> polygon(function.Arg<2>());
+		Pointer<Byte> data(function.Arg<3>());
+
+		Pointer<Byte> constants = *Pointer<Pointer<Byte> >(data + OFFSET(DrawData,constants));
+
+		const bool point = state.isDrawPoint;
+		const bool line = state.isDrawLine;
+		const bool triangle = state.isDrawTriangle;
+
+		const int V0 = OFFSET(Triangle,v0);
+		const int V1 = (triangle || line) ? OFFSET(Triangle,v1) : OFFSET(Triangle,v0);
+		const int V2 = triangle ? OFFSET(Triangle,v2) : (line ? OFFSET(Triangle,v1) : OFFSET(Triangle,v0));
+
+		Pointer<Byte> v0 = tri + V0;
+		Pointer<Byte> v1 = tri + V1;
+		Pointer<Byte> v2 = tri + V2;
+
+		Array<Int> X(16);
+		Array<Int> Y(16);
+
+		X[0] = *Pointer<Int>(v0 + OFFSET(Vertex,projected.x));
+		X[1] = *Pointer<Int>(v1 + OFFSET(Vertex,projected.x));
+		X[2] = *Pointer<Int>(v2 + OFFSET(Vertex,projected.x));
+
+		Y[0] = *Pointer<Int>(v0 + OFFSET(Vertex,projected.y));
+		Y[1] = *Pointer<Int>(v1 + OFFSET(Vertex,projected.y));
+		Y[2] = *Pointer<Int>(v2 + OFFSET(Vertex,projected.y));
+
+		Int d = 1;     // Winding direction
+
+		// Culling
+		if(triangle)
 		{
-			Pointer<Byte> primitive(function.Arg<0>());
-			Pointer<Byte> tri(function.Arg<1>());
-			Pointer<Byte> polygon(function.Arg<2>());
-			Pointer<Byte> data(function.Arg<3>());
+			Float x0 = Float(X[0]);
+			Float x1 = Float(X[1]);
+			Float x2 = Float(X[2]);
 
-			Pointer<Byte> constants = *Pointer<Pointer<Byte> >(data + OFFSET(DrawData,constants));
+			Float y0 = Float(Y[0]);
+			Float y1 = Float(Y[1]);
+			Float y2 = Float(Y[2]);
 
-			const bool point = state.isDrawPoint;
-			const bool line = state.isDrawLine;
-			const bool triangle = state.isDrawTriangle;
+			Float A = (y0 - y2) * x1 + (y2 - y1) * x0 + (y1 - y0) * x2;   // Area
 
-			const int V0 = OFFSET(Triangle,v0);
-			const int V1 = (triangle || line) ? OFFSET(Triangle,v1) : OFFSET(Triangle,v0);
-			const int V2 = triangle ? OFFSET(Triangle,v2) : (line ? OFFSET(Triangle,v1) : OFFSET(Triangle,v0));
-
-			Pointer<Byte> v0 = tri + V0;
-			Pointer<Byte> v1 = tri + V1;
-			Pointer<Byte> v2 = tri + V2;
-
-			Array<Int> X(16);
-			Array<Int> Y(16);
-
-			X[0] = *Pointer<Int>(v0 + OFFSET(Vertex,projected.x));
-			X[1] = *Pointer<Int>(v1 + OFFSET(Vertex,projected.x));
-			X[2] = *Pointer<Int>(v2 + OFFSET(Vertex,projected.x));
-
-			Y[0] = *Pointer<Int>(v0 + OFFSET(Vertex,projected.y));
-			Y[1] = *Pointer<Int>(v1 + OFFSET(Vertex,projected.y));
-			Y[2] = *Pointer<Int>(v2 + OFFSET(Vertex,projected.y));
-
-			Int d = 1;     // Winding direction
-
-			// Culling
-			if(triangle)
+			If(A == 0.0f)
 			{
-				Float x0 = Float(X[0]);
-				Float x1 = Float(X[1]);
-				Float x2 = Float(X[2]);
-
-				Float y0 = Float(Y[0]);
-				Float y1 = Float(Y[1]);
-				Float y2 = Float(Y[2]);
-
-				Float A = (y0 - y2) * x1 + (y2 - y1) * x0 + (y1 - y0) * x2;   // Area
-
-				If(A == 0.0f)
-				{
-					Return(0);
-				}
-
-				Int w0w1w2 = *Pointer<Int>(v0 + OFFSET(Vertex, w)) ^
-				             *Pointer<Int>(v1 + OFFSET(Vertex, w)) ^
-				             *Pointer<Int>(v2 + OFFSET(Vertex, w));
-
-				A = IfThenElse(w0w1w2 < 0, -A, A);
-
-				Bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? A > 0.0f : A < 0.0f;
-
-				if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
-				{
-					If(frontFacing) Return(0);
-				}
-				if(state.cullMode & VK_CULL_MODE_BACK_BIT)
-				{
-					If(!frontFacing) Return(0);
-				}
-
-				d = IfThenElse(A > 0.0f, d, Int(0));
-
-				If(frontFacing)
-				{
-					*Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)) = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-					*Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)) = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-				}
-				Else
-				{
-					*Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)) = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-					*Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)) = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-				}
+				Return(0);
 			}
-			else
+
+			Int w0w1w2 = *Pointer<Int>(v0 + OFFSET(Vertex, w)) ^
+			             *Pointer<Int>(v1 + OFFSET(Vertex, w)) ^
+			             *Pointer<Int>(v2 + OFFSET(Vertex, w));
+
+			A = IfThenElse(w0w1w2 < 0, -A, A);
+
+			Bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? A > 0.0f : A < 0.0f;
+
+			if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
+			{
+				If(frontFacing) Return(0);
+			}
+			if(state.cullMode & VK_CULL_MODE_BACK_BIT)
+			{
+				If(!frontFacing) Return(0);
+			}
+
+			d = IfThenElse(A > 0.0f, d, Int(0));
+
+			If(frontFacing)
 			{
 				*Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)) = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
 				*Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)) = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
 			}
-
-			Int n = *Pointer<Int>(polygon + OFFSET(Polygon,n));
-			Int m = *Pointer<Int>(polygon + OFFSET(Polygon,i));
-
-			If(m != 0 || Bool(!triangle))   // Clipped triangle; reproject
+			Else
 			{
-				Pointer<Byte> V = polygon + OFFSET(Polygon,P) + m * sizeof(void*) * 16;
-
-				Int i = 0;
-
-				Do
-				{
-					Pointer<Float4> p = *Pointer<Pointer<Float4> >(V + i * sizeof(void*));
-					Float4 v = *Pointer<Float4>(p, 16);
-
-					Float w = v.w;
-					Float rhw = IfThenElse(w != 0.0f, 1.0f / w, Float(1.0f));
-
-					X[i] = RoundInt(*Pointer<Float>(data + OFFSET(DrawData,X0xF)) + v.x * rhw * *Pointer<Float>(data + OFFSET(DrawData,WxF)));
-					Y[i] = RoundInt(*Pointer<Float>(data + OFFSET(DrawData,Y0xF)) + v.y * rhw * *Pointer<Float>(data + OFFSET(DrawData,HxF)));
-
-					i++;
-				}
-				Until(i >= n)
+				*Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)) = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+				*Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)) = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
 			}
+		}
+		else
+		{
+			*Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)) = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+			*Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)) = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+		}
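
The culling block above uses the standard signed-area test: A is twice the signed area of the projected triangle (up to sign convention), so its sign encodes the winding and zero means a degenerate primitive. A scalar sketch:

	// Twice the signed area of triangle (x0,y0)-(x1,y1)-(x2,y2); the sign
	// flips with the winding, and zero indicates a degenerate triangle.
	float signedArea2(float x0, float y0, float x1, float y1,
	                  float x2, float y2)
	{
		return (y0 - y2) * x1 + (y2 - y1) * x0 + (y1 - y0) * x2;
	}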
 
-			// Vertical range
-			Int yMin = Y[0];
-			Int yMax = Y[0];
+		Int n = *Pointer<Int>(polygon + OFFSET(Polygon,n));
+		Int m = *Pointer<Int>(polygon + OFFSET(Polygon,i));
 
-			Int i = 1;
+		If(m != 0 || Bool(!triangle))   // Clipped triangle; reproject
+		{
+			Pointer<Byte> V = polygon + OFFSET(Polygon,P) + m * sizeof(void*) * 16;
+
+			Int i = 0;
 
 			Do
 			{
-				yMin = Min(Y[i], yMin);
-				yMax = Max(Y[i], yMax);
+				Pointer<Float4> p = *Pointer<Pointer<Float4> >(V + i * sizeof(void*));
+				Float4 v = *Pointer<Float4>(p, 16);
+
+				Float w = v.w;
+				Float rhw = IfThenElse(w != 0.0f, 1.0f / w, Float(1.0f));
+
+				X[i] = RoundInt(*Pointer<Float>(data + OFFSET(DrawData,X0xF)) + v.x * rhw * *Pointer<Float>(data + OFFSET(DrawData,WxF)));
+				Y[i] = RoundInt(*Pointer<Float>(data + OFFSET(DrawData,Y0xF)) + v.y * rhw * *Pointer<Float>(data + OFFSET(DrawData,HxF)));
+
+				i++;
+			}
+			Until(i >= n)
+		}
+
+		// Vertical range
+		Int yMin = Y[0];
+		Int yMax = Y[0];
+
+		Int i = 1;
+
+		Do
+		{
+			yMin = Min(Y[i], yMin);
+			yMax = Max(Y[i], yMax);
+
+			i++;
+		}
+		Until(i >= n)
+
+		constexpr int subPixB = vk::SUBPIXEL_PRECISION_BITS;
+		constexpr int subPixM = vk::SUBPIXEL_PRECISION_MASK;
+		constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
+
+		if(state.multiSample > 1)
+		{
+			yMin = (yMin + Constants::yMinMultiSampleOffset) >> subPixB;
+			yMax = (yMax + Constants::yMaxMultiSampleOffset) >> subPixB;
+		}
+		else
+		{
+			yMin = (yMin + subPixM) >> subPixB;
+			yMax = (yMax + subPixM) >> subPixB;
+		}
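
The (y + subPixM) >> subPixB rounding above is ceiling division of a sub-pixel fixed-point value: add the mask, then shift. A minimal sketch (illustrative):

	// Ceiling of a 'bits'-bit fixed-point value, e.g. bits = 4:
	// ceilFixed(17, 4) == 2, ceilFixed(16, 4) == 1.
	int ceilFixed(int v, int bits)
	{
		return (v + ((1 << bits) - 1)) >> bits;
	}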
+
+		yMin = Max(yMin, *Pointer<Int>(data + OFFSET(DrawData,scissorY0)));
+		yMax = Min(yMax, *Pointer<Int>(data + OFFSET(DrawData,scissorY1)));
+
+		// If yMin and yMax are initially negative, the scissor clamping above will typically result
+		// in yMin == 0 and yMax unchanged. We bail as we don't need to rasterize this primitive, and
+		// code below assumes yMin < yMax.
+		If(yMin >= yMax)
+		{
+			Return(0);
+		}
+
+		For(Int q = 0, q < state.multiSample, q++)
+		{
+			Array<Int> Xq(16);
+			Array<Int> Yq(16);
+
+			Int i = 0;
+
+			Do
+			{
+				Xq[i] = X[i];
+				Yq[i] = Y[i];
+
+				if(state.multiSample > 1)
+				{
+					Xq[i] = Xq[i] + *Pointer<Int>(constants + OFFSET(Constants,Xf) + q * sizeof(int));
+					Yq[i] = Yq[i] + *Pointer<Int>(constants + OFFSET(Constants,Yf) + q * sizeof(int));
+				}
 
 				i++;
 			}
 			Until(i >= n)
 
-			constexpr int subPixB = vk::SUBPIXEL_PRECISION_BITS;
-			constexpr int subPixM = vk::SUBPIXEL_PRECISION_MASK;
-			constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
+			Pointer<Byte> leftEdge = Pointer<Byte>(primitive + OFFSET(Primitive,outline->left)) + q * sizeof(Primitive);
+			Pointer<Byte> rightEdge = Pointer<Byte>(primitive + OFFSET(Primitive,outline->right)) + q * sizeof(Primitive);
 
 			if(state.multiSample > 1)
 			{
-				yMin = (yMin + Constants::yMinMultiSampleOffset) >> subPixB;
-				yMax = (yMax + Constants::yMaxMultiSampleOffset) >> subPixB;
-			}
-			else
-			{
-				yMin = (yMin + subPixM) >> subPixB;
-				yMax = (yMax + subPixM) >> subPixB;
+				Int xMin = *Pointer<Int>(data + OFFSET(DrawData, scissorX0));
+				Int xMax = *Pointer<Int>(data + OFFSET(DrawData, scissorX1));
+				Short x = Short(Clamp((X[0] + subPixM) >> subPixB, xMin, xMax));
+
+				For(Int y = yMin - 1, y < yMax + 1, y++)
+				{
+					*Pointer<Short>(leftEdge + y * sizeof(Primitive::Span)) = x;
+					*Pointer<Short>(rightEdge + y * sizeof(Primitive::Span)) = x;
+				}
 			}
 
-			yMin = Max(yMin, *Pointer<Int>(data + OFFSET(DrawData,scissorY0)));
-			yMax = Min(yMax, *Pointer<Int>(data + OFFSET(DrawData,scissorY1)));
+			Xq[n] = Xq[0];
+			Yq[n] = Yq[0];
 
-			// If yMin and yMax are initially negative, the scissor clamping above will typically result
-			// in yMin == 0 and yMax unchanged. We bail as we don't need to rasterize this primitive, and
-			// code below assumes yMin < yMax.
-			If(yMin >= yMax)
+			// Rasterize
 			{
-				Return(0);
-			}
-
-			For(Int q = 0, q < state.multiSample, q++)
-			{
-				Array<Int> Xq(16);
-				Array<Int> Yq(16);
-
 				Int i = 0;
 
 				Do
 				{
-					Xq[i] = X[i];
-					Yq[i] = Y[i];
-
-					if(state.multiSample > 1)
-					{
-						Xq[i] = Xq[i] + *Pointer<Int>(constants + OFFSET(Constants,Xf) + q * sizeof(int));
-						Yq[i] = Yq[i] + *Pointer<Int>(constants + OFFSET(Constants,Yf) + q * sizeof(int));
-					}
+					edge(primitive, data, Xq[i + 1 - d], Yq[i + 1 - d], Xq[i + d], Yq[i + d], q);
 
 					i++;
 				}
 				Until(i >= n)
-
-				Pointer<Byte> leftEdge = Pointer<Byte>(primitive + OFFSET(Primitive,outline->left)) + q * sizeof(Primitive);
-				Pointer<Byte> rightEdge = Pointer<Byte>(primitive + OFFSET(Primitive,outline->right)) + q * sizeof(Primitive);
-
-				if(state.multiSample > 1)
-				{
-					Int xMin = *Pointer<Int>(data + OFFSET(DrawData, scissorX0));
-					Int xMax = *Pointer<Int>(data + OFFSET(DrawData, scissorX1));
-					Short x = Short(Clamp((X[0] + subPixM) >> subPixB, xMin, xMax));
-
-					For(Int y = yMin - 1, y < yMax + 1, y++)
-					{
-						*Pointer<Short>(leftEdge + y * sizeof(Primitive::Span)) = x;
-						*Pointer<Short>(rightEdge + y * sizeof(Primitive::Span)) = x;
-					}
-				}
-
-				Xq[n] = Xq[0];
-				Yq[n] = Yq[0];
-
-				// Rasterize
-				{
-					Int i = 0;
-
-					Do
-					{
-						edge(primitive, data, Xq[i + 1 - d], Yq[i + 1 - d], Xq[i + d], Yq[i + d], q);
-
-						i++;
-					}
-					Until(i >= n)
-				}
-
-				if(state.multiSample == 1)
-				{
-					For(, yMin < yMax && *Pointer<Short>(leftEdge + yMin * sizeof(Primitive::Span)) == *Pointer<Short>(rightEdge + yMin * sizeof(Primitive::Span)), yMin++)
-					{
-						// Increments yMin
-					}
-
-					For(, yMax > yMin && *Pointer<Short>(leftEdge + (yMax - 1) * sizeof(Primitive::Span)) == *Pointer<Short>(rightEdge + (yMax - 1) * sizeof(Primitive::Span)), yMax--)
-					{
-						// Decrements yMax
-					}
-
-					If(yMin == yMax)
-					{
-						Return(0);
-					}
-
-					*Pointer<Short>(leftEdge + (yMin - 1) * sizeof(Primitive::Span)) = *Pointer<Short>(leftEdge + yMin * sizeof(Primitive::Span));
-					*Pointer<Short>(rightEdge + (yMin - 1) * sizeof(Primitive::Span)) = *Pointer<Short>(leftEdge + yMin * sizeof(Primitive::Span));
-					*Pointer<Short>(leftEdge + yMax * sizeof(Primitive::Span)) = *Pointer<Short>(leftEdge + (yMax - 1) * sizeof(Primitive::Span));
-					*Pointer<Short>(rightEdge + yMax * sizeof(Primitive::Span)) = *Pointer<Short>(leftEdge + (yMax - 1) * sizeof(Primitive::Span));
-				}
 			}
 
-			*Pointer<Int>(primitive + OFFSET(Primitive,yMin)) = yMin;
-			*Pointer<Int>(primitive + OFFSET(Primitive,yMax)) = yMax;
-
-			// Sort by minimum y
-			if(triangle)
+			if(state.multiSample == 1)
 			{
-				Float y0 = *Pointer<Float>(v0 + OFFSET(Vertex, y));
-				Float y1 = *Pointer<Float>(v1 + OFFSET(Vertex, y));
-				Float y2 = *Pointer<Float>(v2 + OFFSET(Vertex, y));
+				For(, yMin < yMax && *Pointer<Short>(leftEdge + yMin * sizeof(Primitive::Span)) == *Pointer<Short>(rightEdge + yMin * sizeof(Primitive::Span)), yMin++)
+				{
+					// Increments yMin
+				}
 
-				Float yMin = Min(Min(y0, y1), y2);
+				For(, yMax > yMin && *Pointer<Short>(leftEdge + (yMax - 1) * sizeof(Primitive::Span)) == *Pointer<Short>(rightEdge + (yMax - 1) * sizeof(Primitive::Span)), yMax--)
+				{
+					// Decrements yMax
+				}
 
-				conditionalRotate1(yMin == y1, v0, v1, v2);
-				conditionalRotate2(yMin == y2, v0, v1, v2);
+				If(yMin == yMax)
+				{
+					Return(0);
+				}
+
+				*Pointer<Short>(leftEdge + (yMin - 1) * sizeof(Primitive::Span)) = *Pointer<Short>(leftEdge + yMin * sizeof(Primitive::Span));
+				*Pointer<Short>(rightEdge + (yMin - 1) * sizeof(Primitive::Span)) = *Pointer<Short>(leftEdge + yMin * sizeof(Primitive::Span));
+				*Pointer<Short>(leftEdge + yMax * sizeof(Primitive::Span)) = *Pointer<Short>(leftEdge + (yMax - 1) * sizeof(Primitive::Span));
+				*Pointer<Short>(rightEdge + yMax * sizeof(Primitive::Span)) = *Pointer<Short>(leftEdge + (yMax - 1) * sizeof(Primitive::Span));
 			}
+		}
 
-			// Sort by maximum w
-			if(triangle)
-			{
-				Float w0 = *Pointer<Float>(v0 + OFFSET(Vertex, w));
-				Float w1 = *Pointer<Float>(v1 + OFFSET(Vertex, w));
-				Float w2 = *Pointer<Float>(v2 + OFFSET(Vertex, w));
+		*Pointer<Int>(primitive + OFFSET(Primitive,yMin)) = yMin;
+		*Pointer<Int>(primitive + OFFSET(Primitive,yMax)) = yMax;
 
-				Float wMax = Max(Max(w0, w1), w2);
+		// Sort by minimum y
+		if(triangle)
+		{
+			Float y0 = *Pointer<Float>(v0 + OFFSET(Vertex, y));
+			Float y1 = *Pointer<Float>(v1 + OFFSET(Vertex, y));
+			Float y2 = *Pointer<Float>(v2 + OFFSET(Vertex, y));
 
-				conditionalRotate1(wMax == w1, v0, v1, v2);
-				conditionalRotate2(wMax == w2, v0, v1, v2);
-			}
+			Float yMin = Min(Min(y0, y1), y2);
 
+			conditionalRotate1(yMin == y1, v0, v1, v2);
+			conditionalRotate2(yMin == y2, v0, v1, v2);
+		}
+
+		// Sort by maximum w
+		if(triangle)
+		{
 			Float w0 = *Pointer<Float>(v0 + OFFSET(Vertex, w));
 			Float w1 = *Pointer<Float>(v1 + OFFSET(Vertex, w));
 			Float w2 = *Pointer<Float>(v2 + OFFSET(Vertex, w));
 
-			Float4 w012;
+			Float wMax = Max(Max(w0, w1), w2);
 
-			w012.x = w0;
-			w012.y = w1;
-			w012.z = w2;
-			w012.w = 1;
+			conditionalRotate1(wMax == w1, v0, v1, v2);
+			conditionalRotate2(wMax == w2, v0, v1, v2);
+		}
 
-			Float rhw0 = *Pointer<Float>(v0 + OFFSET(Vertex,projected.w));
+		Float w0 = *Pointer<Float>(v0 + OFFSET(Vertex, w));
+		Float w1 = *Pointer<Float>(v1 + OFFSET(Vertex, w));
+		Float w2 = *Pointer<Float>(v2 + OFFSET(Vertex, w));
 
-			Int X0 = *Pointer<Int>(v0 + OFFSET(Vertex,projected.x));
-			Int X1 = *Pointer<Int>(v1 + OFFSET(Vertex,projected.x));
-			Int X2 = *Pointer<Int>(v2 + OFFSET(Vertex,projected.x));
+		Float4 w012;
 
-			Int Y0 = *Pointer<Int>(v0 + OFFSET(Vertex,projected.y));
-			Int Y1 = *Pointer<Int>(v1 + OFFSET(Vertex,projected.y));
-			Int Y2 = *Pointer<Int>(v2 + OFFSET(Vertex,projected.y));
+		w012.x = w0;
+		w012.y = w1;
+		w012.z = w2;
+		w012.w = 1;
 
-			if(point)
+		Float rhw0 = *Pointer<Float>(v0 + OFFSET(Vertex,projected.w));
+
+		Int X0 = *Pointer<Int>(v0 + OFFSET(Vertex,projected.x));
+		Int X1 = *Pointer<Int>(v1 + OFFSET(Vertex,projected.x));
+		Int X2 = *Pointer<Int>(v2 + OFFSET(Vertex,projected.x));
+
+		Int Y0 = *Pointer<Int>(v0 + OFFSET(Vertex,projected.y));
+		Int Y1 = *Pointer<Int>(v1 + OFFSET(Vertex,projected.y));
+		Int Y2 = *Pointer<Int>(v2 + OFFSET(Vertex,projected.y));
+
+		if(point)
+		{
+			*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordX)) = Float(1.0f / subPixF) * Float(X0);
+			*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordY)) = Float(1.0f / subPixF) * Float(Y0);
+		}
+
+		if(line)
+		{
+			X2 = X1 + Y1 - Y0;
+			Y2 = Y1 + X0 - X1;
+		}
+
+		Float dx = Float(X0) * (1.0f / subPixF);
+		Float dy = Float(Y0) * (1.0f / subPixF);
+
+		X1 -= X0;
+		Y1 -= Y0;
+
+		X2 -= X0;
+		Y2 -= Y0;
+
+		Float x1 = w1 * (1.0f / subPixF) * Float(X1);
+		Float y1 = w1 * (1.0f / subPixF) * Float(Y1);
+
+		Float x2 = w2 * (1.0f / subPixF) * Float(X2);
+		Float y2 = w2 * (1.0f / subPixF) * Float(Y2);
+
+		Float a = x1 * y2 - x2 * y1;
+
+		Float4 xQuad = Float4(0, 1, 0, 1) - Float4(dx);
+		Float4 yQuad = Float4(0, 0, 1, 1) - Float4(dy);
+
+		*Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16) = xQuad;
+		*Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16) = yQuad;
+
+		Float4 M[3];
+
+		M[0] = Float4(0, 0, 0, 0);
+		M[1] = Float4(0, 0, 0, 0);
+		M[2] = Float4(0, 0, 0, 0);
+
+		M[0].z = rhw0;
+
+		If(a != 0.0f)
+		{
+			Float A = 1.0f / a;
+			Float D = A * rhw0;
+
+			M[0].x = (y1 * w2 - y2 * w1) * D;
+			M[0].y = (x2 * w1 - x1 * w2) * D;
+		//	M[0].z = rhw0;
+		//	M[0].w = 0;
+
+			M[1].x = y2 * A;
+			M[1].y = -x2 * A;
+		//	M[1].z = 0;
+		//	M[1].w = 0;
+
+			M[2].x = -y1 * A;
+			M[2].y = x1 * A;
+		//	M[2].z = 0;
+		//	M[2].w = 0;
+		}
+
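
The M[] rows above encode the inverse of the 2x2 edge matrix: the adjugate (y2, -x2; -y1, x1) divided by the determinant a = x1*y2 - x2*y1. Combined with per-vertex attribute deltas, that inverse yields screen-space gradients; a scalar sketch of the same solve:

	// Solve the 2x2 system for per-pixel attribute gradients, given the
	// screen-space edge vectors and the attribute deltas along them.
	void gradients(float x1, float y1, float x2, float y2,
	               float dv1, float dv2, float &ddx, float &ddy)
	{
		float a = x1 * y2 - x2 * y1;   // determinant; caller guards a != 0
		ddx = (dv1 * y2 - dv2 * y1) / a;
		ddy = (dv2 * x1 - dv1 * x2) / a;
	}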
+		if(state.interpolateW)
+		{
+			Float4 ABC = M[0] + M[1] + M[2];
+
+			Float4 A = ABC.x;
+			Float4 B = ABC.y;
+			Float4 C = ABC.z;
+
+			*Pointer<Float4>(primitive + OFFSET(Primitive,w.A), 16) = A;
+			*Pointer<Float4>(primitive + OFFSET(Primitive,w.B), 16) = B;
+			*Pointer<Float4>(primitive + OFFSET(Primitive,w.C), 16) = C;
+		}
+
+		if(state.interpolateZ)
+		{
+			Float z0 = *Pointer<Float>(v0 + OFFSET(Vertex,projected.z));
+			Float z1 = *Pointer<Float>(v1 + OFFSET(Vertex,projected.z));
+			Float z2 = *Pointer<Float>(v2 + OFFSET(Vertex,projected.z));
+
+			z1 -= z0;
+			z2 -= z0;
+
+			Float4 A;
+			Float4 B;
+			Float4 C;
+
+			if(!point)
 			{
-				*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordX)) = Float(1.0f / subPixF) * Float(X0);
-				*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordY)) = Float(1.0f / subPixF) * Float(Y0);
+				Float x1 = Float(X1) * (1.0f / subPixF);
+				Float y1 = Float(Y1) * (1.0f / subPixF);
+				Float x2 = Float(X2) * (1.0f / subPixF);
+				Float y2 = Float(Y2) * (1.0f / subPixF);
+
+				Float D = *Pointer<Float>(data + OFFSET(DrawData,depthRange)) / (x1 * y2 - x2 * y1);
+
+				Float a = (y2 * z1 - y1 * z2) * D;
+				Float b = (x1 * z2 - x2 * z1) * D;
+
+				A = Float4(a);
+				B = Float4(b);
+			}
+			else
+			{
+				A = Float4(0, 0, 0, 0);
+				B = Float4(0, 0, 0, 0);
 			}
 
-			if(line)
+			*Pointer<Float4>(primitive + OFFSET(Primitive,z.A), 16) = A;
+			*Pointer<Float4>(primitive + OFFSET(Primitive,z.B), 16) = B;
+
+			Float c = z0;
+
+			if(state.applySlopeDepthBias)
 			{
-				X2 = X1 + Y1 - Y0;
-				Y2 = Y1 + X0 - X1;
+				Float bias = Max(Abs(Float(A.x)), Abs(Float(B.x)));
+				bias *= *Pointer<Float>(data + OFFSET(DrawData,slopeDepthBias));
+
+				c += bias;
 			}
 
-			Float dx = Float(X0) * (1.0f / subPixF);
-			Float dy = Float(Y0) * (1.0f / subPixF);
+			C = Float4(c * *Pointer<Float>(data + OFFSET(DrawData,depthRange)) + *Pointer<Float>(data + OFFSET(DrawData,depthNear)));
 
-			X1 -= X0;
-			Y1 -= Y0;
+			*Pointer<Float4>(primitive + OFFSET(Primitive,z.C), 16) = C;
+		}
 
-			X2 -= X0;
-			Y2 -= Y0;
-
-			Float x1 = w1 * (1.0f / subPixF) * Float(X1);
-			Float y1 = w1 * (1.0f / subPixF) * Float(Y1);
-
-			Float x2 = w2 * (1.0f / subPixF) * Float(X2);
-			Float y2 = w2 * (1.0f / subPixF) * Float(Y2);
-
-			Float a = x1 * y2 - x2 * y1;
-
-			Float4 xQuad = Float4(0, 1, 0, 1) - Float4(dx);
-			Float4 yQuad = Float4(0, 0, 1, 1) - Float4(dy);
-
-			*Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16) = xQuad;
-			*Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16) = yQuad;
-
-			Float4 M[3];
-
-			M[0] = Float4(0, 0, 0, 0);
-			M[1] = Float4(0, 0, 0, 0);
-			M[2] = Float4(0, 0, 0, 0);
-
-			M[0].z = rhw0;
-
-			If(a != 0.0f)
-			{
-				Float A = 1.0f / a;
-				Float D = A * rhw0;
-
-				M[0].x = (y1 * w2 - y2 * w1) * D;
-				M[0].y = (x2 * w1 - x1 * w2) * D;
-			//	M[0].z = rhw0;
-			//	M[0].w = 0;
-
-				M[1].x = y2 * A;
-				M[1].y = -x2 * A;
-			//	M[1].z = 0;
-			//	M[1].w = 0;
-
-				M[2].x = -y1 * A;
-				M[2].y = x1 * A;
-			//	M[2].z = 0;
-			//	M[2].w = 0;
-			}
-
-			if(state.interpolateW)
-			{
-				Float4 ABC = M[0] + M[1] + M[2];
-
-				Float4 A = ABC.x;
-				Float4 B = ABC.y;
-				Float4 C = ABC.z;
-
-				*Pointer<Float4>(primitive + OFFSET(Primitive,w.A), 16) = A;
-				*Pointer<Float4>(primitive + OFFSET(Primitive,w.B), 16) = B;
-				*Pointer<Float4>(primitive + OFFSET(Primitive,w.C), 16) = C;
-			}
-
-			if(state.interpolateZ)
-			{
-				Float z0 = *Pointer<Float>(v0 + OFFSET(Vertex,projected.z));
-				Float z1 = *Pointer<Float>(v1 + OFFSET(Vertex,projected.z));
-				Float z2 = *Pointer<Float>(v2 + OFFSET(Vertex,projected.z));
-
-				z1 -= z0;
-				z2 -= z0;
-
-				Float4 A;
-				Float4 B;
-				Float4 C;
-
-				if(!point)
-				{
-					Float x1 = Float(X1) * (1.0f / subPixF);
-					Float y1 = Float(Y1) * (1.0f / subPixF);
-					Float x2 = Float(X2) * (1.0f / subPixF);
-					Float y2 = Float(Y2) * (1.0f / subPixF);
-
-					Float D = *Pointer<Float>(data + OFFSET(DrawData,depthRange)) / (x1 * y2 - x2 * y1);
-
-					Float a = (y2 * z1 - y1 * z2) * D;
-					Float b = (x1 * z2 - x2 * z1) * D;
-
-					A = Float4(a);
-					B = Float4(b);
-				}
-				else
-				{
-					A = Float4(0, 0, 0, 0);
-					B = Float4(0, 0, 0, 0);
-				}
-
-				*Pointer<Float4>(primitive + OFFSET(Primitive,z.A), 16) = A;
-				*Pointer<Float4>(primitive + OFFSET(Primitive,z.B), 16) = B;
-
-				Float c = z0;
-
-				if(state.applySlopeDepthBias)
-				{
-					Float bias = Max(Abs(Float(A.x)), Abs(Float(B.x)));
-					bias *= *Pointer<Float>(data + OFFSET(DrawData,slopeDepthBias));
-
-					c += bias;
-				}
-
-				C = Float4(c * *Pointer<Float>(data + OFFSET(DrawData,depthRange)) + *Pointer<Float>(data + OFFSET(DrawData,depthNear)));
-
-				*Pointer<Float4>(primitive + OFFSET(Primitive,z.C), 16) = C;
-			}
-
-			for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
-			{
-				if (state.gradient[interpolant].Type != SpirvShader::ATTRIBTYPE_UNUSED)
-				{
-					setupGradient(primitive, tri, w012, M, v0, v1, v2,
-							OFFSET(Vertex, v[interpolant]),
-							OFFSET(Primitive, V[interpolant]),
-							state.gradient[interpolant].Flat,
-							!state.gradient[interpolant].NoPerspective);
-				}
-			}
-
-			for (unsigned int i = 0; i < state.numClipDistances; i++)
+		for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
+		{
+			if (state.gradient[interpolant].Type != SpirvShader::ATTRIBTYPE_UNUSED)
 			{
 				setupGradient(primitive, tri, w012, M, v0, v1, v2,
-						OFFSET(Vertex, clipDistance[i]),
-						OFFSET(Primitive, clipDistance[i]),
-						false, true);
+						OFFSET(Vertex, v[interpolant]),
+						OFFSET(Primitive, V[interpolant]),
+						state.gradient[interpolant].Flat,
+						!state.gradient[interpolant].NoPerspective);
 			}
-
-			for (unsigned int i = 0; i < state.numCullDistances; i++)
-			{
-				setupGradient(primitive, tri, w012, M, v0, v1, v2,
-						OFFSET(Vertex, cullDistance[i]),
-						OFFSET(Primitive, cullDistance[i]),
-						false, true);
-			}
-
-			Return(1);
 		}
 
-		routine = function("SetupRoutine");
-	}
-
-	void SetupRoutine::setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flat, bool perspective)
-	{
-		if(!flat)
+		for (unsigned int i = 0; i < state.numClipDistances; i++)
 		{
-			Float4 i;
-			i.x = *Pointer<Float>(v0 + attribute);
-			i.y = *Pointer<Float>(v1 + attribute);
-			i.z = *Pointer<Float>(v2 + attribute);
-			i.w = 0;
-
-			if(!perspective)
-			{
-				i *= w012;
-			}
-
-			Float4 A = i.xxxx * m[0];
-			Float4 B = i.yyyy * m[1];
-			Float4 C = i.zzzz * m[2];
-
-			C = A + B + C;
-
-			A = C.xxxx;
-			B = C.yyyy;
-			C = C.zzzz;
-
-			*Pointer<Float4>(primitive + planeEquation + 0, 16) = A;
-			*Pointer<Float4>(primitive + planeEquation + 16, 16) = B;
-			*Pointer<Float4>(primitive + planeEquation + 32, 16) = C;
+			setupGradient(primitive, tri, w012, M, v0, v1, v2,
+					OFFSET(Vertex, clipDistance[i]),
+					OFFSET(Primitive, clipDistance[i]),
+					false, true);
 		}
-		else
+
+		for (unsigned int i = 0; i < state.numCullDistances; i++)
 		{
-			int leadingVertex = OFFSET(Triangle,v0);
-			Float C = *Pointer<Float>(triangle + leadingVertex + attribute);
-
-			*Pointer<Float4>(primitive + planeEquation + 0, 16) = Float4(0, 0, 0, 0);
-			*Pointer<Float4>(primitive + planeEquation + 16, 16) = Float4(0, 0, 0, 0);
-			*Pointer<Float4>(primitive + planeEquation + 32, 16) = Float4(C);
+			setupGradient(primitive, tri, w012, M, v0, v1, v2,
+					OFFSET(Vertex, cullDistance[i]),
+					OFFSET(Primitive, cullDistance[i]),
+					false, true);
 		}
+
+		Return(1);
 	}
 
-	void SetupRoutine::edge(Pointer<Byte> &primitive, Pointer<Byte> &data, const Int &Xa, const Int &Ya, const Int &Xb, const Int &Yb, Int &q)
+	routine = function("SetupRoutine");
+}
+
+void SetupRoutine::setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flat, bool perspective)
+{
+	if(!flat)
 	{
-		If(Ya != Yb)
+		Float4 i;
+		i.x = *Pointer<Float>(v0 + attribute);
+		i.y = *Pointer<Float>(v1 + attribute);
+		i.z = *Pointer<Float>(v2 + attribute);
+		i.w = 0;
+
+		if(!perspective)
 		{
-			Bool swap = Yb < Ya;
-
-			Int X1 = IfThenElse(swap, Xb, Xa);
-			Int X2 = IfThenElse(swap, Xa, Xb);
-			Int Y1 = IfThenElse(swap, Yb, Ya);
-			Int Y2 = IfThenElse(swap, Ya, Yb);
-
-			constexpr int subPixB = vk::SUBPIXEL_PRECISION_BITS;
-			constexpr int subPixM = vk::SUBPIXEL_PRECISION_MASK;
-
-			Int y1 = Max((Y1 + subPixM) >> subPixB, *Pointer<Int>(data + OFFSET(DrawData,scissorY0)));
-			Int y2 = Min((Y2 + subPixM) >> subPixB, *Pointer<Int>(data + OFFSET(DrawData,scissorY1)));
-
-			If(y1 < y2)
-			{
-				Int xMin = *Pointer<Int>(data + OFFSET(DrawData,scissorX0));
-				Int xMax = *Pointer<Int>(data + OFFSET(DrawData,scissorX1));
-
-				Pointer<Byte> leftEdge = primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left);
-				Pointer<Byte> rightEdge = primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right);
-				Pointer<Byte> edge = IfThenElse(swap, rightEdge, leftEdge);
-
-				// Deltas
-				Int DX12 = X2 - X1;
-				Int DY12 = Y2 - Y1;
-
-				Int FDX12 = DX12 << subPixB;
-				Int FDY12 = DY12 << subPixB;
-
-				Int X = DX12 * ((y1 << subPixB) - Y1) + (X1 & subPixM) * DY12;
-				Int x = (X1 >> subPixB) + X / FDY12;   // Edge
-				Int d = X % FDY12;               // Error-term
-				Int ceil = -d >> 31;             // Ceiling division: remainder <= 0
-				x -= ceil;
-				d -= ceil & FDY12;
-
-				Int Q = FDX12 / FDY12;   // Edge-step
-				Int R = FDX12 % FDY12;   // Error-step
-				Int floor = R >> 31;     // Flooring division: remainder >= 0
-				Q += floor;
-				R += floor & FDY12;
-
-				Int D = FDY12;   // Error-overflow
-				Int y = y1;
-
-				Do
-				{
-					*Pointer<Short>(edge + y * sizeof(Primitive::Span)) = Short(Clamp(x, xMin, xMax));
-
-					x += Q;
-					d += R;
-
-					Int overflow = -d >> 31;
-
-					d -= D & overflow;
-					x -= overflow;
-
-					y++;
-				}
-				Until(y >= y2)
-			}
+			i *= w012;
 		}
+
+		Float4 A = i.xxxx * m[0];
+		Float4 B = i.yyyy * m[1];
+		Float4 C = i.zzzz * m[2];
+
+		C = A + B + C;
+
+		A = C.xxxx;
+		B = C.yyyy;
+		C = C.zzzz;
+
+		*Pointer<Float4>(primitive + planeEquation + 0, 16) = A;
+		*Pointer<Float4>(primitive + planeEquation + 16, 16) = B;
+		*Pointer<Float4>(primitive + planeEquation + 32, 16) = C;
 	}
-
-	void SetupRoutine::conditionalRotate1(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2)
+	else
 	{
-		#if 0   // Rely on LLVM optimization
-			If(condition)
-			{
-				Pointer<Byte> vX;
+		int leadingVertex = OFFSET(Triangle,v0);
+		Float C = *Pointer<Float>(triangle + leadingVertex + attribute);
 
-				vX = v0;
-				v0 = v1;
-				v1 = v2;
-				v2 = vX;
-			}
-		#else
-			Pointer<Byte> vX = v0;
-			v0 = IfThenElse(condition, v1, v0);
-			v1 = IfThenElse(condition, v2, v1);
-			v2 = IfThenElse(condition, vX, v2);
-		#endif
-	}
-
-	void SetupRoutine::conditionalRotate2(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2)
-	{
-		#if 0   // Rely on LLVM optimization
-			If(condition)
-			{
-				Pointer<Byte> vX;
-
-				vX = v2;
-				v2 = v1;
-				v1 = v0;
-				v0 = vX;
-			}
-		#else
-			Pointer<Byte> vX = v2;
-			v2 = IfThenElse(condition, v1, v2);
-			v1 = IfThenElse(condition, v0, v1);
-			v0 = IfThenElse(condition, vX, v0);
-		#endif
-	}
-
-	SetupFunction::RoutineType SetupRoutine::getRoutine()
-	{
-		return routine;
+		*Pointer<Float4>(primitive + planeEquation + 0, 16) = Float4(0, 0, 0, 0);
+		*Pointer<Float4>(primitive + planeEquation + 16, 16) = Float4(0, 0, 0, 0);
+		*Pointer<Float4>(primitive + planeEquation + 32, 16) = Float4(C);
 	}
 }
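
For readers tracing the math: setupGradient turns three per-vertex values into a screen-space plane equation using the inverse matrix M computed in generate() above. Restating the non-flat path in equation form (no new behavior; w012 is taken to hold the per-vertex w values, going by its usage here):

	[A B C] = i0*M[0] + i1*M[1] + i2*M[2]
	i(x, y) = A*x + B*y + C

where (x, y) are the xQuad/yQuad offsets from the leading vertex. M folds in the per-vertex 1/w, so the coefficients interpolate i/w (perspective-correct); for NoPerspective attributes the vertex values are premultiplied by w012 first, cancelling that factor.
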
+
+void SetupRoutine::edge(Pointer<Byte> &primitive, Pointer<Byte> &data, const Int &Xa, const Int &Ya, const Int &Xb, const Int &Yb, Int &q)
+{
+	If(Ya != Yb)
+	{
+		Bool swap = Yb < Ya;
+
+		Int X1 = IfThenElse(swap, Xb, Xa);
+		Int X2 = IfThenElse(swap, Xa, Xb);
+		Int Y1 = IfThenElse(swap, Yb, Ya);
+		Int Y2 = IfThenElse(swap, Ya, Yb);
+
+		constexpr int subPixB = vk::SUBPIXEL_PRECISION_BITS;
+		constexpr int subPixM = vk::SUBPIXEL_PRECISION_MASK;
+
+		Int y1 = Max((Y1 + subPixM) >> subPixB, *Pointer<Int>(data + OFFSET(DrawData,scissorY0)));
+		Int y2 = Min((Y2 + subPixM) >> subPixB, *Pointer<Int>(data + OFFSET(DrawData,scissorY1)));
+
+		If(y1 < y2)
+		{
+			Int xMin = *Pointer<Int>(data + OFFSET(DrawData,scissorX0));
+			Int xMax = *Pointer<Int>(data + OFFSET(DrawData,scissorX1));
+
+			Pointer<Byte> leftEdge = primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left);
+			Pointer<Byte> rightEdge = primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right);
+			Pointer<Byte> edge = IfThenElse(swap, rightEdge, leftEdge);
+
+			// Deltas
+			Int DX12 = X2 - X1;
+			Int DY12 = Y2 - Y1;
+
+			Int FDX12 = DX12 << subPixB;
+			Int FDY12 = DY12 << subPixB;
+
+			Int X = DX12 * ((y1 << subPixB) - Y1) + (X1 & subPixM) * DY12;
+			Int x = (X1 >> subPixB) + X / FDY12;   // Edge
+			Int d = X % FDY12;               // Error-term
+			Int ceil = -d >> 31;             // Ceiling division: remainder <= 0
+			x -= ceil;
+			d -= ceil & FDY12;
+
+			Int Q = FDX12 / FDY12;   // Edge-step
+			Int R = FDX12 % FDY12;   // Error-step
+			Int floor = R >> 31;     // Flooring division: remainder >= 0
+			Q += floor;
+			R += floor & FDY12;
+
+			Int D = FDY12;   // Error-overflow
+			Int y = y1;
+
+			Do
+			{
+				*Pointer<Short>(edge + y * sizeof(Primitive::Span)) = Short(Clamp(x, xMin, xMax));
+
+				x += Q;
+				d += R;
+
+				Int overflow = -d >> 31;
+
+				d -= D & overflow;
+				x -= overflow;
+
+				y++;
+			}
+			Until(y >= y2)
+		}
+	}
+}
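
As a reference for the rasterization arithmetic above, here is a hypothetical scalar transcription of the edge walk (walkEdge and the printf driver are illustrative, not SwiftShader code; it assumes arithmetic right shift of negative ints, which Reactor's Int provides):

	#include <cstdio>

	// Walks one edge from (X1, Y1) down to (X2, Y2), both in fixed point,
	// printing the clamped x for each scanline it crosses.
	void walkEdge(int X1, int Y1, int X2, int Y2, int xMin, int xMax, int subPixB)
	{
		if(Y1 >= Y2) return;   // edge() pre-swaps the vertices so the edge points down

		const int subPixM = (1 << subPixB) - 1;

		int DX12 = X2 - X1;
		int DY12 = Y2 - Y1;

		int FDX12 = DX12 << subPixB;
		int FDY12 = DY12 << subPixB;

		int y1 = (Y1 + subPixM) >> subPixB;   // first scanline at or below the top vertex
		int y2 = (Y2 + subPixM) >> subPixB;   // (the real code also clamps both to the scissor)

		int X = DX12 * ((y1 << subPixB) - Y1) + (X1 & subPixM) * DY12;
		int x = (X1 >> subPixB) + X / FDY12;   // edge x at the first scanline
		int d = X % FDY12;                     // running error term
		int ceil = -d >> 31;                   // ceiling division: force remainder <= 0
		x -= ceil;
		d -= ceil & FDY12;

		int Q = FDX12 / FDY12;   // x step per scanline
		int R = FDX12 % FDY12;   // error step per scanline
		int floor = R >> 31;     // flooring division: force remainder >= 0
		Q += floor;
		R += floor & FDY12;

		for(int y = y1; y < y2; y++)
		{
			printf("y=%d x=%d\n", y, x < xMin ? xMin : (x > xMax ? xMax : x));

			x += Q;
			d += R;

			int overflow = -d >> 31;   // error crossed zero: take one extra pixel step
			d -= FDY12 & overflow;
			x -= overflow;
		}
	}
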
+
+void SetupRoutine::conditionalRotate1(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2)
+{
+	#if 0   // Rely on LLVM optimization
+		If(condition)
+		{
+			Pointer<Byte> vX;
+
+			vX = v0;
+			v0 = v1;
+			v1 = v2;
+			v2 = vX;
+		}
+	#else
+		Pointer<Byte> vX = v0;
+		v0 = IfThenElse(condition, v1, v0);
+		v1 = IfThenElse(condition, v2, v1);
+		v2 = IfThenElse(condition, vX, v2);
+	#endif
+}
+
+void SetupRoutine::conditionalRotate2(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2)
+{
+	#if 0   // Rely on LLVM optimization
+		If(condition)
+		{
+			Pointer<Byte> vX;
+
+			vX = v2;
+			v2 = v1;
+			v1 = v0;
+			v0 = vX;
+		}
+	#else
+		Pointer<Byte> vX = v2;
+		v2 = IfThenElse(condition, v1, v2);
+		v1 = IfThenElse(condition, v0, v1);
+		v0 = IfThenElse(condition, vX, v0);
+	#endif
+}
+
+SetupFunction::RoutineType SetupRoutine::getRoutine()
+{
+	return routine;
+}
+
+}  // namespace sw
diff --git a/src/Pipeline/SetupRoutine.hpp b/src/Pipeline/SetupRoutine.hpp
index c871aff..59fe55a 100644
--- a/src/Pipeline/SetupRoutine.hpp
+++ b/src/Pipeline/SetupRoutine.hpp
@@ -18,30 +18,31 @@
 #include "Device/SetupProcessor.hpp"
 #include "Reactor/Reactor.hpp"
 
-namespace sw
+namespace sw {
+
+class Context;
+
+class SetupRoutine
 {
-	class Context;
+public:
+	SetupRoutine(const SetupProcessor::State &state);
 
-	class SetupRoutine
-	{
-	public:
-		SetupRoutine(const SetupProcessor::State &state);
+	virtual ~SetupRoutine();
 
-		virtual ~SetupRoutine();
+	void generate();
+	SetupFunction::RoutineType getRoutine();
 
-		void generate();
-		SetupFunction::RoutineType getRoutine();
+private:
+	void setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flatShading, bool perspective);
+	void edge(Pointer<Byte> &primitive, Pointer<Byte> &data, const Int &Xa, const Int &Ya, const Int &Xb, const Int &Yb, Int &q);
+	void conditionalRotate1(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2);
+	void conditionalRotate2(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2);
 
-	private:
-		void setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flatShading, bool perspective);
-		void edge(Pointer<Byte> &primitive, Pointer<Byte> &data, const Int &Xa, const Int &Ya, const Int &Xb, const Int &Yb, Int &q);
-		void conditionalRotate1(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2);
-		void conditionalRotate2(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2);
+	const SetupProcessor::State &state;
 
-		const SetupProcessor::State &state;
+	SetupFunction::RoutineType routine;
+};
 
-		SetupFunction::RoutineType routine;
-	};
-}
+}  // namespace sw
 
 #endif   // sw_SetupRoutine_hpp
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index 3ac96cc..0f23096 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -19,957 +19,957 @@
 
 #include <limits.h>
 
-namespace sw
+namespace sw {
+
+Vector4s::Vector4s()
 {
-	Vector4s::Vector4s()
+}
+
+Vector4s::Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
+{
+	this->x = Short4(x);
+	this->y = Short4(y);
+	this->z = Short4(z);
+	this->w = Short4(w);
+}
+
+Vector4s::Vector4s(const Vector4s &rhs)
+{
+	x = rhs.x;
+	y = rhs.y;
+	z = rhs.z;
+	w = rhs.w;
+}
+
+Vector4s &Vector4s::operator=(const Vector4s &rhs)
+{
+	x = rhs.x;
+	y = rhs.y;
+	z = rhs.z;
+	w = rhs.w;
+
+	return *this;
+}
+
+Short4 &Vector4s::operator[](int i)
+{
+	switch(i)
 	{
+	case 0: return x;
+	case 1: return y;
+	case 2: return z;
+	case 3: return w;
 	}
 
-	Vector4s::Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
+	return x;
+}
+
+Vector4f::Vector4f()
+{
+}
+
+Vector4f::Vector4f(float x, float y, float z, float w)
+{
+	this->x = Float4(x);
+	this->y = Float4(y);
+	this->z = Float4(z);
+	this->w = Float4(w);
+}
+
+Vector4f::Vector4f(const Vector4f &rhs)
+{
+	x = rhs.x;
+	y = rhs.y;
+	z = rhs.z;
+	w = rhs.w;
+}
+
+Vector4f &Vector4f::operator=(const Vector4f &rhs)
+{
+	x = rhs.x;
+	y = rhs.y;
+	z = rhs.z;
+	w = rhs.w;
+
+	return *this;
+}
+
+Float4 &Vector4f::operator[](int i)
+{
+	switch(i)
 	{
-		this->x = Short4(x);
-		this->y = Short4(y);
-		this->z = Short4(z);
-		this->w = Short4(w);
+	case 0: return x;
+	case 1: return y;
+	case 2: return z;
+	case 3: return w;
 	}
 
-	Vector4s::Vector4s(const Vector4s &rhs)
+	return x;
+}
+
+Float4 exponential2(RValue<Float4> x, bool pp)
+{
+	// This implementation is based on 2^(i + f) = 2^i * 2^f,
+	// where i is the integer part of x and f is the fraction.
+
+	// For 2^i we can put the integer part directly in the exponent of
+	// the IEEE-754 floating-point number. Clamp to prevent overflow
+	// past the representation of infinity.
+	Float4 x0 = x;
+	x0 = Min(x0, As<Float4>(Int4(0x43010000)));   // 129.00000e+0f
+	x0 = Max(x0, As<Float4>(Int4(0xC2FDFFFF)));   // -126.99999e+0f
+
+	Int4 i = RoundInt(x0 - Float4(0.5f));
+	Float4 ii = As<Float4>((i + Int4(127)) << 23);   // Add single-precision bias, and shift into exponent.
+
+	// For the fractional part use a polynomial
+	// which approximates 2^f in the 0 to 1 range.
+	Float4 f = x0 - Float4(i);
+	Float4 ff = As<Float4>(Int4(0x3AF61905));     // 1.8775767e-3f
+	ff = ff * f + As<Float4>(Int4(0x3C134806));   // 8.9893397e-3f
+	ff = ff * f + As<Float4>(Int4(0x3D64AA23));   // 5.5826318e-2f
+	ff = ff * f + As<Float4>(Int4(0x3E75EAD4));   // 2.4015361e-1f
+	ff = ff * f + As<Float4>(Int4(0x3F31727B));   // 6.9315308e-1f
+	ff = ff * f + Float4(1.0f);
+
+	return ii * ff;
+}
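
A scalar sketch of the same 2^x scheme, for reference (exp2_scalar is illustrative, not part of the source; lrintf stands in for RoundInt and may differ in half-way rounding):

	#include <cmath>
	#include <cstdint>
	#include <cstring>

	float exp2_scalar(float x)
	{
		x = fminf(x, 129.0f);            // same clamps as above
		x = fmaxf(x, -126.99999f);

		int i = (int)lrintf(x - 0.5f);   // integer part, approximately floor(x)
		uint32_t bits = (uint32_t)(i + 127) << 23;   // biased exponent: this is 2^i
		float ii;
		memcpy(&ii, &bits, sizeof(ii));

		float f = x - (float)i;          // fraction in [0, 1]
		float ff = 1.8775767e-3f;        // same degree-5 polynomial for 2^f
		ff = ff * f + 8.9893397e-3f;
		ff = ff * f + 5.5826318e-2f;
		ff = ff * f + 2.4015361e-1f;
		ff = ff * f + 6.9315308e-1f;
		ff = ff * f + 1.0f;

		return ii * ff;                  // 2^i * 2^f
	}
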
+
+Float4 logarithm2(RValue<Float4> x, bool pp)
+{
+	Float4 x0;
+	Float4 x1;
+	Float4 x2;
+	Float4 x3;
+
+	x0 = x;
+
+	x1 = As<Float4>(As<Int4>(x0) & Int4(0x7F800000));
+	x1 = As<Float4>(As<UInt4>(x1) >> 8);
+	x1 = As<Float4>(As<Int4>(x1) | As<Int4>(Float4(1.0f)));
+	x1 = (x1 - Float4(1.4960938f)) * Float4(256.0f);   // FIXME: (x1 - 1.4960938f) * 256.0f;
+	x0 = As<Float4>((As<Int4>(x0) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));
+
+	x2 = (Float4(9.5428179e-2f) * x0 + Float4(4.7779095e-1f)) * x0 + Float4(1.9782813e-1f);
+	x3 = ((Float4(1.6618466e-2f) * x0 + Float4(2.0350508e-1f)) * x0 + Float4(2.7382900e-1f)) * x0 + Float4(4.0496687e-2f);
+	x2 /= x3;
+
+	x1 += (x0 - Float4(1.0f)) * x2;
+
+	Int4 pos_inf_x = CmpEQ(As<Int4>(x), Int4(0x7F800000));
+	return As<Float4>((pos_inf_x & As<Int4>(x)) | (~pos_inf_x & As<Int4>(x1)));
+}
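
The matching scalar shape of log2, again purely as a reading aid (log2_scalar is illustrative; the +inf passthrough at the end of the vector version is omitted):

	#include <cstdint>
	#include <cstring>

	float log2_scalar(float x)
	{
		uint32_t u;
		memcpy(&u, &x, sizeof(u));

		// Exponent, recovered with the same bit trick as the vector code
		// (the 1.4960938f / 256.0f pair undoes the +127 bias).
		uint32_t t = ((u & 0x7F800000u) >> 8) | 0x3F800000u;
		float x1;
		memcpy(&x1, &t, sizeof(x1));
		x1 = (x1 - 1.4960938f) * 256.0f;

		// Mantissa forced into [1, 2).
		t = (u & 0x007FFFFFu) | 0x3F800000u;
		float m;
		memcpy(&m, &t, sizeof(m));

		// Rational approximation of log2(m) on [1, 2).
		float p = (9.5428179e-2f * m + 4.7779095e-1f) * m + 1.9782813e-1f;
		float q = ((1.6618466e-2f * m + 2.0350508e-1f) * m + 2.7382900e-1f) * m + 4.0496687e-2f;

		return x1 + (m - 1.0f) * (p / q);
	}
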
+
+Float4 exponential(RValue<Float4> x, bool pp)
+{
+	// FIXME: Propagate the constant
+	return exponential2(Float4(1.44269504f) * x, pp);   // 1/ln(2)
+}
+
+Float4 logarithm(RValue<Float4> x, bool pp)
+{
+	// FIXME: Propagate the constant
+	return Float4(6.93147181e-1f) * logarithm2(x, pp);   // ln(2)
+}
+
+Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp)
+{
+	Float4 log = logarithm2(x, pp);
+	log *= y;
+	return exponential2(log, pp);
+}
+
+Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2)
+{
+	Float4 rcp = Rcp_pp(x, exactAtPow2);
+
+	if(!pp)
 	{
-		x = rhs.x;
-		y = rhs.y;
-		z = rhs.z;
-		w = rhs.w;
+		rcp = (rcp + rcp) - (x * rcp * rcp);
 	}
 
-	Vector4s &Vector4s::operator=(const Vector4s &rhs)
+	if(finite)
 	{
-		x = rhs.x;
-		y = rhs.y;
-		z = rhs.z;
-		w = rhs.w;
-
-		return *this;
+		int big = 0x7F7FFFFF;
+		rcp = Min(rcp, Float4((float&)big));
 	}
 
-	Short4 &Vector4s::operator[](int i)
-	{
-		switch(i)
-		{
-		case 0: return x;
-		case 1: return y;
-		case 2: return z;
-		case 3: return w;
-		}
+	return rcp;
+}
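
The (rcp + rcp) - (x * rcp * rcp) refinement above is one Newton-Raphson step, r' = r*(2 - x*r), which roughly doubles the number of correct bits in the hardware estimate. For example, with x = 3 and a crude estimate r = 0.3, one step gives 0.3*(2 - 0.9) = 0.33 and a second would give 0.33*(2 - 0.99) = 0.3333, converging on 1/3.
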
 
-		return x;
+Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
+{
+	Float4 abs = x;
+
+	if(absolute)
+	{
+		abs = Abs(abs);
 	}
 
-	Vector4f::Vector4f()
+	Float4 rsq;
+
+	if(!pp)
 	{
+		rsq = Float4(1.0f) / Sqrt(abs);
 	}
-
-	Vector4f::Vector4f(float x, float y, float z, float w)
+	else
 	{
-		this->x = Float4(x);
-		this->y = Float4(y);
-		this->z = Float4(z);
-		this->w = Float4(w);
-	}
-
-	Vector4f::Vector4f(const Vector4f &rhs)
-	{
-		x = rhs.x;
-		y = rhs.y;
-		z = rhs.z;
-		w = rhs.w;
-	}
-
-	Vector4f &Vector4f::operator=(const Vector4f &rhs)
-	{
-		x = rhs.x;
-		y = rhs.y;
-		z = rhs.z;
-		w = rhs.w;
-
-		return *this;
-	}
-
-	Float4 &Vector4f::operator[](int i)
-	{
-		switch(i)
-		{
-		case 0: return x;
-		case 1: return y;
-		case 2: return z;
-		case 3: return w;
-		}
-
-		return x;
-	}
-
-	Float4 exponential2(RValue<Float4> x, bool pp)
-	{
-		// This implementation is based on 2^(i + f) = 2^i * 2^f,
-		// where i is the integer part of x and f is the fraction.
-
-		// For 2^i we can put the integer part directly in the exponent of
-		// the IEEE-754 floating-point number. Clamp to prevent overflow
-		// past the representation of infinity.
-		Float4 x0 = x;
-		x0 = Min(x0, As<Float4>(Int4(0x43010000)));   // 129.00000e+0f
-		x0 = Max(x0, As<Float4>(Int4(0xC2FDFFFF)));   // -126.99999e+0f
-
-		Int4 i = RoundInt(x0 - Float4(0.5f));
-		Float4 ii = As<Float4>((i + Int4(127)) << 23);   // Add single-precision bias, and shift into exponent.
-
-		// For the fractional part use a polynomial
-		// which approximates 2^f in the 0 to 1 range.
-		Float4 f = x0 - Float4(i);
-		Float4 ff = As<Float4>(Int4(0x3AF61905));     // 1.8775767e-3f
-		ff = ff * f + As<Float4>(Int4(0x3C134806));   // 8.9893397e-3f
-		ff = ff * f + As<Float4>(Int4(0x3D64AA23));   // 5.5826318e-2f
-		ff = ff * f + As<Float4>(Int4(0x3E75EAD4));   // 2.4015361e-1f
-		ff = ff * f + As<Float4>(Int4(0x3F31727B));   // 6.9315308e-1f
-		ff = ff * f + Float4(1.0f);
-
-		return ii * ff;
-	}
-
-	Float4 logarithm2(RValue<Float4> x, bool pp)
-	{
-		Float4 x0;
-		Float4 x1;
-		Float4 x2;
-		Float4 x3;
-
-		x0 = x;
-
-		x1 = As<Float4>(As<Int4>(x0) & Int4(0x7F800000));
-		x1 = As<Float4>(As<UInt4>(x1) >> 8);
-		x1 = As<Float4>(As<Int4>(x1) | As<Int4>(Float4(1.0f)));
-		x1 = (x1 - Float4(1.4960938f)) * Float4(256.0f);   // FIXME: (x1 - 1.4960938f) * 256.0f;
-		x0 = As<Float4>((As<Int4>(x0) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));
-
-		x2 = (Float4(9.5428179e-2f) * x0 + Float4(4.7779095e-1f)) * x0 + Float4(1.9782813e-1f);
-		x3 = ((Float4(1.6618466e-2f) * x0 + Float4(2.0350508e-1f)) * x0 + Float4(2.7382900e-1f)) * x0 + Float4(4.0496687e-2f);
-		x2 /= x3;
-
-		x1 += (x0 - Float4(1.0f)) * x2;
-
-		Int4 pos_inf_x = CmpEQ(As<Int4>(x), Int4(0x7F800000));
-		return As<Float4>((pos_inf_x & As<Int4>(x)) | (~pos_inf_x & As<Int4>(x1)));
-	}
-
-	Float4 exponential(RValue<Float4> x, bool pp)
-	{
-		// FIXME: Propagate the constant
-		return exponential2(Float4(1.44269504f) * x, pp);   // 1/ln(2)
-	}
-
-	Float4 logarithm(RValue<Float4> x, bool pp)
-	{
-		// FIXME: Propagate the constant
-		return Float4(6.93147181e-1f) * logarithm2(x, pp);   // ln(2)
-	}
-
-	Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp)
-	{
-		Float4 log = logarithm2(x, pp);
-		log *= y;
-		return exponential2(log, pp);
-	}
-
-	Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2)
-	{
-		Float4 rcp = Rcp_pp(x, exactAtPow2);
+		rsq = RcpSqrt_pp(abs);
 
 		if(!pp)
 		{
-			rcp = (rcp + rcp) - (x * rcp * rcp);
+			rsq = rsq * (Float4(3.0f) - rsq * rsq * abs) * Float4(0.5f);
 		}
 
-		if(finite)
-		{
-			int big = 0x7F7FFFFF;
-			rcp = Min(rcp, Float4((float&)big));
-		}
-
-		return rcp;
+		rsq = As<Float4>(CmpNEQ(As<Int4>(abs), Int4(0x7F800000)) & As<Int4>(rsq));
 	}
 
-	Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
+	return rsq;
+}
+
+Float4 modulo(RValue<Float4> x, RValue<Float4> y)
+{
+	return x - y * Floor(x / y);
+}
+
+Float4 sine_pi(RValue<Float4> x, bool pp)
+{
+	const Float4 A = Float4(-4.05284734e-1f);   // -4/pi^2
+	const Float4 B = Float4(1.27323954e+0f);    // 4/pi
+	const Float4 C = Float4(7.75160950e-1f);
+	const Float4 D = Float4(2.24839049e-1f);
+
+	// Parabola approximating sine
+	Float4 sin = x * (Abs(x) * A + B);
+
+	// Improve precision from 0.06 to 0.001
+	if(true)
 	{
-		Float4 abs = x;
-
-		if(absolute)
-		{
-			abs = Abs(abs);
-		}
-
-		Float4 rsq;
-
-		if(!pp)
-		{
-			rsq = Float4(1.0f) / Sqrt(abs);
-		}
-		else
-		{
-			rsq = RcpSqrt_pp(abs);
-
-			if(!pp)
-			{
-				rsq = rsq * (Float4(3.0f) - rsq * rsq * abs) * Float4(0.5f);
-			}
-
-			rsq = As<Float4>(CmpNEQ(As<Int4>(abs), Int4(0x7F800000)) & As<Int4>(rsq));
-		}
-
-		return rsq;
+		sin = sin * (Abs(sin) * D + C);
 	}
 
-	Float4 modulo(RValue<Float4> x, RValue<Float4> y)
+	return sin;
+}
+
+Float4 cosine_pi(RValue<Float4> x, bool pp)
+{
+	// cos(x) = sin(x + pi/2)
+	Float4 y = x + Float4(1.57079632e+0f);
+
+	// Wrap around
+	y -= As<Float4>(CmpNLT(y, Float4(3.14159265e+0f)) & As<Int4>(Float4(6.28318530e+0f)));
+
+	return sine_pi(y, pp);
+}
+
+Float4 sine(RValue<Float4> x, bool pp)
+{
+	// Reduce to [-0.5, 0.5] range
+	Float4 y = x * Float4(1.59154943e-1f);   // 1/2pi
+	y = y - Round(y);
+
+	if(!pp)
 	{
-		return x - y * Floor(x / y);
+		// From the paper: "A Fast, Vectorizable Algorithm for Producing Single-Precision Sine-Cosine Pairs"
+		// This implementation passes OpenGL ES 3.0 precision requirements, at the cost of more operations:
+		// !pp : 17 mul, 7 add, 1 sub, 1 reciprocal
+		//  pp : 4 mul, 2 add, 2 abs
+
+		Float4 y2 = y * y;
+		Float4 c1 = y2 * (y2 * (y2 * Float4(-0.0204391631f) + Float4(0.2536086171f)) + Float4(-1.2336977925f)) + Float4(1.0f);
+		Float4 s1 = y * (y2 * (y2 * (y2 * Float4(-0.0046075748f) + Float4(0.0796819754f)) + Float4(-0.645963615f)) + Float4(1.5707963235f));
+		Float4 c2 = (c1 * c1) - (s1 * s1);
+		Float4 s2 = Float4(2.0f) * s1 * c1;
+		return Float4(2.0f) * s2 * c2 * reciprocal(s2 * s2 + c2 * c2, pp, true);
 	}
 
-	Float4 sine_pi(RValue<Float4> x, bool pp)
+	const Float4 A = Float4(-16.0f);
+	const Float4 B = Float4(8.0f);
+	const Float4 C = Float4(7.75160950e-1f);
+	const Float4 D = Float4(2.24839049e-1f);
+
+	// Parabola approximating sine
+	Float4 sin = y * (Abs(y) * A + B);
+
+	// Improve precision from 0.06 to 0.001
+	if(true)
 	{
-		const Float4 A = Float4(-4.05284734e-1f);   // -4/pi^2
-		const Float4 B = Float4(1.27323954e+0f);    // 4/pi
-		const Float4 C = Float4(7.75160950e-1f);
-		const Float4 D = Float4(2.24839049e-1f);
-
-		// Parabola approximating sine
-		Float4 sin = x * (Abs(x) * A + B);
-
-		// Improve precision from 0.06 to 0.001
-		if(true)
-		{
-			sin = sin * (Abs(sin) * D + C);
-		}
-
-		return sin;
+		sin = sin * (Abs(sin) * D + C);
 	}
 
-	Float4 cosine_pi(RValue<Float4> x, bool pp)
+	return sin;
+}
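
For reference, the low-precision (pp) path above collapses to the following scalar sketch (sin_fast and the driver are illustrative only; the 0.06 and 0.001 error figures are the ones quoted in the comments):

	#include <cmath>
	#include <cstdio>

	float sin_fast(float x)
	{
		float y = x * 1.59154943e-1f;   // x / 2pi
		y -= roundf(y);                 // reduce to [-0.5, 0.5] turns

		float s = y * (8.0f - 16.0f * fabsf(y));            // parabola, ~0.06 max error
		s = s * (0.224839049f * fabsf(s) + 0.775160950f);   // refinement, ~0.001
		return s;
	}

	int main()
	{
		for(float x = 0.0f; x < 6.3f; x += 0.7f)
		{
			printf("%f: %f vs %f\n", x, sin_fast(x), sinf(x));
		}
	}
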
+
+Float4 cosine(RValue<Float4> x, bool pp)
+{
+	// cos(x) = sin(x + pi/2)
+	Float4 y = x + Float4(1.57079632e+0f);
+	return sine(y, pp);
+}
+
+Float4 tangent(RValue<Float4> x, bool pp)
+{
+	return sine(x, pp) / cosine(x, pp);
+}
+
+Float4 arccos(RValue<Float4> x, bool pp)
+{
+	// pi/2 - arcsin(x)
+	return Float4(1.57079632e+0f) - arcsin(x);
+}
+
+Float4 arcsin(RValue<Float4> x, bool pp)
+{
+	if(false) // Simpler implementation fails even lowp precision tests
 	{
-		// cos(x) = sin(x + pi/2)
-		Float4 y = x + Float4(1.57079632e+0f);
-
-		// Wrap around
-		y -= As<Float4>(CmpNLT(y, Float4(3.14159265e+0f)) & As<Int4>(Float4(6.28318530e+0f)));
-
-		return sine_pi(y, pp);
+		// x*(pi/2-sqrt(1-x*x)*pi/5)
+		return x * (Float4(1.57079632e+0f) - Sqrt(Float4(1.0f) - x*x) * Float4(6.28318531e-1f));
 	}
-
-	Float4 sine(RValue<Float4> x, bool pp)
+	else
 	{
-		// Reduce to [-0.5, 0.5] range
-		Float4 y = x * Float4(1.59154943e-1f);   // 1/2pi
-		y = y - Round(y);
-
-		if(!pp)
-		{
-			// From the paper: "A Fast, Vectorizable Algorithm for Producing Single-Precision Sine-Cosine Pairs"
-			// This implementation passes OpenGL ES 3.0 precision requirements, at the cost of more operations:
-			// !pp : 17 mul, 7 add, 1 sub, 1 reciprocal
-			//  pp : 4 mul, 2 add, 2 abs
-
-			Float4 y2 = y * y;
-			Float4 c1 = y2 * (y2 * (y2 * Float4(-0.0204391631f) + Float4(0.2536086171f)) + Float4(-1.2336977925f)) + Float4(1.0f);
-			Float4 s1 = y * (y2 * (y2 * (y2 * Float4(-0.0046075748f) + Float4(0.0796819754f)) + Float4(-0.645963615f)) + Float4(1.5707963235f));
-			Float4 c2 = (c1 * c1) - (s1 * s1);
-			Float4 s2 = Float4(2.0f) * s1 * c1;
-			return Float4(2.0f) * s2 * c2 * reciprocal(s2 * s2 + c2 * c2, pp, true);
-		}
-
-		const Float4 A = Float4(-16.0f);
-		const Float4 B = Float4(8.0f);
-		const Float4 C = Float4(7.75160950e-1f);
-		const Float4 D = Float4(2.24839049e-1f);
-
-		// Parabola approximating sine
-		Float4 sin = y * (Abs(y) * A + B);
-
-		// Improve precision from 0.06 to 0.001
-		if(true)
-		{
-			sin = sin * (Abs(sin) * D + C);
-		}
-
-		return sin;
-	}
-
-	Float4 cosine(RValue<Float4> x, bool pp)
-	{
-		// cos(x) = sin(x + pi/2)
-		Float4 y = x + Float4(1.57079632e+0f);
-		return sine(y, pp);
-	}
-
-	Float4 tangent(RValue<Float4> x, bool pp)
-	{
-		return sine(x, pp) / cosine(x, pp);
-	}
-
-	Float4 arccos(RValue<Float4> x, bool pp)
-	{
-		// pi/2 - arcsin(x)
-		return Float4(1.57079632e+0f) - arcsin(x);
-	}
-
-	Float4 arcsin(RValue<Float4> x, bool pp)
-	{
-		if(false) // Simpler implementation fails even lowp precision tests
-		{
-			// x*(pi/2-sqrt(1-x*x)*pi/5)
-			return x * (Float4(1.57079632e+0f) - Sqrt(Float4(1.0f) - x*x) * Float4(6.28318531e-1f));
-		}
-		else
-		{
-			// From 4.4.45, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
-			const Float4 half_pi(1.57079632f);
-			const Float4 a0(1.5707288f);
-			const Float4 a1(-0.2121144f);
-			const Float4 a2(0.0742610f);
-			const Float4 a3(-0.0187293f);
-			Float4 absx = Abs(x);
-			return As<Float4>(As<Int4>(half_pi - Sqrt(Float4(1.0f) - absx) * (a0 + absx * (a1 + absx * (a2 + absx * a3)))) ^
-			       (As<Int4>(x) & Int4(0x80000000)));
-		}
-	}
-
-	// Approximation of atan in [0..1]
-	Float4 arctan_01(Float4 x, bool pp)
-	{
-		if(pp)
-		{
-			return x * (Float4(-0.27f) * x + Float4(1.05539816f));
-		}
-		else
-		{
-			// From 4.4.49, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
-			const Float4 a2(-0.3333314528f);
-			const Float4 a4(0.1999355085f);
-			const Float4 a6(-0.1420889944f);
-			const Float4 a8(0.1065626393f);
-			const Float4 a10(-0.0752896400f);
-			const Float4 a12(0.0429096138f);
-			const Float4 a14(-0.0161657367f);
-			const Float4 a16(0.0028662257f);
-			Float4 x2 = x * x;
-			return (x + x * (x2 * (a2 + x2 * (a4 + x2 * (a6 + x2 * (a8 + x2 * (a10 + x2 * (a12 + x2 * (a14 + x2 * a16)))))))));
-		}
-	}
-
-	Float4 arctan(RValue<Float4> x, bool pp)
-	{
-		Float4 absx = Abs(x);
-		Int4 O = CmpNLT(absx, Float4(1.0f));
-		Float4 y = As<Float4>((O & As<Int4>(Float4(1.0f) / absx)) | (~O & As<Int4>(absx))); // FIXME: Vector select
-
+		// From 4.4.45, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
 		const Float4 half_pi(1.57079632f);
-		Float4 theta = arctan_01(y, pp);
-		return As<Float4>(((O & As<Int4>(half_pi - theta)) | (~O & As<Int4>(theta))) ^ // FIXME: Vector select
+		const Float4 a0(1.5707288f);
+		const Float4 a1(-0.2121144f);
+		const Float4 a2(0.0742610f);
+		const Float4 a3(-0.0187293f);
+		Float4 absx = Abs(x);
+		return As<Float4>(As<Int4>(half_pi - Sqrt(Float4(1.0f) - absx) * (a0 + absx * (a1 + absx * (a2 + absx * a3)))) ^
 		       (As<Int4>(x) & Int4(0x80000000)));
 	}
-
-	Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp)
-	{
-		const Float4 pi(3.14159265f);            // pi
-		const Float4 minus_pi(-3.14159265f);     // -pi
-		const Float4 half_pi(1.57079632f);       // pi/2
-		const Float4 quarter_pi(7.85398163e-1f); // pi/4
-
-		// Rotate to upper semicircle when in lower semicircle
-		Int4 S = CmpLT(y, Float4(0.0f));
-		Float4 theta = As<Float4>(S & As<Int4>(minus_pi));
-		Float4 x0 = As<Float4>((As<Int4>(y) & Int4(0x80000000)) ^ As<Int4>(x));
-		Float4 y0 = Abs(y);
-
-		// Rotate to right quadrant when in left quadrant
-		Int4 Q = CmpLT(x0, Float4(0.0f));
-		theta += As<Float4>(Q & As<Int4>(half_pi));
-		Float4 x1 = As<Float4>((Q & As<Int4>(y0)) | (~Q & As<Int4>(x0)));  // FIXME: Vector select
-		Float4 y1 = As<Float4>((Q & As<Int4>(-x0)) | (~Q & As<Int4>(y0))); // FIXME: Vector select
-
-		// Mirror to first octant when in second octant
-		Int4 O = CmpNLT(y1, x1);
-		Float4 x2 = As<Float4>((O & As<Int4>(y1)) | (~O & As<Int4>(x1))); // FIXME: Vector select
-		Float4 y2 = As<Float4>((O & As<Int4>(x1)) | (~O & As<Int4>(y1))); // FIXME: Vector select
-
-		// Approximation of atan in [0..1]
-		Int4 zero_x = CmpEQ(x2, Float4(0.0f));
-		Int4 inf_y = IsInf(y2); // Since x2 >= y2, this means x2 == y2 == inf, so we use 45 degrees or pi/4
-		Float4 atan2_theta = arctan_01(y2 / x2, pp);
-		theta += As<Float4>((~zero_x & ~inf_y & ((O & As<Int4>(half_pi - atan2_theta)) | (~O & (As<Int4>(atan2_theta))))) | // FIXME: Vector select
-		                    (inf_y & As<Int4>(quarter_pi)));
-
-		// Recover loss of precision for tiny theta angles
-		Int4 precision_loss = S & Q & O & ~inf_y; // This combination results in (-pi + half_pi + half_pi - atan2_theta) which is equivalent to -atan2_theta
-		return As<Float4>((precision_loss & As<Int4>(-atan2_theta)) | (~precision_loss & As<Int4>(theta))); // FIXME: Vector select
-	}
-
-	Float4 sineh(RValue<Float4> x, bool pp)
-	{
-		return (exponential(x, pp) - exponential(-x, pp)) * Float4(0.5f);
-	}
-
-	Float4 cosineh(RValue<Float4> x, bool pp)
-	{
-		return (exponential(x, pp) + exponential(-x, pp)) * Float4(0.5f);
-	}
-
-	Float4 tangenth(RValue<Float4> x, bool pp)
-	{
-		Float4 e_x = exponential(x, pp);
-		Float4 e_minus_x = exponential(-x, pp);
-		return (e_x - e_minus_x) / (e_x + e_minus_x);
-	}
-
-	Float4 arccosh(RValue<Float4> x, bool pp)
-	{
-		return logarithm(x + Sqrt(x + Float4(1.0f)) * Sqrt(x - Float4(1.0f)), pp);
-	}
-
-	Float4 arcsinh(RValue<Float4> x, bool pp)
-	{
-		return logarithm(x + Sqrt(x * x + Float4(1.0f)), pp);
-	}
-
-	Float4 arctanh(RValue<Float4> x, bool pp)
-	{
-		return logarithm((Float4(1.0f) + x) / (Float4(1.0f) - x), pp) * Float4(0.5f);
-	}
-
-	Float4 dot2(const Vector4f &v0, const Vector4f &v1)
-	{
-		return v0.x * v1.x + v0.y * v1.y;
-	}
-
-	Float4 dot3(const Vector4f &v0, const Vector4f &v1)
-	{
-		return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z;
-	}
-
-	Float4 dot4(const Vector4f &v0, const Vector4f &v1)
-	{
-		return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z + v0.w * v1.w;
-	}
-
-	void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
-	{
-		Int2 tmp0 = UnpackHigh(row0, row1);
-		Int2 tmp1 = UnpackHigh(row2, row3);
-		Int2 tmp2 = UnpackLow(row0, row1);
-		Int2 tmp3 = UnpackLow(row2, row3);
-
-		row0 = UnpackLow(tmp2, tmp3);
-		row1 = UnpackHigh(tmp2, tmp3);
-		row2 = UnpackLow(tmp0, tmp1);
-		row3 = UnpackHigh(tmp0, tmp1);
-	}
-
-	void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
-	{
-		Int2 tmp0 = UnpackHigh(row0, row1);
-		Int2 tmp1 = UnpackHigh(row2, row3);
-		Int2 tmp2 = UnpackLow(row0, row1);
-		Int2 tmp3 = UnpackLow(row2, row3);
-
-		row0 = UnpackLow(tmp2, tmp3);
-		row1 = UnpackHigh(tmp2, tmp3);
-		row2 = UnpackLow(tmp0, tmp1);
-	}
-
-	void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
-	{
-		Float4 tmp0 = UnpackLow(row0, row1);
-		Float4 tmp1 = UnpackLow(row2, row3);
-		Float4 tmp2 = UnpackHigh(row0, row1);
-		Float4 tmp3 = UnpackHigh(row2, row3);
-
-		row0 = Float4(tmp0.xy, tmp1.xy);
-		row1 = Float4(tmp0.zw, tmp1.zw);
-		row2 = Float4(tmp2.xy, tmp3.xy);
-		row3 = Float4(tmp2.zw, tmp3.zw);
-	}
-
-	void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
-	{
-		Float4 tmp0 = UnpackLow(row0, row1);
-		Float4 tmp1 = UnpackLow(row2, row3);
-		Float4 tmp2 = UnpackHigh(row0, row1);
-		Float4 tmp3 = UnpackHigh(row2, row3);
-
-		row0 = Float4(tmp0.xy, tmp1.xy);
-		row1 = Float4(tmp0.zw, tmp1.zw);
-		row2 = Float4(tmp2.xy, tmp3.xy);
-	}
-
-	void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
-	{
-		Float4 tmp0 = UnpackLow(row0, row1);
-		Float4 tmp1 = UnpackLow(row2, row3);
-
-		row0 = Float4(tmp0.xy, tmp1.xy);
-		row1 = Float4(tmp0.zw, tmp1.zw);
-	}
-
-	void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
-	{
-		Float4 tmp0 = UnpackLow(row0, row1);
-		Float4 tmp1 = UnpackLow(row2, row3);
-
-		row0 = Float4(tmp0.xy, tmp1.xy);
-	}
-
-	void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
-	{
-		Float4 tmp01 = UnpackLow(row0, row1);
-		Float4 tmp23 = UnpackHigh(row0, row1);
-
-		row0 = tmp01;
-		row1 = Float4(tmp01.zw, row1.zw);
-		row2 = tmp23;
-		row3 = Float4(tmp23.zw, row3.zw);
-	}
-
-	void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N)
-	{
-		switch(N)
-		{
-		case 1: transpose4x1(row0, row1, row2, row3); break;
-		case 2: transpose4x2(row0, row1, row2, row3); break;
-		case 3: transpose4x3(row0, row1, row2, row3); break;
-		case 4: transpose4x4(row0, row1, row2, row3); break;
-		}
-	}
-
-	UInt4 halfToFloatBits(UInt4 halfBits)
-	{
-		auto magic = UInt4(126 << 23);
-
-		auto sign16 = halfBits & UInt4(0x8000);
-		auto man16  = halfBits & UInt4(0x3FF);
-		auto exp16  = halfBits & UInt4(0x7C00);
-
-		auto isDnormOrZero = CmpEQ(exp16, UInt4(0));
-		auto isInfOrNaN = CmpEQ(exp16, UInt4(0x7C00));
-
-		auto sign32 = sign16 << 16;
-		auto man32  = man16 << 13;
-		auto exp32  = (exp16 + UInt4(0x1C000)) << 13;
-		auto norm32 = (man32 | exp32) | (isInfOrNaN & UInt4(0x7F800000));
-
-		auto denorm32 = As<UInt4>(As<Float4>(magic + man16) - As<Float4>(magic));
-
-		return sign32 | (norm32 & ~isDnormOrZero) | (denorm32 & isDnormOrZero);
-	}
-
-
-	rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints)
-	{
-		return rr::SignMask(ints) != 0;
-	}
-
-	rr::RValue<rr::Bool> AnyFalse(rr::RValue<sw::SIMD::Int> const &ints)
-	{
-		return rr::SignMask(~ints) != 0;
-	}
-
-	rr::RValue<sw::SIMD::Float> Sign(rr::RValue<sw::SIMD::Float> const &val)
-	{
-		return rr::As<sw::SIMD::Float>((rr::As<sw::SIMD::UInt>(val) & sw::SIMD::UInt(0x80000000)) | sw::SIMD::UInt(0x3f800000));
-	}
-
-	// Returns the <whole, frac> of val.
-	// Both whole and frac will have the same sign as val.
-	std::pair<rr::RValue<sw::SIMD::Float>, rr::RValue<sw::SIMD::Float>>
-	Modf(rr::RValue<sw::SIMD::Float> const &val)
-	{
-		auto abs = Abs(val);
-		auto sign = Sign(val);
-		auto whole = Floor(abs) * sign;
-		auto frac = Frac(abs) * sign;
-		return std::make_pair(whole, frac);
-	}
-
-	// Returns the number of 1s in bits, per lane.
-	sw::SIMD::UInt CountBits(rr::RValue<sw::SIMD::UInt> const &bits)
-	{
-		// TODO: Add an intrinsic to reactor. Even if there isn't a
-		// single vector instruction, there may be target-dependent
-		// ways to make this faster.
-		// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-		sw::SIMD::UInt c = bits - ((bits >> 1) & sw::SIMD::UInt(0x55555555));
-		c = ((c >> 2) & sw::SIMD::UInt(0x33333333)) + (c & sw::SIMD::UInt(0x33333333));
-		c = ((c >> 4) + c) & sw::SIMD::UInt(0x0F0F0F0F);
-		c = ((c >> 8) + c) & sw::SIMD::UInt(0x00FF00FF);
-		c = ((c >> 16) + c) & sw::SIMD::UInt(0x0000FFFF);
-		return c;
-	}
-
-	// Returns 1 << bits.
-	// If the resulting bit overflows a 32 bit integer, 0 is returned.
-	rr::RValue<sw::SIMD::UInt> NthBit32(rr::RValue<sw::SIMD::UInt> const &bits)
-	{
-		return ((sw::SIMD::UInt(1) << bits) & rr::CmpLT(bits, sw::SIMD::UInt(32)));
-	}
-
-	// Returns bitCount number of 1's starting from the LSB.
-	rr::RValue<sw::SIMD::UInt> Bitmask32(rr::RValue<sw::SIMD::UInt> const &bitCount)
-	{
-		return NthBit32(bitCount) - sw::SIMD::UInt(1);
-	}
-
-	// Performs a fused-multiply add, returning a * b + c.
-	rr::RValue<sw::SIMD::Float> FMA(
-			rr::RValue<sw::SIMD::Float> const &a,
-			rr::RValue<sw::SIMD::Float> const &b,
-			rr::RValue<sw::SIMD::Float> const &c)
-	{
-		return a * b + c;
-	}
-
-	// Returns the exponent of the floating point number f.
-	// Assumes IEEE 754
-	rr::RValue<sw::SIMD::Int> Exponent(rr::RValue<sw::SIMD::Float> f)
-	{
-		auto v = rr::As<sw::SIMD::UInt>(f);
-		return (sw::SIMD::Int((v >> sw::SIMD::UInt(23)) & sw::SIMD::UInt(0xFF)) - sw::SIMD::Int(126));
-	}
-
-	// Returns y if y < x; otherwise result is x.
-	// If one operand is a NaN, the other operand is the result.
-	// If both operands are NaN, the result is a NaN.
-	rr::RValue<sw::SIMD::Float> NMin(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y)
-	{
-		using namespace rr;
-		auto xIsNan = IsNan(x);
-		auto yIsNan = IsNan(y);
-		return As<sw::SIMD::Float>(
-			// If neither are NaN, return min
-			((~xIsNan & ~yIsNan) & As<sw::SIMD::Int>(Min(x, y))) |
-			// If one operand is a NaN, the other operand is the result
-			// If both operands are NaN, the result is a NaN.
-			((~xIsNan &  yIsNan) & As<sw::SIMD::Int>(x)) |
-			(( xIsNan          ) & As<sw::SIMD::Int>(y)));
-	}
-
-	// Returns y if y > x; otherwise result is x.
-	// If one operand is a NaN, the other operand is the result.
-	// If both operands are NaN, the result is a NaN.
-	rr::RValue<sw::SIMD::Float> NMax(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y)
-	{
-		using namespace rr;
-		auto xIsNan = IsNan(x);
-		auto yIsNan = IsNan(y);
-		return As<sw::SIMD::Float>(
-			// If neither are NaN, return max
-			((~xIsNan & ~yIsNan) & As<sw::SIMD::Int>(Max(x, y))) |
-			// If one operand is a NaN, the other operand is the result
-			// If both operands are NaN, the result is a NaN.
-			((~xIsNan &  yIsNan) & As<sw::SIMD::Int>(x)) |
-			(( xIsNan          ) & As<sw::SIMD::Int>(y)));
-	}
-
-	// Returns the determinant of a 2x2 matrix.
-	rr::RValue<sw::SIMD::Float> Determinant(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
-		rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d)
-	{
-		return a*d - b*c;
-	}
-
-	// Returns the determinant of a 3x3 matrix.
-	rr::RValue<sw::SIMD::Float> Determinant(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
-		rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
-		rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i)
-	{
-		return a*e*i + b*f*g + c*d*h - c*e*g - b*d*i - a*f*h;
-	}
-
-	// Returns the determinant of a 4x4 matrix.
-	rr::RValue<sw::SIMD::Float> Determinant(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
-		rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
-		rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
-		rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p)
-	{
-		return a * Determinant(f, g, h,
-		                       j, k, l,
-		                       n, o, p) -
-		       b * Determinant(e, g, h,
-		                       i, k, l,
-		                       m, o, p) +
-		       c * Determinant(e, f, h,
-		                       i, j, l,
-		                       m, n, p) -
-		       d * Determinant(e, f, g,
-		                       i, j, k,
-		                       m, n, o);
-	}
-
-	// Returns the inverse of a 2x2 matrix.
-	std::array<rr::RValue<sw::SIMD::Float>, 4> MatrixInverse(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
-		rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d)
-	{
-		auto s = sw::SIMD::Float(1.0f) / Determinant(a, b, c, d);
-		return {{s*d, -s*b, -s*c, s*a}};
-	}
-
-	// Returns the inverse of a 3x3 matrix.
-	std::array<rr::RValue<sw::SIMD::Float>, 9> MatrixInverse(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
-		rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
-		rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i)
-	{
-		auto s = sw::SIMD::Float(1.0f) / Determinant(
-				a, b, c,
-				d, e, f,
-				g, h, i); // TODO: duplicate arithmetic calculating the det and below.
-
-		return {{
-			s * (e*i - f*h), s * (c*h - b*i), s * (b*f - c*e),
-			s * (f*g - d*i), s * (a*i - c*g), s * (c*d - a*f),
-			s * (d*h - e*g), s * (b*g - a*h), s * (a*e - b*d),
-		}};
-	}
-
-	// Returns the inverse of a 4x4 matrix.
-	std::array<rr::RValue<sw::SIMD::Float>, 16> MatrixInverse(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
-		rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
-		rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
-		rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p)
-	{
-		auto s = sw::SIMD::Float(1.0f) / Determinant(
-				a, b, c, d,
-				e, f, g, h,
-				i, j, k, l,
-				m, n, o, p); // TODO: duplicate arithmetic calculating the det and below.
-
-		auto kplo = k*p - l*o, jpln = j*p - l*n, jokn = j*o - k*n;
-		auto gpho = g*p - h*o, fphn = f*p - h*n, fogn = f*o - g*n;
-		auto glhk = g*l - h*k, flhj = f*l - h*j, fkgj = f*k - g*j;
-		auto iplm = i*p - l*m, iokm = i*o - k*m, ephm = e*p - h*m;
-		auto eogm = e*o - g*m, elhi = e*l - h*i, ekgi = e*k - g*i;
-		auto injm = i*n - j*m, enfm = e*n - f*m, ejfi = e*j - f*i;
-
-		return {{
-			s * ( f * kplo - g * jpln + h * jokn),
-			s * (-b * kplo + c * jpln - d * jokn),
-			s * ( b * gpho - c * fphn + d * fogn),
-			s * (-b * glhk + c * flhj - d * fkgj),
-
-			s * (-e * kplo + g * iplm - h * iokm),
-			s * ( a * kplo - c * iplm + d * iokm),
-			s * (-a * gpho + c * ephm - d * eogm),
-			s * ( a * glhk - c * elhi + d * ekgi),
-
-			s * ( e * jpln - f * iplm + h * injm),
-			s * (-a * jpln + b * iplm - d * injm),
-			s * ( a * fphn - b * ephm + d * enfm),
-			s * (-a * flhj + b * elhi - d * ejfi),
-
-			s * (-e * jokn + f * iokm - g * injm),
-			s * ( a * jokn - b * iokm + c * injm),
-			s * (-a * fogn + b * eogm - c * enfm),
-			s * ( a * fkgj - b * ekgi + c * ejfi),
-		}};
-	}
-
-	namespace SIMD {
-
-		Pointer::Pointer(rr::Pointer<Byte> base, rr::Int limit)
-			: base(base),
-				dynamicLimit(limit), staticLimit(0),
-				dynamicOffsets(0), staticOffsets{},
-				hasDynamicLimit(true), hasDynamicOffsets(false) {}
-
-		Pointer::Pointer(rr::Pointer<Byte> base, unsigned int limit)
-			: base(base),
-				dynamicLimit(0), staticLimit(limit),
-				dynamicOffsets(0), staticOffsets{},
-				hasDynamicLimit(false), hasDynamicOffsets(false) {}
-
-		Pointer::Pointer(rr::Pointer<Byte> base, rr::Int limit, SIMD::Int offset)
-			: base(base),
-				dynamicLimit(limit), staticLimit(0),
-				dynamicOffsets(offset), staticOffsets{},
-				hasDynamicLimit(true), hasDynamicOffsets(true) {}
-
-		Pointer::Pointer(rr::Pointer<Byte> base, unsigned int limit, SIMD::Int offset)
-			: base(base),
-				dynamicLimit(0), staticLimit(limit),
-				dynamicOffsets(offset), staticOffsets{},
-				hasDynamicLimit(false), hasDynamicOffsets(true) {}
-
-		Pointer& Pointer::operator += (Int i)
-		{
-			dynamicOffsets += i;
-			hasDynamicOffsets = true;
-			return *this;
-		}
-
-		Pointer& Pointer::operator *= (Int i)
-		{
-			dynamicOffsets = offsets() * i;
-			staticOffsets = {};
-			hasDynamicOffsets = true;
-			return *this;
-		}
-
-		Pointer Pointer::operator + (SIMD::Int i) { Pointer p = *this; p += i; return p; }
-		Pointer Pointer::operator * (SIMD::Int i) { Pointer p = *this; p *= i; return p; }
-
-		Pointer& Pointer::operator += (int i)
-		{
-			for (int el = 0; el < SIMD::Width; el++) { staticOffsets[el] += i; }
-			return *this;
-		}
-
-		Pointer& Pointer::operator *= (int i)
-		{
-			for (int el = 0; el < SIMD::Width; el++) { staticOffsets[el] *= i; }
-			if (hasDynamicOffsets)
-			{
-				dynamicOffsets *= SIMD::Int(i);
-			}
-			return *this;
-		}
-
-		Pointer Pointer::operator + (int i) { Pointer p = *this; p += i; return p; }
-		Pointer Pointer::operator * (int i) { Pointer p = *this; p *= i; return p; }
-
-		SIMD::Int Pointer::offsets() const
-		{
-			static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
-			return dynamicOffsets + SIMD::Int(staticOffsets[0], staticOffsets[1], staticOffsets[2], staticOffsets[3]);
-		}
-
-		SIMD::Int Pointer::isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
-		{
-			ASSERT(accessSize > 0);
-
-			if (isStaticallyInBounds(accessSize, robustness))
-			{
-				return SIMD::Int(0xffffffff);
-			}
-
-			if (!hasDynamicOffsets && !hasDynamicLimit)
-			{
-				// Common fast paths.
-				static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
-				return SIMD::Int(
-					(staticOffsets[0] + accessSize - 1 < staticLimit) ? 0xffffffff : 0,
-					(staticOffsets[1] + accessSize - 1 < staticLimit) ? 0xffffffff : 0,
-					(staticOffsets[2] + accessSize - 1 < staticLimit) ? 0xffffffff : 0,
-					(staticOffsets[3] + accessSize - 1 < staticLimit) ? 0xffffffff : 0);
-			}
-
-			return CmpLT(offsets() + SIMD::Int(accessSize - 1), SIMD::Int(limit()));
-		}
-
-		bool Pointer::isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
-		{
-			if (hasDynamicOffsets)
-			{
-				return false;
-			}
-
-			if (hasDynamicLimit)
-			{
-				if (hasStaticEqualOffsets() || hasStaticSequentialOffsets(accessSize))
-				{
-					switch(robustness)
-					{
-					case OutOfBoundsBehavior::UndefinedBehavior:
-						// With this robustness setting the application/compiler guarantees in-bounds accesses on active lanes,
-						// but since it can't know in advance which branches are taken, this must be true even for inactive lanes.
-						return true;
-					case OutOfBoundsBehavior::Nullify:
-					case OutOfBoundsBehavior::RobustBufferAccess:
-					case OutOfBoundsBehavior::UndefinedValue:
-						return false;
-					}
-				}
-			}
-
-			for (int i = 0; i < SIMD::Width; i++)
-			{
-				if (staticOffsets[i] + accessSize - 1 >= staticLimit)
-				{
-					return false;
-				}
-			}
-
-			return true;
-		}
-
-		Int Pointer::limit() const
-		{
-			return dynamicLimit + staticLimit;
-		}
-
-		// Returns true if all offsets are sequential
-		// (N+0*step, N+1*step, N+2*step, N+3*step)
-		rr::Bool Pointer::hasSequentialOffsets(unsigned int step) const
-		{
-			if (hasDynamicOffsets)
-			{
-				auto o = offsets();
-				static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
-				return rr::SignMask(~CmpEQ(o.yzww, o + SIMD::Int(1*step, 2*step, 3*step, 0))) == 0;
-			}
-			return hasStaticSequentialOffsets(step);
-		}
-
-		// Returns true if all offsets are compile-time static and
-		// sequential (N+0*step, N+1*step, N+2*step, N+3*step)
-		bool Pointer::hasStaticSequentialOffsets(unsigned int step) const
-		{
-			if (hasDynamicOffsets)
-			{
-				return false;
-			}
-			for (int i = 1; i < SIMD::Width; i++)
-			{
-				if (staticOffsets[i-1] + int32_t(step) != staticOffsets[i]) { return false; }
-			}
-			return true;
-		}
-
-		// Returns true if all offsets are equal (N, N, N, N)
-		rr::Bool Pointer::hasEqualOffsets() const
-		{
-			if (hasDynamicOffsets)
-			{
-				auto o = offsets();
-				static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
-				return rr::SignMask(~CmpEQ(o, o.yzwx)) == 0;
-			}
-			return hasStaticEqualOffsets();
-		}
-
-		// Returns true if all offsets are compile-time static and are equal
-		// (N, N, N, N)
-		bool Pointer::hasStaticEqualOffsets() const
-		{
-			if (hasDynamicOffsets)
-			{
-				return false;
-			}
-			for (int i = 1; i < SIMD::Width; i++)
-			{
-				if (staticOffsets[i-1] != staticOffsets[i]) { return false; }
-			}
-			return true;
-		}
-
-	}  // namespace SIMD
-
 }
+
+// Approximation of atan in [0..1]
+Float4 arctan_01(Float4 x, bool pp)
+{
+	if(pp)
+	{
+		return x * (Float4(-0.27f) * x + Float4(1.05539816f));
+	}
+	else
+	{
+		// From 4.4.49, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
+		const Float4 a2(-0.3333314528f);
+		const Float4 a4(0.1999355085f);
+		const Float4 a6(-0.1420889944f);
+		const Float4 a8(0.1065626393f);
+		const Float4 a10(-0.0752896400f);
+		const Float4 a12(0.0429096138f);
+		const Float4 a14(-0.0161657367f);
+		const Float4 a16(0.0028662257f);
+		Float4 x2 = x * x;
+		return (x + x * (x2 * (a2 + x2 * (a4 + x2 * (a6 + x2 * (a8 + x2 * (a10 + x2 * (a12 + x2 * (a14 + x2 * a16)))))))));
+	}
+}
+
+Float4 arctan(RValue<Float4> x, bool pp)
+{
+	Float4 absx = Abs(x);
+	Int4 O = CmpNLT(absx, Float4(1.0f));
+	Float4 y = As<Float4>((O & As<Int4>(Float4(1.0f) / absx)) | (~O & As<Int4>(absx))); // FIXME: Vector select
+
+	const Float4 half_pi(1.57079632f);
+	Float4 theta = arctan_01(y, pp);
+	return As<Float4>(((O & As<Int4>(half_pi - theta)) | (~O & As<Int4>(theta))) ^ // FIXME: Vector select
+	       (As<Int4>(x) & Int4(0x80000000)));
+}
+
+Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp)
+{
+	const Float4 pi(3.14159265f);            // pi
+	const Float4 minus_pi(-3.14159265f);     // -pi
+	const Float4 half_pi(1.57079632f);       // pi/2
+	const Float4 quarter_pi(7.85398163e-1f); // pi/4
+
+	// Rotate to upper semicircle when in lower semicircle
+	Int4 S = CmpLT(y, Float4(0.0f));
+	Float4 theta = As<Float4>(S & As<Int4>(minus_pi));
+	Float4 x0 = As<Float4>((As<Int4>(y) & Int4(0x80000000)) ^ As<Int4>(x));
+	Float4 y0 = Abs(y);
+
+	// Rotate to right quadrant when in left quadrant
+	Int4 Q = CmpLT(x0, Float4(0.0f));
+	theta += As<Float4>(Q & As<Int4>(half_pi));
+	Float4 x1 = As<Float4>((Q & As<Int4>(y0)) | (~Q & As<Int4>(x0)));  // FIXME: Vector select
+	Float4 y1 = As<Float4>((Q & As<Int4>(-x0)) | (~Q & As<Int4>(y0))); // FIXME: Vector select
+
+	// Mirror to first octant when in second octant
+	Int4 O = CmpNLT(y1, x1);
+	Float4 x2 = As<Float4>((O & As<Int4>(y1)) | (~O & As<Int4>(x1))); // FIXME: Vector select
+	Float4 y2 = As<Float4>((O & As<Int4>(x1)) | (~O & As<Int4>(y1))); // FIXME: Vector select
+
+	// Approximation of atan in [0..1]
+	Int4 zero_x = CmpEQ(x2, Float4(0.0f));
+	Int4 inf_y = IsInf(y2); // Since x2 >= y2, this means x2 == y2 == inf, so we use 45 degrees or pi/4
+	Float4 atan2_theta = arctan_01(y2 / x2, pp);
+	theta += As<Float4>((~zero_x & ~inf_y & ((O & As<Int4>(half_pi - atan2_theta)) | (~O & (As<Int4>(atan2_theta))))) | // FIXME: Vector select
+	                    (inf_y & As<Int4>(quarter_pi)));
+
+	// Recover loss of precision for tiny theta angles
+	Int4 precision_loss = S & Q & O & ~inf_y; // This combination results in (-pi + half_pi + half_pi - atan2_theta) which is equivalent to -atan2_theta
+	return As<Float4>((precision_loss & As<Int4>(-atan2_theta)) | (~precision_loss & As<Int4>(theta))); // FIXME: Vector select
+}
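
The branch-free masking above encodes a classical three-fold reduction; a hypothetical scalar restatement (atan2_scalar is illustrative, and atan01 stands in for arctan_01):

	#include <cmath>

	float atan01(float t)   // any atan approximation valid on [0, 1]
	{
		return atanf(t);
	}

	float atan2_scalar(float y, float x)
	{
		float theta = 0.0f;
		if(y < 0.0f) { x = -x; y = -y; theta = -3.14159265f; }              // rotate by pi
		if(x < 0.0f) { float t = x; x = y; y = -t; theta += 1.57079632f; }  // rotate by pi/2
		if(y >= x)   // mirror into the first octant
		{
			theta += (y > 0.0f) ? 1.57079632f - atan01(x / y) : 0.0f;
		}
		else
		{
			theta += atan01(y / x);
		}
		// The SIMD version additionally forces x == y == inf to pi/4 and
		// restores precision for tiny negative angles.
		return theta;
	}
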
+
+Float4 sineh(RValue<Float4> x, bool pp)
+{
+	return (exponential(x, pp) - exponential(-x, pp)) * Float4(0.5f);
+}
+
+Float4 cosineh(RValue<Float4> x, bool pp)
+{
+	return (exponential(x, pp) + exponential(-x, pp)) * Float4(0.5f);
+}
+
+Float4 tangenth(RValue<Float4> x, bool pp)
+{
+	Float4 e_x = exponential(x, pp);
+	Float4 e_minus_x = exponential(-x, pp);
+	return (e_x - e_minus_x) / (e_x + e_minus_x);
+}
+
+Float4 arccosh(RValue<Float4> x, bool pp)
+{
+	return logarithm(x + Sqrt(x + Float4(1.0f)) * Sqrt(x - Float4(1.0f)), pp);
+}
+
+Float4 arcsinh(RValue<Float4> x, bool pp)
+{
+	return logarithm(x + Sqrt(x * x + Float4(1.0f)), pp);
+}
+
+Float4 arctanh(RValue<Float4> x, bool pp)
+{
+	return logarithm((Float4(1.0f) + x) / (Float4(1.0f) - x), pp) * Float4(0.5f);
+}
+
+Float4 dot2(const Vector4f &v0, const Vector4f &v1)
+{
+	return v0.x * v1.x + v0.y * v1.y;
+}
+
+Float4 dot3(const Vector4f &v0, const Vector4f &v1)
+{
+	return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z;
+}
+
+Float4 dot4(const Vector4f &v0, const Vector4f &v1)
+{
+	return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z + v0.w * v1.w;
+}
+
+void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
+{
+	Int2 tmp0 = UnpackHigh(row0, row1);
+	Int2 tmp1 = UnpackHigh(row2, row3);
+	Int2 tmp2 = UnpackLow(row0, row1);
+	Int2 tmp3 = UnpackLow(row2, row3);
+
+	row0 = UnpackLow(tmp2, tmp3);
+	row1 = UnpackHigh(tmp2, tmp3);
+	row2 = UnpackLow(tmp0, tmp1);
+	row3 = UnpackHigh(tmp0, tmp1);
+}
+
+void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
+{
+	Int2 tmp0 = UnpackHigh(row0, row1);
+	Int2 tmp1 = UnpackHigh(row2, row3);
+	Int2 tmp2 = UnpackLow(row0, row1);
+	Int2 tmp3 = UnpackLow(row2, row3);
+
+	row0 = UnpackLow(tmp2, tmp3);
+	row1 = UnpackHigh(tmp2, tmp3);
+	row2 = UnpackLow(tmp0, tmp1);
+}
+
+void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
+{
+	Float4 tmp0 = UnpackLow(row0, row1);
+	Float4 tmp1 = UnpackLow(row2, row3);
+	Float4 tmp2 = UnpackHigh(row0, row1);
+	Float4 tmp3 = UnpackHigh(row2, row3);
+
+	row0 = Float4(tmp0.xy, tmp1.xy);
+	row1 = Float4(tmp0.zw, tmp1.zw);
+	row2 = Float4(tmp2.xy, tmp3.xy);
+	row3 = Float4(tmp2.zw, tmp3.zw);
+}
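
The Float4 overload above is the standard unpack-based 4x4 transpose; schematically (restating the data movement, with input rows a, b, c, d):

	// rows in:  a0 a1 a2 a3 | b0 b1 b2 b3 | c0 c1 c2 c3 | d0 d1 d2 d3
	// UnpackLow(row0, row1)  -> a0 b0 a1 b1     UnpackLow(row2, row3)  -> c0 d0 c1 d1
	// UnpackHigh(row0, row1) -> a2 b2 a3 b3     UnpackHigh(row2, row3) -> c2 d2 c3 d3
	// Float4(tmp0.xy, tmp1.xy) -> a0 b0 c0 d0   (first output row; the rest follow)
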
+
+void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
+{
+	Float4 tmp0 = UnpackLow(row0, row1);
+	Float4 tmp1 = UnpackLow(row2, row3);
+	Float4 tmp2 = UnpackHigh(row0, row1);
+	Float4 tmp3 = UnpackHigh(row2, row3);
+
+	row0 = Float4(tmp0.xy, tmp1.xy);
+	row1 = Float4(tmp0.zw, tmp1.zw);
+	row2 = Float4(tmp2.xy, tmp3.xy);
+}
+
+void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
+{
+	Float4 tmp0 = UnpackLow(row0, row1);
+	Float4 tmp1 = UnpackLow(row2, row3);
+
+	row0 = Float4(tmp0.xy, tmp1.xy);
+	row1 = Float4(tmp0.zw, tmp1.zw);
+}
+
+void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
+{
+	Float4 tmp0 = UnpackLow(row0, row1);
+	Float4 tmp1 = UnpackLow(row2, row3);
+
+	row0 = Float4(tmp0.xy, tmp1.xy);
+}
+
+void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
+{
+	Float4 tmp01 = UnpackLow(row0, row1);
+	Float4 tmp23 = UnpackHigh(row0, row1);
+
+	row0 = tmp01;
+	row1 = Float4(tmp01.zw, row1.zw);
+	row2 = tmp23;
+	row3 = Float4(tmp23.zw, row3.zw);
+}
+
+void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N)
+{
+	switch(N)
+	{
+	case 1: transpose4x1(row0, row1, row2, row3); break;
+	case 2: transpose4x2(row0, row1, row2, row3); break;
+	case 3: transpose4x3(row0, row1, row2, row3); break;
+	case 4: transpose4x4(row0, row1, row2, row3); break;
+	}
+}
+
+UInt4 halfToFloatBits(UInt4 halfBits)
+{
+	auto magic = UInt4(126 << 23);
+
+	auto sign16 = halfBits & UInt4(0x8000);
+	auto man16  = halfBits & UInt4(0x3FF);
+	auto exp16  = halfBits & UInt4(0x7C00);
+
+	auto isDnormOrZero = CmpEQ(exp16, UInt4(0));
+	auto isInfOrNaN = CmpEQ(exp16, UInt4(0x7C00));
+
+	auto sign32 = sign16 << 16;
+	auto man32  = man16 << 13;
+	auto exp32  = (exp16 + UInt4(0x1C000)) << 13;
+	auto norm32 = (man32 | exp32) | (isInfOrNaN & UInt4(0x7F800000));
+
+	auto denorm32 = As<UInt4>(As<Float4>(magic + man16) - As<Float4>(magic));
+
+	return sign32 | (norm32 & ~isDnormOrZero) | (denorm32 & isDnormOrZero);
+}
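
The denormal path relies on a float trick: adding the 10 mantissa bits to the bit pattern of 0.5f (126 << 23) produces 0.5 + man16 * 2^-24 exactly, so subtracting 0.5f recovers the denormal's value man16 * 2^-24 without shifts or table lookups. A scalar sketch of just that branch (plain C++, hypothetical helper name):

```cpp
#include <cstdint>
#include <cstring>
#include <cstdio>

static float bitsToFloat(uint32_t u) { float f; std::memcpy(&f, &u, sizeof(f)); return f; }

int main()
{
	uint32_t man16 = 0x0001u & 0x3FFu;  // mantissa of the smallest half denormal (2^-24)
	uint32_t magic = 126u << 23;        // bit pattern of 0.5f
	// (0.5 + man16 * 2^-24) is exactly representable, so the subtraction is exact.
	float denorm = bitsToFloat(magic + man16) - bitsToFloat(magic);
	printf("%g\n", denorm);             // 5.96046e-08 == 2^-24
}
```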
+
+rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints)
+{
+	return rr::SignMask(ints) != 0;
+}
+
+rr::RValue<rr::Bool> AnyFalse(rr::RValue<sw::SIMD::Int> const &ints)
+{
+	return rr::SignMask(~ints) != 0;
+}
+
+rr::RValue<sw::SIMD::Float> Sign(rr::RValue<sw::SIMD::Float> const &val)
+{
+	return rr::As<sw::SIMD::Float>((rr::As<sw::SIMD::UInt>(val) & sw::SIMD::UInt(0x80000000)) | sw::SIMD::UInt(0x3f800000));
+}
+
+// Returns the <whole, frac> of val.
+// Both whole and frac will have the same sign as val.
+std::pair<rr::RValue<sw::SIMD::Float>, rr::RValue<sw::SIMD::Float>>
+Modf(rr::RValue<sw::SIMD::Float> const &val)
+{
+	auto abs = Abs(val);
+	auto sign = Sign(val);
+	auto whole = Floor(abs) * sign;
+	auto frac = Frac(abs) * sign;
+	return std::make_pair(whole, frac);
+}
+
+// Returns the number of 1s in bits, per lane.
+sw::SIMD::UInt CountBits(rr::RValue<sw::SIMD::UInt> const &bits)
+{
+	// TODO: Add an intrinsic to reactor. Even if there isn't a
+	// single vector instruction, there may be target-dependent
+	// ways to make this faster.
+	// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+	sw::SIMD::UInt c = bits - ((bits >> 1) & sw::SIMD::UInt(0x55555555));
+	c = ((c >> 2) & sw::SIMD::UInt(0x33333333)) + (c & sw::SIMD::UInt(0x33333333));
+	c = ((c >> 4) + c) & sw::SIMD::UInt(0x0F0F0F0F);
+	c = ((c >> 8) + c) & sw::SIMD::UInt(0x00FF00FF);
+	c = ((c >> 16) + c) & sw::SIMD::UInt(0x0000FFFF);
+	return c;
+}
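
This is the classic SWAR population count: pair sums first, then nibble, byte, and halfword sums. The same steps on one 32-bit lane in plain C++ (illustrative sketch):

```cpp
#include <cstdint>
#include <cstdio>

uint32_t countBits(uint32_t b)
{
	uint32_t c = b - ((b >> 1) & 0x55555555u);         // sums of adjacent bit pairs
	c = ((c >> 2) & 0x33333333u) + (c & 0x33333333u);  // 4-bit sums
	c = ((c >> 4) + c) & 0x0F0F0F0Fu;                  // 8-bit sums
	c = ((c >> 8) + c) & 0x00FF00FFu;                  // 16-bit sums
	return ((c >> 16) + c) & 0x0000FFFFu;              // final 32-bit sum
}

int main() { printf("%u\n", countBits(0xF0F01234u)); }  // prints 13
```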
+
+// Returns 1 << bits.
+// If the resulting bit overflows a 32 bit integer, 0 is returned.
+rr::RValue<sw::SIMD::UInt> NthBit32(rr::RValue<sw::SIMD::UInt> const &bits)
+{
+	return ((sw::SIMD::UInt(1) << bits) & rr::CmpLT(bits, sw::SIMD::UInt(32)));
+}
+
+// Returns bitCount number of 1's starting from the LSB.
+rr::RValue<sw::SIMD::UInt> Bitmask32(rr::RValue<sw::SIMD::UInt> const &bitCount)
+{
+	return NthBit32(bitCount) - sw::SIMD::UInt(1);
+}
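
Note the interplay with NthBit32: for bitCount == 32 the shift result is masked to 0, and the unsigned subtraction wraps to 0xFFFFFFFF, i.e. the full mask. A scalar model (hypothetical helper names):

```cpp
#include <cstdint>
#include <cstdio>

uint32_t nthBit32(uint32_t bits)   { return bits < 32 ? (1u << bits) : 0u; }
uint32_t bitmask32(uint32_t count) { return nthBit32(count) - 1u; }

int main()
{
	printf("%08x %08x %08x\n", bitmask32(0), bitmask32(5), bitmask32(32));
	// 00000000 0000001f ffffffff
}
```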
+
+// Performs a fused multiply-add, returning a * b + c.
+rr::RValue<sw::SIMD::Float> FMA(
+		rr::RValue<sw::SIMD::Float> const &a,
+		rr::RValue<sw::SIMD::Float> const &b,
+		rr::RValue<sw::SIMD::Float> const &c)
+{
+	return a * b + c;
+}
+
+// Returns the exponent of the floating point number f.
+// Assumes IEEE 754. (Follows the frexp() convention: f == significand * 2^exponent, with the significand in [0.5, 1.0).)
+rr::RValue<sw::SIMD::Int> Exponent(rr::RValue<sw::SIMD::Float> f)
+{
+	auto v = rr::As<sw::SIMD::UInt>(f);
+	return (sw::SIMD::Int((v >> sw::SIMD::UInt(23)) & sw::SIMD::UInt(0xFF)) - sw::SIMD::Int(126));
+}
+
+// Returns y if y < x; otherwise result is x.
+// If one operand is a NaN, the other operand is the result.
+// If both operands are NaN, the result is a NaN.
+rr::RValue<sw::SIMD::Float> NMin(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y)
+{
+	using namespace rr;
+	auto xIsNan = IsNan(x);
+	auto yIsNan = IsNan(y);
+	return As<sw::SIMD::Float>(
+		// If neither are NaN, return min
+		((~xIsNan & ~yIsNan) & As<sw::SIMD::Int>(Min(x, y))) |
+		// If one operand is a NaN, the other operand is the result
+		// If both operands are NaN, the result is a NaN.
+		((~xIsNan &  yIsNan) & As<sw::SIMD::Int>(x)) |
+		(( xIsNan          ) & As<sw::SIMD::Int>(y)));
+}
+
+// Returns y if y > x; otherwise result is x.
+// If one operand is a NaN, the other operand is the result.
+// If both operands are NaN, the result is a NaN.
+rr::RValue<sw::SIMD::Float> NMax(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y)
+{
+	using namespace rr;
+	auto xIsNan = IsNan(x);
+	auto yIsNan = IsNan(y);
+	return As<sw::SIMD::Float>(
+		// If neither are NaN, return max
+		((~xIsNan & ~yIsNan) & As<sw::SIMD::Int>(Max(x, y))) |
+		// If one operand is a NaN, the other operand is the result
+		// If both operands are NaN, the result is a NaN.
+		((~xIsNan &  yIsNan) & As<sw::SIMD::Int>(x)) |
+		(( xIsNan          ) & As<sw::SIMD::Int>(y)));
+}
+
+// Returns the determinant of a 2x2 matrix.
+rr::RValue<sw::SIMD::Float> Determinant(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
+	rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d)
+{
+	return a*d - b*c;
+}
+
+// Returns the determinant of a 3x3 matrix.
+rr::RValue<sw::SIMD::Float> Determinant(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
+	rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
+	rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i)
+{
+	return a*e*i + b*f*g + c*d*h - c*e*g - b*d*i - a*f*h;
+}
+
+// Returns the determinant of a 4x4 matrix.
+rr::RValue<sw::SIMD::Float> Determinant(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
+	rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
+	rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
+	rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p)
+{
+	return a * Determinant(f, g, h,
+	                       j, k, l,
+	                       n, o, p) -
+	       b * Determinant(e, g, h,
+	                       i, k, l,
+	                       m, o, p) +
+	       c * Determinant(e, f, h,
+	                       i, j, l,
+	                       m, n, p) -
+	       d * Determinant(e, f, g,
+	                       i, j, k,
+	                       m, n, o);
+}
+
+// Returns the inverse of a 2x2 matrix.
+std::array<rr::RValue<sw::SIMD::Float>, 4> MatrixInverse(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
+	rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d)
+{
+	auto s = sw::SIMD::Float(1.0f) / Determinant(a, b, c, d);
+	return {{s*d, -s*b, -s*c, s*a}};
+}
+
+// Returns the inverse of a 3x3 matrix.
+std::array<rr::RValue<sw::SIMD::Float>, 9> MatrixInverse(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
+	rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
+	rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i)
+{
+	auto s = sw::SIMD::Float(1.0f) / Determinant(
+			a, b, c,
+			d, e, f,
+			g, h, i); // TODO: duplicate arithmetic calculating the det and below.
+
+	return {{
+		s * (e*i - f*h), s * (c*h - b*i), s * (b*f - c*e),
+		s * (f*g - d*i), s * (a*i - c*g), s * (c*d - a*f),
+		s * (d*h - e*g), s * (b*g - a*h), s * (a*e - b*d),
+	}};
+}
+
+// Returns the inverse of a 4x4 matrix.
+std::array<rr::RValue<sw::SIMD::Float>, 16> MatrixInverse(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
+	rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
+	rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
+	rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p)
+{
+	auto s = sw::SIMD::Float(1.0f) / Determinant(
+			a, b, c, d,
+			e, f, g, h,
+			i, j, k, l,
+			m, n, o, p); // TODO: duplicate arithmetic calculating the det and below.
+
+	auto kplo = k*p - l*o, jpln = j*p - l*n, jokn = j*o - k*n;
+	auto gpho = g*p - h*o, fphn = f*p - h*n, fogn = f*o - g*n;
+	auto glhk = g*l - h*k, flhj = f*l - h*j, fkgj = f*k - g*j;
+	auto iplm = i*p - l*m, iokm = i*o - k*m, ephm = e*p - h*m;
+	auto eogm = e*o - g*m, elhi = e*l - h*i, ekgi = e*k - g*i;
+	auto injm = i*n - j*m, enfm = e*n - f*m, ejfi = e*j - f*i;
+
+	return {{
+		s * ( f * kplo - g * jpln + h * jokn),
+		s * (-b * kplo + c * jpln - d * jokn),
+		s * ( b * gpho - c * fphn + d * fogn),
+		s * (-b * glhk + c * flhj - d * fkgj),
+
+		s * (-e * kplo + g * iplm - h * iokm),
+		s * ( a * kplo - c * iplm + d * iokm),
+		s * (-a * gpho + c * ephm - d * eogm),
+		s * ( a * glhk - c * elhi + d * ekgi),
+
+		s * ( e * jpln - f * iplm + h * injm),
+		s * (-a * jpln + b * iplm - d * injm),
+		s * ( a * fphn - b * ephm + d * enfm),
+		s * (-a * flhj + b * elhi - d * ejfi),
+
+		s * (-e * jokn + f * iokm - g * injm),
+		s * ( a * jokn - b * iokm + c * injm),
+		s * (-a * fogn + b * eogm - c * enfm),
+		s * ( a * fkgj - b * ekgi + c * ejfi),
+	}};
+}
+
+namespace SIMD {
+
+Pointer::Pointer(rr::Pointer<Byte> base, rr::Int limit)
+	: base(base),
+		dynamicLimit(limit), staticLimit(0),
+		dynamicOffsets(0), staticOffsets{},
+		hasDynamicLimit(true), hasDynamicOffsets(false) {}
+
+Pointer::Pointer(rr::Pointer<Byte> base, unsigned int limit)
+	: base(base),
+		dynamicLimit(0), staticLimit(limit),
+		dynamicOffsets(0), staticOffsets{},
+		hasDynamicLimit(false), hasDynamicOffsets(false) {}
+
+Pointer::Pointer(rr::Pointer<Byte> base, rr::Int limit, SIMD::Int offset)
+	: base(base),
+		dynamicLimit(limit), staticLimit(0),
+		dynamicOffsets(offset), staticOffsets{},
+		hasDynamicLimit(true), hasDynamicOffsets(true) {}
+
+Pointer::Pointer(rr::Pointer<Byte> base, unsigned int limit, SIMD::Int offset)
+	: base(base),
+		dynamicLimit(0), staticLimit(limit),
+		dynamicOffsets(offset), staticOffsets{},
+		hasDynamicLimit(false), hasDynamicOffsets(true) {}
+
+Pointer& Pointer::operator += (Int i)
+{
+	dynamicOffsets += i;
+	hasDynamicOffsets = true;
+	return *this;
+}
+
+Pointer& Pointer::operator *= (Int i)
+{
+	dynamicOffsets = offsets() * i;
+	staticOffsets = {};
+	hasDynamicOffsets = true;
+	return *this;
+}
+
+Pointer Pointer::operator + (SIMD::Int i) { Pointer p = *this; p += i; return p; }
+Pointer Pointer::operator * (SIMD::Int i) { Pointer p = *this; p *= i; return p; }
+
+Pointer& Pointer::operator += (int i)
+{
+	for (int el = 0; el < SIMD::Width; el++) { staticOffsets[el] += i; }
+	return *this;
+}
+
+Pointer& Pointer::operator *= (int i)
+{
+	for (int el = 0; el < SIMD::Width; el++) { staticOffsets[el] *= i; }
+	if (hasDynamicOffsets)
+	{
+		dynamicOffsets *= SIMD::Int(i);
+	}
+	return *this;
+}
+
+Pointer Pointer::operator + (int i) { Pointer p = *this; p += i; return p; }
+Pointer Pointer::operator * (int i) { Pointer p = *this; p *= i; return p; }
+
+SIMD::Int Pointer::offsets() const
+{
+	static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
+	return dynamicOffsets + SIMD::Int(staticOffsets[0], staticOffsets[1], staticOffsets[2], staticOffsets[3]);
+}
+
+SIMD::Int Pointer::isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
+{
+	ASSERT(accessSize > 0);
+
+	if (isStaticallyInBounds(accessSize, robustness))
+	{
+		return SIMD::Int(0xffffffff);
+	}
+
+	if (!hasDynamicOffsets && !hasDynamicLimit)
+	{
+		// Common fast paths.
+		static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
+		return SIMD::Int(
+			(staticOffsets[0] + accessSize - 1 < staticLimit) ? 0xffffffff : 0,
+			(staticOffsets[1] + accessSize - 1 < staticLimit) ? 0xffffffff : 0,
+			(staticOffsets[2] + accessSize - 1 < staticLimit) ? 0xffffffff : 0,
+			(staticOffsets[3] + accessSize - 1 < staticLimit) ? 0xffffffff : 0);
+	}
+
+	return CmpLT(offsets() + SIMD::Int(accessSize - 1), SIMD::Int(limit()));
+}
+
+bool Pointer::isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
+{
+	if (hasDynamicOffsets)
+	{
+		return false;
+	}
+
+	if (hasDynamicLimit)
+	{
+		if (hasStaticEqualOffsets() || hasStaticSequentialOffsets(accessSize))
+		{
+			switch(robustness)
+			{
+			case OutOfBoundsBehavior::UndefinedBehavior:
+				// With this robustness setting the application/compiler guarantees in-bounds accesses on active lanes,
+				// but since it can't know in advance which branches are taken this must be true even for inactive lanes.
+				return true;
+			case OutOfBoundsBehavior::Nullify:
+			case OutOfBoundsBehavior::RobustBufferAccess:
+			case OutOfBoundsBehavior::UndefinedValue:
+				return false;
+			}
+		}
+	}
+
+	for (int i = 0; i < SIMD::Width; i++)
+	{
+		if (staticOffsets[i] + accessSize - 1 >= staticLimit)
+		{
+			return false;
+		}
+	}
+
+	return true;
+}
+
+Int Pointer::limit() const
+{
+	return dynamicLimit + staticLimit;
+}
+
+// Returns true if all offsets are sequential
+// (N+0*step, N+1*step, N+2*step, N+3*step)
+rr::Bool Pointer::hasSequentialOffsets(unsigned int step) const
+{
+	if (hasDynamicOffsets)
+	{
+		auto o = offsets();
+		static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
+		return rr::SignMask(~CmpEQ(o.yzww, o + SIMD::Int(1*step, 2*step, 3*step, 0))) == 0;
+	}
+	return hasStaticSequentialOffsets(step);
+}
+
+// Returns true if all offsets are compile-time static and
+// sequential (N+0*step, N+1*step, N+2*step, N+3*step)
+bool Pointer::hasStaticSequentialOffsets(unsigned int step) const
+{
+	if (hasDynamicOffsets)
+	{
+		return false;
+	}
+	for (int i = 1; i < SIMD::Width; i++)
+	{
+		if (staticOffsets[i-1] + int32_t(step) != staticOffsets[i]) { return false; }
+	}
+	return true;
+}
+
+// Returns true if all offsets are equal (N, N, N, N)
+rr::Bool Pointer::hasEqualOffsets() const
+{
+	if (hasDynamicOffsets)
+	{
+		auto o = offsets();
+		static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
+		return rr::SignMask(~CmpEQ(o, o.yzwx)) == 0;
+	}
+	return hasStaticEqualOffsets();
+}
+
+// Returns true if all offsets are compile-time static and are equal
+// (N, N, N, N)
+bool Pointer::hasStaticEqualOffsets() const
+{
+	if (hasDynamicOffsets)
+	{
+		return false;
+	}
+	for (int i = 1; i < SIMD::Width; i++)
+	{
+		if (staticOffsets[i-1] != staticOffsets[i]) { return false; }
+	}
+	return true;
+}
+
+}  // namespace SIMD
+
+}  // namespace sw
diff --git a/src/Pipeline/ShaderCore.hpp b/src/Pipeline/ShaderCore.hpp
index e95d4d0..8ac2cc2 100644
--- a/src/Pipeline/ShaderCore.hpp
+++ b/src/Pipeline/ShaderCore.hpp
@@ -23,507 +23,508 @@
 #include <array>
 #include <utility> // std::pair
 
-namespace sw
+namespace sw {
+
+using namespace rr;
+
+class Vector4s
 {
-	using namespace rr;
+public:
+	Vector4s();
+	Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
+	Vector4s(const Vector4s &rhs);
 
-	class Vector4s
-	{
-	public:
-		Vector4s();
-		Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
-		Vector4s(const Vector4s &rhs);
+	Short4 &operator[](int i);
+	Vector4s &operator=(const Vector4s &rhs);
 
-		Short4 &operator[](int i);
-		Vector4s &operator=(const Vector4s &rhs);
+	Short4 x;
+	Short4 y;
+	Short4 z;
+	Short4 w;
+};
 
-		Short4 x;
-		Short4 y;
-		Short4 z;
-		Short4 w;
-	};
+class Vector4f
+{
+public:
+	Vector4f();
+	Vector4f(float x, float y, float z, float w);
+	Vector4f(const Vector4f &rhs);
 
-	class Vector4f
-	{
-	public:
-		Vector4f();
-		Vector4f(float x, float y, float z, float w);
-		Vector4f(const Vector4f &rhs);
+	Float4 &operator[](int i);
+	Vector4f &operator=(const Vector4f &rhs);
 
-		Float4 &operator[](int i);
-		Vector4f &operator=(const Vector4f &rhs);
+	Float4 x;
+	Float4 y;
+	Float4 z;
+	Float4 w;
+};
 
-		Float4 x;
-		Float4 y;
-		Float4 z;
-		Float4 w;
-	};
+enum class OutOfBoundsBehavior
+{
+	Nullify,             // Loads become zero, stores are elided.
+	RobustBufferAccess,  // As defined by the Vulkan spec (in short: access anywhere within bounds, or zeroing).
+	UndefinedValue,      // Only for load operations. Not secure. No program termination.
+	UndefinedBehavior,   // Program may terminate.
+};
 
-	enum class OutOfBoundsBehavior
-	{
-		Nullify,             // Loads become zero, stores are elided.
-		RobustBufferAccess,  // As defined by the Vulkan spec (in short: access anywhere within bounds, or zeroing).
-		UndefinedValue,      // Only for load operations. Not secure. No program termination.
-		UndefinedBehavior,   // Program may terminate.
-	};
+// SIMD contains types that represent multiple scalars packed into a single
+// vector data type. Types in the SIMD namespace provide a semantic hint
+// that the data should be treated as a per-execution-lane scalar instead of
+// a typical euclidean-style vector type.
+namespace SIMD {
 
-	// SIMD contains types that represent multiple scalars packed into a single
-	// vector data type. Types in the SIMD namespace provide a semantic hint
-	// that the data should be treated as a per-execution-lane scalar instead of
-	// a typical euclidean-style vector type.
-	namespace SIMD
-	{
-		// Width is the number of per-lane scalars packed into each SIMD vector.
-		static constexpr int Width = 4;
+// Width is the number of per-lane scalars packed into each SIMD vector.
+static constexpr int Width = 4;
 
-		using Float = rr::Float4;
-		using Int = rr::Int4;
-		using UInt = rr::UInt4;
+using Float = rr::Float4;
+using Int = rr::Int4;
+using UInt = rr::UInt4;
 
-		struct Pointer
-		{
-			Pointer(rr::Pointer<Byte> base, rr::Int limit);
-			Pointer(rr::Pointer<Byte> base, unsigned int limit);
-			Pointer(rr::Pointer<Byte> base, rr::Int limit, SIMD::Int offset);
-			Pointer(rr::Pointer<Byte> base, unsigned int limit, SIMD::Int offset);
+struct Pointer
+{
+	Pointer(rr::Pointer<Byte> base, rr::Int limit);
+	Pointer(rr::Pointer<Byte> base, unsigned int limit);
+	Pointer(rr::Pointer<Byte> base, rr::Int limit, SIMD::Int offset);
+	Pointer(rr::Pointer<Byte> base, unsigned int limit, SIMD::Int offset);
 
-			Pointer& operator += (Int i);
-			Pointer& operator *= (Int i);
+	Pointer& operator += (Int i);
+	Pointer& operator *= (Int i);
 
-			Pointer operator + (SIMD::Int i);
-			Pointer operator * (SIMD::Int i);
+	Pointer operator + (SIMD::Int i);
+	Pointer operator * (SIMD::Int i);
 
-			Pointer& operator += (int i);
-			Pointer& operator *= (int i);
+	Pointer& operator += (int i);
+	Pointer& operator *= (int i);
 
-			Pointer operator + (int i);
-			Pointer operator * (int i);
+	Pointer operator + (int i);
+	Pointer operator * (int i);
 
-			SIMD::Int offsets() const;
+	SIMD::Int offsets() const;
 
-			SIMD::Int isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
+	SIMD::Int isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
 
-			bool isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
+	bool isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
 
-			Int limit() const;
+	Int limit() const;
 
-			// Returns true if all offsets are sequential
-			// (N+0*step, N+1*step, N+2*step, N+3*step)
-			rr::Bool hasSequentialOffsets(unsigned int step) const;
+	// Returns true if all offsets are sequential
+	// (N+0*step, N+1*step, N+2*step, N+3*step)
+	rr::Bool hasSequentialOffsets(unsigned int step) const;
 
-			// Returns true if all offsets are are compile-time static and
-			// sequential (N+0*step, N+1*step, N+2*step, N+3*step)
-			bool hasStaticSequentialOffsets(unsigned int step) const;
+	// Returns true if all offsets are compile-time static and
+	// sequential (N+0*step, N+1*step, N+2*step, N+3*step)
+	bool hasStaticSequentialOffsets(unsigned int step) const;
 
-			// Returns true if all offsets are equal (N, N, N, N)
-			rr::Bool hasEqualOffsets() const;
+	// Returns true if all offsets are equal (N, N, N, N)
+	rr::Bool hasEqualOffsets() const;
 
-			// Returns true if all offsets are compile-time static and are equal
-			// (N, N, N, N)
-			bool hasStaticEqualOffsets() const;
-
-			template<typename T>
-			inline T Load(OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));
-
-			template<typename T>
-			inline void Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
-
-			template<typename T>
-			inline void Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
-
-			// Base address for the pointer, common across all lanes.
-			rr::Pointer<rr::Byte> base;
-
-			// Upper (non-inclusive) limit for offsets from base.
-			rr::Int dynamicLimit; // If hasDynamicLimit is false, dynamicLimit is zero.
-			unsigned int staticLimit;
-
-			// Per lane offsets from base.
-			SIMD::Int dynamicOffsets; // If hasDynamicOffsets is false, all dynamicOffsets are zero.
-			std::array<int32_t, SIMD::Width> staticOffsets;
-
-			bool hasDynamicLimit;    // True if dynamicLimit is non-zero.
-			bool hasDynamicOffsets;  // True if any dynamicOffsets are non-zero.
-		};
-
-		template <typename T> struct Element {};
-		template <> struct Element<Float> { using type = rr::Float; };
-		template <> struct Element<Int>   { using type = rr::Int; };
-		template <> struct Element<UInt>  { using type = rr::UInt; };
-
-	} // namespace SIMD
-
-	Float4 exponential2(RValue<Float4> x, bool pp = false);
-	Float4 logarithm2(RValue<Float4> x, bool pp = false);
-	Float4 exponential(RValue<Float4> x, bool pp = false);
-	Float4 logarithm(RValue<Float4> x, bool pp = false);
-	Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp = false);
-	Float4 reciprocal(RValue<Float4> x, bool pp = false, bool finite = false, bool exactAtPow2 = false);
-	Float4 reciprocalSquareRoot(RValue<Float4> x, bool abs, bool pp = false);
-	Float4 modulo(RValue<Float4> x, RValue<Float4> y);
-	Float4 sine_pi(RValue<Float4> x, bool pp = false);     // limited to [-pi, pi] range
-	Float4 cosine_pi(RValue<Float4> x, bool pp = false);   // limited to [-pi, pi] range
-	Float4 sine(RValue<Float4> x, bool pp = false);
-	Float4 cosine(RValue<Float4> x, bool pp = false);
-	Float4 tangent(RValue<Float4> x, bool pp = false);
-	Float4 arccos(RValue<Float4> x, bool pp = false);
-	Float4 arcsin(RValue<Float4> x, bool pp = false);
-	Float4 arctan(RValue<Float4> x, bool pp = false);
-	Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp = false);
-	Float4 sineh(RValue<Float4> x, bool pp = false);
-	Float4 cosineh(RValue<Float4> x, bool pp = false);
-	Float4 tangenth(RValue<Float4> x, bool pp = false);
-	Float4 arccosh(RValue<Float4> x, bool pp = false);  // Limited to x >= 1
-	Float4 arcsinh(RValue<Float4> x, bool pp = false);
-	Float4 arctanh(RValue<Float4> x, bool pp = false);  // Limited to ]-1, 1[ range
-
-	Float4 dot2(const Vector4f &v0, const Vector4f &v1);
-	Float4 dot3(const Vector4f &v0, const Vector4f &v1);
-	Float4 dot4(const Vector4f &v0, const Vector4f &v1);
-
-	void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
-	void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
-	void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
-	void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
-	void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
-	void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
-	void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
-	void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);
-
-	UInt4 halfToFloatBits(UInt4 halfBits);
-
-	rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints);
-
-	rr::RValue<rr::Bool> AnyFalse(rr::RValue<sw::SIMD::Int> const &ints);
-
-	template <typename T>
-	inline rr::RValue<T> AndAll(rr::RValue<T> const &mask);
-
-	template <typename T>
-	inline rr::RValue<T> OrAll(rr::RValue<T> const &mask);
-
-	rr::RValue<sw::SIMD::Float> Sign(rr::RValue<sw::SIMD::Float> const &val);
-
-	// Returns the <whole, frac> of val.
-	// Both whole and frac will have the same sign as val.
-	std::pair<rr::RValue<sw::SIMD::Float>, rr::RValue<sw::SIMD::Float>>
-	Modf(rr::RValue<sw::SIMD::Float> const &val);
-
-	// Returns the number of 1s in bits, per lane.
-	sw::SIMD::UInt CountBits(rr::RValue<sw::SIMD::UInt> const &bits);
-
-	// Returns 1 << bits.
-	// If the resulting bit overflows a 32 bit integer, 0 is returned.
-	rr::RValue<sw::SIMD::UInt> NthBit32(rr::RValue<sw::SIMD::UInt> const &bits);
-
-	// Returns bitCount number of of 1's starting from the LSB.
-	rr::RValue<sw::SIMD::UInt> Bitmask32(rr::RValue<sw::SIMD::UInt> const &bitCount);
-
-	// Performs a fused-multiply add, returning a * b + c.
-	rr::RValue<sw::SIMD::Float> FMA(
-			rr::RValue<sw::SIMD::Float> const &a,
-			rr::RValue<sw::SIMD::Float> const &b,
-			rr::RValue<sw::SIMD::Float> const &c);
-
-	// Returns the exponent of the floating point number f.
-	// Assumes IEEE 754
-	rr::RValue<sw::SIMD::Int> Exponent(rr::RValue<sw::SIMD::Float> f);
-
-	// Returns y if y < x; otherwise result is x.
-	// If one operand is a NaN, the other operand is the result.
-	// If both operands are NaN, the result is a NaN.
-	rr::RValue<sw::SIMD::Float> NMin(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);
-
-	// Returns y if y > x; otherwise result is x.
-	// If one operand is a NaN, the other operand is the result.
-	// If both operands are NaN, the result is a NaN.
-	rr::RValue<sw::SIMD::Float> NMax(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);
-
-	// Returns the determinant of a 2x2 matrix.
-	rr::RValue<sw::SIMD::Float> Determinant(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
-		rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);
-
-	// Returns the determinant of a 3x3 matrix.
-	rr::RValue<sw::SIMD::Float> Determinant(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
-		rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
-		rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);
-
-	// Returns the determinant of a 4x4 matrix.
-	rr::RValue<sw::SIMD::Float> Determinant(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
-		rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
-		rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
-		rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);
-
-	// Returns the inverse of a 2x2 matrix.
-	std::array<rr::RValue<sw::SIMD::Float>, 4> MatrixInverse(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
-		rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);
-
-	// Returns the inverse of a 3x3 matrix.
-	std::array<rr::RValue<sw::SIMD::Float>, 9> MatrixInverse(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
-		rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
-		rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);
-
-	// Returns the inverse of a 4x4 matrix.
-	std::array<rr::RValue<sw::SIMD::Float>, 16> MatrixInverse(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
-		rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
-		rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
-		rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);
-
-	////////////////////////////////////////////////////////////////////////////
-	// Inline functions
-	////////////////////////////////////////////////////////////////////////////
+	// Returns true if all offsets are compile-time static and are equal
+	// (N, N, N, N)
+	bool hasStaticEqualOffsets() const;
 
 	template<typename T>
-	inline T SIMD::Pointer::Load(OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
+	inline T Load(OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));
+
+	template<typename T>
+	inline void Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
+
+	template<typename T>
+	inline void Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
+
+	// Base address for the pointer, common across all lanes.
+	rr::Pointer<rr::Byte> base;
+
+	// Upper (non-inclusive) limit for offsets from base.
+	rr::Int dynamicLimit; // If hasDynamicLimit is false, dynamicLimit is zero.
+	unsigned int staticLimit;
+
+	// Per lane offsets from base.
+	SIMD::Int dynamicOffsets; // If hasDynamicOffsets is false, all dynamicOffsets are zero.
+	std::array<int32_t, SIMD::Width> staticOffsets;
+
+	bool hasDynamicLimit;    // True if dynamicLimit is non-zero.
+	bool hasDynamicOffsets;  // True if any dynamicOffsets are non-zero.
+};
+
+template <typename T> struct Element {};
+template <> struct Element<Float> { using type = rr::Float; };
+template <> struct Element<Int>   { using type = rr::Int; };
+template <> struct Element<UInt>  { using type = rr::UInt; };
+
+} // namespace SIMD
+
+Float4 exponential2(RValue<Float4> x, bool pp = false);
+Float4 logarithm2(RValue<Float4> x, bool pp = false);
+Float4 exponential(RValue<Float4> x, bool pp = false);
+Float4 logarithm(RValue<Float4> x, bool pp = false);
+Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp = false);
+Float4 reciprocal(RValue<Float4> x, bool pp = false, bool finite = false, bool exactAtPow2 = false);
+Float4 reciprocalSquareRoot(RValue<Float4> x, bool abs, bool pp = false);
+Float4 modulo(RValue<Float4> x, RValue<Float4> y);
+Float4 sine_pi(RValue<Float4> x, bool pp = false);     // limited to [-pi, pi] range
+Float4 cosine_pi(RValue<Float4> x, bool pp = false);   // limited to [-pi, pi] range
+Float4 sine(RValue<Float4> x, bool pp = false);
+Float4 cosine(RValue<Float4> x, bool pp = false);
+Float4 tangent(RValue<Float4> x, bool pp = false);
+Float4 arccos(RValue<Float4> x, bool pp = false);
+Float4 arcsin(RValue<Float4> x, bool pp = false);
+Float4 arctan(RValue<Float4> x, bool pp = false);
+Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp = false);
+Float4 sineh(RValue<Float4> x, bool pp = false);
+Float4 cosineh(RValue<Float4> x, bool pp = false);
+Float4 tangenth(RValue<Float4> x, bool pp = false);
+Float4 arccosh(RValue<Float4> x, bool pp = false);  // Limited to x >= 1
+Float4 arcsinh(RValue<Float4> x, bool pp = false);
+Float4 arctanh(RValue<Float4> x, bool pp = false);  // Limited to ]-1, 1[ range
+
+Float4 dot2(const Vector4f &v0, const Vector4f &v1);
+Float4 dot3(const Vector4f &v0, const Vector4f &v1);
+Float4 dot4(const Vector4f &v0, const Vector4f &v1);
+
+void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
+void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
+void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);
+
+UInt4 halfToFloatBits(UInt4 halfBits);
+
+rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints);
+
+rr::RValue<rr::Bool> AnyFalse(rr::RValue<sw::SIMD::Int> const &ints);
+
+template <typename T>
+inline rr::RValue<T> AndAll(rr::RValue<T> const &mask);
+
+template <typename T>
+inline rr::RValue<T> OrAll(rr::RValue<T> const &mask);
+
+rr::RValue<sw::SIMD::Float> Sign(rr::RValue<sw::SIMD::Float> const &val);
+
+// Returns the <whole, frac> of val.
+// Both whole and frac will have the same sign as val.
+std::pair<rr::RValue<sw::SIMD::Float>, rr::RValue<sw::SIMD::Float>>
+Modf(rr::RValue<sw::SIMD::Float> const &val);
+
+// Returns the number of 1s in bits, per lane.
+sw::SIMD::UInt CountBits(rr::RValue<sw::SIMD::UInt> const &bits);
+
+// Returns 1 << bits.
+// If the resulting bit overflows a 32 bit integer, 0 is returned.
+rr::RValue<sw::SIMD::UInt> NthBit32(rr::RValue<sw::SIMD::UInt> const &bits);
+
+// Returns bitCount number of 1's starting from the LSB.
+rr::RValue<sw::SIMD::UInt> Bitmask32(rr::RValue<sw::SIMD::UInt> const &bitCount);
+
+// Performs a fused multiply-add, returning a * b + c.
+rr::RValue<sw::SIMD::Float> FMA(
+		rr::RValue<sw::SIMD::Float> const &a,
+		rr::RValue<sw::SIMD::Float> const &b,
+		rr::RValue<sw::SIMD::Float> const &c);
+
+// Returns the exponent of the floating point number f.
+// Assumes IEEE 754. (Follows the frexp() convention: f == significand * 2^exponent, with the significand in [0.5, 1.0).)
+rr::RValue<sw::SIMD::Int> Exponent(rr::RValue<sw::SIMD::Float> f);
+
+// Returns y if y < x; otherwise result is x.
+// If one operand is a NaN, the other operand is the result.
+// If both operands are NaN, the result is a NaN.
+rr::RValue<sw::SIMD::Float> NMin(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);
+
+// Returns y if y > x; otherwise result is x.
+// If one operand is a NaN, the other operand is the result.
+// If both operands are NaN, the result is a NaN.
+rr::RValue<sw::SIMD::Float> NMax(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);
+
+// Returns the determinant of a 2x2 matrix.
+rr::RValue<sw::SIMD::Float> Determinant(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
+	rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);
+
+// Returns the determinant of a 3x3 matrix.
+rr::RValue<sw::SIMD::Float> Determinant(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
+	rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
+	rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);
+
+// Returns the determinant of a 4x4 matrix.
+rr::RValue<sw::SIMD::Float> Determinant(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
+	rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
+	rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
+	rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);
+
+// Returns the inverse of a 2x2 matrix.
+std::array<rr::RValue<sw::SIMD::Float>, 4> MatrixInverse(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
+	rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);
+
+// Returns the inverse of a 3x3 matrix.
+std::array<rr::RValue<sw::SIMD::Float>, 9> MatrixInverse(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
+	rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
+	rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);
+
+// Returns the inverse of a 4x4 matrix.
+std::array<rr::RValue<sw::SIMD::Float>, 16> MatrixInverse(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
+	rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
+	rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
+	rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);
+
+////////////////////////////////////////////////////////////////////////////
+// Inline functions
+////////////////////////////////////////////////////////////////////////////
+
+template<typename T>
+inline T SIMD::Pointer::Load(OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
+{
+	using EL = typename Element<T>::type;
+
+	if (isStaticallyInBounds(sizeof(float), robustness))
 	{
-		using EL = typename Element<T>::type;
+		// All elements are statically known to be in-bounds.
+	// We can avoid costly conditionals on masks.
 
-		if (isStaticallyInBounds(sizeof(float), robustness))
+		if (hasStaticSequentialOffsets(sizeof(float)))
 		{
-			// All elements are statically known to be in-bounds.
-			// We can avoid costly conditional on masks.
-
-			if (hasStaticSequentialOffsets(sizeof(float)))
-			{
-				// Offsets are sequential. Perform regular load.
-				return rr::Load(rr::Pointer<T>(base + staticOffsets[0]), alignment, atomic, order);
-			}
-			if (hasStaticEqualOffsets())
-			{
-				// Load one, replicate.
-				return T(*rr::Pointer<EL>(base + staticOffsets[0], alignment));
-			}
+			// Offsets are sequential. Perform regular load.
+			return rr::Load(rr::Pointer<T>(base + staticOffsets[0]), alignment, atomic, order);
 		}
-		else
+		if (hasStaticEqualOffsets())
 		{
-			switch(robustness)
-			{
-			case OutOfBoundsBehavior::Nullify:
-			case OutOfBoundsBehavior::RobustBufferAccess:
-			case OutOfBoundsBehavior::UndefinedValue:
-				mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds reads.
-				break;
-			case OutOfBoundsBehavior::UndefinedBehavior:
-				// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
-				break;
-			}
-		}
-
-		auto offs = offsets();
-
-		if (!atomic && order == std::memory_order_relaxed)
-		{
-			if (hasStaticEqualOffsets())
-			{
-				// Load one, replicate.
-				// Be careful of the case where the post-bounds-check mask
-				// is 0, in which case we must not load.
-				T out = T(0);
-				If(AnyTrue(mask))
-				{
-					EL el = *rr::Pointer<EL>(base + staticOffsets[0], alignment);
-					out = T(el);
-				}
-				return out;
-			}
-
-			bool zeroMaskedLanes = true;
-			switch(robustness)
-			{
-			case OutOfBoundsBehavior::Nullify:
-			case OutOfBoundsBehavior::RobustBufferAccess:  // Must either return an in-bounds value, or zero.
-				zeroMaskedLanes = true;
-				break;
-			case OutOfBoundsBehavior::UndefinedValue:
-			case OutOfBoundsBehavior::UndefinedBehavior:
-				zeroMaskedLanes = false;
-				break;
-			}
-
-			if (hasStaticSequentialOffsets(sizeof(float)))
-			{
-				return rr::MaskedLoad(rr::Pointer<T>(base + staticOffsets[0]), mask, alignment, zeroMaskedLanes);
-			}
-
-			return rr::Gather(rr::Pointer<EL>(base), offs, mask, alignment, zeroMaskedLanes);
-		}
-		else
-		{
-			T out;
-			auto anyLanesDisabled = AnyFalse(mask);
-			If(hasEqualOffsets() && !anyLanesDisabled)
-			{
-				// Load one, replicate.
-				auto offset = Extract(offs, 0);
-				out = T(rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order));
-			}
-			Else If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
-			{
-				// Load all elements in a single SIMD instruction.
-				auto offset = Extract(offs, 0);
-				out = rr::Load(rr::Pointer<T>(&base[offset]), alignment, atomic, order);
-			}
-			Else
-			{
-				// Divergent offsets or masked lanes.
-				out = T(0);
-				for (int i = 0; i < SIMD::Width; i++)
-				{
-					If(Extract(mask, i) != 0)
-					{
-						auto offset = Extract(offs, i);
-						auto el = rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
-						out = Insert(out, el, i);
-					}
-				}
-			}
-			return out;
+			// Load one, replicate.
+			return T(*rr::Pointer<EL>(base + staticOffsets[0], alignment));
 		}
 	}
-
-	template<typename T>
-	inline void SIMD::Pointer::Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
+	else
 	{
-		using EL = typename Element<T>::type;
-		constexpr size_t alignment = sizeof(float);
-		auto offs = offsets();
-
 		switch(robustness)
 		{
 		case OutOfBoundsBehavior::Nullify:
-		case OutOfBoundsBehavior::RobustBufferAccess:  // TODO: Allows writing anywhere within bounds. Could be faster than masking.
-		case OutOfBoundsBehavior::UndefinedValue:  // Should not be used for store operations. Treat as robust buffer access.
-			mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds writes.
+		case OutOfBoundsBehavior::RobustBufferAccess:
+		case OutOfBoundsBehavior::UndefinedValue:
+			mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds reads.
 			break;
 		case OutOfBoundsBehavior::UndefinedBehavior:
 			// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
 			break;
 		}
+	}
 
-		if (!atomic && order == std::memory_order_relaxed)
+	auto offs = offsets();
+
+	if (!atomic && order == std::memory_order_relaxed)
+	{
+		if (hasStaticEqualOffsets())
 		{
-			if (hasStaticEqualOffsets())
+			// Load one, replicate.
+			// Be careful of the case where the post-bounds-check mask
+			// is 0, in which case we must not load.
+			T out = T(0);
+			If(AnyTrue(mask))
 			{
-				If (AnyTrue(mask))
+				EL el = *rr::Pointer<EL>(base + staticOffsets[0], alignment);
+				out = T(el);
+			}
+			return out;
+		}
+
+		bool zeroMaskedLanes = true;
+		switch(robustness)
+		{
+		case OutOfBoundsBehavior::Nullify:
+		case OutOfBoundsBehavior::RobustBufferAccess:  // Must either return an in-bounds value, or zero.
+			zeroMaskedLanes = true;
+			break;
+		case OutOfBoundsBehavior::UndefinedValue:
+		case OutOfBoundsBehavior::UndefinedBehavior:
+			zeroMaskedLanes = false;
+			break;
+		}
+
+		if (hasStaticSequentialOffsets(sizeof(float)))
+		{
+			return rr::MaskedLoad(rr::Pointer<T>(base + staticOffsets[0]), mask, alignment, zeroMaskedLanes);
+		}
+
+		return rr::Gather(rr::Pointer<EL>(base), offs, mask, alignment, zeroMaskedLanes);
+	}
+	else
+	{
+		T out;
+		auto anyLanesDisabled = AnyFalse(mask);
+		If(hasEqualOffsets() && !anyLanesDisabled)
+		{
+			// Load one, replicate.
+			auto offset = Extract(offs, 0);
+			out = T(rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order));
+		}
+		Else If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
+		{
+			// Load all elements in a single SIMD instruction.
+			auto offset = Extract(offs, 0);
+			out = rr::Load(rr::Pointer<T>(&base[offset]), alignment, atomic, order);
+		}
+		Else
+		{
+			// Divergent offsets or masked lanes.
+			out = T(0);
+			for (int i = 0; i < SIMD::Width; i++)
+			{
+				If(Extract(mask, i) != 0)
 				{
-					// All equal. One of these writes will win -- elect the winning lane.
-					auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
-					auto elect = mask & ~(v0111 & (mask.xxyz | mask.xxxy | mask.xxxx));
-					auto maskedVal = As<SIMD::Int>(val) & elect;
-					auto scalarVal = Extract(maskedVal, 0) |
-						Extract(maskedVal, 1) |
-						Extract(maskedVal, 2) |
-						Extract(maskedVal, 3);
-					*rr::Pointer<EL>(base + staticOffsets[0], alignment) = As<EL>(scalarVal);
+					auto offset = Extract(offs, i);
+					auto el = rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
+					out = Insert(out, el, i);
 				}
 			}
-			else if (hasStaticSequentialOffsets(sizeof(float)))
+		}
+		return out;
+	}
+}
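
When the offsets diverge but the access is relaxed and non-atomic, the load falls through to rr::Gather with a per-lane mask. A scalar model of that fallback, assuming one float per lane (plain C++, illustrative only):

```cpp
#include <cstdint>
#include <cstring>
#include <cstdio>

// Each active lane loads from base + its own offset; inactive lanes are
// zeroed only when the robustness mode requires it (Nullify / RobustBufferAccess).
void gather(float out[4], const char *base, const int32_t offs[4],
            const uint32_t mask[4], bool zeroMaskedLanes)
{
	for (int i = 0; i < 4; i++)
	{
		if (mask[i] != 0)
			std::memcpy(&out[i], base + offs[i], sizeof(float));
		else if (zeroMaskedLanes)
			out[i] = 0.0f;
	}
}

int main()
{
	float data[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
	int32_t offs[4] = { 0, 4, 8, 12 };
	uint32_t mask[4] = { 0xFFFFFFFFu, 0, 0xFFFFFFFFu, 0 };
	float out[4];
	gather(out, reinterpret_cast<const char *>(data), offs, mask, true);
	printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 0 3 0
}
```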
+
+template<typename T>
+inline void SIMD::Pointer::Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
+{
+	using EL = typename Element<T>::type;
+	constexpr size_t alignment = sizeof(float);
+	auto offs = offsets();
+
+	switch(robustness)
+	{
+	case OutOfBoundsBehavior::Nullify:
+	case OutOfBoundsBehavior::RobustBufferAccess:  // TODO: Allows writing anywhere within bounds. Could be faster than masking.
+	case OutOfBoundsBehavior::UndefinedValue:  // Should not be used for store operations. Treat as robust buffer access.
+		mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds writes.
+		break;
+	case OutOfBoundsBehavior::UndefinedBehavior:
+		// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
+		break;
+	}
+
+	if (!atomic && order == std::memory_order_relaxed)
+	{
+		if (hasStaticEqualOffsets())
+		{
+			If (AnyTrue(mask))
 			{
-				if (isStaticallyInBounds(sizeof(float), robustness))
-				{
-					// Pointer has no elements OOB, and the store is not atomic.
-					// Perform a RMW.
-					auto p = rr::Pointer<SIMD::Int>(base + staticOffsets[0], alignment);
-					auto prev = *p;
-					*p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
-				}
-				else
-				{
-					rr::MaskedStore(rr::Pointer<T>(base + staticOffsets[0]), val, mask, alignment);
-				}
+				// All equal. One of these writes will win -- elect the winning lane.
+				auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+				auto elect = mask & ~(v0111 & (mask.xxyz | mask.xxxy | mask.xxxx));
+				auto maskedVal = As<SIMD::Int>(val) & elect;
+				auto scalarVal = Extract(maskedVal, 0) |
+					Extract(maskedVal, 1) |
+					Extract(maskedVal, 2) |
+					Extract(maskedVal, 3);
+				*rr::Pointer<EL>(base + staticOffsets[0], alignment) = As<EL>(scalarVal);
+			}
+		}
+		else if (hasStaticSequentialOffsets(sizeof(float)))
+		{
+			if (isStaticallyInBounds(sizeof(float), robustness))
+			{
+				// Pointer has no elements OOB, and the store is not atomic.
+				// Perform a RMW.
+				auto p = rr::Pointer<SIMD::Int>(base + staticOffsets[0], alignment);
+				auto prev = *p;
+				*p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
 			}
 			else
 			{
-				rr::Scatter(rr::Pointer<EL>(base), val, offs, mask, alignment);
+				rr::MaskedStore(rr::Pointer<T>(base + staticOffsets[0]), val, mask, alignment);
 			}
 		}
 		else
 		{
-			auto anyLanesDisabled = AnyFalse(mask);
-			If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
+			rr::Scatter(rr::Pointer<EL>(base), val, offs, mask, alignment);
+		}
+	}
+	else
+	{
+		auto anyLanesDisabled = AnyFalse(mask);
+		If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
+		{
+			// Store all elements in a single SIMD instruction.
+			auto offset = Extract(offs, 0);
+			rr::Store(val, rr::Pointer<T>(&base[offset]), alignment, atomic, order);
+		}
+		Else
+		{
+			// Divergent offsets or masked lanes.
+			for (int i = 0; i < SIMD::Width; i++)
 			{
-				// Store all elements in a single SIMD instruction.
-				auto offset = Extract(offs, 0);
-				rr::Store(val, rr::Pointer<T>(&base[offset]), alignment, atomic, order);
-			}
-			Else
-			{
-				// Divergent offsets or masked lanes.
-				for (int i = 0; i < SIMD::Width; i++)
+				If(Extract(mask, i) != 0)
 				{
-					If(Extract(mask, i) != 0)
-					{
-						auto offset = Extract(offs, i);
-						rr::Store(Extract(val, i), rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
-					}
+					auto offset = Extract(offs, i);
+					rr::Store(Extract(val, i), rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
 				}
 			}
 		}
 	}
+}
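
The "elect the winning lane" trick in the store above uses shifted swizzles of the mask to compute a prefix-OR of the earlier lanes, then clears every active lane except the first. A scalar model of the same election (plain C++, illustrative only):

```cpp
#include <cstdint>
#include <cstdio>

// elect[i] = mask[i] & ~(OR of all earlier lanes), so only the first
// active lane keeps its bits; the vector code computes the same prefix-OR
// with the mask.xxyz | mask.xxxy | mask.xxxx swizzles.
void elect(const uint32_t mask[4], uint32_t out[4])
{
	uint32_t earlier = 0;
	for (int i = 0; i < 4; i++)
	{
		out[i] = mask[i] & ~earlier;
		earlier |= mask[i];
	}
}

int main()
{
	uint32_t m[4] = { 0, 0xFFFFFFFFu, 0, 0xFFFFFFFFu }, e[4];
	elect(m, e);
	printf("%x %x %x %x\n", e[0], e[1], e[2], e[3]);  // 0 ffffffff 0 0
}
```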
 
-	template<typename T>
-	inline void SIMD::Pointer::Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
-	{
-		Store(T(val), robustness, mask, atomic, order);
-	}
+template<typename T>
+inline void SIMD::Pointer::Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
+{
+	Store(T(val), robustness, mask, atomic, order);
+}
 
-	template <typename T>
-	inline rr::RValue<T> AndAll(rr::RValue<T> const &mask)
-	{
-		T v1 = mask;              // [x]    [y]    [z]    [w]
-		T v2 = v1.xzxz & v1.ywyw; // [xy]   [zw]   [xy]   [zw]
-		return v2.xxxx & v2.yyyy; // [xyzw] [xyzw] [xyzw] [xyzw]
-	}
+template <typename T>
+inline rr::RValue<T> AndAll(rr::RValue<T> const &mask)
+{
+	T v1 = mask;              // [x]    [y]    [z]    [w]
+	T v2 = v1.xzxz & v1.ywyw; // [xy]   [zw]   [xy]   [zw]
+	return v2.xxxx & v2.yyyy; // [xyzw] [xyzw] [xyzw] [xyzw]
+}
 
-	template <typename T>
-	inline rr::RValue<T> OrAll(rr::RValue<T> const &mask)
-	{
-		T v1 = mask;              // [x]    [y]    [z]    [w]
-		T v2 = v1.xzxz | v1.ywyw; // [xy]   [zw]   [xy]   [zw]
-		return v2.xxxx | v2.yyyy; // [xyzw] [xyzw] [xyzw] [xyzw]
-	}
+template <typename T>
+inline rr::RValue<T> OrAll(rr::RValue<T> const &mask)
+{
+	T v1 = mask;              // [x]    [y]    [z]    [w]
+	T v2 = v1.xzxz | v1.ywyw; // [xy]   [zw]   [xy]   [zw]
+	return v2.xxxx | v2.yyyy; // [xyzw] [xyzw] [xyzw] [xyzw]
+}
 
 } // namespace sw
 
 #ifdef ENABLE_RR_PRINT
 namespace rr {
-	template <> struct PrintValue::Ty<sw::Vector4f>
+template <> struct PrintValue::Ty<sw::Vector4f>
+{
+	static std::string fmt(const sw::Vector4f& v)
 	{
-		static std::string fmt(const sw::Vector4f& v)
-		{
-			return "[x: " + PrintValue::fmt(v.x) + ","
-			       " y: " + PrintValue::fmt(v.y) + ","
-			       " z: " + PrintValue::fmt(v.z) + ","
-			       " w: " + PrintValue::fmt(v.w) + "]";
-		}
+		return "[x: " + PrintValue::fmt(v.x) + ","
+		       " y: " + PrintValue::fmt(v.y) + ","
+		       " z: " + PrintValue::fmt(v.z) + ","
+		       " w: " + PrintValue::fmt(v.w) + "]";
+	}
 
-		static std::vector<rr::Value*> val(const sw::Vector4f& v)
-		{
-			return PrintValue::vals(v.x, v.y, v.z, v.w);
-		}
-	};
-	template <> struct PrintValue::Ty<sw::Vector4s>
+	static std::vector<rr::Value*> val(const sw::Vector4f& v)
 	{
-		static std::string fmt(const sw::Vector4s& v)
-		{
-			return "[x: " + PrintValue::fmt(v.x) + ","
-			       " y: " + PrintValue::fmt(v.y) + ","
-			       " z: " + PrintValue::fmt(v.z) + ","
-			       " w: " + PrintValue::fmt(v.w) + "]";
-		}
+		return PrintValue::vals(v.x, v.y, v.z, v.w);
+	}
+};
+template <> struct PrintValue::Ty<sw::Vector4s>
+{
+	static std::string fmt(const sw::Vector4s& v)
+	{
+		return "[x: " + PrintValue::fmt(v.x) + ","
+		       " y: " + PrintValue::fmt(v.y) + ","
+		       " z: " + PrintValue::fmt(v.z) + ","
+		       " w: " + PrintValue::fmt(v.w) + "]";
+	}
 
-		static std::vector<rr::Value*> val(const sw::Vector4s& v)
-		{
-			return PrintValue::vals(v.x, v.y, v.z, v.w);
-		}
-	};
-}
+	static std::vector<rr::Value*> val(const sw::Vector4s& v)
+	{
+		return PrintValue::vals(v.x, v.y, v.z, v.w);
+	}
+};
+
+}  // namespace rr
 #endif // ENABLE_RR_PRINT
 
 #endif   // sw_ShaderCore_hpp
diff --git a/src/Pipeline/SpirvID.hpp b/src/Pipeline/SpirvID.hpp
index 9688a45..1f4624d 100644
--- a/src/Pipeline/SpirvID.hpp
+++ b/src/Pipeline/SpirvID.hpp
@@ -18,45 +18,46 @@
 #include <unordered_map>
 #include <cstdint>
 
-namespace sw
+namespace sw {
+
+// SpirvID is a strongly-typed identifier backed by a uint32_t.
+// The template parameter T is not actually used by the implementation of
+// ID; instead it is used to prevent implicit casts between identifiers of
+// different T types.
+// IDs are typically used as a map key to value of type T.
+template <typename T>
+class SpirvID
 {
-	// SpirvID is a strongly-typed identifier backed by a uint32_t.
-	// The template parameter T is not actually used by the implementation of
-	// ID; instead it is used to prevent implicit casts between identifiers of
-	// different T types.
-	// IDs are typically used as a map key to value of type T.
-	template <typename T>
-	class SpirvID
-	{
-	public:
-		SpirvID() : id(0) {}
-		SpirvID(uint32_t id) : id(id) {}
-		bool operator == (const SpirvID<T>& rhs) const { return id == rhs.id; }
-		bool operator != (const SpirvID<T>& rhs) const { return id != rhs.id; }
-		bool operator < (const SpirvID<T>& rhs) const { return id < rhs.id; }
+public:
+	SpirvID() : id(0) {}
+	SpirvID(uint32_t id) : id(id) {}
+	bool operator == (const SpirvID<T>& rhs) const { return id == rhs.id; }
+	bool operator != (const SpirvID<T>& rhs) const { return id != rhs.id; }
+	bool operator < (const SpirvID<T>& rhs) const { return id < rhs.id; }
 
-		// value returns the numerical value of the identifier.
-		uint32_t value() const { return id; }
-	private:
-		uint32_t id;
-	};
+	// value returns the numerical value of the identifier.
+	uint32_t value() const { return id; }
+private:
+	uint32_t id;
+};
 
-	// HandleMap<T> is an unordered map of SpirvID<T> to T.
-	template <typename T>
-	using HandleMap = std::unordered_map<SpirvID<T>, T>;
+// HandleMap<T> is an unordered map of SpirvID<T> to T.
+template <typename T>
+using HandleMap = std::unordered_map<SpirvID<T>, T>;
-}
+}  // namespace sw
 
 namespace std
 {
-	// std::hash implementation for sw::SpirvID<T>
-	template<typename T>
-	struct hash< sw::SpirvID<T> >
+// std::hash implementation for sw::SpirvID<T>
+template<typename T>
+struct hash< sw::SpirvID<T> >
+{
+	std::size_t operator()(const sw::SpirvID<T>& id) const noexcept
 	{
-		std::size_t operator()(const sw::SpirvID<T>& id) const noexcept
-		{
-			return std::hash<uint32_t>()(id.value());
-		}
-	};
-}
+		return std::hash<uint32_t>()(id.value());
+	}
+};
+
+}  // namespace std
 
 #endif  // sw_ID_hpp
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index 4cb0827..4dc26d5 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -20,1573 +20,407 @@
 
 #include <spirv/unified1/spirv.hpp>
 
-namespace sw
+namespace sw {
+
+SpirvShader::SpirvShader(
+		uint32_t codeSerialID,
+		VkShaderStageFlagBits pipelineStage,
+		const char *entryPointName,
+		InsnStore const &insns,
+		const vk::RenderPass *renderPass,
+		uint32_t subpassIndex,
+		bool robustBufferAccess)
+			: insns{insns}, inputs{MAX_INTERFACE_COMPONENTS},
+			  outputs{MAX_INTERFACE_COMPONENTS},
+			  codeSerialID(codeSerialID),
+			  robustBufferAccess(robustBufferAccess)
 {
+	ASSERT(insns.size() > 0);
 
-	SpirvShader::SpirvShader(
-			uint32_t codeSerialID,
-			VkShaderStageFlagBits pipelineStage,
-			const char *entryPointName,
-			InsnStore const &insns,
-			const vk::RenderPass *renderPass,
-			uint32_t subpassIndex,
-			bool robustBufferAccess)
-				: insns{insns}, inputs{MAX_INTERFACE_COMPONENTS},
-				  outputs{MAX_INTERFACE_COMPONENTS},
-				  codeSerialID(codeSerialID),
-				  robustBufferAccess(robustBufferAccess)
+	if (renderPass)
 	{
-		ASSERT(insns.size() > 0);
-
-		if (renderPass)
+		// capture formats of any input attachments present
+		auto subpass = renderPass->getSubpass(subpassIndex);
+		inputAttachmentFormats.reserve(subpass.inputAttachmentCount);
+		for (auto i = 0u; i < subpass.inputAttachmentCount; i++)
 		{
-			// capture formats of any input attachments present
-			auto subpass = renderPass->getSubpass(subpassIndex);
-			inputAttachmentFormats.reserve(subpass.inputAttachmentCount);
-			for (auto i = 0u; i < subpass.inputAttachmentCount; i++)
-			{
-				auto attachmentIndex = subpass.pInputAttachments[i].attachment;
-				inputAttachmentFormats.push_back(attachmentIndex != VK_ATTACHMENT_UNUSED
-												 ? renderPass->getAttachment(attachmentIndex).format : VK_FORMAT_UNDEFINED);
-			}
-		}
-
-		// Simplifying assumptions (to be satisfied by earlier transformations)
-		// - The only input/output OpVariables present are those used by the entrypoint
-
-		Function::ID currentFunction;
-		Block::ID currentBlock;
-		InsnIterator blockStart;
-
-		for (auto insn : *this)
-		{
-			spv::Op opcode = insn.opcode();
-
-			switch (opcode)
-			{
-			case spv::OpEntryPoint:
-			{
-				executionModel = spv::ExecutionModel(insn.word(1));
-				auto id = Function::ID(insn.word(2));
-				auto name = insn.string(3);
-				auto stage = executionModelToStage(executionModel);
-				if (stage == pipelineStage && strcmp(name, entryPointName) == 0)
-				{
-					ASSERT_MSG(entryPoint == 0, "Duplicate entry point with name '%s' and stage %d", name, int(stage));
-					entryPoint = id;
-				}
-				break;
-			}
-
-			case spv::OpExecutionMode:
-				ProcessExecutionMode(insn);
-				break;
-
-			case spv::OpDecorate:
-			{
-				TypeOrObjectID targetId = insn.word(1);
-				auto decoration = static_cast<spv::Decoration>(insn.word(2));
-				uint32_t value = insn.wordCount() > 3 ? insn.word(3) : 0;
-
-				decorations[targetId].Apply(decoration, value);
-
-				switch(decoration)
-				{
-				case spv::DecorationDescriptorSet:
-					descriptorDecorations[targetId].DescriptorSet = value;
-					break;
-				case spv::DecorationBinding:
-					descriptorDecorations[targetId].Binding = value;
-					break;
-				case spv::DecorationInputAttachmentIndex:
-					descriptorDecorations[targetId].InputAttachmentIndex = value;
-					break;
-				default:
-					// Only handling descriptor decorations here.
-					break;
-				}
-
-				if (decoration == spv::DecorationCentroid)
-					modes.NeedsCentroid = true;
-				break;
-			}
-
-			case spv::OpMemberDecorate:
-			{
-				Type::ID targetId = insn.word(1);
-				auto memberIndex = insn.word(2);
-				auto decoration = static_cast<spv::Decoration>(insn.word(3));
-				uint32_t value = insn.wordCount() > 4 ? insn.word(4) : 0;
-
-				auto &d = memberDecorations[targetId];
-				if (memberIndex >= d.size())
-					d.resize(memberIndex + 1);    // on demand; exact size would require another pass...
-
-				d[memberIndex].Apply(decoration, value);
-
-				if (decoration == spv::DecorationCentroid)
-					modes.NeedsCentroid = true;
-				break;
-			}
-
-			case spv::OpDecorationGroup:
-				// Nothing to do here. We don't need to record the definition of the group; we'll just have
-				// the bundle of decorations float around. If we were to ever walk the decorations directly,
-				// we might think about introducing this as a real Object.
-				break;
-
-			case spv::OpGroupDecorate:
-			{
-				uint32_t group = insn.word(1);
-				auto const &groupDecorations = decorations[group];
-				auto const &descriptorGroupDecorations = descriptorDecorations[group];
-				for (auto i = 2u; i < insn.wordCount(); i++)
-				{
-					// Remaining operands are targets to apply the group to.
-					uint32_t target = insn.word(i);
-					decorations[target].Apply(groupDecorations);
-					descriptorDecorations[target].Apply(descriptorGroupDecorations);
-				}
-
-				break;
-			}
-
-			case spv::OpGroupMemberDecorate:
-			{
-				auto const &srcDecorations = decorations[insn.word(1)];
-				for (auto i = 2u; i < insn.wordCount(); i += 2)
-				{
-					// remaining operands are pairs of <id>, literal for members to apply to.
-					auto &d = memberDecorations[insn.word(i)];
-					auto memberIndex = insn.word(i + 1);
-					if (memberIndex >= d.size())
-						d.resize(memberIndex + 1);    // on demand resize, see above...
-					d[memberIndex].Apply(srcDecorations);
-				}
-				break;
-			}
-
-			case spv::OpLabel:
-			{
-				ASSERT(currentBlock.value() == 0);
-				currentBlock = Block::ID(insn.word(1));
-				blockStart = insn;
-				break;
-			}
-
-			// Branch Instructions (subset of Termination Instructions):
-			case spv::OpBranch:
-			case spv::OpBranchConditional:
-			case spv::OpSwitch:
-			case spv::OpReturn:
-			// fallthrough
-
-			// Termination instruction:
-			case spv::OpKill:
-			case spv::OpUnreachable:
-			{
-				ASSERT(currentBlock.value() != 0);
-				ASSERT(currentFunction.value() != 0);
-
-				auto blockEnd = insn; blockEnd++;
-				functions[currentFunction].blocks[currentBlock] = Block(blockStart, blockEnd);
-				currentBlock = Block::ID(0);
-
-				if (opcode == spv::OpKill)
-				{
-					modes.ContainsKill = true;
-				}
-				break;
-			}
-
-			case spv::OpLoopMerge:
-			case spv::OpSelectionMerge:
-				break; // Nothing to do in analysis pass.
-
-			case spv::OpTypeVoid:
-			case spv::OpTypeBool:
-			case spv::OpTypeInt:
-			case spv::OpTypeFloat:
-			case spv::OpTypeVector:
-			case spv::OpTypeMatrix:
-			case spv::OpTypeImage:
-			case spv::OpTypeSampler:
-			case spv::OpTypeSampledImage:
-			case spv::OpTypeArray:
-			case spv::OpTypeRuntimeArray:
-			case spv::OpTypeStruct:
-			case spv::OpTypePointer:
-			case spv::OpTypeFunction:
-				DeclareType(insn);
-				break;
-
-			case spv::OpVariable:
-			{
-				Type::ID typeId = insn.word(1);
-				Object::ID resultId = insn.word(2);
-				auto storageClass = static_cast<spv::StorageClass>(insn.word(3));
-
-				auto &object = defs[resultId];
-				object.kind = Object::Kind::Pointer;
-				object.definition = insn;
-				object.type = typeId;
-
-				ASSERT(getType(typeId).definition.opcode() == spv::OpTypePointer);
-				ASSERT(getType(typeId).storageClass == storageClass);
-
-				switch (storageClass)
-				{
-				case spv::StorageClassInput:
-				case spv::StorageClassOutput:
-					ProcessInterfaceVariable(object);
-					break;
-
-				case spv::StorageClassUniform:
-				case spv::StorageClassStorageBuffer:
-					object.kind = Object::Kind::DescriptorSet;
-					break;
-
-				case spv::StorageClassPushConstant:
-				case spv::StorageClassPrivate:
-				case spv::StorageClassFunction:
-				case spv::StorageClassUniformConstant:
-					break; // Correctly handled.
-
-				case spv::StorageClassWorkgroup:
-				{
-					auto &elTy = getType(getType(typeId).element);
-					auto sizeInBytes = elTy.sizeInComponents * static_cast<uint32_t>(sizeof(float));
-					workgroupMemory.allocate(resultId, sizeInBytes);
-					object.kind = Object::Kind::Pointer;
-					break;
-				}
-				case spv::StorageClassAtomicCounter:
-				case spv::StorageClassImage:
-					UNIMPLEMENTED("StorageClass %d not yet implemented", (int)storageClass);
-					break;
-
-				case spv::StorageClassCrossWorkgroup:
-					UNSUPPORTED("SPIR-V OpenCL Execution Model (StorageClassCrossWorkgroup)");
-					break;
-
-				case spv::StorageClassGeneric:
-					UNSUPPORTED("SPIR-V GenericPointer Capability (StorageClassGeneric)");
-					break;
-
-				default:
-					UNREACHABLE("Unexpected StorageClass %d", storageClass); // See Appendix A of the Vulkan spec.
-					break;
-				}
-				break;
-			}
-
-			case spv::OpConstant:
-			case spv::OpSpecConstant:
-				CreateConstant(insn).constantValue[0] = insn.word(3);
-				break;
-			case spv::OpConstantFalse:
-			case spv::OpSpecConstantFalse:
-				CreateConstant(insn).constantValue[0] = 0;    // Represent Boolean false as zero.
-				break;
-			case spv::OpConstantTrue:
-			case spv::OpSpecConstantTrue:
-				CreateConstant(insn).constantValue[0] = ~0u;  // Represent Boolean true as all bits set.
-				break;
-			case spv::OpConstantNull:
-			case spv::OpUndef:
-			{
-				// TODO: consider a real LLVM-level undef. For now, zero is a perfectly good value.
-				// OpConstantNull forms a constant of arbitrary type, all zeros.
-				auto &object = CreateConstant(insn);
-				auto &objectTy = getType(object.type);
-				for (auto i = 0u; i < objectTy.sizeInComponents; i++)
-				{
-					object.constantValue[i] = 0;
-				}
-				break;
-			}
-			case spv::OpConstantComposite:
-			case spv::OpSpecConstantComposite:
-			{
-				auto &object = CreateConstant(insn);
-				auto offset = 0u;
-				for (auto i = 0u; i < insn.wordCount() - 3; i++)
-				{
-					auto &constituent = getObject(insn.word(i + 3));
-					auto &constituentTy = getType(constituent.type);
-					for (auto j = 0u; j < constituentTy.sizeInComponents; j++)
-					{
-						object.constantValue[offset++] = constituent.constantValue[j];
-					}
-				}
-
-				auto objectId = Object::ID(insn.word(2));
-				auto decorationsIt = decorations.find(objectId);
-				if (decorationsIt != decorations.end() &&
-					decorationsIt->second.BuiltIn == spv::BuiltInWorkgroupSize)
-				{
-					// https://www.khronos.org/registry/vulkan/specs/1.1/html/vkspec.html#interfaces-builtin-variables :
-					// Decorating an object with the WorkgroupSize built-in
-					// decoration will make that object contain the dimensions
-					// of a local workgroup. If an object is decorated with the
-					// WorkgroupSize decoration, this must take precedence over
-					// any execution mode set for LocalSize.
-					// The object decorated with WorkgroupSize must be declared
-					// as a three-component vector of 32-bit integers.
-					ASSERT(getType(object.type).sizeInComponents == 3);
-					modes.WorkgroupSizeX = object.constantValue[0];
-					modes.WorkgroupSizeY = object.constantValue[1];
-					modes.WorkgroupSizeZ = object.constantValue[2];
-				}
-				break;
-			}
-			case spv::OpSpecConstantOp:
-				EvalSpecConstantOp(insn);
-				break;
-
-			case spv::OpCapability:
-			{
-				auto capability = static_cast<spv::Capability>(insn.word(1));
-				switch (capability)
-				{
-				case spv::CapabilityMatrix: capabilities.Matrix = true; break;
-				case spv::CapabilityShader: capabilities.Shader = true; break;
-				case spv::CapabilityClipDistance: capabilities.ClipDistance = true; break;
-				case spv::CapabilityCullDistance: capabilities.CullDistance = true; break;
-				case spv::CapabilityInputAttachment: capabilities.InputAttachment = true; break;
-				case spv::CapabilitySampled1D: capabilities.Sampled1D = true; break;
-				case spv::CapabilityImage1D: capabilities.Image1D = true; break;
-				case spv::CapabilityImageCubeArray: capabilities.ImageCubeArray = true; break;
-				case spv::CapabilitySampledBuffer: capabilities.SampledBuffer = true; break;
-				case spv::CapabilitySampledCubeArray: capabilities.SampledCubeArray = true; break;
-				case spv::CapabilityImageBuffer: capabilities.ImageBuffer = true; break;
-				case spv::CapabilityStorageImageExtendedFormats: capabilities.StorageImageExtendedFormats = true; break;
-				case spv::CapabilityImageQuery: capabilities.ImageQuery = true; break;
-				case spv::CapabilityDerivativeControl: capabilities.DerivativeControl = true; break;
-				case spv::CapabilityGroupNonUniform: capabilities.GroupNonUniform = true; break;
-				case spv::CapabilityGroupNonUniformVote: capabilities.GroupNonUniformVote = true; break;
-				case spv::CapabilityGroupNonUniformArithmetic: capabilities.GroupNonUniformArithmetic = true; break;
-				case spv::CapabilityGroupNonUniformBallot: capabilities.GroupNonUniformBallot = true; break;
-				case spv::CapabilityGroupNonUniformShuffle: capabilities.GroupNonUniformShuffle = true; break;
-				case spv::CapabilityGroupNonUniformShuffleRelative: capabilities.GroupNonUniformShuffleRelative = true; break;
-				case spv::CapabilityDeviceGroup: capabilities.DeviceGroup = true; break;
-				case spv::CapabilityMultiView: capabilities.MultiView = true; break;
-				default:
-					UNSUPPORTED("Unsupported capability %u", insn.word(1));
-				}
-				break; // Various capabilities will be declared, but none affect our code generation at this point.
-			}
-
-			case spv::OpMemoryModel:
-				break; // Memory model does not affect our code generation until we decide to do Vulkan Memory Model support.
-
-			case spv::OpFunction:
-			{
-				auto functionId = Function::ID(insn.word(2));
-				ASSERT_MSG(currentFunction == 0, "Functions %d and %d overlap", currentFunction.value(), functionId.value());
-				currentFunction = functionId;
-				auto &function = functions[functionId];
-				function.result = Type::ID(insn.word(1));
-				function.type = Type::ID(insn.word(4));
-				// Scan forward to find the function's label.
-				for (auto it = insn; it != end() && function.entry == 0; it++)
-				{
-					switch (it.opcode())
-					{
-					case spv::OpFunction:
-					case spv::OpFunctionParameter:
-						break;
-					case spv::OpLabel:
-						function.entry = Block::ID(it.word(1));
-						break;
-					default:
-						WARN("Unexpected opcode '%s' following OpFunction", OpcodeName(it.opcode()).c_str());
-					}
-				}
-				ASSERT_MSG(function.entry != 0, "Function<%d> has no label", currentFunction.value());
-				break;
-			}
-
-			case spv::OpFunctionEnd:
-				currentFunction = 0;
-				break;
-
-			case spv::OpExtInstImport:
-			{
-				// We will only support the GLSL 450 extended instruction set, so no point in tracking the ID we assign it.
-				// Valid shaders will not attempt to import any other instruction sets.
-				auto ext = insn.string(2);
-				if (0 != strcmp("GLSL.std.450", ext))
-				{
-					UNSUPPORTED("SPIR-V Extension: %s", ext);
-				}
-				break;
-			}
-			case spv::OpName:
-			case spv::OpMemberName:
-			case spv::OpSource:
-			case spv::OpSourceContinued:
-			case spv::OpSourceExtension:
-			case spv::OpLine:
-			case spv::OpNoLine:
-			case spv::OpModuleProcessed:
-			case spv::OpString:
-				// No semantic impact
-				break;
-
-			case spv::OpFunctionParameter:
-				// These should have all been removed by preprocessing passes. If we see them here,
-				// our assumptions are wrong and we will probably generate wrong code.
-				UNREACHABLE("%s should have already been lowered.", OpcodeName(opcode).c_str());
-				break;
-
-			case spv::OpFunctionCall:
-				// TODO(b/141246700): Add full support for spv::OpFunctionCall
-				break;
-
-			case spv::OpFConvert:
-				UNSUPPORTED("SPIR-V Float16 or Float64 Capability (OpFConvert)");
-				break;
-
-			case spv::OpSConvert:
-				UNSUPPORTED("SPIR-V Int16 or Int64 Capability (OpSConvert)");
-				break;
-
-			case spv::OpUConvert:
-				UNSUPPORTED("SPIR-V Int16 or Int64 Capability (OpUConvert)");
-				break;
-
-			case spv::OpLoad:
-			case spv::OpAccessChain:
-			case spv::OpInBoundsAccessChain:
-			case spv::OpSampledImage:
-			case spv::OpImage:
-				{
-					// Propagate the descriptor decorations to the result.
-					Object::ID resultId = insn.word(2);
-					Object::ID pointerId = insn.word(3);
-					const auto &d = descriptorDecorations.find(pointerId);
-
-					if(d != descriptorDecorations.end())
-					{
-						descriptorDecorations[resultId] = d->second;
-					}
-
-					DefineResult(insn);
-
-					if (opcode == spv::OpAccessChain || opcode == spv::OpInBoundsAccessChain)
-					{
-						Decorations dd{};
-						ApplyDecorationsForAccessChain(&dd, &descriptorDecorations[resultId], pointerId, insn.wordCount() - 4, insn.wordPointer(4));
-						// Note: offset is the one thing that does *not* propagate, as the access chain accounts for it.
-						dd.HasOffset = false;
-						decorations[resultId].Apply(dd);
-					}
-				}
-				break;
-
-			case spv::OpCompositeConstruct:
-			case spv::OpCompositeInsert:
-			case spv::OpCompositeExtract:
-			case spv::OpVectorShuffle:
-			case spv::OpVectorTimesScalar:
-			case spv::OpMatrixTimesScalar:
-			case spv::OpMatrixTimesVector:
-			case spv::OpVectorTimesMatrix:
-			case spv::OpMatrixTimesMatrix:
-			case spv::OpOuterProduct:
-			case spv::OpTranspose:
-			case spv::OpVectorExtractDynamic:
-			case spv::OpVectorInsertDynamic:
-			// Unary ops
-			case spv::OpNot:
-			case spv::OpBitFieldInsert:
-			case spv::OpBitFieldSExtract:
-			case spv::OpBitFieldUExtract:
-			case spv::OpBitReverse:
-			case spv::OpBitCount:
-			case spv::OpSNegate:
-			case spv::OpFNegate:
-			case spv::OpLogicalNot:
-			case spv::OpQuantizeToF16:
-			// Binary ops
-			case spv::OpIAdd:
-			case spv::OpISub:
-			case spv::OpIMul:
-			case spv::OpSDiv:
-			case spv::OpUDiv:
-			case spv::OpFAdd:
-			case spv::OpFSub:
-			case spv::OpFMul:
-			case spv::OpFDiv:
-			case spv::OpFMod:
-			case spv::OpFRem:
-			case spv::OpFOrdEqual:
-			case spv::OpFUnordEqual:
-			case spv::OpFOrdNotEqual:
-			case spv::OpFUnordNotEqual:
-			case spv::OpFOrdLessThan:
-			case spv::OpFUnordLessThan:
-			case spv::OpFOrdGreaterThan:
-			case spv::OpFUnordGreaterThan:
-			case spv::OpFOrdLessThanEqual:
-			case spv::OpFUnordLessThanEqual:
-			case spv::OpFOrdGreaterThanEqual:
-			case spv::OpFUnordGreaterThanEqual:
-			case spv::OpSMod:
-			case spv::OpSRem:
-			case spv::OpUMod:
-			case spv::OpIEqual:
-			case spv::OpINotEqual:
-			case spv::OpUGreaterThan:
-			case spv::OpSGreaterThan:
-			case spv::OpUGreaterThanEqual:
-			case spv::OpSGreaterThanEqual:
-			case spv::OpULessThan:
-			case spv::OpSLessThan:
-			case spv::OpULessThanEqual:
-			case spv::OpSLessThanEqual:
-			case spv::OpShiftRightLogical:
-			case spv::OpShiftRightArithmetic:
-			case spv::OpShiftLeftLogical:
-			case spv::OpBitwiseOr:
-			case spv::OpBitwiseXor:
-			case spv::OpBitwiseAnd:
-			case spv::OpLogicalOr:
-			case spv::OpLogicalAnd:
-			case spv::OpLogicalEqual:
-			case spv::OpLogicalNotEqual:
-			case spv::OpUMulExtended:
-			case spv::OpSMulExtended:
-			case spv::OpIAddCarry:
-			case spv::OpISubBorrow:
-			case spv::OpDot:
-			case spv::OpConvertFToU:
-			case spv::OpConvertFToS:
-			case spv::OpConvertSToF:
-			case spv::OpConvertUToF:
-			case spv::OpBitcast:
-			case spv::OpSelect:
-			case spv::OpExtInst:
-			case spv::OpIsInf:
-			case spv::OpIsNan:
-			case spv::OpAny:
-			case spv::OpAll:
-			case spv::OpDPdx:
-			case spv::OpDPdxCoarse:
-			case spv::OpDPdy:
-			case spv::OpDPdyCoarse:
-			case spv::OpFwidth:
-			case spv::OpFwidthCoarse:
-			case spv::OpDPdxFine:
-			case spv::OpDPdyFine:
-			case spv::OpFwidthFine:
-			case spv::OpAtomicLoad:
-			case spv::OpAtomicIAdd:
-			case spv::OpAtomicISub:
-			case spv::OpAtomicSMin:
-			case spv::OpAtomicSMax:
-			case spv::OpAtomicUMin:
-			case spv::OpAtomicUMax:
-			case spv::OpAtomicAnd:
-			case spv::OpAtomicOr:
-			case spv::OpAtomicXor:
-			case spv::OpAtomicIIncrement:
-			case spv::OpAtomicIDecrement:
-			case spv::OpAtomicExchange:
-			case spv::OpAtomicCompareExchange:
-			case spv::OpPhi:
-			case spv::OpImageSampleImplicitLod:
-			case spv::OpImageSampleExplicitLod:
-			case spv::OpImageSampleDrefImplicitLod:
-			case spv::OpImageSampleDrefExplicitLod:
-			case spv::OpImageSampleProjImplicitLod:
-			case spv::OpImageSampleProjExplicitLod:
-			case spv::OpImageSampleProjDrefImplicitLod:
-			case spv::OpImageSampleProjDrefExplicitLod:
-			case spv::OpImageGather:
-			case spv::OpImageDrefGather:
-			case spv::OpImageFetch:
-			case spv::OpImageQuerySizeLod:
-			case spv::OpImageQuerySize:
-			case spv::OpImageQueryLod:
-			case spv::OpImageQueryLevels:
-			case spv::OpImageQuerySamples:
-			case spv::OpImageRead:
-			case spv::OpImageTexelPointer:
-			case spv::OpGroupNonUniformElect:
-			case spv::OpGroupNonUniformAll:
-			case spv::OpGroupNonUniformAny:
-			case spv::OpGroupNonUniformAllEqual:
-			case spv::OpGroupNonUniformBroadcast:
-			case spv::OpGroupNonUniformBroadcastFirst:
-			case spv::OpGroupNonUniformBallot:
-			case spv::OpGroupNonUniformInverseBallot:
-			case spv::OpGroupNonUniformBallotBitExtract:
-			case spv::OpGroupNonUniformBallotBitCount:
-			case spv::OpGroupNonUniformBallotFindLSB:
-			case spv::OpGroupNonUniformBallotFindMSB:
-			case spv::OpGroupNonUniformShuffle:
-			case spv::OpGroupNonUniformShuffleXor:
-			case spv::OpGroupNonUniformShuffleUp:
-			case spv::OpGroupNonUniformShuffleDown:
-			case spv::OpGroupNonUniformIAdd:
-			case spv::OpGroupNonUniformFAdd:
-			case spv::OpGroupNonUniformIMul:
-			case spv::OpGroupNonUniformFMul:
-			case spv::OpGroupNonUniformSMin:
-			case spv::OpGroupNonUniformUMin:
-			case spv::OpGroupNonUniformFMin:
-			case spv::OpGroupNonUniformSMax:
-			case spv::OpGroupNonUniformUMax:
-			case spv::OpGroupNonUniformFMax:
-			case spv::OpGroupNonUniformBitwiseAnd:
-			case spv::OpGroupNonUniformBitwiseOr:
-			case spv::OpGroupNonUniformBitwiseXor:
-			case spv::OpGroupNonUniformLogicalAnd:
-			case spv::OpGroupNonUniformLogicalOr:
-			case spv::OpGroupNonUniformLogicalXor:
-			case spv::OpCopyObject:
-			case spv::OpArrayLength:
-				// Instructions that yield an intermediate value or divergent pointer
-				DefineResult(insn);
-				break;
-
-			case spv::OpStore:
-			case spv::OpAtomicStore:
-			case spv::OpImageWrite:
-			case spv::OpCopyMemory:
-			case spv::OpMemoryBarrier:
-				// Don't need to do anything during analysis pass
-				break;
-
-			case spv::OpControlBarrier:
-				modes.ContainsControlBarriers = true;
-				break;
-
-			case spv::OpExtension:
-			{
-				auto ext = insn.string(1);
-				// Part of core SPIR-V 1.3. Vulkan 1.1 implementations must also accept the pre-1.3
-				// extension per Appendix A, `Vulkan Environment for SPIR-V`.
-				if (!strcmp(ext, "SPV_KHR_storage_buffer_storage_class")) break;
-				if (!strcmp(ext, "SPV_KHR_shader_draw_parameters")) break;
-				if (!strcmp(ext, "SPV_KHR_16bit_storage")) break;
-				if (!strcmp(ext, "SPV_KHR_variable_pointers")) break;
-				if (!strcmp(ext, "SPV_KHR_device_group")) break;
-				if (!strcmp(ext, "SPV_KHR_multiview")) break;
-				UNSUPPORTED("SPIR-V Extension: %s", ext);
-				break;
-			}
-
-			default:
-				UNIMPLEMENTED("%s", OpcodeName(opcode).c_str());
-			}
-		}
-
-		ASSERT_MSG(entryPoint != 0, "Entry point '%s' not found", entryPointName);
-		for (auto &it : functions)
-		{
-			it.second.AssignBlockFields();
+			auto attachmentIndex = subpass.pInputAttachments[i].attachment;
+			inputAttachmentFormats.push_back(attachmentIndex != VK_ATTACHMENT_UNUSED
+											 ? renderPass->getAttachment(attachmentIndex).format : VK_FORMAT_UNDEFINED);
 		}
 	}
 
-	void SpirvShader::DeclareType(InsnIterator insn)
+	// Simplifying assumptions (to be satisfied by earlier transformations)
+	// - The only input/output OpVariables present are those used by the entrypoint
+
+	Function::ID currentFunction;
+	Block::ID currentBlock;
+	InsnIterator blockStart;
+
+	for (auto insn : *this)
 	{
-		Type::ID resultId = insn.word(1);
-
-		auto &type = types[resultId];
-		type.definition = insn;
-		type.sizeInComponents = ComputeTypeSize(insn);
-
-		// A structure is a builtin block if it has a builtin
-		// member. All members of such a structure are builtins.
-		switch (insn.opcode())
-		{
-		case spv::OpTypeStruct:
-		{
-			auto d = memberDecorations.find(resultId);
-			if (d != memberDecorations.end())
-			{
-				for (auto &m : d->second)
-				{
-					if (m.HasBuiltIn)
-					{
-						type.isBuiltInBlock = true;
-						break;
-					}
-				}
-			}
-			break;
-		}
-		case spv::OpTypePointer:
-		{
-			Type::ID elementTypeId = insn.word(3);
-			type.element = elementTypeId;
-			type.isBuiltInBlock = getType(elementTypeId).isBuiltInBlock;
-			type.storageClass = static_cast<spv::StorageClass>(insn.word(2));
-			break;
-		}
-		case spv::OpTypeVector:
-		case spv::OpTypeMatrix:
-		case spv::OpTypeArray:
-		case spv::OpTypeRuntimeArray:
-		{
-			Type::ID elementTypeId = insn.word(2);
-			type.element = elementTypeId;
-			break;
-		}
-		default:
-			break;
-		}
-	}
-
-	SpirvShader::Object& SpirvShader::CreateConstant(InsnIterator insn)
-	{
-		Type::ID typeId = insn.word(1);
-		Object::ID resultId = insn.word(2);
-		auto &object = defs[resultId];
-		auto &objectTy = getType(typeId);
-		object.type = typeId;
-		object.kind = Object::Kind::Constant;
-		object.definition = insn;
-		object.constantValue = std::unique_ptr<uint32_t[]>(new uint32_t[objectTy.sizeInComponents]);
-		return object;
-	}
-
-	void SpirvShader::ProcessInterfaceVariable(Object &object)
-	{
-		auto &objectTy = getType(object.type);
-		ASSERT(objectTy.storageClass == spv::StorageClassInput || objectTy.storageClass == spv::StorageClassOutput);
-
-		ASSERT(objectTy.opcode() == spv::OpTypePointer);
-		auto pointeeTy = getType(objectTy.element);
-
-		auto &builtinInterface = (objectTy.storageClass == spv::StorageClassInput) ? inputBuiltins : outputBuiltins;
-		auto &userDefinedInterface = (objectTy.storageClass == spv::StorageClassInput) ? inputs : outputs;
-
-		ASSERT(object.opcode() == spv::OpVariable);
-		Object::ID resultId = object.definition.word(2);
-
-		if (objectTy.isBuiltInBlock)
-		{
-			// walk the builtin block, registering each of its members separately.
-			auto m = memberDecorations.find(objectTy.element);
-			ASSERT(m != memberDecorations.end());        // otherwise we wouldn't have marked the type chain
-			auto &structType = pointeeTy.definition;
-			auto offset = 0u;
-			auto word = 2u;
-			for (auto &member : m->second)
-			{
-				auto &memberType = getType(structType.word(word));
-
-				if (member.HasBuiltIn)
-				{
-					builtinInterface[member.BuiltIn] = {resultId, offset, memberType.sizeInComponents};
-				}
-
-				offset += memberType.sizeInComponents;
-				++word;
-			}
-			return;
-		}
-
-		auto d = decorations.find(resultId);
-		if (d != decorations.end() && d->second.HasBuiltIn)
-		{
-			builtinInterface[d->second.BuiltIn] = {resultId, 0, pointeeTy.sizeInComponents};
-		}
-		else
-		{
-			object.kind = Object::Kind::InterfaceVariable;
-			VisitInterface(resultId,
-						   [&userDefinedInterface](Decorations const &d, AttribType type) {
-							   // Populate a single scalar slot in the interface from a collection of decorations and the intended component type.
-							   auto scalarSlot = (d.Location << 2) | d.Component;
-							   ASSERT(scalarSlot >= 0 &&
-									  scalarSlot < static_cast<int32_t>(userDefinedInterface.size()));
-
-							   auto &slot = userDefinedInterface[scalarSlot];
-							   slot.Type = type;
-							   slot.Flat = d.Flat;
-							   slot.NoPerspective = d.NoPerspective;
-							   slot.Centroid = d.Centroid;
-						   });
-		}
-	}
-
-	void SpirvShader::ProcessExecutionMode(InsnIterator insn)
-	{
-		auto mode = static_cast<spv::ExecutionMode>(insn.word(2));
-		switch (mode)
-		{
-		case spv::ExecutionModeEarlyFragmentTests:
-			modes.EarlyFragmentTests = true;
-			break;
-		case spv::ExecutionModeDepthReplacing:
-			modes.DepthReplacing = true;
-			break;
-		case spv::ExecutionModeDepthGreater:
-			modes.DepthGreater = true;
-			break;
-		case spv::ExecutionModeDepthLess:
-			modes.DepthLess = true;
-			break;
-		case spv::ExecutionModeDepthUnchanged:
-			modes.DepthUnchanged = true;
-			break;
-		case spv::ExecutionModeLocalSize:
-			modes.WorkgroupSizeX = insn.word(3);
-			modes.WorkgroupSizeY = insn.word(4);
-			modes.WorkgroupSizeZ = insn.word(5);
-			break;
-		case spv::ExecutionModeOriginUpperLeft:
-			// This is always the case for a Vulkan shader. Do nothing.
-			break;
-		default:
-			UNREACHABLE("Execution mode: %d", int(mode));
-		}
-	}
-
-	uint32_t SpirvShader::ComputeTypeSize(InsnIterator insn)
-	{
-		// Types are always built from the bottom up (with the exception of forward ptrs, which
-		// don't appear in Vulkan shaders. Therefore, we can always assume our component parts have
-		// already been described (and so their sizes determined)
-		switch (insn.opcode())
-		{
-		case spv::OpTypeVoid:
-		case spv::OpTypeSampler:
-		case spv::OpTypeImage:
-		case spv::OpTypeSampledImage:
-		case spv::OpTypeFunction:
-		case spv::OpTypeRuntimeArray:
-			// Objects that don't consume any space.
-			// Descriptor-backed objects currently only need exist at compile-time.
-			// Runtime arrays don't appear in places where their size would be interesting
-			return 0;
-
-		case spv::OpTypeBool:
-		case spv::OpTypeFloat:
-		case spv::OpTypeInt:
-			// All the fundamental types are 1 component. If we ever add support for 8/16/64-bit components,
-			// we might need to change this, but only 32 bit components are required for Vulkan 1.1.
-			return 1;
-
-		case spv::OpTypeVector:
-		case spv::OpTypeMatrix:
-			// Vectors and matrices both consume element count * element size.
-			return getType(insn.word(2)).sizeInComponents * insn.word(3);
-
-		case spv::OpTypeArray:
-		{
-			// Element count * element size. Array sizes come from constant ids.
-			auto arraySize = GetConstScalarInt(insn.word(3));
-			return getType(insn.word(2)).sizeInComponents * arraySize;
-		}
-
-		case spv::OpTypeStruct:
-		{
-			uint32_t size = 0;
-			for (uint32_t i = 2u; i < insn.wordCount(); i++)
-			{
-				size += getType(insn.word(i)).sizeInComponents;
-			}
-			return size;
-		}
-
-		case spv::OpTypePointer:
-			// Runtime representation of a pointer is a per-lane index.
-			// Note: clients are expected to look through the pointer if they want the pointee size instead.
-			return 1;
-
-		default:
-			UNREACHABLE("%s", OpcodeName(insn.opcode()).c_str());
-			return 0;
-		}
-	}
-
-	int SpirvShader::VisitInterfaceInner(Type::ID id, Decorations d, const InterfaceVisitor &f) const
-	{
-		// Recursively walks variable definition and its type tree, taking into account
-		// any explicit Location or Component decorations encountered; where explicit
-		// Locations or Components are not specified, assigns them sequentially.
-		// Collected decorations are carried down toward the leaves and across
-		// siblings; Effect of decorations intentionally does not flow back up the tree.
-		//
-		// F is a functor to be called with the effective decoration set for every component.
-		//
-		// Returns the next available location, and calls f().
-
-		// This covers the rules in Vulkan 1.1 spec, 14.1.4 Location Assignment.
-
-		ApplyDecorationsForId(&d, id);
-
-		auto const &obj = getType(id);
-		switch(obj.opcode())
-		{
-		case spv::OpTypePointer:
-			return VisitInterfaceInner(obj.definition.word(3), d, f);
-		case spv::OpTypeMatrix:
-			for (auto i = 0u; i < obj.definition.word(3); i++, d.Location++)
-			{
-				// consumes same components of N consecutive locations
-				VisitInterfaceInner(obj.definition.word(2), d, f);
-			}
-			return d.Location;
-		case spv::OpTypeVector:
-			for (auto i = 0u; i < obj.definition.word(3); i++, d.Component++)
-			{
-				// consumes N consecutive components in the same location
-				VisitInterfaceInner(obj.definition.word(2), d, f);
-			}
-			return d.Location + 1;
-		case spv::OpTypeFloat:
-			f(d, ATTRIBTYPE_FLOAT);
-			return d.Location + 1;
-		case spv::OpTypeInt:
-			f(d, obj.definition.word(3) ? ATTRIBTYPE_INT : ATTRIBTYPE_UINT);
-			return d.Location + 1;
-		case spv::OpTypeBool:
-			f(d, ATTRIBTYPE_UINT);
-			return d.Location + 1;
-		case spv::OpTypeStruct:
-		{
-			// iterate over members, which may themselves have Location/Component decorations
-			for (auto i = 0u; i < obj.definition.wordCount() - 2; i++)
-			{
-				ApplyDecorationsForIdMember(&d, id, i);
-				d.Location = VisitInterfaceInner(obj.definition.word(i + 2), d, f);
-				d.Component = 0;    // Implicit locations always have component=0
-			}
-			return d.Location;
-		}
-		case spv::OpTypeArray:
-		{
-			auto arraySize = GetConstScalarInt(obj.definition.word(3));
-			for (auto i = 0u; i < arraySize; i++)
-			{
-				d.Location = VisitInterfaceInner(obj.definition.word(2), d, f);
-			}
-			return d.Location;
-		}
-		default:
-			// Intentionally partial; most opcodes do not participate in type hierarchies
-			return 0;
-		}
-	}
-
-	void SpirvShader::VisitInterface(Object::ID id, const InterfaceVisitor &f) const
-	{
-		// Walk a variable definition and call f for each component in it.
-		Decorations d{};
-		ApplyDecorationsForId(&d, id);
-
-		auto def = getObject(id).definition;
-		ASSERT(def.opcode() == spv::OpVariable);
-		VisitInterfaceInner(def.word(1), d, f);
-	}
-
-	void SpirvShader::ApplyDecorationsForAccessChain(Decorations *d, DescriptorDecorations *dd, Object::ID baseId, uint32_t numIndexes, uint32_t const *indexIds) const
-	{
-		ApplyDecorationsForId(d, baseId);
-		auto &baseObject = getObject(baseId);
-		ApplyDecorationsForId(d, baseObject.type);
-		auto typeId = getType(baseObject.type).element;
-
-		for (auto i = 0u; i < numIndexes; i++)
-		{
-			ApplyDecorationsForId(d, typeId);
-			auto & type = getType(typeId);
-			switch (type.opcode())
-			{
-			case spv::OpTypeStruct:
-			{
-				int memberIndex = GetConstScalarInt(indexIds[i]);
-				ApplyDecorationsForIdMember(d, typeId, memberIndex);
-				typeId = type.definition.word(2u + memberIndex);
-				break;
-			}
-			case spv::OpTypeArray:
-			case spv::OpTypeRuntimeArray:
-				if (dd->InputAttachmentIndex >= 0)
-				{
-					dd->InputAttachmentIndex += GetConstScalarInt(indexIds[i]);
-				}
-				typeId = type.element;
-				break;
-			case spv::OpTypeVector:
-				typeId = type.element;
-				break;
-			case spv::OpTypeMatrix:
-				typeId = type.element;
-				d->InsideMatrix = true;
-				break;
-			default:
-				UNREACHABLE("%s", OpcodeName(type.definition.opcode()).c_str());
-			}
-		}
-	}
-
-	SIMD::Pointer SpirvShader::WalkExplicitLayoutAccessChain(Object::ID baseId, uint32_t numIndexes, uint32_t const *indexIds, EmitState const *state) const
-	{
-		// Produce a offset into external memory in sizeof(float) units
-
-		auto &baseObject = getObject(baseId);
-		Type::ID typeId = getType(baseObject.type).element;
-		Decorations d = {};
-		ApplyDecorationsForId(&d, baseObject.type);
-
-		uint32_t arrayIndex = 0;
-		if (baseObject.kind == Object::Kind::DescriptorSet)
-		{
-			auto type = getType(typeId).definition.opcode();
-			if (type == spv::OpTypeArray || type == spv::OpTypeRuntimeArray)
-			{
-				ASSERT(getObject(indexIds[0]).kind == Object::Kind::Constant);
-				arrayIndex = GetConstScalarInt(indexIds[0]);
-
-				numIndexes--;
-				indexIds++;
-				typeId = getType(typeId).element;
-			}
-		}
-
-		auto ptr = GetPointerToData(baseId, arrayIndex, state);
-
-		int constantOffset = 0;
-
-		for (auto i = 0u; i < numIndexes; i++)
-		{
-			auto & type = getType(typeId);
-			ApplyDecorationsForId(&d, typeId);
-
-			switch (type.definition.opcode())
-			{
-			case spv::OpTypeStruct:
-			{
-				int memberIndex = GetConstScalarInt(indexIds[i]);
-				ApplyDecorationsForIdMember(&d, typeId, memberIndex);
-				ASSERT(d.HasOffset);
-				constantOffset += d.Offset;
-				typeId = type.definition.word(2u + memberIndex);
-				break;
-			}
-			case spv::OpTypeArray:
-			case spv::OpTypeRuntimeArray:
-			{
-				// TODO: b/127950082: Check bounds.
-				ASSERT(d.HasArrayStride);
-				auto & obj = getObject(indexIds[i]);
-				if (obj.kind == Object::Kind::Constant)
-				{
-					constantOffset += d.ArrayStride * GetConstScalarInt(indexIds[i]);
-				}
-				else
-				{
-					ptr += SIMD::Int(d.ArrayStride) * state->getIntermediate(indexIds[i]).Int(0);
-				}
-				typeId = type.element;
-				break;
-			}
-			case spv::OpTypeMatrix:
-			{
-				// TODO: b/127950082: Check bounds.
-				ASSERT(d.HasMatrixStride);
-				d.InsideMatrix = true;
-				auto columnStride = (d.HasRowMajor && d.RowMajor) ? static_cast<int32_t>(sizeof(float)) : d.MatrixStride;
-				auto & obj = getObject(indexIds[i]);
-				if (obj.kind == Object::Kind::Constant)
-				{
-					constantOffset += columnStride * GetConstScalarInt(indexIds[i]);
-				}
-				else
-				{
-					ptr += SIMD::Int(columnStride) * state->getIntermediate(indexIds[i]).Int(0);
-				}
-				typeId = type.element;
-				break;
-			}
-			case spv::OpTypeVector:
-			{
-				auto elemStride = (d.InsideMatrix && d.HasRowMajor && d.RowMajor) ? d.MatrixStride : static_cast<int32_t>(sizeof(float));
-				auto & obj = getObject(indexIds[i]);
-				if (obj.kind == Object::Kind::Constant)
-				{
-					constantOffset += elemStride * GetConstScalarInt(indexIds[i]);
-				}
-				else
-				{
-					ptr += SIMD::Int(elemStride) * state->getIntermediate(indexIds[i]).Int(0);
-				}
-				typeId = type.element;
-				break;
-			}
-			default:
-				UNREACHABLE("%s", OpcodeName(type.definition.opcode()).c_str());
-			}
-		}
-
-		ptr += constantOffset;
-		return ptr;
-	}
-
-	SIMD::Pointer SpirvShader::WalkAccessChain(Object::ID baseId, uint32_t numIndexes, uint32_t const *indexIds, EmitState const *state) const
-	{
-		// TODO: avoid doing per-lane work in some cases if we can?
-		auto routine = state->routine;
-		auto &baseObject = getObject(baseId);
-		Type::ID typeId = getType(baseObject.type).element;
-
-		auto ptr = state->getPointer(baseId);
-
-		int constantOffset = 0;
-
-		for (auto i = 0u; i < numIndexes; i++)
-		{
-			auto & type = getType(typeId);
-			switch(type.opcode())
-			{
-			case spv::OpTypeStruct:
-			{
-				int memberIndex = GetConstScalarInt(indexIds[i]);
-				int offsetIntoStruct = 0;
-				for (auto j = 0; j < memberIndex; j++) {
-					auto memberType = type.definition.word(2u + j);
-					offsetIntoStruct += getType(memberType).sizeInComponents * sizeof(float);
-				}
-				constantOffset += offsetIntoStruct;
-				typeId = type.definition.word(2u + memberIndex);
-				break;
-			}
-
-			case spv::OpTypeVector:
-			case spv::OpTypeMatrix:
-			case spv::OpTypeArray:
-			case spv::OpTypeRuntimeArray:
-			{
-				// TODO: b/127950082: Check bounds.
-				if (getType(baseObject.type).storageClass == spv::StorageClassUniformConstant)
-				{
-					// indexing into an array of descriptors.
-					auto &obj = getObject(indexIds[i]);
-					if (obj.kind != Object::Kind::Constant)
-					{
-						UNSUPPORTED("SPIR-V SampledImageArrayDynamicIndexing Capability");
-					}
-
-					auto d = descriptorDecorations.at(baseId);
-					ASSERT(d.DescriptorSet >= 0);
-					ASSERT(d.Binding >= 0);
-					auto setLayout = routine->pipelineLayout->getDescriptorSetLayout(d.DescriptorSet);
-					auto stride = static_cast<uint32_t>(setLayout->getBindingStride(d.Binding));
-					ptr.base += stride * GetConstScalarInt(indexIds[i]);
-				}
-				else
-				{
-					auto stride = getType(type.element).sizeInComponents * static_cast<uint32_t>(sizeof(float));
-					auto & obj = getObject(indexIds[i]);
-					if (obj.kind == Object::Kind::Constant)
-					{
-						ptr += stride * GetConstScalarInt(indexIds[i]);
-					}
-					else
-					{
-						ptr += SIMD::Int(stride) * state->getIntermediate(indexIds[i]).Int(0);
-					}
-				}
-				typeId = type.element;
-				break;
-			}
-
-			default:
-				UNREACHABLE("%s", OpcodeName(type.opcode()).c_str());
-			}
-		}
-
-		if (constantOffset != 0)
-		{
-			ptr += constantOffset;
-		}
-		return ptr;
-	}
-
-	uint32_t SpirvShader::WalkLiteralAccessChain(Type::ID typeId, uint32_t numIndexes, uint32_t const *indexes) const
-	{
-		uint32_t componentOffset = 0;
-
-		for (auto i = 0u; i < numIndexes; i++)
-		{
-			auto & type = getType(typeId);
-			switch(type.opcode())
-			{
-			case spv::OpTypeStruct:
-			{
-				int memberIndex = indexes[i];
-				int offsetIntoStruct = 0;
-				for (auto j = 0; j < memberIndex; j++) {
-					auto memberType = type.definition.word(2u + j);
-					offsetIntoStruct += getType(memberType).sizeInComponents;
-				}
-				componentOffset += offsetIntoStruct;
-				typeId = type.definition.word(2u + memberIndex);
-				break;
-			}
-
-			case spv::OpTypeVector:
-			case spv::OpTypeMatrix:
-			case spv::OpTypeArray:
-			{
-				auto elementType = type.definition.word(2);
-				auto stride = getType(elementType).sizeInComponents;
-				componentOffset += stride * indexes[i];
-				typeId = elementType;
-				break;
-			}
-
-			default:
-				UNREACHABLE("%s", OpcodeName(type.opcode()).c_str());
-			}
-		}
-
-		return componentOffset;
-	}
-
-	void SpirvShader::Decorations::Apply(spv::Decoration decoration, uint32_t arg)
-	{
-		switch (decoration)
-		{
-		case spv::DecorationLocation:
-			HasLocation = true;
-			Location = static_cast<int32_t>(arg);
-			break;
-		case spv::DecorationComponent:
-			HasComponent = true;
-			Component = arg;
-			break;
-		case spv::DecorationBuiltIn:
-			HasBuiltIn = true;
-			BuiltIn = static_cast<spv::BuiltIn>(arg);
-			break;
-		case spv::DecorationFlat:
-			Flat = true;
-			break;
-		case spv::DecorationNoPerspective:
-			NoPerspective = true;
-			break;
-		case spv::DecorationCentroid:
-			Centroid = true;
-			break;
-		case spv::DecorationBlock:
-			Block = true;
-			break;
-		case spv::DecorationBufferBlock:
-			BufferBlock = true;
-			break;
-		case spv::DecorationOffset:
-			HasOffset = true;
-			Offset = static_cast<int32_t>(arg);
-			break;
-		case spv::DecorationArrayStride:
-			HasArrayStride = true;
-			ArrayStride = static_cast<int32_t>(arg);
-			break;
-		case spv::DecorationMatrixStride:
-			HasMatrixStride = true;
-			MatrixStride = static_cast<int32_t>(arg);
-			break;
-		case spv::DecorationRelaxedPrecision:
-			RelaxedPrecision = true;
-			break;
-		case spv::DecorationRowMajor:
-			HasRowMajor = true;
-			RowMajor = true;
-			break;
-		case spv::DecorationColMajor:
-			HasRowMajor = true;
-			RowMajor = false;
-		default:
-			// Intentionally partial, there are many decorations we just don't care about.
-			break;
-		}
-	}
-
-	void SpirvShader::Decorations::Apply(const sw::SpirvShader::Decorations &src)
-	{
-		// Apply a decoration group to this set of decorations
-		if (src.HasBuiltIn)
-		{
-			HasBuiltIn = true;
-			BuiltIn = src.BuiltIn;
-		}
-
-		if (src.HasLocation)
-		{
-			HasLocation = true;
-			Location = src.Location;
-		}
-
-		if (src.HasComponent)
-		{
-			HasComponent = true;
-			Component = src.Component;
-		}
-
-		if (src.HasOffset)
-		{
-			HasOffset = true;
-			Offset = src.Offset;
-		}
-
-		if (src.HasArrayStride)
-		{
-			HasArrayStride = true;
-			ArrayStride = src.ArrayStride;
-		}
-
-		if (src.HasMatrixStride)
-		{
-			HasMatrixStride = true;
-			MatrixStride = src.MatrixStride;
-		}
-
-		if (src.HasRowMajor)
-		{
-			HasRowMajor = true;
-			RowMajor = src.RowMajor;
-		}
-
-		Flat |= src.Flat;
-		NoPerspective |= src.NoPerspective;
-		Centroid |= src.Centroid;
-		Block |= src.Block;
-		BufferBlock |= src.BufferBlock;
-		RelaxedPrecision |= src.RelaxedPrecision;
-		InsideMatrix |= src.InsideMatrix;
-	}
-
-	void SpirvShader::DescriptorDecorations::Apply(const sw::SpirvShader::DescriptorDecorations &src)
-	{
-		if(src.DescriptorSet >= 0)
-		{
-			DescriptorSet = src.DescriptorSet;
-		}
-
-		if(src.Binding >= 0)
-		{
-			Binding = src.Binding;
-		}
-
-		if (src.InputAttachmentIndex >= 0)
-		{
-			InputAttachmentIndex = src.InputAttachmentIndex;
-		}
-	}
-
-	void SpirvShader::ApplyDecorationsForId(Decorations *d, TypeOrObjectID id) const
-	{
-		auto it = decorations.find(id);
-		if (it != decorations.end())
-			d->Apply(it->second);
-	}
-
-	void SpirvShader::ApplyDecorationsForIdMember(Decorations *d, Type::ID id, uint32_t member) const
-	{
-		auto it = memberDecorations.find(id);
-		if (it != memberDecorations.end() && member < it->second.size())
-		{
-			d->Apply(it->second[member]);
-		}
-	}
-
-	void SpirvShader::DefineResult(const InsnIterator &insn)
-	{
-		Type::ID typeId = insn.word(1);
-		Object::ID resultId = insn.word(2);
-		auto &object = defs[resultId];
-		object.type = typeId;
-
-		switch (getType(typeId).opcode())
-		{
-		case spv::OpTypePointer:
-		case spv::OpTypeImage:
-		case spv::OpTypeSampledImage:
-		case spv::OpTypeSampler:
-			object.kind = Object::Kind::Pointer;
-			break;
-
-		default:
-			object.kind = Object::Kind::Intermediate;
-		}
-
-		object.definition = insn;
-	}
-
-	OutOfBoundsBehavior SpirvShader::EmitState::getOutOfBoundsBehavior(spv::StorageClass storageClass) const
-	{
-		switch(storageClass)
-		{
-		case spv::StorageClassUniform:
-		case spv::StorageClassStorageBuffer:
-			// Buffer resource access. robustBufferAccess feature applies.
-			return robustBufferAccess ? OutOfBoundsBehavior::RobustBufferAccess
-			                          : OutOfBoundsBehavior::UndefinedBehavior;
-
-		case spv::StorageClassImage:
-			return OutOfBoundsBehavior::UndefinedValue;  // "The value returned by a read of an invalid texel is undefined"
-
-		case spv::StorageClassInput:
-			if(executionModel == spv::ExecutionModelVertex)
-			{
-				// Vertex attributes follow robustBufferAccess rules.
-				return robustBufferAccess ? OutOfBoundsBehavior::RobustBufferAccess
-				                          : OutOfBoundsBehavior::UndefinedBehavior;
-			}
-			// Fall through to default case.
-		default:
-			// TODO(b/137183137): Optimize if the pointer resulted from OpInBoundsAccessChain.
-			// TODO(b/131224163): Optimize cases statically known to be within bounds.
-			return OutOfBoundsBehavior::UndefinedValue;
-		}
-
-		return OutOfBoundsBehavior::Nullify;
-	}
-
-	// emit-time
-
-	void SpirvShader::emitProlog(SpirvRoutine *routine) const
-	{
-		for (auto insn : *this)
-		{
-			switch (insn.opcode())
-			{
-			case spv::OpVariable:
-			{
-				Type::ID resultPointerTypeId = insn.word(1);
-				auto resultPointerType = getType(resultPointerTypeId);
-				auto pointeeType = getType(resultPointerType.element);
-
-				if(pointeeType.sizeInComponents > 0)  // TODO: what to do about zero-slot objects?
-				{
-					Object::ID resultId = insn.word(2);
-					routine->createVariable(resultId, pointeeType.sizeInComponents);
-				}
-				break;
-			}
-			case spv::OpPhi:
-			{
-				auto type = getType(insn.word(1));
-				Object::ID resultId = insn.word(2);
-				routine->phis.emplace(resultId, SpirvRoutine::Variable(type.sizeInComponents));
-				break;
-			}
-
-			case spv::OpImageDrefGather:
-			case spv::OpImageFetch:
-			case spv::OpImageGather:
-			case spv::OpImageQueryLod:
-			case spv::OpImageSampleDrefExplicitLod:
-			case spv::OpImageSampleDrefImplicitLod:
-			case spv::OpImageSampleExplicitLod:
-			case spv::OpImageSampleImplicitLod:
-			case spv::OpImageSampleProjDrefExplicitLod:
-			case spv::OpImageSampleProjDrefImplicitLod:
-			case spv::OpImageSampleProjExplicitLod:
-			case spv::OpImageSampleProjImplicitLod:
-			{
-				Object::ID resultId = insn.word(2);
-				routine->samplerCache.emplace(resultId, SpirvRoutine::SamplerCache{});
-				break;
-			}
-
-			default:
-				// Nothing else produces interface variables, so can all be safely ignored.
-				break;
-			}
-		}
-	}
-
-	void SpirvShader::emit(SpirvRoutine *routine, RValue<SIMD::Int> const &activeLaneMask, RValue<SIMD::Int> const &storesAndAtomicsMask, const vk::DescriptorSet::Bindings &descriptorSets) const
-	{
-		EmitState state(routine, entryPoint, activeLaneMask, storesAndAtomicsMask, descriptorSets, robustBufferAccess, executionModel);
-
-		// Emit everything up to the first label
-		// TODO: Separate out dispatch of block from non-block instructions?
-		for (auto insn : *this)
-		{
-			if (insn.opcode() == spv::OpLabel)
-			{
-				break;
-			}
-			EmitInstruction(insn, &state);
-		}
-
-		// Emit all the blocks starting from entryPoint.
-		EmitBlocks(getFunction(entryPoint).entry, &state);
-	}
-
-	void SpirvShader::EmitInstructions(InsnIterator begin, InsnIterator end, EmitState *state) const
-	{
-		for (auto insn = begin; insn != end; insn++)
-		{
-			auto res = EmitInstruction(insn, state);
-			switch (res)
-			{
-			case EmitResult::Continue:
-				continue;
-			case EmitResult::Terminator:
-				break;
-			default:
-				UNREACHABLE("Unexpected EmitResult %d", int(res));
-				break;
-			}
-		}
-	}
-
-	SpirvShader::EmitResult SpirvShader::EmitInstruction(InsnIterator insn, EmitState *state) const
-	{
-		auto opcode = insn.opcode();
+		spv::Op opcode = insn.opcode();
 
 		switch (opcode)
 		{
+		case spv::OpEntryPoint:
+		{
+			executionModel = spv::ExecutionModel(insn.word(1));
+			auto id = Function::ID(insn.word(2));
+			auto name = insn.string(3);
+			auto stage = executionModelToStage(executionModel);
+			if (stage == pipelineStage && strcmp(name, entryPointName) == 0)
+			{
+				ASSERT_MSG(entryPoint == 0, "Duplicate entry point with name '%s' and stage %d", name, int(stage));
+				entryPoint = id;
+			}
+			break;
+		}
+
+		case spv::OpExecutionMode:
+			ProcessExecutionMode(insn);
+			break;
+
+		case spv::OpDecorate:
+		{
+			TypeOrObjectID targetId = insn.word(1);
+			auto decoration = static_cast<spv::Decoration>(insn.word(2));
+			uint32_t value = insn.wordCount() > 3 ? insn.word(3) : 0;
+
+			decorations[targetId].Apply(decoration, value);
+
+			switch(decoration)
+			{
+			case spv::DecorationDescriptorSet:
+				descriptorDecorations[targetId].DescriptorSet = value;
+				break;
+			case spv::DecorationBinding:
+				descriptorDecorations[targetId].Binding = value;
+				break;
+			case spv::DecorationInputAttachmentIndex:
+				descriptorDecorations[targetId].InputAttachmentIndex = value;
+				break;
+			default:
+				// Only handling descriptor decorations here.
+				break;
+			}
+
+			if (decoration == spv::DecorationCentroid)
+				modes.NeedsCentroid = true;
+			break;
+		}
+
+		case spv::OpMemberDecorate:
+		{
+			Type::ID targetId = insn.word(1);
+			auto memberIndex = insn.word(2);
+			auto decoration = static_cast<spv::Decoration>(insn.word(3));
+			uint32_t value = insn.wordCount() > 4 ? insn.word(4) : 0;
+
+			auto &d = memberDecorations[targetId];
+			if (memberIndex >= d.size())
+				d.resize(memberIndex + 1);    // on demand; exact size would require another pass...
+
+			d[memberIndex].Apply(decoration, value);
+
+			if (decoration == spv::DecorationCentroid)
+				modes.NeedsCentroid = true;
+			break;
+		}
+
+		case spv::OpDecorationGroup:
+			// Nothing to do here. We don't need to record the definition of the group; we'll just have
+			// the bundle of decorations float around. If we were to ever walk the decorations directly,
+			// we might think about introducing this as a real Object.
+			break;
+
+		case spv::OpGroupDecorate:
+		{
+			uint32_t group = insn.word(1);
+			auto const &groupDecorations = decorations[group];
+			auto const &descriptorGroupDecorations = descriptorDecorations[group];
+			for (auto i = 2u; i < insn.wordCount(); i++)
+			{
+				// Remaining operands are targets to apply the group to.
+				uint32_t target = insn.word(i);
+				decorations[target].Apply(groupDecorations);
+				descriptorDecorations[target].Apply(descriptorGroupDecorations);
+			}
+
+			break;
+		}
+
+		case spv::OpGroupMemberDecorate:
+		{
+			auto const &srcDecorations = decorations[insn.word(1)];
+			for (auto i = 2u; i < insn.wordCount(); i += 2)
+			{
+				// remaining operands are pairs of <id>, literal for members to apply to.
+				auto &d = memberDecorations[insn.word(i)];
+				auto memberIndex = insn.word(i + 1);
+				if (memberIndex >= d.size())
+					d.resize(memberIndex + 1);    // on demand resize, see above...
+				d[memberIndex].Apply(srcDecorations);
+			}
+			break;
+		}
+
+		case spv::OpLabel:
+		{
+			ASSERT(currentBlock.value() == 0);
+			currentBlock = Block::ID(insn.word(1));
+			blockStart = insn;
+			break;
+		}
+
+		// Branch Instructions (subset of Termination Instructions):
+		case spv::OpBranch:
+		case spv::OpBranchConditional:
+		case spv::OpSwitch:
+		case spv::OpReturn:
+		// fallthrough
+
+		// Termination instruction:
+		case spv::OpKill:
+		case spv::OpUnreachable:
+		{
+			ASSERT(currentBlock.value() != 0);
+			ASSERT(currentFunction.value() != 0);
+
+			auto blockEnd = insn; blockEnd++;
+			functions[currentFunction].blocks[currentBlock] = Block(blockStart, blockEnd);
+			currentBlock = Block::ID(0);
+
+			if (opcode == spv::OpKill)
+			{
+				modes.ContainsKill = true;
+			}
+			break;
+		}
+
+		case spv::OpLoopMerge:
+		case spv::OpSelectionMerge:
+			break; // Nothing to do in analysis pass.
+
 		case spv::OpTypeVoid:
+		case spv::OpTypeBool:
 		case spv::OpTypeInt:
 		case spv::OpTypeFloat:
-		case spv::OpTypeBool:
 		case spv::OpTypeVector:
+		case spv::OpTypeMatrix:
+		case spv::OpTypeImage:
+		case spv::OpTypeSampler:
+		case spv::OpTypeSampledImage:
 		case spv::OpTypeArray:
 		case spv::OpTypeRuntimeArray:
-		case spv::OpTypeMatrix:
 		case spv::OpTypeStruct:
 		case spv::OpTypePointer:
 		case spv::OpTypeFunction:
-		case spv::OpTypeImage:
-		case spv::OpTypeSampledImage:
-		case spv::OpTypeSampler:
-		case spv::OpExecutionMode:
-		case spv::OpMemoryModel:
-		case spv::OpFunction:
-		case spv::OpFunctionEnd:
+			DeclareType(insn);
+			break;
+
+		case spv::OpVariable:
+		{
+			Type::ID typeId = insn.word(1);
+			Object::ID resultId = insn.word(2);
+			auto storageClass = static_cast<spv::StorageClass>(insn.word(3));
+
+			auto &object = defs[resultId];
+			object.kind = Object::Kind::Pointer;
+			object.definition = insn;
+			object.type = typeId;
+
+			ASSERT(getType(typeId).definition.opcode() == spv::OpTypePointer);
+			ASSERT(getType(typeId).storageClass == storageClass);
+
+			switch (storageClass)
+			{
+			case spv::StorageClassInput:
+			case spv::StorageClassOutput:
+				ProcessInterfaceVariable(object);
+				break;
+
+			case spv::StorageClassUniform:
+			case spv::StorageClassStorageBuffer:
+				object.kind = Object::Kind::DescriptorSet;
+				break;
+
+			case spv::StorageClassPushConstant:
+			case spv::StorageClassPrivate:
+			case spv::StorageClassFunction:
+			case spv::StorageClassUniformConstant:
+				break; // Correctly handled.
+
+			case spv::StorageClassWorkgroup:
+			{
+				auto &elTy = getType(getType(typeId).element);
+				auto sizeInBytes = elTy.sizeInComponents * static_cast<uint32_t>(sizeof(float));
+				workgroupMemory.allocate(resultId, sizeInBytes);
+				object.kind = Object::Kind::Pointer;
+				break;
+			}
+			case spv::StorageClassAtomicCounter:
+			case spv::StorageClassImage:
+				UNIMPLEMENTED("StorageClass %d not yet implemented", (int)storageClass);
+				break;
+
+			case spv::StorageClassCrossWorkgroup:
+				UNSUPPORTED("SPIR-V OpenCL Execution Model (StorageClassCrossWorkgroup)");
+				break;
+
+			case spv::StorageClassGeneric:
+				UNSUPPORTED("SPIR-V GenericPointer Capability (StorageClassGeneric)");
+				break;
+
+			default:
+				UNREACHABLE("Unexpected StorageClass %d", storageClass); // See Appendix A of the Vulkan spec.
+				break;
+			}
+			break;
+		}
+
 		case spv::OpConstant:
-		case spv::OpConstantNull:
-		case spv::OpConstantTrue:
-		case spv::OpConstantFalse:
-		case spv::OpConstantComposite:
 		case spv::OpSpecConstant:
-		case spv::OpSpecConstantTrue:
+			CreateConstant(insn).constantValue[0] = insn.word(3);
+			break;
+		case spv::OpConstantFalse:
 		case spv::OpSpecConstantFalse:
-		case spv::OpSpecConstantComposite:
-		case spv::OpSpecConstantOp:
+			CreateConstant(insn).constantValue[0] = 0;    // Represent Boolean false as zero.
+			break;
+		case spv::OpConstantTrue:
+		case spv::OpSpecConstantTrue:
+			CreateConstant(insn).constantValue[0] = ~0u;  // Represent Boolean true as all bits set.
+			break;
+		case spv::OpConstantNull:
 		case spv::OpUndef:
-		case spv::OpExtension:
+		{
+			// TODO: consider a real LLVM-level undef. For now, zero is a perfectly good value.
+			// OpConstantNull forms a constant of arbitrary type, all zeros.
+			auto &object = CreateConstant(insn);
+			auto &objectTy = getType(object.type);
+			for (auto i = 0u; i < objectTy.sizeInComponents; i++)
+			{
+				object.constantValue[i] = 0;
+			}
+			break;
+		}
+		case spv::OpConstantComposite:
+		case spv::OpSpecConstantComposite:
+		{
+			auto &object = CreateConstant(insn);
+			auto offset = 0u;
+			for (auto i = 0u; i < insn.wordCount() - 3; i++)
+			{
+				auto &constituent = getObject(insn.word(i + 3));
+				auto &constituentTy = getType(constituent.type);
+				for (auto j = 0u; j < constituentTy.sizeInComponents; j++)
+				{
+					object.constantValue[offset++] = constituent.constantValue[j];
+				}
+			}
+
+			auto objectId = Object::ID(insn.word(2));
+			auto decorationsIt = decorations.find(objectId);
+			if (decorationsIt != decorations.end() &&
+				decorationsIt->second.BuiltIn == spv::BuiltInWorkgroupSize)
+			{
+				// https://www.khronos.org/registry/vulkan/specs/1.1/html/vkspec.html#interfaces-builtin-variables :
+				// Decorating an object with the WorkgroupSize built-in
+				// decoration will make that object contain the dimensions
+				// of a local workgroup. If an object is decorated with the
+				// WorkgroupSize decoration, this must take precedence over
+				// any execution mode set for LocalSize.
+				// The object decorated with WorkgroupSize must be declared
+				// as a three-component vector of 32-bit integers.
+				ASSERT(getType(object.type).sizeInComponents == 3);
+				modes.WorkgroupSizeX = object.constantValue[0];
+				modes.WorkgroupSizeY = object.constantValue[1];
+				modes.WorkgroupSizeZ = object.constantValue[2];
+			}
+			break;
+		}
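
A quick sketch (hypothetical SPIR-V, not part of this change) of how a WorkgroupSize constant flows through the case above; the module, IDs, and values are illustrative only:

    // %uint_8 = OpConstant %uint 8                   ; constantValue[0] = 8
    // %uint_1 = OpConstant %uint 1                   ; constantValue[0] = 1
    // %ws = OpConstantComposite %v3uint %uint_8 %uint_8 %uint_1
    //       (decorated BuiltIn WorkgroupSize)
    // The constituents flatten in order, so %ws holds {8, 8, 1} and the branch
    // above sets modes.WorkgroupSizeX/Y/Z to 8/8/1, taking precedence over any
    // OpExecutionMode LocalSize declaration.
    constexpr uint32_t flattened[3] = {8, 8, 1};
    static_assert(flattened[0] * flattened[1] * flattened[2] == 64, "8x8x1 invocations per workgroup");
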
+		case spv::OpSpecConstantOp:
+			EvalSpecConstantOp(insn);
+			break;
+
 		case spv::OpCapability:
-		case spv::OpEntryPoint:
+		{
+			auto capability = static_cast<spv::Capability>(insn.word(1));
+			switch (capability)
+			{
+			case spv::CapabilityMatrix: capabilities.Matrix = true; break;
+			case spv::CapabilityShader: capabilities.Shader = true; break;
+			case spv::CapabilityClipDistance: capabilities.ClipDistance = true; break;
+			case spv::CapabilityCullDistance: capabilities.CullDistance = true; break;
+			case spv::CapabilityInputAttachment: capabilities.InputAttachment = true; break;
+			case spv::CapabilitySampled1D: capabilities.Sampled1D = true; break;
+			case spv::CapabilityImage1D: capabilities.Image1D = true; break;
+			case spv::CapabilityImageCubeArray: capabilities.ImageCubeArray = true; break;
+			case spv::CapabilitySampledBuffer: capabilities.SampledBuffer = true; break;
+			case spv::CapabilitySampledCubeArray: capabilities.SampledCubeArray = true; break;
+			case spv::CapabilityImageBuffer: capabilities.ImageBuffer = true; break;
+			case spv::CapabilityStorageImageExtendedFormats: capabilities.StorageImageExtendedFormats = true; break;
+			case spv::CapabilityImageQuery: capabilities.ImageQuery = true; break;
+			case spv::CapabilityDerivativeControl: capabilities.DerivativeControl = true; break;
+			case spv::CapabilityGroupNonUniform: capabilities.GroupNonUniform = true; break;
+			case spv::CapabilityGroupNonUniformVote: capabilities.GroupNonUniformVote = true; break;
+			case spv::CapabilityGroupNonUniformArithmetic: capabilities.GroupNonUniformArithmetic = true; break;
+			case spv::CapabilityGroupNonUniformBallot: capabilities.GroupNonUniformBallot = true; break;
+			case spv::CapabilityGroupNonUniformShuffle: capabilities.GroupNonUniformShuffle = true; break;
+			case spv::CapabilityGroupNonUniformShuffleRelative: capabilities.GroupNonUniformShuffleRelative = true; break;
+			case spv::CapabilityDeviceGroup: capabilities.DeviceGroup = true; break;
+			case spv::CapabilityMultiView: capabilities.MultiView = true; break;
+			default:
+				UNSUPPORTED("Unsupported capability %u", insn.word(1));
+			}
+			break; // Various capabilities will be declared, but none affect our code generation at this point.
+		}
+
+		case spv::OpMemoryModel:
+			break; // Memory model does not affect our code generation until we decide to do Vulkan Memory Model support.
+
+		case spv::OpFunction:
+		{
+			auto functionId = Function::ID(insn.word(2));
+			ASSERT_MSG(currentFunction == 0, "Functions %d and %d overlap", currentFunction.value(), functionId.value());
+			currentFunction = functionId;
+			auto &function = functions[functionId];
+			function.result = Type::ID(insn.word(1));
+			function.type = Type::ID(insn.word(4));
+			// Scan forward to find the function's label.
+			for (auto it = insn; it != end() && function.entry == 0; it++)
+			{
+				switch (it.opcode())
+				{
+				case spv::OpFunction:
+				case spv::OpFunctionParameter:
+					break;
+				case spv::OpLabel:
+					function.entry = Block::ID(it.word(1));
+					break;
+				default:
+					WARN("Unexpected opcode '%s' following OpFunction", OpcodeName(it.opcode()).c_str());
+				}
+			}
+			ASSERT_MSG(function.entry != 0, "Function<%d> has no label", currentFunction.value());
+			break;
+		}
+
+		case spv::OpFunctionEnd:
+			currentFunction = 0;
+			break;
+
 		case spv::OpExtInstImport:
-		case spv::OpDecorate:
-		case spv::OpMemberDecorate:
-		case spv::OpGroupDecorate:
-		case spv::OpGroupMemberDecorate:
-		case spv::OpDecorationGroup:
+		{
+			// We will only support the GLSL 450 extended instruction set, so no point in tracking the ID we assign it.
+			// Valid shaders will not attempt to import any other instruction sets.
+			auto ext = insn.string(2);
+			if (0 != strcmp("GLSL.std.450", ext))
+			{
+				UNSUPPORTED("SPIR-V Extension: %s", ext);
+			}
+			break;
+		}
 		case spv::OpName:
 		case spv::OpMemberName:
 		case spv::OpSource:
@@ -1596,110 +430,85 @@
 		case spv::OpNoLine:
 		case spv::OpModuleProcessed:
 		case spv::OpString:
-			// Nothing to do at emit time. These are either fully handled at analysis time,
-			// or don't require any work at all.
-			return EmitResult::Continue;
+			// No semantic impact
+			break;
 
-		case spv::OpLabel:
-			return EmitResult::Continue;
+		case spv::OpFunctionParameter:
+			// These should have all been removed by preprocessing passes. If we see them here,
+			// our assumptions are wrong and we will probably generate wrong code.
+			UNREACHABLE("%s should have already been lowered.", OpcodeName(opcode).c_str());
+			break;
 
-		case spv::OpVariable:
-			return EmitVariable(insn, state);
+		case spv::OpFunctionCall:
+			// TODO(b/141246700): Add full support for spv::OpFunctionCall
+			break;
+
+		case spv::OpFConvert:
+			UNSUPPORTED("SPIR-V Float16 or Float64 Capability (OpFConvert)");
+			break;
+
+		case spv::OpSConvert:
+			UNSUPPORTED("SPIR-V Int16 or Int64 Capability (OpSConvert)");
+			break;
+
+		case spv::OpUConvert:
+			UNSUPPORTED("SPIR-V Int16 or Int64 Capability (OpUConvert)");
+			break;
 
 		case spv::OpLoad:
-		case spv::OpAtomicLoad:
-			return EmitLoad(insn, state);
-
-		case spv::OpStore:
-		case spv::OpAtomicStore:
-			return EmitStore(insn, state);
-
-		case spv::OpAtomicIAdd:
-		case spv::OpAtomicISub:
-		case spv::OpAtomicSMin:
-		case spv::OpAtomicSMax:
-		case spv::OpAtomicUMin:
-		case spv::OpAtomicUMax:
-		case spv::OpAtomicAnd:
-		case spv::OpAtomicOr:
-		case spv::OpAtomicXor:
-		case spv::OpAtomicIIncrement:
-		case spv::OpAtomicIDecrement:
-		case spv::OpAtomicExchange:
-			return EmitAtomicOp(insn, state);
-
-		case spv::OpAtomicCompareExchange:
-			return EmitAtomicCompareExchange(insn, state);
-
 		case spv::OpAccessChain:
 		case spv::OpInBoundsAccessChain:
-			return EmitAccessChain(insn, state);
+		case spv::OpSampledImage:
+		case spv::OpImage:
+			{
+				// Propagate the descriptor decorations to the result.
+				Object::ID resultId = insn.word(2);
+				Object::ID pointerId = insn.word(3);
+				const auto &d = descriptorDecorations.find(pointerId);
+
+				if(d != descriptorDecorations.end())
+				{
+					descriptorDecorations[resultId] = d->second;
+				}
+
+				DefineResult(insn);
+
+				if (opcode == spv::OpAccessChain || opcode == spv::OpInBoundsAccessChain)
+				{
+					Decorations dd{};
+					ApplyDecorationsForAccessChain(&dd, &descriptorDecorations[resultId], pointerId, insn.wordCount() - 4, insn.wordPointer(4));
+					// Note: offset is the one thing that does *not* propagate, as the access chain accounts for it.
+					dd.HasOffset = false;
+					decorations[resultId].Apply(dd);
+				}
+			}
+			break;
 
 		case spv::OpCompositeConstruct:
-			return EmitCompositeConstruct(insn, state);
-
 		case spv::OpCompositeInsert:
-			return EmitCompositeInsert(insn, state);
-
 		case spv::OpCompositeExtract:
-			return EmitCompositeExtract(insn, state);
-
 		case spv::OpVectorShuffle:
-			return EmitVectorShuffle(insn, state);
-
-		case spv::OpVectorExtractDynamic:
-			return EmitVectorExtractDynamic(insn, state);
-
-		case spv::OpVectorInsertDynamic:
-			return EmitVectorInsertDynamic(insn, state);
-
 		case spv::OpVectorTimesScalar:
 		case spv::OpMatrixTimesScalar:
-			return EmitVectorTimesScalar(insn, state);
-
 		case spv::OpMatrixTimesVector:
-			return EmitMatrixTimesVector(insn, state);
-
 		case spv::OpVectorTimesMatrix:
-			return EmitVectorTimesMatrix(insn, state);
-
 		case spv::OpMatrixTimesMatrix:
-			return EmitMatrixTimesMatrix(insn, state);
-
 		case spv::OpOuterProduct:
-			return EmitOuterProduct(insn, state);
-
 		case spv::OpTranspose:
-			return EmitTranspose(insn, state);
-
+		case spv::OpVectorExtractDynamic:
+		case spv::OpVectorInsertDynamic:
+		// Unary ops
 		case spv::OpNot:
-    	case spv::OpBitFieldInsert:
-    	case spv::OpBitFieldSExtract:
-    	case spv::OpBitFieldUExtract:
-    	case spv::OpBitReverse:
-    	case spv::OpBitCount:
+		case spv::OpBitFieldInsert:
+		case spv::OpBitFieldSExtract:
+		case spv::OpBitFieldUExtract:
+		case spv::OpBitReverse:
+		case spv::OpBitCount:
 		case spv::OpSNegate:
 		case spv::OpFNegate:
 		case spv::OpLogicalNot:
-		case spv::OpConvertFToU:
-		case spv::OpConvertFToS:
-		case spv::OpConvertSToF:
-		case spv::OpConvertUToF:
-		case spv::OpBitcast:
-		case spv::OpIsInf:
-		case spv::OpIsNan:
-		case spv::OpDPdx:
-		case spv::OpDPdxCoarse:
-		case spv::OpDPdy:
-		case spv::OpDPdyCoarse:
-		case spv::OpFwidth:
-		case spv::OpFwidthCoarse:
-		case spv::OpDPdxFine:
-		case spv::OpDPdyFine:
-		case spv::OpFwidthFine:
 		case spv::OpQuantizeToF16:
-			return EmitUnaryOp(insn, state);
-
+		// Binary ops
 		case spv::OpIAdd:
 		case spv::OpISub:
 		case spv::OpIMul:
@@ -1750,124 +559,60 @@
 		case spv::OpSMulExtended:
 		case spv::OpIAddCarry:
 		case spv::OpISubBorrow:
-			return EmitBinaryOp(insn, state);
-
 		case spv::OpDot:
-			return EmitDot(insn, state);
-
+		case spv::OpConvertFToU:
+		case spv::OpConvertFToS:
+		case spv::OpConvertSToF:
+		case spv::OpConvertUToF:
+		case spv::OpBitcast:
 		case spv::OpSelect:
-			return EmitSelect(insn, state);
-
 		case spv::OpExtInst:
-			return EmitExtendedInstruction(insn, state);
-
+		case spv::OpIsInf:
+		case spv::OpIsNan:
 		case spv::OpAny:
-			return EmitAny(insn, state);
-
 		case spv::OpAll:
-			return EmitAll(insn, state);
-
-		case spv::OpBranch:
-			return EmitBranch(insn, state);
-
+		case spv::OpDPdx:
+		case spv::OpDPdxCoarse:
+		case spv::OpDPdy:
+		case spv::OpDPdyCoarse:
+		case spv::OpFwidth:
+		case spv::OpFwidthCoarse:
+		case spv::OpDPdxFine:
+		case spv::OpDPdyFine:
+		case spv::OpFwidthFine:
+		case spv::OpAtomicLoad:
+		case spv::OpAtomicIAdd:
+		case spv::OpAtomicISub:
+		case spv::OpAtomicSMin:
+		case spv::OpAtomicSMax:
+		case spv::OpAtomicUMin:
+		case spv::OpAtomicUMax:
+		case spv::OpAtomicAnd:
+		case spv::OpAtomicOr:
+		case spv::OpAtomicXor:
+		case spv::OpAtomicIIncrement:
+		case spv::OpAtomicIDecrement:
+		case spv::OpAtomicExchange:
+		case spv::OpAtomicCompareExchange:
 		case spv::OpPhi:
-			return EmitPhi(insn, state);
-
-		case spv::OpSelectionMerge:
-		case spv::OpLoopMerge:
-			return EmitResult::Continue;
-
-		case spv::OpBranchConditional:
-			return EmitBranchConditional(insn, state);
-
-		case spv::OpSwitch:
-			return EmitSwitch(insn, state);
-
-		case spv::OpUnreachable:
-			return EmitUnreachable(insn, state);
-
-		case spv::OpReturn:
-			return EmitReturn(insn, state);
-
-		case spv::OpFunctionCall:
-			return EmitFunctionCall(insn, state);
-
-		case spv::OpKill:
-			return EmitKill(insn, state);
-
 		case spv::OpImageSampleImplicitLod:
-			return EmitImageSampleImplicitLod(None, insn, state);
-
 		case spv::OpImageSampleExplicitLod:
-			return EmitImageSampleExplicitLod(None, insn, state);
-
 		case spv::OpImageSampleDrefImplicitLod:
-			return EmitImageSampleImplicitLod(Dref, insn, state);
-
 		case spv::OpImageSampleDrefExplicitLod:
-			return EmitImageSampleExplicitLod(Dref, insn, state);
-
 		case spv::OpImageSampleProjImplicitLod:
-			return EmitImageSampleImplicitLod(Proj, insn, state);
-
 		case spv::OpImageSampleProjExplicitLod:
-			return EmitImageSampleExplicitLod(Proj, insn, state);
-
 		case spv::OpImageSampleProjDrefImplicitLod:
-			return EmitImageSampleImplicitLod(ProjDref, insn, state);
-
 		case spv::OpImageSampleProjDrefExplicitLod:
-			return EmitImageSampleExplicitLod(ProjDref, insn, state);
-
 		case spv::OpImageGather:
-			return EmitImageGather(None, insn, state);
-
 		case spv::OpImageDrefGather:
-			return EmitImageGather(Dref, insn, state);
-
 		case spv::OpImageFetch:
-			return EmitImageFetch(insn, state);
-
 		case spv::OpImageQuerySizeLod:
-			return EmitImageQuerySizeLod(insn, state);
-
 		case spv::OpImageQuerySize:
-			return EmitImageQuerySize(insn, state);
-
 		case spv::OpImageQueryLod:
-			return EmitImageQueryLod(insn, state);
-
 		case spv::OpImageQueryLevels:
-			return EmitImageQueryLevels(insn, state);
-
 		case spv::OpImageQuerySamples:
-			return EmitImageQuerySamples(insn, state);
-
 		case spv::OpImageRead:
-			return EmitImageRead(insn, state);
-
-		case spv::OpImageWrite:
-			return EmitImageWrite(insn, state);
-
 		case spv::OpImageTexelPointer:
-			return EmitImageTexelPointer(insn, state);
-
-		case spv::OpSampledImage:
-		case spv::OpImage:
-			return EmitSampledImageCombineOrSplit(insn, state);
-
-		case spv::OpCopyObject:
-			return EmitCopyObject(insn, state);
-
-		case spv::OpCopyMemory:
-			return EmitCopyMemory(insn, state);
-
-		case spv::OpControlBarrier:
-			return EmitControlBarrier(insn, state);
-
-		case spv::OpMemoryBarrier:
-			return EmitMemoryBarrier(insn, state);
-
 		case spv::OpGroupNonUniformElect:
 		case spv::OpGroupNonUniformAll:
 		case spv::OpGroupNonUniformAny:
@@ -1900,527 +645,1782 @@
 		case spv::OpGroupNonUniformLogicalAnd:
 		case spv::OpGroupNonUniformLogicalOr:
 		case spv::OpGroupNonUniformLogicalXor:
-			return EmitGroupNonUniform(insn, state);
-
+		case spv::OpCopyObject:
 		case spv::OpArrayLength:
-			return EmitArrayLength(insn, state);
+			// Instructions that yield an intermediate value or a divergent pointer.
+			DefineResult(insn);
+			break;
 
-		default:
-			UNREACHABLE("%s", OpcodeName(opcode).c_str());
+		case spv::OpStore:
+		case spv::OpAtomicStore:
+		case spv::OpImageWrite:
+		case spv::OpCopyMemory:
+		case spv::OpMemoryBarrier:
+			// Nothing needs to be done during the analysis pass.
+			break;
+
+		case spv::OpControlBarrier:
+			modes.ContainsControlBarriers = true;
+			break;
+
+		case spv::OpExtension:
+		{
+			auto ext = insn.string(1);
+			// Part of core SPIR-V 1.3. Vulkan 1.1 implementations must also accept the pre-1.3
+			// extension per Appendix A, `Vulkan Environment for SPIR-V`.
+			if (!strcmp(ext, "SPV_KHR_storage_buffer_storage_class")) break;
+			if (!strcmp(ext, "SPV_KHR_shader_draw_parameters")) break;
+			if (!strcmp(ext, "SPV_KHR_16bit_storage")) break;
+			if (!strcmp(ext, "SPV_KHR_variable_pointers")) break;
+			if (!strcmp(ext, "SPV_KHR_device_group")) break;
+			if (!strcmp(ext, "SPV_KHR_multiview")) break;
+			UNSUPPORTED("SPIR-V Extension: %s", ext);
 			break;
 		}
 
-		return EmitResult::Continue;
+		default:
+			UNIMPLEMENTED("%s", OpcodeName(opcode).c_str());
+		}
 	}
 
-	SpirvShader::EmitResult SpirvShader::EmitAccessChain(InsnIterator insn, EmitState *state) const
+	ASSERT_MSG(entryPoint != 0, "Entry point '%s' not found", entryPointName);
+	for (auto &it : functions)
 	{
-		Type::ID typeId = insn.word(1);
-		Object::ID resultId = insn.word(2);
-		Object::ID baseId = insn.word(3);
-		uint32_t numIndexes = insn.wordCount() - 4;
-		const uint32_t *indexes = insn.wordPointer(4);
-		auto &type = getType(typeId);
-		ASSERT(type.sizeInComponents == 1);
-		ASSERT(getObject(resultId).kind == Object::Kind::Pointer);
-
-		if(type.storageClass == spv::StorageClassPushConstant ||
-		   type.storageClass == spv::StorageClassUniform ||
-		   type.storageClass == spv::StorageClassStorageBuffer)
-		{
-			auto ptr = WalkExplicitLayoutAccessChain(baseId, numIndexes, indexes, state);
-			state->createPointer(resultId, ptr);
-		}
-		else
-		{
-			auto ptr = WalkAccessChain(baseId, numIndexes, indexes, state);
-			state->createPointer(resultId, ptr);
-		}
-
-		return EmitResult::Continue;
+		it.second.AssignBlockFields();
 	}
+}
 
-	SpirvShader::EmitResult SpirvShader::EmitCompositeConstruct(InsnIterator insn, EmitState *state) const
+void SpirvShader::DeclareType(InsnIterator insn)
+{
+	Type::ID resultId = insn.word(1);
+
+	auto &type = types[resultId];
+	type.definition = insn;
+	type.sizeInComponents = ComputeTypeSize(insn);
+
+	// A structure is a builtin block if it has a builtin
+	// member. All members of such a structure are builtins.
+	switch (insn.opcode())
 	{
-		auto &type = getType(insn.word(1));
-		auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
+	case spv::OpTypeStruct:
+	{
+		auto d = memberDecorations.find(resultId);
+		if (d != memberDecorations.end())
+		{
+			for (auto &m : d->second)
+			{
+				if (m.HasBuiltIn)
+				{
+					type.isBuiltInBlock = true;
+					break;
+				}
+			}
+		}
+		break;
+	}
+	case spv::OpTypePointer:
+	{
+		Type::ID elementTypeId = insn.word(3);
+		type.element = elementTypeId;
+		type.isBuiltInBlock = getType(elementTypeId).isBuiltInBlock;
+		type.storageClass = static_cast<spv::StorageClass>(insn.word(2));
+		break;
+	}
+	case spv::OpTypeVector:
+	case spv::OpTypeMatrix:
+	case spv::OpTypeArray:
+	case spv::OpTypeRuntimeArray:
+	{
+		Type::ID elementTypeId = insn.word(2);
+		type.element = elementTypeId;
+		break;
+	}
+	default:
+		break;
+	}
+}
+
+SpirvShader::Object& SpirvShader::CreateConstant(InsnIterator insn)
+{
+	Type::ID typeId = insn.word(1);
+	Object::ID resultId = insn.word(2);
+	auto &object = defs[resultId];
+	auto &objectTy = getType(typeId);
+	object.type = typeId;
+	object.kind = Object::Kind::Constant;
+	object.definition = insn;
+	object.constantValue = std::unique_ptr<uint32_t[]>(new uint32_t[objectTy.sizeInComponents]);
+	return object;
+}
+
+void SpirvShader::ProcessInterfaceVariable(Object &object)
+{
+	auto &objectTy = getType(object.type);
+	ASSERT(objectTy.storageClass == spv::StorageClassInput || objectTy.storageClass == spv::StorageClassOutput);
+
+	ASSERT(objectTy.opcode() == spv::OpTypePointer);
+	auto pointeeTy = getType(objectTy.element);
+
+	auto &builtinInterface = (objectTy.storageClass == spv::StorageClassInput) ? inputBuiltins : outputBuiltins;
+	auto &userDefinedInterface = (objectTy.storageClass == spv::StorageClassInput) ? inputs : outputs;
+
+	ASSERT(object.opcode() == spv::OpVariable);
+	Object::ID resultId = object.definition.word(2);
+
+	if (objectTy.isBuiltInBlock)
+	{
+		// walk the builtin block, registering each of its members separately.
+		auto m = memberDecorations.find(objectTy.element);
+		ASSERT(m != memberDecorations.end());        // otherwise we wouldn't have marked the type chain
+		auto &structType = pointeeTy.definition;
 		auto offset = 0u;
-
-		for (auto i = 0u; i < insn.wordCount() - 3; i++)
+		auto word = 2u;
+		for (auto &member : m->second)
 		{
-			Object::ID srcObjectId = insn.word(3u + i);
-			auto & srcObject = getObject(srcObjectId);
-			auto & srcObjectTy = getType(srcObject.type);
-			GenericValue srcObjectAccess(this, state, srcObjectId);
+			auto &memberType = getType(structType.word(word));
 
-			for (auto j = 0u; j < srcObjectTy.sizeInComponents; j++)
+			if (member.HasBuiltIn)
 			{
-				dst.move(offset++, srcObjectAccess.Float(j));
+				builtinInterface[member.BuiltIn] = {resultId, offset, memberType.sizeInComponents};
 			}
-		}
 
-		return EmitResult::Continue;
+			offset += memberType.sizeInComponents;
+			++word;
+		}
+		return;
 	}
 
-	SpirvShader::EmitResult SpirvShader::EmitCompositeInsert(InsnIterator insn, EmitState *state) const
+	auto d = decorations.find(resultId);
+	if (d != decorations.end() && d->second.HasBuiltIn)
 	{
-		Type::ID resultTypeId = insn.word(1);
-		auto &type = getType(resultTypeId);
-		auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
-		auto &newPartObject = getObject(insn.word(3));
-		auto &newPartObjectTy = getType(newPartObject.type);
-		auto firstNewComponent = WalkLiteralAccessChain(resultTypeId, insn.wordCount() - 5, insn.wordPointer(5));
+		builtinInterface[d->second.BuiltIn] = {resultId, 0, pointeeTy.sizeInComponents};
+	}
+	else
+	{
+		object.kind = Object::Kind::InterfaceVariable;
+		VisitInterface(resultId,
+					   [&userDefinedInterface](Decorations const &d, AttribType type) {
+						   // Populate a single scalar slot in the interface from a collection of decorations and the intended component type.
+						   auto scalarSlot = (d.Location << 2) | d.Component;
+						   ASSERT(scalarSlot >= 0 &&
+								  scalarSlot < static_cast<int32_t>(userDefinedInterface.size()));
 
-		GenericValue srcObjectAccess(this, state, insn.word(4));
-		GenericValue newPartObjectAccess(this, state, insn.word(3));
+						   auto &slot = userDefinedInterface[scalarSlot];
+						   slot.Type = type;
+						   slot.Flat = d.Flat;
+						   slot.NoPerspective = d.NoPerspective;
+						   slot.Centroid = d.Centroid;
+					   });
+	}
+}
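
The `scalarSlot` packing in the visitor above gives each location four scalar component slots; a worked check with illustrative values:

    // location 3, component 1  ->  (3 << 2) | 1 == 13
    static_assert(((3 << 2) | 1) == 13, "four scalar slots per location");
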
 
-		// old components before
-		for (auto i = 0u; i < firstNewComponent; i++)
-		{
-			dst.move(i, srcObjectAccess.Float(i));
-		}
-		// new part
-		for (auto i = 0u; i < newPartObjectTy.sizeInComponents; i++)
-		{
-			dst.move(firstNewComponent + i, newPartObjectAccess.Float(i));
-		}
-		// old components after
-		for (auto i = firstNewComponent + newPartObjectTy.sizeInComponents; i < type.sizeInComponents; i++)
-		{
-			dst.move(i, srcObjectAccess.Float(i));
-		}
+void SpirvShader::ProcessExecutionMode(InsnIterator insn)
+{
+	auto mode = static_cast<spv::ExecutionMode>(insn.word(2));
+	switch (mode)
+	{
+	case spv::ExecutionModeEarlyFragmentTests:
+		modes.EarlyFragmentTests = true;
+		break;
+	case spv::ExecutionModeDepthReplacing:
+		modes.DepthReplacing = true;
+		break;
+	case spv::ExecutionModeDepthGreater:
+		modes.DepthGreater = true;
+		break;
+	case spv::ExecutionModeDepthLess:
+		modes.DepthLess = true;
+		break;
+	case spv::ExecutionModeDepthUnchanged:
+		modes.DepthUnchanged = true;
+		break;
+	case spv::ExecutionModeLocalSize:
+		modes.WorkgroupSizeX = insn.word(3);
+		modes.WorkgroupSizeY = insn.word(4);
+		modes.WorkgroupSizeZ = insn.word(5);
+		break;
+	case spv::ExecutionModeOriginUpperLeft:
+		// This is always the case for a Vulkan shader. Do nothing.
+		break;
+	default:
+		UNREACHABLE("Execution mode: %d", int(mode));
+	}
+}
 
-		return EmitResult::Continue;
+uint32_t SpirvShader::ComputeTypeSize(InsnIterator insn)
+{
+	// Types are always built from the bottom up (with the exception of forward pointers,
+	// which don't appear in Vulkan shaders). Therefore, we can always assume our component
+	// parts have already been described, and so their sizes determined.
+	switch (insn.opcode())
+	{
+	case spv::OpTypeVoid:
+	case spv::OpTypeSampler:
+	case spv::OpTypeImage:
+	case spv::OpTypeSampledImage:
+	case spv::OpTypeFunction:
+	case spv::OpTypeRuntimeArray:
+		// Objects that don't consume any space.
+		// Descriptor-backed objects currently only need to exist at compile time.
+		// Runtime arrays don't appear in places where their size would be interesting.
+		return 0;
+
+	case spv::OpTypeBool:
+	case spv::OpTypeFloat:
+	case spv::OpTypeInt:
+		// All the fundamental types are 1 component. If we ever add support for 8/16/64-bit components,
+		// we might need to change this, but only 32-bit components are required for Vulkan 1.1.
+		return 1;
+
+	case spv::OpTypeVector:
+	case spv::OpTypeMatrix:
+		// Vectors and matrices both consume element count * element size.
+		return getType(insn.word(2)).sizeInComponents * insn.word(3);
+
+	case spv::OpTypeArray:
+	{
+		// Element count * element size. Array sizes come from constant ids.
+		auto arraySize = GetConstScalarInt(insn.word(3));
+		return getType(insn.word(2)).sizeInComponents * arraySize;
 	}
 
-	SpirvShader::EmitResult SpirvShader::EmitCompositeExtract(InsnIterator insn, EmitState *state) const
+	case spv::OpTypeStruct:
 	{
-		auto &type = getType(insn.word(1));
-		auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
-		auto &compositeObject = getObject(insn.word(3));
-		Type::ID compositeTypeId = compositeObject.definition.word(1);
-		auto firstComponent = WalkLiteralAccessChain(compositeTypeId, insn.wordCount() - 4, insn.wordPointer(4));
-
-		GenericValue compositeObjectAccess(this, state, insn.word(3));
-		for (auto i = 0u; i < type.sizeInComponents; i++)
+		uint32_t size = 0;
+		for (uint32_t i = 2u; i < insn.wordCount(); i++)
 		{
-			dst.move(i, compositeObjectAccess.Float(firstComponent + i));
+			size += getType(insn.word(i)).sizeInComponents;
 		}
-
-		return EmitResult::Continue;
+		return size;
 	}
 
-	SpirvShader::EmitResult SpirvShader::EmitVectorShuffle(InsnIterator insn, EmitState *state) const
+	case spv::OpTypePointer:
+		// Runtime representation of a pointer is a per-lane index.
+		// Note: clients are expected to look through the pointer if they want the pointee size instead.
+		return 1;
+
+	default:
+		UNREACHABLE("%s", OpcodeName(insn.opcode()).c_str());
+		return 0;
+	}
+}
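
A worked example (hypothetical types, not from this change) of the counting rules above:

    // float                          -> 1 component
    // vec4  (OpTypeVector float 4)   -> 4 components
    // mat3  (OpTypeMatrix vec3 3)    -> 3 * 3 = 9 components
    // struct { vec4; mat3; float }   -> 4 + 9 + 1 = 14 components
    // array of 4 such structs        -> 14 * 4 = 56 components
    static_assert(4 + 3 * 3 + 1 == 14 && 14 * 4 == 56, "flattened sizes, in scalar components");
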
+
+int SpirvShader::VisitInterfaceInner(Type::ID id, Decorations d, const InterfaceVisitor &f) const
+{
+	// Recursively walks a variable definition and its type tree, taking into account
+	// any explicit Location or Component decorations encountered; where explicit
+	// Locations or Components are not specified, assigns them sequentially.
+	// Collected decorations are carried down toward the leaves and across
+	// siblings; the effect of decorations intentionally does not flow back up the tree.
+	//
+	// f is a functor to be called with the effective decoration set for every component.
+	//
+	// Returns the next available location, and calls f().
+
+	// This covers the rules in Vulkan 1.1 spec, 14.1.4 Location Assignment.
+
+	ApplyDecorationsForId(&d, id);
+
+	auto const &obj = getType(id);
+	switch(obj.opcode())
 	{
-		auto &type = getType(insn.word(1));
-		auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
-
-		// Note: number of components in result type, first half type, and second
-		// half type are all independent.
-		auto &firstHalfType = getType(getObject(insn.word(3)).type);
-
-		GenericValue firstHalfAccess(this, state, insn.word(3));
-		GenericValue secondHalfAccess(this, state, insn.word(4));
-
-		for (auto i = 0u; i < type.sizeInComponents; i++)
+	case spv::OpTypePointer:
+		return VisitInterfaceInner(obj.definition.word(3), d, f);
+	case spv::OpTypeMatrix:
+		for (auto i = 0u; i < obj.definition.word(3); i++, d.Location++)
 		{
-			auto selector = insn.word(5 + i);
-			if (selector == static_cast<uint32_t>(-1))
+			// consumes the same components of N consecutive locations
+			VisitInterfaceInner(obj.definition.word(2), d, f);
+		}
+		return d.Location;
+	case spv::OpTypeVector:
+		for (auto i = 0u; i < obj.definition.word(3); i++, d.Component++)
+		{
+			// consumes N consecutive components in the same location
+			VisitInterfaceInner(obj.definition.word(2), d, f);
+		}
+		return d.Location + 1;
+	case spv::OpTypeFloat:
+		f(d, ATTRIBTYPE_FLOAT);
+		return d.Location + 1;
+	case spv::OpTypeInt:
+		f(d, obj.definition.word(3) ? ATTRIBTYPE_INT : ATTRIBTYPE_UINT);
+		return d.Location + 1;
+	case spv::OpTypeBool:
+		f(d, ATTRIBTYPE_UINT);
+		return d.Location + 1;
+	case spv::OpTypeStruct:
+	{
+		// iterate over members, which may themselves have Location/Component decorations
+		for (auto i = 0u; i < obj.definition.wordCount() - 2; i++)
+		{
+			ApplyDecorationsForIdMember(&d, id, i);
+			d.Location = VisitInterfaceInner(obj.definition.word(i + 2), d, f);
+			d.Component = 0;    // Implicit locations always have component=0
+		}
+		return d.Location;
+	}
+	case spv::OpTypeArray:
+	{
+		auto arraySize = GetConstScalarInt(obj.definition.word(3));
+		for (auto i = 0u; i < arraySize; i++)
+		{
+			d.Location = VisitInterfaceInner(obj.definition.word(2), d, f);
+		}
+		return d.Location;
+	}
+	default:
+		// Intentionally partial; most opcodes do not participate in type hierarchies
+		return 0;
+	}
+}
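
An illustrative trace of the traversal above (hypothetical interface variable; the layout is assumed for the example), showing how locations and components advance:

    // layout(location = 2) in struct { vec3 a; mat2 b; float c; } v;
    //   a (vec3):  location 2, components 0..2      -> returns 3
    //   b (mat2):  one vec2 column at 3, one at 4   -> returns 5
    //   c (float): location 5, component 0          -> returns 6
    // The struct case then reports 6 as the next available location.
    static_assert(2 + 1 + 2 + 1 == 6, "next free location after the struct");
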
+
+void SpirvShader::VisitInterface(Object::ID id, const InterfaceVisitor &f) const
+{
+	// Walk a variable definition and call f for each component in it.
+	Decorations d{};
+	ApplyDecorationsForId(&d, id);
+
+	auto def = getObject(id).definition;
+	ASSERT(def.opcode() == spv::OpVariable);
+	VisitInterfaceInner(def.word(1), d, f);
+}
+
+void SpirvShader::ApplyDecorationsForAccessChain(Decorations *d, DescriptorDecorations *dd, Object::ID baseId, uint32_t numIndexes, uint32_t const *indexIds) const
+{
+	ApplyDecorationsForId(d, baseId);
+	auto &baseObject = getObject(baseId);
+	ApplyDecorationsForId(d, baseObject.type);
+	auto typeId = getType(baseObject.type).element;
+
+	for (auto i = 0u; i < numIndexes; i++)
+	{
+		ApplyDecorationsForId(d, typeId);
+		auto & type = getType(typeId);
+		switch (type.opcode())
+		{
+		case spv::OpTypeStruct:
+		{
+			int memberIndex = GetConstScalarInt(indexIds[i]);
+			ApplyDecorationsForIdMember(d, typeId, memberIndex);
+			typeId = type.definition.word(2u + memberIndex);
+			break;
+		}
+		case spv::OpTypeArray:
+		case spv::OpTypeRuntimeArray:
+			if (dd->InputAttachmentIndex >= 0)
 			{
-				// Undefined value. Until we decide to do real undef values, zero is as good
-				// a value as any
-				dst.move(i, RValue<SIMD::Float>(0.0f));
+				dd->InputAttachmentIndex += GetConstScalarInt(indexIds[i]);
 			}
-			else if (selector < firstHalfType.sizeInComponents)
+			typeId = type.element;
+			break;
+		case spv::OpTypeVector:
+			typeId = type.element;
+			break;
+		case spv::OpTypeMatrix:
+			typeId = type.element;
+			d->InsideMatrix = true;
+			break;
+		default:
+			UNREACHABLE("%s", OpcodeName(type.definition.opcode()).c_str());
+		}
+	}
+}
+
+SIMD::Pointer SpirvShader::WalkExplicitLayoutAccessChain(Object::ID baseId, uint32_t numIndexes, uint32_t const *indexIds, EmitState const *state) const
+{
+	// Produce an offset into external memory in sizeof(float) units
+
+	auto &baseObject = getObject(baseId);
+	Type::ID typeId = getType(baseObject.type).element;
+	Decorations d = {};
+	ApplyDecorationsForId(&d, baseObject.type);
+
+	uint32_t arrayIndex = 0;
+	if (baseObject.kind == Object::Kind::DescriptorSet)
+	{
+		auto type = getType(typeId).definition.opcode();
+		if (type == spv::OpTypeArray || type == spv::OpTypeRuntimeArray)
+		{
+			ASSERT(getObject(indexIds[0]).kind == Object::Kind::Constant);
+			arrayIndex = GetConstScalarInt(indexIds[0]);
+
+			numIndexes--;
+			indexIds++;
+			typeId = getType(typeId).element;
+		}
+	}
+
+	auto ptr = GetPointerToData(baseId, arrayIndex, state);
+
+	int constantOffset = 0;
+
+	for (auto i = 0u; i < numIndexes; i++)
+	{
+		auto & type = getType(typeId);
+		ApplyDecorationsForId(&d, typeId);
+
+		switch (type.definition.opcode())
+		{
+		case spv::OpTypeStruct:
+		{
+			int memberIndex = GetConstScalarInt(indexIds[i]);
+			ApplyDecorationsForIdMember(&d, typeId, memberIndex);
+			ASSERT(d.HasOffset);
+			constantOffset += d.Offset;
+			typeId = type.definition.word(2u + memberIndex);
+			break;
+		}
+		case spv::OpTypeArray:
+		case spv::OpTypeRuntimeArray:
+		{
+			// TODO: b/127950082: Check bounds.
+			ASSERT(d.HasArrayStride);
+			auto & obj = getObject(indexIds[i]);
+			if (obj.kind == Object::Kind::Constant)
 			{
-				dst.move(i, firstHalfAccess.Float(selector));
+				constantOffset += d.ArrayStride * GetConstScalarInt(indexIds[i]);
 			}
 			else
 			{
-				dst.move(i, secondHalfAccess.Float(selector - firstHalfType.sizeInComponents));
+				ptr += SIMD::Int(d.ArrayStride) * state->getIntermediate(indexIds[i]).Int(0);
 			}
+			typeId = type.element;
+			break;
 		}
-
-		return EmitResult::Continue;
-	}
-
-	SpirvShader::EmitResult SpirvShader::EmitVectorExtractDynamic(InsnIterator insn, EmitState *state) const
-	{
-		auto &type = getType(insn.word(1));
-		auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
-		auto &srcType = getType(getObject(insn.word(3)).type);
-
-		GenericValue src(this, state, insn.word(3));
-		GenericValue index(this, state, insn.word(4));
-
-		SIMD::UInt v = SIMD::UInt(0);
-
-		for (auto i = 0u; i < srcType.sizeInComponents; i++)
+		case spv::OpTypeMatrix:
 		{
-			v |= CmpEQ(index.UInt(0), SIMD::UInt(i)) & src.UInt(i);
-		}
-
-		dst.move(0, v);
-		return EmitResult::Continue;
-	}
-
-	SpirvShader::EmitResult SpirvShader::EmitVectorInsertDynamic(InsnIterator insn, EmitState *state) const
-	{
-		auto &type = getType(insn.word(1));
-		auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
-
-		GenericValue src(this, state, insn.word(3));
-		GenericValue component(this, state, insn.word(4));
-		GenericValue index(this, state, insn.word(5));
-
-		for (auto i = 0u; i < type.sizeInComponents; i++)
-		{
-			SIMD::UInt mask = CmpEQ(SIMD::UInt(i), index.UInt(0));
-			dst.move(i, (src.UInt(i) & ~mask) | (component.UInt(0) & mask));
-		}
-		return EmitResult::Continue;
-	}
-
-	SpirvShader::EmitResult SpirvShader::EmitSelect(InsnIterator insn, EmitState *state) const
-	{
-		auto &type = getType(insn.word(1));
-		auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
-		auto cond = GenericValue(this, state, insn.word(3));
-		auto condIsScalar = (getType(cond.type).sizeInComponents == 1);
-		auto lhs = GenericValue(this, state, insn.word(4));
-		auto rhs = GenericValue(this, state, insn.word(5));
-
-		for (auto i = 0u; i < type.sizeInComponents; i++)
-		{
-			auto sel = cond.Int(condIsScalar ? 0 : i);
-			dst.move(i, (sel & lhs.Int(i)) | (~sel & rhs.Int(i)));   // TODO: IfThenElse()
-		}
-
-		return EmitResult::Continue;
-	}
-
-	SpirvShader::EmitResult SpirvShader::EmitAny(InsnIterator insn, EmitState *state) const
-	{
-		auto &type = getType(insn.word(1));
-		ASSERT(type.sizeInComponents == 1);
-		auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
-		auto &srcType = getType(getObject(insn.word(3)).type);
-		auto src = GenericValue(this, state, insn.word(3));
-
-		SIMD::UInt result = src.UInt(0);
-
-		for (auto i = 1u; i < srcType.sizeInComponents; i++)
-		{
-			result |= src.UInt(i);
-		}
-
-		dst.move(0, result);
-		return EmitResult::Continue;
-	}
-
-	SpirvShader::EmitResult SpirvShader::EmitAll(InsnIterator insn, EmitState *state) const
-	{
-		auto &type = getType(insn.word(1));
-		ASSERT(type.sizeInComponents == 1);
-		auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
-		auto &srcType = getType(getObject(insn.word(3)).type);
-		auto src = GenericValue(this, state, insn.word(3));
-
-		SIMD::UInt result = src.UInt(0);
-
-		for (auto i = 1u; i < srcType.sizeInComponents; i++)
-		{
-			result &= src.UInt(i);
-		}
-
-		dst.move(0, result);
-		return EmitResult::Continue;
-	}
-
-	SpirvShader::EmitResult SpirvShader::EmitAtomicOp(InsnIterator insn, EmitState *state) const
-	{
-		auto &resultType = getType(Type::ID(insn.word(1)));
-		Object::ID resultId = insn.word(2);
-		Object::ID semanticsId = insn.word(5);
-		auto memorySemantics = static_cast<spv::MemorySemanticsMask>(getObject(semanticsId).constantValue[0]);
-		auto memoryOrder = MemoryOrder(memorySemantics);
-		// Where no value is provided (increment/decrement) use an implicit value of 1.
-		auto value = (insn.wordCount() == 7) ? GenericValue(this, state, insn.word(6)).UInt(0) : RValue<SIMD::UInt>(1);
-		auto &dst = state->createIntermediate(resultId, resultType.sizeInComponents);
-		auto ptr = state->getPointer(insn.word(3));
-		auto ptrOffsets = ptr.offsets();
-
-		SIMD::UInt x(0);
-		auto mask = state->activeLaneMask() & state->storesAndAtomicsMask();
-		for (int j = 0; j < SIMD::Width; j++)
-		{
-			If(Extract(mask, j) != 0)
+			// TODO: b/127950082: Check bounds.
+			ASSERT(d.HasMatrixStride);
+			d.InsideMatrix = true;
+			auto columnStride = (d.HasRowMajor && d.RowMajor) ? static_cast<int32_t>(sizeof(float)) : d.MatrixStride;
+			auto & obj = getObject(indexIds[i]);
+			if (obj.kind == Object::Kind::Constant)
 			{
-				auto offset = Extract(ptrOffsets, j);
-				auto laneValue = Extract(value, j);
-				UInt v;
-				switch (insn.opcode())
-				{
-				case spv::OpAtomicIAdd:
-				case spv::OpAtomicIIncrement:
-					v = AddAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, memoryOrder);
-					break;
-				case spv::OpAtomicISub:
-				case spv::OpAtomicIDecrement:
-					v = SubAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, memoryOrder);
-					break;
-				case spv::OpAtomicAnd:
-					v = AndAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, memoryOrder);
-					break;
-				case spv::OpAtomicOr:
-					v = OrAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, memoryOrder);
-					break;
-				case spv::OpAtomicXor:
-					v = XorAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, memoryOrder);
-					break;
-				case spv::OpAtomicSMin:
-					v = As<UInt>(MinAtomic(Pointer<Int>(&ptr.base[offset]), As<Int>(laneValue), memoryOrder));
-					break;
-				case spv::OpAtomicSMax:
-					v = As<UInt>(MaxAtomic(Pointer<Int>(&ptr.base[offset]), As<Int>(laneValue), memoryOrder));
-					break;
-				case spv::OpAtomicUMin:
-					v = MinAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, memoryOrder);
-					break;
-				case spv::OpAtomicUMax:
-					v = MaxAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, memoryOrder);
-					break;
-				case spv::OpAtomicExchange:
-					v = ExchangeAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, memoryOrder);
-					break;
-				default:
-					UNREACHABLE("%s", OpcodeName(insn.opcode()).c_str());
-					break;
-				}
-				x = Insert(x, v, j);
+				constantOffset += columnStride * GetConstScalarInt(indexIds[i]);
 			}
-		}
-
-		dst.move(0, x);
-		return EmitResult::Continue;
-	}
-
-	SpirvShader::EmitResult SpirvShader::EmitAtomicCompareExchange(InsnIterator insn, EmitState *state) const
-	{
-		// Separate from EmitAtomicOp due to different instruction encoding
-		auto &resultType = getType(Type::ID(insn.word(1)));
-		Object::ID resultId = insn.word(2);
-
-		auto memorySemanticsEqual = static_cast<spv::MemorySemanticsMask>(getObject(insn.word(5)).constantValue[0]);
-		auto memoryOrderEqual = MemoryOrder(memorySemanticsEqual);
-		auto memorySemanticsUnequal = static_cast<spv::MemorySemanticsMask>(getObject(insn.word(6)).constantValue[0]);
-		auto memoryOrderUnequal = MemoryOrder(memorySemanticsUnequal);
-
-		auto value = GenericValue(this, state, insn.word(7));
-		auto comparator = GenericValue(this, state, insn.word(8));
-		auto &dst = state->createIntermediate(resultId, resultType.sizeInComponents);
-		auto ptr = state->getPointer(insn.word(3));
-		auto ptrOffsets = ptr.offsets();
-
-		SIMD::UInt x(0);
-		auto mask = state->activeLaneMask() & state->storesAndAtomicsMask();
-		for (int j = 0; j < SIMD::Width; j++)
-		{
-			If(Extract(mask, j) != 0)
+			else
 			{
-				auto offset = Extract(ptrOffsets, j);
-				auto laneValue = Extract(value.UInt(0), j);
-				auto laneComparator = Extract(comparator.UInt(0), j);
-				UInt v = CompareExchangeAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, laneComparator, memoryOrderEqual, memoryOrderUnequal);
-				x = Insert(x, v, j);
+				ptr += SIMD::Int(columnStride) * state->getIntermediate(indexIds[i]).Int(0);
 			}
+			typeId = type.element;
+			break;
 		}
-
-		dst.move(0, x);
-		return EmitResult::Continue;
-	}
-
-	SpirvShader::EmitResult SpirvShader::EmitCopyObject(InsnIterator insn, EmitState *state) const
-	{
-		auto ty = getType(insn.word(1));
-		auto &dst = state->createIntermediate(insn.word(2), ty.sizeInComponents);
-		auto src = GenericValue(this, state, insn.word(3));
-		for (uint32_t i = 0; i < ty.sizeInComponents; i++)
+		case spv::OpTypeVector:
 		{
-			dst.move(i, src.Int(i));
-		}
-		return EmitResult::Continue;
-	}
-
-	SpirvShader::EmitResult SpirvShader::EmitArrayLength(InsnIterator insn, EmitState *state) const
-	{
-		auto resultTyId = Type::ID(insn.word(1));
-		auto resultId = Object::ID(insn.word(2));
-		auto structPtrId = Object::ID(insn.word(3));
-		auto arrayFieldIdx = insn.word(4);
-
-		auto &resultType = getType(resultTyId);
-		ASSERT(resultType.sizeInComponents == 1);
-		ASSERT(resultType.definition.opcode() == spv::OpTypeInt);
-
-		auto &structPtrTy = getType(getObject(structPtrId).type);
-		auto &structTy = getType(structPtrTy.element);
-		auto &arrayTy = getType(structTy.definition.word(2 + arrayFieldIdx));
-		ASSERT(arrayTy.definition.opcode() == spv::OpTypeRuntimeArray);
-		auto &arrayElTy = getType(arrayTy.element);
-
-		auto &result = state->createIntermediate(resultId, 1);
-		auto structBase = GetPointerToData(structPtrId, 0, state);
-
-		Decorations d = {};
-		ApplyDecorationsForIdMember(&d, structPtrTy.element, arrayFieldIdx);
-		ASSERT(d.HasOffset);
-
-		auto arrayBase = structBase + d.Offset;
-		auto arraySizeInBytes = SIMD::Int(arrayBase.limit()) - arrayBase.offsets();
-		auto arrayLength = arraySizeInBytes / SIMD::Int(arrayElTy.sizeInComponents * sizeof(float));
-
-		result.move(0, SIMD::Int(arrayLength));
-
-		return EmitResult::Continue;
-	}
-
-	uint32_t SpirvShader::GetConstScalarInt(Object::ID id) const
-	{
-		auto &scopeObj = getObject(id);
-		ASSERT(scopeObj.kind == Object::Kind::Constant);
-		ASSERT(getType(scopeObj.type).sizeInComponents == 1);
-		return scopeObj.constantValue[0];
-	}
-
-	void SpirvShader::emitEpilog(SpirvRoutine *routine) const
-	{
-		for (auto insn : *this)
-		{
-			switch (insn.opcode())
+			auto elemStride = (d.InsideMatrix && d.HasRowMajor && d.RowMajor) ? d.MatrixStride : static_cast<int32_t>(sizeof(float));
+			auto & obj = getObject(indexIds[i]);
+			if (obj.kind == Object::Kind::Constant)
 			{
-			case spv::OpVariable:
+				constantOffset += elemStride * GetConstScalarInt(indexIds[i]);
+			}
+			else
 			{
-				Object::ID resultId = insn.word(2);
-				auto &object = getObject(resultId);
-				auto &objectTy = getType(object.type);
-				if (object.kind == Object::Kind::InterfaceVariable && objectTy.storageClass == spv::StorageClassOutput)
-				{
-					auto &dst = routine->getVariable(resultId);
-					int offset = 0;
-					VisitInterface(resultId,
-								   [&](Decorations const &d, AttribType type) {
-									   auto scalarSlot = d.Location << 2 | d.Component;
-									   routine->outputs[scalarSlot] = dst[offset++];
-								   });
-				}
-				break;
+				ptr += SIMD::Int(elemStride) * state->getIntermediate(indexIds[i]).Int(0);
 			}
-			default:
-				break;
-			}
+			typeId = type.element;
+			break;
 		}
-
-		// Clear phis that are no longer used. This serves two purposes:
-		// (1) The phi rr::Variables are destructed, preventing pointless
-		//     materialization.
-		// (2) Frees memory that will never be used again.
-		routine->phis.clear();
-	}
-
-	VkShaderStageFlagBits SpirvShader::executionModelToStage(spv::ExecutionModel model)
-	{
-		switch (model)
-		{
-		case spv::ExecutionModelVertex:                 return VK_SHADER_STAGE_VERTEX_BIT;
-		// case spv::ExecutionModelTessellationControl:    return VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
-		// case spv::ExecutionModelTessellationEvaluation: return VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
-		// case spv::ExecutionModelGeometry:               return VK_SHADER_STAGE_GEOMETRY_BIT;
-		case spv::ExecutionModelFragment:               return VK_SHADER_STAGE_FRAGMENT_BIT;
-		case spv::ExecutionModelGLCompute:              return VK_SHADER_STAGE_COMPUTE_BIT;
-		// case spv::ExecutionModelKernel:                 return VkShaderStageFlagBits(0); // Not supported by vulkan.
-		// case spv::ExecutionModelTaskNV:                 return VK_SHADER_STAGE_TASK_BIT_NV;
-		// case spv::ExecutionModelMeshNV:                 return VK_SHADER_STAGE_MESH_BIT_NV;
-		// case spv::ExecutionModelRayGenerationNV:        return VK_SHADER_STAGE_RAYGEN_BIT_NV;
-		// case spv::ExecutionModelIntersectionNV:         return VK_SHADER_STAGE_INTERSECTION_BIT_NV;
-		// case spv::ExecutionModelAnyHitNV:               return VK_SHADER_STAGE_ANY_HIT_BIT_NV;
-		// case spv::ExecutionModelClosestHitNV:           return VK_SHADER_STAGE_CLOSEST_HIT_BIT_NV;
-		// case spv::ExecutionModelMissNV:                 return VK_SHADER_STAGE_MISS_BIT_NV;
-		// case spv::ExecutionModelCallableNV:             return VK_SHADER_STAGE_CALLABLE_BIT_NV;
 		default:
-			UNSUPPORTED("ExecutionModel: %d", int(model));
-			return VkShaderStageFlagBits(0);
+			UNREACHABLE("%s", OpcodeName(type.definition.opcode()).c_str());
 		}
 	}
 
-	SpirvShader::GenericValue::GenericValue(SpirvShader const *shader, EmitState const *state, SpirvShader::Object::ID objId) :
-			obj(shader->getObject(objId)),
-			intermediate(obj.kind == SpirvShader::Object::Kind::Intermediate ? &state->getIntermediate(objId) : nullptr),
-			type(obj.type) {}
+	ptr += constantOffset;
+	return ptr;
+}
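
A sketch of the arithmetic above for a hypothetical chain `ubo.lights[i].rot[1]`, with assumed decorations: member `lights` at Offset 16, ArrayStride 64, and a column-major MatrixStride of 16 (none of these values come from this change):

    // With a constant index i, everything folds into constantOffset:
    constexpr int explicitOffset(int i) { return 16 + 64 * i + 16 * 1; }  // Offset + i*ArrayStride + 1*MatrixStride
    static_assert(explicitOffset(2) == 160, "16 + 2*64 + 16");
    // With a dynamic index, the array term is instead added per lane:
    //   ptr += SIMD::Int(64) * state->getIntermediate(indexIds[i]).Int(0);
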
 
-	SpirvRoutine::SpirvRoutine(vk::PipelineLayout const *pipelineLayout) :
-		pipelineLayout(pipelineLayout)
+SIMD::Pointer SpirvShader::WalkAccessChain(Object::ID baseId, uint32_t numIndexes, uint32_t const *indexIds, EmitState const *state) const
+{
+	// TODO: avoid doing per-lane work in some cases if we can?
+	auto routine = state->routine;
+	auto &baseObject = getObject(baseId);
+	Type::ID typeId = getType(baseObject.type).element;
+
+	auto ptr = state->getPointer(baseId);
+
+	int constantOffset = 0;
+
+	for (auto i = 0u; i < numIndexes; i++)
 	{
+		auto & type = getType(typeId);
+		switch(type.opcode())
+		{
+		case spv::OpTypeStruct:
+		{
+			int memberIndex = GetConstScalarInt(indexIds[i]);
+			int offsetIntoStruct = 0;
+			for (auto j = 0; j < memberIndex; j++) {
+				auto memberType = type.definition.word(2u + j);
+				offsetIntoStruct += getType(memberType).sizeInComponents * sizeof(float);
+			}
+			constantOffset += offsetIntoStruct;
+			typeId = type.definition.word(2u + memberIndex);
+			break;
+		}
+
+		case spv::OpTypeVector:
+		case spv::OpTypeMatrix:
+		case spv::OpTypeArray:
+		case spv::OpTypeRuntimeArray:
+		{
+			// TODO: b/127950082: Check bounds.
+			if (getType(baseObject.type).storageClass == spv::StorageClassUniformConstant)
+			{
+				// indexing into an array of descriptors.
+				auto &obj = getObject(indexIds[i]);
+				if (obj.kind != Object::Kind::Constant)
+				{
+					UNSUPPORTED("SPIR-V SampledImageArrayDynamicIndexing Capability");
+				}
+
+				auto d = descriptorDecorations.at(baseId);
+				ASSERT(d.DescriptorSet >= 0);
+				ASSERT(d.Binding >= 0);
+				auto setLayout = routine->pipelineLayout->getDescriptorSetLayout(d.DescriptorSet);
+				auto stride = static_cast<uint32_t>(setLayout->getBindingStride(d.Binding));
+				ptr.base += stride * GetConstScalarInt(indexIds[i]);
+			}
+			else
+			{
+				auto stride = getType(type.element).sizeInComponents * static_cast<uint32_t>(sizeof(float));
+				auto & obj = getObject(indexIds[i]);
+				if (obj.kind == Object::Kind::Constant)
+				{
+					ptr += stride * GetConstScalarInt(indexIds[i]);
+				}
+				else
+				{
+					ptr += SIMD::Int(stride) * state->getIntermediate(indexIds[i]).Int(0);
+				}
+			}
+			typeId = type.element;
+			break;
+		}
+
+		default:
+			UNREACHABLE("%s", OpcodeName(type.opcode()).c_str());
+		}
 	}
 
-	void SpirvRoutine::setImmutableInputBuiltins(SpirvShader const *shader)
+	if (constantOffset != 0)
 	{
-		setInputBuiltin(shader, spv::BuiltInSubgroupLocalInvocationId, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			ASSERT(builtin.SizeInComponents == 1);
-			value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(0, 1, 2, 3));
-		});
+		ptr += constantOffset;
+	}
+	return ptr;
+}
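
For contrast with the explicit-layout walk, a sketch of the plain walk above (hypothetical `arr[i].y`, where `arr` holds vec4s): here strides derive from sizeInComponents rather than from decorations.

    // stride(vec4) = sizeInComponents(4) * sizeof(float); selecting .y then adds one more float.
    constexpr unsigned vec4Stride = 4 * sizeof(float);
    static_assert(vec4Stride == 16, "per-element stride for a vec4 array, in bytes");
    // A dynamic index i again becomes a per-lane term:
    //   ptr += SIMD::Int(vec4Stride) * state->getIntermediate(indexIds[i]).Int(0);
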
 
-		setInputBuiltin(shader, spv::BuiltInSubgroupEqMask, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			ASSERT(builtin.SizeInComponents == 4);
-			value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(1, 2, 4, 8));
-			value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
-			value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
-			value[builtin.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
-		});
+uint32_t SpirvShader::WalkLiteralAccessChain(Type::ID typeId, uint32_t numIndexes, uint32_t const *indexes) const
+{
+	uint32_t componentOffset = 0;
 
-		setInputBuiltin(shader, spv::BuiltInSubgroupGeMask, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	for (auto i = 0u; i < numIndexes; i++)
+	{
+		auto & type = getType(typeId);
+		switch(type.opcode())
 		{
-			ASSERT(builtin.SizeInComponents == 4);
-			value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(15, 14, 12, 8));
-			value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
-			value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
-			value[builtin.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
-		});
+		case spv::OpTypeStruct:
+		{
+			int memberIndex = indexes[i];
+			int offsetIntoStruct = 0;
+			for (auto j = 0; j < memberIndex; j++) {
+				auto memberType = type.definition.word(2u + j);
+				offsetIntoStruct += getType(memberType).sizeInComponents;
+			}
+			componentOffset += offsetIntoStruct;
+			typeId = type.definition.word(2u + memberIndex);
+			break;
+		}
 
-		setInputBuiltin(shader, spv::BuiltInSubgroupGtMask, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+		case spv::OpTypeVector:
+		case spv::OpTypeMatrix:
+		case spv::OpTypeArray:
 		{
-			ASSERT(builtin.SizeInComponents == 4);
-			value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(14, 12, 8, 0));
-			value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
-			value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
-			value[builtin.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
-		});
+			auto elementType = type.definition.word(2);
+			auto stride = getType(elementType).sizeInComponents;
+			componentOffset += stride * indexes[i];
+			typeId = elementType;
+			break;
+		}
 
-		setInputBuiltin(shader, spv::BuiltInSubgroupLeMask, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			ASSERT(builtin.SizeInComponents == 4);
-			value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(1, 3, 7, 15));
-			value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
-			value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
-			value[builtin.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
-		});
+		default:
+			UNREACHABLE("%s", OpcodeName(type.opcode()).c_str());
+		}
+	}
 
-		setInputBuiltin(shader, spv::BuiltInSubgroupLtMask, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			ASSERT(builtin.SizeInComponents == 4);
-			value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(0, 1, 3, 7));
-			value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
-			value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
-			value[builtin.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
-		});
+	return componentOffset;
+}
 
-		setInputBuiltin(shader, spv::BuiltInDeviceIndex, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			ASSERT(builtin.SizeInComponents == 1);
-			// Only a single physical device is supported.
-			value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
-		});
+void SpirvShader::Decorations::Apply(spv::Decoration decoration, uint32_t arg)
+{
+	switch (decoration)
+	{
+	case spv::DecorationLocation:
+		HasLocation = true;
+		Location = static_cast<int32_t>(arg);
+		break;
+	case spv::DecorationComponent:
+		HasComponent = true;
+		Component = arg;
+		break;
+	case spv::DecorationBuiltIn:
+		HasBuiltIn = true;
+		BuiltIn = static_cast<spv::BuiltIn>(arg);
+		break;
+	case spv::DecorationFlat:
+		Flat = true;
+		break;
+	case spv::DecorationNoPerspective:
+		NoPerspective = true;
+		break;
+	case spv::DecorationCentroid:
+		Centroid = true;
+		break;
+	case spv::DecorationBlock:
+		Block = true;
+		break;
+	case spv::DecorationBufferBlock:
+		BufferBlock = true;
+		break;
+	case spv::DecorationOffset:
+		HasOffset = true;
+		Offset = static_cast<int32_t>(arg);
+		break;
+	case spv::DecorationArrayStride:
+		HasArrayStride = true;
+		ArrayStride = static_cast<int32_t>(arg);
+		break;
+	case spv::DecorationMatrixStride:
+		HasMatrixStride = true;
+		MatrixStride = static_cast<int32_t>(arg);
+		break;
+	case spv::DecorationRelaxedPrecision:
+		RelaxedPrecision = true;
+		break;
+	case spv::DecorationRowMajor:
+		HasRowMajor = true;
+		RowMajor = true;
+		break;
+	case spv::DecorationColMajor:
+		HasRowMajor = true;
+		RowMajor = false;
+		break;
+	default:
+		// Intentionally partial; there are many decorations we just don't care about.
+		break;
 	}
 }
+
+void SpirvShader::Decorations::Apply(const sw::SpirvShader::Decorations &src)
+{
+	// Apply a decoration group to this set of decorations
+	if (src.HasBuiltIn)
+	{
+		HasBuiltIn = true;
+		BuiltIn = src.BuiltIn;
+	}
+
+	if (src.HasLocation)
+	{
+		HasLocation = true;
+		Location = src.Location;
+	}
+
+	if (src.HasComponent)
+	{
+		HasComponent = true;
+		Component = src.Component;
+	}
+
+	if (src.HasOffset)
+	{
+		HasOffset = true;
+		Offset = src.Offset;
+	}
+
+	if (src.HasArrayStride)
+	{
+		HasArrayStride = true;
+		ArrayStride = src.ArrayStride;
+	}
+
+	if (src.HasMatrixStride)
+	{
+		HasMatrixStride = true;
+		MatrixStride = src.MatrixStride;
+	}
+
+	if (src.HasRowMajor)
+	{
+		HasRowMajor = true;
+		RowMajor = src.RowMajor;
+	}
+
+	Flat |= src.Flat;
+	NoPerspective |= src.NoPerspective;
+	Centroid |= src.Centroid;
+	Block |= src.Block;
+	BufferBlock |= src.BufferBlock;
+	RelaxedPrecision |= src.RelaxedPrecision;
+	InsideMatrix |= src.InsideMatrix;
+}
+
+void SpirvShader::DescriptorDecorations::Apply(const sw::SpirvShader::DescriptorDecorations &src)
+{
+	if(src.DescriptorSet >= 0)
+	{
+		DescriptorSet = src.DescriptorSet;
+	}
+
+	if(src.Binding >= 0)
+	{
+		Binding = src.Binding;
+	}
+
+	if (src.InputAttachmentIndex >= 0)
+	{
+		InputAttachmentIndex = src.InputAttachmentIndex;
+	}
+}
+
+void SpirvShader::ApplyDecorationsForId(Decorations *d, TypeOrObjectID id) const
+{
+	auto it = decorations.find(id);
+	if (it != decorations.end())
+		d->Apply(it->second);
+}
+
+void SpirvShader::ApplyDecorationsForIdMember(Decorations *d, Type::ID id, uint32_t member) const
+{
+	auto it = memberDecorations.find(id);
+	if (it != memberDecorations.end() && member < it->second.size())
+	{
+		d->Apply(it->second[member]);
+	}
+}
+
+void SpirvShader::DefineResult(const InsnIterator &insn)
+{
+	Type::ID typeId = insn.word(1);
+	Object::ID resultId = insn.word(2);
+	auto &object = defs[resultId];
+	object.type = typeId;
+
+	switch (getType(typeId).opcode())
+	{
+	case spv::OpTypePointer:
+	case spv::OpTypeImage:
+	case spv::OpTypeSampledImage:
+	case spv::OpTypeSampler:
+		object.kind = Object::Kind::Pointer;
+		break;
+
+	default:
+		object.kind = Object::Kind::Intermediate;
+	}
+
+	object.definition = insn;
+}
+
+OutOfBoundsBehavior SpirvShader::EmitState::getOutOfBoundsBehavior(spv::StorageClass storageClass) const
+{
+	switch(storageClass)
+	{
+	case spv::StorageClassUniform:
+	case spv::StorageClassStorageBuffer:
+		// Buffer resource access. robustBufferAccess feature applies.
+		return robustBufferAccess ? OutOfBoundsBehavior::RobustBufferAccess
+		                          : OutOfBoundsBehavior::UndefinedBehavior;
+
+	case spv::StorageClassImage:
+		return OutOfBoundsBehavior::UndefinedValue;  // "The value returned by a read of an invalid texel is undefined"
+
+	case spv::StorageClassInput:
+		if(executionModel == spv::ExecutionModelVertex)
+		{
+			// Vertex attributes follow robustBufferAccess rules.
+			return robustBufferAccess ? OutOfBoundsBehavior::RobustBufferAccess
+			                          : OutOfBoundsBehavior::UndefinedBehavior;
+		}
+		// Fall through to default case.
+	default:
+		// TODO(b/137183137): Optimize if the pointer resulted from OpInBoundsAccessChain.
+		// TODO(b/131224163): Optimize cases statically known to be within bounds.
+		return OutOfBoundsBehavior::UndefinedValue;
+	}
+
+	return OutOfBoundsBehavior::Nullify;
+}
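
A hedged usage sketch (assumed call site; the real callers are the load and store emitters, which are outside this hunk):

    auto behavior = state->getOutOfBoundsBehavior(spv::StorageClassStorageBuffer);
    // With the robustBufferAccess feature enabled this yields RobustBufferAccess:
    // out-of-bounds stores are discarded and out-of-bounds loads return a value
    // from within the buffer (or zero), rather than trapping.
    (void)behavior;
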
+
+// emit-time
+
+void SpirvShader::emitProlog(SpirvRoutine *routine) const
+{
+	for (auto insn : *this)
+	{
+		switch (insn.opcode())
+		{
+		case spv::OpVariable:
+		{
+			Type::ID resultPointerTypeId = insn.word(1);
+			auto resultPointerType = getType(resultPointerTypeId);
+			auto pointeeType = getType(resultPointerType.element);
+
+			if(pointeeType.sizeInComponents > 0)  // TODO: what to do about zero-slot objects?
+			{
+				Object::ID resultId = insn.word(2);
+				routine->createVariable(resultId, pointeeType.sizeInComponents);
+			}
+			break;
+		}
+		case spv::OpPhi:
+		{
+			auto type = getType(insn.word(1));
+			Object::ID resultId = insn.word(2);
+			routine->phis.emplace(resultId, SpirvRoutine::Variable(type.sizeInComponents));
+			break;
+		}
+
+		case spv::OpImageDrefGather:
+		case spv::OpImageFetch:
+		case spv::OpImageGather:
+		case spv::OpImageQueryLod:
+		case spv::OpImageSampleDrefExplicitLod:
+		case spv::OpImageSampleDrefImplicitLod:
+		case spv::OpImageSampleExplicitLod:
+		case spv::OpImageSampleImplicitLod:
+		case spv::OpImageSampleProjDrefExplicitLod:
+		case spv::OpImageSampleProjDrefImplicitLod:
+		case spv::OpImageSampleProjExplicitLod:
+		case spv::OpImageSampleProjImplicitLod:
+		{
+			Object::ID resultId = insn.word(2);
+			routine->samplerCache.emplace(resultId, SpirvRoutine::SamplerCache{});
+			break;
+		}
+
+		default:
+			// Nothing else produces interface variables, so all other instructions
+			// can be safely ignored.
+			break;
+		}
+	}
+}
+
+void SpirvShader::emit(SpirvRoutine *routine, RValue<SIMD::Int> const &activeLaneMask, RValue<SIMD::Int> const &storesAndAtomicsMask, const vk::DescriptorSet::Bindings &descriptorSets) const
+{
+	EmitState state(routine, entryPoint, activeLaneMask, storesAndAtomicsMask, descriptorSets, robustBufferAccess, executionModel);
+
+	// Emit everything up to the first label
+	// TODO: Separate out dispatch of block from non-block instructions?
+	for (auto insn : *this)
+	{
+		if (insn.opcode() == spv::OpLabel)
+		{
+			break;
+		}
+		EmitInstruction(insn, &state);
+	}
+
+	// Emit all the blocks starting from entryPoint.
+	EmitBlocks(getFunction(entryPoint).entry, &state);
+}
+
+void SpirvShader::EmitInstructions(InsnIterator begin, InsnIterator end, EmitState *state) const
+{
+	for (auto insn = begin; insn != end; insn++)
+	{
+		auto res = EmitInstruction(insn, state);
+		switch (res)
+		{
+		case EmitResult::Continue:
+			continue;
+		case EmitResult::Terminator:
+			break;
+		default:
+			UNREACHABLE("Unexpected EmitResult %d", int(res));
+			break;
+		}
+	}
+}
+
+SpirvShader::EmitResult SpirvShader::EmitInstruction(InsnIterator insn, EmitState *state) const
+{
+	auto opcode = insn.opcode();
+
+	switch (opcode)
+	{
+	case spv::OpTypeVoid:
+	case spv::OpTypeInt:
+	case spv::OpTypeFloat:
+	case spv::OpTypeBool:
+	case spv::OpTypeVector:
+	case spv::OpTypeArray:
+	case spv::OpTypeRuntimeArray:
+	case spv::OpTypeMatrix:
+	case spv::OpTypeStruct:
+	case spv::OpTypePointer:
+	case spv::OpTypeFunction:
+	case spv::OpTypeImage:
+	case spv::OpTypeSampledImage:
+	case spv::OpTypeSampler:
+	case spv::OpExecutionMode:
+	case spv::OpMemoryModel:
+	case spv::OpFunction:
+	case spv::OpFunctionEnd:
+	case spv::OpConstant:
+	case spv::OpConstantNull:
+	case spv::OpConstantTrue:
+	case spv::OpConstantFalse:
+	case spv::OpConstantComposite:
+	case spv::OpSpecConstant:
+	case spv::OpSpecConstantTrue:
+	case spv::OpSpecConstantFalse:
+	case spv::OpSpecConstantComposite:
+	case spv::OpSpecConstantOp:
+	case spv::OpUndef:
+	case spv::OpExtension:
+	case spv::OpCapability:
+	case spv::OpEntryPoint:
+	case spv::OpExtInstImport:
+	case spv::OpDecorate:
+	case spv::OpMemberDecorate:
+	case spv::OpGroupDecorate:
+	case spv::OpGroupMemberDecorate:
+	case spv::OpDecorationGroup:
+	case spv::OpName:
+	case spv::OpMemberName:
+	case spv::OpSource:
+	case spv::OpSourceContinued:
+	case spv::OpSourceExtension:
+	case spv::OpLine:
+	case spv::OpNoLine:
+	case spv::OpModuleProcessed:
+	case spv::OpString:
+		// Nothing to do at emit time. These are either fully handled at analysis time,
+		// or don't require any work at all.
+		return EmitResult::Continue;
+
+	case spv::OpLabel:
+		return EmitResult::Continue;
+
+	case spv::OpVariable:
+		return EmitVariable(insn, state);
+
+	case spv::OpLoad:
+	case spv::OpAtomicLoad:
+		return EmitLoad(insn, state);
+
+	case spv::OpStore:
+	case spv::OpAtomicStore:
+		return EmitStore(insn, state);
+
+	case spv::OpAtomicIAdd:
+	case spv::OpAtomicISub:
+	case spv::OpAtomicSMin:
+	case spv::OpAtomicSMax:
+	case spv::OpAtomicUMin:
+	case spv::OpAtomicUMax:
+	case spv::OpAtomicAnd:
+	case spv::OpAtomicOr:
+	case spv::OpAtomicXor:
+	case spv::OpAtomicIIncrement:
+	case spv::OpAtomicIDecrement:
+	case spv::OpAtomicExchange:
+		return EmitAtomicOp(insn, state);
+
+	case spv::OpAtomicCompareExchange:
+		return EmitAtomicCompareExchange(insn, state);
+
+	case spv::OpAccessChain:
+	case spv::OpInBoundsAccessChain:
+		return EmitAccessChain(insn, state);
+
+	case spv::OpCompositeConstruct:
+		return EmitCompositeConstruct(insn, state);
+
+	case spv::OpCompositeInsert:
+		return EmitCompositeInsert(insn, state);
+
+	case spv::OpCompositeExtract:
+		return EmitCompositeExtract(insn, state);
+
+	case spv::OpVectorShuffle:
+		return EmitVectorShuffle(insn, state);
+
+	case spv::OpVectorExtractDynamic:
+		return EmitVectorExtractDynamic(insn, state);
+
+	case spv::OpVectorInsertDynamic:
+		return EmitVectorInsertDynamic(insn, state);
+
+	case spv::OpVectorTimesScalar:
+	case spv::OpMatrixTimesScalar:
+		return EmitVectorTimesScalar(insn, state);
+
+	case spv::OpMatrixTimesVector:
+		return EmitMatrixTimesVector(insn, state);
+
+	case spv::OpVectorTimesMatrix:
+		return EmitVectorTimesMatrix(insn, state);
+
+	case spv::OpMatrixTimesMatrix:
+		return EmitMatrixTimesMatrix(insn, state);
+
+	case spv::OpOuterProduct:
+		return EmitOuterProduct(insn, state);
+
+	case spv::OpTranspose:
+		return EmitTranspose(insn, state);
+
+	case spv::OpNot:
+	case spv::OpBitFieldInsert:
+	case spv::OpBitFieldSExtract:
+	case spv::OpBitFieldUExtract:
+	case spv::OpBitReverse:
+	case spv::OpBitCount:
+	case spv::OpSNegate:
+	case spv::OpFNegate:
+	case spv::OpLogicalNot:
+	case spv::OpConvertFToU:
+	case spv::OpConvertFToS:
+	case spv::OpConvertSToF:
+	case spv::OpConvertUToF:
+	case spv::OpBitcast:
+	case spv::OpIsInf:
+	case spv::OpIsNan:
+	case spv::OpDPdx:
+	case spv::OpDPdxCoarse:
+	case spv::OpDPdy:
+	case spv::OpDPdyCoarse:
+	case spv::OpFwidth:
+	case spv::OpFwidthCoarse:
+	case spv::OpDPdxFine:
+	case spv::OpDPdyFine:
+	case spv::OpFwidthFine:
+	case spv::OpQuantizeToF16:
+		return EmitUnaryOp(insn, state);
+
+	case spv::OpIAdd:
+	case spv::OpISub:
+	case spv::OpIMul:
+	case spv::OpSDiv:
+	case spv::OpUDiv:
+	case spv::OpFAdd:
+	case spv::OpFSub:
+	case spv::OpFMul:
+	case spv::OpFDiv:
+	case spv::OpFMod:
+	case spv::OpFRem:
+	case spv::OpFOrdEqual:
+	case spv::OpFUnordEqual:
+	case spv::OpFOrdNotEqual:
+	case spv::OpFUnordNotEqual:
+	case spv::OpFOrdLessThan:
+	case spv::OpFUnordLessThan:
+	case spv::OpFOrdGreaterThan:
+	case spv::OpFUnordGreaterThan:
+	case spv::OpFOrdLessThanEqual:
+	case spv::OpFUnordLessThanEqual:
+	case spv::OpFOrdGreaterThanEqual:
+	case spv::OpFUnordGreaterThanEqual:
+	case spv::OpSMod:
+	case spv::OpSRem:
+	case spv::OpUMod:
+	case spv::OpIEqual:
+	case spv::OpINotEqual:
+	case spv::OpUGreaterThan:
+	case spv::OpSGreaterThan:
+	case spv::OpUGreaterThanEqual:
+	case spv::OpSGreaterThanEqual:
+	case spv::OpULessThan:
+	case spv::OpSLessThan:
+	case spv::OpULessThanEqual:
+	case spv::OpSLessThanEqual:
+	case spv::OpShiftRightLogical:
+	case spv::OpShiftRightArithmetic:
+	case spv::OpShiftLeftLogical:
+	case spv::OpBitwiseOr:
+	case spv::OpBitwiseXor:
+	case spv::OpBitwiseAnd:
+	case spv::OpLogicalOr:
+	case spv::OpLogicalAnd:
+	case spv::OpLogicalEqual:
+	case spv::OpLogicalNotEqual:
+	case spv::OpUMulExtended:
+	case spv::OpSMulExtended:
+	case spv::OpIAddCarry:
+	case spv::OpISubBorrow:
+		return EmitBinaryOp(insn, state);
+
+	case spv::OpDot:
+		return EmitDot(insn, state);
+
+	case spv::OpSelect:
+		return EmitSelect(insn, state);
+
+	case spv::OpExtInst:
+		return EmitExtendedInstruction(insn, state);
+
+	case spv::OpAny:
+		return EmitAny(insn, state);
+
+	case spv::OpAll:
+		return EmitAll(insn, state);
+
+	case spv::OpBranch:
+		return EmitBranch(insn, state);
+
+	case spv::OpPhi:
+		return EmitPhi(insn, state);
+
+	case spv::OpSelectionMerge:
+	case spv::OpLoopMerge:
+		return EmitResult::Continue;
+
+	case spv::OpBranchConditional:
+		return EmitBranchConditional(insn, state);
+
+	case spv::OpSwitch:
+		return EmitSwitch(insn, state);
+
+	case spv::OpUnreachable:
+		return EmitUnreachable(insn, state);
+
+	case spv::OpReturn:
+		return EmitReturn(insn, state);
+
+	case spv::OpFunctionCall:
+		return EmitFunctionCall(insn, state);
+
+	case spv::OpKill:
+		return EmitKill(insn, state);
+
+	case spv::OpImageSampleImplicitLod:
+		return EmitImageSampleImplicitLod(None, insn, state);
+
+	case spv::OpImageSampleExplicitLod:
+		return EmitImageSampleExplicitLod(None, insn, state);
+
+	case spv::OpImageSampleDrefImplicitLod:
+		return EmitImageSampleImplicitLod(Dref, insn, state);
+
+	case spv::OpImageSampleDrefExplicitLod:
+		return EmitImageSampleExplicitLod(Dref, insn, state);
+
+	case spv::OpImageSampleProjImplicitLod:
+		return EmitImageSampleImplicitLod(Proj, insn, state);
+
+	case spv::OpImageSampleProjExplicitLod:
+		return EmitImageSampleExplicitLod(Proj, insn, state);
+
+	case spv::OpImageSampleProjDrefImplicitLod:
+		return EmitImageSampleImplicitLod(ProjDref, insn, state);
+
+	case spv::OpImageSampleProjDrefExplicitLod:
+		return EmitImageSampleExplicitLod(ProjDref, insn, state);
+
+	case spv::OpImageGather:
+		return EmitImageGather(None, insn, state);
+
+	case spv::OpImageDrefGather:
+		return EmitImageGather(Dref, insn, state);
+
+	case spv::OpImageFetch:
+		return EmitImageFetch(insn, state);
+
+	case spv::OpImageQuerySizeLod:
+		return EmitImageQuerySizeLod(insn, state);
+
+	case spv::OpImageQuerySize:
+		return EmitImageQuerySize(insn, state);
+
+	case spv::OpImageQueryLod:
+		return EmitImageQueryLod(insn, state);
+
+	case spv::OpImageQueryLevels:
+		return EmitImageQueryLevels(insn, state);
+
+	case spv::OpImageQuerySamples:
+		return EmitImageQuerySamples(insn, state);
+
+	case spv::OpImageRead:
+		return EmitImageRead(insn, state);
+
+	case spv::OpImageWrite:
+		return EmitImageWrite(insn, state);
+
+	case spv::OpImageTexelPointer:
+		return EmitImageTexelPointer(insn, state);
+
+	case spv::OpSampledImage:
+	case spv::OpImage:
+		return EmitSampledImageCombineOrSplit(insn, state);
+
+	case spv::OpCopyObject:
+		return EmitCopyObject(insn, state);
+
+	case spv::OpCopyMemory:
+		return EmitCopyMemory(insn, state);
+
+	case spv::OpControlBarrier:
+		return EmitControlBarrier(insn, state);
+
+	case spv::OpMemoryBarrier:
+		return EmitMemoryBarrier(insn, state);
+
+	case spv::OpGroupNonUniformElect:
+	case spv::OpGroupNonUniformAll:
+	case spv::OpGroupNonUniformAny:
+	case spv::OpGroupNonUniformAllEqual:
+	case spv::OpGroupNonUniformBroadcast:
+	case spv::OpGroupNonUniformBroadcastFirst:
+	case spv::OpGroupNonUniformBallot:
+	case spv::OpGroupNonUniformInverseBallot:
+	case spv::OpGroupNonUniformBallotBitExtract:
+	case spv::OpGroupNonUniformBallotBitCount:
+	case spv::OpGroupNonUniformBallotFindLSB:
+	case spv::OpGroupNonUniformBallotFindMSB:
+	case spv::OpGroupNonUniformShuffle:
+	case spv::OpGroupNonUniformShuffleXor:
+	case spv::OpGroupNonUniformShuffleUp:
+	case spv::OpGroupNonUniformShuffleDown:
+	case spv::OpGroupNonUniformIAdd:
+	case spv::OpGroupNonUniformFAdd:
+	case spv::OpGroupNonUniformIMul:
+	case spv::OpGroupNonUniformFMul:
+	case spv::OpGroupNonUniformSMin:
+	case spv::OpGroupNonUniformUMin:
+	case spv::OpGroupNonUniformFMin:
+	case spv::OpGroupNonUniformSMax:
+	case spv::OpGroupNonUniformUMax:
+	case spv::OpGroupNonUniformFMax:
+	case spv::OpGroupNonUniformBitwiseAnd:
+	case spv::OpGroupNonUniformBitwiseOr:
+	case spv::OpGroupNonUniformBitwiseXor:
+	case spv::OpGroupNonUniformLogicalAnd:
+	case spv::OpGroupNonUniformLogicalOr:
+	case spv::OpGroupNonUniformLogicalXor:
+		return EmitGroupNonUniform(insn, state);
+
+	case spv::OpArrayLength:
+		return EmitArrayLength(insn, state);
+
+	default:
+		UNREACHABLE("%s", OpcodeName(opcode).c_str());
+		break;
+	}
+
+	return EmitResult::Continue;
+}
+
+SpirvShader::EmitResult SpirvShader::EmitAccessChain(InsnIterator insn, EmitState *state) const
+{
+	Type::ID typeId = insn.word(1);
+	Object::ID resultId = insn.word(2);
+	Object::ID baseId = insn.word(3);
+	uint32_t numIndexes = insn.wordCount() - 4;
+	const uint32_t *indexes = insn.wordPointer(4);
+	auto &type = getType(typeId);
+	ASSERT(type.sizeInComponents == 1);
+	ASSERT(getObject(resultId).kind == Object::Kind::Pointer);
+
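+	// Buffer-backed storage classes use explicit layout decorations (Offset,
+	// ArrayStride, MatrixStride), so their access chains are walked using those
+	// decorations instead of the packed in-register layout.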
+	if(type.storageClass == spv::StorageClassPushConstant ||
+	   type.storageClass == spv::StorageClassUniform ||
+	   type.storageClass == spv::StorageClassStorageBuffer)
+	{
+		auto ptr = WalkExplicitLayoutAccessChain(baseId, numIndexes, indexes, state);
+		state->createPointer(resultId, ptr);
+	}
+	else
+	{
+		auto ptr = WalkAccessChain(baseId, numIndexes, indexes, state);
+		state->createPointer(resultId, ptr);
+	}
+
+	return EmitResult::Continue;
+}
+
+SpirvShader::EmitResult SpirvShader::EmitCompositeConstruct(InsnIterator insn, EmitState *state) const
+{
+	auto &type = getType(insn.word(1));
+	auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
+	auto offset = 0u;
+
+	for (auto i = 0u; i < insn.wordCount() - 3; i++)
+	{
+		Object::ID srcObjectId = insn.word(3u + i);
+		auto & srcObject = getObject(srcObjectId);
+		auto & srcObjectTy = getType(srcObject.type);
+		GenericValue srcObjectAccess(this, state, srcObjectId);
+
+		for (auto j = 0u; j < srcObjectTy.sizeInComponents; j++)
+		{
+			dst.move(offset++, srcObjectAccess.Float(j));
+		}
+	}
+
+	return EmitResult::Continue;
+}
+
+SpirvShader::EmitResult SpirvShader::EmitCompositeInsert(InsnIterator insn, EmitState *state) const
+{
+	Type::ID resultTypeId = insn.word(1);
+	auto &type = getType(resultTypeId);
+	auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
+	auto &newPartObject = getObject(insn.word(3));
+	auto &newPartObjectTy = getType(newPartObject.type);
+	auto firstNewComponent = WalkLiteralAccessChain(resultTypeId, insn.wordCount() - 5, insn.wordPointer(5));
+
+	GenericValue srcObjectAccess(this, state, insn.word(4));
+	GenericValue newPartObjectAccess(this, state, insn.word(3));
+
+	// old components before
+	for (auto i = 0u; i < firstNewComponent; i++)
+	{
+		dst.move(i, srcObjectAccess.Float(i));
+	}
+	// new part
+	for (auto i = 0u; i < newPartObjectTy.sizeInComponents; i++)
+	{
+		dst.move(firstNewComponent + i, newPartObjectAccess.Float(i));
+	}
+	// old components after
+	for (auto i = firstNewComponent + newPartObjectTy.sizeInComponents; i < type.sizeInComponents; i++)
+	{
+		dst.move(i, srcObjectAccess.Float(i));
+	}
+
+	return EmitResult::Continue;
+}
+
+SpirvShader::EmitResult SpirvShader::EmitCompositeExtract(InsnIterator insn, EmitState *state) const
+{
+	auto &type = getType(insn.word(1));
+	auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
+	auto &compositeObject = getObject(insn.word(3));
+	Type::ID compositeTypeId = compositeObject.definition.word(1);
+	auto firstComponent = WalkLiteralAccessChain(compositeTypeId, insn.wordCount() - 4, insn.wordPointer(4));
+
+	GenericValue compositeObjectAccess(this, state, insn.word(3));
+	for (auto i = 0u; i < type.sizeInComponents; i++)
+	{
+		dst.move(i, compositeObjectAccess.Float(firstComponent + i));
+	}
+
+	return EmitResult::Continue;
+}
+
+SpirvShader::EmitResult SpirvShader::EmitVectorShuffle(InsnIterator insn, EmitState *state) const
+{
+	auto &type = getType(insn.word(1));
+	auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
+
+	// Note: number of components in result type, first half type, and second
+	// half type are all independent.
+	auto &firstHalfType = getType(getObject(insn.word(3)).type);
+
+	GenericValue firstHalfAccess(this, state, insn.word(3));
+	GenericValue secondHalfAccess(this, state, insn.word(4));
+
+	for (auto i = 0u; i < type.sizeInComponents; i++)
+	{
+		auto selector = insn.word(5 + i);
+		if (selector == static_cast<uint32_t>(-1))
+		{
+			// Undefined value. Until we decide to do real undef values, zero is as good
+			// a value as any.
+			dst.move(i, RValue<SIMD::Float>(0.0f));
+		}
+		else if (selector < firstHalfType.sizeInComponents)
+		{
+			dst.move(i, firstHalfAccess.Float(selector));
+		}
+		else
+		{
+			dst.move(i, secondHalfAccess.Float(selector - firstHalfType.sizeInComponents));
+		}
+	}
+
+	return EmitResult::Continue;
+}
+
+SpirvShader::EmitResult SpirvShader::EmitVectorExtractDynamic(InsnIterator insn, EmitState *state) const
+{
+	auto &type = getType(insn.word(1));
+	auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
+	auto &srcType = getType(getObject(insn.word(3)).type);
+
+	GenericValue src(this, state, insn.word(3));
+	GenericValue index(this, state, insn.word(4));
+
+	SIMD::UInt v = SIMD::UInt(0);
+
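+	// Branchless dynamic extract: for each component i the comparison yields an
+	// all-ones mask only in lanes where index == i, so exactly one component is
+	// ORed into v per lane.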
+	for (auto i = 0u; i < srcType.sizeInComponents; i++)
+	{
+		v |= CmpEQ(index.UInt(0), SIMD::UInt(i)) & src.UInt(i);
+	}
+
+	dst.move(0, v);
+	return EmitResult::Continue;
+}
+
+SpirvShader::EmitResult SpirvShader::EmitVectorInsertDynamic(InsnIterator insn, EmitState *state) const
+{
+	auto &type = getType(insn.word(1));
+	auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
+
+	GenericValue src(this, state, insn.word(3));
+	GenericValue component(this, state, insn.word(4));
+	GenericValue index(this, state, insn.word(5));
+
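+	// Per-component bitwise select: lanes where i == index receive the new
+	// component value; all other lanes keep the original source component.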
+	for (auto i = 0u; i < type.sizeInComponents; i++)
+	{
+		SIMD::UInt mask = CmpEQ(SIMD::UInt(i), index.UInt(0));
+		dst.move(i, (src.UInt(i) & ~mask) | (component.UInt(0) & mask));
+	}
+	return EmitResult::Continue;
+}
+
+SpirvShader::EmitResult SpirvShader::EmitSelect(InsnIterator insn, EmitState *state) const
+{
+	auto &type = getType(insn.word(1));
+	auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
+	auto cond = GenericValue(this, state, insn.word(3));
+	auto condIsScalar = (getType(cond.type).sizeInComponents == 1);
+	auto lhs = GenericValue(this, state, insn.word(4));
+	auto rhs = GenericValue(this, state, insn.word(5));
+
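+	// Bitwise select. This relies on booleans being represented as full per-lane
+	// masks (all ones for true, all zeros for false).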
+	for (auto i = 0u; i < type.sizeInComponents; i++)
+	{
+		auto sel = cond.Int(condIsScalar ? 0 : i);
+		dst.move(i, (sel & lhs.Int(i)) | (~sel & rhs.Int(i)));   // TODO: IfThenElse()
+	}
+
+	return EmitResult::Continue;
+}
+
+SpirvShader::EmitResult SpirvShader::EmitAny(InsnIterator insn, EmitState *state) const
+{
+	auto &type = getType(insn.word(1));
+	ASSERT(type.sizeInComponents == 1);
+	auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
+	auto &srcType = getType(getObject(insn.word(3)).type);
+	auto src = GenericValue(this, state, insn.word(3));
+
+	SIMD::UInt result = src.UInt(0);
+
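+	// OR-reduce across the vector components: true in each lane where at least
+	// one component is true.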
+	for (auto i = 1u; i < srcType.sizeInComponents; i++)
+	{
+		result |= src.UInt(i);
+	}
+
+	dst.move(0, result);
+	return EmitResult::Continue;
+}
+
+SpirvShader::EmitResult SpirvShader::EmitAll(InsnIterator insn, EmitState *state) const
+{
+	auto &type = getType(insn.word(1));
+	ASSERT(type.sizeInComponents == 1);
+	auto &dst = state->createIntermediate(insn.word(2), type.sizeInComponents);
+	auto &srcType = getType(getObject(insn.word(3)).type);
+	auto src = GenericValue(this, state, insn.word(3));
+
+	SIMD::UInt result = src.UInt(0);
+
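+	// AND-reduce across the vector components: true only in lanes where every
+	// component is true.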
+	for (auto i = 1u; i < srcType.sizeInComponents; i++)
+	{
+		result &= src.UInt(i);
+	}
+
+	dst.move(0, result);
+	return EmitResult::Continue;
+}
+
+SpirvShader::EmitResult SpirvShader::EmitAtomicOp(InsnIterator insn, EmitState *state) const
+{
+	auto &resultType = getType(Type::ID(insn.word(1)));
+	Object::ID resultId = insn.word(2);
+	Object::ID semanticsId = insn.word(5);
+	auto memorySemantics = static_cast<spv::MemorySemanticsMask>(getObject(semanticsId).constantValue[0]);
+	auto memoryOrder = MemoryOrder(memorySemantics);
+	// Where no value is provided (increment/decrement) use an implicit value of 1.
+	auto value = (insn.wordCount() == 7) ? GenericValue(this, state, insn.word(6)).UInt(0) : RValue<SIMD::UInt>(1);
+	auto &dst = state->createIntermediate(resultId, resultType.sizeInComponents);
+	auto ptr = state->getPointer(insn.word(3));
+	auto ptrOffsets = ptr.offsets();
+
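+	// Atomics are scalarized: each enabled lane performs its operation one at a
+	// time. Lanes that are inactive, or that belong to helper invocations, are
+	// masked out so they produce no side effects.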
+	SIMD::UInt x(0);
+	auto mask = state->activeLaneMask() & state->storesAndAtomicsMask();
+	for (int j = 0; j < SIMD::Width; j++)
+	{
+		If(Extract(mask, j) != 0)
+		{
+			auto offset = Extract(ptrOffsets, j);
+			auto laneValue = Extract(value, j);
+			UInt v;
+			switch (insn.opcode())
+			{
+			case spv::OpAtomicIAdd:
+			case spv::OpAtomicIIncrement:
+				v = AddAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, memoryOrder);
+				break;
+			case spv::OpAtomicISub:
+			case spv::OpAtomicIDecrement:
+				v = SubAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, memoryOrder);
+				break;
+			case spv::OpAtomicAnd:
+				v = AndAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, memoryOrder);
+				break;
+			case spv::OpAtomicOr:
+				v = OrAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, memoryOrder);
+				break;
+			case spv::OpAtomicXor:
+				v = XorAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, memoryOrder);
+				break;
+			case spv::OpAtomicSMin:
+				v = As<UInt>(MinAtomic(Pointer<Int>(&ptr.base[offset]), As<Int>(laneValue), memoryOrder));
+				break;
+			case spv::OpAtomicSMax:
+				v = As<UInt>(MaxAtomic(Pointer<Int>(&ptr.base[offset]), As<Int>(laneValue), memoryOrder));
+				break;
+			case spv::OpAtomicUMin:
+				v = MinAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, memoryOrder);
+				break;
+			case spv::OpAtomicUMax:
+				v = MaxAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, memoryOrder);
+				break;
+			case spv::OpAtomicExchange:
+				v = ExchangeAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, memoryOrder);
+				break;
+			default:
+				UNREACHABLE("%s", OpcodeName(insn.opcode()).c_str());
+				break;
+			}
+			x = Insert(x, v, j);
+		}
+	}
+
+	dst.move(0, x);
+	return EmitResult::Continue;
+}
+
+SpirvShader::EmitResult SpirvShader::EmitAtomicCompareExchange(InsnIterator insn, EmitState *state) const
+{
+	// Separate from EmitAtomicOp due to different instruction encoding
+	auto &resultType = getType(Type::ID(insn.word(1)));
+	Object::ID resultId = insn.word(2);
+
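+	// OpAtomicCompareExchange specifies two memory semantics: one applied when
+	// the comparison succeeds and the exchange is performed, and one applied
+	// when it fails.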
+	auto memorySemanticsEqual = static_cast<spv::MemorySemanticsMask>(getObject(insn.word(5)).constantValue[0]);
+	auto memoryOrderEqual = MemoryOrder(memorySemanticsEqual);
+	auto memorySemanticsUnequal = static_cast<spv::MemorySemanticsMask>(getObject(insn.word(6)).constantValue[0]);
+	auto memoryOrderUnequal = MemoryOrder(memorySemanticsUnequal);
+
+	auto value = GenericValue(this, state, insn.word(7));
+	auto comparator = GenericValue(this, state, insn.word(8));
+	auto &dst = state->createIntermediate(resultId, resultType.sizeInComponents);
+	auto ptr = state->getPointer(insn.word(3));
+	auto ptrOffsets = ptr.offsets();
+
+	SIMD::UInt x(0);
+	auto mask = state->activeLaneMask() & state->storesAndAtomicsMask();
+	for (int j = 0; j < SIMD::Width; j++)
+	{
+		If(Extract(mask, j) != 0)
+		{
+			auto offset = Extract(ptrOffsets, j);
+			auto laneValue = Extract(value.UInt(0), j);
+			auto laneComparator = Extract(comparator.UInt(0), j);
+			UInt v = CompareExchangeAtomic(Pointer<UInt>(&ptr.base[offset]), laneValue, laneComparator, memoryOrderEqual, memoryOrderUnequal);
+			x = Insert(x, v, j);
+		}
+	}
+
+	dst.move(0, x);
+	return EmitResult::Continue;
+}
+
+SpirvShader::EmitResult SpirvShader::EmitCopyObject(InsnIterator insn, EmitState *state) const
+{
+	auto ty = getType(insn.word(1));
+	auto &dst = state->createIntermediate(insn.word(2), ty.sizeInComponents);
+	auto src = GenericValue(this, state, insn.word(3));
+	for (uint32_t i = 0; i < ty.sizeInComponents; i++)
+	{
+		dst.move(i, src.Int(i));
+	}
+	return EmitResult::Continue;
+}
+
+SpirvShader::EmitResult SpirvShader::EmitArrayLength(InsnIterator insn, EmitState *state) const
+{
+	auto resultTyId = Type::ID(insn.word(1));
+	auto resultId = Object::ID(insn.word(2));
+	auto structPtrId = Object::ID(insn.word(3));
+	auto arrayFieldIdx = insn.word(4);
+
+	auto &resultType = getType(resultTyId);
+	ASSERT(resultType.sizeInComponents == 1);
+	ASSERT(resultType.definition.opcode() == spv::OpTypeInt);
+
+	auto &structPtrTy = getType(getObject(structPtrId).type);
+	auto &structTy = getType(structPtrTy.element);
+	auto &arrayTy = getType(structTy.definition.word(2 + arrayFieldIdx));
+	ASSERT(arrayTy.definition.opcode() == spv::OpTypeRuntimeArray);
+	auto &arrayElTy = getType(arrayTy.element);
+
+	auto &result = state->createIntermediate(resultId, 1);
+	auto structBase = GetPointerToData(structPtrId, 0, state);
+
+	Decorations d = {};
+	ApplyDecorationsForIdMember(&d, structPtrTy.element, arrayFieldIdx);
+	ASSERT(d.HasOffset);
+
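+	// The runtime array length is inferred from the remaining buffer range:
+	// (buffer limit - byte offset of the array) divided by the element size.
+	// Note that this assumes tightly packed 32-bit components in the elements.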
+	auto arrayBase = structBase + d.Offset;
+	auto arraySizeInBytes = SIMD::Int(arrayBase.limit()) - arrayBase.offsets();
+	auto arrayLength = arraySizeInBytes / SIMD::Int(arrayElTy.sizeInComponents * sizeof(float));
+
+	result.move(0, SIMD::Int(arrayLength));
+
+	return EmitResult::Continue;
+}
+
+uint32_t SpirvShader::GetConstScalarInt(Object::ID id) const
+{
+	auto &scopeObj = getObject(id);
+	ASSERT(scopeObj.kind == Object::Kind::Constant);
+	ASSERT(getType(scopeObj.type).sizeInComponents == 1);
+	return scopeObj.constantValue[0];
+}
+
+void SpirvShader::emitEpilog(SpirvRoutine *routine) const
+{
+	for (auto insn : *this)
+	{
+		switch (insn.opcode())
+		{
+		case spv::OpVariable:
+		{
+			Object::ID resultId = insn.word(2);
+			auto &object = getObject(resultId);
+			auto &objectTy = getType(object.type);
+			if (object.kind == Object::Kind::InterfaceVariable && objectTy.storageClass == spv::StorageClassOutput)
+			{
+				auto &dst = routine->getVariable(resultId);
+				int offset = 0;
+				VisitInterface(resultId,
+							   [&](Decorations const &d, AttribType type) {
+								   auto scalarSlot = d.Location << 2 | d.Component;
+								   routine->outputs[scalarSlot] = dst[offset++];
+							   });
+			}
+			break;
+		}
+		default:
+			break;
+		}
+	}
+
+	// Clear phis that are no longer used. This serves two purposes:
+	// (1) The phi rr::Variables are destructed, preventing pointless
+	//     materialization.
+	// (2) Frees memory that will never be used again.
+	routine->phis.clear();
+}
+
+VkShaderStageFlagBits SpirvShader::executionModelToStage(spv::ExecutionModel model)
+{
+	switch (model)
+	{
+	case spv::ExecutionModelVertex:                 return VK_SHADER_STAGE_VERTEX_BIT;
+	// case spv::ExecutionModelTessellationControl:    return VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
+	// case spv::ExecutionModelTessellationEvaluation: return VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
+	// case spv::ExecutionModelGeometry:               return VK_SHADER_STAGE_GEOMETRY_BIT;
+	case spv::ExecutionModelFragment:               return VK_SHADER_STAGE_FRAGMENT_BIT;
+	case spv::ExecutionModelGLCompute:              return VK_SHADER_STAGE_COMPUTE_BIT;
+	// case spv::ExecutionModelKernel:                 return VkShaderStageFlagBits(0); // Not supported by vulkan.
+	// case spv::ExecutionModelTaskNV:                 return VK_SHADER_STAGE_TASK_BIT_NV;
+	// case spv::ExecutionModelMeshNV:                 return VK_SHADER_STAGE_MESH_BIT_NV;
+	// case spv::ExecutionModelRayGenerationNV:        return VK_SHADER_STAGE_RAYGEN_BIT_NV;
+	// case spv::ExecutionModelIntersectionNV:         return VK_SHADER_STAGE_INTERSECTION_BIT_NV;
+	// case spv::ExecutionModelAnyHitNV:               return VK_SHADER_STAGE_ANY_HIT_BIT_NV;
+	// case spv::ExecutionModelClosestHitNV:           return VK_SHADER_STAGE_CLOSEST_HIT_BIT_NV;
+	// case spv::ExecutionModelMissNV:                 return VK_SHADER_STAGE_MISS_BIT_NV;
+	// case spv::ExecutionModelCallableNV:             return VK_SHADER_STAGE_CALLABLE_BIT_NV;
+	default:
+		UNSUPPORTED("ExecutionModel: %d", int(model));
+		return VkShaderStageFlagBits(0);
+	}
+}
+
+SpirvShader::GenericValue::GenericValue(SpirvShader const *shader, EmitState const *state, SpirvShader::Object::ID objId) :
+		obj(shader->getObject(objId)),
+		intermediate(obj.kind == SpirvShader::Object::Kind::Intermediate ? &state->getIntermediate(objId) : nullptr),
+		type(obj.type) {}
+
+SpirvRoutine::SpirvRoutine(vk::PipelineLayout const *pipelineLayout) :
+	pipelineLayout(pipelineLayout)
+{
+}
+
+void SpirvRoutine::setImmutableInputBuiltins(SpirvShader const *shader)
+{
+	setInputBuiltin(shader, spv::BuiltInSubgroupLocalInvocationId, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		ASSERT(builtin.SizeInComponents == 1);
+		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(0, 1, 2, 3));
+	});
+
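+	// The subgroup mask builtins below are uvec4 values. With a subgroup size
+	// equal to SIMD::Width (4), only the low bits of the first component are
+	// ever set: SubgroupEqMask is 1 << SubgroupLocalInvocationId, and the
+	// Ge/Gt/Le/Lt masks follow from it.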
+	setInputBuiltin(shader, spv::BuiltInSubgroupEqMask, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		ASSERT(builtin.SizeInComponents == 4);
+		value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(1, 2, 4, 8));
+		value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		value[builtin.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+	});
+
+	setInputBuiltin(shader, spv::BuiltInSubgroupGeMask, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		ASSERT(builtin.SizeInComponents == 4);
+		value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(15, 14, 12, 8));
+		value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		value[builtin.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+	});
+
+	setInputBuiltin(shader, spv::BuiltInSubgroupGtMask, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		ASSERT(builtin.SizeInComponents == 4);
+		value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(14, 12, 8, 0));
+		value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		value[builtin.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+	});
+
+	setInputBuiltin(shader, spv::BuiltInSubgroupLeMask, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		ASSERT(builtin.SizeInComponents == 4);
+		value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(1, 3, 7, 15));
+		value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		value[builtin.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+	});
+
+	setInputBuiltin(shader, spv::BuiltInSubgroupLtMask, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		ASSERT(builtin.SizeInComponents == 4);
+		value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(0, 1, 3, 7));
+		value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		value[builtin.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+	});
+
+	setInputBuiltin(shader, spv::BuiltInDeviceIndex, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	{
+		ASSERT(builtin.SizeInComponents == 1);
+		// Only a single physical device is supported.
+		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+	});
+}
+
+}  // namespace sw
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index c65b86d..1876dec 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -42,1144 +42,1145 @@
 
 #undef Yield // b/127920555
 
-namespace vk
-{
-	class PipelineLayout;
-	class ImageView;
-	class Sampler;
-	class RenderPass;
-	struct SampledImageDescriptor;
-} // namespace vk
+namespace vk {
 
-namespace sw
-{
-	// Forward declarations.
-	class SpirvRoutine;
+class PipelineLayout;
+class ImageView;
+class Sampler;
+class RenderPass;
+struct SampledImageDescriptor;
 
-	// Incrementally constructed complex bundle of rvalues
-	// Effectively a restricted vector, supporting only:
-	// - allocation to a (runtime-known) fixed size
-	// - in-place construction of elements
-	// - const operator[]
-	class Intermediate
+}  // namespace vk
+
+namespace sw {
+
+// Forward declarations.
+class SpirvRoutine;
+
+// Incrementally constructed complex bundle of rvalues
+// Effectively a restricted vector, supporting only:
+// - allocation to a (runtime-known) fixed size
+// - in-place construction of elements
+// - const operator[]
+class Intermediate
+{
+public:
+	Intermediate(uint32_t size) : scalar(new rr::Value*[size]), size(size)
+	{
+		memset(scalar, 0, sizeof(rr::Value*) * size);
+	}
+
+	~Intermediate()
+	{
+		delete[] scalar;
+	}
+
+	void move(uint32_t i, RValue<SIMD::Float> &&scalar) { emplace(i, scalar.value); }
+	void move(uint32_t i, RValue<SIMD::Int> &&scalar)   { emplace(i, scalar.value); }
+	void move(uint32_t i, RValue<SIMD::UInt> &&scalar)  { emplace(i, scalar.value); }
+
+	void move(uint32_t i, const RValue<SIMD::Float> &scalar) { emplace(i, scalar.value); }
+	void move(uint32_t i, const RValue<SIMD::Int> &scalar)   { emplace(i, scalar.value); }
+	void move(uint32_t i, const RValue<SIMD::UInt> &scalar)  { emplace(i, scalar.value); }
+
+	// Value retrieval functions.
+	RValue<SIMD::Float> Float(uint32_t i) const
+	{
+		ASSERT(i < size);
+		ASSERT(scalar[i] != nullptr);
+		return As<SIMD::Float>(scalar[i]);  // TODO(b/128539387): RValue<SIMD::Float>(scalar)
+	}
+
+	RValue<SIMD::Int> Int(uint32_t i) const
+	{
+		ASSERT(i < size);
+		ASSERT(scalar[i] != nullptr);
+		return As<SIMD::Int>(scalar[i]);  // TODO(b/128539387): RValue<SIMD::Int>(scalar)
+	}
+
+	RValue<SIMD::UInt> UInt(uint32_t i) const
+	{
+		ASSERT(i < size);
+		ASSERT(scalar[i] != nullptr);
+		return As<SIMD::UInt>(scalar[i]);  // TODO(b/128539387): RValue<SIMD::UInt>(scalar)
+	}
+
+	// No copy/move construction or assignment
+	Intermediate(Intermediate const &) = delete;
+	Intermediate(Intermediate &&) = delete;
+	Intermediate & operator=(Intermediate const &) = delete;
+	Intermediate & operator=(Intermediate &&) = delete;
+
+private:
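+	// emplace() enforces single-assignment: the ASSERTs below guarantee that a
+	// component slot is written exactly once, via one of the move() overloads.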
+	void emplace(uint32_t i, rr::Value *value)
+	{
+		ASSERT(i < size);
+		ASSERT(scalar[i] == nullptr);
+		scalar[i] = value;
+	}
+
+	rr::Value **const scalar;
+	uint32_t size;
+};
+
+class SpirvShader
+{
+public:
+	using InsnStore = std::vector<uint32_t>;
+	InsnStore insns;
+
+	using ImageSampler = void(void* texture, void *sampler, void* uvsIn, void* texelOut, void* constants);
+
+	enum class YieldResult
+	{
+		ControlBarrier,
+	};
+
+	/* Pseudo-iterator over SPIRV instructions, designed to support range-based-for. */
+	class InsnIterator
+	{
+		InsnStore::const_iterator iter;
+
+	public:
+		spv::Op opcode() const
+		{
+			return static_cast<spv::Op>(*iter & spv::OpCodeMask);
+		}
+
+		uint32_t wordCount() const
+		{
+			return *iter >> spv::WordCountShift;
+		}
+
+		uint32_t word(uint32_t n) const
+		{
+			ASSERT(n < wordCount());
+			return iter[n];
+		}
+
+		uint32_t const * wordPointer(uint32_t n) const
+		{
+			ASSERT(n < wordCount());
+			return &iter[n];
+		}
+
+		const char* string(uint32_t n) const
+		{
+			return reinterpret_cast<const char*>(wordPointer(n));
+		}
+
+		bool operator==(InsnIterator const &other) const
+		{
+			return iter == other.iter;
+		}
+
+		bool operator!=(InsnIterator const &other) const
+		{
+			return iter != other.iter;
+		}
+
+		InsnIterator operator*() const
+		{
+			return *this;
+		}
+
+		InsnIterator &operator++()
+		{
+			iter += wordCount();
+			return *this;
+		}
+
+		InsnIterator const operator++(int)
+		{
+			InsnIterator ret{*this};
+			iter += wordCount();
+			return ret;
+		}
+
+		InsnIterator(InsnIterator const &other) = default;
+
+		InsnIterator() = default;
+
+		explicit InsnIterator(InsnStore::const_iterator iter) : iter{iter}
+		{
+		}
+	};
+
+	/* range-based-for interface */
+	InsnIterator begin() const
+	{
+		return InsnIterator{insns.cbegin() + 5};
+	}
+
+	InsnIterator end() const
+	{
+		return InsnIterator{insns.cend()};
+	}
+
+	class Type
 	{
 	public:
-		Intermediate(uint32_t size) : scalar(new rr::Value*[size]), size(size) {
-			memset(scalar, 0, sizeof(rr::Value*) * size);
-		}
+		using ID = SpirvID<Type>;
 
-		~Intermediate()
+		spv::Op opcode() const { return definition.opcode(); }
+
+		InsnIterator definition;
+		spv::StorageClass storageClass = static_cast<spv::StorageClass>(-1);
+		uint32_t sizeInComponents = 0;
+		bool isBuiltInBlock = false;
+
+		// Inner element type for pointers, arrays, vectors and matrices.
+		ID element;
+	};
+
+	class Object
+	{
+	public:
+		using ID = SpirvID<Object>;
+
+		spv::Op opcode() const { return definition.opcode(); }
+
+		InsnIterator definition;
+		Type::ID type;
+		std::unique_ptr<uint32_t[]> constantValue = nullptr;
+
+		enum class Kind
 		{
-			delete[] scalar;
+			// Invalid default kind.
+			// If we get left with an object in this state, the module was
+			// broken.
+			Unknown,
+
+			// TODO: Better document this kind.
+			// A shader interface variable pointer.
+			// Pointer with uniform address across all lanes.
+			// Pointer held by SpirvRoutine::pointers
+			InterfaceVariable,
+
+			// Constant value held by Object::constantValue.
+			Constant,
+
+			// Value held by SpirvRoutine::intermediates.
+			Intermediate,
+
+			// Pointer held by SpirvRoutine::pointers
+			Pointer,
+
+			// A pointer to a vk::DescriptorSet*.
+			// Pointer held by SpirvRoutine::pointers.
+			DescriptorSet,
+		};
+
+		Kind kind = Kind::Unknown;
+	};
+
+	// Block is an interval of SPIR-V instructions, starting with the
+	// opening OpLabel, and ending with a termination instruction.
+	class Block
+	{
+	public:
+		using ID = SpirvID<Block>;
+		using Set = std::unordered_set<ID>;
+
+		// Edge represents the graph edge between two blocks.
+		struct Edge
+		{
+			ID from;
+			ID to;
+
+			bool operator == (const Edge& other) const { return from == other.from && to == other.to; }
+
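+			// Hash for use in unordered containers; 31 is an arbitrary odd
+			// multiplier used to combine the two block identifiers.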
+			struct Hash
+			{
+				std::size_t operator()(const Edge& edge) const noexcept
+				{
+					return std::hash<uint32_t>()(edge.from.value() * 31 + edge.to.value());
+				}
+			};
+		};
+
+		Block() = default;
+		Block(const Block& other) = default;
+		explicit Block(InsnIterator begin, InsnIterator end);
+
+		/* range-based-for interface */
+		inline InsnIterator begin() const { return begin_; }
+		inline InsnIterator end() const { return end_; }
+
+		enum Kind
+		{
+			Simple, // OpBranch or other simple terminator.
+			StructuredBranchConditional, // OpSelectionMerge + OpBranchConditional
+			UnstructuredBranchConditional, // OpBranchConditional
+			StructuredSwitch, // OpSelectionMerge + OpSwitch
+			UnstructuredSwitch, // OpSwitch
+			Loop, // OpLoopMerge + [OpBranchConditional | OpBranch]
+		};
+
+		Kind kind = Simple;
+		InsnIterator mergeInstruction; // Structured control flow merge instruction.
+		InsnIterator branchInstruction; // Branch instruction.
+		ID mergeBlock; // Structured flow merge block.
+		ID continueTarget; // Loop continue block.
+		Set ins; // Blocks that branch into this block.
+		Set outs; // Blocks that this block branches to.
+		bool isLoopMerge = false;
+	private:
+		InsnIterator begin_;
+		InsnIterator end_;
+	};
+
+	class Function
+	{
+	public:
+		using ID = SpirvID<Function>;
+
+		// Walks all the blocks reachable from id, adding them to reachable.
+		void TraverseReachableBlocks(Block::ID id, Block::Set& reachable) const;
+
+		// AssignBlockFields() performs the following for all reachable blocks:
+		// * Assigns Block::ins with the identifiers of all blocks that contain
+		//   this block in their Block::outs.
+		// * Sets Block::isLoopMerge to true if the block is the merge of
+		//   another loop block.
+		void AssignBlockFields();
+
+		// ForeachBlockDependency calls f with each dependency of the given
+		// block. A dependency is an incoming block that is not a loop-back
+		// edge.
+		void ForeachBlockDependency(Block::ID blockId, std::function<void(Block::ID)> f) const;
+
+		// ExistsPath returns true if there's a direct or indirect flow from
+		// the 'from' block to the 'to' block that does not pass through
+		// notPassingThrough.
+		bool ExistsPath(Block::ID from, Block::ID to, Block::ID notPassingThrough) const;
+
+		Block const &getBlock(Block::ID id) const
+		{
+			auto it = blocks.find(id);
+			ASSERT_MSG(it != blocks.end(), "Unknown block %d", id.value());
+			return it->second;
 		}
 
-		void move(uint32_t i, RValue<SIMD::Float> &&scalar) { emplace(i, scalar.value); }
-		void move(uint32_t i, RValue<SIMD::Int> &&scalar)   { emplace(i, scalar.value); }
-		void move(uint32_t i, RValue<SIMD::UInt> &&scalar)  { emplace(i, scalar.value); }
+		Block::ID entry; // function entry point block.
+		HandleMap<Block> blocks; // blocks belonging to this function.
+		Type::ID type; // type of the function.
+		Type::ID result; // return type.
+	};
 
-		void move(uint32_t i, const RValue<SIMD::Float> &scalar) { emplace(i, scalar.value); }
-		void move(uint32_t i, const RValue<SIMD::Int> &scalar)   { emplace(i, scalar.value); }
-		void move(uint32_t i, const RValue<SIMD::UInt> &scalar)  { emplace(i, scalar.value); }
+	struct TypeOrObject {}; // Dummy struct to represent a Type or Object.
 
-		// Value retrieval functions.
+	// TypeOrObjectID is an identifier that represents a Type or an Object,
+	// and supports implicit casting to and from Type::ID or Object::ID.
+	class TypeOrObjectID : public SpirvID<TypeOrObject>
+	{
+	public:
+		using Hash = std::hash<SpirvID<TypeOrObject>>;
+
+		inline TypeOrObjectID(uint32_t id) : SpirvID(id) {}
+		inline TypeOrObjectID(Type::ID id) : SpirvID(id.value()) {}
+		inline TypeOrObjectID(Object::ID id) : SpirvID(id.value()) {}
+		inline operator Type::ID() const { return Type::ID(value()); }
+		inline operator Object::ID() const { return Object::ID(value()); }
+	};
+
+	// OpImageSample variants
+	enum Variant
+	{
+		None,  // No Dref or Proj. Also used by OpImageFetch and OpImageQueryLod.
+		Dref,
+		Proj,
+		ProjDref,
+		VARIANT_LAST = ProjDref
+	};
+
+	// Compact representation of image instruction parameters that is passed to the
+	// trampoline function for retrieving/generating the corresponding sampling routine.
+	struct ImageInstruction
+	{
+		ImageInstruction(Variant variant, SamplerMethod samplerMethod)
+			: parameters(0)
+		{
+			this->variant = variant;
+			this->samplerMethod = samplerMethod;
+		}
+
+		// Unmarshal from raw 32-bit data
+		ImageInstruction(uint32_t parameters) : parameters(parameters) {}
+
+		SamplerFunction getSamplerFunction() const
+		{
+			return { static_cast<SamplerMethod>(samplerMethod), offset != 0, sample != 0 };
+		}
+
+		bool isDref() const
+		{
+			return (variant == Dref) || (variant == ProjDref);
+		}
+
+		bool isProj() const
+		{
+			return (variant == Proj) || (variant == ProjDref);
+		}
+
+		union
+		{
+			struct
+			{
+				uint32_t variant : BITS(VARIANT_LAST);
+				uint32_t samplerMethod : BITS(SAMPLER_METHOD_LAST);
+				uint32_t gatherComponent : 2;
+
+				// Parameters are passed to the sampling routine in this order:
+				uint32_t coordinates : 3;       // 1-4 (does not contain projection component)
+			//	uint32_t dref : 1;              // Indicated by Variant::ProjDref|Dref
+			//	uint32_t lodOrBias : 1;         // Indicated by SamplerMethod::Lod|Bias|Fetch
+				uint32_t grad : 2;              // 0-3 components (for each of dx / dy)
+				uint32_t offset : 2;            // 0-3 components
+				uint32_t sample : 1;            // 0-1 scalar integer
+			};
+
+			uint32_t parameters;
+		};
+	};
+
+	static_assert(sizeof(ImageInstruction) == sizeof(uint32_t), "ImageInstruction must be 32-bit");
+
+	// This method is for retrieving an ID that uniquely identifies the
+	// shader entry point represented by this object.
+	uint64_t getSerialID() const
+	{
+		return  ((uint64_t)entryPoint.value() << 32) | codeSerialID;
+	}
+
+	SpirvShader(uint32_t codeSerialID,
+	            VkShaderStageFlagBits stage,
+	            const char *entryPointName,
+	            InsnStore const &insns,
+	            const vk::RenderPass *renderPass,
+	            uint32_t subpassIndex,
+	            bool robustBufferAccess);
+
+	struct Modes
+	{
+		bool EarlyFragmentTests : 1;
+		bool DepthReplacing : 1;
+		bool DepthGreater : 1;
+		bool DepthLess : 1;
+		bool DepthUnchanged : 1;
+		bool ContainsKill : 1;
+		bool ContainsControlBarriers : 1;
+		bool NeedsCentroid : 1;
+
+		// Compute workgroup dimensions
+		int WorkgroupSizeX = 1, WorkgroupSizeY = 1, WorkgroupSizeZ = 1;
+	};
+
+	Modes const &getModes() const
+	{
+		return modes;
+	}
+
+	struct Capabilities
+	{
+		bool Matrix : 1;
+		bool Shader : 1;
+		bool ClipDistance : 1;
+		bool CullDistance : 1;
+		bool InputAttachment : 1;
+		bool Sampled1D : 1;
+		bool Image1D : 1;
+		bool ImageCubeArray : 1;
+		bool SampledBuffer : 1;
+		bool SampledCubeArray : 1;
+		bool ImageBuffer : 1;
+		bool StorageImageExtendedFormats : 1;
+		bool ImageQuery : 1;
+		bool DerivativeControl : 1;
+		bool GroupNonUniform : 1;
+		bool GroupNonUniformVote : 1;
+		bool GroupNonUniformBallot : 1;
+		bool GroupNonUniformShuffle : 1;
+		bool GroupNonUniformShuffleRelative : 1;
+		bool GroupNonUniformArithmetic : 1;
+		bool DeviceGroup : 1;
+		bool MultiView : 1;
+	};
+
+	Capabilities const &getUsedCapabilities() const
+	{
+		return capabilities;
+	}
+
+	// getNumOutputClipDistances() returns the number of ClipDistances
+	// outputted by this shader.
+	unsigned int getNumOutputClipDistances() const
+	{
+		if (getUsedCapabilities().ClipDistance)
+		{
+			auto it = outputBuiltins.find(spv::BuiltInClipDistance);
+			if(it != outputBuiltins.end())
+			{
+				return it->second.SizeInComponents;
+			}
+		}
+		return 0;
+	}
+
+	// getNumOutputCullDistances() returns the number of CullDistances
+	// outputted by this shader.
+	unsigned int getNumOutputCullDistances() const
+	{
+		if (getUsedCapabilities().CullDistance)
+		{
+			auto it = outputBuiltins.find(spv::BuiltInCullDistance);
+			if(it != outputBuiltins.end())
+			{
+				return it->second.SizeInComponents;
+			}
+		}
+		return 0;
+	}
+
+	enum AttribType : unsigned char
+	{
+		ATTRIBTYPE_FLOAT,
+		ATTRIBTYPE_INT,
+		ATTRIBTYPE_UINT,
+		ATTRIBTYPE_UNUSED,
+
+		ATTRIBTYPE_LAST = ATTRIBTYPE_UINT
+	};
+
+	bool hasBuiltinInput(spv::BuiltIn b) const
+	{
+		return inputBuiltins.find(b) != inputBuiltins.end();
+	}
+
+	bool hasBuiltinOutput(spv::BuiltIn b) const
+	{
+		return outputBuiltins.find(b) != outputBuiltins.end();
+	}
+
+	struct Decorations
+	{
+		int32_t Location = -1;
+		int32_t Component = 0;
+		spv::BuiltIn BuiltIn = static_cast<spv::BuiltIn>(-1);
+		int32_t Offset = -1;
+		int32_t ArrayStride = -1;
+		int32_t MatrixStride = -1;
+
+		bool HasLocation : 1;
+		bool HasComponent : 1;
+		bool HasBuiltIn : 1;
+		bool HasOffset : 1;
+		bool HasArrayStride : 1;
+		bool HasMatrixStride : 1;
+		bool HasRowMajor : 1;		// whether RowMajor bit is valid.
+
+		bool Flat : 1;
+		bool Centroid : 1;
+		bool NoPerspective : 1;
+		bool Block : 1;
+		bool BufferBlock : 1;
+		bool RelaxedPrecision : 1;
+		bool RowMajor : 1;			// RowMajor if true; ColMajor if false
+		bool InsideMatrix : 1;		// pseudo-decoration for whether we're inside a matrix.
+
+		Decorations()
+				: Location{-1}, Component{0},
+				  BuiltIn{static_cast<spv::BuiltIn>(-1)},
+				  Offset{-1}, ArrayStride{-1}, MatrixStride{-1},
+				  HasLocation{false}, HasComponent{false},
+				  HasBuiltIn{false}, HasOffset{false},
+				  HasArrayStride{false}, HasMatrixStride{false},
+				  HasRowMajor{false},
+				  Flat{false}, Centroid{false}, NoPerspective{false},
+				  Block{false}, BufferBlock{false},
+				  RelaxedPrecision{false}, RowMajor{false},
+				  InsideMatrix{false}
+		{
+		}
+
+		Decorations(Decorations const &) = default;
+
+		void Apply(Decorations const &src);
+
+		void Apply(spv::Decoration decoration, uint32_t arg);
+	};
+
+	std::unordered_map<TypeOrObjectID, Decorations, TypeOrObjectID::Hash> decorations;
+	std::unordered_map<Type::ID, std::vector<Decorations>> memberDecorations;
+
+	struct DescriptorDecorations
+	{
+		int32_t DescriptorSet = -1;
+		int32_t Binding = -1;
+		int32_t InputAttachmentIndex = -1;
+
+		void Apply(DescriptorDecorations const &src);
+	};
+
+	std::unordered_map<Object::ID, DescriptorDecorations> descriptorDecorations;
+	std::vector<VkFormat> inputAttachmentFormats;
+
+	struct InterfaceComponent
+	{
+		AttribType Type;
+
+		union
+		{
+			struct
+			{
+				bool Flat : 1;
+				bool Centroid : 1;
+				bool NoPerspective : 1;
+			};
+
+			uint8_t DecorationBits;
+		};
+
+		InterfaceComponent()
+			: Type{ATTRIBTYPE_UNUSED}, DecorationBits{0}
+		{
+		}
+	};
+
+	struct BuiltinMapping
+	{
+		Object::ID Id;
+		uint32_t FirstComponent;
+		uint32_t SizeInComponents;
+	};
+
+	struct WorkgroupMemory
+	{
+		// allocates a new variable of 'size' bytes with the given identifier.
+		inline void allocate(Object::ID id, uint32_t size)
+		{
+			uint32_t offset = totalSize;
+			auto it = offsets.emplace(id, offset);
+			ASSERT_MSG(it.second, "WorkgroupMemory already has an allocation for object %d", int(id.value()));
+			totalSize += size;
+		}
+		// returns the byte offset of the variable with the given identifier.
+		inline uint32_t offsetOf(Object::ID id) const
+		{
+			auto it = offsets.find(id);
+			ASSERT_MSG(it != offsets.end(), "WorkgroupMemory has no allocation for object %d", int(id.value()));
+			return it->second;
+		}
+		// returns the total allocated size in bytes.
+		inline uint32_t size() const { return totalSize; }
+	private:
+		uint32_t totalSize = 0; // in bytes
+		std::unordered_map<Object::ID, uint32_t> offsets; // in bytes
+	};
+
+	std::vector<InterfaceComponent> inputs;
+	std::vector<InterfaceComponent> outputs;
+
+	void emitProlog(SpirvRoutine *routine) const;
+	void emit(SpirvRoutine *routine, RValue<SIMD::Int> const &activeLaneMask, RValue<SIMD::Int> const &storesAndAtomicsMask, const vk::DescriptorSet::Bindings &descriptorSets) const;
+	void emitEpilog(SpirvRoutine *routine) const;
+
+	using BuiltInHash = std::hash<std::underlying_type<spv::BuiltIn>::type>;
+	std::unordered_map<spv::BuiltIn, BuiltinMapping, BuiltInHash> inputBuiltins;
+	std::unordered_map<spv::BuiltIn, BuiltinMapping, BuiltInHash> outputBuiltins;
+	WorkgroupMemory workgroupMemory;
+
+private:
+	const uint32_t codeSerialID;
+	Modes modes = {};
+	Capabilities capabilities = {};
+	HandleMap<Type> types;
+	HandleMap<Object> defs;
+	HandleMap<Function> functions;
+	Function::ID entryPoint;
+
+	const bool robustBufferAccess = true;
+	spv::ExecutionModel executionModel = spv::ExecutionModelMax; // Invalid prior to OpEntryPoint parsing.
+
+	// DeclareType creates a Type for the given OpTypeX instruction, storing
+	// it into the types map. It is called from the analysis pass (constructor).
+	void DeclareType(InsnIterator insn);
+
+	void ProcessExecutionMode(InsnIterator it);
+
+	uint32_t ComputeTypeSize(InsnIterator insn);
+	void ApplyDecorationsForId(Decorations *d, TypeOrObjectID id) const;
+	void ApplyDecorationsForIdMember(Decorations *d, Type::ID id, uint32_t member) const;
+	void ApplyDecorationsForAccessChain(Decorations *d, DescriptorDecorations *dd, Object::ID baseId, uint32_t numIndexes, uint32_t const *indexIds) const;
+
+	// Creates an Object for the instruction's result in 'defs'.
+	void DefineResult(const InsnIterator &insn);
+
+	// Returns true if data in the given storage class is word-interleaved
+	// by each SIMD vector lane, otherwise data is stored linearly.
+	//
+	// Each lane addresses a single word, picked by a base pointer and an
+	// integer offset.
+	//
+	// A word is currently 32 bits (single float, int32_t, uint32_t).
+	// A lane is a single element of a SIMD vector register.
+	//
+	// Storage interleaved by lane - (IsStorageInterleavedByLane() == true):
+	// ---------------------------------------------------------------------
+	//
+	// Address = PtrBase + sizeof(Word) * (SIMD::Width * LaneOffset + LaneIndex)
+	//
+	// Assuming SIMD::Width == 4:
+	//
+	//                   Lane[0]  |  Lane[1]  |  Lane[2]  |  Lane[3]
+	//                 ===========+===========+===========+==========
+	//  LaneOffset=0: |  Word[0]  |  Word[1]  |  Word[2]  |  Word[3]
+	// ---------------+-----------+-----------+-----------+----------
+	//  LaneOffset=1: |  Word[4]  |  Word[5]  |  Word[6]  |  Word[7]
+	// ---------------+-----------+-----------+-----------+----------
+	//  LaneOffset=2: |  Word[8]  |  Word[9]  |  Word[a]  |  Word[b]
+	// ---------------+-----------+-----------+-----------+----------
+	//  LaneOffset=3: |  Word[c]  |  Word[d]  |  Word[e]  |  Word[f]
+	//
+	//
+	// Linear storage - (IsStorageInterleavedByLane() == false):
+	// ---------------------------------------------------------
+	//
+	// Address = PtrBase + sizeof(Word) * LaneOffset
+	//
+	//                   Lane[0]  |  Lane[1]  |  Lane[2]  |  Lane[3]
+	//                 ===========+===========+===========+==========
+	//  LaneOffset=0: |  Word[0]  |  Word[0]  |  Word[0]  |  Word[0]
+	// ---------------+-----------+-----------+-----------+----------
+	//  LaneOffset=1: |  Word[1]  |  Word[1]  |  Word[1]  |  Word[1]
+	// ---------------+-----------+-----------+-----------+----------
+	//  LaneOffset=2: |  Word[2]  |  Word[2]  |  Word[2]  |  Word[2]
+	// ---------------+-----------+-----------+-----------+----------
+	//  LaneOffset=3: |  Word[3]  |  Word[3]  |  Word[3]  |  Word[3]
+	//
+	static bool IsStorageInterleavedByLane(spv::StorageClass storageClass);
+	static bool IsExplicitLayout(spv::StorageClass storageClass);
+
+	static sw::SIMD::Pointer InterleaveByLane(sw::SIMD::Pointer p);
+
+	// Output storage buffers and images should not be affected by helper invocations
+	static bool StoresInHelperInvocation(spv::StorageClass storageClass);
+
+	using InterfaceVisitor = std::function<void(Decorations const, AttribType)>;
+
+	void VisitInterface(Object::ID id, const InterfaceVisitor& v) const;
+
+	int VisitInterfaceInner(Type::ID id, Decorations d, const InterfaceVisitor& v) const;
+
+	// MemoryElement describes a scalar element within a structure, and is
+	// used by the callback function of VisitMemoryObject().
+	struct MemoryElement
+	{
+		uint32_t index;   // index of the scalar element
+		uint32_t offset;  // offset (in bytes) from the base of the object
+		const Type& type; // element type
+	};
+
+	using MemoryVisitor = std::function<void(const MemoryElement&)>;
+
+	// VisitMemoryObject() walks a type tree in an explicitly laid out
+	// storage class, calling the MemoryVisitor for each scalar element
+	// within the object.
+	void VisitMemoryObject(Object::ID id, const MemoryVisitor& v) const;
+
+	// VisitMemoryObjectInner() is internally called by VisitMemoryObject()
+	void VisitMemoryObjectInner(Type::ID id, Decorations d, uint32_t &index, uint32_t offset, const MemoryVisitor& v) const;
+
+	Object& CreateConstant(InsnIterator it);
+
+	void ProcessInterfaceVariable(Object &object);
+
+	// EmitState holds control-flow state for the emit() pass.
+	class EmitState
+	{
+	public:
+		EmitState(SpirvRoutine *routine,
+				Function::ID function,
+				RValue<SIMD::Int> activeLaneMask,
+				RValue<SIMD::Int> storesAndAtomicsMask,
+				const vk::DescriptorSet::Bindings &descriptorSets,
+				bool robustBufferAccess,
+				spv::ExecutionModel executionModel)
+			: routine(routine),
+			  function(function),
+			  activeLaneMaskValue(activeLaneMask.value),
+			  storesAndAtomicsMaskValue(storesAndAtomicsMask.value),
+			  descriptorSets(descriptorSets),
+			  robustBufferAccess(robustBufferAccess),
+			  executionModel(executionModel)
+		{
+			ASSERT(executionModelToStage(executionModel) != VkShaderStageFlagBits(0));  // Must parse OpEntryPoint before emitting.
+		}
+
+		RValue<SIMD::Int> activeLaneMask() const
+		{
+			ASSERT(activeLaneMaskValue != nullptr);
+			return RValue<SIMD::Int>(activeLaneMaskValue);
+		}
+
+		RValue<SIMD::Int> storesAndAtomicsMask() const
+		{
+			ASSERT(storesAndAtomicsMaskValue != nullptr);
+			return RValue<SIMD::Int>(storesAndAtomicsMaskValue);
+		}
+
+		void setActiveLaneMask(RValue<SIMD::Int> mask)
+		{
+			activeLaneMaskValue = mask.value;
+		}
+
+		// Add a new active lane mask edge from the current block to out.
+		// The edge mask value will be (mask AND activeLaneMaskValue).
+		// If multiple active lane masks are added for the same edge, then
+		// they will be ORed together.
+		void addOutputActiveLaneMaskEdge(Block::ID out, RValue<SIMD::Int> mask);
+
+		// Add a new active lane mask for the edge from -> to.
+		// If multiple active lane masks are added for the same edge, then
+		// they will be ORed together.
+		void addActiveLaneMaskEdge(Block::ID from, Block::ID to, RValue<SIMD::Int> mask);
+
+		SpirvRoutine *routine = nullptr; // The current routine being built.
+		Function::ID function; // The current function being built.
+		Block::ID block; // The current block being built.
+		rr::Value *activeLaneMaskValue = nullptr; // The current active lane mask.
+		rr::Value *storesAndAtomicsMaskValue = nullptr; // The current atomics mask.
+		Block::Set visited; // Blocks already built.
+		std::unordered_map<Block::Edge, RValue<SIMD::Int>, Block::Edge::Hash> edgeActiveLaneMasks;
+		std::deque<Block::ID> *pending;
+
+		const vk::DescriptorSet::Bindings &descriptorSets;
+
+		OutOfBoundsBehavior getOutOfBoundsBehavior(spv::StorageClass storageClass) const;
+
+		Intermediate& createIntermediate(Object::ID id, uint32_t size)
+		{
+			auto it = intermediates.emplace(std::piecewise_construct,
+					std::forward_as_tuple(id),
+					std::forward_as_tuple(size));
+			ASSERT_MSG(it.second, "Intermediate %d created twice", id.value());
+			return it.first->second;
+		}
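The std::piecewise_construct form above is needed because the mapped type is constructed in place inside the map node (Intermediate provides no copy or move construction). A minimal standalone sketch of the same pattern, using a hypothetical NonMovable type:

	#include <cstdint>
	#include <tuple>
	#include <unordered_map>
	#include <utility>

	struct NonMovable
	{
		explicit NonMovable(uint32_t size) : size(size) {}
		NonMovable(const NonMovable &) = delete;
		NonMovable &operator=(const NonMovable &) = delete;
		uint32_t size;
	};

	int main()
	{
		std::unordered_map<int, NonMovable> map;
		map.emplace(std::piecewise_construct,
		            std::forward_as_tuple(1),    // key constructor arguments
		            std::forward_as_tuple(16));  // mapped-value constructor arguments
	}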
+
+		Intermediate const& getIntermediate(Object::ID id) const
+		{
+			auto it = intermediates.find(id);
+			ASSERT_MSG(it != intermediates.end(), "Unknown intermediate %d", id.value());
+			return it->second;
+		}
+
+		void createPointer(Object::ID id, SIMD::Pointer ptr)
+		{
+			bool added = pointers.emplace(id, ptr).second;
+			ASSERT_MSG(added, "Pointer %d created twice", id.value());
+		}
+
+		SIMD::Pointer const& getPointer(Object::ID id) const
+		{
+			auto it = pointers.find(id);
+			ASSERT_MSG(it != pointers.end(), "Unknown pointer %d", id.value());
+			return it->second;
+		}
+
+	private:
+		std::unordered_map<Object::ID, Intermediate> intermediates;
+		std::unordered_map<Object::ID, SIMD::Pointer> pointers;
+
+		const bool robustBufferAccess = true;  // Emit robustBufferAccess safe code.
+		const spv::ExecutionModel executionModel = spv::ExecutionModelMax;
+	};
+
+	// EmitResult is an enumerator of result values from the Emit functions.
+	enum class EmitResult
+	{
+		Continue, // No termination instructions.
+		Terminator, // Reached a termination instruction.
+	};
+
+	// Generic wrapper over either a per-lane intermediate value or a constant.
+	// Constants are transparently widened to per-lane values in the
+	// per-component accessors (Float/Int/UInt below). This is appropriate in
+	// most cases, provided we're not going to do something significantly
+	// different based on whether the value is uniform across lanes.
+	class GenericValue
+	{
+		SpirvShader::Object const &obj;
+		Intermediate const *intermediate;
+
+	public:
+		GenericValue(SpirvShader const *shader, EmitState const *state, SpirvShader::Object::ID objId);
+
 		RValue<SIMD::Float> Float(uint32_t i) const
 		{
-			ASSERT(i < size);
-			ASSERT(scalar[i] != nullptr);
-			return As<SIMD::Float>(scalar[i]);  // TODO(b/128539387): RValue<SIMD::Float>(scalar)
+			if (intermediate)
+			{
+				return intermediate->Float(i);
+			}
+
+			// Constructing a constant SIMD::Float is not guaranteed to preserve the data's exact
+			// bit pattern, but SPIR-V provides 32-bit words representing "the bit pattern for the constant".
+			// Thus we must first construct an integer constant, and bitcast to float.
+			auto constantValue = reinterpret_cast<uint32_t *>(obj.constantValue.get());
+			return As<SIMD::Float>(SIMD::UInt(constantValue[i]));
 		}
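A scalar analogue of the bitcast above, written as a hypothetical standalone helper outside Reactor, copies the SPIR-V word's bytes instead of converting its value:

	#include <cstdint>
	#include <cstring>

	float bitsToFloat(uint32_t word)  // hypothetical helper, not part of this header
	{
		static_assert(sizeof(float) == sizeof(uint32_t), "expects 32-bit float");
		float f;
		std::memcpy(&f, &word, sizeof(f));  // preserves the exact bit pattern, including NaN payloads
		return f;
	}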
 
 		RValue<SIMD::Int> Int(uint32_t i) const
 		{
-			ASSERT(i < size);
-			ASSERT(scalar[i] != nullptr);
-			return As<SIMD::Int>(scalar[i]);  // TODO(b/128539387): RValue<SIMD::Int>(scalar)
+			if (intermediate)
+			{
+				return intermediate->Int(i);
+			}
+			auto constantValue = reinterpret_cast<int *>(obj.constantValue.get());
+			return SIMD::Int(constantValue[i]);
 		}
 
 		RValue<SIMD::UInt> UInt(uint32_t i) const
 		{
-			ASSERT(i < size);
-			ASSERT(scalar[i] != nullptr);
-			return As<SIMD::UInt>(scalar[i]);  // TODO(b/128539387): RValue<SIMD::UInt>(scalar)
+			if (intermediate)
+			{
+				return intermediate->UInt(i);
+			}
+			auto constantValue = reinterpret_cast<uint32_t *>(obj.constantValue.get());
+			return SIMD::UInt(constantValue[i]);
 		}
 
-		// No copy/move construction or assignment
-		Intermediate(Intermediate const &) = delete;
-		Intermediate(Intermediate &&) = delete;
-		Intermediate & operator=(Intermediate const &) = delete;
-		Intermediate & operator=(Intermediate &&) = delete;
-
-	private:
-		void emplace(uint32_t i, rr::Value *value)
-		{
-			ASSERT(i < size);
-			ASSERT(scalar[i] == nullptr);
-			scalar[i] = value;
-		}
-
-		rr::Value **const scalar;
-		uint32_t size;
+		SpirvShader::Type::ID const type;
 	};
 
-	class SpirvShader
+	Type const &getType(Type::ID id) const
 	{
-	public:
-		using InsnStore = std::vector<uint32_t>;
-		InsnStore insns;
+		auto it = types.find(id);
+		ASSERT_MSG(it != types.end(), "Unknown type %d", id.value());
+		return it->second;
+	}
 
-		using ImageSampler = void(void* texture, void *sampler, void* uvsIn, void* texelOut, void* constants);
-
-		enum class YieldResult
-		{
-			ControlBarrier,
-		};
-
-		/* Pseudo-iterator over SPIRV instructions, designed to support range-based-for. */
-		class InsnIterator
-		{
-			InsnStore::const_iterator iter;
-
-		public:
-			spv::Op opcode() const
-			{
-				return static_cast<spv::Op>(*iter & spv::OpCodeMask);
-			}
-
-			uint32_t wordCount() const
-			{
-				return *iter >> spv::WordCountShift;
-			}
-
-			uint32_t word(uint32_t n) const
-			{
-				ASSERT(n < wordCount());
-				return iter[n];
-			}
-
-			uint32_t const * wordPointer(uint32_t n) const
-			{
-				ASSERT(n < wordCount());
-				return &iter[n];
-			}
-
-			const char* string(uint32_t n) const
-			{
-				return reinterpret_cast<const char*>(wordPointer(n));
-			}
-
-			bool operator==(InsnIterator const &other) const
-			{
-				return iter == other.iter;
-			}
-
-			bool operator!=(InsnIterator const &other) const
-			{
-				return iter != other.iter;
-			}
-
-			InsnIterator operator*() const
-			{
-				return *this;
-			}
-
-			InsnIterator &operator++()
-			{
-				iter += wordCount();
-				return *this;
-			}
-
-			InsnIterator const operator++(int)
-			{
-				InsnIterator ret{*this};
-				iter += wordCount();
-				return ret;
-			}
-
-			InsnIterator(InsnIterator const &other) = default;
-
-			InsnIterator() = default;
-
-			explicit InsnIterator(InsnStore::const_iterator iter) : iter{iter}
-			{
-			}
-		};
-
-		/* range-based-for interface */
-		InsnIterator begin() const
-		{
-			return InsnIterator{insns.cbegin() + 5};
-		}
-
-		InsnIterator end() const
-		{
-			return InsnIterator{insns.cend()};
-		}
-
-		class Type
-		{
-		public:
-			using ID = SpirvID<Type>;
-
-			spv::Op opcode() const { return definition.opcode(); }
-
-			InsnIterator definition;
-			spv::StorageClass storageClass = static_cast<spv::StorageClass>(-1);
-			uint32_t sizeInComponents = 0;
-			bool isBuiltInBlock = false;
-
-			// Inner element type for pointers, arrays, vectors and matrices.
-			ID element;
-		};
-
-		class Object
-		{
-		public:
-			using ID = SpirvID<Object>;
-
-			spv::Op opcode() const { return definition.opcode(); }
-
-			InsnIterator definition;
-			Type::ID type;
-			std::unique_ptr<uint32_t[]> constantValue = nullptr;
-
-			enum class Kind
-			{
-				// Invalid default kind.
-				// If we get left with an object in this state, the module was
-				// broken.
-				Unknown,
-
-				// TODO: Better document this kind.
-				// A shader interface variable pointer.
-				// Pointer with uniform address across all lanes.
-				// Pointer held by SpirvRoutine::pointers
-				InterfaceVariable,
-
-				// Constant value held by Object::constantValue.
-				Constant,
-
-				// Value held by SpirvRoutine::intermediates.
-				Intermediate,
-
-				// Pointer held by SpirvRoutine::pointers
-				Pointer,
-
-				// A pointer to a vk::DescriptorSet*.
-				// Pointer held by SpirvRoutine::pointers.
-				DescriptorSet,
-			};
-
-			Kind kind = Kind::Unknown;
-		};
-
-		// Block is an interval of SPIR-V instructions, starting with the
-		// opening OpLabel, and ending with a termination instruction.
-		class Block
-		{
-		public:
-			using ID = SpirvID<Block>;
-			using Set = std::unordered_set<ID>;
-
-			// Edge represents the graph edge between two blocks.
-			struct Edge
-			{
-				ID from;
-				ID to;
-
-				bool operator == (const Edge& other) const { return from == other.from && to == other.to; }
-
-				struct Hash
-				{
-					std::size_t operator()(const Edge& edge) const noexcept
-					{
-						return std::hash<uint32_t>()(edge.from.value() * 31 + edge.to.value());
-					}
-				};
-			};
-
-			Block() = default;
-			Block(const Block& other) = default;
-			explicit Block(InsnIterator begin, InsnIterator end);
-
-			/* range-based-for interface */
-			inline InsnIterator begin() const { return begin_; }
-			inline InsnIterator end() const { return end_; }
-
-			enum Kind
-			{
-				Simple, // OpBranch or other simple terminator.
-				StructuredBranchConditional, // OpSelectionMerge + OpBranchConditional
-				UnstructuredBranchConditional, // OpBranchConditional
-				StructuredSwitch, // OpSelectionMerge + OpSwitch
-				UnstructuredSwitch, // OpSwitch
-				Loop, // OpLoopMerge + [OpBranchConditional | OpBranch]
-			};
-
-			Kind kind = Simple;
-			InsnIterator mergeInstruction; // Structured control flow merge instruction.
-			InsnIterator branchInstruction; // Branch instruction.
-			ID mergeBlock; // Structured flow merge block.
-			ID continueTarget; // Loop continue block.
-			Set ins; // Blocks that branch into this block.
-			Set outs; // Blocks that this block branches to.
-			bool isLoopMerge = false;
-		private:
-			InsnIterator begin_;
-			InsnIterator end_;
-		};
-
-		class Function
-		{
-		public:
-			using ID = SpirvID<Function>;
-
-			// Walks all reachable the blocks starting from id adding them to
-			// reachable.
-			void TraverseReachableBlocks(Block::ID id, Block::Set& reachable) const;
-
-			// AssignBlockFields() performs the following for all reachable blocks:
-			// * Assigns Block::ins with the identifiers of all blocks that contain
-			//   this block in their Block::outs.
-			// * Sets Block::isLoopMerge to true if the block is the merge of a
-			//   another loop block.
-			void AssignBlockFields();
-
-			// ForeachBlockDependency calls f with each dependency of the given
-			// block. A dependency is an incoming block that is not a loop-back
-			// edge.
-			void ForeachBlockDependency(Block::ID blockId, std::function<void(Block::ID)> f) const;
-
-			// ExistsPath returns true if there's a direct or indirect flow from
-			// the 'from' block to the 'to' block that does not pass through
-			// notPassingThrough.
-			bool ExistsPath(Block::ID from, Block::ID to, Block::ID notPassingThrough) const;
-
-			Block const &getBlock(Block::ID id) const
-			{
-				auto it = blocks.find(id);
-				ASSERT_MSG(it != blocks.end(), "Unknown block %d", id.value());
-				return it->second;
-			}
-
-			Block::ID entry; // function entry point block.
-			HandleMap<Block> blocks; // blocks belonging to this function.
-			Type::ID type; // type of the function.
-			Type::ID result; // return type.
-		};
-
-		struct TypeOrObject {}; // Dummy struct to represent a Type or Object.
-
-		// TypeOrObjectID is an identifier that represents a Type or an Object,
-		// and supports implicit casting to and from Type::ID or Object::ID.
-		class TypeOrObjectID : public SpirvID<TypeOrObject>
-		{
-		public:
-			using Hash = std::hash<SpirvID<TypeOrObject>>;
-
-			inline TypeOrObjectID(uint32_t id) : SpirvID(id) {}
-			inline TypeOrObjectID(Type::ID id) : SpirvID(id.value()) {}
-			inline TypeOrObjectID(Object::ID id) : SpirvID(id.value()) {}
-			inline operator Type::ID() const { return Type::ID(value()); }
-			inline operator Object::ID() const { return Object::ID(value()); }
-		};
-
-		// OpImageSample variants
-		enum Variant
-		{
-			None,  // No Dref or Proj. Also used by OpImageFetch and OpImageQueryLod.
-			Dref,
-			Proj,
-			ProjDref,
-			VARIANT_LAST = ProjDref
-		};
-
-		// Compact representation of image instruction parameters that is passed to the
-		// trampoline function for retrieving/generating the corresponding sampling routine.
-		struct ImageInstruction
-		{
-			ImageInstruction(Variant variant, SamplerMethod samplerMethod)
-				: parameters(0)
-			{
-				this->variant = variant;
-				this->samplerMethod = samplerMethod;
-			}
-
-			// Unmarshal from raw 32-bit data
-			ImageInstruction(uint32_t parameters) : parameters(parameters) {}
-
-			SamplerFunction getSamplerFunction() const
-			{
-				return { static_cast<SamplerMethod>(samplerMethod), offset != 0, sample != 0 };
-			}
-
-			bool isDref() const
-			{
-				return (variant == Dref) || (variant == ProjDref);
-			}
-
-			bool isProj() const
-			{
-				return (variant == Proj) || (variant == ProjDref);
-			}
-
-			union
-			{
-				struct
-				{
-					uint32_t variant : BITS(VARIANT_LAST);
-					uint32_t samplerMethod : BITS(SAMPLER_METHOD_LAST);
-					uint32_t gatherComponent : 2;
-
-					// Parameters are passed to the sampling routine in this order:
-					uint32_t coordinates : 3;       // 1-4 (does not contain projection component)
-				//	uint32_t dref : 1;              // Indicated by Variant::ProjDref|Dref
-				//	uint32_t lodOrBias : 1;         // Indicated by SamplerMethod::Lod|Bias|Fetch
-					uint32_t grad : 2;              // 0-3 components (for each of dx / dy)
-					uint32_t offset : 2;            // 0-3 components
-					uint32_t sample : 1;            // 0-1 scalar integer
-				};
-
-				uint32_t parameters;
-			};
-		};
-
-		static_assert(sizeof(ImageInstruction) == sizeof(uint32_t), "ImageInstruction must be 32-bit");
-
-		// This method is for retrieving an ID that uniquely identifies the
-		// shader entry point represented by this object.
-		uint64_t getSerialID() const
-		{
-			return  ((uint64_t)entryPoint.value() << 32) | codeSerialID;
-		}
-
-		SpirvShader(uint32_t codeSerialID,
-		            VkShaderStageFlagBits stage,
-		            const char *entryPointName,
-		            InsnStore const &insns,
-		            const vk::RenderPass *renderPass,
-		            uint32_t subpassIndex,
-		            bool robustBufferAccess);
-
-		struct Modes
-		{
-			bool EarlyFragmentTests : 1;
-			bool DepthReplacing : 1;
-			bool DepthGreater : 1;
-			bool DepthLess : 1;
-			bool DepthUnchanged : 1;
-			bool ContainsKill : 1;
-			bool ContainsControlBarriers : 1;
-			bool NeedsCentroid : 1;
-
-			// Compute workgroup dimensions
-			int WorkgroupSizeX = 1, WorkgroupSizeY = 1, WorkgroupSizeZ = 1;
-		};
-
-		Modes const &getModes() const
-		{
-			return modes;
-		}
-
-		struct Capabilities
-		{
-			bool Matrix : 1;
-			bool Shader : 1;
-			bool ClipDistance : 1;
-			bool CullDistance : 1;
-			bool InputAttachment : 1;
-			bool Sampled1D : 1;
-			bool Image1D : 1;
-			bool ImageCubeArray : 1;
-			bool SampledBuffer : 1;
-			bool SampledCubeArray : 1;
-			bool ImageBuffer : 1;
-			bool StorageImageExtendedFormats : 1;
-			bool ImageQuery : 1;
-			bool DerivativeControl : 1;
-			bool GroupNonUniform : 1;
-			bool GroupNonUniformVote : 1;
-			bool GroupNonUniformBallot : 1;
-			bool GroupNonUniformShuffle : 1;
-			bool GroupNonUniformShuffleRelative : 1;
-			bool GroupNonUniformArithmetic : 1;
-			bool DeviceGroup : 1;
-			bool MultiView : 1;
-		};
-
-		Capabilities const &getUsedCapabilities() const
-		{
-			return capabilities;
-		}
-
-		// getNumOutputClipDistances() returns the number of ClipDistances
-		// outputted by this shader.
-		unsigned int getNumOutputClipDistances() const
-		{
-			if (getUsedCapabilities().ClipDistance)
-			{
-				auto it = outputBuiltins.find(spv::BuiltInClipDistance);
-				if(it != outputBuiltins.end())
-				{
-					return it->second.SizeInComponents;
-				}
-			}
-			return 0;
-		}
-
-		// getNumOutputCullDistances() returns the number of CullDistances
-		// outputted by this shader.
-		unsigned int getNumOutputCullDistances() const
-		{
-			if (getUsedCapabilities().CullDistance)
-			{
-				auto it = outputBuiltins.find(spv::BuiltInCullDistance);
-				if(it != outputBuiltins.end())
-				{
-					return it->second.SizeInComponents;
-				}
-			}
-			return 0;
-		}
-
-		enum AttribType : unsigned char
-		{
-			ATTRIBTYPE_FLOAT,
-			ATTRIBTYPE_INT,
-			ATTRIBTYPE_UINT,
-			ATTRIBTYPE_UNUSED,
-
-			ATTRIBTYPE_LAST = ATTRIBTYPE_UINT
-		};
-
-		bool hasBuiltinInput(spv::BuiltIn b) const
-		{
-			return inputBuiltins.find(b) != inputBuiltins.end();
-		}
-
-		bool hasBuiltinOutput(spv::BuiltIn b) const
-		{
-			return outputBuiltins.find(b) != outputBuiltins.end();
-		}
-
-		struct Decorations
-		{
-			int32_t Location = -1;
-			int32_t Component = 0;
-			spv::BuiltIn BuiltIn = static_cast<spv::BuiltIn>(-1);
-			int32_t Offset = -1;
-			int32_t ArrayStride = -1;
-			int32_t MatrixStride = 1;
-
-			bool HasLocation : 1;
-			bool HasComponent : 1;
-			bool HasBuiltIn : 1;
-			bool HasOffset : 1;
-			bool HasArrayStride : 1;
-			bool HasMatrixStride : 1;
-			bool HasRowMajor : 1;		// whether RowMajor bit is valid.
-
-			bool Flat : 1;
-			bool Centroid : 1;
-			bool NoPerspective : 1;
-			bool Block : 1;
-			bool BufferBlock : 1;
-			bool RelaxedPrecision : 1;
-			bool RowMajor : 1;			// RowMajor if true; ColMajor if false
-			bool InsideMatrix : 1;		// pseudo-decoration for whether we're inside a matrix.
-
-			Decorations()
-					: Location{-1}, Component{0},
-					  BuiltIn{static_cast<spv::BuiltIn>(-1)},
-					  Offset{-1}, ArrayStride{-1}, MatrixStride{-1},
-					  HasLocation{false}, HasComponent{false},
-					  HasBuiltIn{false}, HasOffset{false},
-					  HasArrayStride{false}, HasMatrixStride{false},
-					  HasRowMajor{false},
-					  Flat{false}, Centroid{false}, NoPerspective{false},
-					  Block{false}, BufferBlock{false},
-					  RelaxedPrecision{false}, RowMajor{false},
-					  InsideMatrix{false}
-			{
-			}
-
-			Decorations(Decorations const &) = default;
-
-			void Apply(Decorations const &src);
-
-			void Apply(spv::Decoration decoration, uint32_t arg);
-		};
-
-		std::unordered_map<TypeOrObjectID, Decorations, TypeOrObjectID::Hash> decorations;
-		std::unordered_map<Type::ID, std::vector<Decorations>> memberDecorations;
-
-		struct DescriptorDecorations
-		{
-			int32_t DescriptorSet = -1;
-			int32_t Binding = -1;
-			int32_t InputAttachmentIndex = -1;
-
-			void Apply(DescriptorDecorations const &src);
-		};
-
-		std::unordered_map<Object::ID, DescriptorDecorations> descriptorDecorations;
-		std::vector<VkFormat> inputAttachmentFormats;
-
-		struct InterfaceComponent
-		{
-			AttribType Type;
-
-			union
-			{
-				struct
-				{
-					bool Flat : 1;
-					bool Centroid : 1;
-					bool NoPerspective : 1;
-				};
-
-				uint8_t DecorationBits;
-			};
-
-			InterfaceComponent()
-				: Type{ATTRIBTYPE_UNUSED}, DecorationBits{0}
-			{
-			}
-		};
-
-		struct BuiltinMapping
-		{
-			Object::ID Id;
-			uint32_t FirstComponent;
-			uint32_t SizeInComponents;
-		};
-
-		struct WorkgroupMemory
-		{
-			// allocates a new variable of size bytes with the given identifier.
-			inline void allocate(Object::ID id, uint32_t size)
-			{
-				uint32_t offset = totalSize;
-				auto it = offsets.emplace(id, offset);
-				ASSERT_MSG(it.second, "WorkgroupMemory already has an allocation for object %d", int(id.value()));
-				totalSize += size;
-			}
-			// returns the byte offset of the variable with the given identifier.
-			inline uint32_t offsetOf(Object::ID id) const
-			{
-				auto it = offsets.find(id);
-				ASSERT_MSG(it != offsets.end(), "WorkgroupMemory has no allocation for object %d", int(id.value()));
-				return it->second;
-			}
-			// returns the total allocated size in bytes.
-			inline uint32_t size() const { return totalSize; }
-		private:
-			uint32_t totalSize = 0; // in bytes
-			std::unordered_map<Object::ID, uint32_t> offsets; // in bytes
-		};
-
-		std::vector<InterfaceComponent> inputs;
-		std::vector<InterfaceComponent> outputs;
-
-		void emitProlog(SpirvRoutine *routine) const;
-		void emit(SpirvRoutine *routine, RValue<SIMD::Int> const &activeLaneMask, RValue<SIMD::Int> const &storesAndAtomicsMask, const vk::DescriptorSet::Bindings &descriptorSets) const;
-		void emitEpilog(SpirvRoutine *routine) const;
-
-		using BuiltInHash = std::hash<std::underlying_type<spv::BuiltIn>::type>;
-		std::unordered_map<spv::BuiltIn, BuiltinMapping, BuiltInHash> inputBuiltins;
-		std::unordered_map<spv::BuiltIn, BuiltinMapping, BuiltInHash> outputBuiltins;
-		WorkgroupMemory workgroupMemory;
-
-	private:
-		const uint32_t codeSerialID;
-		Modes modes = {};
-		Capabilities capabilities = {};
-		HandleMap<Type> types;
-		HandleMap<Object> defs;
-		HandleMap<Function> functions;
-		Function::ID entryPoint;
-
-		const bool robustBufferAccess = true;
-		spv::ExecutionModel executionModel = spv::ExecutionModelMax; // Invalid prior to OpEntryPoint parsing.
-
-		// DeclareType creates a Type for the given OpTypeX instruction, storing
-		// it into the types map. It is called from the analysis pass (constructor).
-		void DeclareType(InsnIterator insn);
-
-		void ProcessExecutionMode(InsnIterator it);
-
-		uint32_t ComputeTypeSize(InsnIterator insn);
-		void ApplyDecorationsForId(Decorations *d, TypeOrObjectID id) const;
-		void ApplyDecorationsForIdMember(Decorations *d, Type::ID id, uint32_t member) const;
-		void ApplyDecorationsForAccessChain(Decorations *d, DescriptorDecorations *dd, Object::ID baseId, uint32_t numIndexes, uint32_t const *indexIds) const;
-
-		// Creates an Object for the instruction's result in 'defs'.
-		void DefineResult(const InsnIterator &insn);
-
-		// Returns true if data in the given storage class is word-interleaved
-		// by each SIMD vector lane, otherwise data is stored linerally.
-		//
-		// Each lane addresses a single word, picked by a base pointer and an
-		// integer offset.
-		//
-		// A word is currently 32 bits (single float, int32_t, uint32_t).
-		// A lane is a single element of a SIMD vector register.
-		//
-		// Storage interleaved by lane - (IsStorageInterleavedByLane() == true):
-		// ---------------------------------------------------------------------
-		//
-		// Address = PtrBase + sizeof(Word) * (SIMD::Width * LaneOffset + LaneIndex)
-		//
-		// Assuming SIMD::Width == 4:
-		//
-		//                   Lane[0]  |  Lane[1]  |  Lane[2]  |  Lane[3]
-		//                 ===========+===========+===========+==========
-		//  LaneOffset=0: |  Word[0]  |  Word[1]  |  Word[2]  |  Word[3]
-		// ---------------+-----------+-----------+-----------+----------
-		//  LaneOffset=1: |  Word[4]  |  Word[5]  |  Word[6]  |  Word[7]
-		// ---------------+-----------+-----------+-----------+----------
-		//  LaneOffset=2: |  Word[8]  |  Word[9]  |  Word[a]  |  Word[b]
-		// ---------------+-----------+-----------+-----------+----------
-		//  LaneOffset=3: |  Word[c]  |  Word[d]  |  Word[e]  |  Word[f]
-		//
-		//
-		// Linear storage - (IsStorageInterleavedByLane() == false):
-		// ---------------------------------------------------------
-		//
-		// Address = PtrBase + sizeof(Word) * LaneOffset
-		//
-		//                   Lane[0]  |  Lane[1]  |  Lane[2]  |  Lane[3]
-		//                 ===========+===========+===========+==========
-		//  LaneOffset=0: |  Word[0]  |  Word[0]  |  Word[0]  |  Word[0]
-		// ---------------+-----------+-----------+-----------+----------
-		//  LaneOffset=1: |  Word[1]  |  Word[1]  |  Word[1]  |  Word[1]
-		// ---------------+-----------+-----------+-----------+----------
-		//  LaneOffset=2: |  Word[2]  |  Word[2]  |  Word[2]  |  Word[2]
-		// ---------------+-----------+-----------+-----------+----------
-		//  LaneOffset=3: |  Word[3]  |  Word[3]  |  Word[3]  |  Word[3]
-		//
-		static bool IsStorageInterleavedByLane(spv::StorageClass storageClass);
-		static bool IsExplicitLayout(spv::StorageClass storageClass);
-
-		static sw::SIMD::Pointer InterleaveByLane(sw::SIMD::Pointer p);
-
-		// Output storage buffers and images should not be affected by helper invocations
-		static bool StoresInHelperInvocation(spv::StorageClass storageClass);
-
-		using InterfaceVisitor = std::function<void(Decorations const, AttribType)>;
-
-		void VisitInterface(Object::ID id, const InterfaceVisitor& v) const;
-
-		int VisitInterfaceInner(Type::ID id, Decorations d, const InterfaceVisitor& v) const;
-
-		// MemoryElement describes a scalar element within a structure, and is
-		// used by the callback function of VisitMemoryObject().
-		struct MemoryElement
-		{
-			uint32_t index;   // index of the scalar element
-			uint32_t offset;  // offset (in bytes) from the base of the object
-			const Type& type; // element type
-		};
-
-		using MemoryVisitor = std::function<void(const MemoryElement&)>;
-
-		// VisitMemoryObject() walks a type tree in an explicitly laid out
-		// storage class, calling the MemoryVisitor for each scalar element
-		// within the
-		void VisitMemoryObject(Object::ID id, const MemoryVisitor& v) const;
-
-		// VisitMemoryObjectInner() is internally called by VisitMemoryObject()
-		void VisitMemoryObjectInner(Type::ID id, Decorations d, uint32_t &index, uint32_t offset, const MemoryVisitor& v) const;
-
-		Object& CreateConstant(InsnIterator it);
-
-		void ProcessInterfaceVariable(Object &object);
-
-		// EmitState holds control-flow state for the emit() pass.
-		class EmitState
-		{
-		public:
-			EmitState(SpirvRoutine *routine,
-					Function::ID function,
-					RValue<SIMD::Int> activeLaneMask,
-					RValue<SIMD::Int> storesAndAtomicsMask,
-					const vk::DescriptorSet::Bindings &descriptorSets,
-					bool robustBufferAccess,
-					spv::ExecutionModel executionModel)
-				: routine(routine),
-				  function(function),
-				  activeLaneMaskValue(activeLaneMask.value),
-				  storesAndAtomicsMaskValue(storesAndAtomicsMask.value),
-				  descriptorSets(descriptorSets),
-				  robustBufferAccess(robustBufferAccess),
-				  executionModel(executionModel)
-			{
-				ASSERT(executionModelToStage(executionModel) != VkShaderStageFlagBits(0));  // Must parse OpEntryPoint before emitting.
-			}
-
-			RValue<SIMD::Int> activeLaneMask() const
-			{
-				ASSERT(activeLaneMaskValue != nullptr);
-				return RValue<SIMD::Int>(activeLaneMaskValue);
-			}
-
-			RValue<SIMD::Int> storesAndAtomicsMask() const
-			{
-				ASSERT(storesAndAtomicsMaskValue != nullptr);
-				return RValue<SIMD::Int>(storesAndAtomicsMaskValue);
-			}
-
-			void setActiveLaneMask(RValue<SIMD::Int> mask)
-			{
-				activeLaneMaskValue = mask.value;
-			}
-
-			// Add a new active lane mask edge from the current block to out.
-			// The edge mask value will be (mask AND activeLaneMaskValue).
-			// If multiple active lane masks are added for the same edge, then
-			// they will be ORed together.
-			void addOutputActiveLaneMaskEdge(Block::ID out, RValue<SIMD::Int> mask);
-
-			// Add a new active lane mask for the edge from -> to.
-			// If multiple active lane masks are added for the same edge, then
-			// they will be ORed together.
-			void addActiveLaneMaskEdge(Block::ID from, Block::ID to, RValue<SIMD::Int> mask);
-
-			SpirvRoutine *routine = nullptr; // The current routine being built.
-			Function::ID function; // The current function being built.
-			Block::ID block; // The current block being built.
-			rr::Value *activeLaneMaskValue = nullptr; // The current active lane mask.
-			rr::Value *storesAndAtomicsMaskValue = nullptr; // The current atomics mask.
-			Block::Set visited; // Blocks already built.
-			std::unordered_map<Block::Edge, RValue<SIMD::Int>, Block::Edge::Hash> edgeActiveLaneMasks;
-			std::deque<Block::ID> *pending;
-
-			const vk::DescriptorSet::Bindings &descriptorSets;
-
-			OutOfBoundsBehavior getOutOfBoundsBehavior(spv::StorageClass storageClass) const;
-
-			Intermediate& createIntermediate(Object::ID id, uint32_t size)
-			{
-				auto it = intermediates.emplace(std::piecewise_construct,
-						std::forward_as_tuple(id),
-						std::forward_as_tuple(size));
-				ASSERT_MSG(it.second, "Intermediate %d created twice", id.value());
-				return it.first->second;
-			}
-
-			Intermediate const& getIntermediate(Object::ID id) const
-			{
-				auto it = intermediates.find(id);
-				ASSERT_MSG(it != intermediates.end(), "Unknown intermediate %d", id.value());
-				return it->second;
-			}
-
-			void createPointer(Object::ID id, SIMD::Pointer ptr)
-			{
-				bool added = pointers.emplace(id, ptr).second;
-				ASSERT_MSG(added, "Pointer %d created twice", id.value());
-			}
-
-			SIMD::Pointer const& getPointer(Object::ID id) const
-			{
-				auto it = pointers.find(id);
-				ASSERT_MSG(it != pointers.end(), "Unknown pointer %d", id.value());
-				return it->second;
-			}
-
-		private:
-			std::unordered_map<Object::ID, Intermediate> intermediates;
-			std::unordered_map<Object::ID, SIMD::Pointer> pointers;
-
-			const bool robustBufferAccess = true;  // Emit robustBufferAccess safe code.
-			const spv::ExecutionModel executionModel = spv::ExecutionModelMax;
-		};
-
-		// EmitResult is an enumerator of result values from the Emit functions.
-		enum class EmitResult
-		{
-			Continue, // No termination instructions.
-			Terminator, // Reached a termination instruction.
-		};
-
-		// Generic wrapper over either per-lane intermediate value, or a constant.
-		// Constants are transparently widened to per-lane values in operator[].
-		// This is appropriate in most cases -- if we're not going to do something
-		// significantly different based on whether the value is uniform across lanes.
-		class GenericValue
-		{
-			SpirvShader::Object const &obj;
-			Intermediate const *intermediate;
-
-		public:
-			GenericValue(SpirvShader const *shader, EmitState const *state, SpirvShader::Object::ID objId);
-
-			RValue<SIMD::Float> Float(uint32_t i) const
-			{
-				if (intermediate)
-				{
-					return intermediate->Float(i);
-				}
-
-				// Constructing a constant SIMD::Float is not guaranteed to preserve the data's exact
-				// bit pattern, but SPIR-V provides 32-bit words representing "the bit pattern for the constant".
-				// Thus we must first construct an integer constant, and bitcast to float.
-				auto constantValue = reinterpret_cast<uint32_t *>(obj.constantValue.get());
-				return As<SIMD::Float>(SIMD::UInt(constantValue[i]));
-			}
-
-			RValue<SIMD::Int> Int(uint32_t i) const
-			{
-				if (intermediate)
-				{
-					return intermediate->Int(i);
-				}
-				auto constantValue = reinterpret_cast<int *>(obj.constantValue.get());
-				return SIMD::Int(constantValue[i]);
-			}
-
-			RValue<SIMD::UInt> UInt(uint32_t i) const
-			{
-				if (intermediate)
-				{
-					return intermediate->UInt(i);
-				}
-				auto constantValue = reinterpret_cast<uint32_t *>(obj.constantValue.get());
-				return SIMD::UInt(constantValue[i]);
-			}
-
-			SpirvShader::Type::ID const type;
-		};
-
-		Type const &getType(Type::ID id) const
-		{
-			auto it = types.find(id);
-			ASSERT_MSG(it != types.end(), "Unknown type %d", id.value());
-			return it->second;
-		}
-
-		Object const &getObject(Object::ID id) const
-		{
-			auto it = defs.find(id);
-			ASSERT_MSG(it != defs.end(), "Unknown object %d", id.value());
-			return it->second;
-		}
-
-		Function const &getFunction(Function::ID id) const
-		{
-			auto it = functions.find(id);
-			ASSERT_MSG(it != functions.end(), "Unknown function %d", id.value());
-			return it->second;
-		}
-
-		// Returns a SIMD::Pointer to the underlying data for the given pointer
-		// object.
-		// Handles objects of the following kinds:
-		//  • DescriptorSet
-		//  • DivergentPointer
-		//  • InterfaceVariable
-		//  • NonDivergentPointer
-		// Calling GetPointerToData with objects of any other kind will assert.
-		SIMD::Pointer GetPointerToData(Object::ID id, int arrayIndex, EmitState const *state) const;
-
-		SIMD::Pointer WalkExplicitLayoutAccessChain(Object::ID id, uint32_t numIndexes, uint32_t const *indexIds, EmitState const *state) const;
-		SIMD::Pointer WalkAccessChain(Object::ID id, uint32_t numIndexes, uint32_t const *indexIds, EmitState const *state) const;
-
-		// Returns the *component* offset in the literal for the given access chain.
-		uint32_t WalkLiteralAccessChain(Type::ID id, uint32_t numIndexes, uint32_t const *indexes) const;
-
-		// Lookup the active lane mask for the edge from -> to.
-		// If from is unreachable, then a mask of all zeros is returned.
-		// Asserts if from is reachable and the edge does not exist.
-		RValue<SIMD::Int> GetActiveLaneMaskEdge(EmitState *state, Block::ID from, Block::ID to) const;
-
-		// Emit all the unvisited blocks (except for ignore) in DFS order,
-		// starting with id.
-		void EmitBlocks(Block::ID id, EmitState *state, Block::ID ignore = 0) const;
-		void EmitNonLoop(EmitState *state) const;
-		void EmitLoop(EmitState *state) const;
-
-		void EmitInstructions(InsnIterator begin, InsnIterator end, EmitState *state) const;
-		EmitResult EmitInstruction(InsnIterator insn, EmitState *state) const;
-
-		// Emit pass instructions:
-		EmitResult EmitVariable(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitLoad(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitStore(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitAccessChain(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitCompositeConstruct(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitCompositeInsert(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitCompositeExtract(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitVectorShuffle(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitVectorTimesScalar(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitMatrixTimesVector(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitVectorTimesMatrix(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitMatrixTimesMatrix(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitOuterProduct(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitTranspose(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitVectorExtractDynamic(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitVectorInsertDynamic(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitUnaryOp(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitBinaryOp(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitDot(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitSelect(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitExtendedInstruction(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitAny(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitAll(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitBranch(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitBranchConditional(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitSwitch(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitUnreachable(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitReturn(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitKill(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitFunctionCall(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitPhi(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitImageSampleImplicitLod(Variant variant, InsnIterator insn, EmitState *state) const;
-		EmitResult EmitImageSampleExplicitLod(Variant variant, InsnIterator insn, EmitState *state) const;
-		EmitResult EmitImageGather(Variant variant, InsnIterator insn, EmitState *state) const;
-		EmitResult EmitImageFetch(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitImageSample(ImageInstruction instruction, InsnIterator insn, EmitState *state) const;
-		EmitResult EmitImageQuerySizeLod(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitImageQuerySize(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitImageQueryLod(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitImageQueryLevels(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitImageQuerySamples(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitImageRead(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitImageWrite(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitImageTexelPointer(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitAtomicOp(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitAtomicCompareExchange(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitSampledImageCombineOrSplit(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitCopyObject(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitCopyMemory(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitControlBarrier(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitMemoryBarrier(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitGroupNonUniform(InsnIterator insn, EmitState *state) const;
-		EmitResult EmitArrayLength(InsnIterator insn, EmitState *state) const;
-
-		void GetImageDimensions(EmitState const *state, Type const &resultTy, Object::ID imageId, Object::ID lodId, Intermediate &dst) const;
-		SIMD::Pointer GetTexelAddress(EmitState const *state, SIMD::Pointer base, GenericValue const & coordinate, Type const & imageType, Pointer<Byte> descriptor, int texelSize, Object::ID sampleId, bool useStencilAspect) const;
-		uint32_t GetConstScalarInt(Object::ID id) const;
-		void EvalSpecConstantOp(InsnIterator insn);
-		void EvalSpecConstantUnaryOp(InsnIterator insn);
-		void EvalSpecConstantBinaryOp(InsnIterator insn);
-
-		// LoadPhi loads the phi values from the alloca storage and places the
-		// load values into the intermediate with the phi's result id.
-		void LoadPhi(InsnIterator insn, EmitState *state) const;
-
-		// StorePhi updates the phi's alloca storage value using the incoming
-		// values from blocks that are both in the OpPhi instruction and in
-		// filter.
-		void StorePhi(Block::ID blockID, InsnIterator insn, EmitState *state, std::unordered_set<SpirvShader::Block::ID> const& filter) const;
-
-		// Emits a rr::Fence for the given MemorySemanticsMask.
-		void Fence(spv::MemorySemanticsMask semantics) const;
-
-		// Helper for calling rr::Yield with res cast to an rr::Int.
-		void Yield(YieldResult res) const;
-
-		// OpcodeName() returns the name of the opcode op.
-		// If NDEBUG is defined, then OpcodeName() will only return the numerical code.
-		static std::string OpcodeName(spv::Op op);
-		static std::memory_order MemoryOrder(spv::MemorySemanticsMask memorySemantics);
-
-		// Helper as we often need to take dot products as part of doing other things.
-		SIMD::Float Dot(unsigned numComponents, GenericValue const & x, GenericValue const & y) const;
-
-		SIMD::UInt FloatToHalfBits(SIMD::UInt floatBits, bool storeInUpperBits) const;
-
-		// Splits x into a floating-point significand in the range [0.5, 1.0)
-		// and an integral exponent of two, such that:
-		//   x = significand * 2^exponent
-		// Returns the pair <significand, exponent>
-		std::pair<SIMD::Float, SIMD::Int> Frexp(RValue<SIMD::Float> val) const;
-
-		static ImageSampler *getImageSampler(uint32_t instruction, vk::SampledImageDescriptor const *imageDescriptor, const vk::Sampler *sampler);
-		static std::shared_ptr<rr::Routine> emitSamplerRoutine(ImageInstruction instruction, const Sampler &samplerState);
-
-		// TODO(b/129523279): Eliminate conversion and use vk::Sampler members directly.
-		static sw::FilterType convertFilterMode(const vk::Sampler *sampler);
-		static sw::MipmapType convertMipmapMode(const vk::Sampler *sampler);
-		static sw::AddressingMode convertAddressingMode(int coordinateIndex, const vk::Sampler *sampler, VkImageViewType imageViewType);
-
-		// Returns 0 when invalid.
-		static VkShaderStageFlagBits executionModelToStage(spv::ExecutionModel model);
-
-		struct GroupOps;
-	};
-
-	class SpirvRoutine
+	Object const &getObject(Object::ID id) const
 	{
-	public:
-		SpirvRoutine(vk::PipelineLayout const *pipelineLayout);
+		auto it = defs.find(id);
+		ASSERT_MSG(it != defs.end(), "Unknown object %d", id.value());
+		return it->second;
+	}
 
-		using Variable = Array<SIMD::Float>;
+	Function const &getFunction(Function::ID id) const
+	{
+		auto it = functions.find(id);
+		ASSERT_MSG(it != functions.end(), "Unknown function %d", id.value());
+		return it->second;
+	}
 
-		struct SamplerCache
-		{
-			Pointer<Byte> imageDescriptor = nullptr;
-			Pointer<Byte> sampler;
-			Pointer<Byte> function;
-		};
+	// Returns a SIMD::Pointer to the underlying data for the given pointer
+	// object.
+	// Handles objects of the following kinds:
+	//  • DescriptorSet
+	//  • InterfaceVariable
+	//  • Pointer
+	// Calling GetPointerToData with objects of any other kind will assert.
+	SIMD::Pointer GetPointerToData(Object::ID id, int arrayIndex, EmitState const *state) const;
 
-		vk::PipelineLayout const * const pipelineLayout;
+	SIMD::Pointer WalkExplicitLayoutAccessChain(Object::ID id, uint32_t numIndexes, uint32_t const *indexIds, EmitState const *state) const;
+	SIMD::Pointer WalkAccessChain(Object::ID id, uint32_t numIndexes, uint32_t const *indexIds, EmitState const *state) const;
 
-		std::unordered_map<SpirvShader::Object::ID, Variable> variables;
-		std::unordered_map<SpirvShader::Object::ID, SamplerCache> samplerCache;
-		Variable inputs = Variable{MAX_INTERFACE_COMPONENTS};
-		Variable outputs = Variable{MAX_INTERFACE_COMPONENTS};
+	// Returns the *component* offset in the literal for the given access chain.
+	uint32_t WalkLiteralAccessChain(Type::ID id, uint32_t numIndexes, uint32_t const *indexes) const;
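For example (illustrative only), given a struct type { vec3 a; vec4 b; }, the access chain indices {1, 2} select b.z: member a spans components 0 through 2, so member b starts at component 3, and the returned component offset is 3 + 2 = 5.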
 
-		Pointer<Byte> workgroupMemory;
-		Pointer<Pointer<Byte>> descriptorSets;
-		Pointer<Int> descriptorDynamicOffsets;
-		Pointer<Byte> pushConstants;
-		Pointer<Byte> constants;
-		Int killMask = Int{0};
-		SIMD::Int windowSpacePosition[2];
-		Int viewID;	// slice offset into input attachments for multiview, even if the shader doesn't use ViewIndex
+	// Lookup the active lane mask for the edge from -> to.
+	// If from is unreachable, then a mask of all zeros is returned.
+	// Asserts if from is reachable and the edge does not exist.
+	RValue<SIMD::Int> GetActiveLaneMaskEdge(EmitState *state, Block::ID from, Block::ID to) const;
 
-		void createVariable(SpirvShader::Object::ID id, uint32_t size)
-		{
-			bool added = variables.emplace(id, Variable(size)).second;
-			ASSERT_MSG(added, "Variable %d created twice", id.value());
-		}
+	// Emit all the unvisited blocks (except for ignore) in DFS order,
+	// starting with id.
+	void EmitBlocks(Block::ID id, EmitState *state, Block::ID ignore = 0) const;
+	void EmitNonLoop(EmitState *state) const;
+	void EmitLoop(EmitState *state) const;
 
-		Variable& getVariable(SpirvShader::Object::ID id)
-		{
-			auto it = variables.find(id);
-			ASSERT_MSG(it != variables.end(), "Unknown variables %d", id.value());
-			return it->second;
-		}
+	void EmitInstructions(InsnIterator begin, InsnIterator end, EmitState *state) const;
+	EmitResult EmitInstruction(InsnIterator insn, EmitState *state) const;
 
-		// setImmutableInputBuiltins() sets all the immutable input builtins,
-		// common for all shader types.
-		void setImmutableInputBuiltins(SpirvShader const *shader);
+	// Emit pass instructions:
+	EmitResult EmitVariable(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitLoad(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitStore(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitAccessChain(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitCompositeConstruct(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitCompositeInsert(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitCompositeExtract(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitVectorShuffle(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitVectorTimesScalar(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitMatrixTimesVector(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitVectorTimesMatrix(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitMatrixTimesMatrix(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitOuterProduct(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitTranspose(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitVectorExtractDynamic(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitVectorInsertDynamic(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitUnaryOp(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitBinaryOp(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitDot(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitSelect(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitExtendedInstruction(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitAny(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitAll(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitBranch(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitBranchConditional(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitSwitch(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitUnreachable(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitReturn(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitKill(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitFunctionCall(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitPhi(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitImageSampleImplicitLod(Variant variant, InsnIterator insn, EmitState *state) const;
+	EmitResult EmitImageSampleExplicitLod(Variant variant, InsnIterator insn, EmitState *state) const;
+	EmitResult EmitImageGather(Variant variant, InsnIterator insn, EmitState *state) const;
+	EmitResult EmitImageFetch(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitImageSample(ImageInstruction instruction, InsnIterator insn, EmitState *state) const;
+	EmitResult EmitImageQuerySizeLod(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitImageQuerySize(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitImageQueryLod(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitImageQueryLevels(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitImageQuerySamples(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitImageRead(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitImageWrite(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitImageTexelPointer(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitAtomicOp(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitAtomicCompareExchange(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitSampledImageCombineOrSplit(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitCopyObject(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitCopyMemory(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitControlBarrier(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitMemoryBarrier(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitGroupNonUniform(InsnIterator insn, EmitState *state) const;
+	EmitResult EmitArrayLength(InsnIterator insn, EmitState *state) const;
 
-		// setInputBuiltin() calls f() with the builtin and value if the shader
-		// uses the input builtin, otherwise the call is a no-op.
-		// F is a function with the signature:
-		// void(const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		template <typename F>
-		inline void setInputBuiltin(SpirvShader const *shader, spv::BuiltIn id, F&& f)
-		{
-			auto it = shader->inputBuiltins.find(id);
-			if (it != shader->inputBuiltins.end())
-			{
-				const auto& builtin = it->second;
-				f(builtin, getVariable(builtin.Id));
-			}
-		}
+	void GetImageDimensions(EmitState const *state, Type const &resultTy, Object::ID imageId, Object::ID lodId, Intermediate &dst) const;
+	SIMD::Pointer GetTexelAddress(EmitState const *state, SIMD::Pointer base, GenericValue const & coordinate, Type const & imageType, Pointer<Byte> descriptor, int texelSize, Object::ID sampleId, bool useStencilAspect) const;
+	uint32_t GetConstScalarInt(Object::ID id) const;
+	void EvalSpecConstantOp(InsnIterator insn);
+	void EvalSpecConstantUnaryOp(InsnIterator insn);
+	void EvalSpecConstantBinaryOp(InsnIterator insn);
 
-	private:
-		// The phis are only accessible to SpirvShader as they are only used and
-		// exist between calls to SpirvShader::emitProlog() and
-		// SpirvShader::emitEpilog().
-		friend class SpirvShader;
+	// LoadPhi loads the phi values from the alloca storage and places the
+	// load values into the intermediate with the phi's result id.
+	void LoadPhi(InsnIterator insn, EmitState *state) const;
 
-		std::unordered_map<SpirvShader::Object::ID, Variable> phis;
+	// StorePhi updates the phi's alloca storage value using the incoming
+	// values from blocks that are both in the OpPhi instruction and in
+	// filter.
+	void StorePhi(Block::ID blockID, InsnIterator insn, EmitState *state, std::unordered_set<SpirvShader::Block::ID> const& filter) const;
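For context, an OpPhi instruction names one incoming value per predecessor block, for instance:

	%x = OpPhi %v4float %a %blockA %b %blockB

Under this reading, StorePhi() folds in the incoming values (%a or %b) for whichever predecessor blocks appear in filter, presumably blending them with the corresponding per-edge active lane masks, and LoadPhi() later makes the accumulated storage visible as the intermediate for %x.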
 
+	// Emits a rr::Fence for the given MemorySemanticsMask.
+	void Fence(spv::MemorySemanticsMask semantics) const;
+
+	// Helper for calling rr::Yield with res cast to an rr::Int.
+	void Yield(YieldResult res) const;
+
+	// OpcodeName() returns the name of the opcode op.
+	// If NDEBUG is defined, then OpcodeName() will only return the numerical code.
+	static std::string OpcodeName(spv::Op op);
+	static std::memory_order MemoryOrder(spv::MemorySemanticsMask memorySemantics);
+
+	// Helper as we often need to take dot products as part of doing other things.
+	SIMD::Float Dot(unsigned numComponents, GenericValue const & x, GenericValue const & y) const;
+
+	SIMD::UInt FloatToHalfBits(SIMD::UInt floatBits, bool storeInUpperBits) const;
+
+	// Splits x into a floating-point significand in the range [0.5, 1.0)
+	// and an integral exponent of two, such that:
+	//   x = significand * 2^exponent
+	// Returns the pair <significand, exponent>
+	std::pair<SIMD::Float, SIMD::Int> Frexp(RValue<SIMD::Float> val) const;
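The scalar equivalent from <cmath> illustrates the contract:

	#include <cmath>

	void example()
	{
		int exponent = 0;
		float significand = std::frexp(8.0f, &exponent);
		// significand == 0.5f, exponent == 4, and 0.5f * 2^4 == 8.0f
	}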
+
+	static ImageSampler *getImageSampler(uint32_t instruction, vk::SampledImageDescriptor const *imageDescriptor, const vk::Sampler *sampler);
+	static std::shared_ptr<rr::Routine> emitSamplerRoutine(ImageInstruction instruction, const Sampler &samplerState);
+
+	// TODO(b/129523279): Eliminate conversion and use vk::Sampler members directly.
+	static sw::FilterType convertFilterMode(const vk::Sampler *sampler);
+	static sw::MipmapType convertMipmapMode(const vk::Sampler *sampler);
+	static sw::AddressingMode convertAddressingMode(int coordinateIndex, const vk::Sampler *sampler, VkImageViewType imageViewType);
+
+	// Returns 0 when invalid.
+	static VkShaderStageFlagBits executionModelToStage(spv::ExecutionModel model);
+
+	struct GroupOps;
+};
+
+class SpirvRoutine
+{
+public:
+	SpirvRoutine(vk::PipelineLayout const *pipelineLayout);
+
+	using Variable = Array<SIMD::Float>;
+
+	struct SamplerCache
+	{
+		Pointer<Byte> imageDescriptor = nullptr;
+		Pointer<Byte> sampler;
+		Pointer<Byte> function;
 	};
 
-}
+	vk::PipelineLayout const * const pipelineLayout;
+
+	std::unordered_map<SpirvShader::Object::ID, Variable> variables;
+	std::unordered_map<SpirvShader::Object::ID, SamplerCache> samplerCache;
+	Variable inputs = Variable{MAX_INTERFACE_COMPONENTS};
+	Variable outputs = Variable{MAX_INTERFACE_COMPONENTS};
+
+	Pointer<Byte> workgroupMemory;
+	Pointer<Pointer<Byte>> descriptorSets;
+	Pointer<Int> descriptorDynamicOffsets;
+	Pointer<Byte> pushConstants;
+	Pointer<Byte> constants;
+	Int killMask = Int{0};
+	SIMD::Int windowSpacePosition[2];
+	Int viewID;	// slice offset into input attachments for multiview, even if the shader doesn't use ViewIndex
+
+	void createVariable(SpirvShader::Object::ID id, uint32_t size)
+	{
+		bool added = variables.emplace(id, Variable(size)).second;
+		ASSERT_MSG(added, "Variable %d created twice", id.value());
+	}
+
+	Variable& getVariable(SpirvShader::Object::ID id)
+	{
+		auto it = variables.find(id);
+		ASSERT_MSG(it != variables.end(), "Unknown variable %d", id.value());
+		return it->second;
+	}
+
+	// setImmutableInputBuiltins() sets all the immutable input builtins,
+	// common for all shader types.
+	void setImmutableInputBuiltins(SpirvShader const *shader);
+
+	// setInputBuiltin() calls f() with the builtin and value if the shader
+	// uses the input builtin, otherwise the call is a no-op.
+	// F is a function with the signature:
+	// void(const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+	template <typename F>
+	inline void setInputBuiltin(SpirvShader const *shader, spv::BuiltIn id, F&& f)
+	{
+		auto it = shader->inputBuiltins.find(id);
+		if (it != shader->inputBuiltins.end())
+		{
+			const auto& builtin = it->second;
+			f(builtin, getVariable(builtin.Id));
+		}
+	}
+
+private:
+	// The phis are only accessible to SpirvShader as they are only used and
+	// exist between calls to SpirvShader::emitProlog() and
+	// SpirvShader::emitEpilog().
+	friend class SpirvShader;
+
+	std::unordered_map<SpirvShader::Object::ID, Variable> phis;
+};
+
+}  // namespace sw
 
 #endif  // sw_SpirvShader_hpp
diff --git a/src/Pipeline/VertexProgram.cpp b/src/Pipeline/VertexProgram.cpp
index 55c7b41..90886a5 100644
--- a/src/Pipeline/VertexProgram.cpp
+++ b/src/Pipeline/VertexProgram.cpp
@@ -22,63 +22,64 @@
 
 #include "Vulkan/VkPipelineLayout.hpp"
 
-namespace sw
+namespace sw {
+
+VertexProgram::VertexProgram(
+		const VertexProcessor::State &state,
+		vk::PipelineLayout const *pipelineLayout,
+		SpirvShader const *spirvShader,
+		const vk::DescriptorSet::Bindings &descriptorSets)
+	: VertexRoutine(state, pipelineLayout, spirvShader),
+	  descriptorSets(descriptorSets)
 {
-	VertexProgram::VertexProgram(
-			const VertexProcessor::State &state,
-			vk::PipelineLayout const *pipelineLayout,
-			SpirvShader const *spirvShader,
-			const vk::DescriptorSet::Bindings &descriptorSets)
-		: VertexRoutine(state, pipelineLayout, spirvShader),
-		  descriptorSets(descriptorSets)
+	routine.setImmutableInputBuiltins(spirvShader);
+
+	routine.setInputBuiltin(spirvShader, spv::BuiltInViewIndex, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
 	{
-		routine.setImmutableInputBuiltins(spirvShader);
+		assert(builtin.SizeInComponents == 1);
+		value[builtin.FirstComponent] = As<Float4>(Int4((*Pointer<Int>(data + OFFSET(DrawData, viewID)))));
+	});
 
-		routine.setInputBuiltin(spirvShader, spv::BuiltInViewIndex, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			assert(builtin.SizeInComponents == 1);
-			value[builtin.FirstComponent] = As<Float4>(Int4((*Pointer<Int>(data + OFFSET(DrawData, viewID)))));
-		});
-
-		routine.setInputBuiltin(spirvShader, spv::BuiltInInstanceIndex, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			// TODO: we could do better here; we know InstanceIndex is uniform across all lanes
-			assert(builtin.SizeInComponents == 1);
-			value[builtin.FirstComponent] = As<Float4>(Int4((*Pointer<Int>(data + OFFSET(DrawData, instanceID)))));
-		});
-
-		routine.setInputBuiltin(spirvShader, spv::BuiltInSubgroupSize, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
-		{
-			ASSERT(builtin.SizeInComponents == 1);
-			value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(SIMD::Width));
-		});
-
-		routine.descriptorSets = data + OFFSET(DrawData, descriptorSets);
-		routine.descriptorDynamicOffsets = data + OFFSET(DrawData, descriptorDynamicOffsets);
-		routine.pushConstants = data + OFFSET(DrawData, pushConstants);
-		routine.constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, constants));
-	}
-
-	VertexProgram::~VertexProgram()
+	routine.setInputBuiltin(spirvShader, spv::BuiltInInstanceIndex, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
 	{
-	}
+		// TODO: we could do better here; we know InstanceIndex is uniform across all lanes
+		assert(builtin.SizeInComponents == 1);
+		value[builtin.FirstComponent] = As<Float4>(Int4((*Pointer<Int>(data + OFFSET(DrawData, instanceID)))));
+	});
 
-	void VertexProgram::program(Pointer<UInt> &batch, UInt& vertexCount)
+	routine.setInputBuiltin(spirvShader, spv::BuiltInSubgroupSize, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
 	{
-		auto it = spirvShader->inputBuiltins.find(spv::BuiltInVertexIndex);
-		if (it != spirvShader->inputBuiltins.end())
-		{
-			assert(it->second.SizeInComponents == 1);
+		ASSERT(builtin.SizeInComponents == 1);
+		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(SIMD::Width));
+	});
 
-			routine.getVariable(it->second.Id)[it->second.FirstComponent] =
-					As<Float4>(*Pointer<Int4>(As<Pointer<Int4>>(batch)) +
-					           Int4(*Pointer<Int>(data + OFFSET(DrawData, baseVertex))));
-		}
-
-		auto activeLaneMask = SIMD::Int(0xFFFFFFFF);
-		Int4 storesAndAtomicsMask = CmpGE(UInt4(vertexCount), UInt4(1, 2, 3, 4));
-		spirvShader->emit(&routine, activeLaneMask, storesAndAtomicsMask, descriptorSets);
-
-		spirvShader->emitEpilog(&routine);
-	}
+	routine.descriptorSets = data + OFFSET(DrawData, descriptorSets);
+	routine.descriptorDynamicOffsets = data + OFFSET(DrawData, descriptorDynamicOffsets);
+	routine.pushConstants = data + OFFSET(DrawData, pushConstants);
+	routine.constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, constants));
 }
+
+VertexProgram::~VertexProgram()
+{
+}
+
+void VertexProgram::program(Pointer<UInt> &batch, UInt& vertexCount)
+{
+	auto it = spirvShader->inputBuiltins.find(spv::BuiltInVertexIndex);
+	if (it != spirvShader->inputBuiltins.end())
+	{
+		assert(it->second.SizeInComponents == 1);
+
+		routine.getVariable(it->second.Id)[it->second.FirstComponent] =
+				As<Float4>(*Pointer<Int4>(As<Pointer<Int4>>(batch)) +
+				           Int4(*Pointer<Int>(data + OFFSET(DrawData, baseVertex))));
+	}
+
+	auto activeLaneMask = SIMD::Int(0xFFFFFFFF);
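+	// Lanes past the remaining vertex count must not execute stores or atomics; lane i stays enabled while vertexCount >= i + 1.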
+	Int4 storesAndAtomicsMask = CmpGE(UInt4(vertexCount), UInt4(1, 2, 3, 4));
+	spirvShader->emit(&routine, activeLaneMask, storesAndAtomicsMask, descriptorSets);
+
+	spirvShader->emitEpilog(&routine);
+}
+
+}  // namespace sw
diff --git a/src/Pipeline/VertexProgram.hpp b/src/Pipeline/VertexProgram.hpp
index 7baee79..9a14713 100644
--- a/src/Pipeline/VertexProgram.hpp
+++ b/src/Pipeline/VertexProgram.hpp
@@ -18,26 +18,27 @@
 #include "VertexRoutine.hpp"
 #include "ShaderCore.hpp"
 
-namespace sw
+namespace sw {
+
+struct Stream;
+
+class VertexProgram : public VertexRoutine
 {
-	struct Stream;
+public:
+	VertexProgram(
+		const VertexProcessor::State &state,
+		vk::PipelineLayout const *pipelineLayout,
+		SpirvShader const *spirvShader,
+		const vk::DescriptorSet::Bindings &descriptorSets);
 
-	class VertexProgram : public VertexRoutine
-	{
-	public:
-		VertexProgram(
-			const VertexProcessor::State &state,
-			vk::PipelineLayout const *pipelineLayout,
-			SpirvShader const *spirvShader,
-			const vk::DescriptorSet::Bindings &descriptorSets);
+	virtual ~VertexProgram();
 
-		virtual ~VertexProgram();
+private:
+	void program(Pointer<UInt> &batch, UInt& vertexCount) override;
 
-	private:
-		void program(Pointer<UInt> &batch, UInt& vertexCount) override;
+	const vk::DescriptorSet::Bindings &descriptorSets;
+};
 
-		const vk::DescriptorSet::Bindings &descriptorSets;
-	};
-}
+}  // namespace sw
 
 #endif   // sw_VertexProgram_hpp
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index 2fccb08..6fea0cb 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -21,667 +21,668 @@
 #include "Vulkan/VkDebug.hpp"
 #include "System/Half.hpp"
 
-namespace sw
+namespace sw {
+
+VertexRoutine::VertexRoutine(
+		const VertexProcessor::State &state,
+		vk::PipelineLayout const *pipelineLayout,
+		SpirvShader const *spirvShader)
+	: routine(pipelineLayout),
+	  state(state),
+	  spirvShader(spirvShader)
 {
-	VertexRoutine::VertexRoutine(
-			const VertexProcessor::State &state,
-			vk::PipelineLayout const *pipelineLayout,
-			SpirvShader const *spirvShader)
-		: routine(pipelineLayout),
-		  state(state),
-		  spirvShader(spirvShader)
+	spirvShader->emitProlog(&routine);
+}
+
+VertexRoutine::~VertexRoutine()
+{
+}
+
+void VertexRoutine::generate()
+{
+	Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
+	Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
+	Pointer<UInt> tagCache = Pointer<UInt>(cache + OFFSET(VertexCache,tag));
+
+	UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
+
+	constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
+
+	// Check the cache one vertex index at a time. If a hit occurs, copy from the cache to the 'vertex' output buffer.
+	// On a cache miss, process a SIMD width of consecutive indices from the input batch. They're written to the cache
+	// in reverse order to guarantee that the first one doesn't get evicted and can be written out.
+
+	Do
 	{
-	  	spirvShader->emitProlog(&routine);
+		UInt index = *batch;
+		UInt cacheIndex = index & VertexCache::TAG_MASK;
+
+		If(tagCache[cacheIndex] != index)
+		{
+			readInput(batch);
+			program(batch, vertexCount);
+			computeClipFlags();
+			computeCullMask();
+
+			writeCache(vertexCache, tagCache, batch);
+		}
+
+		Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
+
+		// For points, vertexCount is 1 per primitive, so duplicate vertex for all 3 vertices of the primitive
+		for(int i = 0; i < (state.isPoint ? 3 : 1); i++)
+		{
+			writeVertex(vertex, cacheEntry);
+			vertex += sizeof(Vertex);
+		}
+
+		batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
+		vertexCount--;
 	}
+	Until(vertexCount == 0)
 
-	VertexRoutine::~VertexRoutine()
+	Return();
+}
+
+void VertexRoutine::readInput(Pointer<UInt> &batch)
+{
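+	// Attributes are laid out as vec4 groups; read a stream only when at least one of its four components is used by the shader.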
+	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
 	{
-	}
-
-	void VertexRoutine::generate()
-	{
-		Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
-		Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
-		Pointer<UInt> tagCache = Pointer<UInt>(cache + OFFSET(VertexCache,tag));
-
-		UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
-
-		constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
-
-		// Check the cache one vertex index at a time. If a hit occurs, copy from the cache to the 'vertex' output buffer.
-		// On a cache miss, process a SIMD width of consecutive indices from the input batch. They're written to the cache
-		// in reverse order to guarantee that the first one doesn't get evicted and can be written out.
-
-		Do
+		if(spirvShader->inputs[i + 0].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
+		   spirvShader->inputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
+		   spirvShader->inputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
+		   spirvShader->inputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
 		{
-			UInt index = *batch;
-			UInt cacheIndex = index & VertexCache::TAG_MASK;
-
-			If(tagCache[cacheIndex] != index)
+			Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void*) * (i / 4));
+			UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(uint32_t) * (i / 4));
+			Int baseVertex = *Pointer<Int>(data + OFFSET(DrawData, baseVertex));
+			UInt robustnessSize(0);
+			if(state.robustBufferAccess)
 			{
-				readInput(batch);
-				program(batch, vertexCount);
-				computeClipFlags();
-				computeCullMask();
-
-				writeCache(vertexCache, tagCache, batch);
+				robustnessSize = *Pointer<UInt>(data + OFFSET(DrawData, robustnessSize) + sizeof(uint32_t) * (i / 4));
 			}
 
-			Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
-
-			// For points, vertexCount is 1 per primitive, so duplicate vertex for all 3 vertices of the primitive
-			for(int i = 0; i < (state.isPoint ? 3 : 1); i++)
-			{
-				writeVertex(vertex, cacheEntry);
-				vertex += sizeof(Vertex);
-			}
-
-			batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
-			vertexCount--;
-		}
-		Until(vertexCount == 0)
-
-		Return();
-	}
-
-	void VertexRoutine::readInput(Pointer<UInt> &batch)
-	{
-		for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
-		{
-			if(spirvShader->inputs[i + 0].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
-			   spirvShader->inputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
-			   spirvShader->inputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
-			   spirvShader->inputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
-			{
-				Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void*) * (i / 4));
-				UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(uint32_t) * (i / 4));
-				Int baseVertex = *Pointer<Int>(data + OFFSET(DrawData, baseVertex));
-				UInt robustnessSize(0);
-				if(state.robustBufferAccess)
-				{
-					robustnessSize = *Pointer<UInt>(data + OFFSET(DrawData, robustnessSize) + sizeof(uint32_t) * (i / 4));
-				}
-
-				auto value = readStream(input, stride, state.input[i / 4], batch, state.robustBufferAccess, robustnessSize, baseVertex);
-				routine.inputs[i + 0] = value.x;
-				routine.inputs[i + 1] = value.y;
-				routine.inputs[i + 2] = value.z;
-				routine.inputs[i + 3] = value.w;
-			}
-		}
-	}
-
-	void VertexRoutine::computeClipFlags()
-	{
-		auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
-		assert(it != spirvShader->outputBuiltins.end());
-		assert(it->second.SizeInComponents == 4);
-		auto &pos = routine.getVariable(it->second.Id);
-		auto posX = pos[it->second.FirstComponent + 0];
-		auto posY = pos[it->second.FirstComponent + 1];
-		auto posZ = pos[it->second.FirstComponent + 2];
-		auto posW = pos[it->second.FirstComponent + 3];
-
-		Int4 maxX = CmpLT(posW, posX);
-		Int4 maxY = CmpLT(posW, posY);
-		Int4 maxZ = CmpLT(posW, posZ);
-		Int4 minX = CmpNLE(-posW, posX);
-		Int4 minY = CmpNLE(-posW, posY);
-		Int4 minZ = CmpNLE(Float4(0.0f), posZ);
-
-		clipFlags =  Pointer<Int>(constants + OFFSET(Constants,maxX))[SignMask(maxX)];
-		clipFlags |= Pointer<Int>(constants + OFFSET(Constants,maxY))[SignMask(maxY)];
-		clipFlags |= Pointer<Int>(constants + OFFSET(Constants,maxZ))[SignMask(maxZ)];
-		clipFlags |= Pointer<Int>(constants + OFFSET(Constants,minX))[SignMask(minX)];
-		clipFlags |= Pointer<Int>(constants + OFFSET(Constants,minY))[SignMask(minY)];
-		clipFlags |= Pointer<Int>(constants + OFFSET(Constants,minZ))[SignMask(minZ)];
-
-		Int4 finiteX = CmpLE(Abs(posX), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
-		Int4 finiteY = CmpLE(Abs(posY), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
-		Int4 finiteZ = CmpLE(Abs(posZ), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
-
-		Int4 finiteXYZ = finiteX & finiteY & finiteZ;
-		clipFlags |= Pointer<Int>(constants + OFFSET(Constants,fini))[SignMask(finiteXYZ)];
-	}
-
-	void VertexRoutine::computeCullMask()
-	{
-		cullMask = Int(15);
-
-		auto it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
-		if (it != spirvShader->outputBuiltins.end())
-		{
-			auto count = spirvShader->getNumOutputCullDistances();
-			for (uint32_t i = 0; i < count; i++)
-			{
-				auto const &distance = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
-				auto mask = SignMask(CmpGE(distance, SIMD::Float(0)));
-				cullMask &= mask;
-			}
-		}
-	}
-
-	Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch,
-	                                   bool robustBufferAccess, UInt & robustnessSize, Int baseVertex)
-	{
-		Vector4f v;
-		// Because of the following rule in the Vulkan spec, we do not care if a very large negative
-		// baseVertex would overflow all the way back into a valid region of the index buffer:
-		// "Out-of-bounds buffer loads will return any of the following values :

-		//  - Values from anywhere within the memory range(s) bound to the buffer (possibly including
-		//    bytes of memory past the end of the buffer, up to the end of the bound range)."
-		UInt4 offsets = (*Pointer<UInt4>(As<Pointer<UInt4>>(batch)) + As<UInt4>(Int4(baseVertex))) * UInt4(stride);
-
-		Pointer<Byte> source0 = buffer + offsets.x;
-		Pointer<Byte> source1 = buffer + offsets.y;
-		Pointer<Byte> source2 = buffer + offsets.z;
-		Pointer<Byte> source3 = buffer + offsets.w;
-
-		UInt4 zero(0);
-		if (robustBufferAccess)
-		{
-			// TODO(b/141124876): Optimize for wide-vector gather operations.
-			UInt4 limits = offsets + UInt4(stream.bytesPerAttrib());
-			Pointer<Byte> zeroSource = As<Pointer<Byte>>(&zero);
-			source0 = IfThenElse(limits.x <= robustnessSize, source0, zeroSource);
-			source1 = IfThenElse(limits.y <= robustnessSize, source1, zeroSource);
-			source2 = IfThenElse(limits.z <= robustnessSize, source2, zeroSource);
-			source3 = IfThenElse(limits.w <= robustnessSize, source3, zeroSource);
-		}
-
-		bool isNativeFloatAttrib = (stream.attribType == SpirvShader::ATTRIBTYPE_FLOAT) || stream.normalized;
-
-		switch(stream.type)
-		{
-		case STREAMTYPE_FLOAT:
-			{
-				if(stream.count == 0)
-				{
-					// Null stream, all default components
-				}
-				else
-				{
-					if(stream.count == 1)
-					{
-						v.x.x = *Pointer<Float>(source0);
-						v.x.y = *Pointer<Float>(source1);
-						v.x.z = *Pointer<Float>(source2);
-						v.x.w = *Pointer<Float>(source3);
-					}
-					else
-					{
-						v.x = *Pointer<Float4>(source0);
-						v.y = *Pointer<Float4>(source1);
-						v.z = *Pointer<Float4>(source2);
-						v.w = *Pointer<Float4>(source3);
-
-						transpose4xN(v.x, v.y, v.z, v.w, stream.count);
-					}
-
-					switch(stream.attribType)
-					{
-					case SpirvShader::ATTRIBTYPE_INT:
-						if(stream.count >= 1) v.x = As<Float4>(Int4(v.x));
-						if(stream.count >= 2) v.x = As<Float4>(Int4(v.y));
-						if(stream.count >= 3) v.x = As<Float4>(Int4(v.z));
-						if(stream.count >= 4) v.x = As<Float4>(Int4(v.w));
-						break;
-					case SpirvShader::ATTRIBTYPE_UINT:
-						if(stream.count >= 1) v.x = As<Float4>(UInt4(v.x));
-						if(stream.count >= 2) v.x = As<Float4>(UInt4(v.y));
-						if(stream.count >= 3) v.x = As<Float4>(UInt4(v.z));
-						if(stream.count >= 4) v.x = As<Float4>(UInt4(v.w));
-						break;
-					default:
-						break;
-					}
-				}
-			}
-			break;
-		case STREAMTYPE_BYTE:
-			if(isNativeFloatAttrib) // Stream: UByte, Shader attrib: Float
-			{
-				v.x = Float4(*Pointer<Byte4>(source0));
-				v.y = Float4(*Pointer<Byte4>(source1));
-				v.z = Float4(*Pointer<Byte4>(source2));
-				v.w = Float4(*Pointer<Byte4>(source3));
-
-				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
-
-				if(stream.normalized)
-				{
-					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
-					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
-					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
-					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
-				}
-			}
-			else // Stream: UByte, Shader attrib: Int / UInt
-			{
-				v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
-				v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
-				v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
-				v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));
-
-				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
-			}
-			break;
-		case STREAMTYPE_SBYTE:
-			if(isNativeFloatAttrib) // Stream: SByte, Shader attrib: Float
-			{
-				v.x = Float4(*Pointer<SByte4>(source0));
-				v.y = Float4(*Pointer<SByte4>(source1));
-				v.z = Float4(*Pointer<SByte4>(source2));
-				v.w = Float4(*Pointer<SByte4>(source3));
-
-				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
-
-				if(stream.normalized)
-				{
-					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
-					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
-					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
-					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
-				}
-			}
-			else // Stream: SByte, Shader attrib: Int / UInt
-			{
-				v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
-				v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
-				v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
-				v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));
-
-				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
-			}
-			break;
-		case STREAMTYPE_COLOR:
-			{
-				v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
-				v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
-				v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
-				v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
-
-				transpose4x4(v.x, v.y, v.z, v.w);
-
-				// Swap red and blue
-				Float4 t = v.x;
-				v.x = v.z;
-				v.z = t;
-			}
-			break;
-		case STREAMTYPE_SHORT:
-			if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
-			{
-				v.x = Float4(*Pointer<Short4>(source0));
-				v.y = Float4(*Pointer<Short4>(source1));
-				v.z = Float4(*Pointer<Short4>(source2));
-				v.w = Float4(*Pointer<Short4>(source3));
-
-				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
-
-				if(stream.normalized)
-				{
-					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
-					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
-					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
-					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
-				}
-			}
-			else // Stream: Short, Shader attrib: Int/UInt, no type conversion
-			{
-				v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
-				v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
-				v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
-				v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));
-
-				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
-			}
-			break;
-		case STREAMTYPE_USHORT:
-			if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
-			{
-				v.x = Float4(*Pointer<UShort4>(source0));
-				v.y = Float4(*Pointer<UShort4>(source1));
-				v.z = Float4(*Pointer<UShort4>(source2));
-				v.w = Float4(*Pointer<UShort4>(source3));
-
-				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
-
-				if(stream.normalized)
-				{
-					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
-					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
-					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
-					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
-				}
-			}
-			else // Stream: UShort, Shader attrib: Int/UInt, no type conversion
-			{
-				v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
-				v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
-				v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
-				v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));
-
-				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
-			}
-			break;
-		case STREAMTYPE_INT:
-			if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
-			{
-				v.x = Float4(*Pointer<Int4>(source0));
-				v.y = Float4(*Pointer<Int4>(source1));
-				v.z = Float4(*Pointer<Int4>(source2));
-				v.w = Float4(*Pointer<Int4>(source3));
-
-				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
-
-				if(stream.normalized)
-				{
-					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
-					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
-					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
-					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
-				}
-			}
-			else // Stream: Int, Shader attrib: Int/UInt, no type conversion
-			{
-				v.x = *Pointer<Float4>(source0);
-				v.y = *Pointer<Float4>(source1);
-				v.z = *Pointer<Float4>(source2);
-				v.w = *Pointer<Float4>(source3);
-
-				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
-			}
-			break;
-		case STREAMTYPE_UINT:
-			if(isNativeFloatAttrib) // Stream: UInt, Shader attrib: Float
-			{
-				v.x = Float4(*Pointer<UInt4>(source0));
-				v.y = Float4(*Pointer<UInt4>(source1));
-				v.z = Float4(*Pointer<UInt4>(source2));
-				v.w = Float4(*Pointer<UInt4>(source3));
-
-				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
-
-				if(stream.normalized)
-				{
-					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
-					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
-					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
-					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
-				}
-			}
-			else // Stream: UInt, Shader attrib: Int/UInt, no type conversion
-			{
-				v.x = *Pointer<Float4>(source0);
-				v.y = *Pointer<Float4>(source1);
-				v.z = *Pointer<Float4>(source2);
-				v.w = *Pointer<Float4>(source3);
-
-				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
-			}
-			break;
-		case STREAMTYPE_HALF:
-			{
-				if(stream.count >= 1)
-				{
-					UShort x0 = *Pointer<UShort>(source0 + 0);
-					UShort x1 = *Pointer<UShort>(source1 + 0);
-					UShort x2 = *Pointer<UShort>(source2 + 0);
-					UShort x3 = *Pointer<UShort>(source3 + 0);
-
-					v.x.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x0) * 4);
-					v.x.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x1) * 4);
-					v.x.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x2) * 4);
-					v.x.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x3) * 4);
-				}
-
-				if(stream.count >= 2)
-				{
-					UShort y0 = *Pointer<UShort>(source0 + 2);
-					UShort y1 = *Pointer<UShort>(source1 + 2);
-					UShort y2 = *Pointer<UShort>(source2 + 2);
-					UShort y3 = *Pointer<UShort>(source3 + 2);
-
-					v.y.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y0) * 4);
-					v.y.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y1) * 4);
-					v.y.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y2) * 4);
-					v.y.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y3) * 4);
-				}
-
-				if(stream.count >= 3)
-				{
-					UShort z0 = *Pointer<UShort>(source0 + 4);
-					UShort z1 = *Pointer<UShort>(source1 + 4);
-					UShort z2 = *Pointer<UShort>(source2 + 4);
-					UShort z3 = *Pointer<UShort>(source3 + 4);
-
-					v.z.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z0) * 4);
-					v.z.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z1) * 4);
-					v.z.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z2) * 4);
-					v.z.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z3) * 4);
-				}
-
-				if(stream.count >= 4)
-				{
-					UShort w0 = *Pointer<UShort>(source0 + 6);
-					UShort w1 = *Pointer<UShort>(source1 + 6);
-					UShort w2 = *Pointer<UShort>(source2 + 6);
-					UShort w3 = *Pointer<UShort>(source3 + 6);
-
-					v.w.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w0) * 4);
-					v.w.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w1) * 4);
-					v.w.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w2) * 4);
-					v.w.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w3) * 4);
-				}
-			}
-			break;
-		case STREAMTYPE_2_10_10_10_INT:
-			{
-				Int4 src;
-				src = Insert(src, *Pointer<Int>(source0), 0);
-				src = Insert(src, *Pointer<Int>(source1), 1);
-				src = Insert(src, *Pointer<Int>(source2), 2);
-				src = Insert(src, *Pointer<Int>(source3), 3);
-
-				v.x = Float4((src << 22) >> 22);
-				v.y = Float4((src << 12) >> 22);
-				v.z = Float4((src << 02) >> 22);
-				v.w = Float4(src >> 30);
-
-				if(stream.normalized)
-				{
-					v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
-					v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
-					v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
-					v.w = Max(v.w, Float4(-1.0f));
-				}
-			}
-			break;
-		case STREAMTYPE_2_10_10_10_UINT:
-			{
-				Int4 src;
-				src = Insert(src, *Pointer<Int>(source0), 0);
-				src = Insert(src, *Pointer<Int>(source1), 1);
-				src = Insert(src, *Pointer<Int>(source2), 2);
-				src = Insert(src, *Pointer<Int>(source3), 3);
-
-				v.x = Float4(src & Int4(0x3FF));
-				v.y = Float4((src >> 10) & Int4(0x3FF));
-				v.z = Float4((src >> 20) & Int4(0x3FF));
-				v.w = Float4((src >> 30) & Int4(0x3));
-
-				if(stream.normalized)
-				{
-					v.x *= Float4(1.0f / 0x3FF);
-					v.y *= Float4(1.0f / 0x3FF);
-					v.z *= Float4(1.0f / 0x3FF);
-					v.w *= Float4(1.0f / 0x3);
-				}
-			}
-			break;
-		default:
-			UNSUPPORTED("stream.type %d", int(stream.type));
-		}
-
-		if(stream.count < 1) v.x = Float4(0.0f);
-		if(stream.count < 2) v.y = Float4(0.0f);
-		if(stream.count < 3) v.z = Float4(0.0f);
-		if(stream.count < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(1));
-
-		return v;
-	}
-
-	void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch)
-	{
-		UInt index0 = batch[0];
-		UInt index1 = batch[1];
-		UInt index2 = batch[2];
-		UInt index3 = batch[3];
-
-		UInt cacheIndex0 = index0 & VertexCache::TAG_MASK;
-		UInt cacheIndex1 = index1 & VertexCache::TAG_MASK;
-		UInt cacheIndex2 = index2 & VertexCache::TAG_MASK;
-		UInt cacheIndex3 = index3 & VertexCache::TAG_MASK;
-
-		// We processed a SIMD group of vertices, with the first one being the one that missed the cache tag check.
-		// Write them out in reverse order here and below to ensure the first one is now guaranteed to be in the cache.
-		tagCache[cacheIndex3] = index3;
-		tagCache[cacheIndex2] = index2;
-		tagCache[cacheIndex1] = index1;
-		tagCache[cacheIndex0] = index0;
-
-		auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
-		assert(it != spirvShader->outputBuiltins.end());
-		assert(it->second.SizeInComponents == 4);
-		auto &position = routine.getVariable(it->second.Id);
-
-		Vector4f pos;
-		pos.x = position[it->second.FirstComponent + 0];
-		pos.y = position[it->second.FirstComponent + 1];
-		pos.z = position[it->second.FirstComponent + 2];
-		pos.w = position[it->second.FirstComponent + 3];
-
-		// Projection and viewport transform.
-		Float4 w = As<Float4>(As<Int4>(pos.w) | (As<Int4>(CmpEQ(pos.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
-		Float4 rhw = Float4(1.0f) / w;
-
-		Vector4f proj;
-		proj.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0xF)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,WxF))));
-		proj.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0xF)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,HxF))));
-		proj.z = pos.z * rhw;
-		proj.w = rhw;
-
-		transpose4x4(pos.x, pos.y, pos.z, pos.w);
-
-		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,position), 16) = pos.w;
-		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,position), 16) = pos.z;
-		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,position), 16) = pos.y;
-		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,position), 16) = pos.x;
-
-		it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
-		if(it != spirvShader->outputBuiltins.end())
-		{
-			ASSERT(it->second.SizeInComponents == 1);
-			auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent];
-
-			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,pointSize)) = Extract(psize, 3);
-			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,pointSize)) = Extract(psize, 2);
-			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,pointSize)) = Extract(psize, 1);
-			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,pointSize)) = Extract(psize, 0);
-		}
-
-		it = spirvShader->outputBuiltins.find(spv::BuiltInClipDistance);
-		if(it != spirvShader->outputBuiltins.end())
-		{
-			auto count = spirvShader->getNumOutputClipDistances();
-			for(unsigned int i = 0; i < count; i++)
-			{
-				auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
-				*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,clipDistance[i])) = Extract(dist, 3);
-				*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,clipDistance[i])) = Extract(dist, 2);
-				*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,clipDistance[i])) = Extract(dist, 1);
-				*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,clipDistance[i])) = Extract(dist, 0);
-			}
-		}
-
-		it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
-		if(it != spirvShader->outputBuiltins.end())
-		{
-			auto count = spirvShader->getNumOutputCullDistances();
-			for(unsigned int i = 0; i < count; i++)
-			{
-				auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
-				*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,cullDistance[i])) = Extract(dist, 3);
-				*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,cullDistance[i])) = Extract(dist, 2);
-				*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,cullDistance[i])) = Extract(dist, 1);
-				*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,cullDistance[i])) = Extract(dist, 0);
-			}
-		}
-
-		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 24) & 0x0000000FF;
-		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 16) & 0x0000000FF;
-		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 8)  & 0x0000000FF;
-		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 0)  & 0x0000000FF;
-
-		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,cullMask)) = -((cullMask >> 3) & 1);
-		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,cullMask)) = -((cullMask >> 2) & 1);
-		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,cullMask)) = -((cullMask >> 1) & 1);
-		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,cullMask)) = -((cullMask >> 0) & 1);
-
-		transpose4x4(proj.x, proj.y, proj.z, proj.w);
-
-		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,projected), 16) = proj.w;
-		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,projected), 16) = proj.z;
-		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,projected), 16) = proj.y;
-		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,projected), 16) = proj.x;
-
-		for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
-		{
-			if(spirvShader->outputs[i + 0].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
-			   spirvShader->outputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
-			   spirvShader->outputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
-			   spirvShader->outputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
-			{
-				Vector4f v;
-				v.x = routine.outputs[i + 0];
-				v.y = routine.outputs[i + 1];
-				v.z = routine.outputs[i + 2];
-				v.w = routine.outputs[i + 3];
-
-				transpose4x4(v.x, v.y, v.z, v.w);
-
-				*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,v[i]), 16) = v.w;
-				*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,v[i]), 16) = v.z;
-				*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,v[i]), 16) = v.y;
-				*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,v[i]), 16) = v.x;
-			}
-		}
-	}
-
-	void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry)
-	{
-		*Pointer<Int4>(vertex + OFFSET(Vertex,position)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex,position));
-		*Pointer<Int>(vertex + OFFSET(Vertex,pointSize)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex,pointSize));
-
-		*Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex,clipFlags));
-		*Pointer<Int>(vertex + OFFSET(Vertex,cullMask)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex,cullMask));
-		*Pointer<Int4>(vertex + OFFSET(Vertex,projected)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex,projected));
-
-		for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
-		{
-			if(spirvShader->outputs[i].Type != SpirvShader::ATTRIBTYPE_UNUSED)
-			{
-				*Pointer<Int>(vertex + OFFSET(Vertex, v[i]), 4) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, v[i]), 4);
-			}
-		}
-		for(unsigned int i = 0; i < spirvShader->getNumOutputClipDistances(); i++)
-		{
-			*Pointer<Float>(vertex + OFFSET(Vertex, clipDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, clipDistance[i]), 4);
-		}
-		for(unsigned int i = 0; i < spirvShader->getNumOutputCullDistances(); i++)
-		{
-			*Pointer<Float>(vertex + OFFSET(Vertex, cullDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, cullDistance[i]), 4);
+			auto value = readStream(input, stride, state.input[i / 4], batch, state.robustBufferAccess, robustnessSize, baseVertex);
+			routine.inputs[i + 0] = value.x;
+			routine.inputs[i + 1] = value.y;
+			routine.inputs[i + 2] = value.z;
+			routine.inputs[i + 3] = value.w;
 		}
 	}
 }
+
+void VertexRoutine::computeClipFlags()
+{
+	auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
+	assert(it != spirvShader->outputBuiltins.end());
+	assert(it->second.SizeInComponents == 4);
+	auto &pos = routine.getVariable(it->second.Id);
+	auto posX = pos[it->second.FirstComponent + 0];
+	auto posY = pos[it->second.FirstComponent + 1];
+	auto posZ = pos[it->second.FirstComponent + 2];
+	auto posW = pos[it->second.FirstComponent + 3];
+
+	Int4 maxX = CmpLT(posW, posX);
+	Int4 maxY = CmpLT(posW, posY);
+	Int4 maxZ = CmpLT(posW, posZ);
+	Int4 minX = CmpNLE(-posW, posX);
+	Int4 minY = CmpNLE(-posW, posY);
+	Int4 minZ = CmpNLE(Float4(0.0f), posZ);
+
+	clipFlags =  Pointer<Int>(constants + OFFSET(Constants,maxX))[SignMask(maxX)];
+	clipFlags |= Pointer<Int>(constants + OFFSET(Constants,maxY))[SignMask(maxY)];
+	clipFlags |= Pointer<Int>(constants + OFFSET(Constants,maxZ))[SignMask(maxZ)];
+	clipFlags |= Pointer<Int>(constants + OFFSET(Constants,minX))[SignMask(minX)];
+	clipFlags |= Pointer<Int>(constants + OFFSET(Constants,minY))[SignMask(minY)];
+	clipFlags |= Pointer<Int>(constants + OFFSET(Constants,minZ))[SignMask(minZ)];
+
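+	// Also flag positions with non-finite components; NaN and values with magnitude above maxPos fail the CmpLE test.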
+	Int4 finiteX = CmpLE(Abs(posX), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
+	Int4 finiteY = CmpLE(Abs(posY), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
+	Int4 finiteZ = CmpLE(Abs(posZ), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
+
+	Int4 finiteXYZ = finiteX & finiteY & finiteZ;
+	clipFlags |= Pointer<Int>(constants + OFFSET(Constants,fini))[SignMask(finiteXYZ)];
+}
+
+void VertexRoutine::computeCullMask()
+{
+	cullMask = Int(15);
+
+	auto it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
+	if (it != spirvShader->outputBuiltins.end())
+	{
+		auto count = spirvShader->getNumOutputCullDistances();
+		for (uint32_t i = 0; i < count; i++)
+		{
+			auto const &distance = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
+			auto mask = SignMask(CmpGE(distance, SIMD::Float(0)));
+			cullMask &= mask;
+		}
+	}
+}
+
+Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch,
+                                   bool robustBufferAccess, UInt & robustnessSize, Int baseVertex)
+{
+	Vector4f v;
+	// Because of the following rule in the Vulkan spec, we do not care if a very large negative
+	// baseVertex would overflow all the way back into a valid region of the index buffer:
+	// "Out-of-bounds buffer loads will return any of the following values :
+	//  - Values from anywhere within the memory range(s) bound to the buffer (possibly including
+	//    bytes of memory past the end of the buffer, up to the end of the bound range)."
+	UInt4 offsets = (*Pointer<UInt4>(As<Pointer<UInt4>>(batch)) + As<UInt4>(Int4(baseVertex))) * UInt4(stride);
+
+	Pointer<Byte> source0 = buffer + offsets.x;
+	Pointer<Byte> source1 = buffer + offsets.y;
+	Pointer<Byte> source2 = buffer + offsets.z;
+	Pointer<Byte> source3 = buffer + offsets.w;
+
+	UInt4 zero(0);
+	if (robustBufferAccess)
+	{
+		// TODO(b/141124876): Optimize for wide-vector gather operations.
+		UInt4 limits = offsets + UInt4(stream.bytesPerAttrib());
+		Pointer<Byte> zeroSource = As<Pointer<Byte>>(&zero);
+		source0 = IfThenElse(limits.x <= robustnessSize, source0, zeroSource);
+		source1 = IfThenElse(limits.y <= robustnessSize, source1, zeroSource);
+		source2 = IfThenElse(limits.z <= robustnessSize, source2, zeroSource);
+		source3 = IfThenElse(limits.w <= robustnessSize, source3, zeroSource);
+	}
+
+	bool isNativeFloatAttrib = (stream.attribType == SpirvShader::ATTRIBTYPE_FLOAT) || stream.normalized;
+
+	switch(stream.type)
+	{
+	case STREAMTYPE_FLOAT:
+		{
+			if(stream.count == 0)
+			{
+				// Null stream, all default components
+			}
+			else
+			{
+				if(stream.count == 1)
+				{
+					v.x.x = *Pointer<Float>(source0);
+					v.x.y = *Pointer<Float>(source1);
+					v.x.z = *Pointer<Float>(source2);
+					v.x.w = *Pointer<Float>(source3);
+				}
+				else
+				{
+					v.x = *Pointer<Float4>(source0);
+					v.y = *Pointer<Float4>(source1);
+					v.z = *Pointer<Float4>(source2);
+					v.w = *Pointer<Float4>(source3);
+
+					transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+				}
+
+				switch(stream.attribType)
+				{
+				case SpirvShader::ATTRIBTYPE_INT:
+					if(stream.count >= 1) v.x = As<Float4>(Int4(v.x));
+					if(stream.count >= 2) v.x = As<Float4>(Int4(v.y));
+					if(stream.count >= 3) v.x = As<Float4>(Int4(v.z));
+					if(stream.count >= 4) v.x = As<Float4>(Int4(v.w));
+					break;
+				case SpirvShader::ATTRIBTYPE_UINT:
+					if(stream.count >= 1) v.x = As<Float4>(UInt4(v.x));
+					if(stream.count >= 2) v.x = As<Float4>(UInt4(v.y));
+					if(stream.count >= 3) v.x = As<Float4>(UInt4(v.z));
+					if(stream.count >= 4) v.x = As<Float4>(UInt4(v.w));
+					break;
+				default:
+					break;
+				}
+			}
+		}
+		break;
+	case STREAMTYPE_BYTE:
+		if(isNativeFloatAttrib) // Stream: UByte, Shader attrib: Float
+		{
+			v.x = Float4(*Pointer<Byte4>(source0));
+			v.y = Float4(*Pointer<Byte4>(source1));
+			v.z = Float4(*Pointer<Byte4>(source2));
+			v.w = Float4(*Pointer<Byte4>(source3));
+
+			transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+			if(stream.normalized)
+			{
+				if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+				if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+				if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+				if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+			}
+		}
+		else // Stream: UByte, Shader attrib: Int / UInt
+		{
+			v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
+			v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
+			v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
+			v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));
+
+			transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+		}
+		break;
+	case STREAMTYPE_SBYTE:
+		if(isNativeFloatAttrib) // Stream: SByte, Shader attrib: Float
+		{
+			v.x = Float4(*Pointer<SByte4>(source0));
+			v.y = Float4(*Pointer<SByte4>(source1));
+			v.z = Float4(*Pointer<SByte4>(source2));
+			v.w = Float4(*Pointer<SByte4>(source3));
+
+			transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+			if(stream.normalized)
+			{
+				if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
+				if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
+				if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
+				if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
+			}
+		}
+		else // Stream: SByte, Shader attrib: Int / UInt
+		{
+			v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
+			v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
+			v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
+			v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));
+
+			transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+		}
+		break;
+	case STREAMTYPE_COLOR:
+		{
+			v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+			v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+			v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+			v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+
+			transpose4x4(v.x, v.y, v.z, v.w);
+
+			// Swap red and blue
+			Float4 t = v.x;
+			v.x = v.z;
+			v.z = t;
+		}
+		break;
+	case STREAMTYPE_SHORT:
+		if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
+		{
+			v.x = Float4(*Pointer<Short4>(source0));
+			v.y = Float4(*Pointer<Short4>(source1));
+			v.z = Float4(*Pointer<Short4>(source2));
+			v.w = Float4(*Pointer<Short4>(source3));
+
+			transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+			if(stream.normalized)
+			{
+				if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
+				if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
+				if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
+				if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
+			}
+		}
+		else // Stream: Short, Shader attrib: Int/UInt, no type conversion
+		{
+			v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
+			v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
+			v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
+			v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));
+
+			transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+		}
+		break;
+	case STREAMTYPE_USHORT:
+		if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
+		{
+			v.x = Float4(*Pointer<UShort4>(source0));
+			v.y = Float4(*Pointer<UShort4>(source1));
+			v.z = Float4(*Pointer<UShort4>(source2));
+			v.w = Float4(*Pointer<UShort4>(source3));
+
+			transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+			if(stream.normalized)
+			{
+				if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
+				if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
+				if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
+				if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
+			}
+		}
+		else // Stream: UShort, Shader attrib: Int/UInt, no type conversion
+		{
+			v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
+			v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
+			v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
+			v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));
+
+			transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+		}
+		break;
+	case STREAMTYPE_INT:
+		if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
+		{
+			v.x = Float4(*Pointer<Int4>(source0));
+			v.y = Float4(*Pointer<Int4>(source1));
+			v.z = Float4(*Pointer<Int4>(source2));
+			v.w = Float4(*Pointer<Int4>(source3));
+
+			transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+			if(stream.normalized)
+			{
+				if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
+				if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
+				if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
+				if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
+			}
+		}
+		else // Stream: Int, Shader attrib: Int/UInt, no type conversion
+		{
+			v.x = *Pointer<Float4>(source0);
+			v.y = *Pointer<Float4>(source1);
+			v.z = *Pointer<Float4>(source2);
+			v.w = *Pointer<Float4>(source3);
+
+			transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+		}
+		break;
+	case STREAMTYPE_UINT:
+		if(isNativeFloatAttrib) // Stream: UInt, Shader attrib: Float
+		{
+			v.x = Float4(*Pointer<UInt4>(source0));
+			v.y = Float4(*Pointer<UInt4>(source1));
+			v.z = Float4(*Pointer<UInt4>(source2));
+			v.w = Float4(*Pointer<UInt4>(source3));
+
+			transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+			if(stream.normalized)
+			{
+				if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
+				if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
+				if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
+				if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
+			}
+		}
+		else // Stream: UInt, Shader attrib: Int/UInt, no type conversion
+		{
+			v.x = *Pointer<Float4>(source0);
+			v.y = *Pointer<Float4>(source1);
+			v.z = *Pointer<Float4>(source2);
+			v.w = *Pointer<Float4>(source3);
+
+			transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+		}
+		break;
+	case STREAMTYPE_HALF:
+		{
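+			// Convert half-precision values to float through the half2float lookup table, indexed by the raw 16-bit pattern.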
+			if(stream.count >= 1)
+			{
+				UShort x0 = *Pointer<UShort>(source0 + 0);
+				UShort x1 = *Pointer<UShort>(source1 + 0);
+				UShort x2 = *Pointer<UShort>(source2 + 0);
+				UShort x3 = *Pointer<UShort>(source3 + 0);
+
+				v.x.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x0) * 4);
+				v.x.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x1) * 4);
+				v.x.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x2) * 4);
+				v.x.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x3) * 4);
+			}
+
+			if(stream.count >= 2)
+			{
+				UShort y0 = *Pointer<UShort>(source0 + 2);
+				UShort y1 = *Pointer<UShort>(source1 + 2);
+				UShort y2 = *Pointer<UShort>(source2 + 2);
+				UShort y3 = *Pointer<UShort>(source3 + 2);
+
+				v.y.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y0) * 4);
+				v.y.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y1) * 4);
+				v.y.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y2) * 4);
+				v.y.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y3) * 4);
+			}
+
+			if(stream.count >= 3)
+			{
+				UShort z0 = *Pointer<UShort>(source0 + 4);
+				UShort z1 = *Pointer<UShort>(source1 + 4);
+				UShort z2 = *Pointer<UShort>(source2 + 4);
+				UShort z3 = *Pointer<UShort>(source3 + 4);
+
+				v.z.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z0) * 4);
+				v.z.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z1) * 4);
+				v.z.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z2) * 4);
+				v.z.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z3) * 4);
+			}
+
+			if(stream.count >= 4)
+			{
+				UShort w0 = *Pointer<UShort>(source0 + 6);
+				UShort w1 = *Pointer<UShort>(source1 + 6);
+				UShort w2 = *Pointer<UShort>(source2 + 6);
+				UShort w3 = *Pointer<UShort>(source3 + 6);
+
+				v.w.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w0) * 4);
+				v.w.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w1) * 4);
+				v.w.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w2) * 4);
+				v.w.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w3) * 4);
+			}
+		}
+		break;
+	case STREAMTYPE_2_10_10_10_INT:
+		{
+			Int4 src;
+			src = Insert(src, *Pointer<Int>(source0), 0);
+			src = Insert(src, *Pointer<Int>(source1), 1);
+			src = Insert(src, *Pointer<Int>(source2), 2);
+			src = Insert(src, *Pointer<Int>(source3), 3);
+
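+			// Sign-extend each 10-bit field by shifting it to the top of the lane and arithmetic-shifting back down; the 2-bit w field is already at the top.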
+			v.x = Float4((src << 22) >> 22);
+			v.y = Float4((src << 12) >> 22);
+			v.z = Float4((src << 02) >> 22);
+			v.w = Float4(src >> 30);
+
+			if(stream.normalized)
+			{
+				v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
+				v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
+				v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
+				v.w = Max(v.w, Float4(-1.0f));
+			}
+		}
+		break;
+	case STREAMTYPE_2_10_10_10_UINT:
+		{
+			Int4 src;
+			src = Insert(src, *Pointer<Int>(source0), 0);
+			src = Insert(src, *Pointer<Int>(source1), 1);
+			src = Insert(src, *Pointer<Int>(source2), 2);
+			src = Insert(src, *Pointer<Int>(source3), 3);
+
+			v.x = Float4(src & Int4(0x3FF));
+			v.y = Float4((src >> 10) & Int4(0x3FF));
+			v.z = Float4((src >> 20) & Int4(0x3FF));
+			v.w = Float4((src >> 30) & Int4(0x3));
+
+			if(stream.normalized)
+			{
+				v.x *= Float4(1.0f / 0x3FF);
+				v.y *= Float4(1.0f / 0x3FF);
+				v.z *= Float4(1.0f / 0x3FF);
+				v.w *= Float4(1.0f / 0x3);
+			}
+		}
+		break;
+	default:
+		UNSUPPORTED("stream.type %d", int(stream.type));
+	}
+
+	if(stream.count < 1) v.x = Float4(0.0f);
+	if(stream.count < 2) v.y = Float4(0.0f);
+	if(stream.count < 3) v.z = Float4(0.0f);
+	if(stream.count < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(1));
+
+	return v;
+}
+
+void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch)
+{
+	UInt index0 = batch[0];
+	UInt index1 = batch[1];
+	UInt index2 = batch[2];
+	UInt index3 = batch[3];
+
+	UInt cacheIndex0 = index0 & VertexCache::TAG_MASK;
+	UInt cacheIndex1 = index1 & VertexCache::TAG_MASK;
+	UInt cacheIndex2 = index2 & VertexCache::TAG_MASK;
+	UInt cacheIndex3 = index3 & VertexCache::TAG_MASK;
+
+	// We processed a SIMD group of vertices, with the first one being the one that missed the cache tag check.
+	// Write them out in reverse order here and below to ensure the first one is now guaranteed to be in the cache.
+	tagCache[cacheIndex3] = index3;
+	tagCache[cacheIndex2] = index2;
+	tagCache[cacheIndex1] = index1;
+	tagCache[cacheIndex0] = index0;
+
+	auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
+	assert(it != spirvShader->outputBuiltins.end());
+	assert(it->second.SizeInComponents == 4);
+	auto &position = routine.getVariable(it->second.Id);
+
+	Vector4f pos;
+	pos.x = position[it->second.FirstComponent + 0];
+	pos.y = position[it->second.FirstComponent + 1];
+	pos.z = position[it->second.FirstComponent + 2];
+	pos.w = position[it->second.FirstComponent + 3];
+
+	// Projection and viewport transform.
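+	// Bitwise select: where w compares equal to zero, OR in the bit pattern of 1.0f so the reciprocal below stays finite.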
+	Float4 w = As<Float4>(As<Int4>(pos.w) | (As<Int4>(CmpEQ(pos.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
+	Float4 rhw = Float4(1.0f) / w;
+
+	Vector4f proj;
+	proj.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0xF)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,WxF))));
+	proj.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0xF)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,HxF))));
+	proj.z = pos.z * rhw;
+	proj.w = rhw;
+
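+	// Transpose from one register per component to one register per vertex before storing the cache entries.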
+	transpose4x4(pos.x, pos.y, pos.z, pos.w);
+
+	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,position), 16) = pos.w;
+	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,position), 16) = pos.z;
+	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,position), 16) = pos.y;
+	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,position), 16) = pos.x;
+
+	it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
+	if(it != spirvShader->outputBuiltins.end())
+	{
+		ASSERT(it->second.SizeInComponents == 1);
+		auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent];
+
+		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,pointSize)) = Extract(psize, 3);
+		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,pointSize)) = Extract(psize, 2);
+		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,pointSize)) = Extract(psize, 1);
+		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,pointSize)) = Extract(psize, 0);
+	}
+
+	it = spirvShader->outputBuiltins.find(spv::BuiltInClipDistance);
+	if(it != spirvShader->outputBuiltins.end())
+	{
+		auto count = spirvShader->getNumOutputClipDistances();
+		for(unsigned int i = 0; i < count; i++)
+		{
+			auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,clipDistance[i])) = Extract(dist, 3);
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,clipDistance[i])) = Extract(dist, 2);
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,clipDistance[i])) = Extract(dist, 1);
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,clipDistance[i])) = Extract(dist, 0);
+		}
+	}
+
+	it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
+	if(it != spirvShader->outputBuiltins.end())
+	{
+		auto count = spirvShader->getNumOutputCullDistances();
+		for(unsigned int i = 0; i < count; i++)
+		{
+			auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,cullDistance[i])) = Extract(dist, 3);
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,cullDistance[i])) = Extract(dist, 2);
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,cullDistance[i])) = Extract(dist, 1);
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,cullDistance[i])) = Extract(dist, 0);
+		}
+	}
+
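+	// clipFlags packs one byte of flags per lane, with lane 0 in the least significant byte; unpack them into the cache entries.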
+	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 24) & 0x0000000FF;
+	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 16) & 0x0000000FF;
+	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 8)  & 0x0000000FF;
+	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 0)  & 0x0000000FF;
+
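+	// cullMask holds one bit per lane; negation expands each bit into a full 0 or ~0 integer mask.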
+	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,cullMask)) = -((cullMask >> 3) & 1);
+	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,cullMask)) = -((cullMask >> 2) & 1);
+	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,cullMask)) = -((cullMask >> 1) & 1);
+	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,cullMask)) = -((cullMask >> 0) & 1);
+
+	transpose4x4(proj.x, proj.y, proj.z, proj.w);
+
+	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,projected), 16) = proj.w;
+	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,projected), 16) = proj.z;
+	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,projected), 16) = proj.y;
+	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,projected), 16) = proj.x;
+
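+	// Store the shader's output interface components, transposed from SoA registers into the AoS vertex layout.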
+	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
+	{
+		if(spirvShader->outputs[i + 0].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
+		   spirvShader->outputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
+		   spirvShader->outputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
+		   spirvShader->outputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
+		{
+			Vector4f v;
+			v.x = routine.outputs[i + 0];
+			v.y = routine.outputs[i + 1];
+			v.z = routine.outputs[i + 2];
+			v.w = routine.outputs[i + 3];
+
+			transpose4x4(v.x, v.y, v.z, v.w);
+
+			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,v[i]), 16) = v.w;
+			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,v[i]), 16) = v.z;
+			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,v[i]), 16) = v.y;
+			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,v[i]), 16) = v.x;
+		}
+	}
+}
+
+void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry)
+{
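+	// Copy a single processed vertex from its cache entry into the output 'vertex' buffer.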
+	*Pointer<Int4>(vertex + OFFSET(Vertex,position)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex,position));
+	*Pointer<Int>(vertex + OFFSET(Vertex,pointSize)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex,pointSize));
+
+	*Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex,clipFlags));
+	*Pointer<Int>(vertex + OFFSET(Vertex,cullMask)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex,cullMask));
+	*Pointer<Int4>(vertex + OFFSET(Vertex,projected)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex,projected));
+
+	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
+	{
+		if(spirvShader->outputs[i].Type != SpirvShader::ATTRIBTYPE_UNUSED)
+		{
+			*Pointer<Int>(vertex + OFFSET(Vertex, v[i]), 4) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, v[i]), 4);
+		}
+	}
+	for(unsigned int i = 0; i < spirvShader->getNumOutputClipDistances(); i++)
+	{
+		*Pointer<Float>(vertex + OFFSET(Vertex, clipDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, clipDistance[i]), 4);
+	}
+	for(unsigned int i = 0; i < spirvShader->getNumOutputCullDistances(); i++)
+	{
+		*Pointer<Float>(vertex + OFFSET(Vertex, cullDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, cullDistance[i]), 4);
+	}
+}
+
+}  // namespace sw
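writeCache() above finishes by converting the shader's structure-of-arrays
registers into array-of-structures cache entries: transpose4x4() takes four
Float4s that each hold one component (x, y, z or w) for four vertices and
rearranges them so that each Float4 holds the four components of a single
vertex, ready to be stored at its cache slot. A plain C++ sketch of that
transpose (illustrative only, not Reactor code):

	// In-place transpose of a 4x4 matrix: rows in = one component across
	// four vertices (SoA); rows out = one vertex's four components (AoS).
	void transpose4x4(float m[4][4])
	{
		for (int i = 0; i < 4; i++)
		{
			for (int j = i + 1; j < 4; j++)
			{
				float t = m[i][j];
				m[i][j] = m[j][i];
				m[j][i] = t;
			}
		}
	}
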
diff --git a/src/Pipeline/VertexRoutine.hpp b/src/Pipeline/VertexRoutine.hpp
index 99e7191..ab458b4 100644
--- a/src/Pipeline/VertexRoutine.hpp
+++ b/src/Pipeline/VertexRoutine.hpp
@@ -20,61 +20,59 @@
 #include "Device/Color.hpp"
 #include "Device/VertexProcessor.hpp"
 
-namespace vk
+namespace vk { class PipelineLayout; }
+
+namespace sw {
+
+class VertexRoutinePrototype : public VertexRoutineFunction
 {
-	class PipelineLayout;
-} // namespace vk
+public:
+	VertexRoutinePrototype() : vertex(Arg<0>()), batch(Arg<1>()), task(Arg<2>()), data(Arg<3>()) {}
+	virtual ~VertexRoutinePrototype() {}
 
-namespace sw
+protected:
+	Pointer<Byte> vertex;
+	Pointer<UInt> batch;
+	Pointer<Byte> task;
+	Pointer<Byte> data;
+};
+
+class VertexRoutine : public VertexRoutinePrototype
 {
-	class VertexRoutinePrototype : public VertexRoutineFunction
-	{
-	public:
-		VertexRoutinePrototype() : vertex(Arg<0>()), batch(Arg<1>()), task(Arg<2>()), data(Arg<3>()) {}
-		virtual ~VertexRoutinePrototype() {}
+public:
+	VertexRoutine(
+		const VertexProcessor::State &state,
+		vk::PipelineLayout const *pipelineLayout,
+		SpirvShader const *spirvShader);
+	virtual ~VertexRoutine();
 
-	protected:
-		Pointer<Byte> vertex;
-		Pointer<UInt> batch;
-		Pointer<Byte> task;
-		Pointer<Byte> data;
-	};
+	void generate();
 
-	class VertexRoutine : public VertexRoutinePrototype
-	{
-	public:
-		VertexRoutine(
-			const VertexProcessor::State &state,
-			vk::PipelineLayout const *pipelineLayout,
-			SpirvShader const *spirvShader);
-		virtual ~VertexRoutine();
+protected:
+	Pointer<Byte> constants;
 
-		void generate();
+	Int clipFlags;
+	Int cullMask;
 
-	protected:
-		Pointer<Byte> constants;
+	SpirvRoutine routine;
 
-		Int clipFlags;
-		Int cullMask;
+	const VertexProcessor::State &state;
+	SpirvShader const * const spirvShader;
 
-		SpirvRoutine routine;
+private:
+	virtual void program(Pointer<UInt> &batch, UInt& vertexCount) = 0;
 
-		const VertexProcessor::State &state;
-		SpirvShader const * const spirvShader;
+	typedef VertexProcessor::State::Input Stream;
 
-	private:
-		virtual void program(Pointer<UInt> &batch, UInt& vertexCount) = 0;
+	Vector4f readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch,
+	                    bool robustBufferAccess, UInt& robustnessSize, Int baseVertex);
+	void readInput(Pointer<UInt> &batch);
+	void computeClipFlags();
+	void computeCullMask();
+	void writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch);
+	void writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry);
+};
 
-		typedef VertexProcessor::State::Input Stream;
-
-		Vector4f readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch,
-		                    bool robustBufferAccess, UInt& robustnessSize, Int baseVertex);
-		void readInput(Pointer<UInt> &batch);
-		void computeClipFlags();
-		void computeCullMask();
-		void writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch);
-		void writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry);
-	};
-}
+}  // namespace sw
 
 #endif   // sw_VertexRoutine_hpp
diff --git a/src/Reactor/CPUID.cpp b/src/Reactor/CPUID.cpp
index 58ef009..f3b9024 100644
--- a/src/Reactor/CPUID.cpp
+++ b/src/Reactor/CPUID.cpp
@@ -27,201 +27,202 @@
 	#include <sys/types.h>
 #endif
 
-namespace rr
+namespace rr {
+
+bool CPUID::MMX = detectMMX();
+bool CPUID::CMOV = detectCMOV();
+bool CPUID::SSE = detectSSE();
+bool CPUID::SSE2 = detectSSE2();
+bool CPUID::SSE3 = detectSSE3();
+bool CPUID::SSSE3 = detectSSSE3();
+bool CPUID::SSE4_1 = detectSSE4_1();
+
+bool CPUID::enableMMX = true;
+bool CPUID::enableCMOV = true;
+bool CPUID::enableSSE = true;
+bool CPUID::enableSSE2 = true;
+bool CPUID::enableSSE3 = true;
+bool CPUID::enableSSSE3 = true;
+bool CPUID::enableSSE4_1 = true;
+
+void CPUID::setEnableMMX(bool enable)
 {
-	bool CPUID::MMX = detectMMX();
-	bool CPUID::CMOV = detectCMOV();
-	bool CPUID::SSE = detectSSE();
-	bool CPUID::SSE2 = detectSSE2();
-	bool CPUID::SSE3 = detectSSE3();
-	bool CPUID::SSSE3 = detectSSSE3();
-	bool CPUID::SSE4_1 = detectSSE4_1();
+	enableMMX = enable;
 
-	bool CPUID::enableMMX = true;
-	bool CPUID::enableCMOV = true;
-	bool CPUID::enableSSE = true;
-	bool CPUID::enableSSE2 = true;
-	bool CPUID::enableSSE3 = true;
-	bool CPUID::enableSSSE3 = true;
-	bool CPUID::enableSSE4_1 = true;
-
-	void CPUID::setEnableMMX(bool enable)
+	if(!enableMMX)
 	{
-		enableMMX = enable;
-
-		if(!enableMMX)
-		{
-			enableSSE = false;
-			enableSSE2 = false;
-			enableSSE3 = false;
-			enableSSSE3 = false;
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableCMOV(bool enable)
-	{
-		enableCMOV = enable;
-
-		if(!CMOV)
-		{
-			enableSSE = false;
-			enableSSE2 = false;
-			enableSSE3 = false;
-			enableSSSE3 = false;
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableSSE(bool enable)
-	{
-		enableSSE = enable;
-
-		if(enableSSE)
-		{
-			enableMMX = true;
-			enableCMOV = true;
-		}
-		else
-		{
-			enableSSE2 = false;
-			enableSSE3 = false;
-			enableSSSE3 = false;
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableSSE2(bool enable)
-	{
-		enableSSE2 = enable;
-
-		if(enableSSE2)
-		{
-			enableMMX = true;
-			enableCMOV = true;
-			enableSSE = true;
-		}
-		else
-		{
-			enableSSE3 = false;
-			enableSSSE3 = false;
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableSSE3(bool enable)
-	{
-		enableSSE3 = enable;
-
-		if(enableSSE3)
-		{
-			enableMMX = true;
-			enableCMOV = true;
-			enableSSE = true;
-			enableSSE2 = true;
-		}
-		else
-		{
-			enableSSSE3 = false;
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableSSSE3(bool enable)
-	{
-		enableSSSE3 = enable;
-
-		if(enableSSSE3)
-		{
-			enableMMX = true;
-			enableCMOV = true;
-			enableSSE = true;
-			enableSSE2 = true;
-			enableSSE3 = true;
-		}
-		else
-		{
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableSSE4_1(bool enable)
-	{
-		enableSSE4_1 = enable;
-
-		if(enableSSE4_1)
-		{
-			enableMMX = true;
-			enableCMOV = true;
-			enableSSE = true;
-			enableSSE2 = true;
-			enableSSE3 = true;
-			enableSSSE3 = true;
-		}
-	}
-
-	static void cpuid(int registers[4], int info)
-	{
-		#if defined(__i386__) || defined(__x86_64__)
-			#if defined(_WIN32)
-				__cpuid(registers, info);
-			#else
-				__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
-			#endif
-		#else
-			registers[0] = 0;
-			registers[1] = 0;
-			registers[2] = 0;
-			registers[3] = 0;
-		#endif
-	}
-
-	bool CPUID::detectMMX()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return MMX = (registers[3] & 0x00800000) != 0;
-	}
-
-	bool CPUID::detectCMOV()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return CMOV = (registers[3] & 0x00008000) != 0;
-	}
-
-	bool CPUID::detectSSE()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return SSE = (registers[3] & 0x02000000) != 0;
-	}
-
-	bool CPUID::detectSSE2()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return SSE2 = (registers[3] & 0x04000000) != 0;
-	}
-
-	bool CPUID::detectSSE3()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return SSE3 = (registers[2] & 0x00000001) != 0;
-	}
-
-	bool CPUID::detectSSSE3()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return SSSE3 = (registers[2] & 0x00000200) != 0;
-	}
-
-	bool CPUID::detectSSE4_1()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return SSE4_1 = (registers[2] & 0x00080000) != 0;
+		enableSSE = false;
+		enableSSE2 = false;
+		enableSSE3 = false;
+		enableSSSE3 = false;
+		enableSSE4_1 = false;
 	}
 }
+
+void CPUID::setEnableCMOV(bool enable)
+{
+	enableCMOV = enable;
+
+	if(!CMOV)
+	{
+		enableSSE = false;
+		enableSSE2 = false;
+		enableSSE3 = false;
+		enableSSSE3 = false;
+		enableSSE4_1 = false;
+	}
+}
+
+void CPUID::setEnableSSE(bool enable)
+{
+	enableSSE = enable;
+
+	if(enableSSE)
+	{
+		enableMMX = true;
+		enableCMOV = true;
+	}
+	else
+	{
+		enableSSE2 = false;
+		enableSSE3 = false;
+		enableSSSE3 = false;
+		enableSSE4_1 = false;
+	}
+}
+
+void CPUID::setEnableSSE2(bool enable)
+{
+	enableSSE2 = enable;
+
+	if(enableSSE2)
+	{
+		enableMMX = true;
+		enableCMOV = true;
+		enableSSE = true;
+	}
+	else
+	{
+		enableSSE3 = false;
+		enableSSSE3 = false;
+		enableSSE4_1 = false;
+	}
+}
+
+void CPUID::setEnableSSE3(bool enable)
+{
+	enableSSE3 = enable;
+
+	if(enableSSE3)
+	{
+		enableMMX = true;
+		enableCMOV = true;
+		enableSSE = true;
+		enableSSE2 = true;
+	}
+	else
+	{
+		enableSSSE3 = false;
+		enableSSE4_1 = false;
+	}
+}
+
+void CPUID::setEnableSSSE3(bool enable)
+{
+	enableSSSE3 = enable;
+
+	if(enableSSSE3)
+	{
+		enableMMX = true;
+		enableCMOV = true;
+		enableSSE = true;
+		enableSSE2 = true;
+		enableSSE3 = true;
+	}
+	else
+	{
+		enableSSE4_1 = false;
+	}
+}
+
+void CPUID::setEnableSSE4_1(bool enable)
+{
+	enableSSE4_1 = enable;
+
+	if(enableSSE4_1)
+	{
+		enableMMX = true;
+		enableCMOV = true;
+		enableSSE = true;
+		enableSSE2 = true;
+		enableSSE3 = true;
+		enableSSSE3 = true;
+	}
+}
+
+static void cpuid(int registers[4], int info)
+{
+	#if defined(__i386__) || defined(__x86_64__)
+		#if defined(_WIN32)
+			__cpuid(registers, info);
+		#else
+			__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
+		#endif
+	#else
+		registers[0] = 0;
+		registers[1] = 0;
+		registers[2] = 0;
+		registers[3] = 0;
+	#endif
+}
+
+bool CPUID::detectMMX()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return MMX = (registers[3] & 0x00800000) != 0;
+}
+
+bool CPUID::detectCMOV()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return CMOV = (registers[3] & 0x00008000) != 0;
+}
+
+bool CPUID::detectSSE()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return SSE = (registers[3] & 0x02000000) != 0;
+}
+
+bool CPUID::detectSSE2()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return SSE2 = (registers[3] & 0x04000000) != 0;
+}
+
+bool CPUID::detectSSE3()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return SSE3 = (registers[2] & 0x00000001) != 0;
+}
+
+bool CPUID::detectSSSE3()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return SSSE3 = (registers[2] & 0x00000200) != 0;
+}
+
+bool CPUID::detectSSE4_1()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return SSE4_1 = (registers[2] & 0x00080000) != 0;
+}
+
+}  // namespace rr
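The detect functions above read the standard CPUID leaf-1 feature bits
(EDX bit 23 = MMX, bit 15 = CMOV, bit 25 = SSE, bit 26 = SSE2; ECX bit 0 =
SSE3, bit 9 = SSSE3, bit 19 = SSE4.1), while the setEnable* methods keep the
enable flags consistent with the ISA dependency chain: enabling a newer
extension pulls in everything it builds on, and disabling an older one
cascades the disable upward. One preexisting quirk is preserved by this
style-only change: setEnableCMOV() cascades based on the detected CMOV flag
rather than on enableCMOV, unlike the other setters. An illustrative call
sequence (not part of this change):

	rr::CPUID::setEnableSSE4_1(true);   // also enables MMX, CMOV, and SSE through SSSE3
	rr::CPUID::setEnableMMX(false);     // cascades: every SSE level is disabled again
	bool sse2 = rr::CPUID::supportsSSE2();   // detected by cpuid AND enabled -> false here
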
diff --git a/src/Reactor/CPUID.hpp b/src/Reactor/CPUID.hpp
index 108d4a7..577e237 100644
--- a/src/Reactor/CPUID.hpp
+++ b/src/Reactor/CPUID.hpp
@@ -15,104 +15,108 @@
 #ifndef rr_CPUID_hpp
 #define rr_CPUID_hpp
 
-namespace rr
+namespace rr {
+
+#if !defined(__i386__) && defined(_M_IX86)
+	#define __i386__ 1
+#endif
+
+#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
+	#define __x86_64__ 1
+#endif
+
+class CPUID
 {
-	#if !defined(__i386__) && defined(_M_IX86)
-		#define __i386__ 1
-	#endif
+public:
+	static bool supportsMMX();
+	static bool supportsCMOV();
+	static bool supportsMMX2();   // MMX instructions added by SSE: pshufw, pmulhuw, pmovmskb, pavgw/b, pextrw, pinsrw, pmaxsw/ub, etc.
+	static bool supportsSSE();
+	static bool supportsSSE2();
+	static bool supportsSSE3();
+	static bool supportsSSSE3();
+	static bool supportsSSE4_1();
 
-	#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
-		#define __x86_64__ 1
-	#endif
+	static void setEnableMMX(bool enable);
+	static void setEnableCMOV(bool enable);
+	static void setEnableSSE(bool enable);
+	static void setEnableSSE2(bool enable);
+	static void setEnableSSE3(bool enable);
+	static void setEnableSSSE3(bool enable);
+	static void setEnableSSE4_1(bool enable);
 
-	class CPUID
-	{
-	public:
-		static bool supportsMMX();
-		static bool supportsCMOV();
-		static bool supportsMMX2();   // MMX instructions added by SSE: pshufw, pmulhuw, pmovmskb, pavgw/b, pextrw, pinsrw, pmaxsw/ub, etc.
-		static bool supportsSSE();
-		static bool supportsSSE2();
-		static bool supportsSSE3();
-		static bool supportsSSSE3();
-		static bool supportsSSE4_1();
+private:
+	static bool MMX;
+	static bool CMOV;
+	static bool SSE;
+	static bool SSE2;
+	static bool SSE3;
+	static bool SSSE3;
+	static bool SSE4_1;
 
-		static void setEnableMMX(bool enable);
-		static void setEnableCMOV(bool enable);
-		static void setEnableSSE(bool enable);
-		static void setEnableSSE2(bool enable);
-		static void setEnableSSE3(bool enable);
-		static void setEnableSSSE3(bool enable);
-		static void setEnableSSE4_1(bool enable);
+	static bool enableMMX;
+	static bool enableCMOV;
+	static bool enableSSE;
+	static bool enableSSE2;
+	static bool enableSSE3;
+	static bool enableSSSE3;
+	static bool enableSSE4_1;
 
-	private:
-		static bool MMX;
-		static bool CMOV;
-		static bool SSE;
-		static bool SSE2;
-		static bool SSE3;
-		static bool SSSE3;
-		static bool SSE4_1;
+	static bool detectMMX();
+	static bool detectCMOV();
+	static bool detectSSE();
+	static bool detectSSE2();
+	static bool detectSSE3();
+	static bool detectSSSE3();
+	static bool detectSSE4_1();
+};
 
-		static bool enableMMX;
-		static bool enableCMOV;
-		static bool enableSSE;
-		static bool enableSSE2;
-		static bool enableSSE3;
-		static bool enableSSSE3;
-		static bool enableSSE4_1;
+}  // namespace rr
 
-		static bool detectMMX();
-		static bool detectCMOV();
-		static bool detectSSE();
-		static bool detectSSE2();
-		static bool detectSSE3();
-		static bool detectSSSE3();
-		static bool detectSSE4_1();
-	};
+/* Inline implementation */
+
+namespace rr {
+
+inline bool CPUID::supportsMMX()
+{
+	return MMX && enableMMX;
 }
 
-namespace rr
+inline bool CPUID::supportsCMOV()
 {
-	inline bool CPUID::supportsMMX()
-	{
-		return MMX && enableMMX;
-	}
-
-	inline bool CPUID::supportsCMOV()
-	{
-		return CMOV && enableCMOV;
-	}
-
-	inline bool CPUID::supportsMMX2()
-	{
-		return supportsSSE();   // Coincides with 64-bit integer vector instructions supported by SSE
-	}
-
-	inline bool CPUID::supportsSSE()
-	{
-		return SSE && enableSSE;
-	}
-
-	inline bool CPUID::supportsSSE2()
-	{
-		return SSE2 && enableSSE2;
-	}
-
-	inline bool CPUID::supportsSSE3()
-	{
-		return SSE3 && enableSSE3;
-	}
-
-	inline bool CPUID::supportsSSSE3()
-	{
-		return SSSE3 && enableSSSE3;
-	}
-
-	inline bool CPUID::supportsSSE4_1()
-	{
-		return SSE4_1 && enableSSE4_1;
-	}
+	return CMOV && enableCMOV;
 }
 
+inline bool CPUID::supportsMMX2()
+{
+	return supportsSSE();   // Coincides with 64-bit integer vector instructions supported by SSE
+}
+
+inline bool CPUID::supportsSSE()
+{
+	return SSE && enableSSE;
+}
+
+inline bool CPUID::supportsSSE2()
+{
+	return SSE2 && enableSSE2;
+}
+
+inline bool CPUID::supportsSSE3()
+{
+	return SSE3 && enableSSE3;
+}
+
+inline bool CPUID::supportsSSSE3()
+{
+	return SSSE3 && enableSSSE3;
+}
+
+inline bool CPUID::supportsSSE4_1()
+{
+	return SSE4_1 && enableSSE4_1;
+}
+
+}  // namespace rr
+
 #endif   // rr_CPUID_hpp
diff --git a/src/Reactor/Coroutine.hpp b/src/Reactor/Coroutine.hpp
index 6bf7089..211d68b 100644
--- a/src/Reactor/Coroutine.hpp
+++ b/src/Reactor/Coroutine.hpp
@@ -19,180 +19,180 @@
 #ifndef rr_ReactorCoroutine_hpp
 #define rr_ReactorCoroutine_hpp
 
-namespace rr
+namespace rr {
+
+// Base class for the template Stream<T>
+class StreamBase
 {
-	// Base class for the template Stream<T>
-	class StreamBase
+protected:
+	StreamBase(const std::shared_ptr<Routine> &routine, Nucleus::CoroutineHandle handle)
+		: routine(routine), handle(handle) {}
+
+	~StreamBase()
 	{
-	protected:
-		StreamBase(const std::shared_ptr<Routine> &routine, Nucleus::CoroutineHandle handle)
-			: routine(routine), handle(handle) {}
+		auto pfn = (Nucleus::CoroutineDestroy*)routine->getEntry(Nucleus::CoroutineEntryDestroy);
+		pfn(handle);
+	}
 
-		~StreamBase()
-		{
-			auto pfn = (Nucleus::CoroutineDestroy*)routine->getEntry(Nucleus::CoroutineEntryDestroy);
-			pfn(handle);
-		}
-
-		bool await(void* out)
-		{
-			auto pfn = (Nucleus::CoroutineAwait*)routine->getEntry(Nucleus::CoroutineEntryAwait);
-			return pfn(handle, out);
-		}
+	bool await(void* out)
+	{
+		auto pfn = (Nucleus::CoroutineAwait*)routine->getEntry(Nucleus::CoroutineEntryAwait);
+		return pfn(handle, out);
+	}
 
 private:
-		std::shared_ptr<Routine> routine;
-		Nucleus::CoroutineHandle handle;
-	};
+	std::shared_ptr<Routine> routine;
+	Nucleus::CoroutineHandle handle;
+};
 
-	// Stream is the interface to a running Coroutine instance.
-	// A Coroutine may Yield() values of type T, which can be retrieved with
-	// await().
-	template<typename T>
-	class Stream : public StreamBase
+// Stream is the interface to a running Coroutine instance.
+// A Coroutine may Yield() values of type T, which can be retrieved with
+// await().
+template<typename T>
+class Stream : public StreamBase
+{
+public:
+	inline Stream(const std::shared_ptr<Routine> &routine, Nucleus::CoroutineHandle handle)
+		: StreamBase(routine, handle) {}
+
+	// await() retrieves the next yielded value from the coroutine.
+	// Returns true if the coroutine yielded a value and out was assigned a
+	// new value. If await() returns false, the coroutine has finished
+	// execution and await() will return false for all future calls.
+	inline bool await(T& out) { return StreamBase::await(&out); }
+};
+
+template<typename FunctionType>
+class Coroutine;
+
+// Coroutine constructs a reactor Coroutine function.
+// rr::Coroutine is similar to rr::Function in that it builds a new
+// executable function, but Coroutines have the following differences:
+//  (1) Coroutines do not support Return() statements.
+//  (2) Coroutines support Yield() statements to suspend execution of the
+//      coroutine and pass a value up to the caller. Yield can be called
+//      multiple times in a single execution of a coroutine.
+//  (3) The template argument T to Coroutine<T> is a C-style function
+//      signature.
+//  (4) Coroutine::operator() returns an rr::Stream<T> instead of an
+//      rr::Routine.
+//  (5) operator() starts execution of the coroutine immediately.
+//  (6) operator() uses the Coroutine's template function signature to
+//      ensure the argument types match the generated function signature.
+//
+// Example usage:
+//
+//   // Build the coroutine function
+//   Coroutine<int()> coroutine;
+//   {
+//       Yield(Int(0));
+//       Yield(Int(1));
+//       Int current = 1;
+//       Int next = 1;
+//       While (true) {
+//           Yield(next);
+//           auto tmp = current + next;
+//           current = next;
+//           next = tmp;
+//       }
+//   }
+//
+//   // Start the execution of the coroutine.
+//   auto s = coroutine();
+//
+//   // Grab the first 20 yielded values and print them.
+//   for (int i = 0; i < 20; i++)
+//   {
+//       int val = 0;
+//       s->await(val);
+//       printf("Fibonacci(%d): %d", i, val);
+//   }
+//
+template<typename Return, typename... Arguments>
+class Coroutine<Return(Arguments...)>
+{
+public:
+	Coroutine();
+
+	template<int index>
+	using CArgumentType = typename std::tuple_element<index, std::tuple<Arguments...>>::type;
+
+	template<int index>
+	using RArgumentType = CToReactorT<CArgumentType<index>>;
+
+	// Return the argument value with the given index.
+	template<int index>
+	Argument<RArgumentType<index>> Arg() const
 	{
-	public:
-		inline Stream(const std::shared_ptr<Routine> &routine, Nucleus::CoroutineHandle handle)
-			: StreamBase(routine, handle) {}
-
-		// await() retrieves the next yielded value from the coroutine.
-		// Returns true if the coroutine yieled a value and out was assigned a
-		// new value. If await() returns false, the coroutine has finished
-		// execution and await() will return false for all future calls.
-		inline bool await(T& out) { return StreamBase::await(&out); }
-	};
-
-	template<typename FunctionType>
-	class Coroutine;
-
-	// Coroutine constructs a reactor Coroutine function.
-	// rr::Coroutine is similar to rr::Function in that it builds a new
-	// executable function, but Coroutines have the following differences:
-	//  (1) Coroutines do not support Return() statements.
-	//  (2) Coroutines support Yield() statements to suspend execution of the
-	//      coroutine and pass a value up to the caller. Yield can be called
-	//      multiple times in a single execution of a coroutine.
-	//  (3) The template argument T to Coroutine<T> is a C-style function
-	//      signature.
-	//  (4) Coroutine::operator() returns a rr::Stream<T> instead of an
-	//      rr::Routine.
-	//  (5) operator() starts execution of the coroutine immediately.
-	//  (6) operator() uses the Coroutine's template function signature to
-	//      ensure the argument types match the generated function signature.
-	//
-	// Example usage:
-	//
-	//   // Build the coroutine function
-	//   Coroutine<int()> coroutine;
-	//   {
-	//       Yield(Int(0));
-	//       Yield(Int(1));
-	//       Int current = 1;
-	//       Int next = 1;
-	//       While (true) {
-	//           Yield(next);
-	//           auto tmp = current + next;
-	//           current = next;
-	//           next = tmp;
-	//       }
-	//   }
-	//
-	//   // Start the execution of the coroutine.
-	//   auto s = coroutine();
-	//
-	//   // Grab the first 20 yielded values and print them.
-	//   for (int i = 0; i < 20; i++)
-	//   {
-	//       int val = 0;
-	//       s->await(val);
-	//       printf("Fibonacci(%d): %d", i, val);
-	//   }
-	//
-	template<typename Return, typename... Arguments>
-	class Coroutine<Return(Arguments...)>
-	{
-	public:
-		Coroutine();
-
-		template<int index>
-		using CArgumentType = typename std::tuple_element<index, std::tuple<Arguments...>>::type;
-
-		template<int index>
-		using RArgumentType = CToReactorT<CArgumentType<index>>;
-
-		// Return the argument value with the given index.
-		template<int index>
-		Argument<RArgumentType<index>> Arg() const
-		{
-			Value *arg = Nucleus::getArgument(index);
-			return Argument<RArgumentType<index>>(arg);
-		}
-
-		// Completes building of the coroutine and generates the coroutine's
-		// executable code. After calling, no more reactor functions may be
-		// called without building a new rr::Function or rr::Coroutine.
-		// While automatically called by operator(), finalize() should be called
-		// as early as possible to release the global Reactor mutex lock.
-		inline void finalize(const Config::Edit &cfg = Config::Edit::None);
-
-		// Starts execution of the coroutine and returns a unique_ptr to a
-		// Stream<> that exposes the await() function for obtaining yielded
-		// values.
-		std::unique_ptr<Stream<Return>> operator()(Arguments...);
-
-	protected:
-		std::unique_ptr<Nucleus> core;
-		std::shared_ptr<Routine> routine;
-		std::vector<Type*> arguments;
-	};
-
-	template<typename Return, typename... Arguments>
-	Coroutine<Return(Arguments...)>::Coroutine()
-	{
-		core.reset(new Nucleus());
-
-		std::vector<Type*> types = {CToReactorT<Arguments>::getType()...};
-		for(auto type : types)
-		{
-			if(type != Void::getType())
-			{
-				arguments.push_back(type);
-			}
-		}
-
-		Nucleus::createCoroutine(CToReactorT<Return>::getType(), arguments);
+		Value *arg = Nucleus::getArgument(index);
+		return Argument<RArgumentType<index>>(arg);
 	}
 
-	template<typename Return, typename... Arguments>
-	void Coroutine<Return(Arguments...)>::finalize(const Config::Edit &cfg /* = Config::Edit::None */)
+	// Completes building of the coroutine and generates the coroutine's
+	// executable code. After calling, no more reactor functions may be
+	// called without building a new rr::Function or rr::Coroutine.
+	// While automatically called by operator(), finalize() should be called
+	// as early as possible to release the global Reactor mutex lock.
+	inline void finalize(const Config::Edit &cfg = Config::Edit::None);
+
+	// Starts execution of the coroutine and returns a unique_ptr to a
+	// Stream<> that exposes the await() function for obtaining yielded
+	// values.
+	std::unique_ptr<Stream<Return>> operator()(Arguments...);
+
+protected:
+	std::unique_ptr<Nucleus> core;
+	std::shared_ptr<Routine> routine;
+	std::vector<Type*> arguments;
+};
+
+template<typename Return, typename... Arguments>
+Coroutine<Return(Arguments...)>::Coroutine()
+{
+	core.reset(new Nucleus());
+
+	std::vector<Type*> types = {CToReactorT<Arguments>::getType()...};
+	for(auto type : types)
 	{
-		if(core != nullptr)
+		if(type != Void::getType())
 		{
-			routine = core->acquireCoroutine("coroutine", cfg);
-			core.reset(nullptr);
+			arguments.push_back(type);
 		}
 	}
 
-	template<typename Return, typename... Arguments>
-	std::unique_ptr<Stream<Return>>
-	Coroutine<Return(Arguments...)>::operator()(Arguments... args)
-	{
-		finalize();
+	Nucleus::createCoroutine(CToReactorT<Return>::getType(), arguments);
+}
 
-		using Sig = Nucleus::CoroutineBegin<Arguments...>;
-		auto pfn = (Sig*)routine->getEntry(Nucleus::CoroutineEntryBegin);
-		auto handle = pfn(args...);
-		return std::unique_ptr<Stream<Return>>(new Stream<Return>(routine, handle));
+template<typename Return, typename... Arguments>
+void Coroutine<Return(Arguments...)>::finalize(const Config::Edit &cfg /* = Config::Edit::None */)
+{
+	if(core != nullptr)
+	{
+		routine = core->acquireCoroutine("coroutine", cfg);
+		core.reset(nullptr);
 	}
+}
+
+template<typename Return, typename... Arguments>
+std::unique_ptr<Stream<Return>>
+Coroutine<Return(Arguments...)>::operator()(Arguments... args)
+{
+	finalize();
+
+	using Sig = Nucleus::CoroutineBegin<Arguments...>;
+	auto pfn = (Sig*)routine->getEntry(Nucleus::CoroutineEntryBegin);
+	auto handle = pfn(args...);
+	return std::unique_ptr<Stream<Return>>(new Stream<Return>(routine, handle));
+}
 
 #ifdef Yield // Defined in WinBase.h
 #undef Yield
 #endif
 
-	// Suspends execution of the coroutine and yields val to the caller.
-	// Execution of the coroutine will resume after val is retrieved.
-	template<typename T>
-	inline void Yield(const T &val) { Nucleus::yield(ValueOf(val)); }
+// Suspends execution of the coroutine and yields val to the caller.
+// Execution of the coroutine will resume after val is retrieved.
+template<typename T>
+inline void Yield(const T &val) { Nucleus::yield(ValueOf(val)); }
 
 } // namespace rr
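
Coroutine execution and teardown follow the Stream<T> above: operator()
begins execution immediately, await() pumps one yielded value at a time, and
the StreamBase destructor invokes the CoroutineEntryDestroy entry point. A
hedged consumer sketch, assuming the API above (countdown is hypothetical):

	Coroutine<int()> countdown;
	{
		For(Int i = 3, i > 0, i = i - 1)
		{
			Yield(i);
		}
	}

	auto stream = countdown();    // starts executing the coroutine
	int value = 0;
	while (stream->await(value))  // returns false once the coroutine finishes
	{
		printf("%d\n", value);    // prints 3, 2, 1
	}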
 
diff --git a/src/Reactor/Debug.cpp b/src/Reactor/Debug.cpp
index 7f0d2cd..df8e8fc 100644
--- a/src/Reactor/Debug.cpp
+++ b/src/Reactor/Debug.cpp
@@ -17,8 +17,7 @@
 #include <string>
 #include <stdarg.h>
 
-namespace rr
-{
+namespace rr {
 
 void tracev(const char *format, va_list args)
 {
@@ -71,4 +70,4 @@
 	::abort();
 }
 
-} // namespace rr
+}  // namespace rr
diff --git a/src/Reactor/Debug.hpp b/src/Reactor/Debug.hpp
index 929a927..da8a48d 100644
--- a/src/Reactor/Debug.hpp
+++ b/src/Reactor/Debug.hpp
@@ -31,19 +31,20 @@
 #define CHECK_PRINTF_ARGS
 #endif
 
-namespace rr
-{
-	// Outputs text to the debugging log
-	void trace(const char *format, ...) CHECK_PRINTF_ARGS;
-	inline void trace() {}
+namespace rr {
 
-	// Outputs text to the debugging log and prints to stderr.
-	void warn(const char *format, ...) CHECK_PRINTF_ARGS;
-	inline void warn() {}
+// Outputs text to the debugging log
+void trace(const char *format, ...) CHECK_PRINTF_ARGS;
+inline void trace() {}
 
-	// Outputs the message to the debugging log and stderr, and calls abort().
-	void abort(const char *format, ...) CHECK_PRINTF_ARGS;
-}
+// Outputs text to the debugging log and prints to stderr.
+void warn(const char *format, ...) CHECK_PRINTF_ARGS;
+inline void warn() {}
+
+// Outputs the message to the debugging log and stderr, and calls abort().
+void abort(const char *format, ...) CHECK_PRINTF_ARGS;
+
+}  // namespace rr
 
 // A macro to output a trace of a function call and its arguments to the
 // debugging log. Disabled if RR_DISABLE_TRACE is defined.
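
trace(), warn() and abort() above are printf-style variadic functions
(format strings are checked via CHECK_PRINTF_ARGS). Illustrative call sites
(hypothetical, not part of this change):

	rr::trace("emitting %d instructions", count);         // debug log only
	rr::warn("unsupported format %d; using fallback", f); // log + stderr
	rr::abort("unreachable case %d", v);                  // log + stderr, then ::abort()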
diff --git a/src/Reactor/EmulatedReactor.cpp b/src/Reactor/EmulatedReactor.cpp
index 8a06d6f..efdb5a3 100644
--- a/src/Reactor/EmulatedReactor.cpp
+++ b/src/Reactor/EmulatedReactor.cpp
@@ -4,210 +4,211 @@
 #include <functional>
 #include <utility>
 
-namespace rr
+namespace rr {
+namespace {
+
+template <typename T>
+struct UnderlyingType
 {
-	namespace
+	using Type = typename decltype(rr::Extract(std::declval<RValue<T>>(), 0))::rvalue_underlying_type;
+};
+
+template <typename T>
+using UnderlyingTypeT = typename UnderlyingType<T>::Type;
+
+// Call single arg function on a vector type
+template <typename Func, typename T>
+RValue<T> call4(Func func, const RValue<T>& x)
+{
+	T result;
+	result = Insert(result, Call(func, Extract(x, 0)), 0);
+	result = Insert(result, Call(func, Extract(x, 1)), 1);
+	result = Insert(result, Call(func, Extract(x, 2)), 2);
+	result = Insert(result, Call(func, Extract(x, 3)), 3);
+	return result;
+}
+
+// Call two arg function on a vector type
+template <typename Func, typename T>
+RValue<T> call4(Func func, const RValue<T>& x, const RValue<T>& y)
+{
+	T result;
+	result = Insert(result, Call(func, Extract(x, 0), Extract(y, 0)), 0);
+	result = Insert(result, Call(func, Extract(x, 1), Extract(y, 1)), 1);
+	result = Insert(result, Call(func, Extract(x, 2), Extract(y, 2)), 2);
+	result = Insert(result, Call(func, Extract(x, 3), Extract(y, 3)), 3);
+	return result;
+}
+
+template <typename T, typename EL = UnderlyingTypeT<T>>
+void gather(T& out, RValue<Pointer<EL>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes)
+{
+	constexpr bool atomic = false;
+	constexpr std::memory_order order = std::memory_order_relaxed;
+
+	Pointer<Byte> baseBytePtr = base;
+
+	out = T(0);
+	for (int i = 0; i < 4; i++)
 	{
-		template <typename T>
-		struct UnderlyingType
+		If(Extract(mask, i) != 0)
 		{
-			using Type = typename decltype(rr::Extract(std::declval<RValue<T>>(), 0))::rvalue_underlying_type;
-		};
-
-		template <typename T>
-		using UnderlyingTypeT = typename UnderlyingType<T>::Type;
-
-		// Call single arg function on a vector type
-		template <typename Func, typename T>
-		RValue<T> call4(Func func, const RValue<T>& x)
-		{
-			T result;
-			result = Insert(result, Call(func, Extract(x, 0)), 0);
-			result = Insert(result, Call(func, Extract(x, 1)), 1);
-			result = Insert(result, Call(func, Extract(x, 2)), 2);
-			result = Insert(result, Call(func, Extract(x, 3)), 3);
-			return result;
+			auto offset = Extract(offsets, i);
+			auto el = Load(Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
+			out = Insert(out, el, i);
 		}
-
-		// Call two arg function on a vector type
-		template <typename Func, typename T>
-		RValue<T> call4(Func func, const RValue<T>& x, const RValue<T>& y)
+		Else If(zeroMaskedLanes)
 		{
-			T result;
-			result = Insert(result, Call(func, Extract(x, 0), Extract(y, 0)), 0);
-			result = Insert(result, Call(func, Extract(x, 1), Extract(y, 1)), 1);
-			result = Insert(result, Call(func, Extract(x, 2), Extract(y, 2)), 2);
-			result = Insert(result, Call(func, Extract(x, 3), Extract(y, 3)), 3);
-			return result;
-		}
-
-		template <typename T, typename EL = UnderlyingTypeT<T>>
-		void gather(T& out, RValue<Pointer<EL>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes)
-		{
-			constexpr bool atomic = false;
-			constexpr std::memory_order order = std::memory_order_relaxed;
-
-			Pointer<Byte> baseBytePtr = base;
-
-			out = T(0);
-			for (int i = 0; i < 4; i++)
-			{
-				If(Extract(mask, i) != 0)
-				{
-					auto offset = Extract(offsets, i);
-					auto el = Load(Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
-					out = Insert(out, el, i);
-				}
-				Else If(zeroMaskedLanes)
-				{
-					out = Insert(out, EL(0), i);
-				}
-			}
-		}
-
-		template <typename T, typename EL = UnderlyingTypeT<T>>
-		void scatter(RValue<Pointer<EL>> base, RValue<T> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-		{
-			constexpr bool atomic = false;
-			constexpr std::memory_order order = std::memory_order_relaxed;
-
-			Pointer<Byte> baseBytePtr = base;
-
-			for (int i = 0; i < 4; i++)
-			{
-				If(Extract(mask, i) != 0)
-				{
-					auto offset = Extract(offsets, i);
-					Store(Extract(val, i), Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
-				}
-			}
-		}
-	}
-
-	namespace emulated
-	{
-		RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-		{
-			Float4 result{};
-			gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
-			return result;
-		}
-
-		RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-		{
-			Int4 result{};
-			gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
-			return result;
-		}
-
-		void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-		{
-			scatter(base, val, offsets, mask, alignment);
-		}
-
-		void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-		{
-			scatter<Int4>(base, val, offsets, mask, alignment);
-		}
-
-		RValue<Float> Exp2(RValue<Float> x)
-		{
-			return Call(exp2f, x);
-		}
-
-		RValue<Float> Log2(RValue<Float> x)
-		{
-			return Call(log2f, x);
-		}
-
-		RValue<Float4> Sin(RValue<Float4> x)
-		{
-			return call4(sinf, x);
-		}
-
-		RValue<Float4> Cos(RValue<Float4> x)
-		{
-			return call4(cosf, x);
-		}
-
-		RValue<Float4> Tan(RValue<Float4> x)
-		{
-			return call4(tanf, x);
-		}
-
-		RValue<Float4> Asin(RValue<Float4> x)
-		{
-			return call4(asinf, x);
-		}
-
-		RValue<Float4> Acos(RValue<Float4> x)
-		{
-			return call4(acosf, x);
-		}
-
-		RValue<Float4> Atan(RValue<Float4> x)
-		{
-			return call4(atanf, x);
-		}
-
-		RValue<Float4> Sinh(RValue<Float4> x)
-		{
-			return call4(sinhf, x);
-		}
-
-		RValue<Float4> Cosh(RValue<Float4> x)
-		{
-			return call4(coshf, x);
-		}
-
-		RValue<Float4> Tanh(RValue<Float4> x)
-		{
-			return call4(tanhf, x);
-		}
-
-		RValue<Float4> Asinh(RValue<Float4> x)
-		{
-			return call4(asinhf, x);
-		}
-
-		RValue<Float4> Acosh(RValue<Float4> x)
-		{
-			return call4(acoshf, x);
-		}
-
-		RValue<Float4> Atanh(RValue<Float4> x)
-		{
-			return call4(atanhf, x);
-		}
-
-		RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
-		{
-			return call4(atan2f, x, y);
-		}
-
-		RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
-		{
-			return call4(powf, x, y);
-		}
-
-		RValue<Float4> Exp(RValue<Float4> x)
-		{
-			return call4(expf, x);
-		}
-
-		RValue<Float4> Log(RValue<Float4> x)
-		{
-			return call4(logf, x);
-		}
-
-		RValue<Float4> Exp2(RValue<Float4> x)
-		{
-			return call4(exp2f, x);
-		}
-
-		RValue<Float4> Log2(RValue<Float4> x)
-		{
-			return call4(log2f, x);
+			out = Insert(out, EL(0), i);
 		}
 	}
 }
+
+template <typename T, typename EL = UnderlyingTypeT<T>>
+void scatter(RValue<Pointer<EL>> base, RValue<T> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+{
+	constexpr bool atomic = false;
+	constexpr std::memory_order order = std::memory_order_relaxed;
+
+	Pointer<Byte> baseBytePtr = base;
+
+	for (int i = 0; i < 4; i++)
+	{
+		If(Extract(mask, i) != 0)
+		{
+			auto offset = Extract(offsets, i);
+			Store(Extract(val, i), Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
+		}
+	}
+}
+
+}  // anonymous namespace
+
+namespace emulated {
+
+RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+{
+	Float4 result{};
+	gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
+	return result;
+}
+
+RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+{
+	Int4 result{};
+	gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
+	return result;
+}
+
+void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+{
+	scatter(base, val, offsets, mask, alignment);
+}
+
+void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+{
+	scatter<Int4>(base, val, offsets, mask, alignment);
+}
+
+RValue<Float> Exp2(RValue<Float> x)
+{
+	return Call(exp2f, x);
+}
+
+RValue<Float> Log2(RValue<Float> x)
+{
+	return Call(log2f, x);
+}
+
+RValue<Float4> Sin(RValue<Float4> x)
+{
+	return call4(sinf, x);
+}
+
+RValue<Float4> Cos(RValue<Float4> x)
+{
+	return call4(cosf, x);
+}
+
+RValue<Float4> Tan(RValue<Float4> x)
+{
+	return call4(tanf, x);
+}
+
+RValue<Float4> Asin(RValue<Float4> x)
+{
+	return call4(asinf, x);
+}
+
+RValue<Float4> Acos(RValue<Float4> x)
+{
+	return call4(acosf, x);
+}
+
+RValue<Float4> Atan(RValue<Float4> x)
+{
+	return call4(atanf, x);
+}
+
+RValue<Float4> Sinh(RValue<Float4> x)
+{
+	return call4(sinhf, x);
+}
+
+RValue<Float4> Cosh(RValue<Float4> x)
+{
+	return call4(coshf, x);
+}
+
+RValue<Float4> Tanh(RValue<Float4> x)
+{
+	return call4(tanhf, x);
+}
+
+RValue<Float4> Asinh(RValue<Float4> x)
+{
+	return call4(asinhf, x);
+}
+
+RValue<Float4> Acosh(RValue<Float4> x)
+{
+	return call4(acoshf, x);
+}
+
+RValue<Float4> Atanh(RValue<Float4> x)
+{
+	return call4(atanhf, x);
+}
+
+RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
+{
+	return call4(atan2f, x, y);
+}
+
+RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
+{
+	return call4(powf, x, y);
+}
+
+RValue<Float4> Exp(RValue<Float4> x)
+{
+	return call4(expf, x);
+}
+
+RValue<Float4> Log(RValue<Float4> x)
+{
+	return call4(logf, x);
+}
+
+RValue<Float4> Exp2(RValue<Float4> x)
+{
+	return call4(exp2f, x);
+}
+
+RValue<Float4> Log2(RValue<Float4> x)
+{
+	return call4(log2f, x);
+}
+
+}  // namespace emulated
+}  // namespace rr
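Every emulated transcendental above follows the same scalarization pattern:
call4() Extracts each of the four lanes, Calls the corresponding libm
function, and Inserts the results back into a vector, while gather() and
scatter() perform masked per-lane loads and stores (with zeroMaskedLanes
optionally zeroing inactive lanes on load). Adding another operation is
mechanical; a hypothetical example, not part of this change:

	RValue<Float4> Cbrt(RValue<Float4> x)
	{
		return call4(cbrtf, x);   // per-lane call to the scalar cbrtf() from <math.h>
	}
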
diff --git a/src/Reactor/EmulatedReactor.hpp b/src/Reactor/EmulatedReactor.hpp
index dbdc198..10b9b2a 100644
--- a/src/Reactor/EmulatedReactor.hpp
+++ b/src/Reactor/EmulatedReactor.hpp
@@ -21,33 +21,33 @@
 // starting point for implementing a new backend, or for when adding
 // functionality to an existing backend is non-trivial.
 
-namespace rr
-{
-	namespace emulated
-	{
-		RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-		RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-		void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
-		void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
-		RValue<Float> Exp2(RValue<Float> x);
-		RValue<Float> Log2(RValue<Float> x);
-		RValue<Float4> Sin(RValue<Float4> x);
-		RValue<Float4> Cos(RValue<Float4> x);
-		RValue<Float4> Tan(RValue<Float4> x);
-		RValue<Float4> Asin(RValue<Float4> x);
-		RValue<Float4> Acos(RValue<Float4> x);
-		RValue<Float4> Atan(RValue<Float4> x);
-		RValue<Float4> Sinh(RValue<Float4> x);
-		RValue<Float4> Cosh(RValue<Float4> x);
-		RValue<Float4> Tanh(RValue<Float4> x);
-		RValue<Float4> Asinh(RValue<Float4> x);
-		RValue<Float4> Acosh(RValue<Float4> x);
-		RValue<Float4> Atanh(RValue<Float4> x);
-		RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y);
-		RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
-		RValue<Float4> Exp(RValue<Float4> x);
-		RValue<Float4> Log(RValue<Float4> x);
-		RValue<Float4> Exp2(RValue<Float4> x);
-		RValue<Float4> Log2(RValue<Float4> x);
-	}
-}
+namespace rr {
+namespace emulated {
+
+RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+RValue<Float> Exp2(RValue<Float> x);
+RValue<Float> Log2(RValue<Float> x);
+RValue<Float4> Sin(RValue<Float4> x);
+RValue<Float4> Cos(RValue<Float4> x);
+RValue<Float4> Tan(RValue<Float4> x);
+RValue<Float4> Asin(RValue<Float4> x);
+RValue<Float4> Acos(RValue<Float4> x);
+RValue<Float4> Atan(RValue<Float4> x);
+RValue<Float4> Sinh(RValue<Float4> x);
+RValue<Float4> Cosh(RValue<Float4> x);
+RValue<Float4> Tanh(RValue<Float4> x);
+RValue<Float4> Asinh(RValue<Float4> x);
+RValue<Float4> Acosh(RValue<Float4> x);
+RValue<Float4> Atanh(RValue<Float4> x);
+RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y);
+RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
+RValue<Float4> Exp(RValue<Float4> x);
+RValue<Float4> Log(RValue<Float4> x);
+RValue<Float4> Exp2(RValue<Float4> x);
+RValue<Float4> Log2(RValue<Float4> x);
+
+}  // namespace emulated
+}  // namespace rr
diff --git a/src/Reactor/ExecutableMemory.cpp b/src/Reactor/ExecutableMemory.cpp
index 15d5b39..40acb94 100644
--- a/src/Reactor/ExecutableMemory.cpp
+++ b/src/Reactor/ExecutableMemory.cpp
@@ -42,10 +42,9 @@
 #define __x86__
 #endif
 
-namespace rr
-{
-namespace
-{
+namespace rr {
+namespace {
+
 struct Allocation
 {
 //	size_t bytes;
@@ -301,4 +300,5 @@
 		deallocate(memory);
 	#endif
 }
-}
+
+}  // namespace rr
diff --git a/src/Reactor/ExecutableMemory.hpp b/src/Reactor/ExecutableMemory.hpp
index 95dac5e..4c1ef33 100644
--- a/src/Reactor/ExecutableMemory.hpp
+++ b/src/Reactor/ExecutableMemory.hpp
@@ -19,8 +19,8 @@
 #include <cstdint>
 #include <cstring>
 
-namespace rr
-{
+namespace rr {
+
 size_t memoryPageSize();
 
 void *allocateExecutable(size_t bytes);   // Allocates memory that can be made executable using markExecutable()
@@ -87,6 +87,7 @@
 private:
 	void *ptr;
 };
-}
+
+}  // namespace rr
 
 #endif   // rr_ExecutableMemory_hpp
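
allocateExecutable() and markExecutable() support the usual W^X JIT flow:
emit machine code into writable pages, then flip them to executable. A
hedged sketch, assuming markExecutable(void *, size_t) as implied by the
comment on allocateExecutable():

	void *code = rr::allocateExecutable(size);
	memcpy(code, buffer, size);       // write machine code while pages are writable
	rr::markExecutable(code, size);   // make the pages executable before calling into them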
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 9654292..64ee668 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -107,2288 +107,2289 @@
 	extern "C" void _chkstk();
 #endif
 
-namespace rr
+namespace rr {
+
+void* resolveExternalSymbol(const char*);
+
+}  // namespace rr
+
+namespace {
+
+// Default configuration settings. Must be accessed under mutex lock.
+std::mutex defaultConfigLock;
+rr::Config &defaultConfig()
 {
-	void* resolveExternalSymbol(const char*);
+	// This uses a static in a function to avoid the cost of a global static
+	// initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
+	static rr::Config config = rr::Config::Edit()
+		.add(rr::Optimization::Pass::ScalarReplAggregates)
+		.add(rr::Optimization::Pass::InstructionCombining)
+		.apply({});
+	return config;
 }
 
-namespace
+// Cache provides a simple, thread-safe key-value store.
+template <typename KEY, typename VALUE>
+class Cache
 {
-	// Default configuration settings. Must be accessed under mutex lock.
-	std::mutex defaultConfigLock;
-	rr::Config &defaultConfig()
+public:
+	Cache() = default;
+	Cache(const Cache& other);
+	VALUE getOrCreate(KEY key, std::function<VALUE()> create);
+private:
+	mutable std::mutex mutex; // mutable required for copy constructor.
+	std::unordered_map<KEY, VALUE> map;
+};
+
+template <typename KEY, typename VALUE>
+Cache<KEY, VALUE>::Cache(const Cache& other)
+{
+	std::unique_lock<std::mutex> lock(other.mutex);
+	map = other.map;
+}
+
+template <typename KEY, typename VALUE>
+VALUE Cache<KEY, VALUE>::getOrCreate(KEY key, std::function<VALUE()> create)
+{
+	std::unique_lock<std::mutex> lock(mutex);
+	auto it = map.find(key);
+	if (it != map.end())
 	{
-		// This uses a static in a function to avoid the cost of a global static
-		// initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
-		static rr::Config config = rr::Config::Edit()
-			.add(rr::Optimization::Pass::ScalarReplAggregates)
-			.add(rr::Optimization::Pass::InstructionCombining)
-			.apply({});
-		return config;
+		return it->second;
 	}
+	auto value = create();
+	map.emplace(key, value);
+	return value;
+}
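+
+// Illustrative use of Cache (hypothetical; not part of this change):
+//
+//   Cache<std::string, int> ids;
+//   int id = ids.getOrCreate("main", [] { return computeId(); });
+//
+// create() runs while the mutex is held, so a given key's value is
+// constructed at most once even under concurrent getOrCreate() calls.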
 
-	// Cache provides a simple, thread-safe key-value store.
-	template <typename KEY, typename VALUE>
-	class Cache
-	{
-	public:
-		Cache() = default;
-		Cache(const Cache& other);
-		VALUE getOrCreate(KEY key, std::function<VALUE()> create);
-	private:
-		mutable std::mutex mutex; // mutable required for copy constructor.
-		std::unordered_map<KEY, VALUE> map;
-	};
+// JITGlobals is a singleton that holds all the immutable machine specific
+// information for the host device.
+class JITGlobals
+{
+public:
+	using TargetMachineSPtr = std::shared_ptr<llvm::TargetMachine>;
 
-	template <typename KEY, typename VALUE>
-	Cache<KEY, VALUE>::Cache(const Cache& other)
-	{
-		std::unique_lock<std::mutex> lock(other.mutex);
-		map = other.map;
-	}
+	static JITGlobals * get();
 
-	template <typename KEY, typename VALUE>
-	VALUE Cache<KEY, VALUE>::getOrCreate(KEY key, std::function<VALUE()> create)
-	{
-		std::unique_lock<std::mutex> lock(mutex);
-		auto it = map.find(key);
-		if (it != map.end())
-		{
-			return it->second;
-		}
-		auto value = create();
-		map.emplace(key, value);
-		return value;
-	}
+	const std::string mcpu;
+	const std::vector<std::string> mattrs;
+	const char* const march;
+	const llvm::TargetOptions targetOptions;
+	const llvm::DataLayout dataLayout;
 
-	// JITGlobals is a singleton that holds all the immutable machine specific
-	// information for the host device.
-	class JITGlobals
-	{
-	public:
-		using TargetMachineSPtr = std::shared_ptr<llvm::TargetMachine>;
+	TargetMachineSPtr getTargetMachine(rr::Optimization::Level optlevel);
 
-		static JITGlobals * get();
+private:
+	static JITGlobals create();
+	static llvm::CodeGenOpt::Level toLLVM(rr::Optimization::Level level);
+	JITGlobals(const char *mcpu,
+	           const std::vector<std::string> &mattrs,
+	           const char *march,
+	           const llvm::TargetOptions &targetOptions,
+	           const llvm::DataLayout &dataLayout);
+	JITGlobals(const JITGlobals&) = default;
 
-		const std::string mcpu;
-		const std::vector<std::string> mattrs;
-		const char* const march;
-		const llvm::TargetOptions targetOptions;
-		const llvm::DataLayout dataLayout;
+	// The cache key here is actually an rr::Optimization::Level. We use int
+	// as 'enum class' types do not provide builtin hash functions until
+	// C++14. See: https://stackoverflow.com/a/29618545.
+	Cache<int, TargetMachineSPtr> targetMachines;
+};
 
-		TargetMachineSPtr getTargetMachine(rr::Optimization::Level optlevel);
+JITGlobals * JITGlobals::get()
+{
+	static JITGlobals instance = create();
+	return &instance;
+}
 
-	private:
-		static JITGlobals create();
-		static llvm::CodeGenOpt::Level toLLVM(rr::Optimization::Level level);
-		JITGlobals(const char *mcpu,
-		           const std::vector<std::string> &mattrs,
-		           const char *march,
-		           const llvm::TargetOptions &targetOptions,
-		           const llvm::DataLayout &dataLayout);
-		JITGlobals(const JITGlobals&) = default;
-
-		// The cache key here is actually a rr::Optimization::Level. We use int
-		// as 'enum class' types do not provide builtin hash functions until
-		// C++14. See: https://stackoverflow.com/a/29618545.
-		Cache<int, TargetMachineSPtr> targetMachines;
-	};
-
-	JITGlobals * JITGlobals::get()
-	{
-		static JITGlobals instance = create();
-		return &instance;
-	}
-
-	JITGlobals::TargetMachineSPtr JITGlobals::getTargetMachine(rr::Optimization::Level optlevel)
-	{
-		return targetMachines.getOrCreate(static_cast<int>(optlevel), [&]() {
-			return TargetMachineSPtr(llvm::EngineBuilder()
+JITGlobals::TargetMachineSPtr JITGlobals::getTargetMachine(rr::Optimization::Level optlevel)
+{
+	return targetMachines.getOrCreate(static_cast<int>(optlevel), [&]() {
+		return TargetMachineSPtr(llvm::EngineBuilder()
 #ifdef ENABLE_RR_DEBUG_INFO
-				.setOptLevel(toLLVM(rr::Optimization::Level::None))
+			.setOptLevel(toLLVM(rr::Optimization::Level::None))
 #else
-				.setOptLevel(toLLVM(optlevel))
+			.setOptLevel(toLLVM(optlevel))
 #endif // ENABLE_RR_DEBUG_INFO
-				.setMCPU(mcpu)
-				.setMArch(march)
-				.setMAttrs(mattrs)
-				.setTargetOptions(targetOptions)
-				.selectTarget());
-		});
-	}
+			.setMCPU(mcpu)
+			.setMArch(march)
+			.setMAttrs(mattrs)
+			.setTargetOptions(targetOptions)
+			.selectTarget());
+	});
+}
 
-	JITGlobals JITGlobals::create()
+JITGlobals JITGlobals::create()
+{
+	struct LLVMInitializer
 	{
-		struct LLVMInitializer
+		LLVMInitializer()
 		{
-			LLVMInitializer()
-			{
-				llvm::InitializeNativeTarget();
-				llvm::InitializeNativeTargetAsmPrinter();
-				llvm::InitializeNativeTargetAsmParser();
-			}
-		};
-		static LLVMInitializer initializeLLVM;
+			llvm::InitializeNativeTarget();
+			llvm::InitializeNativeTargetAsmPrinter();
+			llvm::InitializeNativeTargetAsmParser();
+		}
+	};
+	static LLVMInitializer initializeLLVM;
 
-		auto mcpu = llvm::sys::getHostCPUName();
+	auto mcpu = llvm::sys::getHostCPUName();
 
-		llvm::StringMap<bool> features;
-		bool ok = llvm::sys::getHostCPUFeatures(features);
+	llvm::StringMap<bool> features;
+	bool ok = llvm::sys::getHostCPUFeatures(features);
 
 #if defined(__i386__) || defined(__x86_64__) || \
 (defined(__linux__) && (defined(__arm__) || defined(__aarch64__)))
-		ASSERT_MSG(ok, "llvm::sys::getHostCPUFeatures returned false");
+	ASSERT_MSG(ok, "llvm::sys::getHostCPUFeatures returned false");
 #else
-		(void) ok; // getHostCPUFeatures always returns false on other platforms
+	(void) ok; // getHostCPUFeatures always returns false on other platforms
 #endif
 
-		std::vector<std::string> mattrs;
-		for (auto &feature : features)
-		{
-			if (feature.second) { mattrs.push_back(feature.first()); }
-		}
+	std::vector<std::string> mattrs;
+	for (auto &feature : features)
+	{
+		if (feature.second) { mattrs.push_back(feature.first()); }
+	}
 
-		const char* march = nullptr;
+	const char* march = nullptr;
 #if defined(__x86_64__)
-		march = "x86-64";
+	march = "x86-64";
 #elif defined(__i386__)
-		march = "x86";
+	march = "x86";
 #elif defined(__aarch64__)
-		march = "arm64";
+	march = "arm64";
 #elif defined(__arm__)
-		march = "arm";
+	march = "arm";
 #elif defined(__mips__)
 #if defined(__mips64)
-		march = "mips64el";
+	march = "mips64el";
 #else
-		march = "mipsel";
+	march = "mipsel";
 #endif
 #elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-		march = "ppc64le";
+	march = "ppc64le";
 #else
-		#error "unknown architecture"
+	#error "unknown architecture"
 #endif
 
-		llvm::TargetOptions targetOptions;
-		targetOptions.UnsafeFPMath = false;
+	llvm::TargetOptions targetOptions;
+	targetOptions.UnsafeFPMath = false;
 
-		auto targetMachine = std::unique_ptr<llvm::TargetMachine>(
-			llvm::EngineBuilder()
-				.setOptLevel(llvm::CodeGenOpt::None)
-				.setMCPU(mcpu)
-				.setMArch(march)
-				.setMAttrs(mattrs)
-				.setTargetOptions(targetOptions)
-				.selectTarget());
+	auto targetMachine = std::unique_ptr<llvm::TargetMachine>(
+		llvm::EngineBuilder()
+			.setOptLevel(llvm::CodeGenOpt::None)
+			.setMCPU(mcpu)
+			.setMArch(march)
+			.setMAttrs(mattrs)
+			.setTargetOptions(targetOptions)
+			.selectTarget());
 
-		auto dataLayout = targetMachine->createDataLayout();
+	auto dataLayout = targetMachine->createDataLayout();
 
-		return JITGlobals(mcpu.data(), mattrs, march, targetOptions, dataLayout);
-	}
+	return JITGlobals(mcpu.data(), mattrs, march, targetOptions, dataLayout);
+}
 
-	llvm::CodeGenOpt::Level JITGlobals::toLLVM(rr::Optimization::Level level)
+llvm::CodeGenOpt::Level JITGlobals::toLLVM(rr::Optimization::Level level)
+{
+	switch (level)
 	{
-		switch (level)
-		{
-			case rr::Optimization::Level::None:       return ::llvm::CodeGenOpt::None;
-			case rr::Optimization::Level::Less:       return ::llvm::CodeGenOpt::Less;
-			case rr::Optimization::Level::Default:    return ::llvm::CodeGenOpt::Default;
-			case rr::Optimization::Level::Aggressive: return ::llvm::CodeGenOpt::Aggressive;
-			default: UNREACHABLE("Unknown Optimization Level %d", int(level));
-		}
-		return ::llvm::CodeGenOpt::Default;
+		case rr::Optimization::Level::None:       return ::llvm::CodeGenOpt::None;
+		case rr::Optimization::Level::Less:       return ::llvm::CodeGenOpt::Less;
+		case rr::Optimization::Level::Default:    return ::llvm::CodeGenOpt::Default;
+		case rr::Optimization::Level::Aggressive: return ::llvm::CodeGenOpt::Aggressive;
+		default: UNREACHABLE("Unknown Optimization Level %d", int(level));
 	}
+	return ::llvm::CodeGenOpt::Default;
+}
 
-	JITGlobals::JITGlobals(const char* mcpu,
-	                       const std::vector<std::string> &mattrs,
-	                       const char* march,
-	                       const llvm::TargetOptions &targetOptions,
-	                       const llvm::DataLayout &dataLayout) :
-			mcpu(mcpu),
-			mattrs(mattrs),
-			march(march),
-			targetOptions(targetOptions),
-			dataLayout(dataLayout)
-	{
-	}
+JITGlobals::JITGlobals(const char* mcpu,
+                       const std::vector<std::string> &mattrs,
+                       const char* march,
+                       const llvm::TargetOptions &targetOptions,
+                       const llvm::DataLayout &dataLayout) :
+		mcpu(mcpu),
+		mattrs(mattrs),
+		march(march),
+		targetOptions(targetOptions),
+		dataLayout(dataLayout)
+{
+}
 
-	// JITRoutine is a rr::Routine that holds a LLVM JIT session, compiler and
-	// object layer as each routine may require different target machine
-	// settings and no Reactor routine directly links against another.
-	class JITRoutine : public rr::Routine
-	{
+// JITRoutine is an rr::Routine that holds an LLVM JIT session, compiler and
+// object layer as each routine may require different target machine
+// settings and no Reactor routine directly links against another.
+class JITRoutine : public rr::Routine
+{
 #if LLVM_VERSION_MAJOR >= 8
-		using ObjLayer = llvm::orc::LegacyRTDyldObjectLinkingLayer;
-		using CompileLayer = llvm::orc::LegacyIRCompileLayer<ObjLayer, llvm::orc::SimpleCompiler>;
+	using ObjLayer = llvm::orc::LegacyRTDyldObjectLinkingLayer;
+	using CompileLayer = llvm::orc::LegacyIRCompileLayer<ObjLayer, llvm::orc::SimpleCompiler>;
 #else
-		using ObjLayer = llvm::orc::RTDyldObjectLinkingLayer;
-		using CompileLayer = llvm::orc::IRCompileLayer<ObjLayer, llvm::orc::SimpleCompiler>;
+	using ObjLayer = llvm::orc::RTDyldObjectLinkingLayer;
+	using CompileLayer = llvm::orc::IRCompileLayer<ObjLayer, llvm::orc::SimpleCompiler>;
 #endif
 
-	public:
-		JITRoutine(
-				std::unique_ptr<llvm::Module> module,
-				llvm::Function **funcs,
-				size_t count,
-				const rr::Config &config) :
-			resolver(createLegacyLookupResolver(
-				session,
-				[&](const std::string &name) {
-					void *func = rr::resolveExternalSymbol(name.c_str());
-					if (func != nullptr)
-					{
-						return llvm::JITSymbol(
-							reinterpret_cast<uintptr_t>(func), llvm::JITSymbolFlags::Absolute);
-					}
-					return objLayer.findSymbol(name, true);
-				},
-				[](llvm::Error err) {
-					if (err)
-					{
-						// TODO: Log the symbol resolution errors.
-						return;
-					}
-				})),
-			targetMachine(JITGlobals::get()->getTargetMachine(config.getOptimization().getLevel())),
-			compileLayer(objLayer, llvm::orc::SimpleCompiler(*targetMachine)),
-			objLayer(
-				session,
-				[this](llvm::orc::VModuleKey) {
-					return ObjLayer::Resources{std::make_shared<llvm::SectionMemoryManager>(), resolver};
-				},
-				ObjLayer::NotifyLoadedFtor(),
-				[](llvm::orc::VModuleKey, const llvm::object::ObjectFile &Obj, const llvm::RuntimeDyld::LoadedObjectInfo &L) {
-#ifdef ENABLE_RR_DEBUG_INFO
-					rr::DebugInfo::NotifyObjectEmitted(Obj, L);
-#endif // ENABLE_RR_DEBUG_INFO
-				},
-				[](llvm::orc::VModuleKey, const llvm::object::ObjectFile &Obj) {
-#ifdef ENABLE_RR_DEBUG_INFO
-					rr::DebugInfo::NotifyFreeingObject(Obj);
-#endif // ENABLE_RR_DEBUG_INFO
-				}
-			),
-			addresses(count)
-		{
-			std::vector<std::string> mangledNames(count);
-			for (size_t i = 0; i < count; i++)
-			{
-				auto func = funcs[i];
-				static size_t numEmittedFunctions = 0;
-				std::string name = "f" + llvm::Twine(numEmittedFunctions++).str();
-				func->setName(name);
-				func->setLinkage(llvm::GlobalValue::ExternalLinkage);
-				func->setDoesNotThrow();
-
-				llvm::raw_string_ostream mangledNameStream(mangledNames[i]);
-				llvm::Mangler::getNameWithPrefix(mangledNameStream, name, JITGlobals::get()->dataLayout);
-			}
-
-			auto moduleKey = session.allocateVModule();
-
-			// Once the module is passed to the compileLayer, the
-			// llvm::Functions are freed. Make sure funcs are not referenced
-			// after this point.
-			funcs = nullptr;
-
-			llvm::cantFail(compileLayer.addModule(moduleKey, std::move(module)));
-
-			// Resolve the function addresses.
-			for (size_t i = 0; i < count; i++)
-			{
-				auto symbol = compileLayer.findSymbolIn(moduleKey, mangledNames[i], false);
-				if(auto address = symbol.getAddress())
+public:
+	JITRoutine(
+			std::unique_ptr<llvm::Module> module,
+			llvm::Function **funcs,
+			size_t count,
+			const rr::Config &config) :
+		resolver(createLegacyLookupResolver(
+			session,
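+			// Resolve Reactor-provided external symbols first; otherwise defer to
+			// symbols in already-emitted objects.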
+			[&](const std::string &name) {
+				void *func = rr::resolveExternalSymbol(name.c_str());
+				if (func != nullptr)
 				{
-					addresses[i] = reinterpret_cast<void *>(static_cast<intptr_t>(address.get()));
+					return llvm::JITSymbol(
+						reinterpret_cast<uintptr_t>(func), llvm::JITSymbolFlags::Absolute);
 				}
+				return objLayer.findSymbol(name, true);
+			},
+			[](llvm::Error err) {
+				if (err)
+				{
+					// TODO: Log the symbol resolution errors.
+					return;
+				}
+			})),
+		targetMachine(JITGlobals::get()->getTargetMachine(config.getOptimization().getLevel())),
+		compileLayer(objLayer, llvm::orc::SimpleCompiler(*targetMachine)),
+		objLayer(
+			session,
+			[this](llvm::orc::VModuleKey) {
+				return ObjLayer::Resources{std::make_shared<llvm::SectionMemoryManager>(), resolver};
+			},
+			ObjLayer::NotifyLoadedFtor(),
+			[](llvm::orc::VModuleKey, const llvm::object::ObjectFile &Obj, const llvm::RuntimeDyld::LoadedObjectInfo &L) {
+#ifdef ENABLE_RR_DEBUG_INFO
+				rr::DebugInfo::NotifyObjectEmitted(Obj, L);
+#endif // ENABLE_RR_DEBUG_INFO
+			},
+			[](llvm::orc::VModuleKey, const llvm::object::ObjectFile &Obj) {
+#ifdef ENABLE_RR_DEBUG_INFO
+				rr::DebugInfo::NotifyFreeingObject(Obj);
+#endif // ENABLE_RR_DEBUG_INFO
 			}
-		}
-
-		const void *getEntry(int index) const override
-		{
-			return addresses[index];
-		}
-
-	private:
-		std::shared_ptr<llvm::orc::SymbolResolver> resolver;
-		std::shared_ptr<llvm::TargetMachine> targetMachine;
-		llvm::orc::ExecutionSession session;
-		CompileLayer compileLayer;
-		ObjLayer objLayer;
-		std::vector<const void *> addresses;
-	};
-
-	// JITBuilder holds all the LLVM state for building routines.
-	class JITBuilder
+		),
+		addresses(count)
 	{
-	public:
-		JITBuilder(const rr::Config &config) :
-			config(config),
-			module(new llvm::Module("", context)),
-			builder(new llvm::IRBuilder<>(context))
+		std::vector<std::string> mangledNames(count);
+		for (size_t i = 0; i < count; i++)
 		{
-			module->setDataLayout(JITGlobals::get()->dataLayout);
+			auto func = funcs[i];
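+			// A static counter gives every emitted function a unique name across routines.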
+			static size_t numEmittedFunctions = 0;
+			std::string name = "f" + llvm::Twine(numEmittedFunctions++).str();
+			func->setName(name);
+			func->setLinkage(llvm::GlobalValue::ExternalLinkage);
+			func->setDoesNotThrow();
+
+			llvm::raw_string_ostream mangledNameStream(mangledNames[i]);
+			llvm::Mangler::getNameWithPrefix(mangledNameStream, name, JITGlobals::get()->dataLayout);
 		}
 
-		void optimize(const rr::Config &cfg)
+		auto moduleKey = session.allocateVModule();
+
+		// Once the module is passed to the compileLayer, the
+		// llvm::Functions are freed. Make sure funcs are not referenced
+		// after this point.
+		funcs = nullptr;
+
+		llvm::cantFail(compileLayer.addModule(moduleKey, std::move(module)));
+
+		// Resolve the function addresses.
+		for (size_t i = 0; i < count; i++)
 		{
+			auto symbol = compileLayer.findSymbolIn(moduleKey, mangledNames[i], false);
+			if(auto address = symbol.getAddress())
+			{
+				addresses[i] = reinterpret_cast<void *>(static_cast<intptr_t>(address.get()));
+			}
+		}
+	}
+
+	const void *getEntry(int index) const override
+	{
+		return addresses[index];
+	}
+
+private:
+	std::shared_ptr<llvm::orc::SymbolResolver> resolver;
+	std::shared_ptr<llvm::TargetMachine> targetMachine;
+	llvm::orc::ExecutionSession session;
+	CompileLayer compileLayer;
+	ObjLayer objLayer;
+	std::vector<const void *> addresses;
+};
+
+// JITBuilder holds all the LLVM state for building routines.
+class JITBuilder
+{
+public:
+	JITBuilder(const rr::Config &config) :
+		config(config),
+		module(new llvm::Module("", context)),
+		builder(new llvm::IRBuilder<>(context))
+	{
+		module->setDataLayout(JITGlobals::get()->dataLayout);
+	}
+
+	void optimize(const rr::Config &cfg)
+	{
 
 #ifdef ENABLE_RR_DEBUG_INFO
-			if (debugInfo != nullptr)
-			{
-				return; // Don't optimize if we're generating debug info.
-			}
+		if (debugInfo != nullptr)
+		{
+			return; // Don't optimize if we're generating debug info.
+		}
 #endif // ENABLE_RR_DEBUG_INFO
 
-			std::unique_ptr<llvm::legacy::PassManager> passManager(
-				new llvm::legacy::PassManager());
+		std::unique_ptr<llvm::legacy::PassManager> passManager(
+			new llvm::legacy::PassManager());
 
-			for(auto pass : cfg.getOptimization().getPasses())
+		for(auto pass : cfg.getOptimization().getPasses())
+		{
+			switch(pass)
 			{
-				switch(pass)
-				{
-				case rr::Optimization::Pass::Disabled:                                                                       break;
-				case rr::Optimization::Pass::CFGSimplification:    passManager->add(llvm::createCFGSimplificationPass());    break;
-				case rr::Optimization::Pass::LICM:                 passManager->add(llvm::createLICMPass());                 break;
-				case rr::Optimization::Pass::AggressiveDCE:        passManager->add(llvm::createAggressiveDCEPass());        break;
-				case rr::Optimization::Pass::GVN:                  passManager->add(llvm::createGVNPass());                  break;
-				case rr::Optimization::Pass::InstructionCombining: passManager->add(llvm::createInstructionCombiningPass()); break;
-				case rr::Optimization::Pass::Reassociate:          passManager->add(llvm::createReassociatePass());          break;
-				case rr::Optimization::Pass::DeadStoreElimination: passManager->add(llvm::createDeadStoreEliminationPass()); break;
-				case rr::Optimization::Pass::SCCP:                 passManager->add(llvm::createSCCPPass());                 break;
-				case rr::Optimization::Pass::ScalarReplAggregates: passManager->add(llvm::createSROAPass());                 break;
-				case rr::Optimization::Pass::EarlyCSEPass:         passManager->add(llvm::createEarlyCSEPass());             break;
-				default:
-					UNREACHABLE("pass: %d", int(pass));
-				}
+			case rr::Optimization::Pass::Disabled:                                                                       break;
+			case rr::Optimization::Pass::CFGSimplification:    passManager->add(llvm::createCFGSimplificationPass());    break;
+			case rr::Optimization::Pass::LICM:                 passManager->add(llvm::createLICMPass());                 break;
+			case rr::Optimization::Pass::AggressiveDCE:        passManager->add(llvm::createAggressiveDCEPass());        break;
+			case rr::Optimization::Pass::GVN:                  passManager->add(llvm::createGVNPass());                  break;
+			case rr::Optimization::Pass::InstructionCombining: passManager->add(llvm::createInstructionCombiningPass()); break;
+			case rr::Optimization::Pass::Reassociate:          passManager->add(llvm::createReassociatePass());          break;
+			case rr::Optimization::Pass::DeadStoreElimination: passManager->add(llvm::createDeadStoreEliminationPass()); break;
+			case rr::Optimization::Pass::SCCP:                 passManager->add(llvm::createSCCPPass());                 break;
+			case rr::Optimization::Pass::ScalarReplAggregates: passManager->add(llvm::createSROAPass());                 break;
+			case rr::Optimization::Pass::EarlyCSEPass:         passManager->add(llvm::createEarlyCSEPass());             break;
+			default:
+				UNREACHABLE("pass: %d", int(pass));
 			}
-
-			passManager->run(*module);
 		}
 
-		std::shared_ptr<rr::Routine> acquireRoutine(llvm::Function **funcs, size_t count, const rr::Config &cfg)
-		{
-			ASSERT(module);
-			return std::make_shared<JITRoutine>(std::move(module), funcs, count, cfg);
-		}
+		passManager->run(*module);
+	}
 
-		const rr::Config config;
-		llvm::LLVMContext context;
-		std::unique_ptr<llvm::Module> module;
-		std::unique_ptr<llvm::IRBuilder<>> builder;
-		llvm::Function *function = nullptr;
+	std::shared_ptr<rr::Routine> acquireRoutine(llvm::Function **funcs, size_t count, const rr::Config &cfg)
+	{
+		ASSERT(module);
+		return std::make_shared<JITRoutine>(std::move(module), funcs, count, cfg);
+	}
 
-		struct CoroutineState
-		{
-			llvm::Function *await = nullptr;
-			llvm::Function *destroy = nullptr;
-			llvm::Value *handle = nullptr;
-			llvm::Value *id = nullptr;
-			llvm::Value *promise = nullptr;
-			llvm::Type *yieldType = nullptr;
-			llvm::BasicBlock *entryBlock = nullptr;
-			llvm::BasicBlock *suspendBlock = nullptr;
-			llvm::BasicBlock *endBlock = nullptr;
-			llvm::BasicBlock *destroyBlock = nullptr;
-		};
-		CoroutineState coroutine;
+	const rr::Config config;
+	llvm::LLVMContext context;
+	std::unique_ptr<llvm::Module> module;
+	std::unique_ptr<llvm::IRBuilder<>> builder;
+	llvm::Function *function = nullptr;
+
+	struct CoroutineState
+	{
+		llvm::Function *await = nullptr;
+		llvm::Function *destroy = nullptr;
+		llvm::Value *handle = nullptr;
+		llvm::Value *id = nullptr;
+		llvm::Value *promise = nullptr;
+		llvm::Type *yieldType = nullptr;
+		llvm::BasicBlock *entryBlock = nullptr;
+		llvm::BasicBlock *suspendBlock = nullptr;
+		llvm::BasicBlock *endBlock = nullptr;
+		llvm::BasicBlock *destroyBlock = nullptr;
+	};
+	CoroutineState coroutine;
 
 #ifdef ENABLE_RR_DEBUG_INFO
-		std::unique_ptr<rr::DebugInfo> debugInfo;
+	std::unique_ptr<rr::DebugInfo> debugInfo;
 #endif
-	};
+};
 
-	std::unique_ptr<JITBuilder> jit;
-	std::mutex codegenMutex;
+std::unique_ptr<JITBuilder> jit;
+std::mutex codegenMutex;
 
 #ifdef ENABLE_RR_PRINT
-	std::string replace(std::string str, const std::string& substr, const std::string& replacement)
-	{
-		size_t pos = 0;
-		while((pos = str.find(substr, pos)) != std::string::npos) {
-			str.replace(pos, substr.length(), replacement);
-			pos += replacement.length();
-		}
-		return str;
+std::string replace(std::string str, const std::string& substr, const std::string& replacement)
+{
+	size_t pos = 0;
+	while((pos = str.find(substr, pos)) != std::string::npos)
+	{
+		str.replace(pos, substr.length(), replacement);
+		pos += replacement.length();
 	}
+	return str;
+}
 #endif // ENABLE_RR_PRINT
 
-	template <typename T>
-	T alignUp(T val, T alignment)
-	{
-		return alignment * ((val + alignment - 1) / alignment);
-	}
+template <typename T>
+T alignUp(T val, T alignment)
+{
+	return alignment * ((val + alignment - 1) / alignment);
+}
 
-	void* alignedAlloc(size_t size, size_t alignment)
-	{
-		ASSERT(alignment < 256);
-		auto allocation = new uint8_t[size + sizeof(uint8_t) + alignment];
-		auto aligned = allocation;
-		aligned += sizeof(uint8_t); // Make space for the base-address offset.
-		aligned = reinterpret_cast<uint8_t*>(alignUp(reinterpret_cast<uintptr_t>(aligned), alignment)); // align
-		auto offset = static_cast<uint8_t>(aligned - allocation);
-		aligned[-1] = offset;
-		return aligned;
-	}
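+// alignedAlloc over-allocates and stores the byte offset back to the true allocation
+// just before the returned pointer, so alignedFree can recover and delete it.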
+void* alignedAlloc(size_t size, size_t alignment)
+{
+	ASSERT(alignment < 256);
+	auto allocation = new uint8_t[size + sizeof(uint8_t) + alignment];
+	auto aligned = allocation;
+	aligned += sizeof(uint8_t); // Make space for the base-address offset.
+	aligned = reinterpret_cast<uint8_t*>(alignUp(reinterpret_cast<uintptr_t>(aligned), alignment)); // Round up to the requested alignment.
+	auto offset = static_cast<uint8_t>(aligned - allocation);
+	aligned[-1] = offset;
+	return aligned;
+}
 
-	void alignedFree(void* ptr)
-	{
-		auto aligned = reinterpret_cast<uint8_t*>(ptr);
-		auto offset = aligned[-1];
-		auto allocation = aligned - offset;
-		delete[] allocation;
-	}
+void alignedFree(void* ptr)
+{
+	auto aligned = reinterpret_cast<uint8_t*>(ptr);
+	auto offset = aligned[-1];
+	auto allocation = aligned - offset;
+	delete[] allocation;
+}
 
-	llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
-	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
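+// Emulates packed average (cf. x86 PAVG): widen to avoid overflow, then compute (x + y + 1) >> 1.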
+llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
 
-		llvm::VectorType *extTy =
-			llvm::VectorType::getExtendedElementVectorType(ty);
-		x = jit->builder->CreateZExt(x, extTy);
-		y = jit->builder->CreateZExt(y, extTy);
+	llvm::VectorType *extTy =
+		llvm::VectorType::getExtendedElementVectorType(ty);
+	x = jit->builder->CreateZExt(x, extTy);
+	y = jit->builder->CreateZExt(y, extTy);
 
-		// (x + y + 1) >> 1
-		llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
-		llvm::Value *res = jit->builder->CreateAdd(x, y);
-		res = jit->builder->CreateAdd(res, one);
-		res = jit->builder->CreateLShr(res, one);
-		return jit->builder->CreateTrunc(res, ty);
-	}
+	// (x + y + 1) >> 1
+	llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
+	llvm::Value *res = jit->builder->CreateAdd(x, y);
+	res = jit->builder->CreateAdd(res, one);
+	res = jit->builder->CreateLShr(res, one);
+	return jit->builder->CreateTrunc(res, ty);
+}
 
-	llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
-	                          llvm::ICmpInst::Predicate pred)
-	{
-		return jit->builder->CreateSelect(jit->builder->CreateICmp(pred, x, y), x, y);
-	}
+llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
+                          llvm::ICmpInst::Predicate pred)
+{
+	return jit->builder->CreateSelect(jit->builder->CreateICmp(pred, x, y), x, y);
+}
 
-	llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
-	                       llvm::Value *y, llvm::Type *dstTy)
-	{
-		return jit->builder->CreateSExt(jit->builder->CreateICmp(pred, x, y), dstTy, "");
-	}
+llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
+                       llvm::Value *y, llvm::Type *dstTy)
+{
+	return jit->builder->CreateSExt(jit->builder->CreateICmp(pred, x, y), dstTy, "");
+}
 
 #if defined(__i386__) || defined(__x86_64__)
-	llvm::Value *lowerPMOV(llvm::Value *op, llvm::Type *dstType, bool sext)
-	{
-		llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(op->getType());
-		llvm::VectorType *dstTy = llvm::cast<llvm::VectorType>(dstType);
+llvm::Value *lowerPMOV(llvm::Value *op, llvm::Type *dstType, bool sext)
+{
+	llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(op->getType());
+	llvm::VectorType *dstTy = llvm::cast<llvm::VectorType>(dstType);
 
-		llvm::Value *undef = llvm::UndefValue::get(srcTy);
-		llvm::SmallVector<uint32_t, 16> mask(dstTy->getNumElements());
-		std::iota(mask.begin(), mask.end(), 0);
-		llvm::Value *v = jit->builder->CreateShuffleVector(op, undef, mask);
+	llvm::Value *undef = llvm::UndefValue::get(srcTy);
+	llvm::SmallVector<uint32_t, 16> mask(dstTy->getNumElements());
+	std::iota(mask.begin(), mask.end(), 0);
+	llvm::Value *v = jit->builder->CreateShuffleVector(op, undef, mask);
 
-		return sext ? jit->builder->CreateSExt(v, dstTy)
-		            : jit->builder->CreateZExt(v, dstTy);
-	}
+	return sext ? jit->builder->CreateSExt(v, dstTy)
+	            : jit->builder->CreateZExt(v, dstTy);
+}
 
-	llvm::Value *lowerPABS(llvm::Value *v)
-	{
-		llvm::Value *zero = llvm::Constant::getNullValue(v->getType());
-		llvm::Value *cmp = jit->builder->CreateICmp(llvm::ICmpInst::ICMP_SGT, v, zero);
-		llvm::Value *neg = jit->builder->CreateNeg(v);
-		return jit->builder->CreateSelect(cmp, v, neg);
-	}
+llvm::Value *lowerPABS(llvm::Value *v)
+{
+	llvm::Value *zero = llvm::Constant::getNullValue(v->getType());
+	llvm::Value *cmp = jit->builder->CreateICmp(llvm::ICmpInst::ICMP_SGT, v, zero);
+	llvm::Value *neg = jit->builder->CreateNeg(v);
+	return jit->builder->CreateSelect(cmp, v, neg);
+}
 #endif  // defined(__i386__) || defined(__x86_64__)
 
 #if !defined(__i386__) && !defined(__x86_64__)
-	llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
-	                           llvm::FCmpInst::Predicate pred)
+llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
+                           llvm::FCmpInst::Predicate pred)
+{
+	return jit->builder->CreateSelect(jit->builder->CreateFCmp(pred, x, y), x, y);
+}
+
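+// llvm.nearbyint rounds using the current FP environment (round-to-nearest-even by default).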
+llvm::Value *lowerRound(llvm::Value *x)
+{
+	llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
+		jit->module.get(), llvm::Intrinsic::nearbyint, {x->getType()});
+	return jit->builder->CreateCall(nearbyint, ARGS(x));
+}
+
+llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
+{
+	return jit->builder->CreateFPToSI(lowerRound(x), ty);
+}
+
+llvm::Value *lowerFloor(llvm::Value *x)
+{
+	llvm::Function *floor = llvm::Intrinsic::getDeclaration(
+		jit->module.get(), llvm::Intrinsic::floor, {x->getType()});
+	return jit->builder->CreateCall(floor, ARGS(x));
+}
+
+llvm::Value *lowerTrunc(llvm::Value *x)
+{
+	llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
+		jit->module.get(), llvm::Intrinsic::trunc, {x->getType()});
+	return jit->builder->CreateCall(trunc, ARGS(x));
+}
+
+// Packed add/sub with saturation
+llvm::Value *lowerPSAT(llvm::Value *x, llvm::Value *y, bool isAdd, bool isSigned)
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
+
+	unsigned numBits = ty->getScalarSizeInBits();
+
+	llvm::Value *max, *min, *extX, *extY;
+	if (isSigned)
 	{
-		return jit->builder->CreateSelect(jit->builder->CreateFCmp(pred, x, y), x, y);
+		max = llvm::ConstantInt::get(extTy, (1LL << (numBits - 1)) - 1, true);
+		min = llvm::ConstantInt::get(extTy, (-1LL << (numBits - 1)), true);
+		extX = jit->builder->CreateSExt(x, extTy);
+		extY = jit->builder->CreateSExt(y, extTy);
+	}
+	else
+	{
+		ASSERT_MSG(numBits <= 64, "numBits: %d", int(numBits));
+		uint64_t maxVal = (numBits == 64) ? ~0ULL : (1ULL << numBits) - 1;
+		max = llvm::ConstantInt::get(extTy, maxVal, false);
+		min = llvm::ConstantInt::get(extTy, 0, false);
+		extX = jit->builder->CreateZExt(x, extTy);
+		extY = jit->builder->CreateZExt(y, extTy);
 	}
 
-	llvm::Value *lowerRound(llvm::Value *x)
+	llvm::Value *res = isAdd ? jit->builder->CreateAdd(extX, extY)
+	                         : jit->builder->CreateSub(extX, extY);
+
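+	// lowerPMINMAX(v, c, SGT) yields max(v, c) and SLT yields min(v, c), clamping res to [min, max].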
+	res = lowerPMINMAX(res, min, llvm::ICmpInst::ICMP_SGT);
+	res = lowerPMINMAX(res, max, llvm::ICmpInst::ICMP_SLT);
+
+	return jit->builder->CreateTrunc(res, ty);
+}
+
+llvm::Value *lowerSQRT(llvm::Value *x)
+{
+	llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
+		jit->module.get(), llvm::Intrinsic::sqrt, {x->getType()});
+	return jit->builder->CreateCall(sqrt, ARGS(x));
+}
+
+llvm::Value *lowerRCP(llvm::Value *x)
+{
+	llvm::Type *ty = x->getType();
+	llvm::Constant *one;
+	if (llvm::VectorType *vectorTy = llvm::dyn_cast<llvm::VectorType>(ty))
 	{
-		llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
-			jit->module.get(), llvm::Intrinsic::nearbyint, {x->getType()});
-		return jit->builder->CreateCall(nearbyint, ARGS(x));
+		one = llvm::ConstantVector::getSplat(
+			vectorTy->getNumElements(),
+			llvm::ConstantFP::get(vectorTy->getElementType(), 1));
+	}
+	else
+	{
+		one = llvm::ConstantFP::get(ty, 1);
+	}
+	return jit->builder->CreateFDiv(one, x);
+}
+
+llvm::Value *lowerRSQRT(llvm::Value *x)
+{
+	return lowerRCP(lowerSQRT(x));
+}
+
+llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+	llvm::Value *y = llvm::ConstantVector::getSplat(
+		ty->getNumElements(),
+		llvm::ConstantInt::get(ty->getElementType(), scalarY));
+	return jit->builder->CreateShl(x, y);
+}
+
+llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+	llvm::Value *y = llvm::ConstantVector::getSplat(
+		ty->getNumElements(),
+		llvm::ConstantInt::get(ty->getElementType(), scalarY));
+	return jit->builder->CreateAShr(x, y);
+}
+
+llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+	llvm::Value *y = llvm::ConstantVector::getSplat(
+		ty->getNumElements(),
+		llvm::ConstantInt::get(ty->getElementType(), scalarY));
+	return jit->builder->CreateLShr(x, y);
+}
+
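+// Multiplies elementwise in a widened type, then sums adjacent even/odd pairs (cf. x86 pmaddwd).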
+llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
+
+	llvm::Value *extX = jit->builder->CreateSExt(x, extTy);
+	llvm::Value *extY = jit->builder->CreateSExt(y, extTy);
+	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
+
+	llvm::Value *undef = llvm::UndefValue::get(extTy);
+
+	llvm::SmallVector<uint32_t, 16> evenIdx;
+	llvm::SmallVector<uint32_t, 16> oddIdx;
+	for (uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
+	{
+		evenIdx.push_back(i);
+		oddIdx.push_back(i + 1);
 	}
 
-	llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
+	llvm::Value *lhs = jit->builder->CreateShuffleVector(mult, undef, evenIdx);
+	llvm::Value *rhs = jit->builder->CreateShuffleVector(mult, undef, oddIdx);
+	return jit->builder->CreateAdd(lhs, rhs);
+}
+
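+// Saturates both vectors to the narrower element range, truncates, and concatenates them (cf. x86 packss/packus).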
+llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
+{
+	llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(x->getType());
+	llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
+
+	llvm::IntegerType *dstElemTy =
+		llvm::cast<llvm::IntegerType>(dstTy->getElementType());
+
+	uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
+	ASSERT_MSG(truncNumBits < 64, "shift 64 must be handled separately. truncNumBits: %d", int(truncNumBits));
+	llvm::Constant *max, *min;
+	if (isSigned)
 	{
-		return jit->builder->CreateFPToSI(lowerRound(x), ty);
+		max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
+		min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
+	}
+	else
+	{
+		max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
+		min = llvm::ConstantInt::get(srcTy, 0, false);
 	}
 
-	llvm::Value *lowerFloor(llvm::Value *x)
+	x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
+	x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
+	y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
+	y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
+
+	x = jit->builder->CreateTrunc(x, dstTy);
+	y = jit->builder->CreateTrunc(y, dstTy);
+
+	llvm::SmallVector<uint32_t, 16> index(srcTy->getNumElements() * 2);
+	std::iota(index.begin(), index.end(), 0);
+
+	return jit->builder->CreateShuffleVector(x, y, index);
+}
+
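+// ORs the sign bit of each element into the low bits of a scalar (cf. x86 movmsk).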
+llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+	llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
+	llvm::Value *cmp = jit->builder->CreateICmpSLT(x, zero);
+
+	llvm::Value *ret = jit->builder->CreateZExt(
+		jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
+	for (uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
 	{
-		llvm::Function *floor = llvm::Intrinsic::getDeclaration(
-			jit->module.get(), llvm::Intrinsic::floor, {x->getType()});
-		return jit->builder->CreateCall(floor, ARGS(x));
+		llvm::Value *elem = jit->builder->CreateZExt(
+			jit->builder->CreateExtractElement(cmp, i), retTy);
+		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
 	}
+	return ret;
+}
 
-	llvm::Value *lowerTrunc(llvm::Value *x)
+llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+	llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
+	llvm::Value *cmp = jit->builder->CreateFCmpULT(x, zero);
+
+	llvm::Value *ret = jit->builder->CreateZExt(
+		jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
+	for (uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
 	{
-		llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
-			jit->module.get(), llvm::Intrinsic::trunc, {x->getType()});
-		return jit->builder->CreateCall(trunc, ARGS(x));
+		llvm::Value *elem = jit->builder->CreateZExt(
+			jit->builder->CreateExtractElement(cmp, i), retTy);
+		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
 	}
-
-	// Packed add/sub with saturation
-	llvm::Value *lowerPSAT(llvm::Value *x, llvm::Value *y, bool isAdd, bool isSigned)
-	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
-
-		unsigned numBits = ty->getScalarSizeInBits();
-
-		llvm::Value *max, *min, *extX, *extY;
-		if (isSigned)
-		{
-			max = llvm::ConstantInt::get(extTy, (1LL << (numBits - 1)) - 1, true);
-			min = llvm::ConstantInt::get(extTy, (-1LL << (numBits - 1)), true);
-			extX = jit->builder->CreateSExt(x, extTy);
-			extY = jit->builder->CreateSExt(y, extTy);
-		}
-		else
-		{
-			ASSERT_MSG(numBits <= 64, "numBits: %d", int(numBits));
-			uint64_t maxVal = (numBits == 64) ? ~0ULL : (1ULL << numBits) - 1;
-			max = llvm::ConstantInt::get(extTy, maxVal, false);
-			min = llvm::ConstantInt::get(extTy, 0, false);
-			extX = jit->builder->CreateZExt(x, extTy);
-			extY = jit->builder->CreateZExt(y, extTy);
-		}
-
-		llvm::Value *res = isAdd ? jit->builder->CreateAdd(extX, extY)
-		                         : jit->builder->CreateSub(extX, extY);
-
-		res = lowerPMINMAX(res, min, llvm::ICmpInst::ICMP_SGT);
-		res = lowerPMINMAX(res, max, llvm::ICmpInst::ICMP_SLT);
-
-		return jit->builder->CreateTrunc(res, ty);
-	}
-
-	llvm::Value *lowerSQRT(llvm::Value *x)
-	{
-		llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
-			jit->module.get(), llvm::Intrinsic::sqrt, {x->getType()});
-		return jit->builder->CreateCall(sqrt, ARGS(x));
-	}
-
-	llvm::Value *lowerRCP(llvm::Value *x)
-	{
-		llvm::Type *ty = x->getType();
-		llvm::Constant *one;
-		if (llvm::VectorType *vectorTy = llvm::dyn_cast<llvm::VectorType>(ty))
-		{
-			one = llvm::ConstantVector::getSplat(
-				vectorTy->getNumElements(),
-				llvm::ConstantFP::get(vectorTy->getElementType(), 1));
-		}
-		else
-		{
-			one = llvm::ConstantFP::get(ty, 1);
-		}
-		return jit->builder->CreateFDiv(one, x);
-	}
-
-	llvm::Value *lowerRSQRT(llvm::Value *x)
-	{
-		return lowerRCP(lowerSQRT(x));
-	}
-
-	llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
-	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::Value *y = llvm::ConstantVector::getSplat(
-			ty->getNumElements(),
-			llvm::ConstantInt::get(ty->getElementType(), scalarY));
-		return jit->builder->CreateShl(x, y);
-	}
-
-	llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
-	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::Value *y = llvm::ConstantVector::getSplat(
-			ty->getNumElements(),
-			llvm::ConstantInt::get(ty->getElementType(), scalarY));
-		return jit->builder->CreateAShr(x, y);
-	}
-
-	llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
-	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::Value *y = llvm::ConstantVector::getSplat(
-			ty->getNumElements(),
-			llvm::ConstantInt::get(ty->getElementType(), scalarY));
-		return jit->builder->CreateLShr(x, y);
-	}
-
-	llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
-	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
-
-		llvm::Value *extX = jit->builder->CreateSExt(x, extTy);
-		llvm::Value *extY = jit->builder->CreateSExt(y, extTy);
-		llvm::Value *mult = jit->builder->CreateMul(extX, extY);
-
-		llvm::Value *undef = llvm::UndefValue::get(extTy);
-
-		llvm::SmallVector<uint32_t, 16> evenIdx;
-		llvm::SmallVector<uint32_t, 16> oddIdx;
-		for (uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
-		{
-			evenIdx.push_back(i);
-			oddIdx.push_back(i + 1);
-		}
-
-		llvm::Value *lhs = jit->builder->CreateShuffleVector(mult, undef, evenIdx);
-		llvm::Value *rhs = jit->builder->CreateShuffleVector(mult, undef, oddIdx);
-		return jit->builder->CreateAdd(lhs, rhs);
-	}
-
-	llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
-	{
-		llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
-
-		llvm::IntegerType *dstElemTy =
-			llvm::cast<llvm::IntegerType>(dstTy->getElementType());
-
-		uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
-		ASSERT_MSG(truncNumBits < 64, "shift 64 must be handled separately. truncNumBits: %d", int(truncNumBits));
-		llvm::Constant *max, *min;
-		if (isSigned)
-		{
-			max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
-			min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
-		}
-		else
-		{
-			max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
-			min = llvm::ConstantInt::get(srcTy, 0, false);
-		}
-
-		x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
-		x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
-		y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
-		y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
-
-		x = jit->builder->CreateTrunc(x, dstTy);
-		y = jit->builder->CreateTrunc(y, dstTy);
-
-		llvm::SmallVector<uint32_t, 16> index(srcTy->getNumElements() * 2);
-		std::iota(index.begin(), index.end(), 0);
-
-		return jit->builder->CreateShuffleVector(x, y, index);
-	}
-
-	llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
-	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
-		llvm::Value *cmp = jit->builder->CreateICmpSLT(x, zero);
-
-		llvm::Value *ret = jit->builder->CreateZExt(
-			jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
-		for (uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
-		{
-			llvm::Value *elem = jit->builder->CreateZExt(
-				jit->builder->CreateExtractElement(cmp, i), retTy);
-			ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
-		}
-		return ret;
-	}
-
-	llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
-	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
-		llvm::Value *cmp = jit->builder->CreateFCmpULT(x, zero);
-
-		llvm::Value *ret = jit->builder->CreateZExt(
-			jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
-		for (uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
-		{
-			llvm::Value *elem = jit->builder->CreateZExt(
-				jit->builder->CreateExtractElement(cmp, i), retTy);
-			ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
-		}
-		return ret;
-	}
+	return ret;
+}
 #endif  // !defined(__i386__) && !defined(__x86_64__)
 
 #if (LLVM_VERSION_MAJOR >= 8) || (!defined(__i386__) && !defined(__x86_64__))
-	llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
-	{
-		#if LLVM_VERSION_MAJOR >= 8
-			return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, x, y);
-		#else
-			return lowerPSAT(x, y, true, false);
-		#endif
-	}
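+// On LLVM 8+ these use the native saturating intrinsics; otherwise they fall back
+// to the manual widen/clamp lowering in lowerPSAT above.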
+llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, x, y);
+	#else
+		return lowerPSAT(x, y, true, false);
+	#endif
+}
 
-	llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
-	{
-		#if LLVM_VERSION_MAJOR >= 8
-			return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::sadd_sat, x, y);
-		#else
-			return lowerPSAT(x, y, true, true);
-		#endif
-	}
+llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::sadd_sat, x, y);
+	#else
+		return lowerPSAT(x, y, true, true);
+	#endif
+}
 
-	llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
-	{
-		#if LLVM_VERSION_MAJOR >= 8
-			return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::usub_sat, x, y);
-		#else
-			return lowerPSAT(x, y, false, false);
-		#endif
-	}
+llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::usub_sat, x, y);
+	#else
+		return lowerPSAT(x, y, false, false);
+	#endif
+}
 
-	llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
-	{
-		#if LLVM_VERSION_MAJOR >= 8
-			return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::ssub_sat, x, y);
-		#else
-			return lowerPSAT(x, y, false, true);
-		#endif
-	}
+llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::ssub_sat, x, y);
+	#else
+		return lowerPSAT(x, y, false, true);
+	#endif
+}
 #endif  // (LLVM_VERSION_MAJOR >= 8) || (!defined(__i386__) && !defined(__x86_64__))
 
-	llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
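+// Returns the high half of the widened elementwise product (cf. x86 pmulhw/pmulhuw).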
+llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
+{
+	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
+
+	llvm::Value *extX, *extY;
+	if (sext)
 	{
-		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
-		llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
-
-		llvm::Value *extX, *extY;
-		if (sext)
-		{
-			extX = jit->builder->CreateSExt(x, extTy);
-			extY = jit->builder->CreateSExt(y, extTy);
-		}
-		else
-		{
-			extX = jit->builder->CreateZExt(x, extTy);
-			extY = jit->builder->CreateZExt(y, extTy);
-		}
-
-		llvm::Value *mult = jit->builder->CreateMul(extX, extY);
-
-		llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
-		llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
-		return jit->builder->CreateTrunc(mulh, ty);
+		extX = jit->builder->CreateSExt(x, extTy);
+		extY = jit->builder->CreateSExt(y, extTy);
+	}
+	else
+	{
+		extX = jit->builder->CreateZExt(x, extTy);
+		extY = jit->builder->CreateZExt(y, extTy);
 	}
 
-	llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes)
+	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
+
+	llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
+	llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
+	return jit->builder->CreateTrunc(mulh, ty);
+}
+
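+// Emits llvm.masked.gather: builds per-lane pointers as base + byte offset, converts the
+// integer mask to i1 lanes, and uses a zero or undef passthrough for inactive lanes.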
+llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes)
+{
+	ASSERT(base->getType()->isPointerTy());
+	ASSERT(offsets->getType()->isVectorTy());
+	ASSERT(mask->getType()->isVectorTy());
+
+	auto numEls = mask->getType()->getVectorNumElements();
+	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
+	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
+	auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
+	auto i8PtrTy = i8Ty->getPointerTo();
+	auto elPtrTy = elTy->getPointerTo();
+	auto elVecTy = ::llvm::VectorType::get(elTy, numEls);
+	auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
+	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
+	auto i8Ptrs = jit->builder->CreateGEP(i8Base, offsets);
+	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
+	auto i8Mask = jit->builder->CreateIntCast(mask, ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
+	auto passthrough = zeroMaskedLanes ? ::llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
+	auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
+	auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy } );
+	return jit->builder->CreateCall(func, { elPtrs, align, i8Mask, passthrough });
+}
+
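+// Emits llvm.masked.scatter, forming per-lane pointers the same way as createGather.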
+void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
+{
+	ASSERT(base->getType()->isPointerTy());
+	ASSERT(val->getType()->isVectorTy());
+	ASSERT(offsets->getType()->isVectorTy());
+	ASSERT(mask->getType()->isVectorTy());
+
+	auto numEls = mask->getType()->getVectorNumElements();
+	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
+	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
+	auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
+	auto i8PtrTy = i8Ty->getPointerTo();
+	auto elVecTy = val->getType();
+	auto elTy = elVecTy->getVectorElementType();
+	auto elPtrTy = elTy->getPointerTo();
+	auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
+	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
+	auto i8Ptrs = jit->builder->CreateGEP(i8Base, offsets);
+	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
+	auto i8Mask = jit->builder->CreateIntCast(mask, ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
+	auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
+	auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy } );
+	jit->builder->CreateCall(func, { val, elPtrs, align, i8Mask });
+}
+}
+
+namespace rr {
+
+const Capabilities Caps =
+{
+	true, // CoroutinesSupported
+};
+
+static std::memory_order atomicOrdering(llvm::AtomicOrdering memoryOrder)
+{
+	switch(memoryOrder)
 	{
-		ASSERT(base->getType()->isPointerTy());
-		ASSERT(offsets->getType()->isVectorTy());
-		ASSERT(mask->getType()->isVectorTy());
-
-		auto numEls = mask->getType()->getVectorNumElements();
-		auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
-		auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
-		auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
-		auto i8PtrTy = i8Ty->getPointerTo();
-		auto elPtrTy = elTy->getPointerTo();
-		auto elVecTy = ::llvm::VectorType::get(elTy, numEls);
-		auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
-		auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
-		auto i8Ptrs = jit->builder->CreateGEP(i8Base, offsets);
-		auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
-		auto i8Mask = jit->builder->CreateIntCast(mask, ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
-		auto passthrough = zeroMaskedLanes ? ::llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
-		auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
-		auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy } );
-		return jit->builder->CreateCall(func, { elPtrs, align, i8Mask, passthrough });
-	}
-
-	void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
-	{
-		ASSERT(base->getType()->isPointerTy());
-		ASSERT(val->getType()->isVectorTy());
-		ASSERT(offsets->getType()->isVectorTy());
-		ASSERT(mask->getType()->isVectorTy());
-
-		auto numEls = mask->getType()->getVectorNumElements();
-		auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
-		auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
-		auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
-		auto i8PtrTy = i8Ty->getPointerTo();
-		auto elVecTy = val->getType();
-		auto elTy = elVecTy->getVectorElementType();
-		auto elPtrTy = elTy->getPointerTo();
-		auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
-		auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
-		auto i8Ptrs = jit->builder->CreateGEP(i8Base, offsets);
-		auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
-		auto i8Mask = jit->builder->CreateIntCast(mask, ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
-		auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
-		auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy } );
-		jit->builder->CreateCall(func, { val, elPtrs, align, i8Mask });
+	case llvm::AtomicOrdering::Monotonic: return std::memory_order_relaxed;  // https://llvm.org/docs/Atomics.html#monotonic
+	case llvm::AtomicOrdering::Acquire: return std::memory_order_acquire;
+	case llvm::AtomicOrdering::Release: return std::memory_order_release;
+	case llvm::AtomicOrdering::AcquireRelease: return std::memory_order_acq_rel;
+	case llvm::AtomicOrdering::SequentiallyConsistent: return std::memory_order_seq_cst;
+	default:
+		UNREACHABLE("memoryOrder: %d", int(memoryOrder));
+		return std::memory_order_acq_rel;
 	}
 }
 
-namespace rr
+static llvm::AtomicOrdering atomicOrdering(bool atomic, std::memory_order memoryOrder)
 {
-	const Capabilities Caps =
+	if(!atomic)
 	{
-		true, // CoroutinesSupported
+		return llvm::AtomicOrdering::NotAtomic;
+	}
+
+	switch(memoryOrder)
+	{
+	case std::memory_order_relaxed: return llvm::AtomicOrdering::Monotonic;  // https://llvm.org/docs/Atomics.html#monotonic
+	case std::memory_order_consume: return llvm::AtomicOrdering::Acquire;    // https://llvm.org/docs/Atomics.html#acquire: "It should also be used for C++11/C11 memory_order_consume."
+	case std::memory_order_acquire: return llvm::AtomicOrdering::Acquire;
+	case std::memory_order_release: return llvm::AtomicOrdering::Release;
+	case std::memory_order_acq_rel: return llvm::AtomicOrdering::AcquireRelease;
+	case std::memory_order_seq_cst: return llvm::AtomicOrdering::SequentiallyConsistent;
+	default:
+		UNREACHABLE("memoryOrder: %d", int(memoryOrder));
+		return llvm::AtomicOrdering::AcquireRelease;
+	}
+}
+
+template <typename T>
+static void atomicLoad(void *ptr, void *ret, llvm::AtomicOrdering ordering)
+{
+	*reinterpret_cast<T*>(ret) = std::atomic_load_explicit<T>(reinterpret_cast<std::atomic<T>*>(ptr), atomicOrdering(ordering));
+}
+
+template <typename T>
+static void atomicStore(void *ptr, void *val, llvm::AtomicOrdering ordering)
+{
+	std::atomic_store_explicit<T>(reinterpret_cast<std::atomic<T>*>(ptr), *reinterpret_cast<T*>(val), atomicOrdering(ordering));
+}
+
+#ifdef __ANDROID__
+template<typename F>
+static uint32_t sync_fetch_and_op(uint32_t volatile *ptr, uint32_t val, F f)
+{
+	// Build an arbitrary op out of looped CAS
+	for (;;)
+	{
+		uint32_t expected = *ptr;
+		uint32_t desired = f(expected, val);
+
+		if (expected == __sync_val_compare_and_swap_4(ptr, expected, desired))
+			return expected;
+	}
+}
+#endif
+
+void* resolveExternalSymbol(const char* name)
+{
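+	// Size-dispatch helpers for the atomic load/store runtime calls emitted by Reactor.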
+	struct Atomic
+	{
+		static void load(size_t size, void *ptr, void *ret, llvm::AtomicOrdering ordering)
+		{
+			switch (size)
+			{
+				case 1: atomicLoad<uint8_t>(ptr, ret, ordering); break;
+				case 2: atomicLoad<uint16_t>(ptr, ret, ordering); break;
+				case 4: atomicLoad<uint32_t>(ptr, ret, ordering); break;
+				case 8: atomicLoad<uint64_t>(ptr, ret, ordering); break;
+				default:
+					UNIMPLEMENTED("Atomic::load(size: %d)", int(size));
+			}
+		}
+		static void store(size_t size, void *ptr, void *ret, llvm::AtomicOrdering ordering)
+		{
+			switch (size)
+			{
+				case 1: atomicStore<uint8_t>(ptr, ret, ordering); break;
+				case 2: atomicStore<uint16_t>(ptr, ret, ordering); break;
+				case 4: atomicStore<uint32_t>(ptr, ret, ordering); break;
+				case 8: atomicStore<uint64_t>(ptr, ret, ordering); break;
+				default:
+					UNIMPLEMENTED("Atomic::store(size: %d)", int(size));
+			}
+		}
 	};
 
-	static std::memory_order atomicOrdering(llvm::AtomicOrdering memoryOrder)
+	struct F
 	{
-		switch(memoryOrder)
-		{
-		case llvm::AtomicOrdering::Monotonic: return std::memory_order_relaxed;  // https://llvm.org/docs/Atomics.html#monotonic
-		case llvm::AtomicOrdering::Acquire: return std::memory_order_acquire;
-		case llvm::AtomicOrdering::Release: return std::memory_order_release;
-		case llvm::AtomicOrdering::AcquireRelease: return std::memory_order_acq_rel;
-		case llvm::AtomicOrdering::SequentiallyConsistent: return std::memory_order_seq_cst;
-		default:
-			UNREACHABLE("memoryOrder: %d", int(memoryOrder));
-			return std::memory_order_acq_rel;
-		}
-	}
+		static void nop() {}
+		static void neverCalled() { UNREACHABLE("Should never be called"); }
 
-	static llvm::AtomicOrdering atomicOrdering(bool atomic, std::memory_order memoryOrder)
-	{
-		if(!atomic)
-		{
-			return llvm::AtomicOrdering::NotAtomic;
-		}
-
-		switch(memoryOrder)
-		{
-		case std::memory_order_relaxed: return llvm::AtomicOrdering::Monotonic;  // https://llvm.org/docs/Atomics.html#monotonic
-		case std::memory_order_consume: return llvm::AtomicOrdering::Acquire;    // https://llvm.org/docs/Atomics.html#acquire: "It should also be used for C++11/C11 memory_order_consume."
-		case std::memory_order_acquire: return llvm::AtomicOrdering::Acquire;
-		case std::memory_order_release: return llvm::AtomicOrdering::Release;
-		case std::memory_order_acq_rel: return llvm::AtomicOrdering::AcquireRelease;
-		case std::memory_order_seq_cst: return llvm::AtomicOrdering::SequentiallyConsistent;
-		default:
-			UNREACHABLE("memoryOrder: %d", int(memoryOrder));
-			return llvm::AtomicOrdering::AcquireRelease;
-		}
-	}
-
-	template <typename T>
-	static void atomicLoad(void *ptr, void *ret, llvm::AtomicOrdering ordering)
-	{
-		*reinterpret_cast<T*>(ret) = std::atomic_load_explicit<T>(reinterpret_cast<std::atomic<T>*>(ptr), atomicOrdering(ordering));
-	}
-
-	template <typename T>
-	static void atomicStore(void *ptr, void *val, llvm::AtomicOrdering ordering)
-	{
-		std::atomic_store_explicit<T>(reinterpret_cast<std::atomic<T>*>(ptr), *reinterpret_cast<T*>(val), atomicOrdering(ordering));
-	}
+		static void* coroutine_alloc_frame(size_t size) { return alignedAlloc(size, 16); }
+		static void coroutine_free_frame(void* ptr) { alignedFree(ptr); }
 
 #ifdef __ANDROID__
-	template<typename F>
-	static uint32_t sync_fetch_and_op(uint32_t volatile *ptr, uint32_t val, F f)
-	{
-		// Build an arbitrary op out of looped CAS
-		for (;;)
-		{
-			uint32_t expected = *ptr;
-			uint32_t desired = f(expected, val);
+		// Forwarders, since we can't take the address of compiler builtins.
+		static void sync_synchronize() { __sync_synchronize(); }
+		static uint32_t sync_fetch_and_add_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_add_4(ptr, val); }
+		static uint32_t sync_fetch_and_and_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_and_4(ptr, val); }
+		static uint32_t sync_fetch_and_or_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_or_4(ptr, val); }
+		static uint32_t sync_fetch_and_xor_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_xor_4(ptr, val); }
+		static uint32_t sync_fetch_and_sub_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_sub_4(ptr, val); }
+		static uint32_t sync_lock_test_and_set_4(uint32_t *ptr, uint32_t val) { return __sync_lock_test_and_set_4(ptr, val); }
+		static uint32_t sync_val_compare_and_swap_4(uint32_t *ptr, uint32_t expected, uint32_t desired) { return __sync_val_compare_and_swap_4(ptr, expected, desired); }
 
-			if (expected == __sync_val_compare_and_swap_4(ptr, expected, desired))
-				return expected;
-		}
-	}
+		static uint32_t sync_fetch_and_max_4(uint32_t *ptr, uint32_t val) { return sync_fetch_and_op(ptr, val, [](int32_t a, int32_t b) { return std::max(a,b);}); }
+		static uint32_t sync_fetch_and_min_4(uint32_t *ptr, uint32_t val) { return sync_fetch_and_op(ptr, val, [](int32_t a, int32_t b) { return std::min(a,b);}); }
+		static uint32_t sync_fetch_and_umax_4(uint32_t *ptr, uint32_t val) { return sync_fetch_and_op(ptr, val, [](uint32_t a, uint32_t b) { return std::max(a,b);}); }
+		static uint32_t sync_fetch_and_umin_4(uint32_t *ptr, uint32_t val) { return sync_fetch_and_op(ptr, val, [](uint32_t a, uint32_t b) { return std::min(a,b);}); }
 #endif
+	};
 
-	void* resolveExternalSymbol(const char* name)
+	class Resolver
 	{
-		struct Atomic
+	public:
+		using FunctionMap = std::unordered_map<std::string, void *>;
+
+		FunctionMap functions;
+
+		Resolver()
 		{
-			static void load(size_t size, void *ptr, void *ret, llvm::AtomicOrdering ordering)
-			{
-				switch (size)
-				{
-					case 1: atomicLoad<uint8_t>(ptr, ret, ordering); break;
-					case 2: atomicLoad<uint16_t>(ptr, ret, ordering); break;
-					case 4: atomicLoad<uint32_t>(ptr, ret, ordering); break;
-					case 8: atomicLoad<uint64_t>(ptr, ret, ordering); break;
-					default:
-						UNIMPLEMENTED("Atomic::load(size: %d)", int(size));
-				}
-			}
-			static void store(size_t size, void *ptr, void *ret, llvm::AtomicOrdering ordering)
-			{
-				switch (size)
-				{
-					case 1: atomicStore<uint8_t>(ptr, ret, ordering); break;
-					case 2: atomicStore<uint16_t>(ptr, ret, ordering); break;
-					case 4: atomicStore<uint32_t>(ptr, ret, ordering); break;
-					case 8: atomicStore<uint64_t>(ptr, ret, ordering); break;
-					default:
-						UNIMPLEMENTED("Atomic::store(size: %d)", int(size));
-				}
-			}
-		};
+			functions.emplace("nop", reinterpret_cast<void*>(F::nop));
+			functions.emplace("floorf", reinterpret_cast<void*>(floorf));
+			functions.emplace("nearbyintf", reinterpret_cast<void*>(nearbyintf));
+			functions.emplace("truncf", reinterpret_cast<void*>(truncf));
+			functions.emplace("printf", reinterpret_cast<void*>(printf));
+			functions.emplace("puts", reinterpret_cast<void*>(puts));
+			functions.emplace("fmodf", reinterpret_cast<void*>(fmodf));
 
-		struct F
-		{
-			static void nop() {}
-			static void neverCalled() { UNREACHABLE("Should never be called"); }
+			functions.emplace("sinf", reinterpret_cast<void*>(sinf));
+			functions.emplace("cosf", reinterpret_cast<void*>(cosf));
+			functions.emplace("asinf", reinterpret_cast<void*>(asinf));
+			functions.emplace("acosf", reinterpret_cast<void*>(acosf));
+			functions.emplace("atanf", reinterpret_cast<void*>(atanf));
+			functions.emplace("sinhf", reinterpret_cast<void*>(sinhf));
+			functions.emplace("coshf", reinterpret_cast<void*>(coshf));
+			functions.emplace("tanhf", reinterpret_cast<void*>(tanhf));
+			functions.emplace("asinhf", reinterpret_cast<void*>(asinhf));
+			functions.emplace("acoshf", reinterpret_cast<void*>(acoshf));
+			functions.emplace("atanhf", reinterpret_cast<void*>(atanhf));
+			functions.emplace("atan2f", reinterpret_cast<void*>(atan2f));
+			functions.emplace("powf", reinterpret_cast<void*>(powf));
+			functions.emplace("expf", reinterpret_cast<void*>(expf));
+			functions.emplace("logf", reinterpret_cast<void*>(logf));
+			functions.emplace("exp2f", reinterpret_cast<void*>(exp2f));
+			functions.emplace("log2f", reinterpret_cast<void*>(log2f));
 
-			static void* coroutine_alloc_frame(size_t size) { return alignedAlloc(size, 16); }
-			static void coroutine_free_frame(void* ptr) { alignedFree(ptr); }
+			functions.emplace("sin", reinterpret_cast<void*>(static_cast<double(*)(double)>(sin)));
+			functions.emplace("cos", reinterpret_cast<void*>(static_cast<double(*)(double)>(cos)));
+			functions.emplace("asin", reinterpret_cast<void*>(static_cast<double(*)(double)>(asin)));
+			functions.emplace("acos", reinterpret_cast<void*>(static_cast<double(*)(double)>(acos)));
+			functions.emplace("atan", reinterpret_cast<void*>(static_cast<double(*)(double)>(atan)));
+			functions.emplace("sinh", reinterpret_cast<void*>(static_cast<double(*)(double)>(sinh)));
+			functions.emplace("cosh", reinterpret_cast<void*>(static_cast<double(*)(double)>(cosh)));
+			functions.emplace("tanh", reinterpret_cast<void*>(static_cast<double(*)(double)>(tanh)));
+			functions.emplace("asinh", reinterpret_cast<void*>(static_cast<double(*)(double)>(asinh)));
+			functions.emplace("acosh", reinterpret_cast<void*>(static_cast<double(*)(double)>(acosh)));
+			functions.emplace("atanh", reinterpret_cast<void*>(static_cast<double(*)(double)>(atanh)));
+			functions.emplace("atan2", reinterpret_cast<void*>(static_cast<double(*)(double,double)>(atan2)));
+			functions.emplace("pow", reinterpret_cast<void*>(static_cast<double(*)(double,double)>(pow)));
+			functions.emplace("exp", reinterpret_cast<void*>(static_cast<double(*)(double)>(exp)));
+			functions.emplace("log", reinterpret_cast<void*>(static_cast<double(*)(double)>(log)));
+			functions.emplace("exp2", reinterpret_cast<void*>(static_cast<double(*)(double)>(exp2)));
+			functions.emplace("log2", reinterpret_cast<void*>(static_cast<double(*)(double)>(log2)));
 
-#ifdef __ANDROID__
-			// forwarders since we can't take address of builtins
-			static void sync_synchronize() { __sync_synchronize(); }
-			static uint32_t sync_fetch_and_add_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_add_4(ptr, val); }
-			static uint32_t sync_fetch_and_and_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_and_4(ptr, val); }
-			static uint32_t sync_fetch_and_or_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_or_4(ptr, val); }
-			static uint32_t sync_fetch_and_xor_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_xor_4(ptr, val); }
-			static uint32_t sync_fetch_and_sub_4(uint32_t *ptr, uint32_t val) { return __sync_fetch_and_sub_4(ptr, val); }
-			static uint32_t sync_lock_test_and_set_4(uint32_t *ptr, uint32_t val) { return __sync_lock_test_and_set_4(ptr, val); }
-			static uint32_t sync_val_compare_and_swap_4(uint32_t *ptr, uint32_t expected, uint32_t desired) { return __sync_val_compare_and_swap_4(ptr, expected, desired); }
+			functions.emplace("atomic_load", reinterpret_cast<void*>(Atomic::load));
+			functions.emplace("atomic_store", reinterpret_cast<void*>(Atomic::store));
 
-			static uint32_t sync_fetch_and_max_4(uint32_t *ptr, uint32_t val) { return sync_fetch_and_op(ptr, val, [](int32_t a, int32_t b) { return std::max(a,b);}); }
-			static uint32_t sync_fetch_and_min_4(uint32_t *ptr, uint32_t val) { return sync_fetch_and_op(ptr, val, [](int32_t a, int32_t b) { return std::min(a,b);}); }
-			static uint32_t sync_fetch_and_umax_4(uint32_t *ptr, uint32_t val) { return sync_fetch_and_op(ptr, val, [](uint32_t a, uint32_t b) { return std::max(a,b);}); }
-			static uint32_t sync_fetch_and_umin_4(uint32_t *ptr, uint32_t val) { return sync_fetch_and_op(ptr, val, [](uint32_t a, uint32_t b) { return std::min(a,b);}); }
-#endif
-		};
-
-		class Resolver
-		{
-		public:
-			using FunctionMap = std::unordered_map<std::string, void *>;
-
-			FunctionMap functions;
-
-			Resolver()
-			{
-				functions.emplace("nop", reinterpret_cast<void*>(F::nop));
-				functions.emplace("floorf", reinterpret_cast<void*>(floorf));
-				functions.emplace("nearbyintf", reinterpret_cast<void*>(nearbyintf));
-				functions.emplace("truncf", reinterpret_cast<void*>(truncf));
-				functions.emplace("printf", reinterpret_cast<void*>(printf));
-				functions.emplace("puts", reinterpret_cast<void*>(puts));
-				functions.emplace("fmodf", reinterpret_cast<void*>(fmodf));
-
-				functions.emplace("sinf", reinterpret_cast<void*>(sinf));
-				functions.emplace("cosf", reinterpret_cast<void*>(cosf));
-				functions.emplace("asinf", reinterpret_cast<void*>(asinf));
-				functions.emplace("acosf", reinterpret_cast<void*>(acosf));
-				functions.emplace("atanf", reinterpret_cast<void*>(atanf));
-				functions.emplace("sinhf", reinterpret_cast<void*>(sinhf));
-				functions.emplace("coshf", reinterpret_cast<void*>(coshf));
-				functions.emplace("tanhf", reinterpret_cast<void*>(tanhf));
-				functions.emplace("asinhf", reinterpret_cast<void*>(asinhf));
-				functions.emplace("acoshf", reinterpret_cast<void*>(acoshf));
-				functions.emplace("atanhf", reinterpret_cast<void*>(atanhf));
-				functions.emplace("atan2f", reinterpret_cast<void*>(atan2f));
-				functions.emplace("powf", reinterpret_cast<void*>(powf));
-				functions.emplace("expf", reinterpret_cast<void*>(expf));
-				functions.emplace("logf", reinterpret_cast<void*>(logf));
-				functions.emplace("exp2f", reinterpret_cast<void*>(exp2f));
-				functions.emplace("log2f", reinterpret_cast<void*>(log2f));
-
-				functions.emplace("sin", reinterpret_cast<void*>(static_cast<double(*)(double)>(sin)));
-				functions.emplace("cos", reinterpret_cast<void*>(static_cast<double(*)(double)>(cos)));
-				functions.emplace("asin", reinterpret_cast<void*>(static_cast<double(*)(double)>(asin)));
-				functions.emplace("acos", reinterpret_cast<void*>(static_cast<double(*)(double)>(acos)));
-				functions.emplace("atan", reinterpret_cast<void*>(static_cast<double(*)(double)>(atan)));
-				functions.emplace("sinh", reinterpret_cast<void*>(static_cast<double(*)(double)>(sinh)));
-				functions.emplace("cosh", reinterpret_cast<void*>(static_cast<double(*)(double)>(cosh)));
-				functions.emplace("tanh", reinterpret_cast<void*>(static_cast<double(*)(double)>(tanh)));
-				functions.emplace("asinh", reinterpret_cast<void*>(static_cast<double(*)(double)>(asinh)));
-				functions.emplace("acosh", reinterpret_cast<void*>(static_cast<double(*)(double)>(acosh)));
-				functions.emplace("atanh", reinterpret_cast<void*>(static_cast<double(*)(double)>(atanh)));
-				functions.emplace("atan2", reinterpret_cast<void*>(static_cast<double(*)(double,double)>(atan2)));
-				functions.emplace("pow", reinterpret_cast<void*>(static_cast<double(*)(double,double)>(pow)));
-				functions.emplace("exp", reinterpret_cast<void*>(static_cast<double(*)(double)>(exp)));
-				functions.emplace("log", reinterpret_cast<void*>(static_cast<double(*)(double)>(log)));
-				functions.emplace("exp2", reinterpret_cast<void*>(static_cast<double(*)(double)>(exp2)));
-				functions.emplace("log2", reinterpret_cast<void*>(static_cast<double(*)(double)>(log2)));
-
-				functions.emplace("atomic_load", reinterpret_cast<void*>(Atomic::load));
-				functions.emplace("atomic_store", reinterpret_cast<void*>(Atomic::store));
-
-				// FIXME (b/119409619): use an allocator here so we can control all memory allocations
-				functions.emplace("coroutine_alloc_frame", reinterpret_cast<void*>(F::coroutine_alloc_frame));
-				functions.emplace("coroutine_free_frame", reinterpret_cast<void*>(F::coroutine_free_frame));
+			// FIXME (b/119409619): use an allocator here so we can control all memory allocations
+			functions.emplace("coroutine_alloc_frame", reinterpret_cast<void*>(F::coroutine_alloc_frame));
+			functions.emplace("coroutine_free_frame", reinterpret_cast<void*>(F::coroutine_free_frame));
 
 #ifdef __APPLE__
-				functions.emplace("sincosf_stret", reinterpret_cast<void*>(__sincosf_stret));
+			functions.emplace("sincosf_stret", reinterpret_cast<void*>(__sincosf_stret));
 #elif defined(__linux__)
-				functions.emplace("sincosf", reinterpret_cast<void*>(sincosf));
+			functions.emplace("sincosf", reinterpret_cast<void*>(sincosf));
 #elif defined(_WIN64)
-				functions.emplace("chkstk", reinterpret_cast<void*>(__chkstk));
+			functions.emplace("chkstk", reinterpret_cast<void*>(__chkstk));
 #elif defined(_WIN32)
-				functions.emplace("chkstk", reinterpret_cast<void*>(_chkstk));
+			functions.emplace("chkstk", reinterpret_cast<void*>(_chkstk));
 #endif
 
 #ifdef __ANDROID__
-				functions.emplace("aeabi_unwind_cpp_pr0", reinterpret_cast<void*>(F::neverCalled));
-				functions.emplace("sync_synchronize", reinterpret_cast<void*>(F::sync_synchronize));
-				functions.emplace("sync_fetch_and_add_4", reinterpret_cast<void*>(F::sync_fetch_and_add_4));
-				functions.emplace("sync_fetch_and_and_4", reinterpret_cast<void*>(F::sync_fetch_and_and_4));
-				functions.emplace("sync_fetch_and_or_4", reinterpret_cast<void*>(F::sync_fetch_and_or_4));
-				functions.emplace("sync_fetch_and_xor_4", reinterpret_cast<void*>(F::sync_fetch_and_xor_4));
-				functions.emplace("sync_fetch_and_sub_4", reinterpret_cast<void*>(F::sync_fetch_and_sub_4));
-				functions.emplace("sync_lock_test_and_set_4", reinterpret_cast<void*>(F::sync_lock_test_and_set_4));
-				functions.emplace("sync_val_compare_and_swap_4", reinterpret_cast<void*>(F::sync_val_compare_and_swap_4));
-				functions.emplace("sync_fetch_and_max_4", reinterpret_cast<void*>(F::sync_fetch_and_max_4));
-				functions.emplace("sync_fetch_and_min_4", reinterpret_cast<void*>(F::sync_fetch_and_min_4));
-				functions.emplace("sync_fetch_and_umax_4", reinterpret_cast<void*>(F::sync_fetch_and_umax_4));
-				functions.emplace("sync_fetch_and_umin_4", reinterpret_cast<void*>(F::sync_fetch_and_umin_4));
-	#endif
-			}
-		};
-
-		static Resolver resolver;
-
-		// Trim off any underscores from the start of the symbol. LLVM likes
-		// to prepend these on macOS.
-		const char* trimmed = name;
-		while (trimmed[0] == '_') { trimmed++; }
-
-		auto it = resolver.functions.find(trimmed);
-		// Missing functions will likely make the module fail in exciting non-obvious ways.
-		ASSERT_MSG(it != resolver.functions.end(), "Missing external function: '%s'", name);
-		return it->second;
-	}
-
-	// The abstract Type* types are implemented as LLVM types, except that
-	// 64-bit vectors are emulated using 128-bit ones to avoid use of MMX in x86
-	// and VFP in ARM, and eliminate the overhead of converting them to explicit
-	// 128-bit ones. LLVM types are pointers, so we can represent emulated types
-	// as abstract pointers with small enum values.
-	enum InternalType : uintptr_t
-	{
-		// Emulated types:
-		Type_v2i32,
-		Type_v4i16,
-		Type_v2i16,
-		Type_v8i8,
-		Type_v4i8,
-		Type_v2f32,
-		EmulatedTypeCount,
-		// Returned by asInternalType() to indicate that the abstract Type*
-		// should be interpreted as LLVM type pointer:
-		Type_LLVM
+			functions.emplace("aeabi_unwind_cpp_pr0", reinterpret_cast<void*>(F::neverCalled));
+			functions.emplace("sync_synchronize", reinterpret_cast<void*>(F::sync_synchronize));
+			functions.emplace("sync_fetch_and_add_4", reinterpret_cast<void*>(F::sync_fetch_and_add_4));
+			functions.emplace("sync_fetch_and_and_4", reinterpret_cast<void*>(F::sync_fetch_and_and_4));
+			functions.emplace("sync_fetch_and_or_4", reinterpret_cast<void*>(F::sync_fetch_and_or_4));
+			functions.emplace("sync_fetch_and_xor_4", reinterpret_cast<void*>(F::sync_fetch_and_xor_4));
+			functions.emplace("sync_fetch_and_sub_4", reinterpret_cast<void*>(F::sync_fetch_and_sub_4));
+			functions.emplace("sync_lock_test_and_set_4", reinterpret_cast<void*>(F::sync_lock_test_and_set_4));
+			functions.emplace("sync_val_compare_and_swap_4", reinterpret_cast<void*>(F::sync_val_compare_and_swap_4));
+			functions.emplace("sync_fetch_and_max_4", reinterpret_cast<void*>(F::sync_fetch_and_max_4));
+			functions.emplace("sync_fetch_and_min_4", reinterpret_cast<void*>(F::sync_fetch_and_min_4));
+			functions.emplace("sync_fetch_and_umax_4", reinterpret_cast<void*>(F::sync_fetch_and_umax_4));
+			functions.emplace("sync_fetch_and_umin_4", reinterpret_cast<void*>(F::sync_fetch_and_umin_4));
+#endif
+		}
 	};
 
-	inline InternalType asInternalType(Type *type)
-	{
-		InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
-		return (t < EmulatedTypeCount) ? t : Type_LLVM;
-	}
+	static Resolver resolver;
 
-	llvm::Type *T(Type *t)
+	// Trim off any underscores from the start of the symbol. LLVM likes
+	// to prepend these on macOS.
+	const char* trimmed = name;
+	while (trimmed[0] == '_') { trimmed++; }
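+	// For example, on macOS the requested symbol may arrive as "_sinf",
+	// since Mach-O symbol names carry a leading underscore; trimming it
+	// yields "sinf", which matches the entry registered in the resolver above.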
+
+	auto it = resolver.functions.find(trimmed);
+	// Missing functions will likely make the module fail in exciting non-obvious ways.
+	ASSERT_MSG(it != resolver.functions.end(), "Missing external function: '%s'", name);
+	return it->second;
+}
+
+// The abstract Type* types are implemented as LLVM types, except that
+// 64-bit vectors are emulated using 128-bit ones to avoid use of MMX in x86
+// and VFP in ARM, and eliminate the overhead of converting them to explicit
+// 128-bit ones. LLVM types are pointers, so we can represent emulated types
+// as abstract pointers with small enum values.
+enum InternalType : uintptr_t
+{
+	// Emulated types:
+	Type_v2i32,
+	Type_v4i16,
+	Type_v2i16,
+	Type_v8i8,
+	Type_v4i8,
+	Type_v2f32,
+	EmulatedTypeCount,
+	// Returned by asInternalType() to indicate that the abstract Type*
+	// should be interpreted as LLVM type pointer:
+	Type_LLVM
+};
+
+inline InternalType asInternalType(Type *type)
+{
+	InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
+	return (t < EmulatedTypeCount) ? t : Type_LLVM;
+}
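+// A brief sketch of the tagging scheme (illustrative only, with made-up names):
+//
+//   Type *emulated = T(Type_v2f32);   // pointer bits hold the small tag 5
+//   asInternalType(emulated);         // == Type_v2f32
+//
+//   llvm::Type *f = llvm::Type::getFloatTy(context);
+//   asInternalType(reinterpret_cast<Type *>(f));   // == Type_LLVM, since any
+//                                                  // genuine pointer compares
+//                                                  // >= EmulatedTypeCount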
+
+llvm::Type *T(Type *t)
+{
+	// Use 128-bit vectors to implement logically shorter ones.
+	switch(asInternalType(t))
 	{
-		// Use 128-bit vectors to implement logically shorter ones.
-		switch(asInternalType(t))
+	case Type_v2i32: return T(Int4::getType());
+	case Type_v4i16: return T(Short8::getType());
+	case Type_v2i16: return T(Short8::getType());
+	case Type_v8i8:  return T(Byte16::getType());
+	case Type_v4i8:  return T(Byte16::getType());
+	case Type_v2f32: return T(Float4::getType());
+	case Type_LLVM:  return reinterpret_cast<llvm::Type*>(t);
+	default:
+		UNREACHABLE("asInternalType(t): %d", int(asInternalType(t)));
+		return nullptr;
+	}
+}
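+// For instance, a value of the emulated Type_v2f32 type occupies a full
+// <4 x float> register; only the low two lanes carry data, and the upper
+// lanes are left undefined (see createLoad below, which inserts the loaded
+// 64 bits into an undef vector).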
+
+Type *T(InternalType t)
+{
+	return reinterpret_cast<Type*>(t);
+}
+
+inline std::vector<llvm::Type*> &T(std::vector<Type*> &t)
+{
+	return reinterpret_cast<std::vector<llvm::Type*>&>(t);
+}
+
+inline llvm::BasicBlock *B(BasicBlock *t)
+{
+	return reinterpret_cast<llvm::BasicBlock*>(t);
+}
+
+inline BasicBlock *B(llvm::BasicBlock *t)
+{
+	return reinterpret_cast<BasicBlock*>(t);
+}
+
+static size_t typeSize(Type *type)
+{
+	switch(asInternalType(type))
+	{
+	case Type_v2i32: return 8;
+	case Type_v4i16: return 8;
+	case Type_v2i16: return 4;
+	case Type_v8i8:  return 8;
+	case Type_v4i8:  return 4;
+	case Type_v2f32: return 8;
+	case Type_LLVM:
 		{
-		case Type_v2i32: return T(Int4::getType());
-		case Type_v4i16: return T(Short8::getType());
-		case Type_v2i16: return T(Short8::getType());
-		case Type_v8i8:  return T(Byte16::getType());
-		case Type_v4i8:  return T(Byte16::getType());
-		case Type_v2f32: return T(Float4::getType());
-		case Type_LLVM:  return reinterpret_cast<llvm::Type*>(t);
-		default:
-			UNREACHABLE("asInternalType(t): %d", int(asInternalType(t)));
-			return nullptr;
-		}
-	}
+			llvm::Type *t = T(type);
 
-	Type *T(InternalType t)
-	{
-		return reinterpret_cast<Type*>(t);
-	}
-
-	inline std::vector<llvm::Type*> &T(std::vector<Type*> &t)
-	{
-		return reinterpret_cast<std::vector<llvm::Type*>&>(t);
-	}
-
-	inline llvm::BasicBlock *B(BasicBlock *t)
-	{
-		return reinterpret_cast<llvm::BasicBlock*>(t);
-	}
-
-	inline BasicBlock *B(llvm::BasicBlock *t)
-	{
-		return reinterpret_cast<BasicBlock*>(t);
-	}
-
-	static size_t typeSize(Type *type)
-	{
-		switch(asInternalType(type))
-		{
-		case Type_v2i32: return 8;
-		case Type_v4i16: return 8;
-		case Type_v2i16: return 4;
-		case Type_v8i8:  return 8;
-		case Type_v4i8:  return 4;
-		case Type_v2f32: return 8;
-		case Type_LLVM:
+			if(t->isPointerTy())
 			{
-				llvm::Type *t = T(type);
-
-				if(t->isPointerTy())
-				{
-					return sizeof(void*);
-				}
-
-				// At this point we should only have LLVM 'primitive' types.
-				unsigned int bits = t->getPrimitiveSizeInBits();
-				ASSERT_MSG(bits != 0, "bits: %d", int(bits));
-
-				// TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
-				// but are typically stored as one byte. The DataLayout structure should
-				// be used here and many other places if this assumption fails.
-				return (bits + 7) / 8;
+				return sizeof(void*);
 			}
-			break;
-		default:
-			UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
-			return 0;
+
+			// At this point we should only have LLVM 'primitive' types.
+			unsigned int bits = t->getPrimitiveSizeInBits();
+			ASSERT_MSG(bits != 0, "bits: %d", int(bits));
+
+			// TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
+			// but are typically stored as one byte. The DataLayout structure should
+			// be used here and many other places if this assumption fails.
+			return (bits + 7) / 8;
 		}
+		break;
+	default:
+		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
+		return 0;
 	}
+}
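+// Examples of the rounding above: an i32 yields (32 + 7) / 8 == 4 bytes,
+// and an i1 boolean yields (1 + 7) / 8 == 1 byte, matching the one-byte
+// storage convention mentioned in the TODO.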
 
-	static unsigned int elementCount(Type *type)
+static unsigned int elementCount(Type *type)
+{
+	switch(asInternalType(type))
 	{
-		switch(asInternalType(type))
+	case Type_v2i32: return 2;
+	case Type_v4i16: return 4;
+	case Type_v2i16: return 2;
+	case Type_v8i8:  return 8;
+	case Type_v4i8:  return 4;
+	case Type_v2f32: return 2;
+	case Type_LLVM:  return llvm::cast<llvm::VectorType>(T(type))->getNumElements();
+	default:
+		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
+		return 0;
+	}
+}
+
+static ::llvm::Function* createFunction(const char *name, ::llvm::Type *retTy, const std::vector<::llvm::Type*> &params)
+{
+	llvm::FunctionType *functionType = llvm::FunctionType::get(retTy, params, false);
+	auto func = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, name, jit->module.get());
+	func->setDoesNotThrow();
+	func->setCallingConv(llvm::CallingConv::C);
+	return func;
+}
+
+Nucleus::Nucleus()
+{
+	::codegenMutex.lock();   // Reactor and LLVM are currently not thread safe
+
+	ASSERT(jit == nullptr);
+	jit.reset(new JITBuilder(Nucleus::getDefaultConfig()));
+}
+
+Nucleus::~Nucleus()
+{
+	jit.reset();
+	::codegenMutex.unlock();
+}
+
+void Nucleus::setDefaultConfig(const Config &cfg)
+{
+	std::unique_lock<std::mutex> lock(::defaultConfigLock);
+	::defaultConfig() = cfg;
+}
+
+void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
+{
+	std::unique_lock<std::mutex> lock(::defaultConfigLock);
+	auto &config = ::defaultConfig();
+	config = cfgEdit.apply(config);
+}
+
+Config Nucleus::getDefaultConfig()
+{
+	std::unique_lock<std::mutex> lock(::defaultConfigLock);
+	return ::defaultConfig();
+}
+
+std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
+{
+	auto cfg = cfgEdit.apply(jit->config);
+
+	if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
+	{
+		llvm::Type *type = jit->function->getReturnType();
+
+		if(type->isVoidTy())
 		{
-		case Type_v2i32: return 2;
-		case Type_v4i16: return 4;
-		case Type_v2i16: return 2;
-		case Type_v8i8:  return 8;
-		case Type_v4i8:  return 4;
-		case Type_v2f32: return 2;
-		case Type_LLVM:  return llvm::cast<llvm::VectorType>(T(type))->getNumElements();
-		default:
-			UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
-			return 0;
-		}
-	}
-
-	static ::llvm::Function* createFunction(const char *name, ::llvm::Type *retTy, const std::vector<::llvm::Type*> &params)
-	{
-		llvm::FunctionType *functionType = llvm::FunctionType::get(retTy, params, false);
-		auto func = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, name, jit->module.get());
-		func->setDoesNotThrow();
-		func->setCallingConv(llvm::CallingConv::C);
-		return func;
-	}
-
-	Nucleus::Nucleus()
-	{
-		::codegenMutex.lock();   // Reactor and LLVM are currently not thread safe
-
-		ASSERT(jit == nullptr);
-		jit.reset(new JITBuilder(Nucleus::getDefaultConfig()));
-	}
-
-	Nucleus::~Nucleus()
-	{
-		jit.reset();
-		::codegenMutex.unlock();
-	}
-
-	void Nucleus::setDefaultConfig(const Config &cfg)
-	{
-		std::unique_lock<std::mutex> lock(::defaultConfigLock);
-		::defaultConfig() = cfg;
-	}
-
-	void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
-	{
-		std::unique_lock<std::mutex> lock(::defaultConfigLock);
-		auto &config = ::defaultConfig();
-		config = cfgEdit.apply(config);
-	}
-
-	Config Nucleus::getDefaultConfig()
-	{
-		std::unique_lock<std::mutex> lock(::defaultConfigLock);
-		return ::defaultConfig();
-	}
-
-	std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
-	{
-		auto cfg = cfgEdit.apply(jit->config);
-
-		if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
-		{
-			llvm::Type *type = jit->function->getReturnType();
-
-			if(type->isVoidTy())
-			{
-				createRetVoid();
-			}
-			else
-			{
-				createRet(V(llvm::UndefValue::get(type)));
-			}
-		}
-
-#ifdef ENABLE_RR_DEBUG_INFO
-		if (jit->debugInfo != nullptr)
-		{
-			jit->debugInfo->Finalize();
-		}
-#endif // ENABLE_RR_DEBUG_INFO
-
-		if(false)
-		{
-			std::error_code error;
-			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
-			jit->module->print(file, 0);
-		}
-
-#if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
-		{
-			llvm::legacy::PassManager pm;
-			pm.add(llvm::createVerifierPass());
-			pm.run(*jit->module);
-		}
-#endif // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
-
-		jit->optimize(cfg);
-
-		if(false)
-		{
-			std::error_code error;
-			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
-			jit->module->print(file, 0);
-		}
-
-		auto routine = jit->acquireRoutine(&jit->function, 1, cfg);
-		jit.reset();
-
-		return routine;
-	}
-
-	Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
-	{
-		// Need to allocate it in the entry block for mem2reg to work
-		llvm::BasicBlock &entryBlock = jit->function->getEntryBlock();
-
-		llvm::Instruction *declaration;
-
-		if(arraySize)
-		{
-			declaration = new llvm::AllocaInst(T(type), 0, V(Nucleus::createConstantInt(arraySize)));
+			createRetVoid();
 		}
 		else
 		{
-			declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value*)nullptr);
+			createRet(V(llvm::UndefValue::get(type)));
 		}
-
-		entryBlock.getInstList().push_front(declaration);
-
-		return V(declaration);
 	}
 
-	BasicBlock *Nucleus::createBasicBlock()
-	{
-		return B(llvm::BasicBlock::Create(jit->context, "", jit->function));
-	}
-
-	BasicBlock *Nucleus::getInsertBlock()
-	{
-		return B(jit->builder->GetInsertBlock());
-	}
-
-	void Nucleus::setInsertBlock(BasicBlock *basicBlock)
-	{
-	//	assert(jit->builder->GetInsertBlock()->back().isTerminator());
-
-		Variable::materializeAll();
-
-		jit->builder->SetInsertPoint(B(basicBlock));
-	}
-
-	void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
-	{
-		jit->function = rr::createFunction("", T(ReturnType), T(Params));
-
 #ifdef ENABLE_RR_DEBUG_INFO
-		jit->debugInfo = std::unique_ptr<DebugInfo>(new DebugInfo(jit->builder.get(), &jit->context, jit->module.get(), jit->function));
+	if (jit->debugInfo != nullptr)
+	{
+		jit->debugInfo->Finalize();
+	}
 #endif // ENABLE_RR_DEBUG_INFO
 
-		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "", jit->function));
+	if(false)
+	{
+		std::error_code error;
+		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
+		jit->module->print(file, 0);
 	}
 
-	Value *Nucleus::getArgument(unsigned int index)
+#if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
 	{
-		llvm::Function::arg_iterator args = jit->function->arg_begin();
+		llvm::legacy::PassManager pm;
+		pm.add(llvm::createVerifierPass());
+		pm.run(*jit->module);
+	}
+#endif // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
 
-		while(index)
-		{
-			args++;
-			index--;
-		}
+	jit->optimize(cfg);
 
-		return V(&*args);
+	if(false)
+	{
+		std::error_code error;
+		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
+		jit->module->print(file, 0);
 	}
 
-	void Nucleus::createRetVoid()
+	auto routine = jit->acquireRoutine(&jit->function, 1, cfg);
+	jit.reset();
+
+	return routine;
+}
+
+Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
+{
+	// Need to allocate it in the entry block for mem2reg to work
+	llvm::BasicBlock &entryBlock = jit->function->getEntryBlock();
+
+	llvm::Instruction *declaration;
+
+	if(arraySize)
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-
-		ASSERT_MSG(jit->function->getReturnType() == T(Void::getType()), "Return type mismatch");
-
-		// Code generated after this point is unreachable, so any variables
-		// being read can safely return an undefined value. We have to avoid
-		// materializing variables after the terminator ret instruction.
-		Variable::killUnmaterialized();
-
-		jit->builder->CreateRetVoid();
+		declaration = new llvm::AllocaInst(T(type), 0, V(Nucleus::createConstantInt(arraySize)));
+	}
+	else
+	{
+		declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value*)nullptr);
 	}
 
-	void Nucleus::createRet(Value *v)
+	entryBlock.getInstList().push_front(declaration);
+
+	return V(declaration);
+}
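+// Why the entry block matters for mem2reg (a rough sketch of the IR):
+//
+//   entry:
+//     %v = alloca i32      ; hoisted here so the mem2reg pass can promote
+//     br label %loop       ; %v to an SSA register
+//   loop:
+//     ...                  ; an alloca emitted here instead would execute on
+//                          ; every iteration and would not be promoted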
+
+BasicBlock *Nucleus::createBasicBlock()
+{
+	return B(llvm::BasicBlock::Create(jit->context, "", jit->function));
+}
+
+BasicBlock *Nucleus::getInsertBlock()
+{
+	return B(jit->builder->GetInsertBlock());
+}
+
+void Nucleus::setInsertBlock(BasicBlock *basicBlock)
+{
+//	assert(jit->builder->GetInsertBlock()->back().isTerminator());
+
+	Variable::materializeAll();
+
+	jit->builder->SetInsertPoint(B(basicBlock));
+}
+
+void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
+{
+	jit->function = rr::createFunction("", T(ReturnType), T(Params));
+
+#ifdef ENABLE_RR_DEBUG_INFO
+	jit->debugInfo = std::unique_ptr<DebugInfo>(new DebugInfo(jit->builder.get(), &jit->context, jit->module.get(), jit->function));
+#endif // ENABLE_RR_DEBUG_INFO
+
+	jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "", jit->function));
+}
+
+Value *Nucleus::getArgument(unsigned int index)
+{
+	llvm::Function::arg_iterator args = jit->function->arg_begin();
+
+	while(index)
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-
-		ASSERT_MSG(jit->function->getReturnType() == V(v)->getType(), "Return type mismatch");
-
-		// Code generated after this point is unreachable, so any variables
-		// being read can safely return an undefined value. We have to avoid
-		// materializing variables after the terminator ret instruction.
-		Variable::killUnmaterialized();
-
-		jit->builder->CreateRet(V(v));
+		args++;
+		index--;
 	}
 
-	void Nucleus::createBr(BasicBlock *dest)
+	return V(&*args);
+}
+
+void Nucleus::createRetVoid()
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+
+	ASSERT_MSG(jit->function->getReturnType() == T(Void::getType()), "Return type mismatch");
+
+	// Code generated after this point is unreachable, so any variables
+	// being read can safely return an undefined value. We have to avoid
+	// materializing variables after the terminator ret instruction.
+	Variable::killUnmaterialized();
+
+	jit->builder->CreateRetVoid();
+}
+
+void Nucleus::createRet(Value *v)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+
+	ASSERT_MSG(jit->function->getReturnType() == V(v)->getType(), "Return type mismatch");
+
+	// Code generated after this point is unreachable, so any variables
+	// being read can safely return an undefined value. We have to avoid
+	// materializing variables after the terminator ret instruction.
+	Variable::killUnmaterialized();
+
+	jit->builder->CreateRet(V(v));
+}
+
+void Nucleus::createBr(BasicBlock *dest)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Variable::materializeAll();
+
+	jit->builder->CreateBr(B(dest));
+}
+
+void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Variable::materializeAll();
+	jit->builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
+}
+
+Value *Nucleus::createAdd(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAdd(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createSub(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateSub(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createMul(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateMul(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateUDiv(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateSDiv(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFAdd(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFSub(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFSub(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFMul(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFMul(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFDiv(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createURem(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateURem(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createSRem(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateSRem(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFRem(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFRem(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createShl(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateShl(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createLShr(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateLShr(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createAShr(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAShr(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createAnd(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAnd(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createOr(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateOr(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createXor(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateXor(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createNeg(Value *v)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateNeg(V(v)));
+}
+
+Value *Nucleus::createFNeg(Value *v)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFNeg(V(v)));
+}
+
+Value *Nucleus::createNot(Value *v)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateNot(V(v)));
+}
+
+Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	switch(asInternalType(type))
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Variable::materializeAll();
-
-		jit->builder->CreateBr(B(dest));
-	}
-
-	void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Variable::materializeAll();
-		jit->builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
-	}
-
-	Value *Nucleus::createAdd(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAdd(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createSub(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateSub(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createMul(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateMul(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateUDiv(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateSDiv(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFAdd(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFSub(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFSub(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFMul(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFMul(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFDiv(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createURem(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateURem(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createSRem(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateSRem(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFRem(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFRem(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createShl(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateShl(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createLShr(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateLShr(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createAShr(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAShr(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createAnd(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAnd(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createOr(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateOr(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createXor(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateXor(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createNeg(Value *v)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateNeg(V(v)));
-	}
-
-	Value *Nucleus::createFNeg(Value *v)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFNeg(V(v)));
-	}
-
-	Value *Nucleus::createNot(Value *v)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateNot(V(v)));
-	}
-
-	Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		switch(asInternalType(type))
-		{
-		case Type_v2i32:
-		case Type_v4i16:
-		case Type_v8i8:
-		case Type_v2f32:
-			return createBitCast(
-				createInsertElement(
-					V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2))),
-					createLoad(createBitCast(ptr, Pointer<Long>::getType()), Long::getType(), isVolatile, alignment, atomic, memoryOrder),
-					0),
-				type);
-		case Type_v2i16:
-		case Type_v4i8:
-			if(alignment != 0)   // Not a local variable (all vectors are 128-bit).
-			{
-				Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2)));
-				Value *i = createLoad(createBitCast(ptr, Pointer<Int>::getType()), Int::getType(), isVolatile, alignment, atomic, memoryOrder);
-				i = createZExt(i, Long::getType());
-				Value *v = createInsertElement(u, i, 0);
-				return createBitCast(v, type);
-			}
-			// Fallthrough to non-emulated case.
-		case Type_LLVM:
-			{
-				auto elTy = T(type);
-				ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
-
-				if (!atomic)
-				{
-					return V(jit->builder->CreateAlignedLoad(V(ptr), alignment, isVolatile));
-				}
-				else if (elTy->isIntegerTy() || elTy->isPointerTy())
-				{
-					// Integers and pointers can be atomically loaded by setting
-					// the ordering constraint on the load instruction.
-					auto load = jit->builder->CreateAlignedLoad(V(ptr), alignment, isVolatile);
-					load->setAtomic(atomicOrdering(atomic, memoryOrder));
-					return V(load);
-				}
-				else if (elTy->isFloatTy() || elTy->isDoubleTy())
-				{
-					// LLVM claims to support atomic loads of float types as
-					// above, but certain backends cannot deal with this.
-					// Load as an integer and bitcast. See b/136037244.
-  					auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
-					auto elAsIntTy = ::llvm::IntegerType::get(jit->context, size * 8);
-					auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
-					auto load = jit->builder->CreateAlignedLoad(ptrCast, alignment, isVolatile);
-					load->setAtomic(atomicOrdering(atomic, memoryOrder));
-					auto loadCast = jit->builder->CreateBitCast(load, elTy);
-					return V(loadCast);
-				}
-				else
-				{
-					// More exotic types require falling back to the extern:
-					// void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
-					auto sizetTy = ::llvm::IntegerType::get(jit->context, sizeof(size_t) * 8);
-					auto intTy = ::llvm::IntegerType::get(jit->context, sizeof(int) * 8);
-					auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
-					auto i8PtrTy = i8Ty->getPointerTo();
-					auto voidTy = ::llvm::Type::getVoidTy(jit->context);
-					auto funcTy = ::llvm::FunctionType::get(voidTy, {sizetTy, i8PtrTy, i8PtrTy, intTy}, false);
-					auto func = jit->module->getOrInsertFunction("__atomic_load", funcTy);
-  					auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
-					auto out = allocateStackVariable(type);
-					jit->builder->CreateCall(func, {
-						::llvm::ConstantInt::get(sizetTy, size),
-						jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
-						jit->builder->CreatePointerCast(V(out), i8PtrTy),
-						::llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
-					 });
-					 return V(jit->builder->CreateLoad(V(out)));
-				}
-			}
-		default:
-			UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
-			return nullptr;
-		}
-	}
-
-	Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		switch(asInternalType(type))
-		{
-		case Type_v2i32:
-		case Type_v4i16:
-		case Type_v8i8:
-		case Type_v2f32:
-			createStore(
-				createExtractElement(
-					createBitCast(value, T(llvm::VectorType::get(T(Long::getType()), 2))), Long::getType(), 0),
-				createBitCast(ptr, Pointer<Long>::getType()),
-				Long::getType(), isVolatile, alignment, atomic, memoryOrder);
-			return value;
-		case Type_v2i16:
-		case Type_v4i8:
-			if(alignment != 0)   // Not a local variable (all vectors are 128-bit).
-			{
-				createStore(
-					createExtractElement(createBitCast(value, Int4::getType()), Int::getType(), 0),
-					createBitCast(ptr, Pointer<Int>::getType()),
-					Int::getType(), isVolatile, alignment, atomic, memoryOrder);
-				return value;
-			}
-			// Fallthrough to non-emulated case.
-		case Type_LLVM:
-			{
-				auto elTy = T(type);
-				ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
-
-				if (!atomic)
-				{
-					jit->builder->CreateAlignedStore(V(value), V(ptr), alignment, isVolatile);
-				}
-				else if (elTy->isIntegerTy() || elTy->isPointerTy())
-				{
-					// Integers and pointers can be atomically stored by setting
-					// the ordering constraint on the store instruction.
-					auto store = jit->builder->CreateAlignedStore(V(value), V(ptr), alignment, isVolatile);
-					store->setAtomic(atomicOrdering(atomic, memoryOrder));
-				}
-				else if (elTy->isFloatTy() || elTy->isDoubleTy())
-				{
-					// LLVM claims to support atomic stores of float types as
-					// above, but certain backends cannot deal with this.
-					// Store as a bitcast integer. See b/136037244.
-  					auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
-					auto elAsIntTy = ::llvm::IntegerType::get(jit->context, size * 8);
-					auto valCast = jit->builder->CreateBitCast(V(value), elAsIntTy);
-					auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
-					auto store = jit->builder->CreateAlignedStore(valCast, ptrCast, alignment, isVolatile);
-					store->setAtomic(atomicOrdering(atomic, memoryOrder));
-				}
-				else
-				{
-					// More exotic types require falling back to the extern:
-					// void __atomic_store(size_t size, void *ptr, void *val, int ordering)
-					auto sizetTy = ::llvm::IntegerType::get(jit->context, sizeof(size_t) * 8);
-					auto intTy = ::llvm::IntegerType::get(jit->context, sizeof(int) * 8);
-					auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
-					auto i8PtrTy = i8Ty->getPointerTo();
-					auto voidTy = ::llvm::Type::getVoidTy(jit->context);
-					auto funcTy = ::llvm::FunctionType::get(voidTy, {sizetTy, i8PtrTy, i8PtrTy, intTy}, false);
-					auto func = jit->module->getOrInsertFunction("__atomic_store", funcTy);
-  					auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
-					auto copy = allocateStackVariable(type);
-					jit->builder->CreateStore(V(value), V(copy));
-					jit->builder->CreateCall(func, {
-						::llvm::ConstantInt::get(sizetTy, size),
-						jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
-						jit->builder->CreatePointerCast(V(copy), i8PtrTy),
-						::llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
-					 });
-				}
-
-				return value;
-			}
-		default:
-			UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
-			return nullptr;
-		}
-	}
-
-	Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
-	{
-		ASSERT(V(ptr)->getType()->isPointerTy());
-		ASSERT(V(mask)->getType()->isVectorTy());
-
-		auto numEls = V(mask)->getType()->getVectorNumElements();
-		auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
-		auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
-		auto elVecTy = ::llvm::VectorType::get(T(elTy), numEls);
-		auto elVecPtrTy = elVecTy->getPointerTo();
-		auto i8Mask = jit->builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
-		auto passthrough = zeroMaskedLanes ? ::llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
-		auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
-		auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy } );
-		return V(jit->builder->CreateCall(func, { V(ptr), align, i8Mask, passthrough }));
-	}
-
-	void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
-	{
-		ASSERT(V(ptr)->getType()->isPointerTy());
-		ASSERT(V(val)->getType()->isVectorTy());
-		ASSERT(V(mask)->getType()->isVectorTy());
-
-		auto numEls = V(mask)->getType()->getVectorNumElements();
-		auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
-		auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
-		auto elVecTy = V(val)->getType();
-		auto elVecPtrTy = elVecTy->getPointerTo();
-		auto i8Mask = jit->builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
-		auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
-		auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy } );
-		jit->builder->CreateCall(func, { V(val), V(ptr), align, i8Mask });
-	}
-
-	RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-	{
-		return As<Float4>(V(createGather(V(base.value), T(Float::getType()), V(offsets.value), V(mask.value), alignment, zeroMaskedLanes)));
-	}
-
-	RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-	{
-		return As<Int4>(V(createGather(V(base.value), T(Float::getType()), V(offsets.value), V(mask.value), alignment, zeroMaskedLanes)));
-	}
-
-	void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-	{
-		return createScatter(V(base.value), V(val.value), V(offsets.value), V(mask.value), alignment);
-	}
-
-	void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-	{
-		return createScatter(V(base.value), V(val.value), V(offsets.value), V(mask.value), alignment);
-	}
-
-	void Nucleus::createFence(std::memory_order memoryOrder)
-	{
-		jit->builder->CreateFence(atomicOrdering(true, memoryOrder));
-	}
-
-	Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		ASSERT(V(ptr)->getType()->getContainedType(0) == T(type));
-		if(sizeof(void*) == 8)
-		{
-			// LLVM manual: "When indexing into an array, pointer or vector,
-			// integers of any width are allowed, and they are not required to
-			// be constant. These integers are treated as signed values where
-			// relevant."
-			//
-			// Thus if we want indexes to be treated as unsigned we have to
-			// zero-extend them ourselves.
-			//
-			// Note that this is not because we want to address anywhere near
-			// 4 GB of data. Instead this is important for performance because
-			// x86 supports automatic zero-extending of 32-bit registers to
-			// 64-bit. Thus indexing into an array using a uint32 is
-			// actually faster than using an int32.
-			index = unsignedIndex ?
-				createZExt(index, Long::getType()) :
-				createSExt(index, Long::getType());
-		}
-
-		// For non-emulated types we can rely on LLVM's GEP to calculate the
-		// effective address correctly.
-		if(asInternalType(type) == Type_LLVM)
-		{
-			return V(jit->builder->CreateGEP(V(ptr), V(index)));
-		}
-
-		// For emulated types we have to multiply the index by the intended
-		// type size ourselves to obtain the byte offset.
-		index = (sizeof(void*) == 8) ?
-			createMul(index, createConstantLong((int64_t)typeSize(type))) :
-			createMul(index, createConstantInt((int)typeSize(type)));
-
-		// Cast to a byte pointer, apply the byte offset, and cast back to the
-		// original pointer type.
+	case Type_v2i32:
+	case Type_v4i16:
+	case Type_v8i8:
+	case Type_v2f32:
 		return createBitCast(
-			V(jit->builder->CreateGEP(V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::getType()), 0)))), V(index))),
-			T(llvm::PointerType::get(T(type), 0)));
-	}
-
-	Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Sub, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::And, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Or, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xor, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Min, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Max, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMin, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMax, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-
-	Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
-	}
-
-	Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		// Note: AtomicCmpXchgInstruction returns a 2-member struct containing {result, success-flag}, not the result directly.
-		return V(jit->builder->CreateExtractValue(
-				jit->builder->CreateAtomicCmpXchg(V(ptr), V(compare), V(value), atomicOrdering(true, memoryOrderEqual), atomicOrdering(true, memoryOrderUnequal)),
-				llvm::ArrayRef<unsigned>(0u)));
-	}
-
-	Value *Nucleus::createTrunc(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateTrunc(V(v), T(destType)));
-	}
-
-	Value *Nucleus::createZExt(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateZExt(V(v), T(destType)));
-	}
-
-	Value *Nucleus::createSExt(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateSExt(V(v), T(destType)));
-	}
-
-	Value *Nucleus::createFPToUI(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFPToUI(V(v), T(destType)));
-	}
-
-	Value *Nucleus::createFPToSI(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFPToSI(V(v), T(destType)));
-	}
-
-	Value *Nucleus::createSIToFP(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateSIToFP(V(v), T(destType)));
-	}
-
-	Value *Nucleus::createFPTrunc(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFPTrunc(V(v), T(destType)));
-	}
-
-	Value *Nucleus::createFPExt(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFPExt(V(v), T(destType)));
-	}
-
-	Value *Nucleus::createBitCast(Value *v, Type *destType)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		// Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
-		// support for casting between scalars and wide vectors. Emulate them by writing to the stack and
-		// reading back as the destination type.
-		if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
+			createInsertElement(
+				V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2))),
+				createLoad(createBitCast(ptr, Pointer<Long>::getType()), Long::getType(), isVolatile, alignment, atomic, memoryOrder),
+				0),
+			type);
+	case Type_v2i16:
+	case Type_v4i8:
+		if(alignment != 0)   // Not a local variable (all vectors are 128-bit).
 		{
-			Value *readAddress = allocateStackVariable(destType);
-			Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
-			createStore(v, writeAddress, T(V(v)->getType()));
-			return createLoad(readAddress, destType);
+			Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2)));
+			Value *i = createLoad(createBitCast(ptr, Pointer<Int>::getType()), Int::getType(), isVolatile, alignment, atomic, memoryOrder);
+			i = createZExt(i, Long::getType());
+			Value *v = createInsertElement(u, i, 0);
+			return createBitCast(v, type);
 		}
-		else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
+		// Fallthrough to non-emulated case.
+	case Type_LLVM:
 		{
-			Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
-			createStore(v, writeAddress, T(V(v)->getType()));
-			Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
-			return createLoad(readAddress, destType);
+			auto elTy = T(type);
+			ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
+
+			if (!atomic)
+			{
+				return V(jit->builder->CreateAlignedLoad(V(ptr), alignment, isVolatile));
+			}
+			else if (elTy->isIntegerTy() || elTy->isPointerTy())
+			{
+				// Integers and pointers can be atomically loaded by setting
+				// the ordering constraint on the load instruction.
+				auto load = jit->builder->CreateAlignedLoad(V(ptr), alignment, isVolatile);
+				load->setAtomic(atomicOrdering(atomic, memoryOrder));
+				return V(load);
+			}
+			else if (elTy->isFloatTy() || elTy->isDoubleTy())
+			{
+				// LLVM claims to support atomic loads of float types as
+				// above, but certain backends cannot deal with this.
+				// Load as an integer and bitcast. See b/136037244.
+				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
+				auto elAsIntTy = ::llvm::IntegerType::get(jit->context, size * 8);
+				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
+				auto load = jit->builder->CreateAlignedLoad(ptrCast, alignment, isVolatile);
+				load->setAtomic(atomicOrdering(atomic, memoryOrder));
+				auto loadCast = jit->builder->CreateBitCast(load, elTy);
+				return V(loadCast);
+			}
+			else
+			{
+				// More exotic types require falling back to the extern:
+				// void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
+				auto sizetTy = ::llvm::IntegerType::get(jit->context, sizeof(size_t) * 8);
+				auto intTy = ::llvm::IntegerType::get(jit->context, sizeof(int) * 8);
+				auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
+				auto i8PtrTy = i8Ty->getPointerTo();
+				auto voidTy = ::llvm::Type::getVoidTy(jit->context);
+				auto funcTy = ::llvm::FunctionType::get(voidTy, {sizetTy, i8PtrTy, i8PtrTy, intTy}, false);
+				auto func = jit->module->getOrInsertFunction("__atomic_load", funcTy);
+				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
+				auto out = allocateStackVariable(type);
+				jit->builder->CreateCall(func, {
+					::llvm::ConstantInt::get(sizetTy, size),
+					jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
+					jit->builder->CreatePointerCast(V(out), i8PtrTy),
+					::llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
+				});
+				return V(jit->builder->CreateLoad(V(out)));
+			}
 		}
-
-		return V(jit->builder->CreateBitCast(V(v), T(destType)));
+	default:
+		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
+		return nullptr;
 	}
+}
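+// The fallback above is roughly equivalent to this C-level call into the
+// atomics runtime (a sketch, assuming a 16-byte element type):
+//
+//   alignas(16) unsigned char out[16];
+//   __atomic_load(16, ptr, out, ordering);   // generic libatomic entry point
+//   result = *reinterpret_cast<T *>(out);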
 
-	Value *Nucleus::createPtrEQ(Value *lhs, Value *rhs)
+Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	switch(asInternalType(type))
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpNE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpUGT(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpUGE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpULT(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpULE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpSGT(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpSGE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpSLT(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateICmpSLE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpOEQ(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpOGT(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpOGE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpOLT(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpOLE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpONE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpORD(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpUNO(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpUEQ(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpUGT(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpUGE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpULT(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpULE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateFCmpUNE(V(lhs), V(rhs)));
-	}
-
-	Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		ASSERT(V(vector)->getType()->getContainedType(0) == T(type));
-		return V(jit->builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
-	}
-
-	Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
-	}
-
-	Value *Nucleus::createShuffleVector(Value *v1, Value *v2, const int *select)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-
-		int size = llvm::cast<llvm::VectorType>(V(v1)->getType())->getNumElements();
-		const int maxSize = 16;
-		llvm::Constant *swizzle[maxSize];
-		ASSERT(size <= maxSize);
-
-		for(int i = 0; i < size; i++)
+	case Type_v2i32:
+	case Type_v4i16:
+	case Type_v8i8:
+	case Type_v2f32:
+		createStore(
+			createExtractElement(
+				createBitCast(value, T(llvm::VectorType::get(T(Long::getType()), 2))), Long::getType(), 0),
+			createBitCast(ptr, Pointer<Long>::getType()),
+			Long::getType(), isVolatile, alignment, atomic, memoryOrder);
+		return value;
+	case Type_v2i16:
+	case Type_v4i8:
+		if(alignment != 0)   // Not a local variable (all vectors are 128-bit).
 		{
-			swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), select[i]);
+			createStore(
+				createExtractElement(createBitCast(value, Int4::getType()), Int::getType(), 0),
+				createBitCast(ptr, Pointer<Int>::getType()),
+				Int::getType(), isVolatile, alignment, atomic, memoryOrder);
+			return value;
 		}
-
-		llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(swizzle, size));
-
-		return V(jit->builder->CreateShuffleVector(V(v1), V(v2), shuffle));
-	}
-
-	Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(jit->builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
-	}
-
-	SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return reinterpret_cast<SwitchCases*>(jit->builder->CreateSwitch(V(control), B(defaultBranch), numCases));
-	}
-
-	void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
-		sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), label, true), B(branch));
-	}
-
-	void Nucleus::createUnreachable()
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		jit->builder->CreateUnreachable();
-	}
-
-	Type *Nucleus::getPointerType(Type *ElementType)
-	{
-		return T(llvm::PointerType::get(T(ElementType), 0));
-	}
-
-	Value *Nucleus::createNullValue(Type *Ty)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::Constant::getNullValue(T(Ty)));
-	}
-
-	Value *Nucleus::createConstantLong(int64_t i)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(jit->context), i, true));
-	}
-
-	Value *Nucleus::createConstantInt(int i)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), i, true));
-	}
-
-	Value *Nucleus::createConstantInt(unsigned int i)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), i, false));
-	}
-
-	Value *Nucleus::createConstantBool(bool b)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(jit->context), b));
-	}
-
-	Value *Nucleus::createConstantByte(signed char i)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(jit->context), i, true));
-	}
-
-	Value *Nucleus::createConstantByte(unsigned char i)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(jit->context), i, false));
-	}
-
-	Value *Nucleus::createConstantShort(short i)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(jit->context), i, true));
-	}
-
-	Value *Nucleus::createConstantShort(unsigned short i)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(jit->context), i, false));
-	}
-
-	Value *Nucleus::createConstantFloat(float x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantFP::get(T(Float::getType()), x));
-	}
-
-	Value *Nucleus::createNullPointer(Type *Ty)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
-	}
-
-	Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
-	{
-		ASSERT(llvm::isa<llvm::VectorType>(T(type)));
-		const int numConstants = elementCount(type);                                       // Number of provided constants for the (emulated) type.
-		const int numElements = llvm::cast<llvm::VectorType>(T(type))->getNumElements();   // Number of elements of the underlying vector type.
-		ASSERT(numElements <= 16 && numConstants <= numElements);
-		llvm::Constant *constantVector[16];
-
-		for(int i = 0; i < numElements; i++)
+		// Fallthrough to non-emulated case.
+	case Type_LLVM:
 		{
-			constantVector[i] = llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]);
+			auto elTy = T(type);
+			ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
+
+			if (!atomic)
+			{
+				jit->builder->CreateAlignedStore(V(value), V(ptr), alignment, isVolatile);
+			}
+			else if (elTy->isIntegerTy() || elTy->isPointerTy())
+			{
+				// Integers and pointers can be atomically stored by setting
+				// the ordering constraint on the store instruction.
+				auto store = jit->builder->CreateAlignedStore(V(value), V(ptr), alignment, isVolatile);
+				store->setAtomic(atomicOrdering(atomic, memoryOrder));
+			}
+			else if (elTy->isFloatTy() || elTy->isDoubleTy())
+			{
+				// LLVM claims to support atomic stores of float types as
+				// above, but certain backends cannot deal with this.
+				// Store as a bitcast integer. See b/136037244.
+				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
+				auto elAsIntTy = ::llvm::IntegerType::get(jit->context, size * 8);
+				auto valCast = jit->builder->CreateBitCast(V(value), elAsIntTy);
+				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
+				auto store = jit->builder->CreateAlignedStore(valCast, ptrCast, alignment, isVolatile);
+				store->setAtomic(atomicOrdering(atomic, memoryOrder));
+			}
+			else
+			{
+				// More exotic types require falling back to the extern:
+				// void __atomic_store(size_t size, void *ptr, void *val, int ordering)
+				auto sizetTy = ::llvm::IntegerType::get(jit->context, sizeof(size_t) * 8);
+				auto intTy = ::llvm::IntegerType::get(jit->context, sizeof(int) * 8);
+				auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
+				auto i8PtrTy = i8Ty->getPointerTo();
+				auto voidTy = ::llvm::Type::getVoidTy(jit->context);
+				auto funcTy = ::llvm::FunctionType::get(voidTy, {sizetTy, i8PtrTy, i8PtrTy, intTy}, false);
+				auto func = jit->module->getOrInsertFunction("__atomic_store", funcTy);
+				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
+				auto copy = allocateStackVariable(type);
+				jit->builder->CreateStore(V(value), V(copy));
+				jit->builder->CreateCall(func, {
+					::llvm::ConstantInt::get(sizetTy, size),
+					jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
+					jit->builder->CreatePointerCast(V(copy), i8PtrTy),
+					::llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
+				});
+			}
+
+			return value;
 		}
+	default:
+		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
+		return nullptr;
+	}
+}
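+
+// For illustration: the emulated narrow vector types above are backed by
+// 128-bit vectors, so a Type_v2f32 store writes only the low 64 bits:
+//   <4 x float> %v  --bitcast-->  <2 x i64>  --extract lane 0-->  store i64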
 
-		return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numElements)));
+Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
+{
+	ASSERT(V(ptr)->getType()->isPointerTy());
+	ASSERT(V(mask)->getType()->isVectorTy());
+
+	auto numEls = V(mask)->getType()->getVectorNumElements();
+	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
+	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
+	auto elVecTy = ::llvm::VectorType::get(T(elTy), numEls);
+	auto elVecPtrTy = elVecTy->getPointerTo();
+	auto i8Mask = jit->builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
+	auto passthrough = zeroMaskedLanes ? ::llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
+	auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
+	auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy } );
+	return V(jit->builder->CreateCall(func, { V(ptr), align, i8Mask, passthrough }));
+}
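+
+// For illustration, with a 4-lane i32 mask, float elements, alignment 4 and
+// zeroMaskedLanes = true, the call above produces IR along these lines:
+//   %m = trunc <4 x i32> %mask to <4 x i1>
+//   %r = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(
+//            <4 x float>* %ptr, i32 4, <4 x i1> %m, <4 x float> zeroinitializer)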
+
+void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
+{
+	ASSERT(V(ptr)->getType()->isPointerTy());
+	ASSERT(V(val)->getType()->isVectorTy());
+	ASSERT(V(mask)->getType()->isVectorTy());
+
+	auto numEls = V(mask)->getType()->getVectorNumElements();
+	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
+	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
+	auto elVecTy = V(val)->getType();
+	auto elVecPtrTy = elVecTy->getPointerTo();
+	auto i8Mask = jit->builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
+	auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
+	auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy } );
+	jit->builder->CreateCall(func, { V(val), V(ptr), align, i8Mask });
+}
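+
+// The store path is symmetric, calling (for the same example)
+//   @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %ptr,
+//                                    i32 4, <4 x i1> %m)
+// which simply has no passthrough operand.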
+
+RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+{
+	return As<Float4>(V(createGather(V(base.value), T(Float::getType()), V(offsets.value), V(mask.value), alignment, zeroMaskedLanes)));
+}
+
+RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+{
+	return As<Int4>(V(createGather(V(base.value), T(Int::getType()), V(offsets.value), V(mask.value), alignment, zeroMaskedLanes)));
+}
+
+void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+{
+	return createScatter(V(base.value), V(val.value), V(offsets.value), V(mask.value), alignment);
+}
+
+void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+{
+	return createScatter(V(base.value), V(val.value), V(offsets.value), V(mask.value), alignment);
+}
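+
+// Hypothetical Reactor usage of the wrappers above (names illustrative):
+// gather four floats at per-lane offsets from 'base', zero-filling lanes
+// whose mask bit is clear:
+//   Float4 texels = Gather(base, offsets, mask, sizeof(float), true);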
+
+void Nucleus::createFence(std::memory_order memoryOrder)
+{
+	jit->builder->CreateFence(atomicOrdering(true, memoryOrder));
+}
+
+Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	ASSERT(V(ptr)->getType()->getContainedType(0) == T(type));
+	if(sizeof(void*) == 8)
+	{
+		// LLVM manual: "When indexing into an array, pointer or vector,
+		// integers of any width are allowed, and they are not required to
+		// be constant. These integers are treated as signed values where
+		// relevant."
+		//
+		// Thus if we want indexes to be treated as unsigned we have to
+		// zero-extend them ourselves.
+		//
+		// Note that this is not because we want to address anywhere near
+		// 4 GB of data. Instead this is important for performance because
+		// x86 supports automatic zero-extending of 32-bit registers to
+		// 64-bit. Thus indexing into an array with a uint32 is actually
+		// faster than with an int32.
+		index = unsignedIndex ?
+			createZExt(index, Long::getType()) :
+			createSExt(index, Long::getType());
 	}
 
-	Value *Nucleus::createConstantVector(const double *constants, Type *type)
+	// For non-emulated types we can rely on LLVM's GEP to calculate the
+	// effective address correctly.
+	if(asInternalType(type) == Type_LLVM)
 	{
-		ASSERT(llvm::isa<llvm::VectorType>(T(type)));
-		const int numConstants = elementCount(type);                                       // Number of provided constants for the (emulated) type.
-		const int numElements = llvm::cast<llvm::VectorType>(T(type))->getNumElements();   // Number of elements of the underlying vector type.
-		ASSERT(numElements <= 8 && numConstants <= numElements);
-		llvm::Constant *constantVector[8];
-
-		for(int i = 0; i < numElements; i++)
-		{
-			constantVector[i] = llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]);
-		}
-
-		return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numElements)));
+		return V(jit->builder->CreateGEP(V(ptr), V(index)));
 	}
 
-	Type *Void::getType()
+	// For emulated types we have to multiply the index by the intended
+	// type size ourselves to obtain the byte offset.
+	index = (sizeof(void*) == 8) ?
+		createMul(index, createConstantLong((int64_t)typeSize(type))) :
+		createMul(index, createConstantInt((int)typeSize(type)));
+
+	// Cast to a byte pointer, apply the byte offset, and cast back to the
+	// original pointer type.
+	return createBitCast(
+		V(jit->builder->CreateGEP(V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::getType()), 0)))), V(index))),
+		T(llvm::PointerType::get(T(type), 0)));
+}
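+
+// Worked example for the emulated path (illustrative): on x86-64, indexing an
+// emulated type of size 8 with unsignedIndex = true computes
+//   (T*)((byte*)ptr + zext(index) * 8)
+// i.e. the zero-extended index times the type size, applied via a byte GEP.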
+
+Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Sub, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::And, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Or, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xor, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Min, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Max, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMin, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMax, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
+
+Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
+}
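+
+// Each of the atomic helpers above lowers to a single atomicrmw instruction
+// and yields the previous memory contents, e.g. for createAtomicAdd
+// (illustrative IR, i32 element, seq_cst ordering):
+//   %old = atomicrmw add i32* %ptr, i32 %value seq_cst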
+
+Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	// Note: LLVM's AtomicCmpXchgInst returns a two-member struct {result, success flag}, not the result directly.
+	return V(jit->builder->CreateExtractValue(
+			jit->builder->CreateAtomicCmpXchg(V(ptr), V(compare), V(value), atomicOrdering(true, memoryOrderEqual), atomicOrdering(true, memoryOrderUnequal)),
+			llvm::ArrayRef<unsigned>(0u)));
+}
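+
+// Illustrative IR for the above, with an i32 element and both orderings
+// seq_cst; only the old value (member 0) is returned:
+//   %pair = cmpxchg i32* %ptr, i32 %compare, i32 %value seq_cst seq_cst
+//   %old  = extractvalue { i32, i1 } %pair, 0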
+
+Value *Nucleus::createTrunc(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateTrunc(V(v), T(destType)));
+}
+
+Value *Nucleus::createZExt(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateZExt(V(v), T(destType)));
+}
+
+Value *Nucleus::createSExt(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateSExt(V(v), T(destType)));
+}
+
+Value *Nucleus::createFPToUI(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFPToUI(V(v), T(destType)));
+}
+
+Value *Nucleus::createFPToSI(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFPToSI(V(v), T(destType)));
+}
+
+Value *Nucleus::createSIToFP(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateSIToFP(V(v), T(destType)));
+}
+
+Value *Nucleus::createFPTrunc(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFPTrunc(V(v), T(destType)));
+}
+
+Value *Nucleus::createFPExt(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFPExt(V(v), T(destType)));
+}
+
+Value *Nucleus::createBitCast(Value *v, Type *destType)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	// Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
+	// support for casting between scalars and wide vectors. Emulate them by writing to the stack and
+	// reading back as the destination type.
+	if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
 	{
-		return T(llvm::Type::getVoidTy(jit->context));
+		Value *readAddress = allocateStackVariable(destType);
+		Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
+		createStore(v, writeAddress, T(V(v)->getType()));
+		return createLoad(readAddress, destType);
+	}
+	else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
+	{
+		Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
+		createStore(v, writeAddress, T(V(v)->getType()));
+		Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
+		return createLoad(readAddress, destType);
 	}
 
-	Type *Bool::getType()
+	return V(jit->builder->CreateBitCast(V(v), T(destType)));
+}
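+
+// For illustration: casting a scalar to an emulated vector type round-trips
+// through a stack slot. The scalar is stored through a pointer
+// reinterpretation of the slot, then the slot is reloaded with the vector
+// type; bytes beyond the scalar's size are left undefined.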
+
+Value *Nucleus::createPtrEQ(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpNE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpUGT(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpUGE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpULT(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpULE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpSGT(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpSGE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpSLT(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateICmpSLE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpOEQ(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpOGT(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpOGE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpOLT(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpOLE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpONE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpORD(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpUNO(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpUEQ(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpUGT(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpUGE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpULT(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpULE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateFCmpUNE(V(lhs), V(rhs)));
+}
+
+Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	ASSERT(V(vector)->getType()->getContainedType(0) == T(type));
+	return V(jit->builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
+}
+
+Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
+}
+
+Value *Nucleus::createShuffleVector(Value *v1, Value *v2, const int *select)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+
+	int size = llvm::cast<llvm::VectorType>(V(v1)->getType())->getNumElements();
+	const int maxSize = 16;
+	llvm::Constant *swizzle[maxSize];
+	ASSERT(size <= maxSize);
+
+	for(int i = 0; i < size; i++)
 	{
-		return T(llvm::Type::getInt1Ty(jit->context));
+		swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), select[i]);
 	}
 
-	Type *Byte::getType()
+	llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(swizzle, size));
+
+	return V(jit->builder->CreateShuffleVector(V(v1), V(v2), shuffle));
+}
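+
+// Example: with 4-lane inputs, select = {0, 4, 1, 5} yields
+//   { v1[0], v2[0], v1[1], v2[1] }
+// since indices at or above the lane count address the second operand,
+// matching LLVM's shufflevector mask semantics.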
+
+Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(jit->builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
+}
+
+SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return reinterpret_cast<SwitchCases*>(jit->builder->CreateSwitch(V(control), B(defaultBranch), numCases));
+}
+
+void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
+	sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), label, true), B(branch));
+}
+
+void Nucleus::createUnreachable()
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	jit->builder->CreateUnreachable();
+}
+
+Type *Nucleus::getPointerType(Type *ElementType)
+{
+	return T(llvm::PointerType::get(T(ElementType), 0));
+}
+
+Value *Nucleus::createNullValue(Type *Ty)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::Constant::getNullValue(T(Ty)));
+}
+
+Value *Nucleus::createConstantLong(int64_t i)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(jit->context), i, true));
+}
+
+Value *Nucleus::createConstantInt(int i)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), i, true));
+}
+
+Value *Nucleus::createConstantInt(unsigned int i)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), i, false));
+}
+
+Value *Nucleus::createConstantBool(bool b)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(jit->context), b));
+}
+
+Value *Nucleus::createConstantByte(signed char i)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(jit->context), i, true));
+}
+
+Value *Nucleus::createConstantByte(unsigned char i)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(jit->context), i, false));
+}
+
+Value *Nucleus::createConstantShort(short i)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(jit->context), i, true));
+}
+
+Value *Nucleus::createConstantShort(unsigned short i)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(jit->context), i, false));
+}
+
+Value *Nucleus::createConstantFloat(float x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantFP::get(T(Float::getType()), x));
+}
+
+Value *Nucleus::createNullPointer(Type *Ty)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
+}
+
+Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
+{
+	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
+	const int numConstants = elementCount(type);                                       // Number of provided constants for the (emulated) type.
+	const int numElements = llvm::cast<llvm::VectorType>(T(type))->getNumElements();   // Number of elements of the underlying vector type.
+	ASSERT(numElements <= 16 && numConstants <= numElements);
+	llvm::Constant *constantVector[16];
+
+	for(int i = 0; i < numElements; i++)
 	{
-		return T(llvm::Type::getInt8Ty(jit->context));
+		constantVector[i] = llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]);
 	}
 
-	Type *SByte::getType()
+	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numElements)));
+}
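+
+// e.g. an emulated 4-element type backed by an 8-element vector replicates
+// constants {1, 2, 3, 4} into the underlying {1, 2, 3, 4, 1, 2, 3, 4}.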
+
+Value *Nucleus::createConstantVector(const double *constants, Type *type)
+{
+	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
+	const int numConstants = elementCount(type);                                       // Number of provided constants for the (emulated) type.
+	const int numElements = llvm::cast<llvm::VectorType>(T(type))->getNumElements();   // Number of elements of the underlying vector type.
+	ASSERT(numElements <= 8 && numConstants <= numElements);
+	llvm::Constant *constantVector[8];
+
+	for(int i = 0; i < numElements; i++)
 	{
-		return T(llvm::Type::getInt8Ty(jit->context));
+		constantVector[i] = llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]);
 	}
 
-	Type *Short::getType()
-	{
-		return T(llvm::Type::getInt16Ty(jit->context));
-	}
+	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numElements)));
+}
 
-	Type *UShort::getType()
-	{
-		return T(llvm::Type::getInt16Ty(jit->context));
-	}
+Type *Void::getType()
+{
+	return T(llvm::Type::getVoidTy(jit->context));
+}
 
-	Type *Byte4::getType()
-	{
-		return T(Type_v4i8);
-	}
+Type *Bool::getType()
+{
+	return T(llvm::Type::getInt1Ty(jit->context));
+}
 
-	Type *SByte4::getType()
-	{
-		return T(Type_v4i8);
-	}
+Type *Byte::getType()
+{
+	return T(llvm::Type::getInt8Ty(jit->context));
+}
 
-	RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+Type *SByte::getType()
+{
+	return T(llvm::Type::getInt8Ty(jit->context));
+}
+
+Type *Short::getType()
+{
+	return T(llvm::Type::getInt16Ty(jit->context));
+}
+
+Type *UShort::getType()
+{
+	return T(llvm::Type::getInt16Ty(jit->context));
+}
+
+Type *Byte4::getType()
+{
+	return T(Type_v4i8);
+}
+
+Type *SByte4::getType()
+{
+	return T(Type_v4i8);
+}
+
+RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::paddusb(x, y);
+	return x86::paddusb(x, y);
 #else
-		return As<Byte8>(V(lowerPUADDSAT(V(x.value), V(y.value))));
+	return As<Byte8>(V(lowerPUADDSAT(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::psubusb(x, y);
+	return x86::psubusb(x, y);
 #else
-		return As<Byte8>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
+	return As<Byte8>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<Int> SignMask(RValue<Byte8> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Int> SignMask(RValue<Byte8> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmovmskb(x);
+	return x86::pmovmskb(x);
 #else
-		return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
+	return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
 #endif
-	}
+}
 
 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
 //	{
@@ -2399,575 +2400,575 @@
 //#endif
 //	}
 
-	RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pcmpeqb(x, y);
+	return x86::pcmpeqb(x, y);
 #else
-		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
+	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
 #endif
-	}
+}
 
-	Type *Byte8::getType()
-	{
-		return T(Type_v8i8);
-	}
+Type *Byte8::getType()
+{
+	return T(Type_v8i8);
+}
 
-	RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::paddsb(x, y);
+	return x86::paddsb(x, y);
 #else
-		return As<SByte8>(V(lowerPSADDSAT(V(x.value), V(y.value))));
+	return As<SByte8>(V(lowerPSADDSAT(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::psubsb(x, y);
+	return x86::psubsb(x, y);
 #else
-		return As<SByte8>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
+	return As<SByte8>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<Int> SignMask(RValue<SByte8> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Int> SignMask(RValue<SByte8> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmovmskb(As<Byte8>(x));
+	return x86::pmovmskb(As<Byte8>(x));
 #else
-		return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
+	return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
 #endif
-	}
+}
 
-	RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pcmpgtb(x, y);
+	return x86::pcmpgtb(x, y);
 #else
-		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
+	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
 #endif
-	}
+}
 
-	RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
+	return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
 #else
-		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
+	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
 #endif
-	}
+}
 
-	Type *SByte8::getType()
-	{
-		return T(Type_v8i8);
-	}
+Type *SByte8::getType()
+{
+	return T(Type_v8i8);
+}
 
-	Type *Byte16::getType()
-	{
-		return T(llvm::VectorType::get(T(Byte::getType()), 16));
-	}
+Type *Byte16::getType()
+{
+	return T(llvm::VectorType::get(T(Byte::getType()), 16));
+}
 
-	Type *SByte16::getType()
-	{
-		return T(llvm::VectorType::get(T(SByte::getType()), 16));
-	}
+Type *SByte16::getType()
+{
+	return T(llvm::VectorType::get(T(SByte::getType()), 16));
+}
 
-	Type *Short2::getType()
-	{
-		return T(Type_v2i16);
-	}
+Type *Short2::getType()
+{
+	return T(Type_v2i16);
+}
 
-	Type *UShort2::getType()
-	{
-		return T(Type_v2i16);
-	}
+Type *UShort2::getType()
+{
+	return T(Type_v2i16);
+}
 
-	Short4::Short4(RValue<Int4> cast)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		int select[8] = {0, 2, 4, 6, 0, 2, 4, 6};
-		Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
+Short4::Short4(RValue<Int4> cast)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	int select[8] = {0, 2, 4, 6, 0, 2, 4, 6};
+	Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
 
-		Value *packed = Nucleus::createShuffleVector(short8, short8, select);
-		Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value;
+	Value *packed = Nucleus::createShuffleVector(short8, short8, select);
+	Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value;
 
-		storeValue(short4);
-	}
+	storeValue(short4);
+}
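+
+// The select pattern {0, 2, 4, 6, ...} keeps the low 16-bit half of each
+// 32-bit lane (on little-endian targets), i.e. a truncating, non-saturating
+// Int4 -> Short4 conversion.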
 
 //	Short4::Short4(RValue<Float> cast)
 //	{
 //	}
 
-	Short4::Short4(RValue<Float4> cast)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Int4 v4i32 = Int4(cast);
+Short4::Short4(RValue<Float4> cast)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Int4 v4i32 = Int4(cast);
 #if defined(__i386__) || defined(__x86_64__)
-		v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
+	v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
 #else
-		Value *v = v4i32.loadValue();
-		v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
+	Value *v = v4i32.loadValue();
+	v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
 #endif
 
-		storeValue(As<Short4>(Int2(v4i32)).value);
-	}
+	storeValue(As<Short4>(Int2(v4i32)).value);
+}
 
-	RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-	//	return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
+//	return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
 
-		return x86::psllw(lhs, rhs);
+	return x86::psllw(lhs, rhs);
 #else
-		return As<Short4>(V(lowerVectorShl(V(lhs.value), rhs)));
+	return As<Short4>(V(lowerVectorShl(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::psraw(lhs, rhs);
+	return x86::psraw(lhs, rhs);
 #else
-		return As<Short4>(V(lowerVectorAShr(V(lhs.value), rhs)));
+	return As<Short4>(V(lowerVectorAShr(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmaxsw(x, y);
+	return x86::pmaxsw(x, y);
 #else
-		return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
+	return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
 #endif
-	}
+}
 
-	RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pminsw(x, y);
+	return x86::pminsw(x, y);
 #else
-		return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
+	return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
 #endif
-	}
+}
 
-	RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::paddsw(x, y);
+	return x86::paddsw(x, y);
 #else
-		return As<Short4>(V(lowerPSADDSAT(V(x.value), V(y.value))));
+	return As<Short4>(V(lowerPSADDSAT(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::psubsw(x, y);
+	return x86::psubsw(x, y);
 #else
-		return As<Short4>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
+	return As<Short4>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmulhw(x, y);
+	return x86::pmulhw(x, y);
 #else
-		return As<Short4>(V(lowerMulHigh(V(x.value), V(y.value), true)));
+	return As<Short4>(V(lowerMulHigh(V(x.value), V(y.value), true)));
 #endif
-	}
+}
 
-	RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmaddwd(x, y);
+	return x86::pmaddwd(x, y);
 #else
-		return As<Int2>(V(lowerMulAdd(V(x.value), V(y.value))));
+	return As<Int2>(V(lowerMulAdd(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		auto result = x86::packsswb(x, y);
+	auto result = x86::packsswb(x, y);
 #else
-		auto result = V(lowerPack(V(x.value), V(y.value), true));
+	auto result = V(lowerPack(V(x.value), V(y.value), true));
 #endif
-		return As<SByte8>(Swizzle(As<Int4>(result), 0x0202));
-	}
+	return As<SByte8>(Swizzle(As<Int4>(result), 0x0202));
+}
 
-	RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		auto result = x86::packuswb(x, y);
+	auto result = x86::packuswb(x, y);
 #else
-		auto result = V(lowerPack(V(x.value), V(y.value), false));
+	auto result = V(lowerPack(V(x.value), V(y.value), false));
 #endif
-		return As<Byte8>(Swizzle(As<Int4>(result), 0x0202));
-	}
+	return As<Byte8>(Swizzle(As<Int4>(result), 0x0202));
+}
 
-	RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pcmpgtw(x, y);
+	return x86::pcmpgtw(x, y);
 #else
-		return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
+	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
 #endif
-	}
+}
 
-	RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pcmpeqw(x, y);
+	return x86::pcmpeqw(x, y);
 #else
-		return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
+	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
 #endif
-	}
+}
 
-	Type *Short4::getType()
-	{
-		return T(Type_v4i16);
-	}
+Type *Short4::getType()
+{
+	return T(Type_v4i16);
+}
 
-	UShort4::UShort4(RValue<Float4> cast, bool saturate)
+UShort4::UShort4(RValue<Float4> cast, bool saturate)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	if(saturate)
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		if(saturate)
+#if defined(__i386__) || defined(__x86_64__)
+		if(CPUID::supportsSSE4_1())
 		{
-#if defined(__i386__) || defined(__x86_64__)
-			if(CPUID::supportsSSE4_1())
-			{
-				Int4 int4(Min(cast, Float4(0xFFFF)));   // packusdw takes care of 0x0000 saturation
-				*this = As<Short4>(PackUnsigned(int4, int4));
-			}
-			else
-#endif
-			{
-				*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
-			}
+			Int4 int4(Min(cast, Float4(0xFFFF)));   // packusdw takes care of 0x0000 saturation
+			*this = As<Short4>(PackUnsigned(int4, int4));
 		}
 		else
+#endif
 		{
-			*this = Short4(Int4(cast));
+			*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
 		}
 	}
-
-	RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
+	else
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+		*this = Short4(Int4(cast));
+	}
+}
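+
+// For example (assuming the usual truncating float-to-int conversion):
+//   UShort4(Float4(-1.0f, 0.5f, 70000.0f, 3.5f), true)
+// clamps to [0x0000, 0xFFFF] first and yields {0, 0, 0xFFFF, 3}.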
+
+RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-	//	return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
+//	return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
 
-		return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
+	return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
 #else
-		return As<UShort4>(V(lowerVectorShl(V(lhs.value), rhs)));
+	return As<UShort4>(V(lowerVectorShl(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-	//	return RValue<Short4>(Nucleus::createLShr(lhs.value, rhs.value));
+//	return RValue<Short4>(Nucleus::createLShr(lhs.value, rhs.value));
 
-		return x86::psrlw(lhs, rhs);
+	return x86::psrlw(lhs, rhs);
 #else
-		return As<UShort4>(V(lowerVectorLShr(V(lhs.value), rhs)));
+	return As<UShort4>(V(lowerVectorLShr(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
-	}
+RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
+}
 
-	RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
-	}
+RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
+}
 
-	RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::paddusw(x, y);
+	return x86::paddusw(x, y);
 #else
-		return As<UShort4>(V(lowerPUADDSAT(V(x.value), V(y.value))));
+	return As<UShort4>(V(lowerPUADDSAT(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::psubusw(x, y);
+	return x86::psubusw(x, y);
 #else
-		return As<UShort4>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
+	return As<UShort4>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmulhuw(x, y);
+	return x86::pmulhuw(x, y);
 #else
-		return As<UShort4>(V(lowerMulHigh(V(x.value), V(y.value), false)));
+	return As<UShort4>(V(lowerMulHigh(V(x.value), V(y.value), false)));
 #endif
-	}
+}
 
-	RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pavgw(x, y);
+	return x86::pavgw(x, y);
 #else
-		return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
+	return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	Type *UShort4::getType()
-	{
-		return T(Type_v4i16);
-	}
+Type *UShort4::getType()
+{
+	return T(Type_v4i16);
+}
 
-	RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::psllw(lhs, rhs);
+	return x86::psllw(lhs, rhs);
 #else
-		return As<Short8>(V(lowerVectorShl(V(lhs.value), rhs)));
+	return As<Short8>(V(lowerVectorShl(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::psraw(lhs, rhs);
+	return x86::psraw(lhs, rhs);
 #else
-		return As<Short8>(V(lowerVectorAShr(V(lhs.value), rhs)));
+	return As<Short8>(V(lowerVectorAShr(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmaddwd(x, y);
+	return x86::pmaddwd(x, y);
 #else
-		return As<Int4>(V(lowerMulAdd(V(x.value), V(y.value))));
+	return As<Int4>(V(lowerMulAdd(V(x.value), V(y.value))));
 #endif
-	}
+}
 
-	RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmulhw(x, y);
+	return x86::pmulhw(x, y);
 #else
-		return As<Short8>(V(lowerMulHigh(V(x.value), V(y.value), true)));
+	return As<Short8>(V(lowerMulHigh(V(x.value), V(y.value), true)));
 #endif
-	}
+}
 
-	Type *Short8::getType()
-	{
-		return T(llvm::VectorType::get(T(Short::getType()), 8));
-	}
+Type *Short8::getType()
+{
+	return T(llvm::VectorType::get(T(Short::getType()), 8));
+}
 
-	RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
+	return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
 #else
-		return As<UShort8>(V(lowerVectorShl(V(lhs.value), rhs)));
+	return As<UShort8>(V(lowerVectorShl(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::psrlw(lhs, rhs);   // FIXME: Fallback required
+	return x86::psrlw(lhs, rhs);   // FIXME: Fallback required
 #else
-		return As<UShort8>(V(lowerVectorLShr(V(lhs.value), rhs)));
+	return As<UShort8>(V(lowerVectorLShr(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)
+RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	int pshufb[16] =
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		int pshufb[16] =
-		{
-			select0 + 0,
-			select0 + 1,
-			select1 + 0,
-			select1 + 1,
-			select2 + 0,
-			select2 + 1,
-			select3 + 0,
-			select3 + 1,
-			select4 + 0,
-			select4 + 1,
-			select5 + 0,
-			select5 + 1,
-			select6 + 0,
-			select6 + 1,
-			select7 + 0,
-			select7 + 1,
-		};
+		select0 + 0,
+		select0 + 1,
+		select1 + 0,
+		select1 + 1,
+		select2 + 0,
+		select2 + 1,
+		select3 + 0,
+		select3 + 1,
+		select4 + 0,
+		select4 + 1,
+		select5 + 0,
+		select5 + 1,
+		select6 + 0,
+		select6 + 1,
+		select7 + 0,
+		select7 + 1,
+	};
 
-		Value *byte16 = Nucleus::createBitCast(x.value, Byte16::getType());
-		Value *shuffle = Nucleus::createShuffleVector(byte16, byte16, pshufb);
-		Value *short8 = Nucleus::createBitCast(shuffle, UShort8::getType());
+	Value *byte16 = Nucleus::createBitCast(x.value, Byte16::getType());
+	Value *shuffle = Nucleus::createShuffleVector(byte16, byte16, pshufb);
+	Value *short8 = Nucleus::createBitCast(shuffle, UShort8::getType());
 
-		return RValue<UShort8>(short8);
-	}
+	return RValue<UShort8>(short8);
+}
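+
+// Example: each selectN is the byte offset of a source word, so passing
+// 0, 2, 4, 6, 8, 10, 12, 14 (i.e. 2*N) is the identity swizzle; word N
+// contributes the byte pair {2*N, 2*N+1}.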
 
-	RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::pmulhuw(x, y);
+	return x86::pmulhuw(x, y);
 #else
-		return As<UShort8>(V(lowerMulHigh(V(x.value), V(y.value), false)));
+	return As<UShort8>(V(lowerMulHigh(V(x.value), V(y.value), false)));
 #endif
-	}
+}
 
-	Type *UShort8::getType()
-	{
-		return T(llvm::VectorType::get(T(UShort::getType()), 8));
-	}
+Type *UShort8::getType()
+{
+	return T(llvm::VectorType::get(T(UShort::getType()), 8));
+}
 
-	RValue<Int> operator++(Int &val, int)   // Post-increment
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		RValue<Int> res = val;
+RValue<Int> operator++(Int &val, int)   // Post-increment
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	RValue<Int> res = val;
 
-		Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantInt(1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantInt(1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const Int &operator++(Int &val)   // Pre-increment
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
-		val.storeValue(inc);
+const Int &operator++(Int &val)   // Pre-increment
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<Int> operator--(Int &val, int)   // Post-decrement
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		RValue<Int> res = val;
+RValue<Int> operator--(Int &val, int)   // Post-decrement
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	RValue<Int> res = val;
 
-		Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantInt(1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantInt(1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const Int &operator--(Int &val)   // Pre-decrement
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
-		val.storeValue(inc);
+const Int &operator--(Int &val)   // Pre-decrement
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<Int> RoundInt(RValue<Float> cast)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Int> RoundInt(RValue<Float> cast)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		return x86::cvtss2si(cast);
+	return x86::cvtss2si(cast);
 #else
-		return RValue<Int>(V(lowerRoundInt(V(cast.value), T(Int::getType()))));
+	return RValue<Int>(V(lowerRoundInt(V(cast.value), T(Int::getType()))));
 #endif
-	}
+}
 
-	Type *Int::getType()
-	{
-		return T(llvm::Type::getInt32Ty(jit->context));
-	}
+Type *Int::getType()
+{
+	return T(llvm::Type::getInt32Ty(jit->context));
+}
 
-	Type *Long::getType()
-	{
-		return T(llvm::Type::getInt64Ty(jit->context));
-	}
+Type *Long::getType()
+{
+	return T(llvm::Type::getInt64Ty(jit->context));
+}
 
-	UInt::UInt(RValue<Float> cast)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *integer = Nucleus::createFPToUI(cast.value, UInt::getType());
-		storeValue(integer);
-	}
+UInt::UInt(RValue<Float> cast)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *integer = Nucleus::createFPToUI(cast.value, UInt::getType());
+	storeValue(integer);
+}
 
-	RValue<UInt> operator++(UInt &val, int)   // Post-increment
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		RValue<UInt> res = val;
+RValue<UInt> operator++(UInt &val, int)   // Post-increment
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	RValue<UInt> res = val;
 
-		Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantInt(1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantInt(1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const UInt &operator++(UInt &val)   // Pre-increment
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
-		val.storeValue(inc);
+const UInt &operator++(UInt &val)   // Pre-increment
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<UInt> operator--(UInt &val, int)   // Post-decrement
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		RValue<UInt> res = val;
+RValue<UInt> operator--(UInt &val, int)   // Post-decrement
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	RValue<UInt> res = val;
 
-		Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantInt(1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantInt(1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const UInt &operator--(UInt &val)   // Pre-decrement
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
-		val.storeValue(inc);
+const UInt &operator--(UInt &val)   // Pre-decrement
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
 //	RValue<UInt> RoundUInt(RValue<Float> cast)
 //	{
@@ -2978,10 +2979,10 @@
 //#endif
 //	}
 
-	Type *UInt::getType()
-	{
-		return T(llvm::Type::getInt32Ty(jit->context));
-	}
+Type *UInt::getType()
+{
+	return T(llvm::Type::getInt32Ty(jit->context));
+}
 
 //	Int2::Int2(RValue<Int> cast)
 //	{
@@ -2994,1666 +2995,1668 @@
 //		storeValue(replicate);
 //	}
 
-	RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-	//	return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
+//	return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
 
-		return x86::pslld(lhs, rhs);
+	return x86::pslld(lhs, rhs);
 #else
-		return As<Int2>(V(lowerVectorShl(V(lhs.value), rhs)));
+	return As<Int2>(V(lowerVectorShl(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-	//	return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
+//	return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
 
-		return x86::psrad(lhs, rhs);
+	return x86::psrad(lhs, rhs);
 #else
-		return As<Int2>(V(lowerVectorAShr(V(lhs.value), rhs)));
+	return As<Int2>(V(lowerVectorAShr(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	Type *Int2::getType()
-	{
-		return T(Type_v2i32);
-	}
+Type *Int2::getType()
+{
+	return T(Type_v2i32);
+}
 
-	RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-	//	return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));
+//	return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));
 
-		return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
+	return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
 #else
-		return As<UInt2>(V(lowerVectorShl(V(lhs.value), rhs)));
+	return As<UInt2>(V(lowerVectorShl(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-	//	return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));
+//	return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));
 
-		return x86::psrld(lhs, rhs);
+	return x86::psrld(lhs, rhs);
 #else
-		return As<UInt2>(V(lowerVectorLShr(V(lhs.value), rhs)));
+	return As<UInt2>(V(lowerVectorLShr(V(lhs.value), rhs)));
 #endif
-	}
+}
 
-	Type *UInt2::getType()
-	{
-		return T(Type_v2i32);
-	}
+Type *UInt2::getType()
+{
+	return T(Type_v2i32);
+}
 
-	Int4::Int4(RValue<Byte4> cast) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
+Int4::Int4(RValue<Byte4> cast) : XYZW(this)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			*this = x86::pmovzxbd(As<Byte16>(cast));
-		}
-		else
+	if(CPUID::supportsSSE4_1())
+	{
+		*this = x86::pmovzxbd(As<Byte16>(cast));
+	}
+	else
 #endif
-		{
-			int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
-			Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
-			Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::getType()), swizzle);
-
-			int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};
-			Value *c = Nucleus::createBitCast(b, Short8::getType());
-			Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::getType()), swizzle2);
-
-			*this = As<Int4>(d);
-		}
-	}
-
-	Int4::Int4(RValue<SByte4> cast) : XYZW(this)
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			*this = x86::pmovsxbd(As<SByte16>(cast));
-		}
-		else
-#endif
-		{
-			int swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
-			Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
-			Value *b = Nucleus::createShuffleVector(a, a, swizzle);
+		int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
+		Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
+		Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::getType()), swizzle);
 
-			int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
-			Value *c = Nucleus::createBitCast(b, Short8::getType());
-			Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
+		int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+		Value *c = Nucleus::createBitCast(b, Short8::getType());
+		Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::getType()), swizzle2);
 
-			*this = As<Int4>(d) >> 24;
-		}
-	}
-
-	Int4::Int4(RValue<Short4> cast) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			*this = x86::pmovsxwd(As<Short8>(cast));
-		}
-		else
-#endif
-		{
-			int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
-			Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);
-			*this = As<Int4>(c) >> 16;
-		}
-	}
-
-	Int4::Int4(RValue<UShort4> cast) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			*this = x86::pmovzxwd(As<UShort8>(cast));
-		}
-		else
-#endif
-		{
-			int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
-			Value *c = Nucleus::createShuffleVector(cast.value, Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
-			*this = As<Int4>(c);
-		}
-	}
-
-	Int4::Int4(RValue<Int> rhs) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *vector = loadValue();
-		Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
-
-		int swizzle[4] = {0, 0, 0, 0};
-		Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
-
-		storeValue(replicate);
-	}
-
-	RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::pslld(lhs, rhs);
-#else
-		return As<Int4>(V(lowerVectorShl(V(lhs.value), rhs)));
-#endif
-	}
-
-	RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::psrad(lhs, rhs);
-#else
-		return As<Int4>(V(lowerVectorAShr(V(lhs.value), rhs)));
-#endif
-	}
-
-	RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::pmaxsd(x, y);
-		}
-		else
-#endif
-		{
-			RValue<Int4> greater = CmpNLE(x, y);
-			return (x & greater) | (y & ~greater);
-		}
-	}
-
-	RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::pminsd(x, y);
-		}
-		else
-#endif
-		{
-			RValue<Int4> less = CmpLT(x, y);
-			return (x & less) | (y & ~less);
-		}
-	}
-
-	RValue<Int4> RoundInt(RValue<Float4> cast)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::cvtps2dq(cast);
-#else
-		return As<Int4>(V(lowerRoundInt(V(cast.value), T(Int4::getType()))));
-#endif
-	}
-
-	RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
-		return As<Int4>(V(lowerMulHigh(V(x.value), V(y.value), true)));
-	}
-
-	RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
-		return As<UInt4>(V(lowerMulHigh(V(x.value), V(y.value), false)));
-	}
-
-	RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::packssdw(x, y);
-#else
-		return As<Short8>(V(lowerPack(V(x.value), V(y.value), true)));
-#endif
-	}
-
-	RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::packusdw(x, y);
-#else
-		return As<UShort8>(V(lowerPack(V(x.value), V(y.value), false)));
-#endif
-	}
-
-	RValue<Int> SignMask(RValue<Int4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::movmskps(As<Float4>(x));
-#else
-		return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
-#endif
-	}
-
-	Type *Int4::getType()
-	{
-		return T(llvm::VectorType::get(T(Int::getType()), 4));
-	}
-
-	UInt4::UInt4(RValue<Float4> cast) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *xyzw = Nucleus::createFPToUI(cast.value, UInt4::getType());
-		storeValue(xyzw);
-	}
-
-	UInt4::UInt4(RValue<UInt> rhs) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *vector = loadValue();
-		Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
-
-		int swizzle[4] = {0, 0, 0, 0};
-		Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
-
-		storeValue(replicate);
-	}
-
-	RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
-#else
-		return As<UInt4>(V(lowerVectorShl(V(lhs.value), rhs)));
-#endif
-	}
-
-	RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::psrld(lhs, rhs);
-#else
-		return As<UInt4>(V(lowerVectorLShr(V(lhs.value), rhs)));
-#endif
-	}
-
-	RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::pmaxud(x, y);
-		}
-		else
-#endif
-		{
-			RValue<UInt4> greater = CmpNLE(x, y);
-			return (x & greater) | (y & ~greater);
-		}
-	}
-
-	RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::pminud(x, y);
-		}
-		else
-#endif
-		{
-			RValue<UInt4> less = CmpLT(x, y);
-			return (x & less) | (y & ~less);
-		}
-	}
-
-	Type *UInt4::getType()
-	{
-		return T(llvm::VectorType::get(T(UInt::getType()), 4));
-	}
-
-	Type *Half::getType()
-	{
-		return T(llvm::Type::getInt16Ty(jit->context));
-	}
-
-	RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(exactAtPow2)
-		{
-			// rcpss uses a piecewise-linear approximation which minimizes the relative error
-			// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
-			return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
-		}
-		return x86::rcpss(x);
-#else
-		return As<Float>(V(lowerRCP(V(x.value))));
-#endif
-	}
-
-	RValue<Float> RcpSqrt_pp(RValue<Float> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::rsqrtss(x);
-#else
-		return As<Float>(V(lowerRSQRT(V(x.value))));
-#endif
-	}
-
-	RValue<Float> Sqrt(RValue<Float> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::sqrtss(x);
-#else
-		return As<Float>(V(lowerSQRT(V(x.value))));
-#endif
-	}
-
-	RValue<Float> Round(RValue<Float> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::roundss(x, 0);
-		}
-		else
-		{
-			return Float4(Round(Float4(x))).x;
-		}
-#else
-		return RValue<Float>(V(lowerRound(V(x.value))));
-#endif
-	}
-
-	RValue<Float> Trunc(RValue<Float> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::roundss(x, 3);
-		}
-		else
-		{
-			return Float(Int(x));   // Rounded toward zero
-		}
-#else
-		return RValue<Float>(V(lowerTrunc(V(x.value))));
-#endif
-	}
-
-	RValue<Float> Frac(RValue<Float> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x - x86::floorss(x);
-		}
-		else
-		{
-			return Float4(Frac(Float4(x))).x;
-		}
-#else
-		// x - floor(x) can be 1.0 for very small negative x.
-		// Clamp against the value just below 1.0.
-		return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
-#endif
-	}
-
-	RValue<Float> Floor(RValue<Float> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::floorss(x);
-		}
-		else
-		{
-			return Float4(Floor(Float4(x))).x;
-		}
-#else
-		return RValue<Float>(V(lowerFloor(V(x.value))));
-#endif
-	}
-
-	RValue<Float> Ceil(RValue<Float> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::ceilss(x);
-		}
-		else
-#endif
-		{
-			return Float4(Ceil(Float4(x))).x;
-		}
-	}
-
-	Type *Float::getType()
-	{
-		return T(llvm::Type::getFloatTy(jit->context));
-	}
-
-	Type *Float2::getType()
-	{
-		return T(Type_v2f32);
-	}
-
-	RValue<Float> Exp2(RValue<Float> v)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float::getType()) } );
-		return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value))));
-	}
-
-	RValue<Float> Log2(RValue<Float> v)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float::getType()) } );
-		return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value))));
-	}
-
-	Float4::Float4(RValue<Float> rhs) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *vector = loadValue();
-		Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
-
-		int swizzle[4] = {0, 0, 0, 0};
-		Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
-
-		storeValue(replicate);
-	}
-
-	RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::maxps(x, y);
-#else
-		return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OGT)));
-#endif
-	}
-
-	RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::minps(x, y);
-#else
-		return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OLT)));
-#endif
-	}
-
-	RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(exactAtPow2)
-		{
-			// rcpps uses a piecewise-linear approximation which minimizes the relative error
-			// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
-			return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
-		}
-		return x86::rcpps(x);
-#else
-		return As<Float4>(V(lowerRCP(V(x.value))));
-#endif
-	}
-
-	RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::rsqrtps(x);
-#else
-		return As<Float4>(V(lowerRSQRT(V(x.value))));
-#endif
-	}
-
-	RValue<Float4> Sqrt(RValue<Float4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::sqrtps(x);
-#else
-		return As<Float4>(V(lowerSQRT(V(x.value))));
-#endif
-	}
-
-	RValue<Int> SignMask(RValue<Float4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		return x86::movmskps(x);
-#else
-		return As<Int>(V(lowerFPSignMask(V(x.value), T(Int::getType()))));
-#endif
-	}
-
-	RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-	//	return As<Int4>(x86::cmpeqps(x, y));
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-	//	return As<Int4>(x86::cmpltps(x, y));
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-	//	return As<Int4>(x86::cmpleps(x, y));
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-	//	return As<Int4>(x86::cmpneqps(x, y));
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-	//	return As<Int4>(x86::cmpnltps(x, y));
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-	//	return As<Int4>(x86::cmpnleps(x, y));
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value, y.value), Int4::getType()));
-	}
-
-	RValue<Float4> Round(RValue<Float4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::roundps(x, 0);
-		}
-		else
-		{
-			return Float4(RoundInt(x));
-		}
-#else
-		return RValue<Float4>(V(lowerRound(V(x.value))));
-#endif
-	}
-
-	RValue<Float4> Trunc(RValue<Float4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::roundps(x, 3);
-		}
-		else
-		{
-			return Float4(Int4(x));
-		}
-#else
-		return RValue<Float4>(V(lowerTrunc(V(x.value))));
-#endif
-	}
-
-	RValue<Float4> Frac(RValue<Float4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Float4 frc;
-
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			frc = x - Floor(x);
-		}
-		else
-		{
-			frc = x - Float4(Int4(x));   // Signed fractional part.
-
-			frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f)));   // Add 1.0 if negative.
-		}
-#else
-		frc = x - Floor(x);
-#endif
-
-		// x - floor(x) can be 1.0 for very small negative x.
-		// Clamp against the value just below 1.0.
-		return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
-	}
-
-	RValue<Float4> Floor(RValue<Float4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::floorps(x);
-		}
-		else
-		{
-			return x - Frac(x);
-		}
-#else
-		return RValue<Float4>(V(lowerFloor(V(x.value))));
-#endif
-	}
-
-	RValue<Float4> Ceil(RValue<Float4> x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-#if defined(__i386__) || defined(__x86_64__)
-		if(CPUID::supportsSSE4_1())
-		{
-			return x86::ceilps(x);
-		}
-		else
-#endif
-		{
-			return -Floor(-x);
-		}
-	}
-
-	RValue<Float4> Sin(RValue<Float4> v)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sin, { V(v.value)->getType() } );
-		return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
-	}
-
-	RValue<Float4> Cos(RValue<Float4> v)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cos, { V(v.value)->getType() } );
-		return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
-	}
-
-	RValue<Float4> Tan(RValue<Float4> v)
-	{
-		return Sin(v) / Cos(v);
-	}
-
-	static RValue<Float4> TransformFloat4PerElement(RValue<Float4> v, const char* name)
-	{
-		auto funcTy = ::llvm::FunctionType::get(T(Float::getType()), ::llvm::ArrayRef<llvm::Type*>(T(Float::getType())), false);
-		auto func = jit->module->getOrInsertFunction(name, funcTy);
-		llvm::Value *out = ::llvm::UndefValue::get(T(Float4::getType()));
-		for (uint64_t i = 0; i < 4; i++)
-		{
-			auto el = jit->builder->CreateCall(func, V(Nucleus::createExtractElement(v.value, Float::getType(), i)));
-			out = V(Nucleus::createInsertElement(V(out), V(el), i));
-		}
-		return RValue<Float4>(V(out));
-	}
-
-	RValue<Float4> Asin(RValue<Float4> v)
-	{
-		return TransformFloat4PerElement(v, "asinf");
-	}
-
-	RValue<Float4> Acos(RValue<Float4> v)
-	{
-		return TransformFloat4PerElement(v, "acosf");
-	}
-
-	RValue<Float4> Atan(RValue<Float4> v)
-	{
-		return TransformFloat4PerElement(v, "atanf");
-	}
-
-	RValue<Float4> Sinh(RValue<Float4> v)
-	{
-		return Float4(0.5f) * (Exp(v) - Exp(-v));
-	}
-
-	RValue<Float4> Cosh(RValue<Float4> v)
-	{
-		return Float4(0.5f) * (Exp(v) + Exp(-v));
-	}
-
-	RValue<Float4> Tanh(RValue<Float4> v)
-	{
-		return TransformFloat4PerElement(v, "tanhf");
-	}
-
-	RValue<Float4> Asinh(RValue<Float4> v)
-	{
-		return TransformFloat4PerElement(v, "asinhf");
-	}
-
-	RValue<Float4> Acosh(RValue<Float4> v)
-	{
-		return TransformFloat4PerElement(v, "acoshf");
-	}
-
-	RValue<Float4> Atanh(RValue<Float4> v)
-	{
-		return TransformFloat4PerElement(v, "atanhf");
-	}
-
-	RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
-	{
-		::llvm::SmallVector<::llvm::Type*, 2> paramTys;
-		paramTys.push_back(T(Float::getType()));
-		paramTys.push_back(T(Float::getType()));
-		auto funcTy = ::llvm::FunctionType::get(T(Float::getType()), paramTys, false);
-		auto func = jit->module->getOrInsertFunction("atan2f", funcTy);
-		llvm::Value *out = ::llvm::UndefValue::get(T(Float4::getType()));
-		for (uint64_t i = 0; i < 4; i++)
-		{
-			auto el = jit->builder->CreateCall2(func, ARGS(
-					V(Nucleus::createExtractElement(x.value, Float::getType(), i)),
-					V(Nucleus::createExtractElement(y.value, Float::getType(), i))
-				));
-			out = V(Nucleus::createInsertElement(V(out), V(el), i));
-		}
-		return RValue<Float4>(V(out));
-	}
-
-	RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::pow, { T(Float4::getType()) });
-		return RValue<Float4>(V(jit->builder->CreateCall2(func, ARGS(V(x.value), V(y.value)))));
-	}
-
-	RValue<Float4> Exp(RValue<Float4> v)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp, { T(Float4::getType()) } );
-		return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
-	}
-
-	RValue<Float4> Log(RValue<Float4> v)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log, { T(Float4::getType()) } );
-		return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
-	}
-
-	RValue<Float4> Exp2(RValue<Float4> v)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float4::getType()) } );
-		return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
-	}
-
-	RValue<Float4> Log2(RValue<Float4> v)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float4::getType()) } );
-		return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
-	}
-
-	RValue<UInt> Ctlz(RValue<UInt> v, bool isZeroUndef)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt::getType()) } );
-		return RValue<UInt>(V(jit->builder->CreateCall2(func, ARGS(
-			V(v.value),
-			isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)
-		))));
-	}
-
-	RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt4::getType()) } );
-		return RValue<UInt4>(V(jit->builder->CreateCall2(func, ARGS(
-			V(v.value),
-			isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)
-		))));
-	}
-
-	RValue<UInt> Cttz(RValue<UInt> v, bool isZeroUndef)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt::getType()) } );
-		return RValue<UInt>(V(jit->builder->CreateCall2(func, ARGS(
-			V(v.value),
-			isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)
-		))));
-	}
-
-	RValue<UInt4> Cttz(RValue<UInt4> v, bool isZeroUndef)
-	{
-		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt4::getType()) } );
-		return RValue<UInt4>(V(jit->builder->CreateCall2(func, ARGS(
-			V(v.value),
-			isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)
-		))));
-	}
-
-	Type *Float4::getType()
-	{
-		return T(llvm::VectorType::get(T(Float::getType()), 4));
-	}
-
-	RValue<Long> Ticks()
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::readcyclecounter);
-
-		return RValue<Long>(V(jit->builder->CreateCall(rdtsc)));
-	}
-
-	RValue<Pointer<Byte>> ConstantPointer(void const * ptr)
-	{
-		// Note: this should work for 32-bit pointers as well because 'inttoptr'
-		// is defined to truncate (and zero extend) if necessary.
-		auto ptrAsInt = ::llvm::ConstantInt::get(::llvm::Type::getInt64Ty(jit->context), reinterpret_cast<uintptr_t>(ptr));
-		return RValue<Pointer<Byte>>(V(jit->builder->CreateIntToPtr(ptrAsInt, T(Pointer<Byte>::getType()))));
-	}
-
-	RValue<Pointer<Byte>> ConstantData(void const * data, size_t size)
-	{
-		auto str = ::llvm::StringRef(reinterpret_cast<const char*>(data), size);
-		auto ptr = jit->builder->CreateGlobalStringPtr(str);
-		return RValue<Pointer<Byte>>(V(ptr));
-	}
-
-	Value* Call(RValue<Pointer<Byte>> fptr, Type* retTy, std::initializer_list<Value*> args, std::initializer_list<Type*> argTys)
-	{
-		::llvm::SmallVector<::llvm::Type*, 8> paramTys;
-		for (auto ty : argTys) { paramTys.push_back(T(ty)); }
-		auto funcTy = ::llvm::FunctionType::get(T(retTy), paramTys, false);
-
-		auto funcPtrTy = funcTy->getPointerTo();
-		auto funcPtr = jit->builder->CreatePointerCast(V(fptr.value), funcPtrTy);
-
-		::llvm::SmallVector<::llvm::Value*, 8> arguments;
-		for (auto arg : args) { arguments.push_back(V(arg)); }
-		return V(jit->builder->CreateCall(funcPtr, arguments));
-	}
-
-	void Breakpoint()
-	{
-		llvm::Function *debugtrap = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::debugtrap);
-
-		jit->builder->CreateCall(debugtrap);
+		*this = As<Int4>(d);
 	}
 }
 
-namespace rr
+Int4::Int4(RValue<SByte4> cast) : XYZW(this)
 {
+	RR_DEBUG_INFO_UPDATE_LOC();
 #if defined(__i386__) || defined(__x86_64__)
-	namespace x86
+	if(CPUID::supportsSSE4_1())
 	{
-		RValue<Int> cvtss2si(RValue<Float> val)
-		{
-			llvm::Function *cvtss2si = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_cvtss2si);
-
-			Float4 vector;
-			vector.x = val;
-
-			return RValue<Int>(V(jit->builder->CreateCall(cvtss2si, ARGS(V(RValue<Float4>(vector).value)))));
-		}
-
-		RValue<Int4> cvtps2dq(RValue<Float4> val)
-		{
-			llvm::Function *cvtps2dq = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_cvtps2dq);
-
-			return RValue<Int4>(V(jit->builder->CreateCall(cvtps2dq, ARGS(V(val.value)))));
-		}
-
-		RValue<Float> rcpss(RValue<Float> val)
-		{
-			llvm::Function *rcpss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rcp_ss);
-
-			Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
-
-			return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(rcpss, ARGS(V(vector)))), Float::getType(), 0));
-		}
-
-		RValue<Float> sqrtss(RValue<Float> val)
-		{
-			llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sqrt, {V(val.value)->getType()});
-			return RValue<Float>(V(jit->builder->CreateCall(sqrt, ARGS(V(val.value)))));
-		}
-
-		RValue<Float> rsqrtss(RValue<Float> val)
-		{
-			llvm::Function *rsqrtss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rsqrt_ss);
-
-			Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
-
-			return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(rsqrtss, ARGS(V(vector)))), Float::getType(), 0));
-		}
-
-		RValue<Float4> rcpps(RValue<Float4> val)
-		{
-			llvm::Function *rcpps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rcp_ps);
-
-			return RValue<Float4>(V(jit->builder->CreateCall(rcpps, ARGS(V(val.value)))));
-		}
-
-		RValue<Float4> sqrtps(RValue<Float4> val)
-		{
-			llvm::Function *sqrtps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sqrt, {V(val.value)->getType()});
-
-			return RValue<Float4>(V(jit->builder->CreateCall(sqrtps, ARGS(V(val.value)))));
-		}
-
-		RValue<Float4> rsqrtps(RValue<Float4> val)
-		{
-			llvm::Function *rsqrtps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rsqrt_ps);
-
-			return RValue<Float4>(V(jit->builder->CreateCall(rsqrtps, ARGS(V(val.value)))));
-		}
-
-		RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
-		{
-			llvm::Function *maxps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_max_ps);
-
-			return RValue<Float4>(V(jit->builder->CreateCall2(maxps, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
-		{
-			llvm::Function *minps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_min_ps);
-
-			return RValue<Float4>(V(jit->builder->CreateCall2(minps, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<Float> roundss(RValue<Float> val, unsigned char imm)
-		{
-			llvm::Function *roundss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ss);
-
-			Value *undef = V(llvm::UndefValue::get(T(Float4::getType())));
-			Value *vector = Nucleus::createInsertElement(undef, val.value, 0);
-
-			return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall3(roundss, ARGS(V(undef), V(vector), V(Nucleus::createConstantInt(imm))))), Float::getType(), 0));
-		}
-
-		RValue<Float> floorss(RValue<Float> val)
-		{
-			return roundss(val, 1);
-		}
-
-		RValue<Float> ceilss(RValue<Float> val)
-		{
-			return roundss(val, 2);
-		}
-
-		RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
-		{
-			llvm::Function *roundps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ps);
-
-			return RValue<Float4>(V(jit->builder->CreateCall2(roundps, ARGS(V(val.value), V(Nucleus::createConstantInt(imm))))));
-		}
-
-		RValue<Float4> floorps(RValue<Float4> val)
-		{
-			return roundps(val, 1);
-		}
-
-		RValue<Float4> ceilps(RValue<Float4> val)
-		{
-			return roundps(val, 2);
-		}
-
-		RValue<Int4> pabsd(RValue<Int4> x)
-		{
-			return RValue<Int4>(V(lowerPABS(V(x.value))));
-		}
-
-		RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
-		{
-			#if LLVM_VERSION_MAJOR >= 8
-				return As<Short4>(V(lowerPSADDSAT(V(x.value), V(y.value))));
-			#else
-				llvm::Function *paddsw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_padds_w);
-
-				return As<Short4>(V(jit->builder->CreateCall2(paddsw, ARGS(V(x.value), V(y.value)))));
-			#endif
-		}
-
-		RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
-		{
-			#if LLVM_VERSION_MAJOR >= 8
-				return As<Short4>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
-			#else
-				llvm::Function *psubsw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubs_w);
-
-				return As<Short4>(V(jit->builder->CreateCall2(psubsw, ARGS(V(x.value), V(y.value)))));
-			#endif
-		}
-
-		RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
-		{
-			#if LLVM_VERSION_MAJOR >= 8
-				return As<UShort4>(V(lowerPUADDSAT(V(x.value), V(y.value))));
-			#else
-				llvm::Function *paddusw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_paddus_w);
-
-				return As<UShort4>(V(jit->builder->CreateCall2(paddusw, ARGS(V(x.value), V(y.value)))));
-			#endif
-		}
-
-		RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
-		{
-			#if LLVM_VERSION_MAJOR >= 8
-				return As<UShort4>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
-			#else
-				llvm::Function *psubusw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubus_w);
-
-				return As<UShort4>(V(jit->builder->CreateCall2(psubusw, ARGS(V(x.value), V(y.value)))));
-			#endif
-		}
-
-		RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
-		{
-			#if LLVM_VERSION_MAJOR >= 8
-				return As<SByte8>(V(lowerPSADDSAT(V(x.value), V(y.value))));
-			#else
-				llvm::Function *paddsb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_padds_b);
-
-				return As<SByte8>(V(jit->builder->CreateCall2(paddsb, ARGS(V(x.value), V(y.value)))));
-			#endif
-		}
-
-		RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
-		{
-			#if LLVM_VERSION_MAJOR >= 8
-				return As<SByte8>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
-			#else
-				llvm::Function *psubsb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubs_b);
-
-				return As<SByte8>(V(jit->builder->CreateCall2(psubsb, ARGS(V(x.value), V(y.value)))));
-			#endif
-		}
-
-		RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
-		{
-			#if LLVM_VERSION_MAJOR >= 8
-				return As<Byte8>(V(lowerPUADDSAT(V(x.value), V(y.value))));
-			#else
-				llvm::Function *paddusb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_paddus_b);
-
-				return As<Byte8>(V(jit->builder->CreateCall2(paddusb, ARGS(V(x.value), V(y.value)))));
-			#endif
-		}
-
-		RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
-		{
-			#if LLVM_VERSION_MAJOR >= 8
-				return As<Byte8>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
-			#else
-				llvm::Function *psubusb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubus_b);
-
-				return As<Byte8>(V(jit->builder->CreateCall2(psubusb, ARGS(V(x.value), V(y.value)))));
-			#endif
-		}
-
-		RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
-		{
-			return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
-		}
-
-		RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
-		{
-			return As<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
-		}
-
-		RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
-		{
-			return As<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
-		}
-
-		RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
-		{
-			return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
-		}
-
-		RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
-		{
-			return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
-		}
-
-		RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
-		{
-			return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
-		}
-
-		RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
-		{
-			return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
-		}
-
-		RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
-		{
-			llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packssdw_128);
-
-			return As<Short4>(V(jit->builder->CreateCall2(packssdw, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
-		{
-			llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packssdw_128);
-
-			return RValue<Short8>(V(jit->builder->CreateCall2(packssdw, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
-		{
-			llvm::Function *packsswb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packsswb_128);
-
-			return As<SByte8>(V(jit->builder->CreateCall2(packsswb, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
-		{
-			llvm::Function *packuswb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packuswb_128);
-
-			return As<Byte8>(V(jit->builder->CreateCall2(packuswb, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
-		{
-			if(CPUID::supportsSSE4_1())
-			{
-				llvm::Function *packusdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_packusdw);
-
-				return RValue<UShort8>(V(jit->builder->CreateCall2(packusdw, ARGS(V(x.value), V(y.value)))));
-			}
-			else
-			{
-				RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
-				RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
-
-				return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
-			}
-		}
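
The pre-SSE4.1 fallback above emulates unsigned saturation with the signed pack: negatives are clamped to zero first, values are biased into packssdw's signed range, and the bias is removed after packing. A per-element scalar sketch of the same transform, assuming two's complement arithmetic shifts (packUnsignedSaturate is an illustrative name, not part of this change):

#include <cstdint>

// Clamp x to [0, 65535] the way the packusdw fallback does.
static uint16_t packUnsignedSaturate(int32_t x)
{
	int32_t clamped = x & ~(x >> 31);    // x >> 31 is all ones for negative x
	int32_t biased = clamped - 0x8000;   // shift into packssdw's signed range
	if(biased > 32767)
	{
		biased = 32767;   // packssdw's signed saturation
	}
	return static_cast<uint16_t>(biased + 0x8000);   // un-bias back to unsigned
}
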
-
-		RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
-		{
-			llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_w);
-
-			return As<UShort4>(V(jit->builder->CreateCall2(psrlw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
-		{
-			llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_w);
-
-			return RValue<UShort8>(V(jit->builder->CreateCall2(psrlw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
-		{
-			llvm::Function *psraw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_w);
-
-			return As<Short4>(V(jit->builder->CreateCall2(psraw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
-		{
-			llvm::Function *psraw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_w);
-
-			return RValue<Short8>(V(jit->builder->CreateCall2(psraw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
-		{
-			llvm::Function *psllw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_w);
-
-			return As<Short4>(V(jit->builder->CreateCall2(psllw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
-		{
-			llvm::Function *psllw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_w);
-
-			return RValue<Short8>(V(jit->builder->CreateCall2(psllw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
-		{
-			llvm::Function *pslld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_d);
-
-			return As<Int2>(V(jit->builder->CreateCall2(pslld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
-		{
-			llvm::Function *pslld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_d);
-
-			return RValue<Int4>(V(jit->builder->CreateCall2(pslld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
-		{
-			llvm::Function *psrad = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_d);
-
-			return As<Int2>(V(jit->builder->CreateCall2(psrad, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
-		{
-			llvm::Function *psrad = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_d);
-
-			return RValue<Int4>(V(jit->builder->CreateCall2(psrad, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
-		{
-			llvm::Function *psrld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_d);
-
-			return As<UInt2>(V(jit->builder->CreateCall2(psrld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
-		{
-			llvm::Function *psrld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_d);
-
-			return RValue<UInt4>(V(jit->builder->CreateCall2(psrld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
-		}
-
-		RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
-		{
-			return RValue<Int4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
-		}
-
-		RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
-		{
-			return RValue<Int4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
-		}
-
-		RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
-		{
-			return RValue<UInt4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_UGT)));
-		}
-
-		RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
-		{
-			return RValue<UInt4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_ULT)));
-		}
-
-		RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
-		{
-			llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulh_w);
-
-			return As<Short4>(V(jit->builder->CreateCall2(pmulhw, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
-		{
-			llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulhu_w);
-
-			return As<UShort4>(V(jit->builder->CreateCall2(pmulhuw, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
-		{
-			llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmadd_wd);
-
-			return As<Int2>(V(jit->builder->CreateCall2(pmaddwd, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
-		{
-			llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulh_w);
-
-			return RValue<Short8>(V(jit->builder->CreateCall2(pmulhw, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
-		{
-			llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulhu_w);
-
-			return RValue<UShort8>(V(jit->builder->CreateCall2(pmulhuw, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
-		{
-			llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmadd_wd);
-
-			return RValue<Int4>(V(jit->builder->CreateCall2(pmaddwd, ARGS(V(x.value), V(y.value)))));
-		}
-
-		RValue<Int> movmskps(RValue<Float4> x)
-		{
-			llvm::Function *movmskps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_movmsk_ps);
-
-			return RValue<Int>(V(jit->builder->CreateCall(movmskps, ARGS(V(x.value)))));
-		}
-
-		RValue<Int> pmovmskb(RValue<Byte8> x)
-		{
-			llvm::Function *pmovmskb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmovmskb_128);
-
-			return RValue<Int>(V(jit->builder->CreateCall(pmovmskb, ARGS(V(x.value))))) & 0xFF;
-		}
-
-		RValue<Int4> pmovzxbd(RValue<Byte16> x)
-		{
-			return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), false)));
-		}
-
-		RValue<Int4> pmovsxbd(RValue<SByte16> x)
-		{
-			return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), true)));
-		}
-
-		RValue<Int4> pmovzxwd(RValue<UShort8> x)
-		{
-			return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), false)));
-		}
-
-		RValue<Int4> pmovsxwd(RValue<Short8> x)
-		{
-			return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), true)));
-		}
+		*this = x86::pmovsxbd(As<SByte16>(cast));
 	}
+	else
+#endif
+	{
+		int swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
+		Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
+		Value *b = Nucleus::createShuffleVector(a, a, swizzle);
+
+		int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
+		Value *c = Nucleus::createBitCast(b, Short8::getType());
+		Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
+
+		*this = As<Int4>(d) >> 24;
+	}
+}
+
+Int4::Int4(RValue<Short4> cast) : XYZW(this)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		*this = x86::pmovsxwd(As<Short8>(cast));
+	}
+	else
+#endif
+	{
+		int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
+		Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);
+		*this = As<Int4>(c) >> 16;
+	}
+}
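
The non-SSE4.1 path above sign-extends without a widening instruction: the {0, 0, 1, 1, ...} shuffle duplicates each 16-bit element into both halves of a 32-bit lane, and the arithmetic shift right by 16 then restores the sign. A one-lane scalar sketch, assuming little-endian lanes and two's complement shifts (signExtend16 is an illustrative name):

#include <cstdint>

// After the duplicating shuffle, a 32-bit lane holds the element in both
// halves; shifting arithmetically right by 16 leaves the sign-extended value.
static int32_t signExtend16(int16_t s)
{
	uint16_t u = static_cast<uint16_t>(s);
	uint32_t lane = (static_cast<uint32_t>(u) << 16) | u;   // {0, 0, 1, 1, ...}
	return static_cast<int32_t>(lane) >> 16;
}
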
+
+Int4::Int4(RValue<UShort4> cast) : XYZW(this)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		*this = x86::pmovzxwd(As<UShort8>(cast));
+	}
+	else
+#endif
+	{
+		int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+		Value *c = Nucleus::createShuffleVector(cast.value, Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
+		*this = As<Int4>(c);
+	}
+}
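
Here the {0, 8, 1, 9, ...} shuffle instead interleaves each 16-bit element with a zero element from the Short8 constant, which on a little-endian target lays down exactly the bytes of the zero-extended 32-bit value. A one-lane sketch (zeroExtend16 is an illustrative name):

#include <cstdint>
#include <cstring>

// Interleaving a 16-bit element with a zero element produces the same bytes
// as widening it to 32 bits, assuming little-endian storage.
static uint32_t zeroExtend16(uint16_t element)
{
	uint16_t lanes[2] = {element, 0};   // what the interleaving shuffle emits
	uint32_t widened;
	std::memcpy(&widened, lanes, sizeof widened);
	return widened;   // equals static_cast<uint32_t>(element) on little-endian
}
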
+
+Int4::Int4(RValue<Int> rhs) : XYZW(this)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *vector = loadValue();
+	Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
+
+	int swizzle[4] = {0, 0, 0, 0};
+	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
+
+	storeValue(replicate);
+}
+
+RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::pslld(lhs, rhs);
+#else
+	return As<Int4>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
+}
+
+RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::psrad(lhs, rhs);
+#else
+	return As<Int4>(V(lowerVectorAShr(V(lhs.value), rhs)));
+#endif
+}
+
+RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		return x86::pmaxsd(x, y);
+	}
+	else
+#endif
+	{
+		RValue<Int4> greater = CmpNLE(x, y);
+		return (x & greater) | (y & ~greater);
+	}
+}
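
The fallback above relies on the comparison producing all-ones or all-zeros lanes, so the maximum can be selected per lane without branching. A scalar model of the idiom, assuming two's complement 32-bit lanes (maskSelectMax is an illustrative name):

#include <cstdint>

// CmpNLE yields -1 (all ones) where x > y and 0 elsewhere; AND/OR against
// that mask picks x or y per lane with no branches.
static int32_t maskSelectMax(int32_t x, int32_t y)
{
	int32_t greater = (x > y) ? -1 : 0;   // scalar stand-in for a CmpNLE lane
	return (x & greater) | (y & ~greater);
}

The same mask-select shape appears in Min below and in the UInt4 overloads.
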
+
+RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		return x86::pminsd(x, y);
+	}
+	else
+#endif
+	{
+		RValue<Int4> less = CmpLT(x, y);
+		return (x & less) | (y & ~less);
+	}
+}
+
+RValue<Int4> RoundInt(RValue<Float4> cast)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::cvtps2dq(cast);
+#else
+	return As<Int4>(V(lowerRoundInt(V(cast.value), T(Int4::getType()))));
+#endif
+}
+
+RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
+	return As<Int4>(V(lowerMulHigh(V(x.value), V(y.value), true)));
+}
+
+RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
+	return As<UInt4>(V(lowerMulHigh(V(x.value), V(y.value), false)));
+}
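
Both overloads fall back to lowerMulHigh pending the x86 intrinsic path the TODO mentions. Assuming, as the names suggest, that lowerMulHigh returns the upper 32 bits of the widened 64-bit product, a scalar reference looks like this (function names are illustrative):

#include <cstdint>

// High half of the 64-bit product; the bool passed to lowerMulHigh above
// presumably selects between the signed and unsigned variants.
static int32_t mulHighSigned(int32_t a, int32_t b)
{
	return static_cast<int32_t>((static_cast<int64_t>(a) * b) >> 32);
}

static uint32_t mulHighUnsigned(uint32_t a, uint32_t b)
{
	return static_cast<uint32_t>((static_cast<uint64_t>(a) * b) >> 32);
}
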
+
+RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::packssdw(x, y);
+#else
+	return As<Short8>(V(lowerPack(V(x.value), V(y.value), true)));
+#endif
+}
+
+RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::packusdw(x, y);
+#else
+	return As<UShort8>(V(lowerPack(V(x.value), V(y.value), false)));
+#endif
+}
+
+RValue<Int> SignMask(RValue<Int4> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::movmskps(As<Float4>(x));
+#else
+	return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
+#endif
+}
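
movmskps samples only the top bit of each lane, which is why reinterpreting the Int4 as Float4 is safe here. A scalar sketch of the value SignMask computes (signMask4 is an illustrative name):

#include <cstdint>

// Gather the four lane sign bits into the low four bits of the result,
// matching what movmskps returns for a 4-lane vector.
static int signMask4(const int32_t lanes[4])
{
	int mask = 0;
	for(int i = 0; i < 4; i++)
	{
		mask |= static_cast<int>(static_cast<uint32_t>(lanes[i]) >> 31) << i;
	}
	return mask;
}
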
+
+Type *Int4::getType()
+{
+	return T(llvm::VectorType::get(T(Int::getType()), 4));
+}
+
+UInt4::UInt4(RValue<Float4> cast) : XYZW(this)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *xyzw = Nucleus::createFPToUI(cast.value, UInt4::getType());
+	storeValue(xyzw);
+}
+
+UInt4::UInt4(RValue<UInt> rhs) : XYZW(this)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *vector = loadValue();
+	Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
+
+	int swizzle[4] = {0, 0, 0, 0};
+	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
+
+	storeValue(replicate);
+}
+
+RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
+#else
+	return As<UInt4>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
+}
+
+RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::psrld(lhs, rhs);
+#else
+	return As<UInt4>(V(lowerVectorLShr(V(lhs.value), rhs)));
+#endif
+}
+
+RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
+}
+
+RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType()));
+}
+
+RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value, y.value), Int4::getType()));
+}
+
+RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
+}
+
+RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value, y.value), Int4::getType()));
+}
+
+RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType()));
+}
+
+RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		return x86::pmaxud(x, y);
+	}
+	else
+#endif
+	{
+		RValue<UInt4> greater = CmpNLE(x, y);
+		return (x & greater) | (y & ~greater);
+	}
+}
+
+RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		return x86::pminud(x, y);
+	}
+	else
+#endif
+	{
+		RValue<UInt4> less = CmpLT(x, y);
+		return (x & less) | (y & ~less);
+	}
+}
+
+Type *UInt4::getType()
+{
+	return T(llvm::VectorType::get(T(UInt::getType()), 4));
+}
+
+Type *Half::getType()
+{
+	return T(llvm::Type::getInt16Ty(jit->context));
+}
+
+RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(exactAtPow2)
+	{
+		// rcpss uses a piecewise-linear approximation which minimizes the relative error
+		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+		return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+	}
+	return x86::rcpss(x);
+#else
+	return As<Float>(V(lowerRCP(V(x.value))));
+#endif
+}
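
The rectification above relies on rcpss having the same relative error at every power of two, so one correction factor, measured at 1.0f, fixes them all. A standalone sketch using the same intrinsics, assuming SSE is available (rectifiedRcp is an illustrative name):

#include <xmmintrin.h>

// Approximate 1/x, then scale by the exact inverse of rcp(1.0f) so that
// power-of-two inputs come out exact, mirroring the exactAtPow2 path above.
static float rectifiedRcp(float x)
{
	float approx = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(x)));
	float factor = 1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f)));
	return approx * factor;
}
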
+
+RValue<Float> RcpSqrt_pp(RValue<Float> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::rsqrtss(x);
+#else
+	return As<Float>(V(lowerRSQRT(V(x.value))));
+#endif
+}
+
+RValue<Float> Sqrt(RValue<Float> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::sqrtss(x);
+#else
+	return As<Float>(V(lowerSQRT(V(x.value))));
+#endif
+}
+
+RValue<Float> Round(RValue<Float> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		return x86::roundss(x, 0);
+	}
+	else
+	{
+		return Float4(Round(Float4(x))).x;
+	}
+#else
+	return RValue<Float>(V(lowerRound(V(x.value))));
+#endif
+}
+
+RValue<Float> Trunc(RValue<Float> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		return x86::roundss(x, 3);
+	}
+	else
+	{
+		return Float(Int(x));   // Rounded toward zero
+	}
+#else
+	return RValue<Float>(V(lowerTrunc(V(x.value))));
+#endif
+}
+
+RValue<Float> Frac(RValue<Float> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		return x - x86::floorss(x);
+	}
+	else
+	{
+		return Float4(Frac(Float4(x))).x;
+	}
+#else
+	// x - floor(x) can be 1.0 for very small negative x.
+	// Clamp against the value just below 1.0.
+	return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
+#endif
+}
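
The clamp in the non-x86 path guards a real rounding hazard: for a tiny negative x, x - Floor(x) rounds up to exactly 1.0f, while a fractional part must stay below 1.0. A plain-float illustration, assuming IEEE-754 single precision (fracClamped is an illustrative name):

#include <algorithm>
#include <cmath>

// 0x3F7FFFFF is the largest float below 1.0f (0.99999994f); clamping to it
// keeps the result in [0, 1) even when x - floor(x) rounds up to 1.0f,
// as it does for x = -1e-8f.
static float fracClamped(float x)
{
	const float belowOne = 0.99999994f;   // bit pattern 0x3F7FFFFF
	return std::min(x - std::floor(x), belowOne);
}
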
+
+RValue<Float> Floor(RValue<Float> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		return x86::floorss(x);
+	}
+	else
+	{
+		return Float4(Floor(Float4(x))).x;
+	}
+#else
+	return RValue<Float>(V(lowerFloor(V(x.value))));
+#endif
+}
+
+RValue<Float> Ceil(RValue<Float> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		return x86::ceilss(x);
+	}
+	else
+#endif
+	{
+		return Float4(Ceil(Float4(x))).x;
+	}
+}
+
+Type *Float::getType()
+{
+	return T(llvm::Type::getFloatTy(jit->context));
+}
+
+Type *Float2::getType()
+{
+	return T(Type_v2f32);
+}
+
+RValue<Float> Exp2(RValue<Float> v)
+{
+	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float::getType()) } );
+	return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value))));
+}
+
+RValue<Float> Log2(RValue<Float> v)
+{
+	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float::getType()) } );
+	return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value))));
+}
+
+Float4::Float4(RValue<Float> rhs) : XYZW(this)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *vector = loadValue();
+	Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
+
+	int swizzle[4] = {0, 0, 0, 0};
+	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
+
+	storeValue(replicate);
+}
+
+RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::maxps(x, y);
+#else
+	return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OGT)));
+#endif
+}
+
+RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::minps(x, y);
+#else
+	return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OLT)));
+#endif
+}
+
+RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(exactAtPow2)
+	{
+		// rcpps uses a piecewise-linear approximation which minimizes the relative error
+		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+		return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+	}
+	return x86::rcpps(x);
+#else
+	return As<Float4>(V(lowerRCP(V(x.value))));
+#endif
+}
+
+RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::rsqrtps(x);
+#else
+	return As<Float4>(V(lowerRSQRT(V(x.value))));
+#endif
+}
+
+RValue<Float4> Sqrt(RValue<Float4> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::sqrtps(x);
+#else
+	return As<Float4>(V(lowerSQRT(V(x.value))));
+#endif
+}
+
+RValue<Int> SignMask(RValue<Float4> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::movmskps(x);
+#else
+	return As<Int>(V(lowerFPSignMask(V(x.value), T(Int::getType()))));
+#endif
+}
+
+RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+//	return As<Int4>(x86::cmpeqps(x, y));
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+//	return As<Int4>(x86::cmpltps(x, y));
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+//	return As<Int4>(x86::cmpleps(x, y));
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+//	return As<Int4>(x86::cmpneqps(x, y));
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+//	return As<Int4>(x86::cmpnltps(x, y));
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+//	return As<Int4>(x86::cmpnleps(x, y));
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value, y.value), Int4::getType()));
+}
+
+RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value, y.value), Int4::getType()));
+}
+
+RValue<Float4> Round(RValue<Float4> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		return x86::roundps(x, 0);
+	}
+	else
+	{
+		return Float4(RoundInt(x));
+	}
+#else
+	return RValue<Float4>(V(lowerRound(V(x.value))));
+#endif
+}
+
+RValue<Float4> Trunc(RValue<Float4> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		return x86::roundps(x, 3);
+	}
+	else
+	{
+		return Float4(Int4(x));
+	}
+#else
+	return RValue<Float4>(V(lowerTrunc(V(x.value))));
+#endif
+}
+
+RValue<Float4> Frac(RValue<Float4> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Float4 frc;
+
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		frc = x - Floor(x);
+	}
+	else
+	{
+		frc = x - Float4(Int4(x));   // Signed fractional part.
+
+		frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f)));   // Add 1.0 if negative.
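+		// CmpNLE(0.0f, frc) yields an all-ones lane mask where frc is negative;
+		// ANDing it with the bit pattern of 1.0f selects 1.0f for exactly those lanes.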
+	}
+#else
+	frc = x - Floor(x);
+#endif
+
+	// x - floor(x) can be 1.0 for very small negative x.
+	// Clamp against the value just below 1.0.
+	return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
+}
+
+RValue<Float4> Floor(RValue<Float4> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		return x86::floorps(x);
+	}
+	else
+	{
+		return x - Frac(x);
+	}
+#else
+	return RValue<Float4>(V(lowerFloor(V(x.value))));
+#endif
+}
+
+RValue<Float4> Ceil(RValue<Float4> x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	if(CPUID::supportsSSE4_1())
+	{
+		return x86::ceilps(x);
+	}
+	else
+#endif
+	{
+		return -Floor(-x);
+	}
+}
+
+RValue<Float4> Sin(RValue<Float4> v)
+{
+	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sin, { V(v.value)->getType() } );
+	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
+}
+
+RValue<Float4> Cos(RValue<Float4> v)
+{
+	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cos, { V(v.value)->getType() } );
+	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
+}
+
+RValue<Float4> Tan(RValue<Float4> v)
+{
+	return Sin(v) / Cos(v);
+}
+
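+// Applies the named scalar libm function (e.g. "asinf") to each of the four
+// lanes and reassembles the results into a Float4.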
+static RValue<Float4> TransformFloat4PerElement(RValue<Float4> v, const char* name)
+{
+	auto funcTy = ::llvm::FunctionType::get(T(Float::getType()), ::llvm::ArrayRef<llvm::Type*>(T(Float::getType())), false);
+	auto func = jit->module->getOrInsertFunction(name, funcTy);
+	llvm::Value *out = ::llvm::UndefValue::get(T(Float4::getType()));
+	for (uint64_t i = 0; i < 4; i++)
+	{
+		auto el = jit->builder->CreateCall(func, V(Nucleus::createExtractElement(v.value, Float::getType(), i)));
+		out = V(Nucleus::createInsertElement(V(out), V(el), i));
+	}
+	return RValue<Float4>(V(out));
+}
+
+RValue<Float4> Asin(RValue<Float4> v)
+{
+	return TransformFloat4PerElement(v, "asinf");
+}
+
+RValue<Float4> Acos(RValue<Float4> v)
+{
+	return TransformFloat4PerElement(v, "acosf");
+}
+
+RValue<Float4> Atan(RValue<Float4> v)
+{
+	return TransformFloat4PerElement(v, "atanf");
+}
+
+RValue<Float4> Sinh(RValue<Float4> v)
+{
+	return Float4(0.5f) * (Exp(v) - Exp(-v));
+}
+
+RValue<Float4> Cosh(RValue<Float4> v)
+{
+	return Float4(0.5f) * (Exp(v) + Exp(-v));
+}
+
+RValue<Float4> Tanh(RValue<Float4> v)
+{
+	return TransformFloat4PerElement(v, "tanhf");
+}
+
+RValue<Float4> Asinh(RValue<Float4> v)
+{
+	return TransformFloat4PerElement(v, "asinhf");
+}
+
+RValue<Float4> Acosh(RValue<Float4> v)
+{
+	return TransformFloat4PerElement(v, "acoshf");
+}
+
+RValue<Float4> Atanh(RValue<Float4> v)
+{
+	return TransformFloat4PerElement(v, "atanhf");
+}
+
+RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
+{
+	::llvm::SmallVector<::llvm::Type*, 2> paramTys;
+	paramTys.push_back(T(Float::getType()));
+	paramTys.push_back(T(Float::getType()));
+	auto funcTy = ::llvm::FunctionType::get(T(Float::getType()), paramTys, false);
+	auto func = jit->module->getOrInsertFunction("atan2f", funcTy);
+	llvm::Value *out = ::llvm::UndefValue::get(T(Float4::getType()));
+	for (uint64_t i = 0; i < 4; i++)
+	{
+		auto el = jit->builder->CreateCall2(func, ARGS(
+				V(Nucleus::createExtractElement(x.value, Float::getType(), i)),
+				V(Nucleus::createExtractElement(y.value, Float::getType(), i))
+			));
+		out = V(Nucleus::createInsertElement(V(out), V(el), i));
+	}
+	return RValue<Float4>(V(out));
+}
+
+RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
+{
+	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::pow, { T(Float4::getType()) });
+	return RValue<Float4>(V(jit->builder->CreateCall2(func, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Float4> Exp(RValue<Float4> v)
+{
+	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp, { T(Float4::getType()) } );
+	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
+}
+
+RValue<Float4> Log(RValue<Float4> v)
+{
+	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log, { T(Float4::getType()) } );
+	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
+}
+
+RValue<Float4> Exp2(RValue<Float4> v)
+{
+	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float4::getType()) } );
+	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
+}
+
+RValue<Float4> Log2(RValue<Float4> v)
+{
+	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float4::getType()) } );
+	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
+}
+
+RValue<UInt> Ctlz(RValue<UInt> v, bool isZeroUndef)
+{
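+	// The second argument is LLVM's 'is_zero_undef' flag: when true, a zero input
+	// yields an undefined result, which allows a cheaper instruction sequence.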
+	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt::getType()) } );
+	return RValue<UInt>(V(jit->builder->CreateCall2(func, ARGS(
+		V(v.value),
+		isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)
+	))));
+}
+
+RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef)
+{
+	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt4::getType()) } );
+	return RValue<UInt4>(V(jit->builder->CreateCall2(func, ARGS(
+		V(v.value),
+		isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)
+	))));
+}
+
+RValue<UInt> Cttz(RValue<UInt> v, bool isZeroUndef)
+{
+	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt::getType()) } );
+	return RValue<UInt>(V(jit->builder->CreateCall2(func, ARGS(
+		V(v.value),
+		isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)
+	))));
+}
+
+RValue<UInt4> Cttz(RValue<UInt4> v, bool isZeroUndef)
+{
+	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt4::getType()) } );
+	return RValue<UInt4>(V(jit->builder->CreateCall2(func, ARGS(
+		V(v.value),
+		isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)
+	))));
+}
+
+Type *Float4::getType()
+{
+	return T(llvm::VectorType::get(T(Float::getType()), 4));
+}
+
+RValue<Long> Ticks()
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::readcyclecounter);
+
+	return RValue<Long>(V(jit->builder->CreateCall(rdtsc)));
+}
+
+RValue<Pointer<Byte>> ConstantPointer(void const * ptr)
+{
+	// Note: this should work for 32-bit pointers as well because 'inttoptr'
+	// is defined to truncate (and zero extend) if necessary.
+	auto ptrAsInt = ::llvm::ConstantInt::get(::llvm::Type::getInt64Ty(jit->context), reinterpret_cast<uintptr_t>(ptr));
+	return RValue<Pointer<Byte>>(V(jit->builder->CreateIntToPtr(ptrAsInt, T(Pointer<Byte>::getType()))));
+}
+
+RValue<Pointer<Byte>> ConstantData(void const * data, size_t size)
+{
+	auto str = ::llvm::StringRef(reinterpret_cast<const char*>(data), size);
+	auto ptr = jit->builder->CreateGlobalStringPtr(str);
+	return RValue<Pointer<Byte>>(V(ptr));
+}
+
+Value* Call(RValue<Pointer<Byte>> fptr, Type* retTy, std::initializer_list<Value*> args, std::initializer_list<Type*> argTys)
+{
+	::llvm::SmallVector<::llvm::Type*, 8> paramTys;
+	for (auto ty : argTys) { paramTys.push_back(T(ty)); }
+	auto funcTy = ::llvm::FunctionType::get(T(retTy), paramTys, false);
+
+	auto funcPtrTy = funcTy->getPointerTo();
+	auto funcPtr = jit->builder->CreatePointerCast(V(fptr.value), funcPtrTy);
+
+	::llvm::SmallVector<::llvm::Value*, 8> arguments;
+	for (auto arg : args) { arguments.push_back(V(arg)); }
+	return V(jit->builder->CreateCall(funcPtr, arguments));
+}
+
+void Breakpoint()
+{
+	llvm::Function *debugtrap = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::debugtrap);
+
+	jit->builder->CreateCall(debugtrap);
+}
+
+}  // namespace rr
+
+namespace rr {
+
+#if defined(__i386__) || defined(__x86_64__)
+namespace x86 {
+
+RValue<Int> cvtss2si(RValue<Float> val)
+{
+	llvm::Function *cvtss2si = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_cvtss2si);
+
+	Float4 vector;
+	vector.x = val;
+
+	return RValue<Int>(V(jit->builder->CreateCall(cvtss2si, ARGS(V(RValue<Float4>(vector).value)))));
+}
+
+RValue<Int4> cvtps2dq(RValue<Float4> val)
+{
+	llvm::Function *cvtps2dq = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_cvtps2dq);
+
+	return RValue<Int4>(V(jit->builder->CreateCall(cvtps2dq, ARGS(V(val.value)))));
+}
+
+RValue<Float> rcpss(RValue<Float> val)
+{
+	llvm::Function *rcpss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rcp_ss);
+
+	Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
+
+	return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(rcpss, ARGS(V(vector)))), Float::getType(), 0));
+}
+
+RValue<Float> sqrtss(RValue<Float> val)
+{
+	llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sqrt, {V(val.value)->getType()});
+	return RValue<Float>(V(jit->builder->CreateCall(sqrt, ARGS(V(val.value)))));
+}
+
+RValue<Float> rsqrtss(RValue<Float> val)
+{
+	llvm::Function *rsqrtss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rsqrt_ss);
+
+	Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
+
+	return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(rsqrtss, ARGS(V(vector)))), Float::getType(), 0));
+}
+
+RValue<Float4> rcpps(RValue<Float4> val)
+{
+	llvm::Function *rcpps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rcp_ps);
+
+	return RValue<Float4>(V(jit->builder->CreateCall(rcpps, ARGS(V(val.value)))));
+}
+
+RValue<Float4> sqrtps(RValue<Float4> val)
+{
+	llvm::Function *sqrtps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sqrt, {V(val.value)->getType()});
+
+	return RValue<Float4>(V(jit->builder->CreateCall(sqrtps, ARGS(V(val.value)))));
+}
+
+RValue<Float4> rsqrtps(RValue<Float4> val)
+{
+	llvm::Function *rsqrtps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rsqrt_ps);
+
+	return RValue<Float4>(V(jit->builder->CreateCall(rsqrtps, ARGS(V(val.value)))));
+}
+
+RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
+{
+	llvm::Function *maxps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_max_ps);
+
+	return RValue<Float4>(V(jit->builder->CreateCall2(maxps, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
+{
+	llvm::Function *minps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_min_ps);
+
+	return RValue<Float4>(V(jit->builder->CreateCall2(minps, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Float> roundss(RValue<Float> val, unsigned char imm)
+{
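+	// The immediate selects the SSE4.1 rounding mode: 0 = nearest (even),
+	// 1 = toward -inf (floor), 2 = toward +inf (ceil), 3 = toward zero (truncate).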
+	llvm::Function *roundss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ss);
+
+	Value *undef = V(llvm::UndefValue::get(T(Float4::getType())));
+	Value *vector = Nucleus::createInsertElement(undef, val.value, 0);
+
+	return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall3(roundss, ARGS(V(undef), V(vector), V(Nucleus::createConstantInt(imm))))), Float::getType(), 0));
+}
+
+RValue<Float> floorss(RValue<Float> val)
+{
+	return roundss(val, 1);
+}
+
+RValue<Float> ceilss(RValue<Float> val)
+{
+	return roundss(val, 2);
+}
+
+RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
+{
+	llvm::Function *roundps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ps);
+
+	return RValue<Float4>(V(jit->builder->CreateCall2(roundps, ARGS(V(val.value), V(Nucleus::createConstantInt(imm))))));
+}
+
+RValue<Float4> floorps(RValue<Float4> val)
+{
+	return roundps(val, 1);
+}
+
+RValue<Float4> ceilps(RValue<Float4> val)
+{
+	return roundps(val, 2);
+}
+
+RValue<Int4> pabsd(RValue<Int4> x)
+{
+	return RValue<Int4>(V(lowerPABS(V(x.value))));
+}
+
+RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return As<Short4>(V(lowerPSADDSAT(V(x.value), V(y.value))));
+	#else
+		llvm::Function *paddsw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_padds_w);
+
+		return As<Short4>(V(jit->builder->CreateCall2(paddsw, ARGS(V(x.value), V(y.value)))));
+	#endif
+}
+
+RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return As<Short4>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
+	#else
+		llvm::Function *psubsw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubs_w);
+
+		return As<Short4>(V(jit->builder->CreateCall2(psubsw, ARGS(V(x.value), V(y.value)))));
+	#endif
+}
+
+RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return As<UShort4>(V(lowerPUADDSAT(V(x.value), V(y.value))));
+	#else
+		llvm::Function *paddusw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_paddus_w);
+
+		return As<UShort4>(V(jit->builder->CreateCall2(paddusw, ARGS(V(x.value), V(y.value)))));
+	#endif
+}
+
+RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return As<UShort4>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
+	#else
+		llvm::Function *psubusw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubus_w);
+
+		return As<UShort4>(V(jit->builder->CreateCall2(psubusw, ARGS(V(x.value), V(y.value)))));
+	#endif
+}
+
+RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return As<SByte8>(V(lowerPSADDSAT(V(x.value), V(y.value))));
+	#else
+		llvm::Function *paddsb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_padds_b);
+
+		return As<SByte8>(V(jit->builder->CreateCall2(paddsb, ARGS(V(x.value), V(y.value)))));
+	#endif
+}
+
+RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return As<SByte8>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
+	#else
+		llvm::Function *psubsb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubs_b);
+
+		return As<SByte8>(V(jit->builder->CreateCall2(psubsb, ARGS(V(x.value), V(y.value)))));
+	#endif
+}
+
+RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return As<Byte8>(V(lowerPUADDSAT(V(x.value), V(y.value))));
+	#else
+		llvm::Function *paddusb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_paddus_b);
+
+		return As<Byte8>(V(jit->builder->CreateCall2(paddusb, ARGS(V(x.value), V(y.value)))));
+	#endif
+}
+
+RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
+{
+	#if LLVM_VERSION_MAJOR >= 8
+		return As<Byte8>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
+	#else
+		llvm::Function *psubusb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubus_b);
+
+		return As<Byte8>(V(jit->builder->CreateCall2(psubusb, ARGS(V(x.value), V(y.value)))));
+	#endif
+}
+
+RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
+{
+	return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
+}
+
+RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
+{
+	return As<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
+}
+
+RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
+{
+	return As<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
+}
+
+RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
+{
+	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
+}
+
+RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
+{
+	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
+}
+
+RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
+{
+	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
+}
+
+RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
+{
+	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
+}
+
+RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
+{
+	llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packssdw_128);
+
+	return As<Short4>(V(jit->builder->CreateCall2(packssdw, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
+{
+	llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packssdw_128);
+
+	return RValue<Short8>(V(jit->builder->CreateCall2(packssdw, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
+{
+	llvm::Function *packsswb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packsswb_128);
+
+	return As<SByte8>(V(jit->builder->CreateCall2(packsswb, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
+{
+	llvm::Function *packuswb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packuswb_128);
+
+	return As<Byte8>(V(jit->builder->CreateCall2(packuswb, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
+{
+	if(CPUID::supportsSSE4_1())
+	{
+		llvm::Function *packusdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_packusdw);
+
+		return RValue<UShort8>(V(jit->builder->CreateCall2(packusdw, ARGS(V(x.value), V(y.value)))));
+	}
+	else
+	{
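+		// Without SSE4.1 there is no unsigned saturating pack. Clamp negative values
+		// to zero (x & ~(x >> 31) keeps x only when the sign bit is clear), bias into
+		// the signed 16-bit range, pack with signed saturation, then undo the bias.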
+		RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
+		RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
+
+		return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
+	}
+}
+
+RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
+{
+	llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_w);
+
+	return As<UShort4>(V(jit->builder->CreateCall2(psrlw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
+{
+	llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_w);
+
+	return RValue<UShort8>(V(jit->builder->CreateCall2(psrlw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
+{
+	llvm::Function *psraw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_w);
+
+	return As<Short4>(V(jit->builder->CreateCall2(psraw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
+{
+	llvm::Function *psraw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_w);
+
+	return RValue<Short8>(V(jit->builder->CreateCall2(psraw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
+{
+	llvm::Function *psllw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_w);
+
+	return As<Short4>(V(jit->builder->CreateCall2(psllw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
+{
+	llvm::Function *psllw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_w);
+
+	return RValue<Short8>(V(jit->builder->CreateCall2(psllw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
+{
+	llvm::Function *pslld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_d);
+
+	return As<Int2>(V(jit->builder->CreateCall2(pslld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
+{
+	llvm::Function *pslld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_d);
+
+	return RValue<Int4>(V(jit->builder->CreateCall2(pslld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
+{
+	llvm::Function *psrad = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_d);
+
+	return As<Int2>(V(jit->builder->CreateCall2(psrad, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
+{
+	llvm::Function *psrad = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_d);
+
+	return RValue<Int4>(V(jit->builder->CreateCall2(psrad, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
+{
+	llvm::Function *psrld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_d);
+
+	return As<UInt2>(V(jit->builder->CreateCall2(psrld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
+{
+	llvm::Function *psrld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_d);
+
+	return RValue<UInt4>(V(jit->builder->CreateCall2(psrld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
+}
+
+RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
+{
+	return RValue<Int4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
+}
+
+RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
+{
+	return RValue<Int4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
+}
+
+RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
+{
+	return RValue<UInt4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_UGT)));
+}
+
+RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
+{
+	return RValue<UInt4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_ULT)));
+}
+
+RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
+{
+	llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulh_w);
+
+	return As<Short4>(V(jit->builder->CreateCall2(pmulhw, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
+{
+	llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulhu_w);
+
+	return As<UShort4>(V(jit->builder->CreateCall2(pmulhuw, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
+{
+	llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmadd_wd);
+
+	return As<Int2>(V(jit->builder->CreateCall2(pmaddwd, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
+{
+	llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulh_w);
+
+	return RValue<Short8>(V(jit->builder->CreateCall2(pmulhw, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
+{
+	llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulhu_w);
+
+	return RValue<UShort8>(V(jit->builder->CreateCall2(pmulhuw, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
+{
+	llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmadd_wd);
+
+	return RValue<Int4>(V(jit->builder->CreateCall2(pmaddwd, ARGS(V(x.value), V(y.value)))));
+}
+
+RValue<Int> movmskps(RValue<Float4> x)
+{
+	llvm::Function *movmskps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_movmsk_ps);
+
+	return RValue<Int>(V(jit->builder->CreateCall(movmskps, ARGS(V(x.value)))));
+}
+
+RValue<Int> pmovmskb(RValue<Byte8> x)
+{
+	llvm::Function *pmovmskb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmovmskb_128);
+
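+	// The 128-bit pmovmskb produces 16 mask bits, but Byte8 only uses the low
+	// 8 lanes here, so mask off the bits from the upper lanes.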
+	return RValue<Int>(V(jit->builder->CreateCall(pmovmskb, ARGS(V(x.value))))) & 0xFF;
+}
+
+RValue<Int4> pmovzxbd(RValue<Byte16> x)
+{
+	return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), false)));
+}
+
+RValue<Int4> pmovsxbd(RValue<SByte16> x)
+{
+	return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), true)));
+}
+
+RValue<Int4> pmovzxwd(RValue<UShort8> x)
+{
+	return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), false)));
+}
+
+RValue<Int4> pmovsxwd(RValue<Short8> x)
+{
+	return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), true)));
+}
+
+}  // namespace x86
 #endif  // defined(__i386__) || defined(__x86_64__)
 
 #ifdef ENABLE_RR_PRINT
-	// extractAll returns a vector containing the extracted n scalar value of
-	// the vector vec.
-	static std::vector<Value*> extractAll(Value* vec, int n)
+// extractAll returns a vector containing the n extracted scalar values of
+// the vector vec.
+static std::vector<Value*> extractAll(Value* vec, int n)
+{
+	std::vector<Value*> elements;
+	elements.reserve(n);
+	for (int i = 0; i < n; i++)
 	{
-		std::vector<Value*> elements;
-		elements.reserve(n);
-		for (int i = 0; i < n; i++)
+		auto el = V(jit->builder->CreateExtractElement(V(vec), i));
+		elements.push_back(el);
+	}
+	return elements;
+}
+
+// toInt returns all the integer values in vals, each extended to a
+// native-width integer.
+static std::vector<Value*> toInt(const std::vector<Value*>& vals, bool isSigned)
+{
+	auto intTy = ::llvm::Type::getIntNTy(jit->context, sizeof(int) * 8); // Natural integer width.
+	std::vector<Value*> elements;
+	elements.reserve(vals.size());
+	for (auto v : vals)
+	{
+		if (isSigned)
 		{
-			auto el = V(jit->builder->CreateExtractElement(V(vec), i));
-			elements.push_back(el);
+			elements.push_back(V(jit->builder->CreateSExt(V(v), intTy)));
 		}
-		return elements;
+		else
+		{
+			elements.push_back(V(jit->builder->CreateZExt(V(v), intTy)));
+		}
+	}
+	return elements;
+}
+
+// toDouble returns all the float values in vals, each extended to a double.
+static std::vector<Value*> toDouble(const std::vector<Value*>& vals)
+{
+	auto doubleTy = ::llvm::Type::getDoubleTy(jit->context);
+	std::vector<Value*> elements;
+	elements.reserve(vals.size());
+	for (auto v : vals)
+	{
+		elements.push_back(V(jit->builder->CreateFPExt(V(v), doubleTy)));
+	}
+	return elements;
+}
+
+std::vector<Value*> PrintValue::Ty<Byte>::val(const RValue<Byte>& v) { return toInt({v.value}, false); }
+std::vector<Value*> PrintValue::Ty<Byte4>::val(const RValue<Byte4>& v) { return toInt(extractAll(v.value, 4), false); }
+std::vector<Value*> PrintValue::Ty<Int>::val(const RValue<Int>& v) { return toInt({v.value}, true); }
+std::vector<Value*> PrintValue::Ty<Int2>::val(const RValue<Int2>& v) { return toInt(extractAll(v.value, 2), true); }
+std::vector<Value*> PrintValue::Ty<Int4>::val(const RValue<Int4>& v) { return toInt(extractAll(v.value, 4), true); }
+std::vector<Value*> PrintValue::Ty<UInt>::val(const RValue<UInt>& v) { return toInt({v.value}, false); }
+std::vector<Value*> PrintValue::Ty<UInt2>::val(const RValue<UInt2>& v) { return toInt(extractAll(v.value, 2), false); }
+std::vector<Value*> PrintValue::Ty<UInt4>::val(const RValue<UInt4>& v) { return toInt(extractAll(v.value, 4), false); }
+std::vector<Value*> PrintValue::Ty<Short>::val(const RValue<Short>& v) { return toInt({v.value}, true); }
+std::vector<Value*> PrintValue::Ty<Short4>::val(const RValue<Short4>& v) { return toInt(extractAll(v.value, 4), true); }
+std::vector<Value*> PrintValue::Ty<UShort>::val(const RValue<UShort>& v) { return toInt({v.value}, false); }
+std::vector<Value*> PrintValue::Ty<UShort4>::val(const RValue<UShort4>& v) { return toInt(extractAll(v.value, 4), false); }
+std::vector<Value*> PrintValue::Ty<Float>::val(const RValue<Float>& v) { return toDouble({v.value}); }
+std::vector<Value*> PrintValue::Ty<Float4>::val(const RValue<Float4>& v) { return toDouble(extractAll(v.value, 4)); }
+std::vector<Value*> PrintValue::Ty<const char*>::val(const char* v) { return {V(jit->builder->CreateGlobalStringPtr(v))}; }
+
+void Printv(const char* function, const char* file, int line, const char* fmt, std::initializer_list<PrintValue> args)
+{
+	// LLVM types used below.
+	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
+	auto intTy = ::llvm::Type::getIntNTy(jit->context, sizeof(int) * 8); // Natural integer width.
+	auto i8PtrTy = ::llvm::Type::getInt8PtrTy(jit->context);
+	auto funcTy = ::llvm::FunctionType::get(i32Ty, {i8PtrTy}, true);
+
+	auto func = jit->module->getOrInsertFunction("printf", funcTy);
+
+	// Build the printf format message string.
+	std::string str;
+	if (file != nullptr) { str += (line > 0) ? "%s:%d " : "%s "; }
+	if (function != nullptr) { str += "%s "; }
+	str += fmt;
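+	// e.g. with file, line and function all provided, a format of "x = {0}"
+	// becomes "%s:%d %s x = {0}" before substitution.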
+
+	// Perform substitution on all '{n}' bracketed indices in the format
+	// message.
+	int i = 0;
+	for (const PrintValue& arg : args)
+	{
+		str = replace(str, "{" + std::to_string(i++) + "}", arg.format);
 	}
 
-	// toInt returns all the integer values in vals extended to a native width
-	// integer.
-	static std::vector<Value*> toInt(const std::vector<Value*>& vals, bool isSigned)
+	::llvm::SmallVector<::llvm::Value*, 8> vals;
+
+	// The format message is always the first argument.
+	vals.push_back(jit->builder->CreateGlobalStringPtr(str));
+
+	// Add optional file, line and function info if provided.
+	if (file != nullptr)
 	{
-		auto intTy = ::llvm::Type::getIntNTy(jit->context, sizeof(int) * 8); // Natural integer width.
-		std::vector<Value*> elements;
-		elements.reserve(vals.size());
-		for (auto v : vals)
+		vals.push_back(jit->builder->CreateGlobalStringPtr(file));
+		if (line > 0)
 		{
-			if (isSigned)
-			{
-				elements.push_back(V(jit->builder->CreateSExt(V(v), intTy)));
-			}
-			else
-			{
-				elements.push_back(V(jit->builder->CreateZExt(V(v), intTy)));
-			}
+			vals.push_back(::llvm::ConstantInt::get(intTy, line));
 		}
-		return elements;
+	}
+	if (function != nullptr)
+	{
+		vals.push_back(jit->builder->CreateGlobalStringPtr(function));
 	}
 
-	// toDouble returns all the float values in vals extended to doubles.
-	static std::vector<Value*> toDouble(const std::vector<Value*>& vals)
+	// Add all format arguments.
+	for (const PrintValue& arg : args)
 	{
-		auto doubleTy = ::llvm::Type::getDoubleTy(jit->context);
-		std::vector<Value*> elements;
-		elements.reserve(vals.size());
-		for (auto v : vals)
+		for (auto val : arg.values)
 		{
-			elements.push_back(V(jit->builder->CreateFPExt(V(v), doubleTy)));
+			vals.push_back(V(val));
 		}
-		return elements;
 	}
 
-	std::vector<Value*> PrintValue::Ty<Byte>::val(const RValue<Byte>& v) { return toInt({v.value}, false); }
-	std::vector<Value*> PrintValue::Ty<Byte4>::val(const RValue<Byte4>& v) { return toInt(extractAll(v.value, 4), false); }
-	std::vector<Value*> PrintValue::Ty<Int>::val(const RValue<Int>& v) { return toInt({v.value}, true); }
-	std::vector<Value*> PrintValue::Ty<Int2>::val(const RValue<Int2>& v) { return toInt(extractAll(v.value, 2), true); }
-	std::vector<Value*> PrintValue::Ty<Int4>::val(const RValue<Int4>& v) { return toInt(extractAll(v.value, 4), true); }
-	std::vector<Value*> PrintValue::Ty<UInt>::val(const RValue<UInt>& v) { return toInt({v.value}, false); }
-	std::vector<Value*> PrintValue::Ty<UInt2>::val(const RValue<UInt2>& v) { return toInt(extractAll(v.value, 2), false); }
-	std::vector<Value*> PrintValue::Ty<UInt4>::val(const RValue<UInt4>& v) { return toInt(extractAll(v.value, 4), false); }
-	std::vector<Value*> PrintValue::Ty<Short>::val(const RValue<Short>& v) { return toInt({v.value}, true); }
-	std::vector<Value*> PrintValue::Ty<Short4>::val(const RValue<Short4>& v) { return toInt(extractAll(v.value, 4), true); }
-	std::vector<Value*> PrintValue::Ty<UShort>::val(const RValue<UShort>& v) { return toInt({v.value}, false); }
-	std::vector<Value*> PrintValue::Ty<UShort4>::val(const RValue<UShort4>& v) { return toInt(extractAll(v.value, 4), false); }
-	std::vector<Value*> PrintValue::Ty<Float>::val(const RValue<Float>& v) { return toDouble({v.value}); }
-	std::vector<Value*> PrintValue::Ty<Float4>::val(const RValue<Float4>& v) { return toDouble(extractAll(v.value, 4)); }
-	std::vector<Value*> PrintValue::Ty<const char*>::val(const char* v) { return {V(jit->builder->CreateGlobalStringPtr(v))}; }
-
-	void Printv(const char* function, const char* file, int line, const char* fmt, std::initializer_list<PrintValue> args)
-	{
-		// LLVM types used below.
-		auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
-		auto intTy = ::llvm::Type::getIntNTy(jit->context, sizeof(int) * 8); // Natural integer width.
-		auto i8PtrTy = ::llvm::Type::getInt8PtrTy(jit->context);
-		auto funcTy = ::llvm::FunctionType::get(i32Ty, {i8PtrTy}, true);
-
-		auto func = jit->module->getOrInsertFunction("printf", funcTy);
-
-		// Build the printf format message string.
-		std::string str;
-		if (file != nullptr) { str += (line > 0) ? "%s:%d " : "%s "; }
-		if (function != nullptr) { str += "%s "; }
-		str += fmt;
-
-		// Perform subsitution on all '{n}' bracketed indices in the format
-		// message.
-		int i = 0;
-		for (const PrintValue& arg : args)
-		{
-			str = replace(str, "{" + std::to_string(i++) + "}", arg.format);
-		}
-
-		::llvm::SmallVector<::llvm::Value*, 8> vals;
-
-		// The format message is always the first argument.
-		vals.push_back(jit->builder->CreateGlobalStringPtr(str));
-
-		// Add optional file, line and function info if provided.
-		if (file != nullptr)
-		{
-			vals.push_back(jit->builder->CreateGlobalStringPtr(file));
-			if (line > 0)
-			{
-				vals.push_back(::llvm::ConstantInt::get(intTy, line));
-			}
-		}
-		if (function != nullptr)
-		{
-			vals.push_back(jit->builder->CreateGlobalStringPtr(function));
-		}
-
-		// Add all format arguments.
-		for (const PrintValue& arg : args)
-		{
-			for (auto val : arg.values)
-			{
-				vals.push_back(V(val));
-			}
-		}
-
-		jit->builder->CreateCall(func, vals);
-	}
+	jit->builder->CreateCall(func, vals);
+}
 #endif // ENABLE_RR_PRINT
 
-	void Nop()
-	{
-		auto voidTy = ::llvm::Type::getVoidTy(jit->context);
-		auto funcTy = ::llvm::FunctionType::get(voidTy, {}, false);
-		auto func = jit->module->getOrInsertFunction("nop", funcTy);
-		jit->builder->CreateCall(func);
-	}
+void Nop()
+{
+	auto voidTy = ::llvm::Type::getVoidTy(jit->context);
+	auto funcTy = ::llvm::FunctionType::get(voidTy, {}, false);
+	auto func = jit->module->getOrInsertFunction("nop", funcTy);
+	jit->builder->CreateCall(func);
+}
 
-	void EmitDebugLocation()
-	{
+void EmitDebugLocation()
+{
 #ifdef ENABLE_RR_DEBUG_INFO
-		if (jit->debugInfo != nullptr)
-		{
-			jit->debugInfo->EmitLocation();
-		}
-#endif // ENABLE_RR_DEBUG_INFO
-	}
-
-	void EmitDebugVariable(Value* value)
+	if (jit->debugInfo != nullptr)
 	{
-#ifdef ENABLE_RR_DEBUG_INFO
-		if (jit->debugInfo != nullptr)
-		{
-			jit->debugInfo->EmitVariable(value);
-		}
-#endif // ENABLE_RR_DEBUG_INFO
+		jit->debugInfo->EmitLocation();
 	}
+#endif // ENABLE_RR_DEBUG_INFO
+}
 
-	void FlushDebug()
+void EmitDebugVariable(Value* value)
+{
+#ifdef ENABLE_RR_DEBUG_INFO
+	if (jit->debugInfo != nullptr)
 	{
-#ifdef ENABLE_RR_DEBUG_INFO
-		if (jit->debugInfo != nullptr)
-		{
-			jit->debugInfo->Flush();
-		}
-#endif // ENABLE_RR_DEBUG_INFO
+		jit->debugInfo->EmitVariable(value);
 	}
+#endif // ENABLE_RR_DEBUG_INFO
+}
 
-} // namespace rr
+void FlushDebug()
+{
+#ifdef ENABLE_RR_DEBUG_INFO
+	if (jit->debugInfo != nullptr)
+	{
+		jit->debugInfo->Flush();
+	}
+#endif // ENABLE_RR_DEBUG_INFO
+}
+
+}  // namespace rr
 
 // ------------------------------  Coroutines ------------------------------
 
 namespace {
-	// Magic values retuned by llvm.coro.suspend.
-	// See: https://llvm.org/docs/Coroutines.html#llvm-coro-suspend-intrinsic
-	enum SuspendAction
-	{
-		SuspendActionSuspend = -1,
-		SuspendActionResume = 0,
-		SuspendActionDestroy = 1
-	};
 
+// Magic values returned by llvm.coro.suspend.
+// See: https://llvm.org/docs/Coroutines.html#llvm-coro-suspend-intrinsic
+enum SuspendAction
+{
+	SuspendActionSuspend = -1,
+	SuspendActionResume = 0,
+	SuspendActionDestroy = 1
+};
 
 void promoteFunctionToCoroutine()
 {
diff --git a/src/Reactor/LLVMReactor.hpp b/src/Reactor/LLVMReactor.hpp
index 4ff5274..bbf3332 100644
--- a/src/Reactor/LLVMReactor.hpp
+++ b/src/Reactor/LLVMReactor.hpp
@@ -15,38 +15,40 @@
 #ifndef rr_LLVMReactor_hpp
 #define rr_LLVMReactor_hpp
 
-namespace llvm
+namespace llvm {
+
+class Type;
+class Value;
+
+}  // namespace llvm
+
+namespace rr {
+
+class Type;
+class Value;
+
+llvm::Type *T(Type *t);
+
+inline Type *T(llvm::Type *t)
 {
-	class Type;
-	class Value;
+	return reinterpret_cast<Type*>(t);
 }
 
-namespace rr
+inline llvm::Value *V(Value *t)
 {
-	class Type;
-	class Value;
-
-	llvm::Type *T(Type *t);
-
-	inline Type *T(llvm::Type *t)
-	{
-		return reinterpret_cast<Type*>(t);
-	}
-
-	inline llvm::Value *V(Value *t)
-	{
-		return reinterpret_cast<llvm::Value*>(t);
-	}
-
-	inline Value *V(llvm::Value *t)
-	{
-		return reinterpret_cast<Value*>(t);
-	}
-
-	// Emits a no-op instruction that will not be optimized away.
-	// Useful for emitting something that can have a source location without
-	// effect.
-	void Nop();
+	return reinterpret_cast<llvm::Value*>(t);
 }
 
+inline Value *V(llvm::Value *t)
+{
+	return reinterpret_cast<Value*>(t);
+}
+
+// Emits a no-op instruction that will not be optimized away.
+// Useful for emitting something that can carry a source location without
+// having any side effects.
+void Nop();
+
+}  // namespace rr
+
 #endif // rr_LLVMReactor_hpp
diff --git a/src/Reactor/LLVMReactorDebugInfo.cpp b/src/Reactor/LLVMReactorDebugInfo.cpp
index 744ae42..ee090cc 100644
--- a/src/Reactor/LLVMReactorDebugInfo.cpp
+++ b/src/Reactor/LLVMReactorDebugInfo.cpp
@@ -40,518 +40,518 @@
 #define LOG(msg, ...)
 #endif
 
-namespace
+namespace {
+
+std::pair<llvm::StringRef, llvm::StringRef> splitPath(const char* path)
 {
-	std::pair<llvm::StringRef, llvm::StringRef> splitPath(const char* path)
-	{
-		return llvm::StringRef(path).rsplit('/');
-	}
+	return llvm::StringRef(path).rsplit('/');
+}
 
-	// Note: createGDBRegistrationListener() returns a pointer to a singleton.
-	// Nothing is actually created.
-	auto jitEventListener = llvm::JITEventListener::createGDBRegistrationListener(); // guarded by jitEventListenerMutex
-	std::mutex jitEventListenerMutex;
+// Note: createGDBRegistrationListener() returns a pointer to a singleton.
+// Nothing is actually created.
+auto jitEventListener = llvm::JITEventListener::createGDBRegistrationListener(); // guarded by jitEventListenerMutex
+std::mutex jitEventListenerMutex;
 
-} // anonymous namespaces
+}  // anonymous namespace
 
-namespace rr
+namespace rr {
+
+DebugInfo::DebugInfo(
+		llvm::IRBuilder<> *builder,
+		llvm::LLVMContext *context,
+		llvm::Module *module,
+		llvm::Function *function)
+	: builder(builder), context(context), module(module), function(function)
 {
-	DebugInfo::DebugInfo(
-			llvm::IRBuilder<> *builder,
-			llvm::LLVMContext *context,
-			llvm::Module *module,
-			llvm::Function *function)
-		: builder(builder), context(context), module(module), function(function)
-	{
-		using namespace ::llvm;
+	using namespace ::llvm;
 
-		auto location = getCallerLocation();
+	auto location = getCallerLocation();
 
-		auto fileAndDir = splitPath(location.function.file.c_str());
-		diBuilder.reset(new llvm::DIBuilder(*module));
-		diCU = diBuilder->createCompileUnit(
-			llvm::dwarf::DW_LANG_C,
-			diBuilder->createFile(fileAndDir.first, fileAndDir.second),
-			"Reactor",
-			0, "", 0);
+	auto fileAndDir = splitPath(location.function.file.c_str());
+	diBuilder.reset(new llvm::DIBuilder(*module));
+	diCU = diBuilder->createCompileUnit(
+		llvm::dwarf::DW_LANG_C,
+		diBuilder->createFile(fileAndDir.first, fileAndDir.second),
+		"Reactor",
+		0, "", 0);
 
-		registerBasicTypes();
+	registerBasicTypes();
 
-		SmallVector<Metadata *, 8> EltTys;
-		auto funcTy = diBuilder->createSubroutineType(diBuilder->getOrCreateTypeArray(EltTys));
+	SmallVector<Metadata *, 8> EltTys;
+	auto funcTy = diBuilder->createSubroutineType(diBuilder->getOrCreateTypeArray(EltTys));
 
-		auto file = getOrCreateFile(location.function.file.c_str());
-		auto sp = diBuilder->createFunction(
-			file,                   // scope
-			"ReactorFunction",      // function name
-			"ReactorFunction",      // linkage
-			file,                   // file
-			location.line,          // line
-			funcTy,                 // type
-			false,                  // internal linkage
-			true,                   // definition
-			location.line,          // scope line
-			DINode::FlagPrototyped, // flags
-			false                   // is optimized
-		);
-		diSubprogram = sp;
-		function->setSubprogram(sp);
-		diRootLocation = DILocation::get(*context, location.line, 0, sp);
-		builder->SetCurrentDebugLocation(diRootLocation);
-	}
+	auto file = getOrCreateFile(location.function.file.c_str());
+	auto sp = diBuilder->createFunction(
+		file,                   // scope
+		"ReactorFunction",      // function name
+		"ReactorFunction",      // linkage
+		file,                   // file
+		location.line,          // line
+		funcTy,                 // type
+		false,                  // internal linkage
+		true,                   // definition
+		location.line,          // scope line
+		DINode::FlagPrototyped, // flags
+		false                   // is optimized
+	);
+	diSubprogram = sp;
+	function->setSubprogram(sp);
+	diRootLocation = DILocation::get(*context, location.line, 0, sp);
+	builder->SetCurrentDebugLocation(diRootLocation);
+}
 
-	DebugInfo::~DebugInfo() = default;
+DebugInfo::~DebugInfo() = default;
 
-	void DebugInfo::Finalize()
-	{
-		while (diScope.size() > 0)
-		{
-			emitPending(diScope.back(), builder);
-			diScope.pop_back();
-		}
-		diBuilder->finalize();
-	}
-
-	void DebugInfo::EmitLocation()
-	{
-		auto const& backtrace = getCallerBacktrace();
-		syncScope(backtrace);
-		builder->SetCurrentDebugLocation(getLocation(backtrace, backtrace.size() - 1));
-
-#ifdef ENABLE_RR_EMIT_PRINT_LOCATION
-		static Location lastLocation;
-		if (backtrace.size() == 0)
-		{
-			return;
-		}
-		Location currLocation = backtrace[backtrace.size() - 1];
-		if (currLocation != lastLocation)
-		{
-			rr::Print("rr> {0} [{1}:{2}]\n", currLocation.function.name.c_str(), currLocation.function.file.c_str(), currLocation.line);
-			lastLocation = std::move(currLocation);
-		}
-#endif // ENABLE_RR_EMIT_PRINT_LOCATION
-	}
-
-	void DebugInfo::Flush()
+void DebugInfo::Finalize()
+{
+	while (diScope.size() > 0)
 	{
 		emitPending(diScope.back(), builder);
+		diScope.pop_back();
+	}
+	diBuilder->finalize();
+}
+
+void DebugInfo::EmitLocation()
+{
+	auto const& backtrace = getCallerBacktrace();
+	syncScope(backtrace);
+	builder->SetCurrentDebugLocation(getLocation(backtrace, backtrace.size() - 1));
+
+#ifdef ENABLE_RR_EMIT_PRINT_LOCATION
+	static Location lastLocation;
+	if (backtrace.size() == 0)
+	{
+		return;
+	}
+	Location currLocation = backtrace[backtrace.size() - 1];
+	if (currLocation != lastLocation)
+	{
+		rr::Print("rr> {0} [{1}:{2}]\n", currLocation.function.name.c_str(), currLocation.function.file.c_str(), currLocation.line);
+		lastLocation = std::move(currLocation);
+	}
+#endif // ENABLE_RR_EMIT_PRINT_LOCATION
+}
+
+void DebugInfo::Flush()
+{
+	emitPending(diScope.back(), builder);
+}
+
+void DebugInfo::syncScope(Backtrace const& backtrace)
+{
+	auto shrink = [this](size_t newsize)
+	{
+		while (diScope.size() > newsize)
+		{
+			auto &scope = diScope.back();
+			LOG("- STACK(%d): di: %p, location: %s:%d",
+				int(diScope.size() - 1), scope.di,
+				scope.location.function.file.c_str(),
+				int(scope.location.line));
+			emitPending(scope, builder);
+			diScope.pop_back();
+		}
+	};
+
+	if (backtrace.size() < diScope.size())
+	{
+		shrink(backtrace.size());
 	}
 
-	void DebugInfo::syncScope(Backtrace const& backtrace)
+	for (size_t i = 0; i < diScope.size(); i++)
 	{
-		auto shrink = [this](size_t newsize)
-		{
-			while (diScope.size() > newsize)
-			{
-				auto &scope = diScope.back();
-				LOG("- STACK(%d): di: %p, location: %s:%d",
-					int(diScope.size() - 1), scope.di,
-					scope.location.function.file.c_str(),
-					int(scope.location.line));
-				emitPending(scope, builder);
-				diScope.pop_back();
-			}
-		};
+		auto &scope = diScope[i];
+		auto const &oldLocation = scope.location;
+		auto const &newLocation = backtrace[i];
 
-		if (backtrace.size() < diScope.size())
+		if (oldLocation.function != newLocation.function)
 		{
-			shrink(backtrace.size());
+			LOG("  STACK(%d): Changed function %s -> %s", int(i),
+				oldLocation.function.name.c_str(), newLocation.function.name.c_str());
+			shrink(i);
+			break;
 		}
 
-		for (size_t i = 0; i < diScope.size(); i++)
+		if (oldLocation.line > newLocation.line)
 		{
-			auto &scope = diScope[i];
-			auto const &oldLocation = scope.location;
-			auto const &newLocation = backtrace[i];
-
-			if (oldLocation.function != newLocation.function)
-			{
-				LOG("  STACK(%d): Changed function %s -> %s", int(i),
-					oldLocation.function.name.c_str(), newLocation.function.name.c_str());
-				shrink(i);
-				break;
-			}
-
-			if (oldLocation.line > newLocation.line)
-			{
-				// Create a new di block to shadow all the variables in the loop.
-				auto file = getOrCreateFile(newLocation.function.file.c_str());
-				auto di = diBuilder->createLexicalBlock(scope.di, file, newLocation.line, 0);
-				LOG("  STACK(%d): Jumped backwards %d -> %d. di: %p -> %p", int(i),
-					oldLocation.line, newLocation.line, scope.di, di);
-				emitPending(scope, builder);
-				scope = {newLocation, di};
-				shrink(i+1);
-				break;
-			}
-
-			scope.location = newLocation;
+			// Create a new di block to shadow all the variables in the loop.
+			auto file = getOrCreateFile(newLocation.function.file.c_str());
+			auto di = diBuilder->createLexicalBlock(scope.di, file, newLocation.line, 0);
+			LOG("  STACK(%d): Jumped backwards %d -> %d. di: %p -> %p", int(i),
+				oldLocation.line, newLocation.line, scope.di, di);
+			emitPending(scope, builder);
+			scope = {newLocation, di};
+			shrink(i+1);
+			break;
 		}
 
-		while (backtrace.size() > diScope.size())
-		{
-			auto i = diScope.size();
-			auto location = backtrace[i];
-			auto file = getOrCreateFile(location.function.file.c_str());
-			auto funcTy = diBuilder->createSubroutineType(diBuilder->getOrCreateTypeArray({}));
-
-			char buf[1024];
-			size_t size = sizeof(buf);
-			int status = 0;
-			llvm::itaniumDemangle(location.function.name.c_str(), buf, &size, &status);
-			auto name = "jit!" + (status == 0 ? std::string(buf) : location.function.name);
-
-			auto func = diBuilder->createFunction(
-				file,                           // scope
-				name,                           // function name
-				"",                             // linkage
-				file,                           // file
-				location.line,                  // line
-				funcTy,                         // type
-				false,                          // internal linkage
-				true,                           // definition
-				location.line,                  // scope line
-				llvm::DINode::FlagPrototyped,   // flags
-				false                           // is optimized
-			);
-			diScope.push_back({location, func});
-			LOG("+ STACK(%d): di: %p, location: %s:%d", int(i), di,
-				location.function.file.c_str(), int(location.line));
-		}
+		scope.location = newLocation;
 	}
 
-	llvm::DILocation* DebugInfo::getLocation(const Backtrace &backtrace, size_t i)
+	while (backtrace.size() > diScope.size())
 	{
-		if (backtrace.size() == 0) { return nullptr; }
-		assert(backtrace.size() == diScope.size());
-		return llvm::DILocation::get(
-			*context,
-			backtrace[i].line,
-			0,
-			diScope[i].di,
-			i > 0 ? getLocation(backtrace, i - 1) : diRootLocation
+		auto i = diScope.size();
+		auto location = backtrace[i];
+		auto file = getOrCreateFile(location.function.file.c_str());
+		auto funcTy = diBuilder->createSubroutineType(diBuilder->getOrCreateTypeArray({}));
+
+		char buf[1024];
+		size_t size = sizeof(buf);
+		int status = 0;
+		llvm::itaniumDemangle(location.function.name.c_str(), buf, &size, &status);
+		auto name = "jit!" + (status == 0 ? std::string(buf) : location.function.name);
+
+		auto func = diBuilder->createFunction(
+			file,                           // scope
+			name,                           // function name
+			"",                             // linkage
+			file,                           // file
+			location.line,                  // line
+			funcTy,                         // type
+			false,                          // internal linkage
+			true,                           // definition
+			location.line,                  // scope line
+			llvm::DINode::FlagPrototyped,   // flags
+			false                           // is optimized
 		);
+		diScope.push_back({location, func});
+		LOG("+ STACK(%d): di: %p, location: %s:%d", int(i), func,
+			location.function.file.c_str(), int(location.line));
 	}
+}
 
-	void DebugInfo::EmitVariable(Value *variable)
+llvm::DILocation* DebugInfo::getLocation(const Backtrace &backtrace, size_t i)
+{
+	if (backtrace.size() == 0) { return nullptr; }
+	assert(backtrace.size() == diScope.size());
+	return llvm::DILocation::get(
+		*context,
+		backtrace[i].line,
+		0,
+		diScope[i].di,
+		i > 0 ? getLocation(backtrace, i - 1) : diRootLocation
+	);
+}
+
+void DebugInfo::EmitVariable(Value *variable)
+{
+	auto const& backtrace = getCallerBacktrace();
+	syncScope(backtrace);
+
+	for (int i = backtrace.size() - 1; i >= 0; i--)
 	{
-		auto const& backtrace = getCallerBacktrace();
-		syncScope(backtrace);
-
-		for (int i = backtrace.size() - 1; i >= 0; i--)
+		auto const &location = backtrace[i];
+		auto tokens = getOrParseFileTokens(location.function.file.c_str());
+		auto tokIt = tokens->find(location.line);
+		if (tokIt == tokens->end())
 		{
-			auto const &location = backtrace[i];
-			auto tokens = getOrParseFileTokens(location.function.file.c_str());
-			auto tokIt = tokens->find(location.line);
-			if (tokIt == tokens->end())
-			{
-				break;
-			}
-			auto token = tokIt->second;
-			auto name = token.identifier;
-			if (token.kind == Token::Return)
-			{
-				// This is a:
-				//
-				//   return <expr>;
-				//
-				// Emit this expression as two variables -
-				// Once as a synthetic 'return_value' variable at this scope.
-				// Again by bubbling the expression value up the callstack as
-				// Return Value Optimizations (RVOs) are likely to carry across
-				// the value to a local without calling a constructor in
-				// statements like:
-				//
-				//   auto val = foo();
-				//
-				name = "return_value";
-			}
-
-			auto &scope = diScope[i];
-			if (scope.pending.location != location)
-			{
-				emitPending(scope, builder);
-			}
-
-			auto value = V(variable);
-			auto block = builder->GetInsertBlock();
-
-			auto insertAfter = block->size() > 0 ? &block->back() : nullptr;
-			while (insertAfter != nullptr && insertAfter->isTerminator())
-			{
-				insertAfter = insertAfter->getPrevNode();
-			}
-
-			scope.pending = Pending{};
-			scope.pending.name = name;
-			scope.pending.location = location;
-			scope.pending.diLocation = getLocation(backtrace, i);
-			scope.pending.value = value;
-			scope.pending.block = block;
-			scope.pending.insertAfter = insertAfter;
-			scope.pending.scope = scope.di;
-
-			if (token.kind == Token::Return)
-			{
-				// Insert a noop instruction so the debugger can inspect the
-				// return value before the function scope closes.
-				scope.pending.addNopOnNextLine = true;
-			}
-			else
-			{
-				break;
-			}
+			break;
 		}
-	}
-
-	void DebugInfo::emitPending(Scope &scope, IRBuilder *builder)
-	{
-		auto const &pending = scope.pending;
-		if (pending.value == nullptr)
+		auto token = tokIt->second;
+		auto name = token.identifier;
+		if (token.kind == Token::Return)
 		{
-			return;
-		}
-
-		if (!scope.symbols.emplace(pending.name).second)
-		{
-			return;
-		}
-
-		bool isAlloca = llvm::isa<llvm::AllocaInst>(pending.value);
-
-		LOG("  EMIT(%s): di: %p, location: %s:%d, isAlloca: %s", pending.name.c_str(), scope.di,
-			pending.location.function.file.c_str(), pending.location.line, isAlloca ? "true" : "false");
-
-		auto value = pending.value;
-
-		IRBuilder::InsertPointGuard guard(*builder);
-		if (pending.insertAfter != nullptr)
-		{
-			builder->SetInsertPoint(pending.block, ++pending.insertAfter->getIterator());
-		}
-		else
-		{
-			builder->SetInsertPoint(pending.block);
-		}
-		builder->SetCurrentDebugLocation(pending.diLocation);
-
-		if (!isAlloca)
-		{
-			// While insertDbgValueIntrinsic should be enough to declare a
-			// variable with no storage, variables of RValues can share the same
-			// llvm::Value, and only one can be named. Take for example:
+			// This is a:
 			//
-			//   Int a = 42;
-			//   RValue<Int> b = a;
-			//   RValue<Int> c = b;
+			//   return <expr>;
 			//
-			// To handle this, always promote named RValues to an alloca.
-
-			llvm::BasicBlock &entryBlock = function->getEntryBlock();
-			auto alloca = new llvm::AllocaInst(value->getType(), 0, pending.name);
-			entryBlock.getInstList().push_front(alloca);
-			builder->CreateStore(value, alloca);
-			value = alloca;
+			// Emit this expression as two variables -
+			// Once as a synthetic 'return_value' variable at this scope.
+			// Again by bubbling the expression value up the callstack as
+			// Return Value Optimizations (RVOs) are likely to carry across
+			// the value to a local without calling a constructor in
+			// statements like:
+			//
+			//   auto val = foo();
+			//
+			name = "return_value";
 		}
 
-		value->setName(pending.name);
-
-		auto diFile = getOrCreateFile(pending.location.function.file.c_str());
-		auto diType = getOrCreateType(value->getType()->getPointerElementType());
-		auto diVar = diBuilder->createAutoVariable(scope.di, pending.name, diFile, pending.location.line, diType);
-
-		auto di = diBuilder->insertDeclare(value, diVar, diBuilder->createExpression(), pending.diLocation, pending.block);
-		if (pending.insertAfter != nullptr) { di->moveAfter(pending.insertAfter); }
-
-		if (pending.addNopOnNextLine)
+		auto &scope = diScope[i];
+		if (scope.pending.location != location)
 		{
-			builder->SetCurrentDebugLocation(llvm::DILocation::get(
-				*context,
-				pending.diLocation->getLine() + 1,
-				0,
-				pending.diLocation->getScope(),
-				pending.diLocation->getInlinedAt()
-			));
-			Nop();
+			emitPending(scope, builder);
+		}
+
+		auto value = V(variable);
+		auto block = builder->GetInsertBlock();
+
+		auto insertAfter = block->size() > 0 ? &block->back() : nullptr;
+		while (insertAfter != nullptr && insertAfter->isTerminator())
+		{
+			insertAfter = insertAfter->getPrevNode();
 		}
 
 		scope.pending = Pending{};
-	}
+		scope.pending.name = name;
+		scope.pending.location = location;
+		scope.pending.diLocation = getLocation(backtrace, i);
+		scope.pending.value = value;
+		scope.pending.block = block;
+		scope.pending.insertAfter = insertAfter;
+		scope.pending.scope = scope.di;
 
-	void DebugInfo::NotifyObjectEmitted(const llvm::object::ObjectFile &Obj, const llvm::LoadedObjectInfo &L)
-	{
-		std::unique_lock<std::mutex> lock(jitEventListenerMutex);
-		jitEventListener->NotifyObjectEmitted(Obj, static_cast<const llvm::RuntimeDyld::LoadedObjectInfo&>(L));
-	}
-
-	void DebugInfo::NotifyFreeingObject(const llvm::object::ObjectFile &Obj)
-	{
-		std::unique_lock<std::mutex> lock(jitEventListenerMutex);
-		jitEventListener->NotifyFreeingObject(Obj);
-	}
-
-	void DebugInfo::registerBasicTypes()
-	{
-		using namespace rr;
-		using namespace llvm;
-
-		auto vec4 = diBuilder->getOrCreateArray(diBuilder->getOrCreateSubrange(0, 4));
-		auto vec8 = diBuilder->getOrCreateArray(diBuilder->getOrCreateSubrange(0, 8));
-		auto vec16 = diBuilder->getOrCreateArray(diBuilder->getOrCreateSubrange(0, 16));
-
-		diTypes.emplace(T(Bool::getType()), diBuilder->createBasicType("Bool", sizeof(bool), dwarf::DW_ATE_boolean));
-		diTypes.emplace(T(Byte::getType()), diBuilder->createBasicType("Byte", 8, dwarf::DW_ATE_unsigned_char));
-		diTypes.emplace(T(SByte::getType()), diBuilder->createBasicType("SByte", 8, dwarf::DW_ATE_signed_char));
-		diTypes.emplace(T(Short::getType()), diBuilder->createBasicType("Short", 16, dwarf::DW_ATE_signed));
-		diTypes.emplace(T(UShort::getType()), diBuilder->createBasicType("UShort", 16, dwarf::DW_ATE_unsigned));
-		diTypes.emplace(T(Int::getType()), diBuilder->createBasicType("Int", 32, dwarf::DW_ATE_signed));
-		diTypes.emplace(T(UInt::getType()), diBuilder->createBasicType("UInt", 32, dwarf::DW_ATE_unsigned));
-		diTypes.emplace(T(Long::getType()), diBuilder->createBasicType("Long", 64, dwarf::DW_ATE_signed));
-		diTypes.emplace(T(Half::getType()), diBuilder->createBasicType("Half", 16, dwarf::DW_ATE_float));
-		diTypes.emplace(T(Float::getType()), diBuilder->createBasicType("Float", 32, dwarf::DW_ATE_float));
-
-		diTypes.emplace(T(Byte4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Byte::getType())], {vec16}));
-		diTypes.emplace(T(SByte4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(SByte::getType())], {vec16}));
-		diTypes.emplace(T(Byte8::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Byte::getType())], {vec16}));
-		diTypes.emplace(T(SByte8::getType()), diBuilder->createVectorType(128, 128, diTypes[T(SByte::getType())], {vec16}));
-		diTypes.emplace(T(Byte16::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Byte::getType())], {vec16}));
-		diTypes.emplace(T(SByte16::getType()), diBuilder->createVectorType(128, 128, diTypes[T(SByte::getType())], {vec16}));
-		diTypes.emplace(T(Short2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Short::getType())], {vec8}));
-		diTypes.emplace(T(UShort2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UShort::getType())], {vec8}));
-		diTypes.emplace(T(Short4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Short::getType())], {vec8}));
-		diTypes.emplace(T(UShort4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UShort::getType())], {vec8}));
-		diTypes.emplace(T(Short8::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Short::getType())], {vec8}));
-		diTypes.emplace(T(UShort8::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UShort::getType())], {vec8}));
-		diTypes.emplace(T(Int2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Int::getType())], {vec4}));
-		diTypes.emplace(T(UInt2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UInt::getType())], {vec4}));
-		diTypes.emplace(T(Int4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Int::getType())], {vec4}));
-		diTypes.emplace(T(UInt4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UInt::getType())], {vec4}));
-		diTypes.emplace(T(Float2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Float::getType())], {vec4}));
-		diTypes.emplace(T(Float4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Float::getType())], {vec4}));
-	}
-
-	DebugInfo::Location DebugInfo::getCallerLocation() const
-	{
-		return getCallerBacktrace(1)[0];
-	}
-
-	DebugInfo::Backtrace DebugInfo::getCallerBacktrace(size_t limit /* = 0 */) const
-	{
-		auto shouldSkipFile = [](llvm::StringRef fileSR) {
-				return fileSR.empty() ||
-					fileSR.endswith_lower("ReactorDebugInfo.cpp") ||
-					fileSR.endswith_lower("Reactor.cpp") ||
-					fileSR.endswith_lower("Reactor.hpp") ||
-					fileSR.endswith_lower("stacktrace.hpp");
-		};
-
-		std::vector<DebugInfo::Location> locations;
-
-		// Note that bs::stacktrace() effectively returns a vector of addresses; bs::frame construction is where
-		// the heavy lifting is done: resolving the function name, file and line number.
-		namespace bs = boost::stacktrace;
-		for (bs::frame frame : bs::stacktrace())
+		if (token.kind == Token::Return)
 		{
-			if (shouldSkipFile(frame.source_file()))
-			{
-				continue;
-			}
+			// Insert a noop instruction so the debugger can inspect the
+			// return value before the function scope closes.
+			scope.pending.addNopOnNextLine = true;
+		}
+		else
+		{
+			break;
+		}
+	}
+}
 
-			DebugInfo::Location location;
-			location.function.file = frame.source_file();
-			location.function.name = frame.name();
-			location.line = frame.source_line();
-			locations.push_back(location);
+void DebugInfo::emitPending(Scope &scope, IRBuilder *builder)
+{
+	auto const &pending = scope.pending;
+	if (pending.value == nullptr)
+	{
+		return;
+	}
 
-			if (limit > 0 && locations.size() >= limit)
-			{
-				break;
-			}
+	if (!scope.symbols.emplace(pending.name).second)
+	{
+		return;
+	}
+
+	bool isAlloca = llvm::isa<llvm::AllocaInst>(pending.value);
+
+	LOG("  EMIT(%s): di: %p, location: %s:%d, isAlloca: %s", pending.name.c_str(), scope.di,
+		pending.location.function.file.c_str(), pending.location.line, isAlloca ? "true" : "false");
+
+	auto value = pending.value;
+
+	IRBuilder::InsertPointGuard guard(*builder);
+	if (pending.insertAfter != nullptr)
+	{
+		builder->SetInsertPoint(pending.block, ++pending.insertAfter->getIterator());
+	}
+	else
+	{
+		builder->SetInsertPoint(pending.block);
+	}
+	builder->SetCurrentDebugLocation(pending.diLocation);
+
+	if (!isAlloca)
+	{
+		// While insertDbgValueIntrinsic should be enough to declare a
+		// variable with no storage, variables of RValues can share the same
+		// llvm::Value, and only one can be named. Take for example:
+		//
+		//   Int a = 42;
+		//   RValue<Int> b = a;
+		//   RValue<Int> c = b;
+		//
+		// To handle this, always promote named RValues to an alloca.
+
+		llvm::BasicBlock &entryBlock = function->getEntryBlock();
+		auto alloca = new llvm::AllocaInst(value->getType(), 0, pending.name);
+		entryBlock.getInstList().push_front(alloca);
+		builder->CreateStore(value, alloca);
+		value = alloca;
+	}
+
+	value->setName(pending.name);
+
+	auto diFile = getOrCreateFile(pending.location.function.file.c_str());
+	auto diType = getOrCreateType(value->getType()->getPointerElementType());
+	auto diVar = diBuilder->createAutoVariable(scope.di, pending.name, diFile, pending.location.line, diType);
+
+	auto di = diBuilder->insertDeclare(value, diVar, diBuilder->createExpression(), pending.diLocation, pending.block);
+	if (pending.insertAfter != nullptr) { di->moveAfter(pending.insertAfter); }
+
+	if (pending.addNopOnNextLine)
+	{
+		builder->SetCurrentDebugLocation(llvm::DILocation::get(
+			*context,
+			pending.diLocation->getLine() + 1,
+			0,
+			pending.diLocation->getScope(),
+			pending.diLocation->getInlinedAt()
+		));
+		Nop();
+	}
+
+	scope.pending = Pending{};
+}
+
+void DebugInfo::NotifyObjectEmitted(const llvm::object::ObjectFile &Obj, const llvm::LoadedObjectInfo &L)
+{
+	std::unique_lock<std::mutex> lock(jitEventListenerMutex);
+	jitEventListener->NotifyObjectEmitted(Obj, static_cast<const llvm::RuntimeDyld::LoadedObjectInfo&>(L));
+}
+
+void DebugInfo::NotifyFreeingObject(const llvm::object::ObjectFile &Obj)
+{
+	std::unique_lock<std::mutex> lock(jitEventListenerMutex);
+	jitEventListener->NotifyFreeingObject(Obj);
+}
+
+void DebugInfo::registerBasicTypes()
+{
+	using namespace rr;
+	using namespace llvm;
+
+	auto vec4 = diBuilder->getOrCreateArray(diBuilder->getOrCreateSubrange(0, 4));
+	auto vec8 = diBuilder->getOrCreateArray(diBuilder->getOrCreateSubrange(0, 8));
+	auto vec16 = diBuilder->getOrCreateArray(diBuilder->getOrCreateSubrange(0, 16));
+
+	diTypes.emplace(T(Bool::getType()), diBuilder->createBasicType("Bool", sizeof(bool), dwarf::DW_ATE_boolean));
+	diTypes.emplace(T(Byte::getType()), diBuilder->createBasicType("Byte", 8, dwarf::DW_ATE_unsigned_char));
+	diTypes.emplace(T(SByte::getType()), diBuilder->createBasicType("SByte", 8, dwarf::DW_ATE_signed_char));
+	diTypes.emplace(T(Short::getType()), diBuilder->createBasicType("Short", 16, dwarf::DW_ATE_signed));
+	diTypes.emplace(T(UShort::getType()), diBuilder->createBasicType("UShort", 16, dwarf::DW_ATE_unsigned));
+	diTypes.emplace(T(Int::getType()), diBuilder->createBasicType("Int", 32, dwarf::DW_ATE_signed));
+	diTypes.emplace(T(UInt::getType()), diBuilder->createBasicType("UInt", 32, dwarf::DW_ATE_unsigned));
+	diTypes.emplace(T(Long::getType()), diBuilder->createBasicType("Long", 64, dwarf::DW_ATE_signed));
+	diTypes.emplace(T(Half::getType()), diBuilder->createBasicType("Half", 16, dwarf::DW_ATE_float));
+	diTypes.emplace(T(Float::getType()), diBuilder->createBasicType("Float", 32, dwarf::DW_ATE_float));
+
+	diTypes.emplace(T(Byte4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Byte::getType())], {vec16}));
+	diTypes.emplace(T(SByte4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(SByte::getType())], {vec16}));
+	diTypes.emplace(T(Byte8::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Byte::getType())], {vec16}));
+	diTypes.emplace(T(SByte8::getType()), diBuilder->createVectorType(128, 128, diTypes[T(SByte::getType())], {vec16}));
+	diTypes.emplace(T(Byte16::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Byte::getType())], {vec16}));
+	diTypes.emplace(T(SByte16::getType()), diBuilder->createVectorType(128, 128, diTypes[T(SByte::getType())], {vec16}));
+	diTypes.emplace(T(Short2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Short::getType())], {vec8}));
+	diTypes.emplace(T(UShort2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UShort::getType())], {vec8}));
+	diTypes.emplace(T(Short4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Short::getType())], {vec8}));
+	diTypes.emplace(T(UShort4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UShort::getType())], {vec8}));
+	diTypes.emplace(T(Short8::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Short::getType())], {vec8}));
+	diTypes.emplace(T(UShort8::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UShort::getType())], {vec8}));
+	diTypes.emplace(T(Int2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Int::getType())], {vec4}));
+	diTypes.emplace(T(UInt2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UInt::getType())], {vec4}));
+	diTypes.emplace(T(Int4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Int::getType())], {vec4}));
+	diTypes.emplace(T(UInt4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(UInt::getType())], {vec4}));
+	diTypes.emplace(T(Float2::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Float::getType())], {vec4}));
+	diTypes.emplace(T(Float4::getType()), diBuilder->createVectorType(128, 128, diTypes[T(Float::getType())], {vec4}));
+}
+
+DebugInfo::Location DebugInfo::getCallerLocation() const
+{
+	return getCallerBacktrace(1)[0];
+}
+
+DebugInfo::Backtrace DebugInfo::getCallerBacktrace(size_t limit /* = 0 */) const
+{
+	auto shouldSkipFile = [](llvm::StringRef fileSR) {
+			return fileSR.empty() ||
+				fileSR.endswith_lower("ReactorDebugInfo.cpp") ||
+				fileSR.endswith_lower("Reactor.cpp") ||
+				fileSR.endswith_lower("Reactor.hpp") ||
+				fileSR.endswith_lower("stacktrace.hpp");
+	};
+
+	std::vector<DebugInfo::Location> locations;
+
+	// Note that bs::stacktrace() effectively returns a vector of addresses; bs::frame construction is where
+	// the heavy lifting is done: resolving the function name, file and line number.
+	namespace bs = boost::stacktrace;
+	for (bs::frame frame : bs::stacktrace())
+	{
+		if (shouldSkipFile(frame.source_file()))
+		{
+			continue;
 		}
 
-		std::reverse(locations.begin(), locations.end());
+		DebugInfo::Location location;
+		location.function.file = frame.source_file();
+		location.function.name = frame.name();
+		location.line = frame.source_line();
+		locations.push_back(location);
 
-		return locations;
-	}
-
-	llvm::DIType *DebugInfo::getOrCreateType(llvm::Type* type)
-	{
-		auto it = diTypes.find(type);
-		if (it != diTypes.end()) { return it->second; }
-
-		if(type->isPointerTy())
+		if (limit > 0 && locations.size() >= limit)
 		{
-			auto dbgTy = diBuilder->createPointerType(
-				getOrCreateType(type->getPointerElementType()),
-				sizeof(void*)*8, alignof(void*)*8);
-			diTypes.emplace(type, dbgTy);
-			return dbgTy;
+			break;
 		}
-		llvm::errs() << "Unimplemented debug type: " << type << "\n";
-		assert(false);
-		return nullptr;
 	}
 
-	llvm::DIFile *DebugInfo::getOrCreateFile(const char* path)
+	std::reverse(locations.begin(), locations.end());
+
+	return locations;
+}
+
+llvm::DIType *DebugInfo::getOrCreateType(llvm::Type* type)
+{
+	auto it = diTypes.find(type);
+	if (it != diTypes.end()) { return it->second; }
+
+	if(type->isPointerTy())
 	{
-		auto it = diFiles.find(path);
-		if (it != diFiles.end()) { return it->second; }
-		auto dirAndName = splitPath(path);
-		auto file = diBuilder->createFile(dirAndName.second, dirAndName.first);
-		diFiles.emplace(path, file);
-		return file;
+		auto dbgTy = diBuilder->createPointerType(
+			getOrCreateType(type->getPointerElementType()),
+			sizeof(void*)*8, alignof(void*)*8);
+		diTypes.emplace(type, dbgTy);
+		return dbgTy;
+	}
+	llvm::errs() << "Unimplemented debug type: " << type << "\n";
+	assert(false);
+	return nullptr;
+}
+
+llvm::DIFile *DebugInfo::getOrCreateFile(const char* path)
+{
+	auto it = diFiles.find(path);
+	if (it != diFiles.end()) { return it->second; }
+	auto dirAndName = splitPath(path);
+	auto file = diBuilder->createFile(dirAndName.second, dirAndName.first);
+	diFiles.emplace(path, file);
+	return file;
+}
+
+DebugInfo::LineTokens const *DebugInfo::getOrParseFileTokens(const char* path)
+{
+	static std::regex reLocalDecl(
+		"^" // line start
+		"\\s*" // initial whitespace
+		"(?:For\\s*\\(\\s*)?" // optional 'For ('
+		"((?:\\w+(?:<[^>]+>)?)(?:::\\w+(?:<[^>]+>)?)*)" // type (match group 1)
+		"\\s+" // whitespace between type and name
+		"(\\w+)" // identifier (match group 2)
+		"\\s*" // whitespace after identifier
+		"(\\[.*\\])?"); // optional array suffix (match group 3)
+
+	auto it = fileTokens.find(path);
+	if (it != fileTokens.end())
+	{
+		return it->second.get();
 	}
 
-	DebugInfo::LineTokens const *DebugInfo::getOrParseFileTokens(const char* path)
+	auto tokens = std::unique_ptr<LineTokens>(new LineTokens());
+
+	std::ifstream file(path);
+	std::string line;
+	int lineCount = 0;
+	while (std::getline(file, line))
 	{
-		static std::regex reLocalDecl(
-			"^" // line start
-			"\\s*" // initial whitespace
-			"(?:For\\s*\\(\\s*)?" // optional 'For ('
-			"((?:\\w+(?:<[^>]+>)?)(?:::\\w+(?:<[^>]+>)?)*)" // type (match group 1)
-			"\\s+" // whitespace between type and name
-			"(\\w+)" // identifier (match group 2)
-			"\\s*" // whitespace after identifier
-			"(\\[.*\\])?"); // optional array suffix (match group 3)
-
-		auto it = fileTokens.find(path);
-		if (it != fileTokens.end())
+		lineCount++;
+		std::smatch match;
+		if (std::regex_search(line, match, reLocalDecl) && match.size() > 3)
 		{
-			return it->second.get();
-		}
-
-		auto tokens = std::unique_ptr<LineTokens>(new LineTokens());
-
-		std::ifstream file(path);
-		std::string line;
-		int lineCount = 0;
-		while (std::getline(file, line))
-		{
-			lineCount++;
-			std::smatch match;
-			if (std::regex_search(line, match, reLocalDecl) && match.size() > 3)
+			bool isArray = match.str(3) != "";
+			if (!isArray) // Cannot deal with C-arrays of values.
 			{
-				bool isArray = match.str(3) != "";
-				if (!isArray) // Cannot deal with C-arrays of values.
+				if (match.str(1) == "return")
 				{
-					if (match.str(1) == "return")
-					{
-						(*tokens)[lineCount] = Token{Token::Return};
-					}
-					else
-					{
-						(*tokens)[lineCount] = Token{Token::Identifier, match.str(2)};
-					}
+					(*tokens)[lineCount] = Token{Token::Return};
+				}
+				else
+				{
+					(*tokens)[lineCount] = Token{Token::Identifier, match.str(2)};
 				}
 			}
 		}
-
-		auto out = tokens.get();
-		fileTokens.emplace(path, std::move(tokens));
-		return out;
 	}
 
-} // namespace rr
+	auto out = tokens.get();
+	fileTokens.emplace(path, std::move(tokens));
+	return out;
+}
+
+}  // namespace rr
 
 #endif // ENABLE_RR_DEBUG_INFO
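
A note on the token parsing above: getOrParseFileTokens() classifies each
source line with the reLocalDecl regex, turning a leading 'return' into a
Return token and any other matched declaration into an Identifier token,
while C-array declarations are skipped. Below is a minimal standalone
sketch of that classification; the regex is copied from above, and the
sample input lines are hypothetical:

	// Sketch only: same regex as getOrParseFileTokens(), hypothetical inputs.
	#include <iostream>
	#include <regex>
	#include <string>

	int main()
	{
		static std::regex reLocalDecl(
			"^\\s*(?:For\\s*\\(\\s*)?"
			"((?:\\w+(?:<[^>]+>)?)(?:::\\w+(?:<[^>]+>)?)*)"  // type (group 1)
			"\\s+(\\w+)"                                     // identifier (group 2)
			"\\s*(\\[.*\\])?");                              // array suffix (group 3)

		for(std::string line : {"  Int x = 42;", "  return x + 1;", "  Int a[4];"})
		{
			std::smatch match;
			if(std::regex_search(line, match, reLocalDecl) && match.size() > 3)
			{
				if(match.str(3) != "") { std::cout << "array, skipped\n"; }
				else if(match.str(1) == "return") { std::cout << "Return token\n"; }
				else { std::cout << "Identifier: " << match.str(2) << "\n"; }
			}
		}
		return 0;
	}
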
diff --git a/src/Reactor/LLVMReactorDebugInfo.hpp b/src/Reactor/LLVMReactorDebugInfo.hpp
index db743d7..f97e3d0 100644
--- a/src/Reactor/LLVMReactorDebugInfo.hpp
+++ b/src/Reactor/LLVMReactorDebugInfo.hpp
@@ -25,187 +25,188 @@
 #include <memory>
 
 // Forward declarations
-namespace llvm
+namespace llvm {
+
+class BasicBlock;
+class ConstantFolder;
+class DIBuilder;
+class DICompileUnit;
+class DIFile;
+class DILocation;
+class DIScope;
+class DISubprogram;
+class DIType;
+class Function;
+class Instruction;
+class IRBuilderDefaultInserter;
+class JITEventListener;
+class LLVMContext;
+class LoadedObjectInfo;
+class Module;
+class Type;
+class Value;
+
+namespace object
 {
-	class BasicBlock;
-	class ConstantFolder;
-	class DIBuilder;
-	class DICompileUnit;
-	class DIFile;
-	class DILocation;
-	class DIScope;
-	class DISubprogram;
-	class DIType;
-	class Function;
-	class Instruction;
-	class IRBuilderDefaultInserter;
-	class JITEventListener;
-	class LLVMContext;
-	class LoadedObjectInfo;
-	class Module;
-	class Type;
-	class Value;
+	class ObjectFile;
+}
 
-	namespace object
-	{
-		class ObjectFile;
-	}
+template <typename T, typename Inserter> class IRBuilder;
 
-	template <typename T, typename Inserter> class IRBuilder;
-} // namespace llvm
+}  // namespace llvm
 
-namespace rr
+namespace rr {
+
+class Type;
+class Value;
+
+// DebugInfo generates LLVM DebugInfo IR from the C++ source that calls
+// into Reactor functions. See docs/ReactorDebugInfo.md for more information.
+class DebugInfo
 {
-	class Type;
-	class Value;
+public:
+	using IRBuilder = llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>;
 
-	// DebugInfo generates LLVM DebugInfo IR from the C++ source that calls
-	// into Reactor functions. See docs/ReactorDebugInfo.md for more information.
-	class DebugInfo
+	DebugInfo(IRBuilder *builder,
+			llvm::LLVMContext *context,
+			llvm::Module *module,
+			llvm::Function *function);
+
+	~DebugInfo();
+
+	// Finalize debug info generation. Must be called before the LLVM module
+	// is built.
+	void Finalize();
+
+	// Updates the current source location.
+	void EmitLocation();
+
+	// Binds the value to its symbol in the source file.
+// See docs/ReactorDebugInfo.md for more information.
+	void EmitVariable(Value *value);
+
+	// Forcefully flush the binding of the last variable name.
+	// Used for binding the initializer of `For` loops.
+	void Flush();
+
+	// NotifyObjectEmitted informs any attached debuggers of the JIT'd
+	// object.
+	static void NotifyObjectEmitted(const llvm::object::ObjectFile &Obj, const llvm::LoadedObjectInfo &L);
+
+	// NotifyFreeingObject informs any attached debuggers that the JIT'd
+	// object is now invalid.
+	static void NotifyFreeingObject(const llvm::object::ObjectFile &Obj);
+
+private:
+	struct Token
 	{
-	public:
-		using IRBuilder = llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>;
-
-		DebugInfo(IRBuilder *builder,
-				llvm::LLVMContext *context,
-				llvm::Module *module,
-				llvm::Function *function);
-
-		~DebugInfo();
-
-		// Finalize debug info generation. Must be called before the LLVM module
-		// is built.
-		void Finalize();
-
-		// Updates the current source location.
-		void EmitLocation();
-
-		// Binds the value to its symbol in the source file.
-		// See docs/ReactorDebugInfo.md for more information.
-		void EmitVariable(Value *value);
-
-		// Forcefully flush the binding of the last variable name.
-		// Used for binding the initializer of `For` loops.
-		void Flush();
-
-		// NotifyObjectEmitted informs any attached debuggers of the JIT'd
-		// object.
-		static void NotifyObjectEmitted(const llvm::object::ObjectFile &Obj, const llvm::LoadedObjectInfo &L);
-
-		// NotifyFreeingObject informs any attached debuggers that the JIT'd
-		// object is now invalid.
-		static void NotifyFreeingObject(const llvm::object::ObjectFile &Obj);
-
-	private:
-		struct Token
+		enum Kind
 		{
-			enum Kind
-			{
-				Identifier,
-				Return
-			};
-			Kind kind;
-			std::string identifier;
+			Identifier,
+			Return
 		};
-
-		using LineTokens = std::unordered_map<unsigned int, Token>;
-
-		struct FunctionLocation
-		{
-			std::string name;
-			std::string file;
-
-			bool operator == (const FunctionLocation &rhs) const { return name == rhs.name && file == rhs.file; }
-			bool operator != (const FunctionLocation &rhs) const { return !(*this == rhs); }
-
-			struct Hash
-			{
-				std::size_t operator()(const FunctionLocation &l) const noexcept
-				{
-					return std::hash<std::string>()(l.file) * 31 +
-							std::hash<std::string>()(l.name);
-				}
-			};
-		};
-
-		struct Location
-		{
-			FunctionLocation function;
-			unsigned int line = 0;
-
-			bool operator == (const Location &rhs) const { return function == rhs.function && line == rhs.line; }
-			bool operator != (const Location &rhs) const { return !(*this == rhs); }
-
-			struct Hash
-			{
-				std::size_t operator()(const Location &l) const noexcept
-				{
-					return FunctionLocation::Hash()(l.function) * 31 +
-							std::hash<unsigned int>()(l.line);
-				}
-			};
-		};
-
-		using Backtrace = std::vector<Location>;
-
-		struct Pending
-		{
-			std::string name;
-			Location location;
-			llvm::DILocation *diLocation = nullptr;
-			llvm::Value *value = nullptr;
-			llvm::Instruction *insertAfter = nullptr;
-			llvm::BasicBlock *block = nullptr;
-			llvm::DIScope *scope = nullptr;
-			bool addNopOnNextLine = false;
-		};
-
-		struct Scope
-		{
-			Location location;
-			llvm::DIScope *di;
-			std::unordered_set<std::string> symbols;
-			Pending pending;
-		};
-
-		void registerBasicTypes();
-
-		void emitPending(Scope &scope, IRBuilder *builder);
-
-		// Returns the source location of the non-Reactor calling function.
-		Location getCallerLocation() const;
-
-		// Returns the backtrace for the callstack, starting at the first
-		// non-Reactor file. If limit is non-zero, then a maximum of limit
-		// frames will be returned.
-		Backtrace getCallerBacktrace(size_t limit = 0) const;
-
-		llvm::DILocation* getLocation(const Backtrace &backtrace, size_t i);
-
-		llvm::DIType *getOrCreateType(llvm::Type* type);
-		llvm::DIFile *getOrCreateFile(const char* path);
-		LineTokens const *getOrParseFileTokens(const char* path);
-
-		// Synchronizes diScope with the current backtrace.
-		void syncScope(Backtrace const& backtrace);
-
-		IRBuilder *builder;
-		llvm::LLVMContext *context;
-		llvm::Module *module;
-		llvm::Function *function;
-
-		std::unique_ptr<llvm::DIBuilder> diBuilder;
-		llvm::DICompileUnit *diCU;
-		llvm::DISubprogram *diSubprogram;
-		llvm::DILocation *diRootLocation;
-		std::vector<Scope> diScope;
-		std::unordered_map<std::string, llvm::DIFile*> diFiles;
-		std::unordered_map<llvm::Type*, llvm::DIType*> diTypes;
-		std::unordered_map<std::string, std::unique_ptr<LineTokens>> fileTokens;
-		std::vector<void const*> pushed;
+		Kind kind;
+		std::string identifier;
 	};
 
-} // namespace rr
+	using LineTokens = std::unordered_map<unsigned int, Token>;
+
+	struct FunctionLocation
+	{
+		std::string name;
+		std::string file;
+
+		bool operator == (const FunctionLocation &rhs) const { return name == rhs.name && file == rhs.file; }
+		bool operator != (const FunctionLocation &rhs) const { return !(*this == rhs); }
+
+		struct Hash
+		{
+			std::size_t operator()(const FunctionLocation &l) const noexcept
+			{
+				return std::hash<std::string>()(l.file) * 31 +
+						std::hash<std::string>()(l.name);
+			}
+		};
+	};
+
+	struct Location
+	{
+		FunctionLocation function;
+		unsigned int line = 0;
+
+		bool operator == (const Location &rhs) const { return function == rhs.function && line == rhs.line; }
+		bool operator != (const Location &rhs) const { return !(*this == rhs); }
+
+		struct Hash
+		{
+			std::size_t operator()(const Location &l) const noexcept
+			{
+				return FunctionLocation::Hash()(l.function) * 31 +
+						std::hash<unsigned int>()(l.line);
+			}
+		};
+	};
+
+	using Backtrace = std::vector<Location>;
+
+	struct Pending
+	{
+		std::string name;
+		Location location;
+		llvm::DILocation *diLocation = nullptr;
+		llvm::Value *value = nullptr;
+		llvm::Instruction *insertAfter = nullptr;
+		llvm::BasicBlock *block = nullptr;
+		llvm::DIScope *scope = nullptr;
+		bool addNopOnNextLine = false;
+	};
+
+	struct Scope
+	{
+		Location location;
+		llvm::DIScope *di;
+		std::unordered_set<std::string> symbols;
+		Pending pending;
+	};
+
+	void registerBasicTypes();
+
+	void emitPending(Scope &scope, IRBuilder *builder);
+
+	// Returns the source location of the non-Reactor calling function.
+	Location getCallerLocation() const;
+
+	// Returns the backtrace for the callstack, starting at the first
+	// non-Reactor file. If limit is non-zero, then a maximum of limit
+	// frames will be returned.
+	Backtrace getCallerBacktrace(size_t limit = 0) const;
+
+	llvm::DILocation* getLocation(const Backtrace &backtrace, size_t i);
+
+	llvm::DIType *getOrCreateType(llvm::Type* type);
+	llvm::DIFile *getOrCreateFile(const char* path);
+	LineTokens const *getOrParseFileTokens(const char* path);
+
+	// Synchronizes diScope with the current backtrace.
+	void syncScope(Backtrace const& backtrace);
+
+	IRBuilder *builder;
+	llvm::LLVMContext *context;
+	llvm::Module *module;
+	llvm::Function *function;
+
+	std::unique_ptr<llvm::DIBuilder> diBuilder;
+	llvm::DICompileUnit *diCU;
+	llvm::DISubprogram *diSubprogram;
+	llvm::DILocation *diRootLocation;
+	std::vector<Scope> diScope;
+	std::unordered_map<std::string, llvm::DIFile*> diFiles;
+	std::unordered_map<llvm::Type*, llvm::DIType*> diTypes;
+	std::unordered_map<std::string, std::unique_ptr<LineTokens>> fileTokens;
+	std::vector<void const*> pushed;
+};
+
+}  // namespace rr
 
 #endif // ENABLE_RR_DEBUG_INFO
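
The FunctionLocation::Hash and Location::Hash functors above combine the
members' standard hashes with a multiply-by-31 step, the classic odd
multiplier idiom, so both structs can serve as keys in unordered
containers. A minimal sketch of the same pattern on a hypothetical
struct:

	// Sketch only: the hash-combining idiom from Location::Hash, applied
	// to a hypothetical 'Point' struct.
	#include <functional>
	#include <string>
	#include <unordered_set>

	struct Point
	{
		std::string file;
		unsigned int line = 0;

		bool operator == (const Point &rhs) const { return file == rhs.file && line == rhs.line; }

		struct Hash
		{
			std::size_t operator()(const Point &p) const noexcept
			{
				return std::hash<std::string>()(p.file) * 31 +
						std::hash<unsigned int>()(p.line);
			}
		};
	};

	// The functor is supplied as the container's hasher parameter.
	std::unordered_set<Point, Point::Hash> seen;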
 
diff --git a/src/Reactor/MutexLock.hpp b/src/Reactor/MutexLock.hpp
index 759e5d5..000819a 100644
--- a/src/Reactor/MutexLock.hpp
+++ b/src/Reactor/MutexLock.hpp
@@ -22,155 +22,157 @@
 // at the same time it's best to just have the scheduler overhead.
 #include <pthread.h>
 
-namespace rr
+namespace rr {
+
+class MutexLock
 {
-	class MutexLock
+public:
+	MutexLock()
 	{
-	public:
-		MutexLock()
-		{
-			pthread_mutex_init(&mutex, NULL);
-		}
+		pthread_mutex_init(&mutex, NULL);
+	}
 
-		~MutexLock()
-		{
-			pthread_mutex_destroy(&mutex);
-		}
+	~MutexLock()
+	{
+		pthread_mutex_destroy(&mutex);
+	}
 
-		bool attemptLock()
-		{
-			return pthread_mutex_trylock(&mutex) == 0;
-		}
+	bool attemptLock()
+	{
+		return pthread_mutex_trylock(&mutex) == 0;
+	}
 
-		void lock()
-		{
-			pthread_mutex_lock(&mutex);
-		}
+	void lock()
+	{
+		pthread_mutex_lock(&mutex);
+	}
 
-		void unlock()
-		{
-			pthread_mutex_unlock(&mutex);
-		}
+	void unlock()
+	{
+		pthread_mutex_unlock(&mutex);
+	}
 
-	private:
-		pthread_mutex_t mutex;
-	};
-}
+private:
+	pthread_mutex_t mutex;
+};
+
+}  // namespace rr
 
 #else   // !__linux__
 
 #include <atomic>
 
-namespace rr
+namespace rr {
+
+class BackoffLock
 {
-	class BackoffLock
+public:
+	BackoffLock()
 	{
-	public:
-		BackoffLock()
-		{
-			mutex = 0;
-		}
+		mutex = 0;
+	}
 
-		bool attemptLock()
+	bool attemptLock()
+	{
+		if(!isLocked())
 		{
-			if(!isLocked())
+			if(mutex.exchange(true) == false)
 			{
-				if(mutex.exchange(true) == false)
-				{
-					return true;
-				}
+				return true;
 			}
-
-			return false;
 		}
 
-		void lock()
-		{
-			int backoff = 1;
+		return false;
+	}
 
-			while(!attemptLock())
+	void lock()
+	{
+		int backoff = 1;
+
+		while(!attemptLock())
+		{
+			if(backoff <= 64)
 			{
-				if(backoff <= 64)
+				for(int i = 0; i < backoff; i++)
 				{
-					for(int i = 0; i < backoff; i++)
-					{
-						nop();
-						nop();
-						nop();
-						nop();
-						nop();
+					nop();
+					nop();
+					nop();
+					nop();
+					nop();
 
-						nop();
-						nop();
-						nop();
-						nop();
-						nop();
+					nop();
+					nop();
+					nop();
+					nop();
+					nop();
 
-						nop();
-						nop();
-						nop();
-						nop();
-						nop();
+					nop();
+					nop();
+					nop();
+					nop();
+					nop();
 
-						nop();
-						nop();
-						nop();
-						nop();
-						nop();
+					nop();
+					nop();
+					nop();
+					nop();
+					nop();
 
-						nop();
-						nop();
-						nop();
-						nop();
-						nop();
+					nop();
+					nop();
+					nop();
+					nop();
+					nop();
 
-						nop();
-						nop();
-						nop();
-						nop();
-						nop();
+					nop();
+					nop();
+					nop();
+					nop();
+					nop();
 
-						nop();
-						nop();
-						nop();
-						nop();
-						nop();
-					}
-
-					backoff *= 2;
+					nop();
+					nop();
+					nop();
+					nop();
+					nop();
 				}
-				else
-				{
-					Thread::yield();
 
-					backoff = 1;
-				}
-			};
-		}
+				backoff *= 2;
+			}
+			else
+			{
+				Thread::yield();
 
-		void unlock()
-		{
-			mutex.store(false, std::memory_order_release);
-		}
-
-		bool isLocked()
-		{
-			return mutex.load(std::memory_order_acquire);
-		}
-
-	private:
-		struct
-		{
-			// Ensure that the mutex variable is on its own 64-byte cache line to avoid false sharing
-			// Padding must be public to avoid compiler warnings
-			volatile int padding1[16];
-			std::atomic<bool> mutex;
-			volatile int padding2[15];
+				backoff = 1;
+			}
 		};
-	};
+	}
 
-	using MutexLock = BackoffLock;
-}
+	void unlock()
+	{
+		mutex.store(false, std::memory_order_release);
+	}
+
+	bool isLocked()
+	{
+		return mutex.load(std::memory_order_acquire);
+	}
+
+private:
+	struct
+	{
+		// Ensure that the mutex variable is on its own 64-byte cache line to avoid false sharing
+		// Padding must be public to avoid compiler warnings
+		volatile int padding1[16];
+		std::atomic<bool> mutex;
+		volatile int padding2[15];
+	};
+};
+
+using MutexLock = BackoffLock;
+
+}  // namespace rr
 
 #endif   // !__linux__
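
Both implementations above expose the same attemptLock()/lock()/unlock()
surface, so callers need not care whether they get the pthread wrapper or
the spinning BackoffLock. A hypothetical usage sketch, assuming
MutexLock.hpp's own dependencies (e.g. Thread::yield()) are available in
the build:

	// Sketch only: two threads guarding a counter with rr::MutexLock.
	#include "MutexLock.hpp"

	#include <thread>

	static rr::MutexLock counterLock;
	static int counter = 0;

	void increment()
	{
		counterLock.lock();    // BackoffLock spins with exponential backoff, then yields
		counter++;
		counterLock.unlock();  // BackoffLock releases via a memory_order_release store
	}

	int main()
	{
		std::thread a(increment);
		std::thread b(increment);
		a.join();
		b.join();
		return counter == 2 ? 0 : 1;
	}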
 
diff --git a/src/Reactor/Nucleus.hpp b/src/Reactor/Nucleus.hpp
index 6414780..67e990f 100644
--- a/src/Reactor/Nucleus.hpp
+++ b/src/Reactor/Nucleus.hpp
@@ -29,268 +29,269 @@
 static_assert(sizeof(short) == 2, "Reactor's 'Short' type is 16-bit, and requires the C++ 'short' to match that.");
 static_assert(sizeof(int) == 4, "Reactor's 'Int' type is 32-bit, and requires the C++ 'int' to match that.");
 
-namespace rr
+namespace rr {
+
+class Type;
+class Value;
+class SwitchCases;
+class BasicBlock;
+class Routine;
+
+// Optimization holds the optimization settings for code generation.
+class Optimization
 {
-	class Type;
-	class Value;
-	class SwitchCases;
-	class BasicBlock;
-	class Routine;
-
-	// Optimization holds the optimization settings for code generation.
-	class Optimization
+public:
+	enum class Level
 	{
-	public:
-		enum class Level
+		None,
+		Less,
+		Default,
+		Aggressive,
+	};
+
+	enum class Pass
+	{
+		Disabled,
+		InstructionCombining,
+		CFGSimplification,
+		LICM,
+		AggressiveDCE,
+		GVN,
+		Reassociate,
+		DeadStoreElimination,
+		SCCP,
+		ScalarReplAggregates,
+		EarlyCSEPass,
+
+		Count,
+	};
+
+	using Passes = std::vector<Pass>;
+
+	Optimization(Level level = Level::Default, const Passes& passes = {})
+		: level(level), passes(passes)
+	{
+		#if defined(REACTOR_DEFAULT_OPT_LEVEL)
 		{
-			None,
-			Less,
-			Default,
-			Aggressive,
-		};
-
-		enum class Pass
-		{
-			Disabled,
-			InstructionCombining,
-			CFGSimplification,
-			LICM,
-			AggressiveDCE,
-			GVN,
-			Reassociate,
-			DeadStoreElimination,
-			SCCP,
-			ScalarReplAggregates,
-			EarlyCSEPass,
-
-			Count,
-		};
-
-		using Passes = std::vector<Pass>;
-
-		Optimization(Level level = Level::Default, const Passes& passes = {})
-			: level(level), passes(passes)
-		{
-			#if defined(REACTOR_DEFAULT_OPT_LEVEL)
-			{
-				this->level = Level::REACTOR_DEFAULT_OPT_LEVEL;
-			}
-			#endif
+			this->level = Level::REACTOR_DEFAULT_OPT_LEVEL;
 		}
+		#endif
+	}
 
-		Level getLevel() const { return level; }
-		const Passes & getPasses() const { return passes; }
+	Level getLevel() const { return level; }
+	const Passes & getPasses() const { return passes; }
 
-	private:
-		Level level = Level::Default;
-		Passes passes;
-	};
+private:
+	Level level = Level::Default;
+	Passes passes;
+};
 
-	// Config holds the Reactor configuration settings.
-	class Config
+// Config holds the Reactor configuration settings.
+class Config
+{
+public:
+	// Edit holds a number of modifications to a config, that can be applied
+	// on an existing Config to produce a new Config with the specified
+	// changes.
+	class Edit
 	{
 	public:
-		// Edit holds a number of modifications to a config, that can be applied
-		// on an existing Config to produce a new Config with the specified
-		// changes.
-		class Edit
-		{
-		public:
-			static const Edit None;
+		static const Edit None;
 
-			Edit & set(Optimization::Level level) { optLevel = level; optLevelChanged = true; return *this; }
-			Edit & add(Optimization::Pass pass) { optPassEdits.push_back({ListEdit::Add, pass}); return *this; }
-			Edit & remove(Optimization::Pass pass) { optPassEdits.push_back({ListEdit::Remove, pass}); return *this; }
-			Edit & clearOptimizationPasses() { optPassEdits.push_back({ListEdit::Clear, Optimization::Pass::Disabled}); return *this; }
+		Edit & set(Optimization::Level level) { optLevel = level; optLevelChanged = true; return *this; }
+		Edit & add(Optimization::Pass pass) { optPassEdits.push_back({ListEdit::Add, pass}); return *this; }
+		Edit & remove(Optimization::Pass pass) { optPassEdits.push_back({ListEdit::Remove, pass}); return *this; }
+		Edit & clearOptimizationPasses() { optPassEdits.push_back({ListEdit::Clear, Optimization::Pass::Disabled}); return *this; }
 
-			Config apply(const Config &cfg) const;
-
-		private:
-			enum class ListEdit { Add, Remove, Clear };
-			using OptPassesEdit = std::pair<ListEdit, Optimization::Pass>;
-
-			template <typename T>
-			void apply(const std::vector<std::pair<ListEdit, T>> & edits, std::vector<T>& list) const;
-
-			Optimization::Level optLevel;
-			bool optLevelChanged = false;
-			std::vector<OptPassesEdit> optPassEdits;
-		};
-
-		Config() = default;
-		Config(const Optimization & optimization) : optimization(optimization) {}
-
-		const Optimization & getOptimization() const { return optimization; }
+		Config apply(const Config &cfg) const;
 
 	private:
-		Optimization optimization;
+		enum class ListEdit { Add, Remove, Clear };
+		using OptPassesEdit = std::pair<ListEdit, Optimization::Pass>;
+
+		template <typename T>
+		void apply(const std::vector<std::pair<ListEdit, T>> & edits, std::vector<T>& list) const;
+
+		Optimization::Level optLevel;
+		bool optLevelChanged = false;
+		std::vector<OptPassesEdit> optPassEdits;
 	};
 
-	class Nucleus
+	Config() = default;
+	Config(const Optimization & optimization) : optimization(optimization) {}
+
+	const Optimization & getOptimization() const { return optimization; }
+
+private:
+	Optimization optimization;
+};
+
+class Nucleus
+{
+public:
+	Nucleus();
+
+	virtual ~Nucleus();
+
+	// Default configuration to use when no other configuration is specified.
+	// The new configuration will be applied to subsequent reactor calls.
+	static void setDefaultConfig(const Config &cfg);
+	static void adjustDefaultConfig(const Config::Edit &cfgEdit);
+	static Config getDefaultConfig();
+
+	std::shared_ptr<Routine> acquireRoutine(const char *name, const Config::Edit &cfgEdit = Config::Edit::None);
+
+	static Value *allocateStackVariable(Type *type, int arraySize = 0);
+	static BasicBlock *createBasicBlock();
+	static BasicBlock *getInsertBlock();
+	static void setInsertBlock(BasicBlock *basicBlock);
+
+	static void createFunction(Type *ReturnType, std::vector<Type*> &Params);
+	static Value *getArgument(unsigned int index);
+
+	// Coroutines
+	using CoroutineHandle = void*;
+
+	template <typename... ARGS>
+	using CoroutineBegin = CoroutineHandle(ARGS...);
+	using CoroutineAwait = bool(CoroutineHandle, void* yieldValue);
+	using CoroutineDestroy = void(CoroutineHandle);
+
+	enum CoroutineEntries
 	{
-	public:
-		Nucleus();
-
-		virtual ~Nucleus();
-
-		// Default configuration to use when no other configuration is specified.
-		// The new configuration will be applied to subsequent reactor calls.
-		static void setDefaultConfig(const Config &cfg);
-		static void adjustDefaultConfig(const Config::Edit &cfgEdit);
-		static Config getDefaultConfig();
-
-		std::shared_ptr<Routine> acquireRoutine(const char *name, const Config::Edit &cfgEdit = Config::Edit::None);
-
-		static Value *allocateStackVariable(Type *type, int arraySize = 0);
-		static BasicBlock *createBasicBlock();
-		static BasicBlock *getInsertBlock();
-		static void setInsertBlock(BasicBlock *basicBlock);
-
-		static void createFunction(Type *ReturnType, std::vector<Type*> &Params);
-		static Value *getArgument(unsigned int index);
-
-		// Coroutines
-		using CoroutineHandle = void*;
-
-		template <typename... ARGS>
-		using CoroutineBegin = CoroutineHandle(ARGS...);
-		using CoroutineAwait = bool(CoroutineHandle, void* yieldValue);
-		using CoroutineDestroy = void(CoroutineHandle);
-
-		enum CoroutineEntries
-		{
-			CoroutineEntryBegin = 0,
-			CoroutineEntryAwait,
-			CoroutineEntryDestroy,
-			CoroutineEntryCount
-		};
-
-		static void createCoroutine(Type *ReturnType, std::vector<Type*> &Params);
-		std::shared_ptr<Routine> acquireCoroutine(const char *name, const Config::Edit &cfg = Config::Edit::None);
-		static void yield(Value*);
-
-		// Terminators
-		static void createRetVoid();
-		static void createRet(Value *V);
-		static void createBr(BasicBlock *dest);
-		static void createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse);
-
-		// Binary operators
-		static Value *createAdd(Value *lhs, Value *rhs);
-		static Value *createSub(Value *lhs, Value *rhs);
-		static Value *createMul(Value *lhs, Value *rhs);
-		static Value *createUDiv(Value *lhs, Value *rhs);
-		static Value *createSDiv(Value *lhs, Value *rhs);
-		static Value *createFAdd(Value *lhs, Value *rhs);
-		static Value *createFSub(Value *lhs, Value *rhs);
-		static Value *createFMul(Value *lhs, Value *rhs);
-		static Value *createFDiv(Value *lhs, Value *rhs);
-		static Value *createURem(Value *lhs, Value *rhs);
-		static Value *createSRem(Value *lhs, Value *rhs);
-		static Value *createFRem(Value *lhs, Value *rhs);
-		static Value *createShl(Value *lhs, Value *rhs);
-		static Value *createLShr(Value *lhs, Value *rhs);
-		static Value *createAShr(Value *lhs, Value *rhs);
-		static Value *createAnd(Value *lhs, Value *rhs);
-		static Value *createOr(Value *lhs, Value *rhs);
-		static Value *createXor(Value *lhs, Value *rhs);
-
-		// Unary operators
-		static Value *createNeg(Value *V);
-		static Value *createFNeg(Value *V);
-		static Value *createNot(Value *V);
-
-		// Memory instructions
-		static Value *createLoad(Value *ptr, Type *type, bool isVolatile = false, unsigned int alignment = 0, bool atomic = false, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createStore(Value *value, Value *ptr, Type *type, bool isVolatile = false, unsigned int alignment = 0, bool atomic = false, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex);
-
-		// Masked Load / Store instructions
-		static Value *createMaskedLoad(Value *base, Type *elementType, Value *mask, unsigned int alignment, bool zeroMaskedLanes);
-		static void createMaskedStore(Value *base, Value *value, Value *mask, unsigned int alignment);
-
-		// Barrier instructions
-		static void createFence(std::memory_order memoryOrder);
-
-		// Atomic instructions
-		static Value *createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
-		static Value *createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal);
-
-		// Cast/Conversion Operators
-		static Value *createTrunc(Value *V, Type *destType);
-		static Value *createZExt(Value *V, Type *destType);
-		static Value *createSExt(Value *V, Type *destType);
-		static Value *createFPToUI(Value *V, Type *destType);
-		static Value *createFPToSI(Value *V, Type *destType);
-		static Value *createSIToFP(Value *V, Type *destType);
-		static Value *createFPTrunc(Value *V, Type *destType);
-		static Value *createFPExt(Value *V, Type *destType);
-		static Value *createBitCast(Value *V, Type *destType);
-
-		// Compare instructions
-		static Value *createPtrEQ(Value *lhs, Value *rhs);
-		static Value *createICmpEQ(Value *lhs, Value *rhs);
-		static Value *createICmpNE(Value *lhs, Value *rhs);
-		static Value *createICmpUGT(Value *lhs, Value *rhs);
-		static Value *createICmpUGE(Value *lhs, Value *rhs);
-		static Value *createICmpULT(Value *lhs, Value *rhs);
-		static Value *createICmpULE(Value *lhs, Value *rhs);
-		static Value *createICmpSGT(Value *lhs, Value *rhs);
-		static Value *createICmpSGE(Value *lhs, Value *rhs);
-		static Value *createICmpSLT(Value *lhs, Value *rhs);
-		static Value *createICmpSLE(Value *lhs, Value *rhs);
-		static Value *createFCmpOEQ(Value *lhs, Value *rhs);
-		static Value *createFCmpOGT(Value *lhs, Value *rhs);
-		static Value *createFCmpOGE(Value *lhs, Value *rhs);
-		static Value *createFCmpOLT(Value *lhs, Value *rhs);
-		static Value *createFCmpOLE(Value *lhs, Value *rhs);
-		static Value *createFCmpONE(Value *lhs, Value *rhs);
-		static Value *createFCmpORD(Value *lhs, Value *rhs);
-		static Value *createFCmpUNO(Value *lhs, Value *rhs);
-		static Value *createFCmpUEQ(Value *lhs, Value *rhs);
-		static Value *createFCmpUGT(Value *lhs, Value *rhs);
-		static Value *createFCmpUGE(Value *lhs, Value *rhs);
-		static Value *createFCmpULT(Value *lhs, Value *rhs);
-		static Value *createFCmpULE(Value *lhs, Value *rhs);
-		static Value *createFCmpUNE(Value *lhs, Value *rhs);
-
-		// Vector instructions
-		static Value *createExtractElement(Value *vector, Type *type, int index);
-		static Value *createInsertElement(Value *vector, Value *element, int index);
-		static Value *createShuffleVector(Value *V1, Value *V2, const int *select);
-
-		// Other instructions
-		static Value *createSelect(Value *C, Value *ifTrue, Value *ifFalse);
-		static SwitchCases *createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases);
-		static void addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch);
-		static void createUnreachable();
-
-		// Constant values
-		static Value *createNullValue(Type *type);
-		static Value *createConstantLong(int64_t i);
-		static Value *createConstantInt(int i);
-		static Value *createConstantInt(unsigned int i);
-		static Value *createConstantBool(bool b);
-		static Value *createConstantByte(signed char i);
-		static Value *createConstantByte(unsigned char i);
-		static Value *createConstantShort(short i);
-		static Value *createConstantShort(unsigned short i);
-		static Value *createConstantFloat(float x);
-		static Value *createNullPointer(Type *type);
-		static Value *createConstantVector(const int64_t *constants, Type *type);
-		static Value *createConstantVector(const double *constants, Type *type);
-
-		static Type *getPointerType(Type *elementType);
+		CoroutineEntryBegin = 0,
+		CoroutineEntryAwait,
+		CoroutineEntryDestroy,
+		CoroutineEntryCount
 	};
-}
+
+	static void createCoroutine(Type *ReturnType, std::vector<Type*> &Params);
+	std::shared_ptr<Routine> acquireCoroutine(const char *name, const Config::Edit &cfg = Config::Edit::None);
+	static void yield(Value*);
+
+	// Terminators
+	static void createRetVoid();
+	static void createRet(Value *V);
+	static void createBr(BasicBlock *dest);
+	static void createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse);
+
+	// Binary operators
+	static Value *createAdd(Value *lhs, Value *rhs);
+	static Value *createSub(Value *lhs, Value *rhs);
+	static Value *createMul(Value *lhs, Value *rhs);
+	static Value *createUDiv(Value *lhs, Value *rhs);
+	static Value *createSDiv(Value *lhs, Value *rhs);
+	static Value *createFAdd(Value *lhs, Value *rhs);
+	static Value *createFSub(Value *lhs, Value *rhs);
+	static Value *createFMul(Value *lhs, Value *rhs);
+	static Value *createFDiv(Value *lhs, Value *rhs);
+	static Value *createURem(Value *lhs, Value *rhs);
+	static Value *createSRem(Value *lhs, Value *rhs);
+	static Value *createFRem(Value *lhs, Value *rhs);
+	static Value *createShl(Value *lhs, Value *rhs);
+	static Value *createLShr(Value *lhs, Value *rhs);
+	static Value *createAShr(Value *lhs, Value *rhs);
+	static Value *createAnd(Value *lhs, Value *rhs);
+	static Value *createOr(Value *lhs, Value *rhs);
+	static Value *createXor(Value *lhs, Value *rhs);
+
+	// Unary operators
+	static Value *createNeg(Value *V);
+	static Value *createFNeg(Value *V);
+	static Value *createNot(Value *V);
+
+	// Memory instructions
+	static Value *createLoad(Value *ptr, Type *type, bool isVolatile = false, unsigned int alignment = 0, bool atomic = false, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createStore(Value *value, Value *ptr, Type *type, bool isVolatile = false, unsigned int alignment = 0, bool atomic = false, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex);
+
+	// Masked Load / Store instructions
+	static Value *createMaskedLoad(Value *base, Type *elementType, Value *mask, unsigned int alignment, bool zeroMaskedLanes);
+	static void createMaskedStore(Value *base, Value *value, Value *mask, unsigned int alignment);
+
+	// Barrier instructions
+	static void createFence(std::memory_order memoryOrder);
+
+	// Atomic instructions
+	static Value *createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
+	static Value *createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal);
+
+	// Cast/Conversion Operators
+	static Value *createTrunc(Value *V, Type *destType);
+	static Value *createZExt(Value *V, Type *destType);
+	static Value *createSExt(Value *V, Type *destType);
+	static Value *createFPToUI(Value *V, Type *destType);
+	static Value *createFPToSI(Value *V, Type *destType);
+	static Value *createSIToFP(Value *V, Type *destType);
+	static Value *createFPTrunc(Value *V, Type *destType);
+	static Value *createFPExt(Value *V, Type *destType);
+	static Value *createBitCast(Value *V, Type *destType);
+
+	// Compare instructions
+	static Value *createPtrEQ(Value *lhs, Value *rhs);
+	static Value *createICmpEQ(Value *lhs, Value *rhs);
+	static Value *createICmpNE(Value *lhs, Value *rhs);
+	static Value *createICmpUGT(Value *lhs, Value *rhs);
+	static Value *createICmpUGE(Value *lhs, Value *rhs);
+	static Value *createICmpULT(Value *lhs, Value *rhs);
+	static Value *createICmpULE(Value *lhs, Value *rhs);
+	static Value *createICmpSGT(Value *lhs, Value *rhs);
+	static Value *createICmpSGE(Value *lhs, Value *rhs);
+	static Value *createICmpSLT(Value *lhs, Value *rhs);
+	static Value *createICmpSLE(Value *lhs, Value *rhs);
+	static Value *createFCmpOEQ(Value *lhs, Value *rhs);
+	static Value *createFCmpOGT(Value *lhs, Value *rhs);
+	static Value *createFCmpOGE(Value *lhs, Value *rhs);
+	static Value *createFCmpOLT(Value *lhs, Value *rhs);
+	static Value *createFCmpOLE(Value *lhs, Value *rhs);
+	static Value *createFCmpONE(Value *lhs, Value *rhs);
+	static Value *createFCmpORD(Value *lhs, Value *rhs);
+	static Value *createFCmpUNO(Value *lhs, Value *rhs);
+	static Value *createFCmpUEQ(Value *lhs, Value *rhs);
+	static Value *createFCmpUGT(Value *lhs, Value *rhs);
+	static Value *createFCmpUGE(Value *lhs, Value *rhs);
+	static Value *createFCmpULT(Value *lhs, Value *rhs);
+	static Value *createFCmpULE(Value *lhs, Value *rhs);
+	static Value *createFCmpUNE(Value *lhs, Value *rhs);
+
+	// Vector instructions
+	static Value *createExtractElement(Value *vector, Type *type, int index);
+	static Value *createInsertElement(Value *vector, Value *element, int index);
+	static Value *createShuffleVector(Value *V1, Value *V2, const int *select);
+
+	// Other instructions
+	static Value *createSelect(Value *C, Value *ifTrue, Value *ifFalse);
+	static SwitchCases *createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases);
+	static void addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch);
+	static void createUnreachable();
+
+	// Constant values
+	static Value *createNullValue(Type *type);
+	static Value *createConstantLong(int64_t i);
+	static Value *createConstantInt(int i);
+	static Value *createConstantInt(unsigned int i);
+	static Value *createConstantBool(bool b);
+	static Value *createConstantByte(signed char i);
+	static Value *createConstantByte(unsigned char i);
+	static Value *createConstantShort(short i);
+	static Value *createConstantShort(unsigned short i);
+	static Value *createConstantFloat(float x);
+	static Value *createNullPointer(Type *type);
+	static Value *createConstantVector(const int64_t *constants, Type *type);
+	static Value *createConstantVector(const double *constants, Type *type);
+
+	static Type *getPointerType(Type *elementType);
+};
+
+}  // namespace rr
 
 #endif   // rr_Nucleus_hpp
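
Config::Edit above acts as a deferred diff over a Config: set() records
an optimization-level change, add()/remove()/clearOptimizationPasses()
record pass-list edits, and apply() materializes them into a new Config.
A hypothetical usage sketch against the declarations above:

	// Sketch only: builds an Edit and applies it two ways.
	#include "Nucleus.hpp"

	void tuneReactor()
	{
		rr::Config::Edit edit;
		edit.set(rr::Optimization::Level::Aggressive)   // replace the level
		    .add(rr::Optimization::Pass::EarlyCSEPass)  // append a pass
		    .remove(rr::Optimization::Pass::LICM);      // drop a pass

		// Produce a new Config from an existing one...
		rr::Config tuned = edit.apply(rr::Config());

		// ...or adjust the defaults used by subsequent reactor calls.
		rr::Nucleus::adjustDefaultConfig(edit);

		(void)tuned;
	}
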
diff --git a/src/Reactor/Optimizer.cpp b/src/Reactor/Optimizer.cpp
index 5d89878..7cc3540 100644
--- a/src/Reactor/Optimizer.cpp
+++ b/src/Reactor/Optimizer.cpp
@@ -19,819 +19,821 @@
 
 #include <vector>
 
-namespace
+namespace {
+
+class Optimizer
 {
-	class Optimizer
+public:
+	void run(Ice::Cfg *function);
+
+private:
+	void analyzeUses(Ice::Cfg *function);
+	void eliminateDeadCode();
+	void eliminateUnitializedLoads();
+	void eliminateLoadsFollowingSingleStore();
+	void optimizeStoresInSingleBasicBlock();
+
+	void replace(Ice::Inst *instruction, Ice::Operand *newValue);
+	void deleteInstruction(Ice::Inst *instruction);
+	bool isDead(Ice::Inst *instruction);
+
+	static const Ice::InstIntrinsicCall *asLoadSubVector(const Ice::Inst *instruction);
+	static const Ice::InstIntrinsicCall *asStoreSubVector(const Ice::Inst *instruction);
+	static bool isLoad(const Ice::Inst &instruction);
+	static bool isStore(const Ice::Inst &instruction);
+	static Ice::Operand *storeAddress(const Ice::Inst *instruction);
+	static Ice::Operand *loadAddress(const Ice::Inst *instruction);
+	static Ice::Operand *storeData(const Ice::Inst *instruction);
+	static std::size_t storeSize(const Ice::Inst *instruction);
+	static bool loadTypeMatchesStore(const Ice::Inst *load, const Ice::Inst *store);
+
+	Ice::Cfg *function;
+	Ice::GlobalContext *context;
+
+	struct Uses : std::vector<Ice::Inst*>
 	{
-	public:
-		void run(Ice::Cfg *function);
+		bool areOnlyLoadStore() const;
+		void insert(Ice::Operand *value, Ice::Inst *instruction);
+		void erase(Ice::Inst *instruction);
 
-	private:
-		void analyzeUses(Ice::Cfg *function);
-		void eliminateDeadCode();
-		void eliminateUnitializedLoads();
-		void eliminateLoadsFollowingSingleStore();
-		void optimizeStoresInSingleBasicBlock();
-
-		void replace(Ice::Inst *instruction, Ice::Operand *newValue);
-		void deleteInstruction(Ice::Inst *instruction);
-		bool isDead(Ice::Inst *instruction);
-
-		static const Ice::InstIntrinsicCall *asLoadSubVector(const Ice::Inst *instruction);
-		static const Ice::InstIntrinsicCall *asStoreSubVector(const Ice::Inst *instruction);
-		static bool isLoad(const Ice::Inst &instruction);
-		static bool isStore(const Ice::Inst &instruction);
-		static Ice::Operand *storeAddress(const Ice::Inst *instruction);
-		static Ice::Operand *loadAddress(const Ice::Inst *instruction);
-		static Ice::Operand *storeData(const Ice::Inst *instruction);
-		static std::size_t storeSize(const Ice::Inst *instruction);
-		static bool loadTypeMatchesStore(const Ice::Inst *load, const Ice::Inst *store);
-
-		Ice::Cfg *function;
-		Ice::GlobalContext *context;
-
-		struct Uses : std::vector<Ice::Inst*>
-		{
-			bool areOnlyLoadStore() const;
-			void insert(Ice::Operand *value, Ice::Inst *instruction);
-			void erase(Ice::Inst *instruction);
-
-			std::vector<Ice::Inst*> loads;
-			std::vector<Ice::Inst*> stores;
-		};
-
-		struct LoadStoreInst
-		{
-			LoadStoreInst(Ice::Inst* inst, bool isStore)
-			  : inst(inst),
-			    address(isStore ? storeAddress(inst) : loadAddress(inst)),
-			    isStore(isStore)
-			{
-			}
-
-			Ice::Inst* inst;
-			Ice::Operand *address;
-			bool isStore;
-		};
-
-		Optimizer::Uses* getUses(Ice::Operand*);
-		void setUses(Ice::Operand*, Optimizer::Uses*);
-		bool hasUses(Ice::Operand*) const;
-
-		Ice::CfgNode* getNode(Ice::Inst*);
-		void setNode(Ice::Inst*, Ice::CfgNode*);
-
-		Ice::Inst* getDefinition(Ice::Variable*);
-		void setDefinition(Ice::Variable*, Ice::Inst*);
-
-		const std::vector<LoadStoreInst>& getLoadStoreInsts(Ice::CfgNode*);
-		void setLoadStoreInsts(Ice::CfgNode*, std::vector<LoadStoreInst>*);
-		bool hasLoadStoreInsts(Ice::CfgNode* node) const;
-
-		std::vector<Optimizer::Uses*> allocatedUses;
+		std::vector<Ice::Inst*> loads;
+		std::vector<Ice::Inst*> stores;
 	};
 
-	void Optimizer::run(Ice::Cfg *function)
+	struct LoadStoreInst
 	{
-		this->function = function;
-		this->context = function->getContext();
-
-		analyzeUses(function);
-
-		eliminateDeadCode();
-		eliminateUnitializedLoads();
-		eliminateLoadsFollowingSingleStore();
-		optimizeStoresInSingleBasicBlock();
-		eliminateDeadCode();
-
-		for(auto uses : allocatedUses)
+		LoadStoreInst(Ice::Inst* inst, bool isStore)
+		  : inst(inst),
+		    address(isStore ? storeAddress(inst) : loadAddress(inst)),
+		    isStore(isStore)
 		{
-			delete uses;
 		}
-		allocatedUses.clear();
-	}
 
-	void Optimizer::eliminateDeadCode()
+		Ice::Inst* inst;
+		Ice::Operand *address;
+		bool isStore;
+	};
+
+	Optimizer::Uses* getUses(Ice::Operand*);
+	void setUses(Ice::Operand*, Optimizer::Uses*);
+	bool hasUses(Ice::Operand*) const;
+
+	Ice::CfgNode* getNode(Ice::Inst*);
+	void setNode(Ice::Inst*, Ice::CfgNode*);
+
+	Ice::Inst* getDefinition(Ice::Variable*);
+	void setDefinition(Ice::Variable*, Ice::Inst*);
+
+	const std::vector<LoadStoreInst>& getLoadStoreInsts(Ice::CfgNode*);
+	void setLoadStoreInsts(Ice::CfgNode*, std::vector<LoadStoreInst>*);
+	bool hasLoadStoreInsts(Ice::CfgNode* node) const;
+
+	std::vector<Optimizer::Uses*> allocatedUses;
+};
+
+void Optimizer::run(Ice::Cfg *function)
+{
+	this->function = function;
+	this->context = function->getContext();
+
+	analyzeUses(function);
+
+	eliminateDeadCode();
+	eliminateUnitializedLoads();
+	eliminateLoadsFollowingSingleStore();
+	optimizeStoresInSingleBasicBlock();
+	eliminateDeadCode();
+
+	for(auto uses : allocatedUses)
 	{
-		bool modified;
-		do
+		delete uses;
+	}
+	allocatedUses.clear();
+}
+
+void Optimizer::eliminateDeadCode()
+{
+	bool modified;
+	do
+	{
+		modified = false;
+		for(Ice::CfgNode *basicBlock : function->getNodes())
 		{
-			modified = false;
-			for(Ice::CfgNode *basicBlock : function->getNodes())
+			for(Ice::Inst &inst : Ice::reverse_range(basicBlock->getInsts()))
 			{
-				for(Ice::Inst &inst : Ice::reverse_range(basicBlock->getInsts()))
+				if(inst.isDeleted())
+				{
+					continue;
+				}
+
+				if(isDead(&inst))
+				{
+					deleteInstruction(&inst);
+					modified = true;
+				}
+			}
+		}
+	}
+	while(modified);
+}
+
+void Optimizer::eliminateUnitializedLoads()
+{
+	Ice::CfgNode *entryBlock = function->getEntryNode();
+
+	for(Ice::Inst &alloca : entryBlock->getInsts())
+	{
+		if(alloca.isDeleted())
+		{
+			continue;
+		}
+
+		if(!llvm::isa<Ice::InstAlloca>(alloca))
+		{
+			break;   // Allocas are all at the top
+		}
+
+		Ice::Operand *address = alloca.getDest();
+
+		if(!hasUses(address))
+		{
+			continue;
+		}
+
+		const auto &addressUses = *getUses(address);
+
+		if(!addressUses.areOnlyLoadStore())
+		{
+			continue;
+		}
+
+		if(addressUses.stores.empty())
+		{
+			for(Ice::Inst *load : addressUses.loads)
+			{
+				Ice::Variable *loadData = load->getDest();
+
+				if(hasUses(loadData))
+				{
+					for(Ice::Inst *use : *getUses(loadData))
+					{
+						for(Ice::SizeT i = 0; i < use->getSrcSize(); i++)
+						{
+							if(use->getSrc(i) == loadData)
+							{
+								auto *undef = context->getConstantUndef(loadData->getType());
+
+								use->replaceSource(i, undef);
+							}
+						}
+					}
+
+					setUses(loadData, nullptr);
+				}
+
+				load->setDeleted();
+			}
+
+			alloca.setDeleted();
+			setUses(address, nullptr);
+		}
+	}
+}
+
+void Optimizer::eliminateLoadsFollowingSingleStore()
+{
+	Ice::CfgNode *entryBlock = function->getEntryNode();
+
+	for(Ice::Inst &alloca : entryBlock->getInsts())
+	{
+		if(alloca.isDeleted())
+		{
+			continue;
+		}
+
+		if(!llvm::isa<Ice::InstAlloca>(alloca))
+		{
+			break;   // Allocas are all at the top
+		}
+
+		Ice::Operand *address = alloca.getDest();
+
+		if(!hasUses(address))
+		{
+			continue;
+		}
+
+		auto &addressUses = *getUses(address);
+
+		if(!addressUses.areOnlyLoadStore())
+		{
+			continue;
+		}
+
+		if(addressUses.stores.size() == 1)
+		{
+			Ice::Inst *store = addressUses.stores[0];
+			Ice::Operand *storeValue = storeData(store);
+
+			for(Ice::Inst *load = &*++store->getIterator(), *next = nullptr; load != next; next = load, load = &*++store->getIterator())
+			{
+				if(load->isDeleted() || !isLoad(*load))
+				{
+					continue;
+				}
+
+				if(loadAddress(load) != address)
+				{
+					continue;
+				}
+
+				if(!loadTypeMatchesStore(load, store))
+				{
+					continue;
+				}
+
+				replace(load, storeValue);
+
+				for(size_t i = 0; i < addressUses.loads.size(); i++)
+				{
+					if(addressUses.loads[i] == load)
+					{
+						addressUses.loads[i] = addressUses.loads.back();
+						addressUses.loads.pop_back();
+						break;
+					}
+				}
+
+				for(size_t i = 0; i < addressUses.size(); i++)
+				{
+					if(addressUses[i] == load)
+					{
+						addressUses[i] = addressUses.back();
+						addressUses.pop_back();
+						break;
+					}
+				}
+
+				if(addressUses.size() == 1)
+				{
+					assert(addressUses[0] == store);
+
+					alloca.setDeleted();
+					store->setDeleted();
+					setUses(address, nullptr);
+
+					if(hasUses(storeValue))
+					{
+						auto &valueUses = *getUses(storeValue);
+
+						for(size_t i = 0; i < valueUses.size(); i++)
+						{
+							if(valueUses[i] == store)
+							{
+								valueUses[i] = valueUses.back();
+								valueUses.pop_back();
+								break;
+							}
+						}
+
+						if(valueUses.empty())
+						{
+							setUses(storeValue, nullptr);
+						}
+					}
+
+					break;
+				}
+			}
+		}
+	}
+}
+
+void Optimizer::optimizeStoresInSingleBasicBlock()
+{
+	Ice::CfgNode *entryBlock = function->getEntryNode();
+
+	std::vector<std::vector<LoadStoreInst>* > allocatedVectors;
+
+	for(Ice::Inst &alloca : entryBlock->getInsts())
+	{
+		if(alloca.isDeleted())
+		{
+			continue;
+		}
+
+		if(!llvm::isa<Ice::InstAlloca>(alloca))
+		{
+			break;   // Allocas are all at the top
+		}
+
+		Ice::Operand *address = alloca.getDest();
+
+		if(!hasUses(address))
+		{
+			continue;
+		}
+
+		const auto &addressUses = *getUses(address);
+
+		if(!addressUses.areOnlyLoadStore())
+		{
+			continue;
+		}
+
+		Ice::CfgNode *singleBasicBlock = getNode(addressUses.stores[0]);
+
+		for(size_t i = 1; i < addressUses.stores.size(); i++)
+		{
+			Ice::Inst *store = addressUses.stores[i];
+			if(getNode(store) != singleBasicBlock)
+			{
+				singleBasicBlock = nullptr;
+				break;
+			}
+		}
+
+		if(singleBasicBlock)
+		{
+			if(!hasLoadStoreInsts(singleBasicBlock))
+			{
+				std::vector<LoadStoreInst>* loadStoreInstVector = new std::vector<LoadStoreInst>();
+				setLoadStoreInsts(singleBasicBlock, loadStoreInstVector);
+				allocatedVectors.push_back(loadStoreInstVector);
+				for(Ice::Inst &inst : singleBasicBlock->getInsts())
 				{
 					if(inst.isDeleted())
 					{
 						continue;
 					}
 
-					if(isDead(&inst))
+					bool isStoreInst = isStore(inst);
+					bool isLoadInst = isLoad(inst);
+
+					if(isStoreInst || isLoadInst)
 					{
-						deleteInstruction(&inst);
-						modified = true;
+						loadStoreInstVector->push_back(LoadStoreInst(&inst, isStoreInst));
 					}
 				}
 			}
-		}
-		while(modified);
-	}
 
-	void Optimizer::eliminateUnitializedLoads()
-	{
-		Ice::CfgNode *entryBlock = function->getEntryNode();
+			Ice::Inst *store = nullptr;
+			Ice::Operand *storeValue = nullptr;
+			bool unmatchedLoads = false;
 
-		for(Ice::Inst &alloca : entryBlock->getInsts())
-		{
-			if(alloca.isDeleted())
+			for (auto& loadStoreInst : getLoadStoreInsts(singleBasicBlock))
 			{
-				continue;
-			}
+				Ice::Inst* inst = loadStoreInst.inst;
 
-			if(!llvm::isa<Ice::InstAlloca>(alloca))
-			{
-				break;   // Allocas are all at the top
-			}
-
-			Ice::Operand *address = alloca.getDest();
-
-			if(!hasUses(address))
-			{
-				continue;
-			}
-
-			const auto &addressUses = *getUses(address);
-
-			if(!addressUses.areOnlyLoadStore())
-			{
-				continue;
-			}
-
-			if(addressUses.stores.empty())
-			{
-				for(Ice::Inst *load : addressUses.loads)
-				{
-					Ice::Variable *loadData = load->getDest();
-
-					if(hasUses(loadData))
-					{
-						for(Ice::Inst *use : *getUses(loadData))
-						{
-							for(Ice::SizeT i = 0; i < use->getSrcSize(); i++)
-							{
-								if(use->getSrc(i) == loadData)
-								{
-									auto *undef = context->getConstantUndef(loadData->getType());
-
-									use->replaceSource(i, undef);
-								}
-							}
-						}
-
-						setUses(loadData, nullptr);
-					}
-
-					load->setDeleted();
-				}
-
-				alloca.setDeleted();
-				setUses(address, nullptr);
-			}
-		}
-	}
-
-	void Optimizer::eliminateLoadsFollowingSingleStore()
-	{
-		Ice::CfgNode *entryBlock = function->getEntryNode();
-
-		for(Ice::Inst &alloca : entryBlock->getInsts())
-		{
-			if(alloca.isDeleted())
-			{
-				continue;
-			}
-
-			if(!llvm::isa<Ice::InstAlloca>(alloca))
-			{
-				break;   // Allocas are all at the top
-			}
-
-			Ice::Operand *address = alloca.getDest();
-
-			if(!hasUses(address))
-			{
-				continue;
-			}
-
-			auto &addressUses = *getUses(address);
-
-			if(!addressUses.areOnlyLoadStore())
-			{
-				continue;
-			}
-
-			if(addressUses.stores.size() == 1)
-			{
-				Ice::Inst *store = addressUses.stores[0];
-				Ice::Operand *storeValue = storeData(store);
-
-				for(Ice::Inst *load = &*++store->getIterator(), *next = nullptr; load != next; next = load, load = &*++store->getIterator())
-				{
-					if(load->isDeleted() || !isLoad(*load))
-					{
-						continue;
-					}
-
-					if(loadAddress(load) != address)
-					{
-						continue;
-					}
-
-					if(!loadTypeMatchesStore(load, store))
-					{
-						continue;
-					}
-
-					replace(load, storeValue);
-
-					for(size_t i = 0; i < addressUses.loads.size(); i++)
-					{
-						if(addressUses.loads[i] == load)
-						{
-							addressUses.loads[i] = addressUses.loads.back();
-							addressUses.loads.pop_back();
-							break;
-						}
-					}
-
-					for(size_t i = 0; i < addressUses.size(); i++)
-					{
-						if(addressUses[i] == load)
-						{
-							addressUses[i] = addressUses.back();
-							addressUses.pop_back();
-							break;
-						}
-					}
-
-					if(addressUses.size() == 1)
-					{
-						assert(addressUses[0] == store);
-
-						alloca.setDeleted();
-						store->setDeleted();
-						setUses(address, nullptr);
-
-						if(hasUses(storeValue))
-						{
-							auto &valueUses = *getUses(storeValue);
-
-							for(size_t i = 0; i < valueUses.size(); i++)
-							{
-								if(valueUses[i] == store)
-								{
-									valueUses[i] = valueUses.back();
-									valueUses.pop_back();
-									break;
-								}
-							}
-
-							if(valueUses.empty())
-							{
-								setUses(storeValue, nullptr);
-							}
-						}
-
-						break;
-					}
-				}
-			}
-		}
-	}
-
-	void Optimizer::optimizeStoresInSingleBasicBlock()
-	{
-		Ice::CfgNode *entryBlock = function->getEntryNode();
-
-		std::vector<std::vector<LoadStoreInst>* > allocatedVectors;
-
-		for(Ice::Inst &alloca : entryBlock->getInsts())
-		{
-			if(alloca.isDeleted())
-			{
-				continue;
-			}
-
-			if(!llvm::isa<Ice::InstAlloca>(alloca))
-			{
-				break;   // Allocas are all at the top
-			}
-
-			Ice::Operand *address = alloca.getDest();
-
-			if(!hasUses(address))
-			{
-				continue;
-			}
-
-			const auto &addressUses = *getUses(address);
-
-			if(!addressUses.areOnlyLoadStore())
-			{
-				continue;
-			}
-
-			Ice::CfgNode *singleBasicBlock = getNode(addressUses.stores[0]);
-
-			for(size_t i = 1; i < addressUses.stores.size(); i++)
-			{
-				Ice::Inst *store = addressUses.stores[i];
-				if(getNode(store) != singleBasicBlock)
-				{
-					singleBasicBlock = nullptr;
-					break;
-				}
-			}
-
-			if(singleBasicBlock)
-			{
-				if(!hasLoadStoreInsts(singleBasicBlock))
-				{
-					std::vector<LoadStoreInst>* loadStoreInstVector = new std::vector<LoadStoreInst>();
-					setLoadStoreInsts(singleBasicBlock, loadStoreInstVector);
-					allocatedVectors.push_back(loadStoreInstVector);
-					for(Ice::Inst &inst : singleBasicBlock->getInsts())
-					{
-						if(inst.isDeleted())
-						{
-							continue;
-						}
-
-						bool isStoreInst = isStore(inst);
-						bool isLoadInst = isLoad(inst);
-
-						if(isStoreInst || isLoadInst)
-						{
-							loadStoreInstVector->push_back(LoadStoreInst(&inst, isStoreInst));
-						}
-					}
-				}
-
-				Ice::Inst *store = nullptr;
-				Ice::Operand *storeValue = nullptr;
-				bool unmatchedLoads = false;
-
-				for (auto& loadStoreInst : getLoadStoreInsts(singleBasicBlock))
-				{
-					Ice::Inst* inst = loadStoreInst.inst;
-
-					if((loadStoreInst.address != address) || inst->isDeleted())
-					{
-						continue;
-					}
-
-					if(loadStoreInst.isStore)
-					{
-						// New store found. If we had a previous one, try to eliminate it.
-						if(store && !unmatchedLoads)
-						{
-							// If the previous store is wider than the new one, we can't eliminate it
-							// because there could be a wide load reading its non-overwritten data.
-							if(storeSize(inst) >= storeSize(store))
-							{
-								deleteInstruction(store);
-							}
-						}
-
-						store = inst;
-						storeValue = storeData(store);
-						unmatchedLoads = false;
-					}
-					else
-					{
-						if(!loadTypeMatchesStore(inst, store))
-						{
-							unmatchedLoads = true;
-							continue;
-						}
-
-						replace(inst, storeValue);
-					}
-				}
-			}
-		}
-
-		for(auto loadStoreInstVector : allocatedVectors)
-		{
-			delete loadStoreInstVector;
-		}
-	}
-
-	void Optimizer::analyzeUses(Ice::Cfg *function)
-	{
-		for(Ice::CfgNode *basicBlock : function->getNodes())
-		{
-			for(Ice::Inst &instruction : basicBlock->getInsts())
-			{
-				if(instruction.isDeleted())
+				if((loadStoreInst.address != address) || inst->isDeleted())
 				{
 					continue;
 				}
 
-				setNode(&instruction, basicBlock);
-				if(instruction.getDest())
+				if(loadStoreInst.isStore)
 				{
-					setDefinition(instruction.getDest(), &instruction);
-				}
-
-				for(Ice::SizeT i = 0; i < instruction.getSrcSize(); i++)
-				{
-					Ice::SizeT unique = 0;
-					for(; unique < i; unique++)
+					// New store found. If we had a previous one, try to eliminate it.
+					if(store && !unmatchedLoads)
 					{
-						if(instruction.getSrc(i) == instruction.getSrc(unique))
+						// If the previous store is wider than the new one, we can't eliminate it
+						// because there could be a wide load reading its non-overwritten data.
+						if(storeSize(inst) >= storeSize(store))
 						{
-							break;
+							deleteInstruction(store);
 						}
 					}
 
-					if(i == unique)
+					store = inst;
+					storeValue = storeData(store);
+					unmatchedLoads = false;
+				}
+				else
+				{
+					if(!loadTypeMatchesStore(inst, store))
 					{
-						Ice::Operand *src = instruction.getSrc(i);
-						getUses(src)->insert(src, &instruction);
+						unmatchedLoads = true;
+						continue;
 					}
+
+					replace(inst, storeValue);
 				}
 			}
 		}
 	}
 
-	void Optimizer::replace(Ice::Inst *instruction, Ice::Operand *newValue)
+	for(auto loadStoreInstVector : allocatedVectors)
 	{
-		Ice::Variable *oldValue = instruction->getDest();
+		delete loadStoreInstVector;
+	}
+}
 
-		if(!newValue)
+void Optimizer::analyzeUses(Ice::Cfg *function)
+{
+	for(Ice::CfgNode *basicBlock : function->getNodes())
+	{
+		for(Ice::Inst &instruction : basicBlock->getInsts())
 		{
-			newValue = context->getConstantUndef(oldValue->getType());
-		}
-
-		if(hasUses(oldValue))
-		{
-			for(Ice::Inst *use : *getUses(oldValue))
+			if(instruction.isDeleted())
 			{
-				assert(!use->isDeleted());   // Should have been removed from uses already
+				continue;
+			}
 
-				for(Ice::SizeT i = 0; i < use->getSrcSize(); i++)
+			setNode(&instruction, basicBlock);
+			if(instruction.getDest())
+			{
+				setDefinition(instruction.getDest(), &instruction);
+			}
+
+			for(Ice::SizeT i = 0; i < instruction.getSrcSize(); i++)
+			{
+				Ice::SizeT unique = 0;
+				for(; unique < i; unique++)
 				{
-					if(use->getSrc(i) == oldValue)
+					if(instruction.getSrc(i) == instruction.getSrc(unique))
 					{
-						use->replaceSource(i, newValue);
-					}
-				}
-
-				getUses(newValue)->insert(newValue, use);
-			}
-
-			setUses(oldValue, nullptr);
-		}
-
-		deleteInstruction(instruction);
-	}
-
-	void Optimizer::deleteInstruction(Ice::Inst *instruction)
-	{
-		if(!instruction || instruction->isDeleted())
-		{
-			return;
-		}
-
-		instruction->setDeleted();
-
-		for(Ice::SizeT i = 0; i < instruction->getSrcSize(); i++)
-		{
-			Ice::Operand *src = instruction->getSrc(i);
-
-			if(hasUses(src))
-			{
-				auto &srcUses = *getUses(src);
-
-				srcUses.erase(instruction);
-
-				if(srcUses.empty())
-				{
-					setUses(src, nullptr);
-
-					if(Ice::Variable *var = llvm::dyn_cast<Ice::Variable>(src))
-					{
-						deleteInstruction(getDefinition(var));
-					}
-				}
-			}
-		}
-	}
-
-	bool Optimizer::isDead(Ice::Inst *instruction)
-	{
-		Ice::Variable *dest = instruction->getDest();
-
-		if(dest)
-		{
-			return (!hasUses(dest) || getUses(dest)->empty()) && !instruction->hasSideEffects();
-		}
-		else if(isStore(*instruction))
-		{
-			if(Ice::Variable *address = llvm::dyn_cast<Ice::Variable>(storeAddress(instruction)))
-			{
-				Ice::Inst *def = getDefinition(address);
-
-				if(def && llvm::isa<Ice::InstAlloca>(def))
-				{
-					if(hasUses(address))
-					{
-						Optimizer::Uses* uses = getUses(address);
-						return uses->size() == uses->stores.size();   // Dead if all uses are stores
-					}
-					else
-					{
-						return true; // No uses
-					}
-				}
-			}
-		}
-
-		return false;
-	}
-
-	const Ice::InstIntrinsicCall *Optimizer::asLoadSubVector(const Ice::Inst *instruction)
-	{
-		if(auto *instrinsic = llvm::dyn_cast<Ice::InstIntrinsicCall>(instruction))
-		{
-			if(instrinsic->getIntrinsicInfo().ID == Ice::Intrinsics::LoadSubVector)
-			{
-				return instrinsic;
-			}
-		}
-
-		return nullptr;
-	}
-
-	const Ice::InstIntrinsicCall *Optimizer::asStoreSubVector(const Ice::Inst *instruction)
-	{
-		if(auto *instrinsic = llvm::dyn_cast<Ice::InstIntrinsicCall>(instruction))
-		{
-			if(instrinsic->getIntrinsicInfo().ID == Ice::Intrinsics::StoreSubVector)
-			{
-				return instrinsic;
-			}
-		}
-
-		return nullptr;
-	}
-
-	bool Optimizer::isLoad(const Ice::Inst &instruction)
-	{
-		if(llvm::isa<Ice::InstLoad>(&instruction))
-		{
-			return true;
-		}
-
-		return asLoadSubVector(&instruction) != nullptr;
-	}
-
-	bool Optimizer::isStore(const Ice::Inst &instruction)
-	{
-		if(llvm::isa<Ice::InstStore>(&instruction))
-		{
-			return true;
-		}
-
-		return asStoreSubVector(&instruction) != nullptr;
-	}
-
-	Ice::Operand *Optimizer::storeAddress(const Ice::Inst *instruction)
-	{
-		assert(isStore(*instruction));
-
-		if(auto *store = llvm::dyn_cast<Ice::InstStore>(instruction))
-		{
-			return store->getAddr();
-		}
-
-		if(auto *storeSubVector = asStoreSubVector(instruction))
-		{
-			return storeSubVector->getSrc(2);
-		}
-
-		return nullptr;
-	}
-
-	Ice::Operand *Optimizer::loadAddress(const Ice::Inst *instruction)
-	{
-		assert(isLoad(*instruction));
-
-		if(auto *load = llvm::dyn_cast<Ice::InstLoad>(instruction))
-		{
-			return load->getSourceAddress();
-		}
-
-		if(auto *loadSubVector = asLoadSubVector(instruction))
-		{
-			return loadSubVector->getSrc(1);
-		}
-
-		return nullptr;
-	}
-
-	Ice::Operand *Optimizer::storeData(const Ice::Inst *instruction)
-	{
-		assert(isStore(*instruction));
-
-		if(auto *store = llvm::dyn_cast<Ice::InstStore>(instruction))
-		{
-			return store->getData();
-		}
-
-		if(auto *storeSubVector = asStoreSubVector(instruction))
-		{
-			return storeSubVector->getSrc(1);
-		}
-
-		return nullptr;
-	}
-
-	std::size_t Optimizer::storeSize(const Ice::Inst *store)
-	{
-		assert(isStore(*store));
-
-		if(auto *instStore = llvm::dyn_cast<Ice::InstStore>(store))
-		{
-			return Ice::typeWidthInBytes(instStore->getData()->getType());
-		}
-
-		if(auto *storeSubVector = asStoreSubVector(store))
-		{
-			return llvm::cast<Ice::ConstantInteger32>(storeSubVector->getSrc(3))->getValue();
-		}
-
-		return 0;
-	}
-
-	bool Optimizer::loadTypeMatchesStore(const Ice::Inst *load, const Ice::Inst *store)
-	{
-		if(!load || !store)
-		{
-			return false;
-		}
-
-		assert(isLoad(*load) && isStore(*store));
-		assert(loadAddress(load) == storeAddress(store));
-
-		if(auto *instStore = llvm::dyn_cast<Ice::InstStore>(store))
-		{
-			if(auto *instLoad = llvm::dyn_cast<Ice::InstLoad>(load))
-			{
-				return instStore->getData()->getType() == instLoad->getDest()->getType();
-			}
-		}
-
-		if(auto *storeSubVector = asStoreSubVector(store))
-		{
-			if(auto *loadSubVector = asLoadSubVector(load))
-			{
-				// Check for matching type and sub-vector width.
-				return storeSubVector->getSrc(1)->getType() == loadSubVector->getDest()->getType() &&
-				       llvm::cast<Ice::ConstantInteger32>(storeSubVector->getSrc(3))->getValue() ==
-				       llvm::cast<Ice::ConstantInteger32>(loadSubVector->getSrc(2))->getValue();
-			}
-		}
-
-		return false;
-	}
-
-	Optimizer::Uses* Optimizer::getUses(Ice::Operand* operand)
-	{
-		Optimizer::Uses* uses = (Optimizer::Uses*)operand->Ice::Operand::getExternalData();
-		if(!uses)
-		{
-			uses = new Optimizer::Uses;
-			setUses(operand, uses);
-			allocatedUses.push_back(uses);
-		}
-		return uses;
-	}
-
-	void Optimizer::setUses(Ice::Operand* operand, Optimizer::Uses* uses)
-	{
-		operand->Ice::Operand::setExternalData(uses);
-	}
-
-	bool Optimizer::hasUses(Ice::Operand* operand) const
-	{
-		return operand->Ice::Operand::getExternalData() != nullptr;
-	}
-
-	Ice::CfgNode* Optimizer::getNode(Ice::Inst* inst)
-	{
-		return (Ice::CfgNode*)inst->Ice::Inst::getExternalData();
-	}
-
-	void Optimizer::setNode(Ice::Inst* inst, Ice::CfgNode* node)
-	{
-		inst->Ice::Inst::setExternalData(node);
-	}
-
-	Ice::Inst* Optimizer::getDefinition(Ice::Variable* var)
-	{
-		return (Ice::Inst*)var->Ice::Variable::getExternalData();
-	}
-
-	void Optimizer::setDefinition(Ice::Variable* var, Ice::Inst* inst)
-	{
-		var->Ice::Variable::setExternalData(inst);
-	}
-
-	const std::vector<Optimizer::LoadStoreInst>& Optimizer::getLoadStoreInsts(Ice::CfgNode* node)
-	{
-		return *((const std::vector<LoadStoreInst>*)node->Ice::CfgNode::getExternalData());
-	}
-
-	void Optimizer::setLoadStoreInsts(Ice::CfgNode* node, std::vector<LoadStoreInst>* insts)
-	{
-		node->Ice::CfgNode::setExternalData(insts);
-	}
-
-	bool Optimizer::hasLoadStoreInsts(Ice::CfgNode* node) const
-	{
-		return node->Ice::CfgNode::getExternalData() != nullptr;
-	}
-
-	bool Optimizer::Uses::areOnlyLoadStore() const
-	{
-		return size() == (loads.size() + stores.size());
-	}
-
-	void Optimizer::Uses::insert(Ice::Operand *value, Ice::Inst *instruction)
-	{
-		push_back(instruction);
-
-		if(isLoad(*instruction))
-		{
-			if(value == loadAddress(instruction))
-			{
-				loads.push_back(instruction);
-			}
-		}
-		else if(isStore(*instruction))
-		{
-			if(value == storeAddress(instruction))
-			{
-				stores.push_back(instruction);
-			}
-		}
-	}
-
-	void Optimizer::Uses::erase(Ice::Inst *instruction)
-	{
-		auto &uses = *this;
-
-		for(size_t i = 0; i < uses.size(); i++)
-		{
-			if(uses[i] == instruction)
-			{
-				uses[i] = back();
-				pop_back();
-
-				for(size_t i = 0; i < loads.size(); i++)
-				{
-					if(loads[i] == instruction)
-					{
-						loads[i] = loads.back();
-						loads.pop_back();
 						break;
 					}
 				}
 
-				for(size_t i = 0; i < stores.size(); i++)
+				if(i == unique)
 				{
-					if(stores[i] == instruction)
-					{
-						stores[i] = stores.back();
-						stores.pop_back();
-						break;
-					}
+					Ice::Operand *src = instruction.getSrc(i);
+					getUses(src)->insert(src, &instruction);
 				}
-
-				break;
 			}
 		}
 	}
 }
 
-namespace rr
+void Optimizer::replace(Ice::Inst *instruction, Ice::Operand *newValue)
 {
-	void optimize(Ice::Cfg *function)
-	{
-		Optimizer optimizer;
+	Ice::Variable *oldValue = instruction->getDest();
 
-		optimizer.run(function);
+	if(!newValue)
+	{
+		newValue = context->getConstantUndef(oldValue->getType());
 	}
-}
\ No newline at end of file
+
+	if(hasUses(oldValue))
+	{
+		for(Ice::Inst *use : *getUses(oldValue))
+		{
+			assert(!use->isDeleted());   // Should have been removed from uses already
+
+			for(Ice::SizeT i = 0; i < use->getSrcSize(); i++)
+			{
+				if(use->getSrc(i) == oldValue)
+				{
+					use->replaceSource(i, newValue);
+				}
+			}
+
+			getUses(newValue)->insert(newValue, use);
+		}
+
+		setUses(oldValue, nullptr);
+	}
+
+	deleteInstruction(instruction);
+}
+
+void Optimizer::deleteInstruction(Ice::Inst *instruction)
+{
+	if(!instruction || instruction->isDeleted())
+	{
+		return;
+	}
+
+	instruction->setDeleted();
+
+	for(Ice::SizeT i = 0; i < instruction->getSrcSize(); i++)
+	{
+		Ice::Operand *src = instruction->getSrc(i);
+
+		if(hasUses(src))
+		{
+			auto &srcUses = *getUses(src);
+
+			srcUses.erase(instruction);
+
+			if(srcUses.empty())
+			{
+				setUses(src, nullptr);
+
+				if(Ice::Variable *var = llvm::dyn_cast<Ice::Variable>(src))
+				{
+					deleteInstruction(getDefinition(var));
+				}
+			}
+		}
+	}
+}
+
+bool Optimizer::isDead(Ice::Inst *instruction)
+{
+	Ice::Variable *dest = instruction->getDest();
+
+	if(dest)
+	{
+		return (!hasUses(dest) || getUses(dest)->empty()) && !instruction->hasSideEffects();
+	}
+	else if(isStore(*instruction))
+	{
+		if(Ice::Variable *address = llvm::dyn_cast<Ice::Variable>(storeAddress(instruction)))
+		{
+			Ice::Inst *def = getDefinition(address);
+
+			if(def && llvm::isa<Ice::InstAlloca>(def))
+			{
+				if(hasUses(address))
+				{
+					Optimizer::Uses* uses = getUses(address);
+					return uses->size() == uses->stores.size();   // Dead if all uses are stores
+				}
+				else
+				{
+					return true; // No uses
+				}
+			}
+		}
+	}
+
+	return false;
+}
+
+const Ice::InstIntrinsicCall *Optimizer::asLoadSubVector(const Ice::Inst *instruction)
+{
+	if(auto *instrinsic = llvm::dyn_cast<Ice::InstIntrinsicCall>(instruction))
+	{
+		if(instrinsic->getIntrinsicInfo().ID == Ice::Intrinsics::LoadSubVector)
+		{
+			return instrinsic;
+		}
+	}
+
+	return nullptr;
+}
+
+const Ice::InstIntrinsicCall *Optimizer::asStoreSubVector(const Ice::Inst *instruction)
+{
+	if(auto *instrinsic = llvm::dyn_cast<Ice::InstIntrinsicCall>(instruction))
+	{
+		if(instrinsic->getIntrinsicInfo().ID == Ice::Intrinsics::StoreSubVector)
+		{
+			return instrinsic;
+		}
+	}
+
+	return nullptr;
+}
+
+bool Optimizer::isLoad(const Ice::Inst &instruction)
+{
+	if(llvm::isa<Ice::InstLoad>(&instruction))
+	{
+		return true;
+	}
+
+	return asLoadSubVector(&instruction) != nullptr;
+}
+
+bool Optimizer::isStore(const Ice::Inst &instruction)
+{
+	if(llvm::isa<Ice::InstStore>(&instruction))
+	{
+		return true;
+	}
+
+	return asStoreSubVector(&instruction) != nullptr;
+}
+
+Ice::Operand *Optimizer::storeAddress(const Ice::Inst *instruction)
+{
+	assert(isStore(*instruction));
+
+	if(auto *store = llvm::dyn_cast<Ice::InstStore>(instruction))
+	{
+		return store->getAddr();
+	}
+
+	if(auto *storeSubVector = asStoreSubVector(instruction))
+	{
+		return storeSubVector->getSrc(2);
+	}
+
+	return nullptr;
+}
+
+Ice::Operand *Optimizer::loadAddress(const Ice::Inst *instruction)
+{
+	assert(isLoad(*instruction));
+
+	if(auto *load = llvm::dyn_cast<Ice::InstLoad>(instruction))
+	{
+		return load->getSourceAddress();
+	}
+
+	if(auto *loadSubVector = asLoadSubVector(instruction))
+	{
+		return loadSubVector->getSrc(1);
+	}
+
+	return nullptr;
+}
+
+Ice::Operand *Optimizer::storeData(const Ice::Inst *instruction)
+{
+	assert(isStore(*instruction));
+
+	if(auto *store = llvm::dyn_cast<Ice::InstStore>(instruction))
+	{
+		return store->getData();
+	}
+
+	if(auto *storeSubVector = asStoreSubVector(instruction))
+	{
+		return storeSubVector->getSrc(1);
+	}
+
+	return nullptr;
+}
+
+std::size_t Optimizer::storeSize(const Ice::Inst *store)
+{
+	assert(isStore(*store));
+
+	if(auto *instStore = llvm::dyn_cast<Ice::InstStore>(store))
+	{
+		return Ice::typeWidthInBytes(instStore->getData()->getType());
+	}
+
+	if(auto *storeSubVector = asStoreSubVector(store))
+	{
+		return llvm::cast<Ice::ConstantInteger32>(storeSubVector->getSrc(3))->getValue();
+	}
+
+	return 0;
+}
+
+bool Optimizer::loadTypeMatchesStore(const Ice::Inst *load, const Ice::Inst *store)
+{
+	if(!load || !store)
+	{
+		return false;
+	}
+
+	assert(isLoad(*load) && isStore(*store));
+	assert(loadAddress(load) == storeAddress(store));
+
+	if(auto *instStore = llvm::dyn_cast<Ice::InstStore>(store))
+	{
+		if(auto *instLoad = llvm::dyn_cast<Ice::InstLoad>(load))
+		{
+			return instStore->getData()->getType() == instLoad->getDest()->getType();
+		}
+	}
+
+	if(auto *storeSubVector = asStoreSubVector(store))
+	{
+		if(auto *loadSubVector = asLoadSubVector(load))
+		{
+			// Check for matching type and sub-vector width.
+			return storeSubVector->getSrc(1)->getType() == loadSubVector->getDest()->getType() &&
+			       llvm::cast<Ice::ConstantInteger32>(storeSubVector->getSrc(3))->getValue() ==
+			       llvm::cast<Ice::ConstantInteger32>(loadSubVector->getSrc(2))->getValue();
+		}
+	}
+
+	return false;
+}
+
+Optimizer::Uses* Optimizer::getUses(Ice::Operand* operand)
+{
+	Optimizer::Uses* uses = (Optimizer::Uses*)operand->Ice::Operand::getExternalData();
+	if(!uses)
+	{
+		uses = new Optimizer::Uses;
+		setUses(operand, uses);
+		allocatedUses.push_back(uses);
+	}
+	return uses;
+}
+
+void Optimizer::setUses(Ice::Operand* operand, Optimizer::Uses* uses)
+{
+	operand->Ice::Operand::setExternalData(uses);
+}
+
+bool Optimizer::hasUses(Ice::Operand* operand) const
+{
+	return operand->Ice::Operand::getExternalData() != nullptr;
+}
+
+Ice::CfgNode* Optimizer::getNode(Ice::Inst* inst)
+{
+	return (Ice::CfgNode*)inst->Ice::Inst::getExternalData();
+}
+
+void Optimizer::setNode(Ice::Inst* inst, Ice::CfgNode* node)
+{
+	inst->Ice::Inst::setExternalData(node);
+}
+
+Ice::Inst* Optimizer::getDefinition(Ice::Variable* var)
+{
+	return (Ice::Inst*)var->Ice::Variable::getExternalData();
+}
+
+void Optimizer::setDefinition(Ice::Variable* var, Ice::Inst* inst)
+{
+	var->Ice::Variable::setExternalData(inst);
+}
+
+const std::vector<Optimizer::LoadStoreInst>& Optimizer::getLoadStoreInsts(Ice::CfgNode* node)
+{
+	return *((const std::vector<LoadStoreInst>*)node->Ice::CfgNode::getExternalData());
+}
+
+void Optimizer::setLoadStoreInsts(Ice::CfgNode* node, std::vector<LoadStoreInst>* insts)
+{
+	node->Ice::CfgNode::setExternalData(insts);
+}
+
+bool Optimizer::hasLoadStoreInsts(Ice::CfgNode* node) const
+{
+	return node->Ice::CfgNode::getExternalData() != nullptr;
+}
+
+bool Optimizer::Uses::areOnlyLoadStore() const
+{
+	return size() == (loads.size() + stores.size());
+}
+
+void Optimizer::Uses::insert(Ice::Operand *value, Ice::Inst *instruction)
+{
+	push_back(instruction);
+
+	if(isLoad(*instruction))
+	{
+		if(value == loadAddress(instruction))
+		{
+			loads.push_back(instruction);
+		}
+	}
+	else if(isStore(*instruction))
+	{
+		if(value == storeAddress(instruction))
+		{
+			stores.push_back(instruction);
+		}
+	}
+}
+
+void Optimizer::Uses::erase(Ice::Inst *instruction)
+{
+	auto &uses = *this;
+
+	for(size_t i = 0; i < uses.size(); i++)
+	{
+		if(uses[i] == instruction)
+		{
+			uses[i] = back();
+			pop_back();
+
+			for(size_t i = 0; i < loads.size(); i++)
+			{
+				if(loads[i] == instruction)
+				{
+					loads[i] = loads.back();
+					loads.pop_back();
+					break;
+				}
+			}
+
+			for(size_t i = 0; i < stores.size(); i++)
+			{
+				if(stores[i] == instruction)
+				{
+					stores[i] = stores.back();
+					stores.pop_back();
+					break;
+				}
+			}
+
+			break;
+		}
+	}
+}
+
+}  // anonymous namespace
+
+namespace rr {
+
+void optimize(Ice::Cfg *function)
+{
+	Optimizer optimizer;
+
+	optimizer.run(function);
+}
+
+}  // namespace rr
\ No newline at end of file
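
An aside on an idiom that recurs throughout the Optimizer above (see
Uses::erase and the bookkeeping in eliminateLoadsFollowingSingleStore):
elements are removed from vectors by overwriting the match with the last
element and popping the back, trading element order for O(1) removal. A
standalone sketch of that idiom, with invented names:

	#include <vector>

	// Unordered erase: constant-time removal; does not preserve order.
	template<typename T>
	void unorderedErase(std::vector<T> &v, const T &value)
	{
		for(size_t i = 0; i < v.size(); i++)
		{
			if(v[i] == value)
			{
				v[i] = v.back();   // move the last element into the hole
				v.pop_back();      // drop the now-duplicated tail
				break;
			}
		}
	}
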
diff --git a/src/Reactor/Optimizer.hpp b/src/Reactor/Optimizer.hpp
index e6027e9..8aa2019 100644
--- a/src/Reactor/Optimizer.hpp
+++ b/src/Reactor/Optimizer.hpp
@@ -17,9 +17,10 @@
 
 #include "src/IceCfg.h"
 
-namespace rr
-{
-	void optimize(Ice::Cfg *function);
-}
+namespace rr {
+
+void optimize(Ice::Cfg *function);
+
+}  // namespace rr
 
 #endif   // rr_Optimizer_hpp
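
The header above exposes the optimizer's single entry point. A
hypothetical call site follows; compileFunction is invented for
illustration, and only rr::optimize is actually declared here:

	#include "Optimizer.hpp"

	void compileFunction(Ice::Cfg *function)
	{
		// Runs the passes defined in Optimizer.cpp: dead-code elimination,
		// uninitialized-load elimination, and load/store forwarding.
		rr::optimize(function);
	}
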
diff --git a/src/Reactor/Print.hpp b/src/Reactor/Print.hpp
index 252e621..ca06f4e 100644
--- a/src/Reactor/Print.hpp
+++ b/src/Reactor/Print.hpp
@@ -28,341 +28,341 @@
 
 namespace rr {
 
-	// PrintValue holds the printf format and value(s) for a single argument
-	// to Print(). A single argument can be expanded into multiple printf
-	// values - for example a Float4 will expand to "%f %f %f %f" and four
-	// scalar values.
-	// The PrintValue constructor accepts the following:
-	//   * Reactor LValues, RValues, Pointers.
-	//   * Standard Plain-Old-Value types (int, float, bool, etc)
-	//   * Custom types that specialize the PrintValue::Ty template struct.
-	//   * Static arrays in the form T[N] where T can be any of the above.
-	class PrintValue
+// PrintValue holds the printf format and value(s) for a single argument
+// to Print(). A single argument can be expanded into multiple printf
+// values - for example a Float4 will expand to "%f %f %f %f" and four
+// scalar values.
+// The PrintValue constructor accepts the following:
+//   * Reactor LValues, RValues, Pointers.
+//   * Standard Plain-Old-Value types (int, float, bool, etc)
+//   * Custom types that specialize the PrintValue::Ty template struct.
+//   * Static arrays in the form T[N] where T can be any of the above.
+class PrintValue
+{
+	// Ty is a template that can be specialized for printing type T.
+	// Each specialization must expose:
+	//  * A 'static std::string fmt(const T& v)' method that provides the
+	//    printf format specifier.
+	//  * A 'static std::vector<rr::Value*> val(const T& v)' method that
+	//    returns all the printf format values.
+	template <typename T> struct Ty
 	{
-		// Ty is a template that can be specialized for printing type T.
-		// Each specialization must expose:
-		//  * A 'static std::string fmt(const T& v)' method that provides the
-		//    printf format specifier.
-		//  * A 'static std::vector<rr::Value*> val(const T& v)' method that
-		//    returns all the printf format values.
-		template <typename T> struct Ty
+		// static std::string fmt(const T& v);
+		// static std::vector<rr::Value*> val(const T& v);
+	};
+
+	// returns the printf values for all the values in the given array.
+	template <typename T>
+	static std::vector<Value*> val(const T* list, int count) {
+		std::vector<Value*> values;
+		values.reserve(count);
+		for (int i = 0; i < count; i++)
 		{
-			// static std::string fmt(const T& v);
-			// static std::vector<rr::Value*> val(const T& v);
-		};
-
-		// returns the printf values for all the values in the given array.
-		template <typename T>
-		static std::vector<Value*> val(const T* list, int count) {
-			std::vector<Value*> values;
-			values.reserve(count);
-			for (int i = 0; i < count; i++)
-			{
-				auto v = val(list[i]);
-				values.insert(values.end(), v.begin(), v.end());
-			}
-			return values;
+			auto v = val(list[i]);
+			values.insert(values.end(), v.begin(), v.end());
 		}
-
-		// fmt returns the comma-delimited list of printf format strings for
-		// every element in the provided list, all enclosed in square brackets.
-		template <typename T>
-		static std::string fmt(const T* list, int count)
-		{
-			std::string out = "[";
-			for (int i = 0; i < count; i++)
-			{
-				if (i > 0) { out += ", "; }
-				out += fmt(list[i]);
-			}
-			return out + "]";
-		}
-
-		static std::string addr(const void* ptr)
-		{
-			char buf[32];
-			snprintf(buf, sizeof(buf), "%p", ptr);
-			return buf;
-		}
-
-	public:
-		const std::string format;
-		const std::vector<Value*> values;
-
-		// Constructs a PrintValue for the given value.
-		template <typename T>
-		PrintValue(const T& v) : format(fmt(v)), values(val(v)) {}
-
-		// Constructs a PrintValue for the given static array.
-		template <typename T, int N>
-		PrintValue(const T (&v)[N]) : format(fmt(&v[0], N)), values(val(&v[0], N)) {}
-
-		// Constructs a PrintValue for the given array starting at arr of length
-		// len.
-		template <typename T>
-		PrintValue(const T* arr, int len) : format(fmt(arr, len)), values(val(arr, len)) {}
-
-		// PrintValue constructors for plain-old-data values.
-		PrintValue(bool v) : format(v ? "true" : "false") {}
-		PrintValue(int8_t v) : format(std::to_string(v)) {}
-		PrintValue(uint8_t v) : format(std::to_string(v)) {}
-		PrintValue(int16_t v) : format(std::to_string(v)) {}
-		PrintValue(uint16_t v) : format(std::to_string(v)) {}
-		PrintValue(int32_t v) : format(std::to_string(v)) {}
-		PrintValue(uint32_t v) : format(std::to_string(v)) {}
-		PrintValue(int64_t v) : format(std::to_string(v)) {}
-		PrintValue(uint64_t v) : format(std::to_string(v)) {}
-		PrintValue(float v) : format(std::to_string(v)) {}
-		PrintValue(double v) : format(std::to_string(v)) {}
-
-		template <typename T>
-		PrintValue(const T* v) : format(addr(v)) {}
-
-		// vals is a helper to build composite value lists.
-		// vals returns the full, sequential list of printf argument values used
-		// to print all the provided variadic values.
-		// vals() is intended to be used by implementations of
-		// PrintValue::Ty<>::vals() to help declare aggregate types.
-		// For example, if you were declaring a PrintValue::Ty<> specialization
-		// for a custom Mat4x4 matrix formed from four Vector4 values, you'd
-		// write:
-		//
-		// namespace rr
-		// {
-		//		template <> struct PrintValue::Ty<Mat4x4>
-		//		{
-		//			static std::string fmt(const Mat4x4& v)
-		//			{
-		//				return	"[a: <%f, %f, %f, %f>,"
-		//				        " b: <%f, %f, %f, %f>,"
-		//				        " c: <%f, %f, %f, %f>,"
-		//				        " d: <%f, %f, %f, %f>]";
-		//			}
-		//			static std::vector<rr::Value*> val(const Mat4x4& v)
-		//			{
-		//				return PrintValue::vals(v.a, v.b, v.c, v.d);
-		//			}
-		//		};
-		//	}
-		template<typename ... ARGS>
-		static std::vector<Value*> vals(ARGS... v)
-		{
-			std::vector< std::vector<Value*> > lists = {val(v)...};
-			std::vector<Value*> joined;
-			for (const auto& list : lists)
-			{
-				joined.insert(joined.end(), list.begin(), list.end());
-			}
-			return joined;
-		}
-
-		// returns the printf format specifier for the given type via the
-		// PrintValue::Ty<T> specialization.
-		template <typename T>
-		static std::string fmt(const T& v) { return Ty<T>::fmt(v); }
-
-		// returns the printf value for the given type with a
-		// PrintValue::Ty<T> specialization.
-		template <typename T>
-		static std::vector<Value*> val(const T& v) { return Ty<T>::val(v); }
-	};
-
-	// PrintValue::Ty<T> specializations for basic types.
-	template <> struct PrintValue::Ty<const char*>
-	{
-		static std::string fmt(const char* v) { return "%s"; }
-		static std::vector<Value*> val(const char* v);
-	};
-	template <> struct PrintValue::Ty<std::string>
-	{
-		static std::string fmt(const std::string& v) { return PrintValue::Ty<const char*>::fmt(v.c_str()); }
-		static std::vector<Value*> val(const std::string& v) { return PrintValue::Ty<const char*>::val(v.c_str()); }
-	};
-
-	// PrintValue::Ty<T> specializations for standard Reactor types.
-	template <> struct PrintValue::Ty<Bool>
-	{
-		static std::string fmt(const RValue<Bool>& v) { return "%d"; }
-		static std::vector<Value*> val(const RValue<Bool>& v) { return {v.value}; }
-	};
-	template <> struct PrintValue::Ty<Byte>
-	{
-		static std::string fmt(const RValue<Byte>& v) { return "%d"; }
-		static std::vector<Value*> val(const RValue<Byte>& v);
-	};
-	template <> struct PrintValue::Ty<Byte4>
-	{
-		static std::string fmt(const RValue<Byte4>& v) { return "[%d, %d, %d, %d]"; }
-		static std::vector<Value*> val(const RValue<Byte4>& v);
-	};
-	template <> struct PrintValue::Ty<Int>
-	{
-		static std::string fmt(const RValue<Int>& v) { return "%d"; }
-		static std::vector<Value*> val(const RValue<Int>& v);
-	};
-	template <> struct PrintValue::Ty<Int2>
-	{
-		static std::string fmt(const RValue<Int>& v) { return "[%d, %d]"; }
-		static std::vector<Value*> val(const RValue<Int2>& v);
-	};
-	template <> struct PrintValue::Ty<Int4>
-	{
-		static std::string fmt(const RValue<Int4>& v) { return "[%d, %d, %d, %d]"; }
-		static std::vector<Value*> val(const RValue<Int4>& v);
-	};
-	template <> struct PrintValue::Ty<UInt>
-	{
-		static std::string fmt(const RValue<UInt>& v) { return "%u"; }
-		static std::vector<Value*> val(const RValue<UInt>& v);
-	};
-	template <> struct PrintValue::Ty<UInt2>
-	{
-		static std::string fmt(const RValue<UInt>& v) { return "[%u, %u]"; }
-		static std::vector<Value*> val(const RValue<UInt2>& v);
-	};
-	template <> struct PrintValue::Ty<UInt4>
-	{
-		static std::string fmt(const RValue<UInt4>& v) { return "[%u, %u, %u, %u]"; }
-		static std::vector<Value*> val(const RValue<UInt4>& v);
-	};
-	template <> struct PrintValue::Ty<Short>
-	{
-		static std::string fmt(const RValue<Short>& v) { return "%d"; }
-		static std::vector<Value*> val(const RValue<Short>& v);
-	};
-	template <> struct PrintValue::Ty<Short4>
-	{
-		static std::string fmt(const RValue<Short4>& v) { return "[%d, %d, %d, %d]"; }
-		static std::vector<Value*> val(const RValue<Short4>& v);
-	};
-	template <> struct PrintValue::Ty<UShort>
-	{
-		static std::string fmt(const RValue<UShort>& v) { return "%u"; }
-		static std::vector<Value*> val(const RValue<UShort>& v);
-	};
-	template <> struct PrintValue::Ty<UShort4>
-	{
-		static std::string fmt(const RValue<UShort4>& v) { return "[%u, %u, %u, %u]"; }
-		static std::vector<Value*> val(const RValue<UShort4>& v);
-	};
-	template <> struct PrintValue::Ty<Float>
-	{
-		static std::string fmt(const RValue<Float>& v) { return "[%f]"; }
-		static std::vector<Value*> val(const RValue<Float>& v);
-	};
-	template <> struct PrintValue::Ty<Float4>
-	{
-		static std::string fmt(const RValue<Float4>& v) { return "[%f, %f, %f, %f]"; }
-		static std::vector<Value*> val(const RValue<Float4>& v);
-	};
-	template <> struct PrintValue::Ty<Long>
-	{
-		static std::string fmt(const RValue<Long>& v) { return "%lld"; }
-		static std::vector<Value*> val(const RValue<Long>& v) { return {v.value}; }
-	};
-	template <typename T> struct PrintValue::Ty< Pointer<T> >
-	{
-		static std::string fmt(const RValue<Pointer<T>>& v) { return "%p"; }
-		static std::vector<Value*> val(const RValue<Pointer<T>>& v) { return {v.value}; }
-	};
-	template <typename T> struct PrintValue::Ty< Reference<T> >
-	{
-		static std::string fmt(const Reference<T>& v) { return PrintValue::Ty<T>::fmt(v); }
-		static std::vector<Value*> val(const Reference<T>& v) { return PrintValue::Ty<T>::val(v); }
-	};
-	template <typename T> struct PrintValue::Ty< RValue<T> >
-	{
-		static std::string fmt(const RValue<T>& v) { return PrintValue::Ty<T>::fmt(v); }
-		static std::vector<Value*> val(const RValue<T>& v) { return PrintValue::Ty<T>::val(v); }
-	};
-
-	// Printv emits a call to printf() using the function, file and line,
-	// message and optional values.
-	// See Printv below.
-	void Printv(const char* function, const char* file, int line, const char* msg, std::initializer_list<PrintValue> vals);
-
-	// Printv emits a call to printf() using the provided message and optional
-	// values.
-	// Printf replaces any bracketed indices in the message with string
-	// representations of the corresponding value in vals.
-	// For example:
-	//   Printv("{0} and {1}", "red", "green");
-	// Would print the string:
-	//   "red and green"
-	// Arguments can be indexed in any order.
-	// Invalid indices are not substituted.
-	inline void Printv(const char* msg, std::initializer_list<PrintValue> vals)
-	{
-		Printv(nullptr, nullptr, 0, msg, vals);
+		return values;
 	}
 
-	// Print is a wrapper over Printv that wraps the variadic arguments into an
-	// initializer_list before calling Printv.
-	template <typename ... ARGS>
-	void Print(const char* msg, const ARGS& ... vals) { Printv(msg, {vals...}); }
-
-	// Print is a wrapper over Printv that wraps the variadic arguments into an
-	// initializer_list before calling Printv.
-	template <typename ... ARGS>
-	void Print(const char* function, const char* file, int line, const char* msg, const ARGS& ... vals)
+	// fmt returns the comma-delimited list of printf format strings for
+	// every element in the provided list, all enclosed in square brackets.
+	template <typename T>
+	static std::string fmt(const T* list, int count)
 	{
-		Printv(function, file, line, msg, {vals...});
+		std::string out = "[";
+		for (int i = 0; i < count; i++)
+		{
+			if (i > 0) { out += ", "; }
+			out += fmt(list[i]);
+		}
+		return out + "]";
 	}
 
-	// RR_LOG is a macro that calls Print(), automatically populating the
-	// function, file and line parameters and appending a newline to the string.
-	//
-	// RR_LOG() is intended to be used for debugging JIT compiled code, and is
-	// not intended for production use.
-	#if defined(_WIN32)
-		#define RR_LOG(msg, ...) Print(__FUNCSIG__, __FILE__, static_cast<int>(__LINE__), msg "\n", ##__VA_ARGS__)
-	#else
-		#define RR_LOG(msg, ...) Print(__PRETTY_FUNCTION__, __FILE__, static_cast<int>(__LINE__), msg "\n", ##__VA_ARGS__)
-	#endif
+	static std::string addr(const void* ptr)
+	{
+		char buf[32];
+		snprintf(buf, sizeof(buf), "%p", ptr);
+		return buf;
+	}
 
-	// Macro magic to perform variadic dispatch.
-	// See: https://renenyffenegger.ch/notes/development/languages/C-C-plus-plus/preprocessor/macros/__VA_ARGS__/count-arguments
-	// Note, this doesn't attempt to use the ##__VA_ARGS__ trick to handle 0
-	#define RR_MSVC_EXPAND_BUG(X) X // Helper macro to force expanding __VA_ARGS__ to satisfy MSVC compiler.
-	#define RR_GET_NTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, N, ...) N
-	#define RR_COUNT_ARGUMENTS(...) RR_MSVC_EXPAND_BUG(RR_GET_NTH_ARG(__VA_ARGS__, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
-	static_assert(1 == RR_COUNT_ARGUMENTS(a), "RR_COUNT_ARGUMENTS broken"); // Sanity checks.
-	static_assert(2 == RR_COUNT_ARGUMENTS(a, b), "RR_COUNT_ARGUMENTS broken");
-	static_assert(3 == RR_COUNT_ARGUMENTS(a, b, c), "RR_COUNT_ARGUMENTS broken");
+public:
+	const std::string format;
+	const std::vector<Value*> values;
 
-	// RR_WATCH_FMT(...) resolves to a string literal that lists all the
-	// arguments by name. This string can be passed to LOG() to print each of
-	// the arguments with their name and value.
-	//
-	// RR_WATCH_FMT(...) uses the RR_COUNT_ARGUMENTS helper macro to delegate to a
-	// corresponding RR_WATCH_FMT_n specialization macro below.
-	#define RR_WATCH_CONCAT(a, b) a ## b
-	#define RR_WATCH_CONCAT2(a, b) RR_WATCH_CONCAT(a, b)
-	#define RR_WATCH_FMT(...) RR_MSVC_EXPAND_BUG(RR_WATCH_CONCAT2(RR_WATCH_FMT_, RR_COUNT_ARGUMENTS(__VA_ARGS__))(__VA_ARGS__))
-	#define RR_WATCH_FMT_1(_1) "\n  " #_1 ": {0}"
-	#define RR_WATCH_FMT_2(_1, _2)                                             RR_WATCH_FMT_1(_1) "\n  " #_2 ": {1}"
-	#define RR_WATCH_FMT_3(_1, _2, _3)                                         RR_WATCH_FMT_2(_1, _2) "\n  " #_3 ": {2}"
-	#define RR_WATCH_FMT_4(_1, _2, _3, _4)                                     RR_WATCH_FMT_3(_1, _2, _3) "\n  " #_4 ": {3}"
-	#define RR_WATCH_FMT_5(_1, _2, _3, _4, _5)                                 RR_WATCH_FMT_4(_1, _2, _3, _4) "\n  " #_5 ": {4}"
-	#define RR_WATCH_FMT_6(_1, _2, _3, _4, _5, _6)                             RR_WATCH_FMT_5(_1, _2, _3, _4, _5) "\n  " #_6 ": {5}"
-	#define RR_WATCH_FMT_7(_1, _2, _3, _4, _5, _6, _7)                         RR_WATCH_FMT_6(_1, _2, _3, _4, _5, _6) "\n  " #_7 ": {6}"
-	#define RR_WATCH_FMT_8(_1, _2, _3, _4, _5, _6, _7, _8)                     RR_WATCH_FMT_7(_1, _2, _3, _4, _5, _6, _7) "\n  " #_8 ": {7}"
-	#define RR_WATCH_FMT_9(_1, _2, _3, _4, _5, _6, _7, _8, _9)                 RR_WATCH_FMT_8(_1, _2, _3, _4, _5, _6, _7, _8) "\n  " #_9 ": {8}"
-	#define RR_WATCH_FMT_10(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10)           RR_WATCH_FMT_9(_1, _2, _3, _4, _5, _6, _7, _8, _9) "\n  " #_10 ": {9}"
-	#define RR_WATCH_FMT_11(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11)      RR_WATCH_FMT_10(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10) "\n  " #_11 ": {10}"
-	#define RR_WATCH_FMT_12(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12) RR_WATCH_FMT_11(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11) "\n  " #_12 ": {11}"
+	// Constructs a PrintValue for the given value.
+	template <typename T>
+	PrintValue(const T& v) : format(fmt(v)), values(val(v)) {}
 
-	// RR_WATCH() is a helper that prints the name and value of all the supplied
-	// arguments.
-	// For example, if you had the Int and bool variables 'foo' and 'bar' that
-	// you want to print, you can simply write:
-	//    RR_WATCH(foo, bar)
-	// When this JIT compiled code is executed, it will print the string
-	// "foo: 1, bar: true" to stdout.
+	// Constructs a PrintValue for the given static array.
+	template <typename T, int N>
+	PrintValue(const T (&v)[N]) : format(fmt(&v[0], N)), values(val(&v[0], N)) {}
+
+	// Constructs a PrintValue for the given array starting at arr of length
+	// len.
+	template <typename T>
+	PrintValue(const T* arr, int len) : format(fmt(arr, len)), values(val(arr, len)) {}
+
+	// PrintValue constructors for plain-old-data values.
+	PrintValue(bool v) : format(v ? "true" : "false") {}
+	PrintValue(int8_t v) : format(std::to_string(v)) {}
+	PrintValue(uint8_t v) : format(std::to_string(v)) {}
+	PrintValue(int16_t v) : format(std::to_string(v)) {}
+	PrintValue(uint16_t v) : format(std::to_string(v)) {}
+	PrintValue(int32_t v) : format(std::to_string(v)) {}
+	PrintValue(uint32_t v) : format(std::to_string(v)) {}
+	PrintValue(int64_t v) : format(std::to_string(v)) {}
+	PrintValue(uint64_t v) : format(std::to_string(v)) {}
+	PrintValue(float v) : format(std::to_string(v)) {}
+	PrintValue(double v) : format(std::to_string(v)) {}
+
+	template <typename T>
+	PrintValue(const T* v) : format(addr(v)) {}
+
+	// vals is a helper to build composite value lists.
+	// vals returns the full, sequential list of printf argument values used
+	// to print all the provided variadic values.
+	// vals() is intended to be used by implementations of
+	// PrintValue::Ty<>::val() to help declare aggregate types.
+	// For example, if you were declaring a PrintValue::Ty<> specialization
+	// for a custom Mat4x4 matrix formed from four Vector4 values, you'd
+	// write:
 	//
-	// RR_WATCH() is intended to be used for debugging JIT compiled code, and
-	// is not intended for production use.
-	#define RR_WATCH(...) RR_LOG(RR_WATCH_FMT(__VA_ARGS__), __VA_ARGS__)
+	// namespace rr
+	// {
+	//		template <> struct PrintValue::Ty<Mat4x4>
+	//		{
+	//			static std::string fmt(const Mat4x4& v)
+	//			{
+	//				return	"[a: <%f, %f, %f, %f>,"
+	//				        " b: <%f, %f, %f, %f>,"
+	//				        " c: <%f, %f, %f, %f>,"
+	//				        " d: <%f, %f, %f, %f>]";
+	//			}
+	//			static std::vector<rr::Value*> val(const Mat4x4& v)
+	//			{
+	//				return PrintValue::vals(v.a, v.b, v.c, v.d);
+	//			}
+	//		};
+	//	}
+	template<typename ... ARGS>
+	static std::vector<Value*> vals(ARGS... v)
+	{
+		std::vector< std::vector<Value*> > lists = {val(v)...};
+		std::vector<Value*> joined;
+		for (const auto& list : lists)
+		{
+			joined.insert(joined.end(), list.begin(), list.end());
+		}
+		return joined;
+	}
+
+	// returns the printf format specifier for the given type via the
+	// PrintValue::Ty<T> specialization.
+	template <typename T>
+	static std::string fmt(const T& v) { return Ty<T>::fmt(v); }
+
+	// returns the printf value for the given type with a
+	// PrintValue::Ty<T> specialization.
+	template <typename T>
+	static std::vector<Value*> val(const T& v) { return Ty<T>::val(v); }
+};
+
+// PrintValue::Ty<T> specializations for basic types.
+template <> struct PrintValue::Ty<const char*>
+{
+	static std::string fmt(const char* v) { return "%s"; }
+	static std::vector<Value*> val(const char* v);
+};
+template <> struct PrintValue::Ty<std::string>
+{
+	static std::string fmt(const std::string& v) { return PrintValue::Ty<const char*>::fmt(v.c_str()); }
+	static std::vector<Value*> val(const std::string& v) { return PrintValue::Ty<const char*>::val(v.c_str()); }
+};
+
+// PrintValue::Ty<T> specializations for standard Reactor types.
+template <> struct PrintValue::Ty<Bool>
+{
+	static std::string fmt(const RValue<Bool>& v) { return "%d"; }
+	static std::vector<Value*> val(const RValue<Bool>& v) { return {v.value}; }
+};
+template <> struct PrintValue::Ty<Byte>
+{
+	static std::string fmt(const RValue<Byte>& v) { return "%d"; }
+	static std::vector<Value*> val(const RValue<Byte>& v);
+};
+template <> struct PrintValue::Ty<Byte4>
+{
+	static std::string fmt(const RValue<Byte4>& v) { return "[%d, %d, %d, %d]"; }
+	static std::vector<Value*> val(const RValue<Byte4>& v);
+};
+template <> struct PrintValue::Ty<Int>
+{
+	static std::string fmt(const RValue<Int>& v) { return "%d"; }
+	static std::vector<Value*> val(const RValue<Int>& v);
+};
+template <> struct PrintValue::Ty<Int2>
+{
+	static std::string fmt(const RValue<Int>& v) { return "[%d, %d]"; }
+	static std::vector<Value*> val(const RValue<Int2>& v);
+};
+template <> struct PrintValue::Ty<Int4>
+{
+	static std::string fmt(const RValue<Int4>& v) { return "[%d, %d, %d, %d]"; }
+	static std::vector<Value*> val(const RValue<Int4>& v);
+};
+template <> struct PrintValue::Ty<UInt>
+{
+	static std::string fmt(const RValue<UInt>& v) { return "%u"; }
+	static std::vector<Value*> val(const RValue<UInt>& v);
+};
+template <> struct PrintValue::Ty<UInt2>
+{
+	static std::string fmt(const RValue<UInt>& v) { return "[%u, %u]"; }
+	static std::vector<Value*> val(const RValue<UInt2>& v);
+};
+template <> struct PrintValue::Ty<UInt4>
+{
+	static std::string fmt(const RValue<UInt4>& v) { return "[%u, %u, %u, %u]"; }
+	static std::vector<Value*> val(const RValue<UInt4>& v);
+};
+template <> struct PrintValue::Ty<Short>
+{
+	static std::string fmt(const RValue<Short>& v) { return "%d"; }
+	static std::vector<Value*> val(const RValue<Short>& v);
+};
+template <> struct PrintValue::Ty<Short4>
+{
+	static std::string fmt(const RValue<Short4>& v) { return "[%d, %d, %d, %d]"; }
+	static std::vector<Value*> val(const RValue<Short4>& v);
+};
+template <> struct PrintValue::Ty<UShort>
+{
+	static std::string fmt(const RValue<UShort>& v) { return "%u"; }
+	static std::vector<Value*> val(const RValue<UShort>& v);
+};
+template <> struct PrintValue::Ty<UShort4>
+{
+	static std::string fmt(const RValue<UShort4>& v) { return "[%u, %u, %u, %u]"; }
+	static std::vector<Value*> val(const RValue<UShort4>& v);
+};
+template <> struct PrintValue::Ty<Float>
+{
+	static std::string fmt(const RValue<Float>& v) { return "%f"; }
+	static std::vector<Value*> val(const RValue<Float>& v);
+};
+template <> struct PrintValue::Ty<Float4>
+{
+	static std::string fmt(const RValue<Float4>& v) { return "[%f, %f, %f, %f]"; }
+	static std::vector<Value*> val(const RValue<Float4>& v);
+};
+template <> struct PrintValue::Ty<Long>
+{
+	static std::string fmt(const RValue<Long>& v) { return "%lld"; }
+	static std::vector<Value*> val(const RValue<Long>& v) { return {v.value}; }
+};
+template <typename T> struct PrintValue::Ty< Pointer<T> >
+{
+	static std::string fmt(const RValue<Pointer<T>>& v) { return "%p"; }
+	static std::vector<Value*> val(const RValue<Pointer<T>>& v) { return {v.value}; }
+};
+template <typename T> struct PrintValue::Ty< Reference<T> >
+{
+	static std::string fmt(const Reference<T>& v) { return PrintValue::Ty<T>::fmt(v); }
+	static std::vector<Value*> val(const Reference<T>& v) { return PrintValue::Ty<T>::val(v); }
+};
+template <typename T> struct PrintValue::Ty< RValue<T> >
+{
+	static std::string fmt(const RValue<T>& v) { return PrintValue::Ty<T>::fmt(v); }
+	static std::vector<Value*> val(const RValue<T>& v) { return PrintValue::Ty<T>::val(v); }
+};
+
+// Printv emits a call to printf() using the function, file and line,
+// message and optional values.
+// See the Printv overload below for details on message formatting.
+void Printv(const char* function, const char* file, int line, const char* msg, std::initializer_list<PrintValue> vals);
+
+// Printv emits a call to printf() using the provided message and optional
+// values.
+// Printv replaces any bracketed indices in the message with string
+// representations of the corresponding value in vals.
+// For example:
+//   Printv("{0} and {1}", "red", "green");
+// Would print the string:
+//   "red and green"
+// Arguments can be indexed in any order.
+// Invalid indices are not substituted.
+inline void Printv(const char* msg, std::initializer_list<PrintValue> vals)
+{
+	Printv(nullptr, nullptr, 0, msg, vals);
+}
+
+// Print is a wrapper over Printv that wraps the variadic arguments into an
+// initializer_list before calling Printv.
+template <typename ... ARGS>
+void Print(const char* msg, const ARGS& ... vals) { Printv(msg, {vals...}); }
+
+// Print is a wrapper over Printv that also forwards the function, file and
+// line parameters, wrapping the variadic arguments into an initializer_list
+// before calling Printv.
+template <typename ... ARGS>
+void Print(const char* function, const char* file, int line, const char* msg, const ARGS& ... vals)
+{
+	Printv(function, file, line, msg, {vals...});
+}
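+// A minimal usage sketch (the variable 'x' is hypothetical; the substitution
+// and printing happen when the JIT compiled routine executes, not while it
+// is being built):
+//   Int x = 42;
+//   Print("x is {0}\n", x);   // prints "x is 42" at execution time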
+
+// RR_LOG is a macro that calls Print(), automatically populating the
+// function, file and line parameters and appending a newline to the string.
+//
+// RR_LOG() is intended to be used for debugging JIT compiled code, and is
+// not intended for production use.
+#if defined(_WIN32)
+	#define RR_LOG(msg, ...) Print(__FUNCSIG__, __FILE__, static_cast<int>(__LINE__), msg "\n", ##__VA_ARGS__)
+#else
+	#define RR_LOG(msg, ...) Print(__PRETTY_FUNCTION__, __FILE__, static_cast<int>(__LINE__), msg "\n", ##__VA_ARGS__)
+#endif
+
+// Macro magic to perform variadic dispatch.
+// See: https://renenyffenegger.ch/notes/development/languages/C-C-plus-plus/preprocessor/macros/__VA_ARGS__/count-arguments
+// Note, this doesn't attempt to use the ##__VA_ARGS__ trick to handle 0 arguments.
+#define RR_MSVC_EXPAND_BUG(X) X // Helper macro to force expanding __VA_ARGS__ to satisfy MSVC compiler.
+#define RR_GET_NTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, N, ...) N
+#define RR_COUNT_ARGUMENTS(...) RR_MSVC_EXPAND_BUG(RR_GET_NTH_ARG(__VA_ARGS__, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
+static_assert(1 == RR_COUNT_ARGUMENTS(a), "RR_COUNT_ARGUMENTS broken"); // Sanity checks.
+static_assert(2 == RR_COUNT_ARGUMENTS(a, b), "RR_COUNT_ARGUMENTS broken");
+static_assert(3 == RR_COUNT_ARGUMENTS(a, b, c), "RR_COUNT_ARGUMENTS broken");
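+// For example, RR_COUNT_ARGUMENTS(a, b, c) expands to
+// RR_GET_NTH_ARG(a, b, c, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
+// where the 17th argument, bound to N, is 3.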
+
+// RR_WATCH_FMT(...) resolves to a string literal that lists all the
+// arguments by name. This string can be passed to RR_LOG() to print each of
+// the arguments with their name and value.
+//
+// RR_WATCH_FMT(...) uses the RR_COUNT_ARGUMENTS helper macro to delegate to a
+// corresponding RR_WATCH_FMT_n specialization macro below.
+#define RR_WATCH_CONCAT(a, b) a ## b
+#define RR_WATCH_CONCAT2(a, b) RR_WATCH_CONCAT(a, b)
+#define RR_WATCH_FMT(...) RR_MSVC_EXPAND_BUG(RR_WATCH_CONCAT2(RR_WATCH_FMT_, RR_COUNT_ARGUMENTS(__VA_ARGS__))(__VA_ARGS__))
+#define RR_WATCH_FMT_1(_1) "\n  " #_1 ": {0}"
+#define RR_WATCH_FMT_2(_1, _2)                                             RR_WATCH_FMT_1(_1) "\n  " #_2 ": {1}"
+#define RR_WATCH_FMT_3(_1, _2, _3)                                         RR_WATCH_FMT_2(_1, _2) "\n  " #_3 ": {2}"
+#define RR_WATCH_FMT_4(_1, _2, _3, _4)                                     RR_WATCH_FMT_3(_1, _2, _3) "\n  " #_4 ": {3}"
+#define RR_WATCH_FMT_5(_1, _2, _3, _4, _5)                                 RR_WATCH_FMT_4(_1, _2, _3, _4) "\n  " #_5 ": {4}"
+#define RR_WATCH_FMT_6(_1, _2, _3, _4, _5, _6)                             RR_WATCH_FMT_5(_1, _2, _3, _4, _5) "\n  " #_6 ": {5}"
+#define RR_WATCH_FMT_7(_1, _2, _3, _4, _5, _6, _7)                         RR_WATCH_FMT_6(_1, _2, _3, _4, _5, _6) "\n  " #_7 ": {6}"
+#define RR_WATCH_FMT_8(_1, _2, _3, _4, _5, _6, _7, _8)                     RR_WATCH_FMT_7(_1, _2, _3, _4, _5, _6, _7) "\n  " #_8 ": {7}"
+#define RR_WATCH_FMT_9(_1, _2, _3, _4, _5, _6, _7, _8, _9)                 RR_WATCH_FMT_8(_1, _2, _3, _4, _5, _6, _7, _8) "\n  " #_9 ": {8}"
+#define RR_WATCH_FMT_10(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10)           RR_WATCH_FMT_9(_1, _2, _3, _4, _5, _6, _7, _8, _9) "\n  " #_10 ": {9}"
+#define RR_WATCH_FMT_11(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11)      RR_WATCH_FMT_10(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10) "\n  " #_11 ": {10}"
+#define RR_WATCH_FMT_12(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12) RR_WATCH_FMT_11(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11) "\n  " #_12 ": {11}"
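+// For example, RR_WATCH_FMT(foo, bar) expands to the concatenated string
+// literal:
+//   "\n  foo: {0}" "\n  bar: {1}"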
+
+// RR_WATCH() is a helper that prints the name and value of all the supplied
+// arguments.
+// For example, if you have the Int and Bool variables 'foo' and 'bar' that
+// you want to print, you can simply write:
+//    RR_WATCH(foo, bar)
+// When this JIT compiled code is executed, it will print the name and
+// current value of each variable on its own line to stdout.
+//
+// RR_WATCH() is intended to be used for debugging JIT compiled code, and
+// is not intended for production use.
+#define RR_WATCH(...) RR_LOG(RR_WATCH_FMT(__VA_ARGS__), __VA_ARGS__)
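+// Putting it together, RR_WATCH(foo, bar) expands to:
+//   RR_LOG("\n  foo: {0}" "\n  bar: {1}", foo, bar)
+// which prints each name/value pair on its own line.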
 
 }  // namespace rr
 
diff --git a/src/Reactor/Reactor.cpp b/src/Reactor/Reactor.cpp
index 79c0891..07e83c5 100644
--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp
@@ -23,1292 +23,1293 @@
 #define REACTOR_MATERIALIZE_LVALUES_ON_DEFINITION 0
 #endif
 
-namespace
+namespace {
+
+// Local implementation of std::remove_if.
+template <class ForwardIterator, class UnaryPredicate>
+ForwardIterator remove_if(ForwardIterator first, ForwardIterator last,
+							UnaryPredicate pred)
 {
-	// Introduced in C++20.
-	template <class ForwardIterator, class UnaryPredicate>
-	ForwardIterator remove_if(ForwardIterator first, ForwardIterator last,
-								UnaryPredicate pred)
-	{
-		ForwardIterator result = first;
-		while (first!=last) {
-			if (!pred(*first)) {
-				*result = std::move(*first);
-				++result;
-			}
-			++first;
+	ForwardIterator result = first;
+	while (first!=last) {
+		if (!pred(*first)) {
+			*result = std::move(*first);
+			++result;
 		}
-		return result;
+		++first;
+	}
+	return result;
+}
+
+}  // anonymous namespace
+
+namespace rr {
+
+const Config::Edit Config::Edit::None = {};
+
+Config Config::Edit::apply(const Config &cfg) const
+{
+	if (this == &None) { return cfg; }
+
+	auto level = optLevelChanged ? optLevel : cfg.optimization.getLevel();
+	auto passes = cfg.optimization.getPasses();
+	apply(optPassEdits, passes);
+	return Config{ Optimization{level, passes} };
+}
+
+template <typename T>
+void rr::Config::Edit::apply(const std::vector<std::pair<ListEdit, T>> & edits, std::vector<T>& list) const
+{
+	for (auto & edit : edits)
+	{
+		switch (edit.first)
+		{
+		case ListEdit::Add:
+			list.push_back(edit.second);
+			break;
+		case ListEdit::Remove:
+			list.erase(::remove_if(list.begin(), list.end(), [&](T item) { return item == edit.second; }), list.end());
+			break;
+		case ListEdit::Clear:
+			list.clear();
+			break;
+		}
 	}
 }
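+// For example (pass names hypothetical), replaying the edit list
+// { (Add, PassX), (Clear), (Add, PassY) } onto a pass list [A, B] yields
+// [PassY]: edits apply in order, so the Clear discards both the original
+// passes and the previously added PassX.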
 
-namespace rr
+// Set of variables that do not have a stack location yet.
+std::unordered_set<Variable*> Variable::unmaterializedVariables;
+
+Variable::Variable(Type *type, int arraySize) : arraySize(arraySize), type(type)
 {
-	const Config::Edit Config::Edit::None = {};
+	#if REACTOR_MATERIALIZE_LVALUES_ON_DEFINITION
+		materialize();
+	#else
+		unmaterializedVariables.emplace(this);
+	#endif
+}
 
-	Config Config::Edit::apply(const Config &cfg) const
-	{
-		if (this == &None) { return cfg; }
+Variable::~Variable()
+{
+	unmaterializedVariables.erase(this);
+}
 
-		auto level = optLevelChanged ? optLevel : cfg.optimization.getLevel();
-		auto passes = cfg.optimization.getPasses();
-		apply(optPassEdits, passes);
-		return Config{ Optimization{level, passes} };
-	}
-
-	template <typename T>
-	void rr::Config::Edit::apply(const std::vector<std::pair<ListEdit, T>> & edits, std::vector<T>& list) const
+void Variable::materializeAll()
+{
+	for(auto *var : unmaterializedVariables)
 	{
-		for (auto & edit : edits)
-		{
-			switch (edit.first)
-			{
-			case ListEdit::Add:
-				list.push_back(edit.second);
-				break;
-			case ListEdit::Remove:
-				::remove_if(list.begin(), list.end(), [&](T item) { return item == edit.second; });
-				break;
-			case ListEdit::Clear:
-				list.clear();
-				break;
-			}
-		}
+		var->materialize();
 	}
 
-	// Set of variables that do not have a stack location yet.
-	std::unordered_set<Variable*> Variable::unmaterializedVariables;
+	unmaterializedVariables.clear();
+}
 
-	Variable::Variable(Type *type, int arraySize) : arraySize(arraySize), type(type)
-	{
-		#if REACTOR_MATERIALIZE_LVALUES_ON_DEFINITION
-			materialize();
-		#else
-			unmaterializedVariables.emplace(this);
-		#endif
-	}
-
-	Variable::~Variable()
-	{
-		unmaterializedVariables.erase(this);
-	}
+void Variable::killUnmaterialized()
+{
+	unmaterializedVariables.clear();
+}
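+// (Presumably the point of deferring materialization: a variable that is
+// never materialized never receives a stack location, materializeAll()
+// serves as a single flush point for everything still pending, and
+// killUnmaterialized() abandons the pending set without allocating.)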
 
-	void Variable::materializeAll()
+// NOTE: Only 12 bits out of 16 of the |select| value are used.
+// More specifically, the value should look like:
+//
+//    msb               lsb
+//     v                 v
+//    [.xxx|.yyy|.zzz|.www]    where '.' means an ignored bit
+//
+// This format makes it easy to write calls with hexadecimal select values,
+// since each hex digit is a separate swizzle index.
+//
+// For example:
+//      createBlend4( [a,b,c,d], [e,f,g,h], 0x0123 ) -> [a,b,c,d]
+//      createBlend4( [a,b,c,d], [e,f,g,h], 0x4567 ) -> [e,f,g,h]
+//      createBlend4( [a,b,c,d], [e,f,g,h], 0x4012 ) -> [e,a,b,c]
+//
+static Value *createBlend4(Value *lhs, Value *rhs, uint16_t select)
+{
+	int swizzle[4] =
 	{
-		for(auto *var : unmaterializedVariables)
-		{
-			var->materialize();
-		}
+		(select >> 12) & 0x07,
+		(select >> 8)  & 0x07,
+		(select >> 4)  & 0x07,
+		(select >> 0)  & 0x07,
+	};
 
-		unmaterializedVariables.clear();
-	}
+	return Nucleus::createShuffleVector(lhs, rhs, swizzle);
+}
 
-	void Variable::killUnmaterialized()
+// NOTE: Only 8 bits out of 16 of the |select| value are used.
+// More specifically, the value should look like:
+//
+//    msb               lsb
+//     v                 v
+//    [..xx|..yy|..zz|..ww]    where '.' means an ignored bit
+//
+// This format makes it easy to write calls with hexadecimal select values,
+// since each hex digit is a separate swizzle index.
+//
+// For example:
+//      createSwizzle4( [a,b,c,d], 0x0123 ) -> [a,b,c,d]
+//      createSwizzle4( [a,b,c,d], 0x0033 ) -> [a,a,d,d]
+//
+static Value *createSwizzle4(Value *val, uint16_t select)
+{
+	int swizzle[4] =
 	{
-		unmaterializedVariables.clear();
-	}
-
-	// NOTE: Only 12 bits out of 16 of the |select| value are used.
-	// More specifically, the value should look like:
-	//
-	//    msb               lsb
-	//     v                 v
-	//    [.xxx|.yyy|.zzz|.www]    where '.' means an ignored bit
-	//
-	// This format makes it easy to write calls with hexadecimal select values,
-	// since each hex digit is a separate swizzle index.
-	//
-	// For example:
-	//      createBlend4( [a,b,c,d], [e,f,g,h], 0x0123 ) -> [a,b,c,d]
-	//      createBlend4( [a,b,c,d], [e,f,g,h], 0x4567 ) -> [e,f,g,h]
-	//      createBlend4( [a,b,c,d], [e,f,g,h], 0x4012 ) -> [e,a,b,c]
-	//
-	static Value *createBlend4(Value *lhs, Value *rhs, uint16_t select)
-	{
-		int swizzle[4] =
-		{
-			(select >> 12) & 0x07,
-			(select >> 8)  & 0x07,
-			(select >> 4)  & 0x07,
-			(select >> 0)  & 0x07,
-		};
+		(select >> 12) & 0x03,
+		(select >> 8)  & 0x03,
+		(select >> 4)  & 0x03,
+		(select >> 0)  & 0x03,
+	};
 
-		return Nucleus::createShuffleVector(lhs, rhs, swizzle);
-	}
+	return Nucleus::createShuffleVector(val, val, swizzle);
+}
 
-	// NOTE: Only 8 bits out of 16 of the |select| value are used.
-	// More specifically, the value should look like:
-	//
-	//    msb               lsb
-	//     v                 v
-	//    [..xx|..yy|..zz|..ww]    where '.' means an ignored bit
-	//
-	// This format makes it easy to write calls with hexadecimal select values,
-	// since each hex digit is a separate swizzle index.
-	//
-	// For example:
-	//      createSwizzle4( [a,b,c,d], 0x0123 ) -> [a,b,c,d]
-	//      createSwizzle4( [a,b,c,d], 0x0033 ) -> [a,a,d,d]
-	//
-	static Value *createSwizzle4(Value *val, uint16_t select)
-	{
-		int swizzle[4] =
-		{
-			(select >> 12) & 0x03,
-			(select >> 8)  & 0x03,
-			(select >> 4)  & 0x03,
-			(select >> 0)  & 0x03,
-		};
+static Value *createMask4(Value *lhs, Value *rhs, uint16_t select)
+{
+	bool mask[4] = {false, false, false, false};
 
-		return Nucleus::createShuffleVector(val, val, swizzle);
-	}
+	mask[(select >> 12) & 0x03] = true;
+	mask[(select >> 8)  & 0x03] = true;
+	mask[(select >> 4)  & 0x03] = true;
+	mask[(select >> 0)  & 0x03] = true;
 
-	static Value *createMask4(Value *lhs, Value *rhs, uint16_t select)
+	int swizzle[4] =
 	{
-		bool mask[4] = {false, false, false, false};
+		mask[0] ? 4 : 0,
+		mask[1] ? 5 : 1,
+		mask[2] ? 6 : 2,
+		mask[3] ? 7 : 3,
+	};
 
-		mask[(select >> 12) & 0x03] = true;
-		mask[(select >> 8)  & 0x03] = true;
-		mask[(select >> 4)  & 0x03] = true;
-		mask[(select >> 0)  & 0x03] = true;
+	return Nucleus::createShuffleVector(lhs, rhs, swizzle);
+}
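+// Like createBlend4 and createSwizzle4 above, createMask4 is easiest to see
+// by example: each lane of the first vector is kept unless that lane's index
+// appears in a nibble of |select|, in which case the lane is taken from the
+// second vector instead:
+//      createMask4( [a,b,c,d], [e,f,g,h], 0x0000 ) -> [e,b,c,d]
+//      createMask4( [a,b,c,d], [e,f,g,h], 0x0123 ) -> [e,f,g,h]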
 
-		int swizzle[4] =
-		{
-			mask[0] ? 4 : 0,
-			mask[1] ? 5 : 1,
-			mask[2] ? 6 : 2,
-			mask[3] ? 7 : 3,
-		};
+Bool::Bool(Argument<Bool> argument)
+{
+	storeValue(argument.value);
+}
 
-		return Nucleus::createShuffleVector(lhs, rhs, swizzle);
-	}
+Bool::Bool(bool x)
+{
+	storeValue(Nucleus::createConstantBool(x));
+}
 
-	Bool::Bool(Argument<Bool> argument)
-	{
-		storeValue(argument.value);
-	}
+Bool::Bool(RValue<Bool> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Bool::Bool(bool x)
-	{
-		storeValue(Nucleus::createConstantBool(x));
-	}
+Bool::Bool(const Bool &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Bool::Bool(RValue<Bool> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Bool::Bool(const Reference<Bool> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Bool::Bool(const Bool &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+RValue<Bool> Bool::operator=(RValue<Bool> rhs)
+{
+	storeValue(rhs.value);
 
-	Bool::Bool(const Reference<Bool> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+	return rhs;
+}
 
-	RValue<Bool> Bool::operator=(RValue<Bool> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Bool> Bool::operator=(const Bool &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return rhs;
-	}
+	return RValue<Bool>(value);
+}
 
-	RValue<Bool> Bool::operator=(const Bool &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Bool> Bool::operator=(const Reference<Bool> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Bool>(value);
-	}
+	return RValue<Bool>(value);
+}
 
-	RValue<Bool> Bool::operator=(const Reference<Bool> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Bool> operator!(RValue<Bool> val)
+{
+	return RValue<Bool>(Nucleus::createNot(val.value));
+}
 
-		return RValue<Bool>(value);
-	}
+RValue<Bool> operator&&(RValue<Bool> lhs, RValue<Bool> rhs)
+{
+	return RValue<Bool>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator!(RValue<Bool> val)
-	{
-		return RValue<Bool>(Nucleus::createNot(val.value));
-	}
+RValue<Bool> operator||(RValue<Bool> lhs, RValue<Bool> rhs)
+{
+	return RValue<Bool>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator&&(RValue<Bool> lhs, RValue<Bool> rhs)
-	{
-		return RValue<Bool>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Bool> operator!=(RValue<Bool> lhs, RValue<Bool> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator||(RValue<Bool> lhs, RValue<Bool> rhs)
-	{
-		return RValue<Bool>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<Bool> operator==(RValue<Bool> lhs, RValue<Bool> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator!=(RValue<Bool> lhs, RValue<Bool> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
-	}
+Byte::Byte(Argument<Byte> argument)
+{
+	storeValue(argument.value);
+}
 
-	RValue<Bool> operator==(RValue<Bool> lhs, RValue<Bool> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
-	}
+Byte::Byte(RValue<Int> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
 
-	Byte::Byte(Argument<Byte> argument)
-	{
-		storeValue(argument.value);
-	}
+	storeValue(integer);
+}
 
-	Byte::Byte(RValue<Int> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
+Byte::Byte(RValue<UInt> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Byte::Byte(RValue<UInt> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
+Byte::Byte(RValue<UShort> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Byte::Byte(RValue<UShort> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
+Byte::Byte(int x)
+{
+	storeValue(Nucleus::createConstantByte((unsigned char)x));
+}
 
-		storeValue(integer);
-	}
+Byte::Byte(unsigned char x)
+{
+	storeValue(Nucleus::createConstantByte(x));
+}
 
-	Byte::Byte(int x)
-	{
-		storeValue(Nucleus::createConstantByte((unsigned char)x));
-	}
+Byte::Byte(RValue<Byte> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Byte::Byte(unsigned char x)
-	{
-		storeValue(Nucleus::createConstantByte(x));
-	}
+Byte::Byte(const Byte &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Byte::Byte(RValue<Byte> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Byte::Byte(const Reference<Byte> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Byte::Byte(const Byte &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+RValue<Byte> Byte::operator=(RValue<Byte> rhs)
+{
+	storeValue(rhs.value);
 
-	Byte::Byte(const Reference<Byte> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+	return rhs;
+}
 
-	RValue<Byte> Byte::operator=(RValue<Byte> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Byte> Byte::operator=(const Byte &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return rhs;
-	}
+	return RValue<Byte>(value);
+}
 
-	RValue<Byte> Byte::operator=(const Byte &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Byte> Byte::operator=(const Reference<Byte> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Byte>(value);
-	}
+	return RValue<Byte>(value);
+}
 
-	RValue<Byte> Byte::operator=(const Reference<Byte> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Byte> operator+(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-		return RValue<Byte>(value);
-	}
+RValue<Byte> operator-(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator+(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Byte> operator*(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator-(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<Byte> operator/(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createUDiv(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator*(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<Byte> operator%(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createURem(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator/(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createUDiv(lhs.value, rhs.value));
-	}
+RValue<Byte> operator&(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator%(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createURem(lhs.value, rhs.value));
-	}
+RValue<Byte> operator|(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator&(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Byte> operator^(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator|(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<Byte> operator<<(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createShl(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator^(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<Byte> operator>>(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Byte>(Nucleus::createLShr(lhs.value, rhs.value));
+}
 
-	RValue<Byte> operator<<(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createShl(lhs.value, rhs.value));
-	}
+RValue<Byte> operator+=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<Byte> operator>>(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Byte>(Nucleus::createLShr(lhs.value, rhs.value));
-	}
+RValue<Byte> operator-=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<Byte> operator+=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<Byte> operator*=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
-	RValue<Byte> operator-=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<Byte> operator/=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs / rhs;
+}
 
-	RValue<Byte> operator*=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<Byte> operator%=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs % rhs;
+}
 
-	RValue<Byte> operator/=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs / rhs;
-	}
+RValue<Byte> operator&=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<Byte> operator%=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs % rhs;
-	}
+RValue<Byte> operator|=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<Byte> operator&=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<Byte> operator^=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<Byte> operator|=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<Byte> operator<<=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<Byte> operator^=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<Byte> operator>>=(Byte &lhs, RValue<Byte> rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
-	RValue<Byte> operator<<=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<Byte> operator+(RValue<Byte> val)
+{
+	return val;
+}
 
-	RValue<Byte> operator>>=(Byte &lhs, RValue<Byte> rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<Byte> operator-(RValue<Byte> val)
+{
+	return RValue<Byte>(Nucleus::createNeg(val.value));
+}
 
-	RValue<Byte> operator+(RValue<Byte> val)
-	{
-		return val;
-	}
+RValue<Byte> operator~(RValue<Byte> val)
+{
+	return RValue<Byte>(Nucleus::createNot(val.value));
+}
 
-	RValue<Byte> operator-(RValue<Byte> val)
-	{
-		return RValue<Byte>(Nucleus::createNeg(val.value));
-	}
+RValue<Byte> operator++(Byte &val, int)   // Post-increment
+{
+	RValue<Byte> res = val;
 
-	RValue<Byte> operator~(RValue<Byte> val)
-	{
-		return RValue<Byte>(Nucleus::createNot(val.value));
-	}
+	Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantByte((unsigned char)1));
+	val.storeValue(inc);
 
-	RValue<Byte> operator++(Byte &val, int)   // Post-increment
-	{
-		RValue<Byte> res = val;
+	return res;
+}
 
-		Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantByte((unsigned char)1));
-		val.storeValue(inc);
+const Byte &operator++(Byte &val)   // Pre-increment
+{
+	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantByte((unsigned char)1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return val;
+}
 
-	const Byte &operator++(Byte &val)   // Pre-increment
-	{
-		Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantByte((unsigned char)1));
-		val.storeValue(inc);
+RValue<Byte> operator--(Byte &val, int)   // Post-decrement
+{
+	RValue<Byte> res = val;
 
-		return val;
-	}
+	Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantByte((unsigned char)1));
+	val.storeValue(inc);
 
-	RValue<Byte> operator--(Byte &val, int)   // Post-decrement
-	{
-		RValue<Byte> res = val;
+	return res;
+}
 
-		Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantByte((unsigned char)1));
-		val.storeValue(inc);
+const Byte &operator--(Byte &val)   // Pre-decrement
+{
+	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantByte((unsigned char)1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return val;
+}
 
-	const Byte &operator--(Byte &val)   // Pre-decrement
-	{
-		Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantByte((unsigned char)1));
-		val.storeValue(inc);
+RValue<Bool> operator<(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
+}
 
-		return val;
-	}
+RValue<Bool> operator<=(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator<(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator<=(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>=(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator!=(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>=(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator==(RValue<Byte> lhs, RValue<Byte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator!=(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
-	}
+SByte::SByte(Argument<SByte> argument)
+{
+	storeValue(argument.value);
+}
 
-	RValue<Bool> operator==(RValue<Byte> lhs, RValue<Byte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
-	}
+SByte::SByte(RValue<Int> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
 
-	SByte::SByte(Argument<SByte> argument)
-	{
-		storeValue(argument.value);
-	}
+	storeValue(integer);
+}
 
-	SByte::SByte(RValue<Int> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
+SByte::SByte(RValue<Short> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	SByte::SByte(RValue<Short> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
+SByte::SByte(signed char x)
+{
+	storeValue(Nucleus::createConstantByte(x));
+}
 
-		storeValue(integer);
-	}
+SByte::SByte(RValue<SByte> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	SByte::SByte(signed char x)
-	{
-		storeValue(Nucleus::createConstantByte(x));
-	}
+SByte::SByte(const SByte &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	SByte::SByte(RValue<SByte> rhs)
-	{
-		storeValue(rhs.value);
-	}
+SByte::SByte(const Reference<SByte> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	SByte::SByte(const SByte &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+RValue<SByte> SByte::operator=(RValue<SByte> rhs)
+{
+	storeValue(rhs.value);
 
-	SByte::SByte(const Reference<SByte> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+	return rhs;
+}
 
-	RValue<SByte> SByte::operator=(RValue<SByte> rhs)
-	{
-		storeValue(rhs.value);
+RValue<SByte> SByte::operator=(const SByte &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return rhs;
-	}
+	return RValue<SByte>(value);
+}
 
-	RValue<SByte> SByte::operator=(const SByte &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<SByte> SByte::operator=(const Reference<SByte> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<SByte>(value);
-	}
+	return RValue<SByte>(value);
+}
 
-	RValue<SByte> SByte::operator=(const Reference<SByte> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<SByte> operator+(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-		return RValue<SByte>(value);
-	}
+RValue<SByte> operator-(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator+(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<SByte> operator*(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator-(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<SByte> operator/(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createSDiv(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator*(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<SByte> operator%(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createSRem(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator/(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createSDiv(lhs.value, rhs.value));
-	}
+RValue<SByte> operator&(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator%(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createSRem(lhs.value, rhs.value));
-	}
+RValue<SByte> operator|(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator&(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<SByte> operator^(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator|(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<SByte> operator<<(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createShl(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator^(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<SByte> operator>>(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<SByte>(Nucleus::createAShr(lhs.value, rhs.value));
+}
 
-	RValue<SByte> operator<<(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createShl(lhs.value, rhs.value));
-	}
+RValue<SByte> operator+=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<SByte> operator>>(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<SByte>(Nucleus::createAShr(lhs.value, rhs.value));
-	}
+RValue<SByte> operator-=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<SByte> operator+=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<SByte> operator*=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
-	RValue<SByte> operator-=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<SByte> operator/=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs / rhs;
+}
 
-	RValue<SByte> operator*=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<SByte> operator%=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs % rhs;
+}
 
-	RValue<SByte> operator/=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs / rhs;
-	}
+RValue<SByte> operator&=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<SByte> operator%=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs % rhs;
-	}
+RValue<SByte> operator|=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<SByte> operator&=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<SByte> operator^=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<SByte> operator|=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<SByte> operator<<=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<SByte> operator^=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<SByte> operator>>=(SByte &lhs, RValue<SByte> rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
-	RValue<SByte> operator<<=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<SByte> operator+(RValue<SByte> val)
+{
+	return val;
+}
 
-	RValue<SByte> operator>>=(SByte &lhs, RValue<SByte> rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
-
-	RValue<SByte> operator+(RValue<SByte> val)
-	{
-		return val;
-	}
-
-	RValue<SByte> operator-(RValue<SByte> val)
-	{
-		return RValue<SByte>(Nucleus::createNeg(val.value));
-	}
+RValue<SByte> operator-(RValue<SByte> val)
+{
+	return RValue<SByte>(Nucleus::createNeg(val.value));
+}
 
-	RValue<SByte> operator~(RValue<SByte> val)
-	{
-		return RValue<SByte>(Nucleus::createNot(val.value));
-	}
+RValue<SByte> operator~(RValue<SByte> val)
+{
+	return RValue<SByte>(Nucleus::createNot(val.value));
+}
 
-	RValue<SByte> operator++(SByte &val, int)   // Post-increment
-	{
-		RValue<SByte> res = val;
+RValue<SByte> operator++(SByte &val, int)   // Post-increment
+{
+	RValue<SByte> res = val;
 
-		Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantByte((signed char)1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantByte((signed char)1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const SByte &operator++(SByte &val)   // Pre-increment
-	{
-		Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantByte((signed char)1));
-		val.storeValue(inc);
+const SByte &operator++(SByte &val)   // Pre-increment
+{
+	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantByte((signed char)1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<SByte> operator--(SByte &val, int)   // Post-decrement
-	{
-		RValue<SByte> res = val;
+RValue<SByte> operator--(SByte &val, int)   // Post-decrement
+{
+	RValue<SByte> res = val;
 
-		Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantByte((signed char)1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantByte((signed char)1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const SByte &operator--(SByte &val)   // Pre-decrement
-	{
-		Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantByte((signed char)1));
-		val.storeValue(inc);
+const SByte &operator--(SByte &val)   // Pre-decrement
+{
+	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantByte((signed char)1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<Bool> operator<(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator<=(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<=(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>=(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>=(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator!=(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator!=(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator==(RValue<SByte> lhs, RValue<SByte> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
-	}
+RValue<Bool> operator==(RValue<SByte> lhs, RValue<SByte> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
+}
 
-	Short::Short(Argument<Short> argument)
-	{
-		storeValue(argument.value);
-	}
+Short::Short(Argument<Short> argument)
+{
+	storeValue(argument.value);
+}
 
-	Short::Short(RValue<Int> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, Short::getType());
+Short::Short(RValue<Int> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, Short::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Short::Short(short x)
-	{
-		storeValue(Nucleus::createConstantShort(x));
-	}
+Short::Short(short x)
+{
+	storeValue(Nucleus::createConstantShort(x));
+}
 
-	Short::Short(RValue<Short> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Short::Short(RValue<Short> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Short::Short(const Short &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Short::Short(const Short &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Short::Short(const Reference<Short> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Short::Short(const Reference<Short> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<Short> Short::operator=(RValue<Short> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Short> Short::operator=(RValue<Short> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Short> Short::operator=(const Short &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Short> Short::operator=(const Short &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Short>(value);
-	}
+	return RValue<Short>(value);
+}
 
-	RValue<Short> Short::operator=(const Reference<Short> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Short> Short::operator=(const Reference<Short> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Short>(value);
-	}
+	return RValue<Short>(value);
+}
 
-	RValue<Short> operator+(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Short> operator+(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator-(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<Short> operator-(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator*(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<Short> operator*(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator/(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createSDiv(lhs.value, rhs.value));
-	}
+RValue<Short> operator/(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createSDiv(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator%(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createSRem(lhs.value, rhs.value));
-	}
+RValue<Short> operator%(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createSRem(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator&(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Short> operator&(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator|(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<Short> operator|(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator^(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<Short> operator^(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator<<(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createShl(lhs.value, rhs.value));
-	}
+RValue<Short> operator<<(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createShl(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator>>(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Short>(Nucleus::createAShr(lhs.value, rhs.value));
-	}
+RValue<Short> operator>>(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Short>(Nucleus::createAShr(lhs.value, rhs.value));
+}
 
-	RValue<Short> operator+=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<Short> operator+=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<Short> operator-=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<Short> operator-=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<Short> operator*=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<Short> operator*=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
-	RValue<Short> operator/=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs / rhs;
-	}
+RValue<Short> operator/=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs / rhs;
+}
 
-	RValue<Short> operator%=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs % rhs;
-	}
+RValue<Short> operator%=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs % rhs;
+}
 
-	RValue<Short> operator&=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<Short> operator&=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<Short> operator|=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<Short> operator|=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<Short> operator^=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<Short> operator^=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<Short> operator<<=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<Short> operator<<=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<Short> operator>>=(Short &lhs, RValue<Short> rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<Short> operator>>=(Short &lhs, RValue<Short> rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
-	RValue<Short> operator+(RValue<Short> val)
-	{
-		return val;
-	}
+RValue<Short> operator+(RValue<Short> val)
+{
+	return val;
+}
 
-	RValue<Short> operator-(RValue<Short> val)
-	{
-		return RValue<Short>(Nucleus::createNeg(val.value));
-	}
+RValue<Short> operator-(RValue<Short> val)
+{
+	return RValue<Short>(Nucleus::createNeg(val.value));
+}
 
-	RValue<Short> operator~(RValue<Short> val)
-	{
-		return RValue<Short>(Nucleus::createNot(val.value));
-	}
+RValue<Short> operator~(RValue<Short> val)
+{
+	return RValue<Short>(Nucleus::createNot(val.value));
+}
 
-	RValue<Short> operator++(Short &val, int)   // Post-increment
-	{
-		RValue<Short> res = val;
+RValue<Short> operator++(Short &val, int)   // Post-increment
+{
+	RValue<Short> res = val;
 
-		Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantShort((short)1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantShort((short)1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const Short &operator++(Short &val)   // Pre-increment
-	{
-		Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantShort((short)1));
-		val.storeValue(inc);
+const Short &operator++(Short &val)   // Pre-increment
+{
+	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantShort((short)1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<Short> operator--(Short &val, int)   // Post-decrement
-	{
-		RValue<Short> res = val;
+RValue<Short> operator--(Short &val, int)   // Post-decrement
+{
+	RValue<Short> res = val;
 
-		Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantShort((short)1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantShort((short)1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const Short &operator--(Short &val)   // Pre-decrement
-	{
-		Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantShort((short)1));
-		val.storeValue(inc);
+const Short &operator--(Short &val)   // Pre-decrement
+{
+	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantShort((short)1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<Bool> operator<(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator<=(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<=(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>=(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>=(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator!=(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator!=(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator==(RValue<Short> lhs, RValue<Short> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
-	}
+RValue<Bool> operator==(RValue<Short> lhs, RValue<Short> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
+}
 
-	UShort::UShort(Argument<UShort> argument)
-	{
-		storeValue(argument.value);
-	}
+UShort::UShort(Argument<UShort> argument)
+{
+	storeValue(argument.value);
+}
 
-	UShort::UShort(RValue<UInt> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
+UShort::UShort(RValue<UInt> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	UShort::UShort(RValue<Int> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
+UShort::UShort(RValue<Int> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	UShort::UShort(unsigned short x)
-	{
-		storeValue(Nucleus::createConstantShort(x));
-	}
+UShort::UShort(unsigned short x)
+{
+	storeValue(Nucleus::createConstantShort(x));
+}
 
-	UShort::UShort(RValue<UShort> rhs)
-	{
-		storeValue(rhs.value);
-	}
+UShort::UShort(RValue<UShort> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	UShort::UShort(const UShort &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UShort::UShort(const UShort &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UShort::UShort(const Reference<UShort> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UShort::UShort(const Reference<UShort> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<UShort> UShort::operator=(RValue<UShort> rhs)
-	{
-		storeValue(rhs.value);
+RValue<UShort> UShort::operator=(RValue<UShort> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<UShort> UShort::operator=(const UShort &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UShort> UShort::operator=(const UShort &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UShort>(value);
-	}
+	return RValue<UShort>(value);
+}
 
-	RValue<UShort> UShort::operator=(const Reference<UShort> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UShort> UShort::operator=(const Reference<UShort> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UShort>(value);
-	}
+	return RValue<UShort>(value);
+}
 
-	RValue<UShort> operator+(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<UShort> operator+(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator-(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<UShort> operator-(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator*(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<UShort> operator*(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator/(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createUDiv(lhs.value, rhs.value));
-	}
+RValue<UShort> operator/(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createUDiv(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator%(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createURem(lhs.value, rhs.value));
-	}
+RValue<UShort> operator%(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createURem(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator&(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<UShort> operator&(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator|(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<UShort> operator|(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator^(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<UShort> operator^(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator<<(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createShl(lhs.value, rhs.value));
-	}
+RValue<UShort> operator<<(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createShl(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator>>(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<UShort>(Nucleus::createLShr(lhs.value, rhs.value));
-	}
+RValue<UShort> operator>>(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<UShort>(Nucleus::createLShr(lhs.value, rhs.value));
+}
 
-	RValue<UShort> operator+=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<UShort> operator+=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<UShort> operator-=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<UShort> operator-=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<UShort> operator*=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<UShort> operator*=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
-	RValue<UShort> operator/=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs / rhs;
-	}
+RValue<UShort> operator/=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs / rhs;
+}
 
-	RValue<UShort> operator%=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs % rhs;
-	}
+RValue<UShort> operator%=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs % rhs;
+}
 
-	RValue<UShort> operator&=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<UShort> operator&=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<UShort> operator|=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<UShort> operator|=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<UShort> operator^=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<UShort> operator^=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<UShort> operator<<=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<UShort> operator<<=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<UShort> operator>>=(UShort &lhs, RValue<UShort> rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<UShort> operator>>=(UShort &lhs, RValue<UShort> rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
-	RValue<UShort> operator+(RValue<UShort> val)
-	{
-		return val;
-	}
+RValue<UShort> operator+(RValue<UShort> val)
+{
+	return val;
+}
 
-	RValue<UShort> operator-(RValue<UShort> val)
-	{
-		return RValue<UShort>(Nucleus::createNeg(val.value));
-	}
+RValue<UShort> operator-(RValue<UShort> val)
+{
+	return RValue<UShort>(Nucleus::createNeg(val.value));
+}
 
-	RValue<UShort> operator~(RValue<UShort> val)
-	{
-		return RValue<UShort>(Nucleus::createNot(val.value));
-	}
+RValue<UShort> operator~(RValue<UShort> val)
+{
+	return RValue<UShort>(Nucleus::createNot(val.value));
+}
 
-	RValue<UShort> operator++(UShort &val, int)   // Post-increment
-	{
-		RValue<UShort> res = val;
+RValue<UShort> operator++(UShort &val, int)   // Post-increment
+{
+	RValue<UShort> res = val;
 
-		Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantShort((unsigned short)1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantShort((unsigned short)1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const UShort &operator++(UShort &val)   // Pre-increment
-	{
-		Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantShort((unsigned short)1));
-		val.storeValue(inc);
+const UShort &operator++(UShort &val)   // Pre-increment
+{
+	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantShort((unsigned short)1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<UShort> operator--(UShort &val, int)   // Post-decrement
-	{
-		RValue<UShort> res = val;
+RValue<UShort> operator--(UShort &val, int)   // Post-decrement
+{
+	RValue<UShort> res = val;
 
-		Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantShort((unsigned short)1));
-		val.storeValue(inc);
+	Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantShort((unsigned short)1));
+	val.storeValue(inc);
 
-		return res;
-	}
+	return res;
+}
 
-	const UShort &operator--(UShort &val)   // Pre-decrement
-	{
-		Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantShort((unsigned short)1));
-		val.storeValue(inc);
+const UShort &operator--(UShort &val)   // Pre-decrement
+{
+	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantShort((unsigned short)1));
+	val.storeValue(inc);
 
-		return val;
-	}
+	return val;
+}
 
-	RValue<Bool> operator<(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator<=(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<=(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>=(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>=(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator!=(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator!=(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator==(RValue<UShort> lhs, RValue<UShort> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
-	}
+RValue<Bool> operator==(RValue<UShort> lhs, RValue<UShort> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
+}
 
-	Byte4::Byte4(RValue<Byte8> cast)
-	{
-		storeValue(Nucleus::createBitCast(cast.value, getType()));
-	}
+Byte4::Byte4(RValue<Byte8> cast)
+{
+	storeValue(Nucleus::createBitCast(cast.value, getType()));
+}
 
-	Byte4::Byte4(const Reference<Byte4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Byte4::Byte4(const Reference<Byte4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Byte8::Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
-	{
-		int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+Byte8::Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
+{
+	int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	Byte8::Byte8(RValue<Byte8> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Byte8::Byte8(RValue<Byte8> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Byte8::Byte8(const Byte8 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Byte8::Byte8(const Byte8 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Byte8::Byte8(const Reference<Byte8> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Byte8::Byte8(const Reference<Byte8> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<Byte8> Byte8::operator=(RValue<Byte8> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Byte8> Byte8::operator=(RValue<Byte8> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Byte8> Byte8::operator=(const Byte8 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Byte8> Byte8::operator=(const Byte8 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Byte8>(value);
-	}
+	return RValue<Byte8>(value);
+}
 
-	RValue<Byte8> Byte8::operator=(const Reference<Byte8> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Byte8> Byte8::operator=(const Reference<Byte8> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Byte8>(value);
-	}
+	return RValue<Byte8>(value);
+}
 
-	RValue<Byte8> operator+(RValue<Byte8> lhs, RValue<Byte8> rhs)
-	{
-		return RValue<Byte8>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Byte8> operator+(RValue<Byte8> lhs, RValue<Byte8> rhs)
+{
+	return RValue<Byte8>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<Byte8> operator-(RValue<Byte8> lhs, RValue<Byte8> rhs)
-	{
-		return RValue<Byte8>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<Byte8> operator-(RValue<Byte8> lhs, RValue<Byte8> rhs)
+{
+	return RValue<Byte8>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
 //	RValue<Byte8> operator*(RValue<Byte8> lhs, RValue<Byte8> rhs)
 //	{
@@ -1325,20 +1326,20 @@
 //		return RValue<Byte8>(Nucleus::createURem(lhs.value, rhs.value));
 //	}
 
-	RValue<Byte8> operator&(RValue<Byte8> lhs, RValue<Byte8> rhs)
-	{
-		return RValue<Byte8>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Byte8> operator&(RValue<Byte8> lhs, RValue<Byte8> rhs)
+{
+	return RValue<Byte8>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Byte8> operator|(RValue<Byte8> lhs, RValue<Byte8> rhs)
-	{
-		return RValue<Byte8>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<Byte8> operator|(RValue<Byte8> lhs, RValue<Byte8> rhs)
+{
+	return RValue<Byte8>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<Byte8> operator^(RValue<Byte8> lhs, RValue<Byte8> rhs)
-	{
-		return RValue<Byte8>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<Byte8> operator^(RValue<Byte8> lhs, RValue<Byte8> rhs)
+{
+	return RValue<Byte8>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
 //	RValue<Byte8> operator<<(RValue<Byte8> lhs, unsigned char rhs)
 //	{
@@ -1350,15 +1351,15 @@
 //		return RValue<Byte8>(Nucleus::createLShr(lhs.value, rhs.value));
 //	}
 
-	RValue<Byte8> operator+=(Byte8 &lhs, RValue<Byte8> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<Byte8> operator+=(Byte8 &lhs, RValue<Byte8> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<Byte8> operator-=(Byte8 &lhs, RValue<Byte8> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<Byte8> operator-=(Byte8 &lhs, RValue<Byte8> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
 //	RValue<Byte8> operator*=(Byte8 &lhs, RValue<Byte8> rhs)
 //	{
@@ -1375,20 +1376,20 @@
 //		return lhs = lhs % rhs;
 //	}
 
-	RValue<Byte8> operator&=(Byte8 &lhs, RValue<Byte8> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<Byte8> operator&=(Byte8 &lhs, RValue<Byte8> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<Byte8> operator|=(Byte8 &lhs, RValue<Byte8> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<Byte8> operator|=(Byte8 &lhs, RValue<Byte8> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<Byte8> operator^=(Byte8 &lhs, RValue<Byte8> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<Byte8> operator^=(Byte8 &lhs, RValue<Byte8> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
 //	RValue<Byte8> operator<<=(Byte8 &lhs, RValue<Byte8> rhs)
 //	{
@@ -1410,92 +1411,92 @@
 //		return RValue<Byte8>(Nucleus::createNeg(val.value));
 //	}
 
-	RValue<Byte8> operator~(RValue<Byte8> val)
-	{
-		return RValue<Byte8>(Nucleus::createNot(val.value));
-	}
+RValue<Byte8> operator~(RValue<Byte8> val)
+{
+	return RValue<Byte8>(Nucleus::createNot(val.value));
+}
 
-	RValue<Short4> Unpack(RValue<Byte4> x)
-	{
-		int shuffle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};   // Real type is v16i8
-		return As<Short4>(Nucleus::createShuffleVector(x.value, x.value, shuffle));
-	}
+RValue<Short4> Unpack(RValue<Byte4> x)
+{
+	int shuffle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};   // Real type is v16i8
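+	// Interleave the bytes with themselves: each 16-bit lane holds one
+	// source byte duplicated in both halves.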
+	return As<Short4>(Nucleus::createShuffleVector(x.value, x.value, shuffle));
+}
 
-	RValue<Short4> Unpack(RValue<Byte4> x, RValue<Byte4> y)
-	{
-		return UnpackLow(As<Byte8>(x), As<Byte8>(y));
-	}
+RValue<Short4> Unpack(RValue<Byte4> x, RValue<Byte4> y)
+{
+	return UnpackLow(As<Byte8>(x), As<Byte8>(y));
+}
 
-	RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y)
-	{
-		int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
-		return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-	}
+RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y)
+{
+	int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
+	return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+}
 
-	RValue<Short4> UnpackHigh(RValue<Byte8> x, RValue<Byte8> y)
-	{
-		int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
-		auto lowHigh = RValue<Byte16>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-		return As<Short4>(Swizzle(As<Int4>(lowHigh), 0x2323));
-	}
+RValue<Short4> UnpackHigh(RValue<Byte8> x, RValue<Byte8> y)
+{
+	int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
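+	// The interleave mixes all 16 bytes of x and y; the swizzle below then
+	// selects int elements 2 and 3, i.e. the upper eight interleaved bytes.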
+	auto lowHigh = RValue<Byte16>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+	return As<Short4>(Swizzle(As<Int4>(lowHigh), 0x2323));
+}
 
-	SByte8::SByte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
-	{
-		int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
-		Value *vector = Nucleus::createConstantVector(constantVector, getType());
+SByte8::SByte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
+{
+	int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
+	Value *vector = Nucleus::createConstantVector(constantVector, getType());
 
-		storeValue(Nucleus::createBitCast(vector, getType()));
-	}
+	storeValue(Nucleus::createBitCast(vector, getType()));
+}
 
-	SByte8::SByte8(RValue<SByte8> rhs)
-	{
-		storeValue(rhs.value);
-	}
+SByte8::SByte8(RValue<SByte8> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	SByte8::SByte8(const SByte8 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+SByte8::SByte8(const SByte8 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	SByte8::SByte8(const Reference<SByte8> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+SByte8::SByte8(const Reference<SByte8> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<SByte8> SByte8::operator=(RValue<SByte8> rhs)
-	{
-		storeValue(rhs.value);
+RValue<SByte8> SByte8::operator=(RValue<SByte8> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<SByte8> SByte8::operator=(const SByte8 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<SByte8> SByte8::operator=(const SByte8 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<SByte8>(value);
-	}
+	return RValue<SByte8>(value);
+}
 
-	RValue<SByte8> SByte8::operator=(const Reference<SByte8> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<SByte8> SByte8::operator=(const Reference<SByte8> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<SByte8>(value);
-	}
+	return RValue<SByte8>(value);
+}
 
-	RValue<SByte8> operator+(RValue<SByte8> lhs, RValue<SByte8> rhs)
-	{
-		return RValue<SByte8>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<SByte8> operator+(RValue<SByte8> lhs, RValue<SByte8> rhs)
+{
+	return RValue<SByte8>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<SByte8> operator-(RValue<SByte8> lhs, RValue<SByte8> rhs)
-	{
-		return RValue<SByte8>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<SByte8> operator-(RValue<SByte8> lhs, RValue<SByte8> rhs)
+{
+	return RValue<SByte8>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
 //	RValue<SByte8> operator*(RValue<SByte8> lhs, RValue<SByte8> rhs)
 //	{
@@ -1512,20 +1513,20 @@
 //		return RValue<SByte8>(Nucleus::createSRem(lhs.value, rhs.value));
 //	}
 
-	RValue<SByte8> operator&(RValue<SByte8> lhs, RValue<SByte8> rhs)
-	{
-		return RValue<SByte8>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<SByte8> operator&(RValue<SByte8> lhs, RValue<SByte8> rhs)
+{
+	return RValue<SByte8>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<SByte8> operator|(RValue<SByte8> lhs, RValue<SByte8> rhs)
-	{
-		return RValue<SByte8>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<SByte8> operator|(RValue<SByte8> lhs, RValue<SByte8> rhs)
+{
+	return RValue<SByte8>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<SByte8> operator^(RValue<SByte8> lhs, RValue<SByte8> rhs)
-	{
-		return RValue<SByte8>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<SByte8> operator^(RValue<SByte8> lhs, RValue<SByte8> rhs)
+{
+	return RValue<SByte8>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
 //	RValue<SByte8> operator<<(RValue<SByte8> lhs, unsigned char rhs)
 //	{
@@ -1537,15 +1538,15 @@
 //		return RValue<SByte8>(Nucleus::createAShr(lhs.value, rhs.value));
 //	}
 
-	RValue<SByte8> operator+=(SByte8 &lhs, RValue<SByte8> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<SByte8> operator+=(SByte8 &lhs, RValue<SByte8> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<SByte8> operator-=(SByte8 &lhs, RValue<SByte8> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<SByte8> operator-=(SByte8 &lhs, RValue<SByte8> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
 //	RValue<SByte8> operator*=(SByte8 &lhs, RValue<SByte8> rhs)
 //	{
@@ -1562,20 +1563,20 @@
 //		return lhs = lhs % rhs;
 //	}
 
-	RValue<SByte8> operator&=(SByte8 &lhs, RValue<SByte8> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<SByte8> operator&=(SByte8 &lhs, RValue<SByte8> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<SByte8> operator|=(SByte8 &lhs, RValue<SByte8> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<SByte8> operator|=(SByte8 &lhs, RValue<SByte8> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<SByte8> operator^=(SByte8 &lhs, RValue<SByte8> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<SByte8> operator^=(SByte8 &lhs, RValue<SByte8> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
 //	RValue<SByte8> operator<<=(SByte8 &lhs, RValue<SByte8> rhs)
 //	{
@@ -1597,192 +1598,192 @@
 //		return RValue<SByte8>(Nucleus::createNeg(val.value));
 //	}
 
-	RValue<SByte8> operator~(RValue<SByte8> val)
-	{
-		return RValue<SByte8>(Nucleus::createNot(val.value));
-	}
+RValue<SByte8> operator~(RValue<SByte8> val)
+{
+	return RValue<SByte8>(Nucleus::createNot(val.value));
+}
 
-	RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y)
-	{
-		int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
-		return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-	}
+RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y)
+{
+	int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
+	return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+}
 
-	RValue<Short4> UnpackHigh(RValue<SByte8> x, RValue<SByte8> y)
-	{
-		int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
-		auto lowHigh = RValue<Byte16>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-		return As<Short4>(Swizzle(As<Int4>(lowHigh), 0x2323));
-	}
+RValue<Short4> UnpackHigh(RValue<SByte8> x, RValue<SByte8> y)
+{
+	int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
+	auto lowHigh = RValue<Byte16>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+	return As<Short4>(Swizzle(As<Int4>(lowHigh), 0x2323));
+}
 
-	Byte16::Byte16(RValue<Byte16> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Byte16::Byte16(RValue<Byte16> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Byte16::Byte16(const Byte16 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Byte16::Byte16(const Byte16 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Byte16::Byte16(const Reference<Byte16> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Byte16::Byte16(const Reference<Byte16> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<Byte16> Byte16::operator=(RValue<Byte16> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Byte16> Byte16::operator=(RValue<Byte16> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Byte16> Byte16::operator=(const Byte16 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Byte16> Byte16::operator=(const Byte16 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Byte16>(value);
-	}
+	return RValue<Byte16>(value);
+}
 
-	RValue<Byte16> Byte16::operator=(const Reference<Byte16> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Byte16> Byte16::operator=(const Reference<Byte16> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Byte16>(value);
-	}
+	return RValue<Byte16>(value);
+}
 
-	Short2::Short2(RValue<Short4> cast)
-	{
-		storeValue(Nucleus::createBitCast(cast.value, getType()));
-	}
+Short2::Short2(RValue<Short4> cast)
+{
+	storeValue(Nucleus::createBitCast(cast.value, getType()));
+}
 
-	UShort2::UShort2(RValue<UShort4> cast)
-	{
-		storeValue(Nucleus::createBitCast(cast.value, getType()));
-	}
+UShort2::UShort2(RValue<UShort4> cast)
+{
+	storeValue(Nucleus::createBitCast(cast.value, getType()));
+}
 
-	Short4::Short4(RValue<Int> cast)
-	{
-		Value *vector = loadValue();
-		Value *element = Nucleus::createTrunc(cast.value, Short::getType());
-		Value *insert = Nucleus::createInsertElement(vector, element, 0);
-		Value *swizzle = Swizzle(RValue<Short4>(insert), 0x0000).value;
+Short4::Short4(RValue<Int> cast)
+{
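+	// Truncate the int to a 16-bit element, insert it into lane 0, and
+	// broadcast it to all four lanes with a 0x0000 swizzle.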
+	Value *vector = loadValue();
+	Value *element = Nucleus::createTrunc(cast.value, Short::getType());
+	Value *insert = Nucleus::createInsertElement(vector, element, 0);
+	Value *swizzle = Swizzle(RValue<Short4>(insert), 0x0000).value;
 
-		storeValue(swizzle);
-	}
+	storeValue(swizzle);
+}
 
 //	Short4::Short4(RValue<Float> cast)
 //	{
 //	}
 
-	Short4::Short4(short xyzw)
-	{
-		int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+Short4::Short4(short xyzw)
+{
+	int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	Short4::Short4(short x, short y, short z, short w)
-	{
-		int64_t constantVector[4] = {x, y, z, w};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+Short4::Short4(short x, short y, short z, short w)
+{
+	int64_t constantVector[4] = {x, y, z, w};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	Short4::Short4(RValue<Short4> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Short4::Short4(RValue<Short4> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Short4::Short4(const Short4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Short4::Short4(const Short4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Short4::Short4(const Reference<Short4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Short4::Short4(const Reference<Short4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Short4::Short4(RValue<UShort4> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Short4::Short4(RValue<UShort4> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Short4::Short4(const UShort4 &rhs)
-	{
-		storeValue(rhs.loadValue());
-	}
+Short4::Short4(const UShort4 &rhs)
+{
+	storeValue(rhs.loadValue());
+}
 
-	Short4::Short4(const Reference<UShort4> &rhs)
-	{
-		storeValue(rhs.loadValue());
-	}
+Short4::Short4(const Reference<UShort4> &rhs)
+{
+	storeValue(rhs.loadValue());
+}
 
-	RValue<Short4> Short4::operator=(RValue<Short4> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Short4> Short4::operator=(RValue<Short4> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Short4> Short4::operator=(const Short4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Short4> Short4::operator=(const Short4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Short4>(value);
-	}
+	return RValue<Short4>(value);
+}
 
-	RValue<Short4> Short4::operator=(const Reference<Short4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Short4> Short4::operator=(const Reference<Short4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Short4>(value);
-	}
+	return RValue<Short4>(value);
+}
 
-	RValue<Short4> Short4::operator=(RValue<UShort4> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Short4> Short4::operator=(RValue<UShort4> rhs)
+{
+	storeValue(rhs.value);
 
-		return RValue<Short4>(rhs);
-	}
+	return RValue<Short4>(rhs);
+}
 
-	RValue<Short4> Short4::operator=(const UShort4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Short4> Short4::operator=(const UShort4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Short4>(value);
-	}
+	return RValue<Short4>(value);
+}
 
-	RValue<Short4> Short4::operator=(const Reference<UShort4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Short4> Short4::operator=(const Reference<UShort4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Short4>(value);
-	}
+	return RValue<Short4>(value);
+}
 
-	RValue<Short4> operator+(RValue<Short4> lhs, RValue<Short4> rhs)
-	{
-		return RValue<Short4>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Short4> operator+(RValue<Short4> lhs, RValue<Short4> rhs)
+{
+	return RValue<Short4>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<Short4> operator-(RValue<Short4> lhs, RValue<Short4> rhs)
-	{
-		return RValue<Short4>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<Short4> operator-(RValue<Short4> lhs, RValue<Short4> rhs)
+{
+	return RValue<Short4>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<Short4> operator*(RValue<Short4> lhs, RValue<Short4> rhs)
-	{
-		return RValue<Short4>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<Short4> operator*(RValue<Short4> lhs, RValue<Short4> rhs)
+{
+	return RValue<Short4>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
 //	RValue<Short4> operator/(RValue<Short4> lhs, RValue<Short4> rhs)
 //	{
@@ -1794,35 +1795,35 @@
 //		return RValue<Short4>(Nucleus::createSRem(lhs.value, rhs.value));
 //	}
 
-	RValue<Short4> operator&(RValue<Short4> lhs, RValue<Short4> rhs)
-	{
-		return RValue<Short4>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Short4> operator&(RValue<Short4> lhs, RValue<Short4> rhs)
+{
+	return RValue<Short4>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Short4> operator|(RValue<Short4> lhs, RValue<Short4> rhs)
-	{
-		return RValue<Short4>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<Short4> operator|(RValue<Short4> lhs, RValue<Short4> rhs)
+{
+	return RValue<Short4>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<Short4> operator^(RValue<Short4> lhs, RValue<Short4> rhs)
-	{
-		return RValue<Short4>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<Short4> operator^(RValue<Short4> lhs, RValue<Short4> rhs)
+{
+	return RValue<Short4>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<Short4> operator+=(Short4 &lhs, RValue<Short4> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<Short4> operator+=(Short4 &lhs, RValue<Short4> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<Short4> operator-=(Short4 &lhs, RValue<Short4> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<Short4> operator-=(Short4 &lhs, RValue<Short4> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<Short4> operator*=(Short4 &lhs, RValue<Short4> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<Short4> operator*=(Short4 &lhs, RValue<Short4> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
 //	RValue<Short4> operator/=(Short4 &lhs, RValue<Short4> rhs)
 //	{
@@ -1834,1166 +1835,1166 @@
 //		return lhs = lhs % rhs;
 //	}
 
-	RValue<Short4> operator&=(Short4 &lhs, RValue<Short4> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<Short4> operator&=(Short4 &lhs, RValue<Short4> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<Short4> operator|=(Short4 &lhs, RValue<Short4> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<Short4> operator|=(Short4 &lhs, RValue<Short4> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<Short4> operator^=(Short4 &lhs, RValue<Short4> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<Short4> operator^=(Short4 &lhs, RValue<Short4> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<Short4> operator<<=(Short4 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<Short4> operator<<=(Short4 &lhs, unsigned char rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<Short4> operator>>=(Short4 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<Short4> operator>>=(Short4 &lhs, unsigned char rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
 //	RValue<Short4> operator+(RValue<Short4> val)
 //	{
 //		return val;
 //	}
 
-	RValue<Short4> operator-(RValue<Short4> val)
-	{
-		return RValue<Short4>(Nucleus::createNeg(val.value));
-	}
+RValue<Short4> operator-(RValue<Short4> val)
+{
+	return RValue<Short4>(Nucleus::createNeg(val.value));
+}
 
-	RValue<Short4> operator~(RValue<Short4> val)
-	{
-		return RValue<Short4>(Nucleus::createNot(val.value));
-	}
+RValue<Short4> operator~(RValue<Short4> val)
+{
+	return RValue<Short4>(Nucleus::createNot(val.value));
+}
 
-	RValue<Short4> RoundShort4(RValue<Float4> cast)
-	{
-		RValue<Int4> int4 = RoundInt(cast);
-		return As<Short4>(PackSigned(int4, int4));
-	}
+RValue<Short4> RoundShort4(RValue<Float4> cast)
+{
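+	// Round to a full-width Int4 first, then narrow to 16 bits with
+	// signed saturation.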
+	RValue<Int4> int4 = RoundInt(cast);
+	return As<Short4>(PackSigned(int4, int4));
+}
 
-	RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y)
-	{
-		int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};   // Real type is v8i16
-		return As<Int2>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-	}
+RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y)
+{
+	int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};   // Real type is v8i16
+	return As<Int2>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+}
 
-	RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y)
-	{
-		int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};   // Real type is v8i16
-		auto lowHigh = RValue<Short8>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-		return As<Int2>(Swizzle(As<Int4>(lowHigh), 0x2323));
-	}
+RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y)
+{
+	int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};   // Real type is v8i16
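+	// Same interleave-then-swizzle approach as the byte variants above.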
+	auto lowHigh = RValue<Short8>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+	return As<Int2>(Swizzle(As<Int4>(lowHigh), 0x2323));
+}
 
-	RValue<Short4> Swizzle(RValue<Short4> x, uint16_t select)
+RValue<Short4> Swizzle(RValue<Short4> x, uint16_t select)
+{
+	// Real type is v8i16
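+	// 'select' encodes four 2-bit lane indices, one per nibble, with the
+	// first lane in the highest nibble (so 0x0123 is the identity swizzle).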
+	int shuffle[8] =
 	{
-		// Real type is v8i16
-		int shuffle[8] =
-		{
-			(select >> 12) & 0x03,
-			(select >>  8) & 0x03,
-			(select >>  4) & 0x03,
-			(select >>  0) & 0x03,
-			(select >> 12) & 0x03,
-			(select >>  8) & 0x03,
-			(select >>  4) & 0x03,
-			(select >>  0) & 0x03,
-		};
+		(select >> 12) & 0x03,
+		(select >>  8) & 0x03,
+		(select >>  4) & 0x03,
+		(select >>  0) & 0x03,
+		(select >> 12) & 0x03,
+		(select >>  8) & 0x03,
+		(select >>  4) & 0x03,
+		(select >>  0) & 0x03,
+	};
 
-		return As<Short4>(Nucleus::createShuffleVector(x.value, x.value, shuffle));
-	}
+	return As<Short4>(Nucleus::createShuffleVector(x.value, x.value, shuffle));
+}
 
-	RValue<Short4> Insert(RValue<Short4> val, RValue<Short> element, int i)
-	{
-		return RValue<Short4>(Nucleus::createInsertElement(val.value, element.value, i));
-	}
+RValue<Short4> Insert(RValue<Short4> val, RValue<Short> element, int i)
+{
+	return RValue<Short4>(Nucleus::createInsertElement(val.value, element.value, i));
+}
 
-	RValue<Short> Extract(RValue<Short4> val, int i)
-	{
-		return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));
-	}
+RValue<Short> Extract(RValue<Short4> val, int i)
+{
+	return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));
+}
 
-	UShort4::UShort4(RValue<Int4> cast)
-	{
-		*this = Short4(cast);
-	}
+UShort4::UShort4(RValue<Int4> cast)
+{
+	*this = Short4(cast);
+}
 
-	UShort4::UShort4(unsigned short xyzw)
-	{
-		int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+UShort4::UShort4(unsigned short xyzw)
+{
+	int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	UShort4::UShort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
-	{
-		int64_t constantVector[4] = {x, y, z, w};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+UShort4::UShort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
+{
+	int64_t constantVector[4] = {x, y, z, w};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	UShort4::UShort4(RValue<UShort4> rhs)
-	{
-		storeValue(rhs.value);
-	}
+UShort4::UShort4(RValue<UShort4> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	UShort4::UShort4(const UShort4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UShort4::UShort4(const UShort4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UShort4::UShort4(const Reference<UShort4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UShort4::UShort4(const Reference<UShort4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UShort4::UShort4(RValue<Short4> rhs)
-	{
-		storeValue(rhs.value);
-	}
+UShort4::UShort4(RValue<Short4> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	UShort4::UShort4(const Short4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UShort4::UShort4(const Short4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UShort4::UShort4(const Reference<Short4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UShort4::UShort4(const Reference<Short4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<UShort4> UShort4::operator=(RValue<UShort4> rhs)
-	{
-		storeValue(rhs.value);
+RValue<UShort4> UShort4::operator=(RValue<UShort4> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<UShort4> UShort4::operator=(const UShort4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UShort4> UShort4::operator=(const UShort4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UShort4>(value);
-	}
+	return RValue<UShort4>(value);
+}
 
-	RValue<UShort4> UShort4::operator=(const Reference<UShort4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UShort4> UShort4::operator=(const Reference<UShort4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UShort4>(value);
-	}
+	return RValue<UShort4>(value);
+}
 
-	RValue<UShort4> UShort4::operator=(RValue<Short4> rhs)
-	{
-		storeValue(rhs.value);
+RValue<UShort4> UShort4::operator=(RValue<Short4> rhs)
+{
+	storeValue(rhs.value);
 
-		return RValue<UShort4>(rhs);
-	}
+	return RValue<UShort4>(rhs);
+}
 
-	RValue<UShort4> UShort4::operator=(const Short4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UShort4> UShort4::operator=(const Short4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UShort4>(value);
-	}
+	return RValue<UShort4>(value);
+}
 
-	RValue<UShort4> UShort4::operator=(const Reference<Short4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UShort4> UShort4::operator=(const Reference<Short4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UShort4>(value);
-	}
+	return RValue<UShort4>(value);
+}
 
-	RValue<UShort4> operator+(RValue<UShort4> lhs, RValue<UShort4> rhs)
-	{
-		return RValue<UShort4>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<UShort4> operator+(RValue<UShort4> lhs, RValue<UShort4> rhs)
+{
+	return RValue<UShort4>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<UShort4> operator-(RValue<UShort4> lhs, RValue<UShort4> rhs)
-	{
-		return RValue<UShort4>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<UShort4> operator-(RValue<UShort4> lhs, RValue<UShort4> rhs)
+{
+	return RValue<UShort4>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<UShort4> operator*(RValue<UShort4> lhs, RValue<UShort4> rhs)
-	{
-		return RValue<UShort4>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<UShort4> operator*(RValue<UShort4> lhs, RValue<UShort4> rhs)
+{
+	return RValue<UShort4>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<UShort4> operator&(RValue<UShort4> lhs, RValue<UShort4> rhs)
-	{
-		return RValue<UShort4>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<UShort4> operator&(RValue<UShort4> lhs, RValue<UShort4> rhs)
+{
+	return RValue<UShort4>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<UShort4> operator|(RValue<UShort4> lhs, RValue<UShort4> rhs)
-	{
-		return RValue<UShort4>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<UShort4> operator|(RValue<UShort4> lhs, RValue<UShort4> rhs)
+{
+	return RValue<UShort4>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<UShort4> operator^(RValue<UShort4> lhs, RValue<UShort4> rhs)
-	{
-		return RValue<UShort4>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<UShort4> operator^(RValue<UShort4> lhs, RValue<UShort4> rhs)
+{
+	return RValue<UShort4>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<UShort4> operator<<=(UShort4 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<UShort4> operator<<=(UShort4 &lhs, unsigned char rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<UShort4> operator>>=(UShort4 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<UShort4> operator>>=(UShort4 &lhs, unsigned char rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
-	RValue<UShort4> operator~(RValue<UShort4> val)
-	{
-		return RValue<UShort4>(Nucleus::createNot(val.value));
-	}
+RValue<UShort4> operator~(RValue<UShort4> val)
+{
+	return RValue<UShort4>(Nucleus::createNot(val.value));
+}
 
-	Short8::Short8(short c)
-	{
-		int64_t constantVector[8] = {c, c, c, c, c, c, c, c};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+Short8::Short8(short c)
+{
+	int64_t constantVector[8] = {c, c, c, c, c, c, c, c};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	Short8::Short8(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7)
-	{
-		int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+Short8::Short8(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7)
+{
+	int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	Short8::Short8(RValue<Short8> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Short8::Short8(RValue<Short8> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Short8::Short8(const Reference<Short8> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Short8::Short8(const Reference<Short8> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Short8::Short8(RValue<Short4> lo, RValue<Short4> hi)
-	{
-		int shuffle[8] = {0, 1, 2, 3, 8, 9, 10, 11};   // Real type is v8i16
-		Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
+Short8::Short8(RValue<Short4> lo, RValue<Short4> hi)
+{
+	int shuffle[8] = {0, 1, 2, 3, 8, 9, 10, 11};   // Real type is v8i16
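+	// Indices 8..11 address the lanes of 'hi', so this concatenates the
+	// four shorts of 'lo' with the four shorts of 'hi'.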
+	Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
 
-		storeValue(packed);
-	}
+	storeValue(packed);
+}
 
-	RValue<Short8> Short8::operator=(RValue<Short8> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Short8> Short8::operator=(RValue<Short8> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Short8> Short8::operator=(const Short8 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Short8> Short8::operator=(const Short8 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Short8>(value);
-	}
+	return RValue<Short8>(value);
+}
 
-	RValue<Short8> Short8::operator=(const Reference<Short8> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Short8> Short8::operator=(const Reference<Short8> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Short8>(value);
-	}
+	return RValue<Short8>(value);
+}
 
-	RValue<Short8> operator+(RValue<Short8> lhs, RValue<Short8> rhs)
-	{
-		return RValue<Short8>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Short8> operator+(RValue<Short8> lhs, RValue<Short8> rhs)
+{
+	return RValue<Short8>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<Short8> operator&(RValue<Short8> lhs, RValue<Short8> rhs)
-	{
-		return RValue<Short8>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Short8> operator&(RValue<Short8> lhs, RValue<Short8> rhs)
+{
+	return RValue<Short8>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Int4> Abs(RValue<Int4> x)
-	{
-		// TODO: Optimize.
-		auto negative = x >> 31;
-		return (x ^ negative) - negative;
-	}
+RValue<Int4> Abs(RValue<Int4> x)
+{
+	// TODO: Optimize.
+	auto negative = x >> 31;
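+	// 'negative' is all ones for negative lanes (arithmetic shift), so
+	// (x ^ negative) - negative evaluates to ~x + 1 = -x for those lanes
+	// and leaves non-negative lanes unchanged.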
+	return (x ^ negative) - negative;
+}
 
-	UShort8::UShort8(unsigned short c)
-	{
-		int64_t constantVector[8] = {c, c, c, c, c, c, c, c};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+UShort8::UShort8(unsigned short c)
+{
+	int64_t constantVector[8] = {c, c, c, c, c, c, c, c};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	UShort8::UShort8(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7)
-	{
-		int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+UShort8::UShort8(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7)
+{
+	int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	UShort8::UShort8(RValue<UShort8> rhs)
-	{
-		storeValue(rhs.value);
-	}
+UShort8::UShort8(RValue<UShort8> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	UShort8::UShort8(const Reference<UShort8> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UShort8::UShort8(const Reference<UShort8> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UShort8::UShort8(RValue<UShort4> lo, RValue<UShort4> hi)
-	{
-		int shuffle[8] = {0, 1, 2, 3, 8, 9, 10, 11};   // Real type is v8i16
-		Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
+UShort8::UShort8(RValue<UShort4> lo, RValue<UShort4> hi)
+{
+	int shuffle[8] = {0, 1, 2, 3, 8, 9, 10, 11};   // Real type is v8i16
+	Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
 
-		storeValue(packed);
-	}
+	storeValue(packed);
+}
 
-	RValue<UShort8> UShort8::operator=(RValue<UShort8> rhs)
-	{
-		storeValue(rhs.value);
+RValue<UShort8> UShort8::operator=(RValue<UShort8> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<UShort8> UShort8::operator=(const UShort8 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UShort8> UShort8::operator=(const UShort8 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UShort8>(value);
-	}
+	return RValue<UShort8>(value);
+}
 
-	RValue<UShort8> UShort8::operator=(const Reference<UShort8> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UShort8> UShort8::operator=(const Reference<UShort8> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UShort8>(value);
-	}
+	return RValue<UShort8>(value);
+}
 
-	RValue<UShort8> operator&(RValue<UShort8> lhs, RValue<UShort8> rhs)
-	{
-		return RValue<UShort8>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<UShort8> operator&(RValue<UShort8> lhs, RValue<UShort8> rhs)
+{
+	return RValue<UShort8>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<UShort8> operator+(RValue<UShort8> lhs, RValue<UShort8> rhs)
-	{
-		return RValue<UShort8>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<UShort8> operator+(RValue<UShort8> lhs, RValue<UShort8> rhs)
+{
+	return RValue<UShort8>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<UShort8> operator*(RValue<UShort8> lhs, RValue<UShort8> rhs)
-	{
-		return RValue<UShort8>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<UShort8> operator*(RValue<UShort8> lhs, RValue<UShort8> rhs)
+{
+	return RValue<UShort8>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<UShort8> operator+=(UShort8 &lhs, RValue<UShort8> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<UShort8> operator+=(UShort8 &lhs, RValue<UShort8> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<UShort8> operator~(RValue<UShort8> val)
-	{
-		return RValue<UShort8>(Nucleus::createNot(val.value));
-	}
+RValue<UShort8> operator~(RValue<UShort8> val)
+{
+	return RValue<UShort8>(Nucleus::createNot(val.value));
+}
 
-	Int::Int(Argument<Int> argument)
-	{
-		storeValue(argument.value);
-	}
+Int::Int(Argument<Int> argument)
+{
+	storeValue(argument.value);
+}
 
-	Int::Int(RValue<Byte> cast)
-	{
-		Value *integer = Nucleus::createZExt(cast.value, Int::getType());
+Int::Int(RValue<Byte> cast)
+{
+	Value *integer = Nucleus::createZExt(cast.value, Int::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Int::Int(RValue<SByte> cast)
-	{
-		Value *integer = Nucleus::createSExt(cast.value, Int::getType());
+Int::Int(RValue<SByte> cast)
+{
+	Value *integer = Nucleus::createSExt(cast.value, Int::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Int::Int(RValue<Short> cast)
-	{
-		Value *integer = Nucleus::createSExt(cast.value, Int::getType());
+Int::Int(RValue<Short> cast)
+{
+	Value *integer = Nucleus::createSExt(cast.value, Int::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Int::Int(RValue<UShort> cast)
-	{
-		Value *integer = Nucleus::createZExt(cast.value, Int::getType());
+Int::Int(RValue<UShort> cast)
+{
+	Value *integer = Nucleus::createZExt(cast.value, Int::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Int::Int(RValue<Int2> cast)
-	{
-		*this = Extract(cast, 0);
-	}
+Int::Int(RValue<Int2> cast)
+{
+	*this = Extract(cast, 0);
+}
 
-	Int::Int(RValue<Long> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, Int::getType());
+Int::Int(RValue<Long> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, Int::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Int::Int(RValue<Float> cast)
-	{
-		Value *integer = Nucleus::createFPToSI(cast.value, Int::getType());
+Int::Int(RValue<Float> cast)
+{
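+	// fptosi truncates toward zero, matching C/C++ float-to-int conversion.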
+	Value *integer = Nucleus::createFPToSI(cast.value, Int::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Int::Int(int x)
-	{
-		storeValue(Nucleus::createConstantInt(x));
-	}
+Int::Int(int x)
+{
+	storeValue(Nucleus::createConstantInt(x));
+}
 
-	Int::Int(RValue<Int> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Int::Int(RValue<Int> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Int::Int(RValue<UInt> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Int::Int(RValue<UInt> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Int::Int(const Int &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int::Int(const Int &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int::Int(const Reference<Int> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int::Int(const Reference<Int> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int::Int(const UInt &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int::Int(const UInt &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int::Int(const Reference<UInt> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int::Int(const Reference<UInt> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<Int> Int::operator=(int rhs)
-	{
-		return RValue<Int>(storeValue(Nucleus::createConstantInt(rhs)));
-	}
+RValue<Int> Int::operator=(int rhs)
+{
+	return RValue<Int>(storeValue(Nucleus::createConstantInt(rhs)));
+}
 
-	RValue<Int> Int::operator=(RValue<Int> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Int> Int::operator=(RValue<Int> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Int> Int::operator=(RValue<UInt> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Int> Int::operator=(RValue<UInt> rhs)
+{
+	storeValue(rhs.value);
 
-		return RValue<Int>(rhs);
-	}
+	return RValue<Int>(rhs);
+}
 
-	RValue<Int> Int::operator=(const Int &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Int> Int::operator=(const Int &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Int>(value);
-	}
+	return RValue<Int>(value);
+}
 
-	RValue<Int> Int::operator=(const Reference<Int> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Int> Int::operator=(const Reference<Int> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Int>(value);
-	}
+	return RValue<Int>(value);
+}
 
-	RValue<Int> Int::operator=(const UInt &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Int> Int::operator=(const UInt &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Int>(value);
-	}
+	return RValue<Int>(value);
+}
 
-	RValue<Int> Int::operator=(const Reference<UInt> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Int> Int::operator=(const Reference<UInt> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Int>(value);
-	}
+	return RValue<Int>(value);
+}
 
-	RValue<Int> operator+(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Int> operator+(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator-(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<Int> operator-(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator*(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<Int> operator*(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator/(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createSDiv(lhs.value, rhs.value));
-	}
+RValue<Int> operator/(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createSDiv(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator%(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createSRem(lhs.value, rhs.value));
-	}
+RValue<Int> operator%(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createSRem(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator&(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Int> operator&(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator|(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<Int> operator|(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator^(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<Int> operator^(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator<<(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createShl(lhs.value, rhs.value));
-	}
+RValue<Int> operator<<(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createShl(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator>>(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Int>(Nucleus::createAShr(lhs.value, rhs.value));
-	}
+RValue<Int> operator>>(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Int>(Nucleus::createAShr(lhs.value, rhs.value));
+}
 
-	RValue<Int> operator+=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<Int> operator+=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<Int> operator-=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<Int> operator-=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<Int> operator*=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<Int> operator*=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
-	RValue<Int> operator/=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs / rhs;
-	}
+RValue<Int> operator/=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs / rhs;
+}
 
-	RValue<Int> operator%=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs % rhs;
-	}
+RValue<Int> operator%=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs % rhs;
+}
 
-	RValue<Int> operator&=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<Int> operator&=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<Int> operator|=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<Int> operator|=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<Int> operator^=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<Int> operator^=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<Int> operator<<=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<Int> operator<<=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<Int> operator>>=(Int &lhs, RValue<Int> rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<Int> operator>>=(Int &lhs, RValue<Int> rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
-	RValue<Int> operator+(RValue<Int> val)
-	{
-		return val;
-	}
+RValue<Int> operator+(RValue<Int> val)
+{
+	return val;
+}
 
-	RValue<Int> operator-(RValue<Int> val)
-	{
-		return RValue<Int>(Nucleus::createNeg(val.value));
-	}
+RValue<Int> operator-(RValue<Int> val)
+{
+	return RValue<Int>(Nucleus::createNeg(val.value));
+}
 
-	RValue<Int> operator~(RValue<Int> val)
-	{
-		return RValue<Int>(Nucleus::createNot(val.value));
-	}
+RValue<Int> operator~(RValue<Int> val)
+{
+	return RValue<Int>(Nucleus::createNot(val.value));
+}
 
-	RValue<Bool> operator<(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator<=(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<=(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>=(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>=(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator!=(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator!=(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator==(RValue<Int> lhs, RValue<Int> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
-	}
+RValue<Bool> operator==(RValue<Int> lhs, RValue<Int> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
+}
 
-	RValue<Int> Max(RValue<Int> x, RValue<Int> y)
-	{
-		return IfThenElse(x > y, x, y);
-	}
+RValue<Int> Max(RValue<Int> x, RValue<Int> y)
+{
+	return IfThenElse(x > y, x, y);
+}
 
-	RValue<Int> Min(RValue<Int> x, RValue<Int> y)
-	{
-		return IfThenElse(x < y, x, y);
-	}
+RValue<Int> Min(RValue<Int> x, RValue<Int> y)
+{
+	return IfThenElse(x < y, x, y);
+}
 
-	RValue<Int> Clamp(RValue<Int> x, RValue<Int> min, RValue<Int> max)
-	{
-		return Min(Max(x, min), max);
-	}
+RValue<Int> Clamp(RValue<Int> x, RValue<Int> min, RValue<Int> max)
+{
+	return Min(Max(x, min), max);
+}
 
-	Long::Long(RValue<Int> cast)
-	{
-		Value *integer = Nucleus::createSExt(cast.value, Long::getType());
+Long::Long(RValue<Int> cast)
+{
+	Value *integer = Nucleus::createSExt(cast.value, Long::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Long::Long(RValue<UInt> cast)
-	{
-		Value *integer = Nucleus::createZExt(cast.value, Long::getType());
+Long::Long(RValue<UInt> cast)
+{
+	Value *integer = Nucleus::createZExt(cast.value, Long::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	Long::Long(RValue<Long> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Long::Long(RValue<Long> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	RValue<Long> Long::operator=(int64_t rhs)
-	{
-		return RValue<Long>(storeValue(Nucleus::createConstantLong(rhs)));
-	}
+RValue<Long> Long::operator=(int64_t rhs)
+{
+	return RValue<Long>(storeValue(Nucleus::createConstantLong(rhs)));
+}
 
-	RValue<Long> Long::operator=(RValue<Long> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Long> Long::operator=(RValue<Long> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Long> Long::operator=(const Long &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Long> Long::operator=(const Long &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Long>(value);
-	}
+	return RValue<Long>(value);
+}
 
-	RValue<Long> Long::operator=(const Reference<Long> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Long> Long::operator=(const Reference<Long> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Long>(value);
-	}
+	return RValue<Long>(value);
+}
 
-	RValue<Long> operator+(RValue<Long> lhs, RValue<Long> rhs)
-	{
-		return RValue<Long>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Long> operator+(RValue<Long> lhs, RValue<Long> rhs)
+{
+	return RValue<Long>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<Long> operator-(RValue<Long> lhs, RValue<Long> rhs)
-	{
-		return RValue<Long>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<Long> operator-(RValue<Long> lhs, RValue<Long> rhs)
+{
+	return RValue<Long>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<Long> operator*(RValue<Long> lhs, RValue<Long> rhs)
-	{
-		return RValue<Long>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<Long> operator*(RValue<Long> lhs, RValue<Long> rhs)
+{
+	return RValue<Long>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<Long> operator>>(RValue<Long> lhs, RValue<Long> rhs)
-	{
-		return RValue<Long>(Nucleus::createAShr(lhs.value, rhs.value));
-	}
+RValue<Long> operator>>(RValue<Long> lhs, RValue<Long> rhs)
+{
+	return RValue<Long>(Nucleus::createAShr(lhs.value, rhs.value));
+}
 
-	RValue<Long> operator+=(Long &lhs, RValue<Long> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<Long> operator+=(Long &lhs, RValue<Long> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<Long> operator-=(Long &lhs, RValue<Long> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<Long> operator-=(Long &lhs, RValue<Long> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<Long> AddAtomic(RValue<Pointer<Long> > x, RValue<Long> y)
-	{
-		return RValue<Long>(Nucleus::createAtomicAdd(x.value, y.value));
-	}
+RValue<Long> AddAtomic(RValue<Pointer<Long> > x, RValue<Long> y)
+{
+	return RValue<Long>(Nucleus::createAtomicAdd(x.value, y.value));
+}
 
-	RValue<UInt> AddAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
-	{
-		return RValue<UInt>(Nucleus::createAtomicAdd(x.value, y.value, memoryOrder));
-	}
+RValue<UInt> AddAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
+{
+	return RValue<UInt>(Nucleus::createAtomicAdd(x.value, y.value, memoryOrder));
+}
 
-	RValue<UInt> SubAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
-	{
-		return RValue<UInt>(Nucleus::createAtomicSub(x.value, y.value, memoryOrder));
-	}
+RValue<UInt> SubAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
+{
+	return RValue<UInt>(Nucleus::createAtomicSub(x.value, y.value, memoryOrder));
+}
 
-	RValue<UInt> AndAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
-	{
-		return RValue<UInt>(Nucleus::createAtomicAnd(x.value, y.value, memoryOrder));
-	}
+RValue<UInt> AndAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
+{
+	return RValue<UInt>(Nucleus::createAtomicAnd(x.value, y.value, memoryOrder));
+}
 
-	RValue<UInt> OrAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
-	{
-		return RValue<UInt>(Nucleus::createAtomicOr(x.value, y.value, memoryOrder));
-	}
+RValue<UInt> OrAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
+{
+	return RValue<UInt>(Nucleus::createAtomicOr(x.value, y.value, memoryOrder));
+}
 
-	RValue<UInt> XorAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
-	{
-		return RValue<UInt>(Nucleus::createAtomicXor(x.value, y.value, memoryOrder));
-	}
+RValue<UInt> XorAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
+{
+	return RValue<UInt>(Nucleus::createAtomicXor(x.value, y.value, memoryOrder));
+}
 
-	RValue<Int> MinAtomic(RValue<Pointer<Int> > x, RValue<Int> y, std::memory_order memoryOrder)
-	{
-		return RValue<Int>(Nucleus::createAtomicMin(x.value, y.value, memoryOrder));
-	}
+RValue<Int> MinAtomic(RValue<Pointer<Int> > x, RValue<Int> y, std::memory_order memoryOrder)
+{
+	return RValue<Int>(Nucleus::createAtomicMin(x.value, y.value, memoryOrder));
+}
 
-	RValue<UInt> MinAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
-	{
-		return RValue<UInt>(Nucleus::createAtomicUMin(x.value, y.value, memoryOrder));
-	}
+RValue<UInt> MinAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
+{
+	return RValue<UInt>(Nucleus::createAtomicUMin(x.value, y.value, memoryOrder));
+}
 
-	RValue<Int> MaxAtomic(RValue<Pointer<Int> > x, RValue<Int> y, std::memory_order memoryOrder)
-	{
-		return RValue<Int>(Nucleus::createAtomicMax(x.value, y.value, memoryOrder));
-	}
+RValue<Int> MaxAtomic(RValue<Pointer<Int> > x, RValue<Int> y, std::memory_order memoryOrder)
+{
+	return RValue<Int>(Nucleus::createAtomicMax(x.value, y.value, memoryOrder));
+}
 
-	RValue<UInt> MaxAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
-	{
-		return RValue<UInt>(Nucleus::createAtomicUMax(x.value, y.value, memoryOrder));
-	}
+RValue<UInt> MaxAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
+{
+	return RValue<UInt>(Nucleus::createAtomicUMax(x.value, y.value, memoryOrder));
+}
 
-	RValue<UInt> ExchangeAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
-	{
-		return RValue<UInt>(Nucleus::createAtomicExchange(x.value, y.value, memoryOrder));
-	}
+RValue<UInt> ExchangeAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, std::memory_order memoryOrder)
+{
+	return RValue<UInt>(Nucleus::createAtomicExchange(x.value, y.value, memoryOrder));
+}
 
-	RValue<UInt> CompareExchangeAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, RValue<UInt> compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
-	{
-		return RValue<UInt>(Nucleus::createAtomicCompareExchange(x.value, y.value, compare.value, memoryOrderEqual, memoryOrderUnequal));
-	}
+RValue<UInt> CompareExchangeAtomic(RValue<Pointer<UInt> > x, RValue<UInt> y, RValue<UInt> compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
+{
+	return RValue<UInt>(Nucleus::createAtomicCompareExchange(x.value, y.value, compare.value, memoryOrderEqual, memoryOrderUnequal));
+}
 
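Note: the atomic wrappers above forward straight to the Nucleus builders and, like LLVM's atomicrmw, are presumably expected to return the value the location held before the update. A minimal plain-C++ sketch of those assumed semantics (standalone illustration with hypothetical helper names, not Reactor API):

#include <atomic>

// Plain-C++ model of the wrappers above; each returns the *previous* value,
// mirroring LLVM atomicrmw semantics (an assumption about the backend).
unsigned int addAtomic(std::atomic<unsigned int> *x, unsigned int y,
                       std::memory_order order)
{
	return x->fetch_add(y, order);
}

unsigned int compareExchangeAtomic(std::atomic<unsigned int> *x,
                                   unsigned int y, unsigned int compare,
                                   std::memory_order equal, std::memory_order unequal)
{
	// compare_exchange_strong writes the observed value back into 'compare'
	// on failure, so 'compare' ends up holding the original value either way.
	x->compare_exchange_strong(compare, y, equal, unequal);
	return compare;
}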
-	UInt::UInt(Argument<UInt> argument)
-	{
-		storeValue(argument.value);
-	}
+UInt::UInt(Argument<UInt> argument)
+{
+	storeValue(argument.value);
+}
 
-	UInt::UInt(RValue<UShort> cast)
-	{
-		Value *integer = Nucleus::createZExt(cast.value, UInt::getType());
+UInt::UInt(RValue<UShort> cast)
+{
+	Value *integer = Nucleus::createZExt(cast.value, UInt::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	UInt::UInt(RValue<Long> cast)
-	{
-		Value *integer = Nucleus::createTrunc(cast.value, UInt::getType());
+UInt::UInt(RValue<Long> cast)
+{
+	Value *integer = Nucleus::createTrunc(cast.value, UInt::getType());
 
-		storeValue(integer);
-	}
+	storeValue(integer);
+}
 
-	UInt::UInt(int x)
-	{
-		storeValue(Nucleus::createConstantInt(x));
-	}
+UInt::UInt(int x)
+{
+	storeValue(Nucleus::createConstantInt(x));
+}
 
-	UInt::UInt(unsigned int x)
-	{
-		storeValue(Nucleus::createConstantInt(x));
-	}
+UInt::UInt(unsigned int x)
+{
+	storeValue(Nucleus::createConstantInt(x));
+}
 
-	UInt::UInt(RValue<UInt> rhs)
-	{
-		storeValue(rhs.value);
-	}
+UInt::UInt(RValue<UInt> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	UInt::UInt(RValue<Int> rhs)
-	{
-		storeValue(rhs.value);
-	}
+UInt::UInt(RValue<Int> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	UInt::UInt(const UInt &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt::UInt(const UInt &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UInt::UInt(const Reference<UInt> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt::UInt(const Reference<UInt> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UInt::UInt(const Int &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt::UInt(const Int &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UInt::UInt(const Reference<Int> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt::UInt(const Reference<Int> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<UInt> UInt::operator=(unsigned int rhs)
-	{
-		return RValue<UInt>(storeValue(Nucleus::createConstantInt(rhs)));
-	}
+RValue<UInt> UInt::operator=(unsigned int rhs)
+{
+	return RValue<UInt>(storeValue(Nucleus::createConstantInt(rhs)));
+}
 
-	RValue<UInt> UInt::operator=(RValue<UInt> rhs)
-	{
-		storeValue(rhs.value);
+RValue<UInt> UInt::operator=(RValue<UInt> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<UInt> UInt::operator=(RValue<Int> rhs)
-	{
-		storeValue(rhs.value);
+RValue<UInt> UInt::operator=(RValue<Int> rhs)
+{
+	storeValue(rhs.value);
 
-		return RValue<UInt>(rhs);
-	}
+	return RValue<UInt>(rhs);
+}
 
-	RValue<UInt> UInt::operator=(const UInt &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UInt> UInt::operator=(const UInt &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UInt>(value);
-	}
+	return RValue<UInt>(value);
+}
 
-	RValue<UInt> UInt::operator=(const Reference<UInt> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UInt> UInt::operator=(const Reference<UInt> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UInt>(value);
-	}
+	return RValue<UInt>(value);
+}
 
-	RValue<UInt> UInt::operator=(const Int &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UInt> UInt::operator=(const Int &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UInt>(value);
-	}
+	return RValue<UInt>(value);
+}
 
-	RValue<UInt> UInt::operator=(const Reference<Int> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UInt> UInt::operator=(const Reference<Int> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UInt>(value);
-	}
+	return RValue<UInt>(value);
+}
 
-	RValue<UInt> operator+(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<UInt> operator+(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator-(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<UInt> operator-(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator*(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<UInt> operator*(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator/(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createUDiv(lhs.value, rhs.value));
-	}
+RValue<UInt> operator/(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createUDiv(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator%(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createURem(lhs.value, rhs.value));
-	}
+RValue<UInt> operator%(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createURem(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator&(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<UInt> operator&(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator|(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<UInt> operator|(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator^(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<UInt> operator^(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator<<(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createShl(lhs.value, rhs.value));
-	}
+RValue<UInt> operator<<(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createShl(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator>>(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<UInt>(Nucleus::createLShr(lhs.value, rhs.value));
-	}
+RValue<UInt> operator>>(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<UInt>(Nucleus::createLShr(lhs.value, rhs.value));
+}
 
-	RValue<UInt> operator+=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<UInt> operator+=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<UInt> operator-=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<UInt> operator-=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<UInt> operator*=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<UInt> operator*=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
-	RValue<UInt> operator/=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs / rhs;
-	}
+RValue<UInt> operator/=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs / rhs;
+}
 
-	RValue<UInt> operator%=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs % rhs;
-	}
+RValue<UInt> operator%=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs % rhs;
+}
 
-	RValue<UInt> operator&=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<UInt> operator&=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<UInt> operator|=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<UInt> operator|=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<UInt> operator^=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<UInt> operator^=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<UInt> operator<<=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<UInt> operator<<=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<UInt> operator>>=(UInt &lhs, RValue<UInt> rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<UInt> operator>>=(UInt &lhs, RValue<UInt> rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
-	RValue<UInt> operator+(RValue<UInt> val)
-	{
-		return val;
-	}
+RValue<UInt> operator+(RValue<UInt> val)
+{
+	return val;
+}
 
-	RValue<UInt> operator-(RValue<UInt> val)
-	{
-		return RValue<UInt>(Nucleus::createNeg(val.value));
-	}
+RValue<UInt> operator-(RValue<UInt> val)
+{
+	return RValue<UInt>(Nucleus::createNeg(val.value));
+}
 
-	RValue<UInt> operator~(RValue<UInt> val)
-	{
-		return RValue<UInt>(Nucleus::createNot(val.value));
-	}
+RValue<UInt> operator~(RValue<UInt> val)
+{
+	return RValue<UInt>(Nucleus::createNot(val.value));
+}
 
-	RValue<UInt> Max(RValue<UInt> x, RValue<UInt> y)
-	{
-		return IfThenElse(x > y, x, y);
-	}
+RValue<UInt> Max(RValue<UInt> x, RValue<UInt> y)
+{
+	return IfThenElse(x > y, x, y);
+}
 
-	RValue<UInt> Min(RValue<UInt> x, RValue<UInt> y)
-	{
-		return IfThenElse(x < y, x, y);
-	}
+RValue<UInt> Min(RValue<UInt> x, RValue<UInt> y)
+{
+	return IfThenElse(x < y, x, y);
+}
 
-	RValue<UInt> Clamp(RValue<UInt> x, RValue<UInt> min, RValue<UInt> max)
-	{
-		return Min(Max(x, min), max);
-	}
+RValue<UInt> Clamp(RValue<UInt> x, RValue<UInt> min, RValue<UInt> max)
+{
+	return Min(Max(x, min), max);
+}
 
-	RValue<Bool> operator<(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator<=(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator<=(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator>=(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator>=(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator!=(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
-	}
+RValue<Bool> operator!=(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
+}
 
-	RValue<Bool> operator==(RValue<UInt> lhs, RValue<UInt> rhs)
-	{
-		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
-	}
+RValue<Bool> operator==(RValue<UInt> lhs, RValue<UInt> rhs)
+{
+	return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
+}
 
-	Int2::Int2(RValue<Int4> cast)
-	{
-		storeValue(Nucleus::createBitCast(cast.value, getType()));
-	}
+Int2::Int2(RValue<Int4> cast)
+{
+	storeValue(Nucleus::createBitCast(cast.value, getType()));
+}
 
-	Int2::Int2(int x, int y)
-	{
-		int64_t constantVector[2] = {x, y};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+Int2::Int2(int x, int y)
+{
+	int64_t constantVector[2] = {x, y};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	Int2::Int2(RValue<Int2> rhs)
-	{
-		storeValue(rhs.value);
-	}
+Int2::Int2(RValue<Int2> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	Int2::Int2(const Int2 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int2::Int2(const Int2 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int2::Int2(const Reference<Int2> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int2::Int2(const Reference<Int2> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int2::Int2(RValue<Int> lo, RValue<Int> hi)
-	{
-		int shuffle[4] = {0, 4, 1, 5};
-		Value *packed = Nucleus::createShuffleVector(Int4(lo).loadValue(), Int4(hi).loadValue(), shuffle);
+Int2::Int2(RValue<Int> lo, RValue<Int> hi)
+{
+	int shuffle[4] = {0, 4, 1, 5};
+	Value *packed = Nucleus::createShuffleVector(Int4(lo).loadValue(), Int4(hi).loadValue(), shuffle);
 
-		storeValue(Nucleus::createBitCast(packed, Int2::getType()));
-	}
+	storeValue(Nucleus::createBitCast(packed, Int2::getType()));
+}
 
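Note: the {0, 4, 1, 5} mask above assumes LLVM shufflevector indexing, where indices 0-3 select lanes from the first operand and 4-7 from the second, so the mask interleaves the low halves of the two vectors; the Int2 result then keeps only the first two packed lanes. A scalar model of that convention (hypothetical, for illustration only):

#include <cstdio>

int main()
{
	int a[4] = {0, 1, 2, 3};          // first operand's lanes
	int b[4] = {100, 101, 102, 103};  // second operand's lanes
	int mask[4] = {0, 4, 1, 5};       // interleave low halves: a0 b0 a1 b1

	for(int i = 0; i < 4; i++)
	{
		int m = mask[i];
		printf("%d ", m < 4 ? a[m] : b[m - 4]);  // prints: 0 100 1 101
	}

	return 0;
}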
-	RValue<Int2> Int2::operator=(RValue<Int2> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Int2> Int2::operator=(RValue<Int2> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Int2> Int2::operator=(const Int2 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Int2> Int2::operator=(const Int2 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Int2>(value);
-	}
+	return RValue<Int2>(value);
+}
 
-	RValue<Int2> Int2::operator=(const Reference<Int2> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Int2> Int2::operator=(const Reference<Int2> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Int2>(value);
-	}
+	return RValue<Int2>(value);
+}
 
-	RValue<Int2> operator+(RValue<Int2> lhs, RValue<Int2> rhs)
-	{
-		return RValue<Int2>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Int2> operator+(RValue<Int2> lhs, RValue<Int2> rhs)
+{
+	return RValue<Int2>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<Int2> operator-(RValue<Int2> lhs, RValue<Int2> rhs)
-	{
-		return RValue<Int2>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<Int2> operator-(RValue<Int2> lhs, RValue<Int2> rhs)
+{
+	return RValue<Int2>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
 //	RValue<Int2> operator*(RValue<Int2> lhs, RValue<Int2> rhs)
 //	{
@@ -3010,30 +3011,30 @@
 //		return RValue<Int2>(Nucleus::createSRem(lhs.value, rhs.value));
 //	}
 
-	RValue<Int2> operator&(RValue<Int2> lhs, RValue<Int2> rhs)
-	{
-		return RValue<Int2>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Int2> operator&(RValue<Int2> lhs, RValue<Int2> rhs)
+{
+	return RValue<Int2>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Int2> operator|(RValue<Int2> lhs, RValue<Int2> rhs)
-	{
-		return RValue<Int2>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<Int2> operator|(RValue<Int2> lhs, RValue<Int2> rhs)
+{
+	return RValue<Int2>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<Int2> operator^(RValue<Int2> lhs, RValue<Int2> rhs)
-	{
-		return RValue<Int2>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<Int2> operator^(RValue<Int2> lhs, RValue<Int2> rhs)
+{
+	return RValue<Int2>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<Int2> operator+=(Int2 &lhs, RValue<Int2> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<Int2> operator+=(Int2 &lhs, RValue<Int2> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<Int2> operator-=(Int2 &lhs, RValue<Int2> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<Int2> operator-=(Int2 &lhs, RValue<Int2> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
 //	RValue<Int2> operator*=(Int2 &lhs, RValue<Int2> rhs)
 //	{
@@ -3050,30 +3051,30 @@
 //		return lhs = lhs % rhs;
 //	}
 
-	RValue<Int2> operator&=(Int2 &lhs, RValue<Int2> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<Int2> operator&=(Int2 &lhs, RValue<Int2> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<Int2> operator|=(Int2 &lhs, RValue<Int2> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<Int2> operator|=(Int2 &lhs, RValue<Int2> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<Int2> operator^=(Int2 &lhs, RValue<Int2> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<Int2> operator^=(Int2 &lhs, RValue<Int2> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<Int2> operator<<=(Int2 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<Int2> operator<<=(Int2 &lhs, unsigned char rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<Int2> operator>>=(Int2 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<Int2> operator>>=(Int2 &lhs, unsigned char rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
 //	RValue<Int2> operator+(RValue<Int2> val)
 //	{
@@ -3085,89 +3086,89 @@
 //		return RValue<Int2>(Nucleus::createNeg(val.value));
 //	}
 
-	RValue<Int2> operator~(RValue<Int2> val)
-	{
-		return RValue<Int2>(Nucleus::createNot(val.value));
-	}
+RValue<Int2> operator~(RValue<Int2> val)
+{
+	return RValue<Int2>(Nucleus::createNot(val.value));
+}
 
-	RValue<Short4> UnpackLow(RValue<Int2> x, RValue<Int2> y)
-	{
-		int shuffle[4] = {0, 4, 1, 5};   // Real type is v4i32
-		return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-	}
+RValue<Short4> UnpackLow(RValue<Int2> x, RValue<Int2> y)
+{
+	int shuffle[4] = {0, 4, 1, 5};   // Real type is v4i32
+	return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+}
 
-	RValue<Short4> UnpackHigh(RValue<Int2> x, RValue<Int2> y)
-	{
-		int shuffle[4] = {0, 4, 1, 5};   // Real type is v4i32
-		auto lowHigh = RValue<Int4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-		return As<Short4>(Swizzle(lowHigh, 0x2323));
-	}
+RValue<Short4> UnpackHigh(RValue<Int2> x, RValue<Int2> y)
+{
+	int shuffle[4] = {0, 4, 1, 5};   // Real type is v4i32
+	auto lowHigh = RValue<Int4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+	return As<Short4>(Swizzle(lowHigh, 0x2323));
+}
 
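Note: UnpackHigh reuses the interleave shuffle and then swizzles with 0x2323 to extract the upper interleaved pair. The 16-bit selector appears to encode one source lane per hex digit, mapped to output lanes left to right; a scalar sketch of that assumed encoding (swizzle4 here is a hypothetical helper, not the Nucleus implementation):

// Each hex digit of 'select' names a source lane; digits map to output
// lanes left to right, so 0x2323 yields {2, 3, 2, 3}.
void swizzle4(const int src[4], int dst[4], unsigned short select)
{
	for(int i = 0; i < 4; i++)
	{
		dst[i] = src[(select >> (12 - 4 * i)) & 0x3];
	}
}

With src = {x0, y0, x1, y1} from the interleave, 0x2323 yields {x1, y1, x1, y1}; reinterpreting the low 64 bits as Short4 then returns the high lanes of x and y.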
-	RValue<Int> Extract(RValue<Int2> val, int i)
-	{
-		return RValue<Int>(Nucleus::createExtractElement(val.value, Int::getType(), i));
-	}
+RValue<Int> Extract(RValue<Int2> val, int i)
+{
+	return RValue<Int>(Nucleus::createExtractElement(val.value, Int::getType(), i));
+}
 
-	RValue<Int2> Insert(RValue<Int2> val, RValue<Int> element, int i)
-	{
-		return RValue<Int2>(Nucleus::createInsertElement(val.value, element.value, i));
-	}
+RValue<Int2> Insert(RValue<Int2> val, RValue<Int> element, int i)
+{
+	return RValue<Int2>(Nucleus::createInsertElement(val.value, element.value, i));
+}
 
-	UInt2::UInt2(unsigned int x, unsigned int y)
-	{
-		int64_t constantVector[2] = {x, y};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+UInt2::UInt2(unsigned int x, unsigned int y)
+{
+	int64_t constantVector[2] = {x, y};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	UInt2::UInt2(RValue<UInt2> rhs)
-	{
-		storeValue(rhs.value);
-	}
+UInt2::UInt2(RValue<UInt2> rhs)
+{
+	storeValue(rhs.value);
+}
 
-	UInt2::UInt2(const UInt2 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt2::UInt2(const UInt2 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UInt2::UInt2(const Reference<UInt2> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt2::UInt2(const Reference<UInt2> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	RValue<UInt2> UInt2::operator=(RValue<UInt2> rhs)
-	{
-		storeValue(rhs.value);
+RValue<UInt2> UInt2::operator=(RValue<UInt2> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<UInt2> UInt2::operator=(const UInt2 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UInt2> UInt2::operator=(const UInt2 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UInt2>(value);
-	}
+	return RValue<UInt2>(value);
+}
 
-	RValue<UInt2> UInt2::operator=(const Reference<UInt2> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UInt2> UInt2::operator=(const Reference<UInt2> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UInt2>(value);
-	}
+	return RValue<UInt2>(value);
+}
 
-	RValue<UInt2> operator+(RValue<UInt2> lhs, RValue<UInt2> rhs)
-	{
-		return RValue<UInt2>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<UInt2> operator+(RValue<UInt2> lhs, RValue<UInt2> rhs)
+{
+	return RValue<UInt2>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<UInt2> operator-(RValue<UInt2> lhs, RValue<UInt2> rhs)
-	{
-		return RValue<UInt2>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<UInt2> operator-(RValue<UInt2> lhs, RValue<UInt2> rhs)
+{
+	return RValue<UInt2>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
 //	RValue<UInt2> operator*(RValue<UInt2> lhs, RValue<UInt2> rhs)
 //	{
@@ -3184,30 +3185,30 @@
 //		return RValue<UInt2>(Nucleus::createURem(lhs.value, rhs.value));
 //	}
 
-	RValue<UInt2> operator&(RValue<UInt2> lhs, RValue<UInt2> rhs)
-	{
-		return RValue<UInt2>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<UInt2> operator&(RValue<UInt2> lhs, RValue<UInt2> rhs)
+{
+	return RValue<UInt2>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<UInt2> operator|(RValue<UInt2> lhs, RValue<UInt2> rhs)
-	{
-		return RValue<UInt2>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<UInt2> operator|(RValue<UInt2> lhs, RValue<UInt2> rhs)
+{
+	return RValue<UInt2>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<UInt2> operator^(RValue<UInt2> lhs, RValue<UInt2> rhs)
-	{
-		return RValue<UInt2>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<UInt2> operator^(RValue<UInt2> lhs, RValue<UInt2> rhs)
+{
+	return RValue<UInt2>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<UInt2> operator+=(UInt2 &lhs, RValue<UInt2> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<UInt2> operator+=(UInt2 &lhs, RValue<UInt2> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<UInt2> operator-=(UInt2 &lhs, RValue<UInt2> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<UInt2> operator-=(UInt2 &lhs, RValue<UInt2> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
 //	RValue<UInt2> operator*=(UInt2 &lhs, RValue<UInt2> rhs)
 //	{
@@ -3224,30 +3225,30 @@
 //		return lhs = lhs % rhs;
 //	}
 
-	RValue<UInt2> operator&=(UInt2 &lhs, RValue<UInt2> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<UInt2> operator&=(UInt2 &lhs, RValue<UInt2> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<UInt2> operator|=(UInt2 &lhs, RValue<UInt2> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<UInt2> operator|=(UInt2 &lhs, RValue<UInt2> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<UInt2> operator^=(UInt2 &lhs, RValue<UInt2> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<UInt2> operator^=(UInt2 &lhs, RValue<UInt2> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<UInt2> operator<<=(UInt2 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<UInt2> operator<<=(UInt2 &lhs, unsigned char rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<UInt2> operator>>=(UInt2 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<UInt2> operator>>=(UInt2 &lhs, unsigned char rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
 //	RValue<UInt2> operator+(RValue<UInt2> val)
 //	{
@@ -3259,197 +3260,197 @@
 //		return RValue<UInt2>(Nucleus::createNeg(val.value));
 //	}
 
-	RValue<UInt2> operator~(RValue<UInt2> val)
-	{
-		return RValue<UInt2>(Nucleus::createNot(val.value));
-	}
+RValue<UInt2> operator~(RValue<UInt2> val)
+{
+	return RValue<UInt2>(Nucleus::createNot(val.value));
+}
 
-	RValue<UInt> Extract(RValue<UInt2> val, int i)
-	{
-		return RValue<UInt>(Nucleus::createExtractElement(val.value, UInt::getType(), i));
-	}
+RValue<UInt> Extract(RValue<UInt2> val, int i)
+{
+	return RValue<UInt>(Nucleus::createExtractElement(val.value, UInt::getType(), i));
+}
 
-	RValue<UInt2> Insert(RValue<UInt2> val, RValue<UInt> element, int i)
-	{
-		return RValue<UInt2>(Nucleus::createInsertElement(val.value, element.value, i));
-	}
+RValue<UInt2> Insert(RValue<UInt2> val, RValue<UInt> element, int i)
+{
+	return RValue<UInt2>(Nucleus::createInsertElement(val.value, element.value, i));
+}
 
-	Int4::Int4() : XYZW(this)
-	{
-	}
+Int4::Int4() : XYZW(this)
+{
+}
 
-	Int4::Int4(RValue<Float4> cast) : XYZW(this)
-	{
-		Value *xyzw = Nucleus::createFPToSI(cast.value, Int4::getType());
+Int4::Int4(RValue<Float4> cast) : XYZW(this)
+{
+	Value *xyzw = Nucleus::createFPToSI(cast.value, Int4::getType());
 
-		storeValue(xyzw);
-	}
+	storeValue(xyzw);
+}
 
-	Int4::Int4(int xyzw) : XYZW(this)
-	{
-		constant(xyzw, xyzw, xyzw, xyzw);
-	}
+Int4::Int4(int xyzw) : XYZW(this)
+{
+	constant(xyzw, xyzw, xyzw, xyzw);
+}
 
-	Int4::Int4(int x, int yzw) : XYZW(this)
-	{
-		constant(x, yzw, yzw, yzw);
-	}
+Int4::Int4(int x, int yzw) : XYZW(this)
+{
+	constant(x, yzw, yzw, yzw);
+}
 
-	Int4::Int4(int x, int y, int zw) : XYZW(this)
-	{
-		constant(x, y, zw, zw);
-	}
+Int4::Int4(int x, int y, int zw) : XYZW(this)
+{
+	constant(x, y, zw, zw);
+}
 
-	Int4::Int4(int x, int y, int z, int w) : XYZW(this)
-	{
-		constant(x, y, z, w);
-	}
+Int4::Int4(int x, int y, int z, int w) : XYZW(this)
+{
+	constant(x, y, z, w);
+}
 
-	void Int4::constant(int x, int y, int z, int w)
-	{
-		int64_t constantVector[4] = {x, y, z, w};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+void Int4::constant(int x, int y, int z, int w)
+{
+	int64_t constantVector[4] = {x, y, z, w};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	Int4::Int4(RValue<Int4> rhs) : XYZW(this)
-	{
-		storeValue(rhs.value);
-	}
+Int4::Int4(RValue<Int4> rhs) : XYZW(this)
+{
+	storeValue(rhs.value);
+}
 
-	Int4::Int4(const Int4 &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int4::Int4(const Int4 &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int4::Int4(const Reference<Int4> &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int4::Int4(const Reference<Int4> &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int4::Int4(RValue<UInt4> rhs) : XYZW(this)
-	{
-		storeValue(rhs.value);
-	}
+Int4::Int4(RValue<UInt4> rhs) : XYZW(this)
+{
+	storeValue(rhs.value);
+}
 
-	Int4::Int4(const UInt4 &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int4::Int4(const UInt4 &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int4::Int4(const Reference<UInt4> &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+Int4::Int4(const Reference<UInt4> &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	Int4::Int4(RValue<Int2> lo, RValue<Int2> hi) : XYZW(this)
-	{
-		int shuffle[4] = {0, 1, 4, 5};   // Real type is v4i32
-		Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
+Int4::Int4(RValue<Int2> lo, RValue<Int2> hi) : XYZW(this)
+{
+	int shuffle[4] = {0, 1, 4, 5};   // Real type is v4i32
+	Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
 
-		storeValue(packed);
-	}
+	storeValue(packed);
+}
 
-	Int4::Int4(const Int &rhs) : XYZW(this)
-	{
-		*this = RValue<Int>(rhs.loadValue());
-	}
+Int4::Int4(const Int &rhs) : XYZW(this)
+{
+	*this = RValue<Int>(rhs.loadValue());
+}
 
-	Int4::Int4(const Reference<Int> &rhs) : XYZW(this)
-	{
-		*this = RValue<Int>(rhs.loadValue());
-	}
+Int4::Int4(const Reference<Int> &rhs) : XYZW(this)
+{
+	*this = RValue<Int>(rhs.loadValue());
+}
 
-	RValue<Int4> Int4::operator=(RValue<Int4> rhs)
-	{
-		storeValue(rhs.value);
+RValue<Int4> Int4::operator=(RValue<Int4> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<Int4> Int4::operator=(const Int4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Int4> Int4::operator=(const Int4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Int4>(value);
-	}
+	return RValue<Int4>(value);
+}
 
-	RValue<Int4> Int4::operator=(const Reference<Int4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<Int4> Int4::operator=(const Reference<Int4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<Int4>(value);
-	}
+	return RValue<Int4>(value);
+}
 
-	RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator-(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<Int4> operator-(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator*(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<Int4> operator*(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator/(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createSDiv(lhs.value, rhs.value));
-	}
+RValue<Int4> operator/(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createSDiv(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator%(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createSRem(lhs.value, rhs.value));
-	}
+RValue<Int4> operator%(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createSRem(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator&(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<Int4> operator&(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator|(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<Int4> operator|(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator^(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<Int4> operator^(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator<<(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createShl(lhs.value, rhs.value));
-	}
+RValue<Int4> operator<<(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createShl(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator>>(RValue<Int4> lhs, RValue<Int4> rhs)
-	{
-		return RValue<Int4>(Nucleus::createAShr(lhs.value, rhs.value));
-	}
+RValue<Int4> operator>>(RValue<Int4> lhs, RValue<Int4> rhs)
+{
+	return RValue<Int4>(Nucleus::createAShr(lhs.value, rhs.value));
+}
 
-	RValue<Int4> operator+=(Int4 &lhs, RValue<Int4> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<Int4> operator+=(Int4 &lhs, RValue<Int4> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<Int4> operator-=(Int4 &lhs, RValue<Int4> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<Int4> operator-=(Int4 &lhs, RValue<Int4> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<Int4> operator*=(Int4 &lhs, RValue<Int4> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<Int4> operator*=(Int4 &lhs, RValue<Int4> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
 //	RValue<Int4> operator/=(Int4 &lhs, RValue<Int4> rhs)
 //	{
@@ -3461,235 +3462,235 @@
 //		return lhs = lhs % rhs;
 //	}
 
-	RValue<Int4> operator&=(Int4 &lhs, RValue<Int4> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
+RValue<Int4> operator&=(Int4 &lhs, RValue<Int4> rhs)
+{
+	return lhs = lhs & rhs;
+}
 
-	RValue<Int4> operator|=(Int4 &lhs, RValue<Int4> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
+RValue<Int4> operator|=(Int4 &lhs, RValue<Int4> rhs)
+{
+	return lhs = lhs | rhs;
+}
 
-	RValue<Int4> operator^=(Int4 &lhs, RValue<Int4> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
+RValue<Int4> operator^=(Int4 &lhs, RValue<Int4> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
 
-	RValue<Int4> operator<<=(Int4 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs << rhs;
-	}
+RValue<Int4> operator<<=(Int4 &lhs, unsigned char rhs)
+{
+	return lhs = lhs << rhs;
+}
 
-	RValue<Int4> operator>>=(Int4 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
+RValue<Int4> operator>>=(Int4 &lhs, unsigned char rhs)
+{
+	return lhs = lhs >> rhs;
+}
 
-	RValue<Int4> operator+(RValue<Int4> val)
-	{
-		return val;
-	}
+RValue<Int4> operator+(RValue<Int4> val)
+{
+	return val;
+}
 
-	RValue<Int4> operator-(RValue<Int4> val)
-	{
-		return RValue<Int4>(Nucleus::createNeg(val.value));
-	}
+RValue<Int4> operator-(RValue<Int4> val)
+{
+	return RValue<Int4>(Nucleus::createNeg(val.value));
+}
 
-	RValue<Int4> operator~(RValue<Int4> val)
-	{
-		return RValue<Int4>(Nucleus::createNot(val.value));
-	}
+RValue<Int4> operator~(RValue<Int4> val)
+{
+	return RValue<Int4>(Nucleus::createNot(val.value));
+}
 
-	RValue<Int> Extract(RValue<Int4> x, int i)
-	{
-		return RValue<Int>(Nucleus::createExtractElement(x.value, Int::getType(), i));
-	}
+RValue<Int> Extract(RValue<Int4> x, int i)
+{
+	return RValue<Int>(Nucleus::createExtractElement(x.value, Int::getType(), i));
+}
 
-	RValue<Int4> Insert(RValue<Int4> x, RValue<Int> element, int i)
-	{
-		return RValue<Int4>(Nucleus::createInsertElement(x.value, element.value, i));
-	}
+RValue<Int4> Insert(RValue<Int4> x, RValue<Int> element, int i)
+{
+	return RValue<Int4>(Nucleus::createInsertElement(x.value, element.value, i));
+}
 
-	RValue<Int4> Swizzle(RValue<Int4> x, uint16_t select)
-	{
-		return RValue<Int4>(createSwizzle4(x.value, select));
-	}
+RValue<Int4> Swizzle(RValue<Int4> x, uint16_t select)
+{
+	return RValue<Int4>(createSwizzle4(x.value, select));
+}
 
-	RValue<Int4> Shuffle(RValue<Int4> x, RValue<Int4> y, unsigned short select)
-	{
-		return RValue<Int4>(createBlend4(x.value, y.value, select));
-	}
+RValue<Int4> Shuffle(RValue<Int4> x, RValue<Int4> y, unsigned short select)
+{
+	return RValue<Int4>(createBlend4(x.value, y.value, select));
+}
 
-	UInt4::UInt4() : XYZW(this)
-	{
-	}
+UInt4::UInt4() : XYZW(this)
+{
+}
 
-	UInt4::UInt4(int xyzw) : XYZW(this)
-	{
-		constant(xyzw, xyzw, xyzw, xyzw);
-	}
+UInt4::UInt4(int xyzw) : XYZW(this)
+{
+	constant(xyzw, xyzw, xyzw, xyzw);
+}
 
-	UInt4::UInt4(int x, int yzw) : XYZW(this)
-	{
-		constant(x, yzw, yzw, yzw);
-	}
+UInt4::UInt4(int x, int yzw) : XYZW(this)
+{
+	constant(x, yzw, yzw, yzw);
+}
 
-	UInt4::UInt4(int x, int y, int zw) : XYZW(this)
-	{
-		constant(x, y, zw, zw);
-	}
+UInt4::UInt4(int x, int y, int zw) : XYZW(this)
+{
+	constant(x, y, zw, zw);
+}
 
-	UInt4::UInt4(int x, int y, int z, int w) : XYZW(this)
-	{
-		constant(x, y, z, w);
-	}
+UInt4::UInt4(int x, int y, int z, int w) : XYZW(this)
+{
+	constant(x, y, z, w);
+}
 
-	void UInt4::constant(int x, int y, int z, int w)
-	{
-		int64_t constantVector[4] = {x, y, z, w};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
+void UInt4::constant(int x, int y, int z, int w)
+{
+	int64_t constantVector[4] = {x, y, z, w};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
 
-	UInt4::UInt4(RValue<UInt4> rhs) : XYZW(this)
-	{
-		storeValue(rhs.value);
-	}
+UInt4::UInt4(RValue<UInt4> rhs) : XYZW(this)
+{
+	storeValue(rhs.value);
+}
 
-	UInt4::UInt4(const UInt4 &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt4::UInt4(const UInt4 &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UInt4::UInt4(const Reference<UInt4> &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt4::UInt4(const Reference<UInt4> &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UInt4::UInt4(RValue<Int4> rhs) : XYZW(this)
-	{
-		storeValue(rhs.value);
-	}
+UInt4::UInt4(RValue<Int4> rhs) : XYZW(this)
+{
+	storeValue(rhs.value);
+}
 
-	UInt4::UInt4(const Int4 &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt4::UInt4(const Int4 &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UInt4::UInt4(const Reference<Int4> &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
+UInt4::UInt4(const Reference<Int4> &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
 
-	UInt4::UInt4(RValue<UInt2> lo, RValue<UInt2> hi) : XYZW(this)
-	{
-		int shuffle[4] = {0, 1, 4, 5};   // Real type is v4i32
-		Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
+UInt4::UInt4(RValue<UInt2> lo, RValue<UInt2> hi) : XYZW(this)
+{
+	int shuffle[4] = {0, 1, 4, 5};   // Real type is v4i32
+	Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
 
-		storeValue(packed);
-	}
+	storeValue(packed);
+}
 
-	UInt4::UInt4(const UInt &rhs) : XYZW(this)
-	{
-		*this = RValue<UInt>(rhs.loadValue());
-	}
+UInt4::UInt4(const UInt &rhs) : XYZW(this)
+{
+	*this = RValue<UInt>(rhs.loadValue());
+}
 
-	UInt4::UInt4(const Reference<UInt> &rhs) : XYZW(this)
-	{
-		*this = RValue<UInt>(rhs.loadValue());
-	}
+UInt4::UInt4(const Reference<UInt> &rhs) : XYZW(this)
+{
+	*this = RValue<UInt>(rhs.loadValue());
+}
 
-	RValue<UInt4> UInt4::operator=(RValue<UInt4> rhs)
-	{
-		storeValue(rhs.value);
+RValue<UInt4> UInt4::operator=(RValue<UInt4> rhs)
+{
+	storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	RValue<UInt4> UInt4::operator=(const UInt4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UInt4> UInt4::operator=(const UInt4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UInt4>(value);
-	}
+	return RValue<UInt4>(value);
+}
 
-	RValue<UInt4> UInt4::operator=(const Reference<UInt4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
+RValue<UInt4> UInt4::operator=(const Reference<UInt4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
 
-		return RValue<UInt4>(value);
-	}
+	return RValue<UInt4>(value);
+}
 
-	RValue<UInt4> operator+(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createAdd(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator+(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createAdd(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator-(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createSub(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator-(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createSub(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator*(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createMul(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator*(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createMul(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator/(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createUDiv(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator/(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createUDiv(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator%(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createURem(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator%(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createURem(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator&(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createAnd(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator&(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createAnd(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator|(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createOr(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator|(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createOr(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator^(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createXor(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator^(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createXor(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator<<(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createShl(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator<<(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createShl(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator>>(RValue<UInt4> lhs, RValue<UInt4> rhs)
-	{
-		return RValue<UInt4>(Nucleus::createLShr(lhs.value, rhs.value));
-	}
+RValue<UInt4> operator>>(RValue<UInt4> lhs, RValue<UInt4> rhs)
+{
+	return RValue<UInt4>(Nucleus::createLShr(lhs.value, rhs.value));
+}
 
-	RValue<UInt4> operator+=(UInt4 &lhs, RValue<UInt4> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
+RValue<UInt4> operator+=(UInt4 &lhs, RValue<UInt4> rhs)
+{
+	return lhs = lhs + rhs;
+}
 
-	RValue<UInt4> operator-=(UInt4 &lhs, RValue<UInt4> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
+RValue<UInt4> operator-=(UInt4 &lhs, RValue<UInt4> rhs)
+{
+	return lhs = lhs - rhs;
+}
 
-	RValue<UInt4> operator*=(UInt4 &lhs, RValue<UInt4> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
+RValue<UInt4> operator*=(UInt4 &lhs, RValue<UInt4> rhs)
+{
+	return lhs = lhs * rhs;
+}
 
 //	RValue<UInt4> operator/=(UInt4 &lhs, RValue<UInt4> rhs)
 //	{
@@ -3701,722 +3702,723 @@
 //		return lhs = lhs % rhs;
 //	}
 
-	RValue<UInt4> operator&=(UInt4 &lhs, RValue<UInt4> rhs)
-	{
-		return lhs = lhs & rhs;
-	}
-
-	RValue<UInt4> operator|=(UInt4 &lhs, RValue<UInt4> rhs)
-	{
-		return lhs = lhs | rhs;
-	}
-
-	RValue<UInt4> operator^=(UInt4 &lhs, RValue<UInt4> rhs)
-	{
-		return lhs = lhs ^ rhs;
-	}
-
-	RValue<UInt4> operator<<=(UInt4 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs << rhs;
-	}
-
-	RValue<UInt4> operator>>=(UInt4 &lhs, unsigned char rhs)
-	{
-		return lhs = lhs >> rhs;
-	}
-
-	RValue<UInt4> operator+(RValue<UInt4> val)
-	{
-		return val;
-	}
-
-	RValue<UInt4> operator-(RValue<UInt4> val)
-	{
-		return RValue<UInt4>(Nucleus::createNeg(val.value));
-	}
-
-	RValue<UInt4> operator~(RValue<UInt4> val)
-	{
-		return RValue<UInt4>(Nucleus::createNot(val.value));
-	}
-
-	RValue<UInt> Extract(RValue<UInt4> x, int i)
-	{
-		return RValue<UInt>(Nucleus::createExtractElement(x.value, Int::getType(), i));
-	}
-
-	RValue<UInt4> Insert(RValue<UInt4> x, RValue<UInt> element, int i)
-	{
-		return RValue<UInt4>(Nucleus::createInsertElement(x.value, element.value, i));
-	}
-
-	RValue<UInt4> Swizzle(RValue<UInt4> x, uint16_t select)
-	{
-		return RValue<UInt4>(createSwizzle4(x.value, select));
-	}
-
-	RValue<UInt4> Shuffle(RValue<UInt4> x, RValue<UInt4> y, unsigned short select)
-	{
-		return RValue<UInt4>(createBlend4(x.value, y.value, select));
-	}
-
-	Half::Half(RValue<Float> cast)
-	{
-		UInt fp32i = As<UInt>(cast);
-		UInt abs = fp32i & 0x7FFFFFFF;
-		UShort fp16i((fp32i & 0x80000000) >> 16); // sign
-
-		If(abs > 0x47FFEFFF) // Infinity
-		{
-			fp16i |= UShort(0x7FFF);
-		}
-		Else
-		{
-			If(abs < 0x38800000) // Denormal
-			{
-				Int mantissa = (abs & 0x007FFFFF) | 0x00800000;
-				Int e = 113 - (abs >> 23);
-				abs = IfThenElse(e < 24, mantissa >> e, Int(0));
-				fp16i |= UShort((abs + 0x00000FFF + ((abs >> 13) & 1)) >> 13);
-			}
-			Else
-			{
-				fp16i |= UShort((abs + 0xC8000000 + 0x00000FFF + ((abs >> 13) & 1)) >> 13);
-			}
-		}
-
-		storeValue(fp16i.loadValue());
-	}
-
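Note: for reference, a scalar transcription of the Half(Float) lowering above, kept step for step with the Reactor code (floatToHalf is a hypothetical helper, not part of the codebase):

#include <cstdint>
#include <cstring>

uint16_t floatToHalf(float f)
{
	uint32_t fp32i;
	memcpy(&fp32i, &f, sizeof fp32i);

	uint32_t abs = fp32i & 0x7FFFFFFF;
	uint16_t fp16i = (uint16_t)((fp32i & 0x80000000) >> 16);  // sign

	if(abs > 0x47FFEFFF)  // too large for half, infinity, or NaN
	{
		fp16i |= 0x7FFF;
	}
	else if(abs < 0x38800000)  // result is a half denormal
	{
		int mantissa = (abs & 0x007FFFFF) | 0x00800000;  // restore implicit bit
		int e = 113 - (abs >> 23);
		abs = e < 24 ? mantissa >> e : 0;
		fp16i |= (abs + 0x00000FFF + ((abs >> 13) & 1)) >> 13;  // round to nearest even
	}
	else  // normal: rebias exponent from 127 to 15 (0xC8000000 == -0x38000000)
	{
		fp16i |= (abs + 0xC8000000 + 0x00000FFF + ((abs >> 13) & 1)) >> 13;
	}

	return fp16i;
}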
-	Float::Float(RValue<Int> cast)
-	{
-		Value *integer = Nucleus::createSIToFP(cast.value, Float::getType());
-
-		storeValue(integer);
-	}
-
-	Float::Float(RValue<UInt> cast)
-	{
-		RValue<Float> result = Float(Int(cast & UInt(0x7FFFFFFF))) +
-		                       As<Float>((As<Int>(cast) >> 31) & As<Int>(Float(0x80000000u)));
-
-		storeValue(result.value);
-	}
-
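Note: the UInt-to-Float conversion above is built out of the signed conversion, presumably because common backends (e.g. SSE's cvtsi2ss) only convert signed integers natively: the low 31 bits go through the signed path without sign issues, and 2^31 is added back via the bit pattern of Float(0x80000000u) whenever the sign bit of the source was set. A scalar model (uintToFloat is a hypothetical name):

#include <cstdint>
#include <cstring>

float uintToFloat(uint32_t u)
{
	// Low 31 bits are non-negative, so the signed conversion is safe.
	float low = (float)(int32_t)(u & 0x7FFFFFFF);

	// Mask the bit pattern of 2^31f with 0 or all-ones depending on bit 31,
	// yielding either 0.0f or 2147483648.0f to add back.
	float twoPow31 = 2147483648.0f;  // Float(0x80000000u)
	uint32_t bits;
	memcpy(&bits, &twoPow31, sizeof bits);  // As<Int>(...)
	bits &= (uint32_t)((int32_t)u >> 31);   // arithmetic shift: 0 or ~0
	float high;
	memcpy(&high, &bits, sizeof high);      // As<Float>(...)

	return low + high;
}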
-	Float::Float(RValue<Half> cast)
-	{
-		Int fp16i(As<UShort>(cast));
-
-		Int s = (fp16i >> 15) & 0x00000001;
-		Int e = (fp16i >> 10) & 0x0000001F;
-		Int m = fp16i & 0x000003FF;
-
-		UInt fp32i(s << 31);
-		If(e == 0)
-		{
-			If(m != 0)
-			{
-				While((m & 0x00000400) == 0)
-				{
-					m <<= 1;
-					e -= 1;
-				}
-
-				fp32i |= As<UInt>(((e + (127 - 15) + 1) << 23) | ((m & ~0x00000400) << 13));
-			}
-		}
-		Else
-		{
-			fp32i |= As<UInt>(((e + (127 - 15)) << 23) | (m << 13));
-		}
-
-		storeValue(As<Float>(fp32i).value);
-	}
-
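Note: and the inverse, a scalar transcription of the Float(Half) lowering above: a zero exponent with a nonzero mantissa is a half denormal, renormalized by shifting the mantissa up until the implicit bit appears; otherwise the exponent is rebiased from 15 to 127. (halfToFloatBits is a hypothetical helper; like the code above it does not special-case e == 31, so half Inf/NaN do not map to float Inf/NaN.)

#include <cstdint>

uint32_t halfToFloatBits(uint16_t h)
{
	int s = (h >> 15) & 0x0001;
	int e = (h >> 10) & 0x001F;
	int m = h & 0x03FF;

	uint32_t fp32i = (uint32_t)s << 31;

	if(e == 0)
	{
		if(m != 0)  // denormal: renormalize the mantissa
		{
			while((m & 0x0400) == 0)
			{
				m <<= 1;
				e -= 1;
			}

			fp32i |= ((e + (127 - 15) + 1) << 23) | ((m & ~0x0400) << 13);
		}
	}
	else
	{
		fp32i |= ((e + (127 - 15)) << 23) | (m << 13);
	}

	return fp32i;
}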
-	Float::Float(float x)
-	{
-		// C++ does not have a way to write an infinity or NaN literal,
-		// nor does it allow division by zero as a constant expression.
-		// Thus we should not accept inf or NaN as a Reactor Float constant,
-		// as this would typically indicate a bug, and to avoid undefined
-		// behavior.
-		//
-		// This also sidesteps an issue with the LLVM JIT, which only takes
-		// double values for constructing floating-point constants. During the
-		// conversion from single-precision to double, a signaling NaN can
-		// become a quiet NaN, thus altering its bit pattern. Hence this
-		// assert is also helpful for detecting cases where integers are
-		// being reinterpreted as float and then bitcast to integer again,
-		// which does not guarantee preserving the integer value.
-		//
-		// Should infinity and NaN constants be required, methods like
-		// infinity(), quiet_NaN(), and signaling_NaN() could be added
-		// to the Float class.
-		ASSERT(std::isfinite(x));
-
-		storeValue(Nucleus::createConstantFloat(x));
-	}
-
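Note: a small standalone demonstration of the signaling-NaN hazard described in the comment above; on typical hardware (e.g. SSE) the quiet bit is set when an sNaN passes through a float-to-double conversion, so the 32-bit pattern does not survive the round trip:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
	uint32_t snan = 0x7F800001;  // signaling NaN: all-ones exponent, quiet bit clear
	float f;
	memcpy(&f, &snan, sizeof f);

	double d = f;           // typically quietens the NaN
	float back = (float)d;

	uint32_t bits;
	memcpy(&bits, &back, sizeof bits);
	printf("0x%08X -> 0x%08X\n", snan, bits);  // typically 0x7F800001 -> 0x7FC00001

	return 0;
}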
-	Float::Float(RValue<Float> rhs)
-	{
-		storeValue(rhs.value);
-	}
-
-	Float::Float(const Float &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
-
-	Float::Float(const Reference<Float> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
-
-	Float::Float(Argument<Float> argument)
-	{
-		storeValue(argument.value);
-	}
-
-	RValue<Float> Float::operator=(RValue<Float> rhs)
-	{
-		storeValue(rhs.value);
-
-		return rhs;
-	}
-
-	RValue<Float> Float::operator=(const Float &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-
-		return RValue<Float>(value);
-	}
-
-	RValue<Float> Float::operator=(const Reference<Float> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-
-		return RValue<Float>(value);
-	}
-
-	RValue<Float> operator+(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Float>(Nucleus::createFAdd(lhs.value, rhs.value));
-	}
-
-	RValue<Float> operator-(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Float>(Nucleus::createFSub(lhs.value, rhs.value));
-	}
-
-	RValue<Float> operator*(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Float>(Nucleus::createFMul(lhs.value, rhs.value));
-	}
-
-	RValue<Float> operator/(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Float>(Nucleus::createFDiv(lhs.value, rhs.value));
-	}
-
-	RValue<Float> operator+=(Float &lhs, RValue<Float> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
-
-	RValue<Float> operator-=(Float &lhs, RValue<Float> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
-
-	RValue<Float> operator*=(Float &lhs, RValue<Float> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
-
-	RValue<Float> operator/=(Float &lhs, RValue<Float> rhs)
-	{
-		return lhs = lhs / rhs;
-	}
-
-	RValue<Float> operator+(RValue<Float> val)
-	{
-		return val;
-	}
-
-	RValue<Float> operator-(RValue<Float> val)
-	{
-		return RValue<Float>(Nucleus::createFNeg(val.value));
-	}
-
-	RValue<Bool> operator<(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Bool>(Nucleus::createFCmpOLT(lhs.value, rhs.value));
-	}
-
-	RValue<Bool> operator<=(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Bool>(Nucleus::createFCmpOLE(lhs.value, rhs.value));
-	}
-
-	RValue<Bool> operator>(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Bool>(Nucleus::createFCmpOGT(lhs.value, rhs.value));
-	}
-
-	RValue<Bool> operator>=(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Bool>(Nucleus::createFCmpOGE(lhs.value, rhs.value));
-	}
-
-	RValue<Bool> operator!=(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Bool>(Nucleus::createFCmpONE(lhs.value, rhs.value));
-	}
-
-	RValue<Bool> operator==(RValue<Float> lhs, RValue<Float> rhs)
-	{
-		return RValue<Bool>(Nucleus::createFCmpOEQ(lhs.value, rhs.value));
-	}
-
-	RValue<Float> Abs(RValue<Float> x)
-	{
-		return IfThenElse(x > 0.0f, x, -x);
-	}
-
-	RValue<Float> Max(RValue<Float> x, RValue<Float> y)
-	{
-		return IfThenElse(x > y, x, y);
-	}
-
-	RValue<Float> Min(RValue<Float> x, RValue<Float> y)
-	{
-		return IfThenElse(x < y, x, y);
-	}
-
-	Float2::Float2(RValue<Float4> cast)
-	{
-		storeValue(Nucleus::createBitCast(cast.value, getType()));
-	}
-
-	Float4::Float4(RValue<Byte4> cast) : XYZW(this)
-	{
-		Value *a = Int4(cast).loadValue();
-		Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
-
-		storeValue(xyzw);
-	}
-
-	Float4::Float4(RValue<SByte4> cast) : XYZW(this)
-	{
-		Value *a = Int4(cast).loadValue();
-		Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
-
-		storeValue(xyzw);
-	}
-
-	Float4::Float4(RValue<Short4> cast) : XYZW(this)
-	{
-		Int4 c(cast);
-		storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
-	}
-
-	Float4::Float4(RValue<UShort4> cast) : XYZW(this)
-	{
-		Int4 c(cast);
-		storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
-	}
-
-	Float4::Float4(RValue<Int4> cast) : XYZW(this)
-	{
-		Value *xyzw = Nucleus::createSIToFP(cast.value, Float4::getType());
-
-		storeValue(xyzw);
-	}
-
-	Float4::Float4(RValue<UInt4> cast) : XYZW(this)
-	{
-		RValue<Float4> result = Float4(Int4(cast & UInt4(0x7FFFFFFF))) +
-		                        As<Float4>((As<Int4>(cast) >> 31) & As<Int4>(Float4(0x80000000u)));
-
-		storeValue(result.value);
-	}
-
-	Float4::Float4() : XYZW(this)
-	{
-	}
-
-	Float4::Float4(float xyzw) : XYZW(this)
-	{
-		constant(xyzw, xyzw, xyzw, xyzw);
-	}
-
-	Float4::Float4(float x, float yzw) : XYZW(this)
-	{
-		constant(x, yzw, yzw, yzw);
-	}
-
-	Float4::Float4(float x, float y, float zw) : XYZW(this)
-	{
-		constant(x, y, zw, zw);
-	}
-
-	Float4::Float4(float x, float y, float z, float w) : XYZW(this)
-	{
-		constant(x, y, z, w);
-	}
-
-	Float4 Float4::positive_inf()
-	{
-		Float4 result;
-		result.infinity_constant(false);
-		return result;
-	}
-
-	Float4 Float4::negative_inf()
-	{
-		Float4 result;
-		result.infinity_constant(true);
-		return result;
-	}
-
-	void Float4::infinity_constant(bool negative)
-	{
-		double inf = negative ? -INFINITY : INFINITY;
-		double constantVector[4] = {inf, inf, inf, inf};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
-
-	void Float4::constant(float x, float y, float z, float w)
-	{
-		// See Float(float) constructor for the rationale behind this assert.
-		ASSERT(std::isfinite(x) && std::isfinite(y) && std::isfinite(z) && std::isfinite(w));
-
-		double constantVector[4] = {x, y, z, w};
-		storeValue(Nucleus::createConstantVector(constantVector, getType()));
-	}
-
-	Float4::Float4(RValue<Float4> rhs) : XYZW(this)
-	{
-		storeValue(rhs.value);
-	}
-
-	Float4::Float4(const Float4 &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
-
-	Float4::Float4(const Reference<Float4> &rhs) : XYZW(this)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-	}
-
-	Float4::Float4(const Float &rhs) : XYZW(this)
-	{
-		*this = RValue<Float>(rhs.loadValue());
-	}
-
-	Float4::Float4(const Reference<Float> &rhs) : XYZW(this)
-	{
-		*this = RValue<Float>(rhs.loadValue());
-	}
-
-	RValue<Float4> Float4::operator=(float x)
-	{
-		return *this = Float4(x, x, x, x);
-	}
-
-	RValue<Float4> Float4::operator=(RValue<Float4> rhs)
-	{
-		storeValue(rhs.value);
-
-		return rhs;
-	}
-
-	RValue<Float4> Float4::operator=(const Float4 &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-
-		return RValue<Float4>(value);
-	}
-
-	RValue<Float4> Float4::operator=(const Reference<Float4> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		storeValue(value);
-
-		return RValue<Float4>(value);
-	}
-
-	RValue<Float4> Float4::operator=(RValue<Float> rhs)
-	{
-		return *this = Float4(rhs);
-	}
-
-	RValue<Float4> Float4::operator=(const Float &rhs)
-	{
-		return *this = Float4(rhs);
-	}
-
-	RValue<Float4> Float4::operator=(const Reference<Float> &rhs)
-	{
-		return *this = Float4(rhs);
-	}
-
-	RValue<Float4> operator+(RValue<Float4> lhs, RValue<Float4> rhs)
-	{
-		return RValue<Float4>(Nucleus::createFAdd(lhs.value, rhs.value));
-	}
-
-	RValue<Float4> operator-(RValue<Float4> lhs, RValue<Float4> rhs)
-	{
-		return RValue<Float4>(Nucleus::createFSub(lhs.value, rhs.value));
-	}
-
-	RValue<Float4> operator*(RValue<Float4> lhs, RValue<Float4> rhs)
-	{
-		return RValue<Float4>(Nucleus::createFMul(lhs.value, rhs.value));
-	}
-
-	RValue<Float4> operator/(RValue<Float4> lhs, RValue<Float4> rhs)
-	{
-		return RValue<Float4>(Nucleus::createFDiv(lhs.value, rhs.value));
-	}
-
-	RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
-	{
-		return RValue<Float4>(Nucleus::createFRem(lhs.value, rhs.value));
-	}
-
-	RValue<Float4> operator+=(Float4 &lhs, RValue<Float4> rhs)
-	{
-		return lhs = lhs + rhs;
-	}
-
-	RValue<Float4> operator-=(Float4 &lhs, RValue<Float4> rhs)
-	{
-		return lhs = lhs - rhs;
-	}
-
-	RValue<Float4> operator*=(Float4 &lhs, RValue<Float4> rhs)
-	{
-		return lhs = lhs * rhs;
-	}
-
-	RValue<Float4> operator/=(Float4 &lhs, RValue<Float4> rhs)
-	{
-		return lhs = lhs / rhs;
-	}
-
-	RValue<Float4> operator%=(Float4 &lhs, RValue<Float4> rhs)
-	{
-		return lhs = lhs % rhs;
-	}
-
-	RValue<Float4> operator+(RValue<Float4> val)
-	{
-		return val;
-	}
-
-	RValue<Float4> operator-(RValue<Float4> val)
-	{
-		return RValue<Float4>(Nucleus::createFNeg(val.value));
-	}
-
-	RValue<Float4> Abs(RValue<Float4> x)
-	{
-		// TODO: Optimize.
-		Value *vector = Nucleus::createBitCast(x.value, Int4::getType());
-		int64_t constantVector[4] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
-		Value *result = Nucleus::createAnd(vector, Nucleus::createConstantVector(constantVector, Int4::getType()));
-
-		return As<Float4>(result);
-	}
-
-	RValue<Float4> Insert(RValue<Float4> x, RValue<Float> element, int i)
-	{
-		return RValue<Float4>(Nucleus::createInsertElement(x.value, element.value, i));
-	}
-
-	RValue<Float> Extract(RValue<Float4> x, int i)
-	{
-		return RValue<Float>(Nucleus::createExtractElement(x.value, Float::getType(), i));
-	}
-
-	RValue<Float4> Swizzle(RValue<Float4> x, uint16_t select)
-	{
-		return RValue<Float4>(createSwizzle4(x.value, select));
-	}
-
-	RValue<Float4> Shuffle(RValue<Float4> x, RValue<Float4> y, uint16_t select)
-	{
-		return RValue<Float4>(createBlend4(x.value, y.value, select));
-	}
-
-	RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, uint16_t imm)
-	{
-		int shuffle[4] =
-		{
-			((imm >> 12) & 0x03) + 0,
-			((imm >>  8) & 0x03) + 0,
-			((imm >>  4) & 0x03) + 4,
-			((imm >>  0) & 0x03) + 4,
-		};
-
-		return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-	}
-
-	RValue<Float4> UnpackLow(RValue<Float4> x, RValue<Float4> y)
-	{
-		int shuffle[4] = {0, 4, 1, 5};
-		return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-	}
-
-	RValue<Float4> UnpackHigh(RValue<Float4> x, RValue<Float4> y)
-	{
-		int shuffle[4] = {2, 6, 3, 7};
-		return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
-	}
-
-	RValue<Float4> Mask(Float4 &lhs, RValue<Float4> rhs, uint16_t select)
-	{
-		Value *vector = lhs.loadValue();
-		Value *result = createMask4(vector, rhs.value, select);
-		lhs.storeValue(result);
-
-		return RValue<Float4>(result);
-	}
-
-	RValue<Int4> IsInf(RValue<Float4> x)
-	{
-		return CmpEQ(As<Int4>(x) & Int4(0x7FFFFFFF), Int4(0x7F800000));
-	}
-
-	RValue<Int4> IsNan(RValue<Float4> x)
-	{
-		return ~CmpEQ(x, x);
-	}
-
-	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, int offset)
-	{
-		return lhs + RValue<Int>(Nucleus::createConstantInt(offset));
-	}
-
-	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
-	{
-		return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value, false));
-	}
-
-	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
-	{
-		return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value, true));
-	}
-
-	RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, int offset)
-	{
-		return lhs = lhs + offset;
-	}
-
-	RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<Int> offset)
-	{
-		return lhs = lhs + offset;
-	}
-
-	RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<UInt> offset)
-	{
-		return lhs = lhs + offset;
-	}
-
-	RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, int offset)
-	{
-		return lhs + -offset;
-	}
-
-	RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
-	{
-		return lhs + -offset;
-	}
-
-	RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
-	{
-		return lhs + -offset;
-	}
-
-	RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, int offset)
-	{
-		return lhs = lhs - offset;
-	}
-
-	RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<Int> offset)
-	{
-		return lhs = lhs - offset;
-	}
-
-	RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<UInt> offset)
-	{
-		return lhs = lhs - offset;
-	}
-
-	void Return()
-	{
-		Nucleus::createRetVoid();
-		// Place any unreachable instructions in an unreferenced block.
-		Nucleus::setInsertBlock(Nucleus::createBasicBlock());
-	}
-
-	void branch(RValue<Bool> cmp, BasicBlock *bodyBB, BasicBlock *endBB)
-	{
-		Nucleus::createCondBr(cmp.value, bodyBB, endBB);
-		Nucleus::setInsertBlock(bodyBB);
-	}
-
-	RValue<Float4> MaskedLoad(RValue<Pointer<Float4>> base, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-	{
-		return RValue<Float4>(Nucleus::createMaskedLoad(base.value, Float::getType(), mask.value, alignment, zeroMaskedLanes));
-	}
-
-	RValue<Int4> MaskedLoad(RValue<Pointer<Int4>> base, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-	{
-		return RValue<Int4>(Nucleus::createMaskedLoad(base.value, Int::getType(), mask.value, alignment, zeroMaskedLanes));
-	}
-
-	void MaskedStore(RValue<Pointer<Float4>> base, RValue<Float4> val, RValue<Int4> mask, unsigned int alignment)
-	{
-		Nucleus::createMaskedStore(base.value, val.value, mask.value, alignment);
-	}
-
-	void MaskedStore(RValue<Pointer<Int4>> base, RValue<Int4> val, RValue<Int4> mask, unsigned int alignment)
-	{
-		Nucleus::createMaskedStore(base.value, val.value, mask.value, alignment);
-	}
-
-	void Fence(std::memory_order memoryOrder)
-	{
-		ASSERT_MSG(memoryOrder == std::memory_order_acquire ||
-			memoryOrder == std::memory_order_release ||
-			memoryOrder == std::memory_order_acq_rel ||
-			memoryOrder == std::memory_order_seq_cst,
-			"Unsupported memoryOrder: %d", int(memoryOrder));
-		Nucleus::createFence(memoryOrder);
-	}
-
-	Bool          CToReactor<bool>::cast(bool v)               { return type(v); }
-	Byte          CToReactor<uint8_t>::cast(uint8_t v)         { return type(v); }
-	SByte         CToReactor<int8_t>::cast(int8_t v)           { return type(v); }
-	Short         CToReactor<int16_t>::cast(int16_t v)         { return type(v); }
-	UShort        CToReactor<uint16_t>::cast(uint16_t v)       { return type(v); }
-	Int           CToReactor<int32_t>::cast(int32_t v)         { return type(v); }
-	UInt          CToReactor<uint32_t>::cast(uint32_t v)       { return type(v); }
-	Float         CToReactor<float>::cast(float v)             { return type(v); }
-	Float4        CToReactor<float[4]>::cast(float v[4])       { return type(v[0], v[1], v[2], v[3]); }
-
-	// TODO: Long has no constructor that takes a uint64_t
-	// Long          CToReactor<uint64_t>::cast(uint64_t v)       { return type(v); }
+RValue<UInt4> operator&=(UInt4 &lhs, RValue<UInt4> rhs)
+{
+	return lhs = lhs & rhs;
 }
+
+RValue<UInt4> operator|=(UInt4 &lhs, RValue<UInt4> rhs)
+{
+	return lhs = lhs | rhs;
+}
+
+RValue<UInt4> operator^=(UInt4 &lhs, RValue<UInt4> rhs)
+{
+	return lhs = lhs ^ rhs;
+}
+
+RValue<UInt4> operator<<=(UInt4 &lhs, unsigned char rhs)
+{
+	return lhs = lhs << rhs;
+}
+
+RValue<UInt4> operator>>=(UInt4 &lhs, unsigned char rhs)
+{
+	return lhs = lhs >> rhs;
+}
+
+RValue<UInt4> operator+(RValue<UInt4> val)
+{
+	return val;
+}
+
+RValue<UInt4> operator-(RValue<UInt4> val)
+{
+	return RValue<UInt4>(Nucleus::createNeg(val.value));
+}
+
+RValue<UInt4> operator~(RValue<UInt4> val)
+{
+	return RValue<UInt4>(Nucleus::createNot(val.value));
+}
+
+RValue<UInt> Extract(RValue<UInt4> x, int i)
+{
+	return RValue<UInt>(Nucleus::createExtractElement(x.value, Int::getType(), i));
+}
+
+RValue<UInt4> Insert(RValue<UInt4> x, RValue<UInt> element, int i)
+{
+	return RValue<UInt4>(Nucleus::createInsertElement(x.value, element.value, i));
+}
+
+RValue<UInt4> Swizzle(RValue<UInt4> x, uint16_t select)
+{
+	return RValue<UInt4>(createSwizzle4(x.value, select));
+}
+
+RValue<UInt4> Shuffle(RValue<UInt4> x, RValue<UInt4> y, unsigned short select)
+{
+	return RValue<UInt4>(createBlend4(x.value, y.value, select));
+}
+
+Half::Half(RValue<Float> cast)
+{
+	UInt fp32i = As<UInt>(cast);
+	UInt abs = fp32i & 0x7FFFFFFF;
+	UShort fp16i((fp32i & 0x80000000) >> 16); // sign
+
+	If(abs > 0x47FFEFFF) // Infinity
+	{
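+		// Magnitudes too large for half precision, as well as Inf and NaN,
+		// all map to 0x7FFF here.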
+		fp16i |= UShort(0x7FFF);
+	}
+	Else
+	{
+		If(abs < 0x38800000) // Denormal
+		{
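+			// Below 2^-14 (0x38800000), the smallest normal half, the result is
+			// a half denormal: restore the implicit leading 1 and shift the
+			// mantissa right by the exponent deficit (zero once fully shifted out).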
+			Int mantissa = (abs & 0x007FFFFF) | 0x00800000;
+			Int e = 113 - (abs >> 23);
+			abs = IfThenElse(e < 24, mantissa >> e, Int(0));
+			fp16i |= UShort((abs + 0x00000FFF + ((abs >> 13) & 1)) >> 13);
+		}
+		Else
+		{
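+			// Normal case: adding 0xC8000000 rebiases the exponent by
+			// -((127 - 15) << 23), and 0xFFF plus the future lsb implements
+			// round-to-nearest-even before the shift down to a 10-bit mantissa.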
+			fp16i |= UShort((abs + 0xC8000000 + 0x00000FFF + ((abs >> 13) & 1)) >> 13);
+		}
+	}
+
+	storeValue(fp16i.loadValue());
+}
+
+Float::Float(RValue<Int> cast)
+{
+	Value *integer = Nucleus::createSIToFP(cast.value, Float::getType());
+
+	storeValue(integer);
+}
+
+Float::Float(RValue<UInt> cast)
+{
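+	// Convert the low 31 bits as a signed value, then add 2147483648.0f when
+	// the sign bit was set: the arithmetic shift produces an all-ones mask
+	// that selects the bit pattern of Float(0x80000000u).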
+	RValue<Float> result = Float(Int(cast & UInt(0x7FFFFFFF))) +
+	                       As<Float>((As<Int>(cast) >> 31) & As<Int>(Float(0x80000000u)));
+
+	storeValue(result.value);
+}
+
+Float::Float(RValue<Half> cast)
+{
+	Int fp16i(As<UShort>(cast));
+
+	Int s = (fp16i >> 15) & 0x00000001;
+	Int e = (fp16i >> 10) & 0x0000001F;
+	Int m = fp16i & 0x000003FF;
+
+	UInt fp32i(s << 31);
+	If(e == 0)
+	{
+		If(m != 0)
+		{
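+			// Denormal half: normalize by shifting the mantissa up until the
+			// implicit leading 1 reaches bit 10, decrementing the exponent to match.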
+			While((m & 0x00000400) == 0)
+			{
+				m <<= 1;
+				e -= 1;
+			}
+
+			fp32i |= As<UInt>(((e + (127 - 15) + 1) << 23) | ((m & ~0x00000400) << 13));
+		}
+	}
+	Else
+	{
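+		// Normal half: rebias the exponent from 15 to 127 and widen the
+		// mantissa from 10 to 23 bits.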
+		fp32i |= As<UInt>(((e + (127 - 15)) << 23) | (m << 13));
+	}
+
+	storeValue(As<Float>(fp32i).value);
+}
+
+Float::Float(float x)
+{
+	// C++ does not have a way to write an infinite or NaN literal,
+	// nor does it allow division by zero as a constant expression.
+	// Thus we should not accept inf or NaN as a Reactor Float constant,
+	// as this would typically indicate a bug, and rejecting them avoids
+	// undefined behavior.
+	//
+	// This also prevents the issue of the LLVM JIT only taking double
+	// values for constructing floating-point constants. During the
+	// conversion from single-precision to double, a signaling NaN can
+	// become a quiet NaN, thus altering its bit pattern. Hence this
+	// assert is also helpful for detecting cases where integers are
+	// being reinterpreted as float and then bitcast to integer again,
+	// which does not guarantee preserving the integer value.
+	//
+	// Should infinity and NaN constants be required, methods like
+	// infinity(), quiet_NaN(), and signaling_NaN() should be added
+	// to the Float class.
+	ASSERT(std::isfinite(x));
+
+	storeValue(Nucleus::createConstantFloat(x));
+}
+
+Float::Float(RValue<Float> rhs)
+{
+	storeValue(rhs.value);
+}
+
+Float::Float(const Float &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
+
+Float::Float(const Reference<Float> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
+
+Float::Float(Argument<Float> argument)
+{
+	storeValue(argument.value);
+}
+
+RValue<Float> Float::operator=(RValue<Float> rhs)
+{
+	storeValue(rhs.value);
+
+	return rhs;
+}
+
+RValue<Float> Float::operator=(const Float &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+
+	return RValue<Float>(value);
+}
+
+RValue<Float> Float::operator=(const Reference<Float> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+
+	return RValue<Float>(value);
+}
+
+RValue<Float> operator+(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Float>(Nucleus::createFAdd(lhs.value, rhs.value));
+}
+
+RValue<Float> operator-(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Float>(Nucleus::createFSub(lhs.value, rhs.value));
+}
+
+RValue<Float> operator*(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Float>(Nucleus::createFMul(lhs.value, rhs.value));
+}
+
+RValue<Float> operator/(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Float>(Nucleus::createFDiv(lhs.value, rhs.value));
+}
+
+RValue<Float> operator+=(Float &lhs, RValue<Float> rhs)
+{
+	return lhs = lhs + rhs;
+}
+
+RValue<Float> operator-=(Float &lhs, RValue<Float> rhs)
+{
+	return lhs = lhs - rhs;
+}
+
+RValue<Float> operator*=(Float &lhs, RValue<Float> rhs)
+{
+	return lhs = lhs * rhs;
+}
+
+RValue<Float> operator/=(Float &lhs, RValue<Float> rhs)
+{
+	return lhs = lhs / rhs;
+}
+
+RValue<Float> operator+(RValue<Float> val)
+{
+	return val;
+}
+
+RValue<Float> operator-(RValue<Float> val)
+{
+	return RValue<Float>(Nucleus::createFNeg(val.value));
+}
+
+RValue<Bool> operator<(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Bool>(Nucleus::createFCmpOLT(lhs.value, rhs.value));
+}
+
+RValue<Bool> operator<=(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Bool>(Nucleus::createFCmpOLE(lhs.value, rhs.value));
+}
+
+RValue<Bool> operator>(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Bool>(Nucleus::createFCmpOGT(lhs.value, rhs.value));
+}
+
+RValue<Bool> operator>=(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Bool>(Nucleus::createFCmpOGE(lhs.value, rhs.value));
+}
+
+RValue<Bool> operator!=(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Bool>(Nucleus::createFCmpONE(lhs.value, rhs.value));
+}
+
+RValue<Bool> operator==(RValue<Float> lhs, RValue<Float> rhs)
+{
+	return RValue<Bool>(Nucleus::createFCmpOEQ(lhs.value, rhs.value));
+}
+
+RValue<Float> Abs(RValue<Float> x)
+{
+	return IfThenElse(x > 0.0f, x, -x);
+}
+
+RValue<Float> Max(RValue<Float> x, RValue<Float> y)
+{
+	return IfThenElse(x > y, x, y);
+}
+
+RValue<Float> Min(RValue<Float> x, RValue<Float> y)
+{
+	return IfThenElse(x < y, x, y);
+}
+
+Float2::Float2(RValue<Float4> cast)
+{
+	storeValue(Nucleus::createBitCast(cast.value, getType()));
+}
+
+Float4::Float4(RValue<Byte4> cast) : XYZW(this)
+{
+	Value *a = Int4(cast).loadValue();
+	Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
+
+	storeValue(xyzw);
+}
+
+Float4::Float4(RValue<SByte4> cast) : XYZW(this)
+{
+	Value *a = Int4(cast).loadValue();
+	Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
+
+	storeValue(xyzw);
+}
+
+Float4::Float4(RValue<Short4> cast) : XYZW(this)
+{
+	Int4 c(cast);
+	storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
+}
+
+Float4::Float4(RValue<UShort4> cast) : XYZW(this)
+{
+	Int4 c(cast);
+	storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
+}
+
+Float4::Float4(RValue<Int4> cast) : XYZW(this)
+{
+	Value *xyzw = Nucleus::createSIToFP(cast.value, Float4::getType());
+
+	storeValue(xyzw);
+}
+
+Float4::Float4(RValue<UInt4> cast) : XYZW(this)
+{
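+	// Per-lane application of the signed-conversion trick used by
+	// Float::Float(RValue<UInt>).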
+	RValue<Float4> result = Float4(Int4(cast & UInt4(0x7FFFFFFF))) +
+	                        As<Float4>((As<Int4>(cast) >> 31) & As<Int4>(Float4(0x80000000u)));
+
+	storeValue(result.value);
+}
+
+Float4::Float4() : XYZW(this)
+{
+}
+
+Float4::Float4(float xyzw) : XYZW(this)
+{
+	constant(xyzw, xyzw, xyzw, xyzw);
+}
+
+Float4::Float4(float x, float yzw) : XYZW(this)
+{
+	constant(x, yzw, yzw, yzw);
+}
+
+Float4::Float4(float x, float y, float zw) : XYZW(this)
+{
+	constant(x, y, zw, zw);
+}
+
+Float4::Float4(float x, float y, float z, float w) : XYZW(this)
+{
+	constant(x, y, z, w);
+}
+
+Float4 Float4::positive_inf()
+{
+	Float4 result;
+	result.infinity_constant(false);
+	return result;
+}
+
+Float4 Float4::negative_inf()
+{
+	Float4 result;
+	result.infinity_constant(true);
+	return result;
+}
+
+void Float4::infinity_constant(bool negative)
+{
+	double inf = negative ? -INFINITY : INFINITY;
+	double constantVector[4] = {inf, inf, inf, inf};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
+
+void Float4::constant(float x, float y, float z, float w)
+{
+	// See Float(float) constructor for the rationale behind this assert.
+	ASSERT(std::isfinite(x) && std::isfinite(y) && std::isfinite(z) && std::isfinite(w));
+
+	double constantVector[4] = {x, y, z, w};
+	storeValue(Nucleus::createConstantVector(constantVector, getType()));
+}
+
+Float4::Float4(RValue<Float4> rhs) : XYZW(this)
+{
+	storeValue(rhs.value);
+}
+
+Float4::Float4(const Float4 &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
+
+Float4::Float4(const Reference<Float4> &rhs) : XYZW(this)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
+
+Float4::Float4(const Float &rhs) : XYZW(this)
+{
+	*this = RValue<Float>(rhs.loadValue());
+}
+
+Float4::Float4(const Reference<Float> &rhs) : XYZW(this)
+{
+	*this = RValue<Float>(rhs.loadValue());
+}
+
+RValue<Float4> Float4::operator=(float x)
+{
+	return *this = Float4(x, x, x, x);
+}
+
+RValue<Float4> Float4::operator=(RValue<Float4> rhs)
+{
+	storeValue(rhs.value);
+
+	return rhs;
+}
+
+RValue<Float4> Float4::operator=(const Float4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+
+	return RValue<Float4>(value);
+}
+
+RValue<Float4> Float4::operator=(const Reference<Float4> &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+
+	return RValue<Float4>(value);
+}
+
+RValue<Float4> Float4::operator=(RValue<Float> rhs)
+{
+	return *this = Float4(rhs);
+}
+
+RValue<Float4> Float4::operator=(const Float &rhs)
+{
+	return *this = Float4(rhs);
+}
+
+RValue<Float4> Float4::operator=(const Reference<Float> &rhs)
+{
+	return *this = Float4(rhs);
+}
+
+RValue<Float4> operator+(RValue<Float4> lhs, RValue<Float4> rhs)
+{
+	return RValue<Float4>(Nucleus::createFAdd(lhs.value, rhs.value));
+}
+
+RValue<Float4> operator-(RValue<Float4> lhs, RValue<Float4> rhs)
+{
+	return RValue<Float4>(Nucleus::createFSub(lhs.value, rhs.value));
+}
+
+RValue<Float4> operator*(RValue<Float4> lhs, RValue<Float4> rhs)
+{
+	return RValue<Float4>(Nucleus::createFMul(lhs.value, rhs.value));
+}
+
+RValue<Float4> operator/(RValue<Float4> lhs, RValue<Float4> rhs)
+{
+	return RValue<Float4>(Nucleus::createFDiv(lhs.value, rhs.value));
+}
+
+RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
+{
+	return RValue<Float4>(Nucleus::createFRem(lhs.value, rhs.value));
+}
+
+RValue<Float4> operator+=(Float4 &lhs, RValue<Float4> rhs)
+{
+	return lhs = lhs + rhs;
+}
+
+RValue<Float4> operator-=(Float4 &lhs, RValue<Float4> rhs)
+{
+	return lhs = lhs - rhs;
+}
+
+RValue<Float4> operator*=(Float4 &lhs, RValue<Float4> rhs)
+{
+	return lhs = lhs * rhs;
+}
+
+RValue<Float4> operator/=(Float4 &lhs, RValue<Float4> rhs)
+{
+	return lhs = lhs / rhs;
+}
+
+RValue<Float4> operator%=(Float4 &lhs, RValue<Float4> rhs)
+{
+	return lhs = lhs % rhs;
+}
+
+RValue<Float4> operator+(RValue<Float4> val)
+{
+	return val;
+}
+
+RValue<Float4> operator-(RValue<Float4> val)
+{
+	return RValue<Float4>(Nucleus::createFNeg(val.value));
+}
+
+RValue<Float4> Abs(RValue<Float4> x)
+{
+	// TODO: Optimize.
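+	// Clear the sign bit of every lane by masking with 0x7FFFFFFF.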
+	Value *vector = Nucleus::createBitCast(x.value, Int4::getType());
+	int64_t constantVector[4] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+	Value *result = Nucleus::createAnd(vector, Nucleus::createConstantVector(constantVector, Int4::getType()));
+
+	return As<Float4>(result);
+}
+
+RValue<Float4> Insert(RValue<Float4> x, RValue<Float> element, int i)
+{
+	return RValue<Float4>(Nucleus::createInsertElement(x.value, element.value, i));
+}
+
+RValue<Float> Extract(RValue<Float4> x, int i)
+{
+	return RValue<Float>(Nucleus::createExtractElement(x.value, Float::getType(), i));
+}
+
+RValue<Float4> Swizzle(RValue<Float4> x, uint16_t select)
+{
+	return RValue<Float4>(createSwizzle4(x.value, select));
+}
+
+RValue<Float4> Shuffle(RValue<Float4> x, RValue<Float4> y, uint16_t select)
+{
+	return RValue<Float4>(createBlend4(x.value, y.value, select));
+}
+
+RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, uint16_t imm)
+{
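+	// The two high nibbles of imm select two lanes from x (indices 0..3);
+	// the two low nibbles select two lanes from y (shuffle indices 4..7).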
+	int shuffle[4] =
+	{
+		((imm >> 12) & 0x03) + 0,
+		((imm >>  8) & 0x03) + 0,
+		((imm >>  4) & 0x03) + 4,
+		((imm >>  0) & 0x03) + 4,
+	};
+
+	return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+}
+
+RValue<Float4> UnpackLow(RValue<Float4> x, RValue<Float4> y)
+{
+	int shuffle[4] = {0, 4, 1, 5};
+	return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+}
+
+RValue<Float4> UnpackHigh(RValue<Float4> x, RValue<Float4> y)
+{
+	int shuffle[4] = {2, 6, 3, 7};
+	return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+}
+
+RValue<Float4> Mask(Float4 &lhs, RValue<Float4> rhs, uint16_t select)
+{
+	Value *vector = lhs.loadValue();
+	Value *result = createMask4(vector, rhs.value, select);
+	lhs.storeValue(result);
+
+	return RValue<Float4>(result);
+}
+
+RValue<Int4> IsInf(RValue<Float4> x)
+{
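+	// Clear the sign bit and compare with the +infinity bit pattern (0x7F800000).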
+	return CmpEQ(As<Int4>(x) & Int4(0x7FFFFFFF), Int4(0x7F800000));
+}
+
+RValue<Int4> IsNan(RValue<Float4> x)
+{
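+	// NaN is the only value that compares unequal to itself.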
+	return ~CmpEQ(x, x);
+}
+
+RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, int offset)
+{
+	return lhs + RValue<Int>(Nucleus::createConstantInt(offset));
+}
+
+RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
+{
+	return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value, false));
+}
+
+RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
+{
+	return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value, true));
+}
+
+RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, int offset)
+{
+	return lhs = lhs + offset;
+}
+
+RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<Int> offset)
+{
+	return lhs = lhs + offset;
+}
+
+RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<UInt> offset)
+{
+	return lhs = lhs + offset;
+}
+
+RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, int offset)
+{
+	return lhs + -offset;
+}
+
+RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
+{
+	return lhs + -offset;
+}
+
+RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
+{
+	return lhs + -offset;
+}
+
+RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, int offset)
+{
+	return lhs = lhs - offset;
+}
+
+RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<Int> offset)
+{
+	return lhs = lhs - offset;
+}
+
+RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<UInt> offset)
+{
+	return lhs = lhs - offset;
+}
+
+void Return()
+{
+	Nucleus::createRetVoid();
+	// Place any unreachable instructions in an unreferenced block.
+	Nucleus::setInsertBlock(Nucleus::createBasicBlock());
+}
+
+void branch(RValue<Bool> cmp, BasicBlock *bodyBB, BasicBlock *endBB)
+{
+	Nucleus::createCondBr(cmp.value, bodyBB, endBB);
+	Nucleus::setInsertBlock(bodyBB);
+}
+
+RValue<Float4> MaskedLoad(RValue<Pointer<Float4>> base, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+{
+	return RValue<Float4>(Nucleus::createMaskedLoad(base.value, Float::getType(), mask.value, alignment, zeroMaskedLanes));
+}
+
+RValue<Int4> MaskedLoad(RValue<Pointer<Int4>> base, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+{
+	return RValue<Int4>(Nucleus::createMaskedLoad(base.value, Int::getType(), mask.value, alignment, zeroMaskedLanes));
+}
+
+void MaskedStore(RValue<Pointer<Float4>> base, RValue<Float4> val, RValue<Int4> mask, unsigned int alignment)
+{
+	Nucleus::createMaskedStore(base.value, val.value, mask.value, alignment);
+}
+
+void MaskedStore(RValue<Pointer<Int4>> base, RValue<Int4> val, RValue<Int4> mask, unsigned int alignment)
+{
+	Nucleus::createMaskedStore(base.value, val.value, mask.value, alignment);
+}
+
+void Fence(std::memory_order memoryOrder)
+{
+	ASSERT_MSG(memoryOrder == std::memory_order_acquire ||
+		memoryOrder == std::memory_order_release ||
+		memoryOrder == std::memory_order_acq_rel ||
+		memoryOrder == std::memory_order_seq_cst,
+		"Unsupported memoryOrder: %d", int(memoryOrder));
+	Nucleus::createFence(memoryOrder);
+}
+
+Bool          CToReactor<bool>::cast(bool v)               { return type(v); }
+Byte          CToReactor<uint8_t>::cast(uint8_t v)         { return type(v); }
+SByte         CToReactor<int8_t>::cast(int8_t v)           { return type(v); }
+Short         CToReactor<int16_t>::cast(int16_t v)         { return type(v); }
+UShort        CToReactor<uint16_t>::cast(uint16_t v)       { return type(v); }
+Int           CToReactor<int32_t>::cast(int32_t v)         { return type(v); }
+UInt          CToReactor<uint32_t>::cast(uint32_t v)       { return type(v); }
+Float         CToReactor<float>::cast(float v)             { return type(v); }
+Float4        CToReactor<float[4]>::cast(float v[4])       { return type(v[0], v[1], v[2], v[3]); }
+
+// TODO: Long has no constructor that takes a uint64_t
+// Long          CToReactor<uint64_t>::cast(uint64_t v)       { return type(v); }
+
+}  // namespace rr
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index eec950b..f0b18b5 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -49,484 +49,484 @@
 	#define RR_DEBUG_INFO_FLUSH()
 #endif // ENABLE_RR_DEBUG_INFO
 
-namespace rr
+namespace rr {
+
+struct Capabilities
 {
-	struct Capabilities
+	bool CoroutinesSupported; // Support for rr::Coroutine<F>
+};
+extern const Capabilities Caps;
+
+class Bool;
+class Byte;
+class SByte;
+class Byte4;
+class SByte4;
+class Byte8;
+class SByte8;
+class Byte16;
+class SByte16;
+class Short;
+class UShort;
+class Short2;
+class UShort2;
+class Short4;
+class UShort4;
+class Short8;
+class UShort8;
+class Int;
+class UInt;
+class Int2;
+class UInt2;
+class Int4;
+class UInt4;
+class Long;
+class Half;
+class Float;
+class Float2;
+class Float4;
+
+class Void
+{
+public:
+	static Type *getType();
+
+	static bool isVoid()
 	{
-		bool CoroutinesSupported; // Support for rr::Coroutine<F>
-	};
-	extern const Capabilities Caps;
+		return true;
+	}
+};
 
-	class Bool;
-	class Byte;
-	class SByte;
-	class Byte4;
-	class SByte4;
-	class Byte8;
-	class SByte8;
-	class Byte16;
-	class SByte16;
-	class Short;
-	class UShort;
-	class Short2;
-	class UShort2;
-	class Short4;
-	class UShort4;
-	class Short8;
-	class UShort8;
-	class Int;
-	class UInt;
-	class Int2;
-	class UInt2;
-	class Int4;
-	class UInt4;
-	class Long;
-	class Half;
-	class Float;
-	class Float2;
-	class Float4;
+template<class T>
+class RValue;
 
-	class Void
+template<class T>
+class Pointer;
+
+class Variable
+{
+	friend class Nucleus;
+	friend class PrintValue;
+
+	Variable() = delete;
+	Variable &operator=(const Variable&) = delete;
+
+public:
+	void materialize() const;
+
+	Value *loadValue() const;
+	Value *storeValue(Value *value) const;
+
+	Value *getBaseAddress() const;
+	Value *getElementPointer(Value *index, bool unsignedIndex) const;
+
+protected:
+	Variable(Type *type, int arraySize);
+	Variable(const Variable&) = default;
+
+	~Variable();
+
+	const int arraySize;
+
+private:
+	static void materializeAll();
+	static void killUnmaterialized();
+
+	static std::unordered_set<Variable*> unmaterializedVariables;
+
+	Type *const type;
+	mutable Value *rvalue = nullptr;
+	mutable Value *address = nullptr;
+};
+
+template<class T>
+class LValue : public Variable
+{
+public:
+	LValue(int arraySize = 0);
+
+	RValue<Pointer<T>> operator&();
+
+	static bool isVoid()
 	{
-	public:
-		static Type *getType();
+		return false;
+	}
 
-		static bool isVoid()
-		{
-			return true;
-		}
-	};
+	// self() returns the raw 'this' pointer to this LValue<T> object.
+	// This function exists because operator&() is overloaded.
+	inline LValue<T>* self() { return this; }
+};
 
-	template<class T>
-	class RValue;
+template<class T>
+class Reference
+{
+public:
+	using reference_underlying_type = T;
 
-	template<class T>
-	class Pointer;
+	explicit Reference(Value *pointer, int alignment = 1);
 
-	class Variable
-	{
-		friend class Nucleus;
-		friend class PrintValue;
+	RValue<T> operator=(RValue<T> rhs) const;
+	RValue<T> operator=(const Reference<T> &ref) const;
 
-		Variable() = delete;
-		Variable &operator=(const Variable&) = delete;
+	RValue<T> operator+=(RValue<T> rhs) const;
 
-	public:
-		void materialize() const;
+	RValue<Pointer<T>> operator&() const { return RValue<Pointer<T>>(address); }
 
-		Value *loadValue() const;
-		Value *storeValue(Value *value) const;
+	Value *loadValue() const;
+	int getAlignment() const;
 
-		Value *getBaseAddress() const;
-		Value *getElementPointer(Value *index, bool unsignedIndex) const;
+private:
+	Value *address;
 
-	protected:
-		Variable(Type *type, int arraySize);
-		Variable(const Variable&) = default;
+	const int alignment;
+};
 
-		~Variable();
+template<class T>
+struct BoolLiteral
+{
+	struct type;
+};
 
-		const int arraySize;
+template<>
+struct BoolLiteral<Bool>
+{
+	typedef bool type;
+};
 
-	private:
-		static void materializeAll();
-		static void killUnmaterialized();
+template<class T>
+struct IntLiteral
+{
+	struct type;
+};
 
-		static std::unordered_set<Variable*> unmaterializedVariables;
+template<>
+struct IntLiteral<Int>
+{
+	typedef int type;
+};
 
-		Type *const type;
-		mutable Value *rvalue = nullptr;
-		mutable Value *address = nullptr;
-	};
+template<>
+struct IntLiteral<UInt>
+{
+	typedef unsigned int type;
+};
 
-	template<class T>
-	class LValue : public Variable
-	{
-	public:
-		LValue(int arraySize = 0);
+template<>
+struct IntLiteral<Long>
+{
+	typedef int64_t type;
+};
 
-		RValue<Pointer<T>> operator&();
+template<class T>
+struct FloatLiteral
+{
+	struct type;
+};
 
-		static bool isVoid()
-		{
-			return false;
-		}
+template<>
+struct FloatLiteral<Float>
+{
+	typedef float type;
+};
 
-		// self() returns the this pointer to this LValue<T> object.
-		// This function exists because operator&() is overloaded.
-		inline LValue<T>* self() { return this; }
-	};
+template<class T>
+class RValue
+{
+public:
+	using rvalue_underlying_type = T;
 
-	template<class T>
-	class Reference
-	{
-	public:
-		using reference_underlying_type = T;
-
-		explicit Reference(Value *pointer, int alignment = 1);
-
-		RValue<T> operator=(RValue<T> rhs) const;
-		RValue<T> operator=(const Reference<T> &ref) const;
-
-		RValue<T> operator+=(RValue<T> rhs) const;
-
-		RValue<Pointer<T>> operator&() const { return RValue<Pointer<T>>(address); }
-
-		Value *loadValue() const;
-		int getAlignment() const;
-
-	private:
-		Value *address;
-
-		const int alignment;
-	};
-
-	template<class T>
-	struct BoolLiteral
-	{
-		struct type;
-	};
-
-	template<>
-	struct BoolLiteral<Bool>
-	{
-		typedef bool type;
-	};
-
-	template<class T>
-	struct IntLiteral
-	{
-		struct type;
-	};
-
-	template<>
-	struct IntLiteral<Int>
-	{
-		typedef int type;
-	};
-
-	template<>
-	struct IntLiteral<UInt>
-	{
-		typedef unsigned int type;
-	};
-
-	template<>
-	struct IntLiteral<Long>
-	{
-		typedef int64_t type;
-	};
-
-	template<class T>
-	struct FloatLiteral
-	{
-		struct type;
-	};
-
-	template<>
-	struct FloatLiteral<Float>
-	{
-		typedef float type;
-	};
-
-	template<class T>
-	class RValue
-	{
-	public:
-		using rvalue_underlying_type = T;
-
-		explicit RValue(Value *rvalue);
+	explicit RValue(Value *rvalue);
 
 #ifdef ENABLE_RR_DEBUG_INFO
-		RValue(const RValue<T> &rvalue);
+	RValue(const RValue<T> &rvalue);
 #endif // ENABLE_RR_DEBUG_INFO
 
-		RValue(const T &lvalue);
-		RValue(typename BoolLiteral<T>::type i);
-		RValue(typename IntLiteral<T>::type i);
-		RValue(typename FloatLiteral<T>::type f);
-		RValue(const Reference<T> &rhs);
+	RValue(const T &lvalue);
+	RValue(typename BoolLiteral<T>::type i);
+	RValue(typename IntLiteral<T>::type i);
+	RValue(typename FloatLiteral<T>::type f);
+	RValue(const Reference<T> &rhs);
 
-		RValue<T> &operator=(const RValue<T>&) = delete;
+	RValue<T> &operator=(const RValue<T>&) = delete;
 
-		Value *value;   // FIXME: Make private
-	};
+	Value *value;   // FIXME: Make private
+};
 
-	template<typename T>
-	struct Argument
-	{
-		explicit Argument(Value *value) : value(value) {}
+template<typename T>
+struct Argument
+{
+	explicit Argument(Value *value) : value(value) {}
 
-		Value *value;
-	};
+	Value *value;
+};
 
-	class Bool : public LValue<Bool>
-	{
-	public:
-		Bool(Argument<Bool> argument);
+class Bool : public LValue<Bool>
+{
+public:
+	Bool(Argument<Bool> argument);
 
-		Bool() = default;
-		Bool(bool x);
-		Bool(RValue<Bool> rhs);
-		Bool(const Bool &rhs);
-		Bool(const Reference<Bool> &rhs);
+	Bool() = default;
+	Bool(bool x);
+	Bool(RValue<Bool> rhs);
+	Bool(const Bool &rhs);
+	Bool(const Reference<Bool> &rhs);
 
-	//	RValue<Bool> operator=(bool rhs);   // FIXME: Implement
-		RValue<Bool> operator=(RValue<Bool> rhs);
-		RValue<Bool> operator=(const Bool &rhs);
-		RValue<Bool> operator=(const Reference<Bool> &rhs);
+//	RValue<Bool> operator=(bool rhs);   // FIXME: Implement
+	RValue<Bool> operator=(RValue<Bool> rhs);
+	RValue<Bool> operator=(const Bool &rhs);
+	RValue<Bool> operator=(const Reference<Bool> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Bool> operator!(RValue<Bool> val);
-	RValue<Bool> operator&&(RValue<Bool> lhs, RValue<Bool> rhs);
-	RValue<Bool> operator||(RValue<Bool> lhs, RValue<Bool> rhs);
-	RValue<Bool> operator!=(RValue<Bool> lhs, RValue<Bool> rhs);
-	RValue<Bool> operator==(RValue<Bool> lhs, RValue<Bool> rhs);
+RValue<Bool> operator!(RValue<Bool> val);
+RValue<Bool> operator&&(RValue<Bool> lhs, RValue<Bool> rhs);
+RValue<Bool> operator||(RValue<Bool> lhs, RValue<Bool> rhs);
+RValue<Bool> operator!=(RValue<Bool> lhs, RValue<Bool> rhs);
+RValue<Bool> operator==(RValue<Bool> lhs, RValue<Bool> rhs);
 
-	class Byte : public LValue<Byte>
-	{
-	public:
-		Byte(Argument<Byte> argument);
+class Byte : public LValue<Byte>
+{
+public:
+	Byte(Argument<Byte> argument);
 
-		explicit Byte(RValue<Int> cast);
-		explicit Byte(RValue<UInt> cast);
-		explicit Byte(RValue<UShort> cast);
+	explicit Byte(RValue<Int> cast);
+	explicit Byte(RValue<UInt> cast);
+	explicit Byte(RValue<UShort> cast);
 
-		Byte() = default;
-		Byte(int x);
-		Byte(unsigned char x);
-		Byte(RValue<Byte> rhs);
-		Byte(const Byte &rhs);
-		Byte(const Reference<Byte> &rhs);
+	Byte() = default;
+	Byte(int x);
+	Byte(unsigned char x);
+	Byte(RValue<Byte> rhs);
+	Byte(const Byte &rhs);
+	Byte(const Reference<Byte> &rhs);
 
-	//	RValue<Byte> operator=(unsigned char rhs);   // FIXME: Implement
-		RValue<Byte> operator=(RValue<Byte> rhs);
-		RValue<Byte> operator=(const Byte &rhs);
-		RValue<Byte> operator=(const Reference<Byte> &rhs);
+//	RValue<Byte> operator=(unsigned char rhs);   // FIXME: Implement
+	RValue<Byte> operator=(RValue<Byte> rhs);
+	RValue<Byte> operator=(const Byte &rhs);
+	RValue<Byte> operator=(const Reference<Byte> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Byte> operator+(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator-(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator*(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator/(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator%(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator&(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator|(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator^(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator<<(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator>>(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Byte> operator+=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator-=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator*=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator/=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator%=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator&=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator|=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator^=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator<<=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator>>=(Byte &lhs, RValue<Byte> rhs);
-	RValue<Byte> operator+(RValue<Byte> val);
-	RValue<Byte> operator-(RValue<Byte> val);
-	RValue<Byte> operator~(RValue<Byte> val);
-	RValue<Byte> operator++(Byte &val, int);   // Post-increment
-	const Byte &operator++(Byte &val);   // Pre-increment
-	RValue<Byte> operator--(Byte &val, int);   // Post-decrement
-	const Byte &operator--(Byte &val);   // Pre-decrement
-	RValue<Bool> operator<(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Bool> operator<=(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Bool> operator>(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Bool> operator>=(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Bool> operator!=(RValue<Byte> lhs, RValue<Byte> rhs);
-	RValue<Bool> operator==(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator+(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator-(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator*(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator/(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator%(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator&(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator|(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator^(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator<<(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator>>(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Byte> operator+=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator-=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator*=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator/=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator%=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator&=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator|=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator^=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator<<=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator>>=(Byte &lhs, RValue<Byte> rhs);
+RValue<Byte> operator+(RValue<Byte> val);
+RValue<Byte> operator-(RValue<Byte> val);
+RValue<Byte> operator~(RValue<Byte> val);
+RValue<Byte> operator++(Byte &val, int);   // Post-increment
+const Byte &operator++(Byte &val);   // Pre-increment
+RValue<Byte> operator--(Byte &val, int);   // Post-decrement
+const Byte &operator--(Byte &val);   // Pre-decrement
+RValue<Bool> operator<(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Bool> operator<=(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Bool> operator>(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Bool> operator>=(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Bool> operator!=(RValue<Byte> lhs, RValue<Byte> rhs);
+RValue<Bool> operator==(RValue<Byte> lhs, RValue<Byte> rhs);
 
-	class SByte : public LValue<SByte>
-	{
-	public:
-		SByte(Argument<SByte> argument);
+class SByte : public LValue<SByte>
+{
+public:
+	SByte(Argument<SByte> argument);
 
-		explicit SByte(RValue<Int> cast);
-		explicit SByte(RValue<Short> cast);
+	explicit SByte(RValue<Int> cast);
+	explicit SByte(RValue<Short> cast);
 
-		SByte() = default;
-		SByte(signed char x);
-		SByte(RValue<SByte> rhs);
-		SByte(const SByte &rhs);
-		SByte(const Reference<SByte> &rhs);
+	SByte() = default;
+	SByte(signed char x);
+	SByte(RValue<SByte> rhs);
+	SByte(const SByte &rhs);
+	SByte(const Reference<SByte> &rhs);
 
-	//	RValue<SByte> operator=(signed char rhs);   // FIXME: Implement
-		RValue<SByte> operator=(RValue<SByte> rhs);
-		RValue<SByte> operator=(const SByte &rhs);
-		RValue<SByte> operator=(const Reference<SByte> &rhs);
+//	RValue<SByte> operator=(signed char rhs);   // FIXME: Implement
+	RValue<SByte> operator=(RValue<SByte> rhs);
+	RValue<SByte> operator=(const SByte &rhs);
+	RValue<SByte> operator=(const Reference<SByte> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<SByte> operator+(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator-(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator*(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator/(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator%(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator&(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator|(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator^(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator<<(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator>>(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<SByte> operator+=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator-=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator*=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator/=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator%=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator&=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator|=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator^=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator<<=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator>>=(SByte &lhs, RValue<SByte> rhs);
-	RValue<SByte> operator+(RValue<SByte> val);
-	RValue<SByte> operator-(RValue<SByte> val);
-	RValue<SByte> operator~(RValue<SByte> val);
-	RValue<SByte> operator++(SByte &val, int);   // Post-increment
-	const SByte &operator++(SByte &val);   // Pre-increment
-	RValue<SByte> operator--(SByte &val, int);   // Post-decrement
-	const SByte &operator--(SByte &val);   // Pre-decrement
-	RValue<Bool> operator<(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<Bool> operator<=(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<Bool> operator>(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<Bool> operator>=(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<Bool> operator!=(RValue<SByte> lhs, RValue<SByte> rhs);
-	RValue<Bool> operator==(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator+(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator-(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator*(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator/(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator%(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator&(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator|(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator^(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator<<(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator>>(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<SByte> operator+=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator-=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator*=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator/=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator%=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator&=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator|=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator^=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator<<=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator>>=(SByte &lhs, RValue<SByte> rhs);
+RValue<SByte> operator+(RValue<SByte> val);
+RValue<SByte> operator-(RValue<SByte> val);
+RValue<SByte> operator~(RValue<SByte> val);
+RValue<SByte> operator++(SByte &val, int);   // Post-increment
+const SByte &operator++(SByte &val);   // Pre-increment
+RValue<SByte> operator--(SByte &val, int);   // Post-decrement
+const SByte &operator--(SByte &val);   // Pre-decrement
+RValue<Bool> operator<(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<Bool> operator<=(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<Bool> operator>(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<Bool> operator>=(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<Bool> operator!=(RValue<SByte> lhs, RValue<SByte> rhs);
+RValue<Bool> operator==(RValue<SByte> lhs, RValue<SByte> rhs);
 
-	class Short : public LValue<Short>
-	{
-	public:
-		Short(Argument<Short> argument);
+class Short : public LValue<Short>
+{
+public:
+	Short(Argument<Short> argument);
 
-		explicit Short(RValue<Int> cast);
+	explicit Short(RValue<Int> cast);
 
-		Short() = default;
-		Short(short x);
-		Short(RValue<Short> rhs);
-		Short(const Short &rhs);
-		Short(const Reference<Short> &rhs);
+	Short() = default;
+	Short(short x);
+	Short(RValue<Short> rhs);
+	Short(const Short &rhs);
+	Short(const Reference<Short> &rhs);
 
-	//	RValue<Short> operator=(short rhs);   // FIXME: Implement
-		RValue<Short> operator=(RValue<Short> rhs);
-		RValue<Short> operator=(const Short &rhs);
-		RValue<Short> operator=(const Reference<Short> &rhs);
+//	RValue<Short> operator=(short rhs);   // FIXME: Implement
+	RValue<Short> operator=(RValue<Short> rhs);
+	RValue<Short> operator=(const Short &rhs);
+	RValue<Short> operator=(const Reference<Short> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Short> operator+(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator-(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator*(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator/(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator%(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator&(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator|(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator^(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator<<(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator>>(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Short> operator+=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator-=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator*=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator/=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator%=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator&=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator|=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator^=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator<<=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator>>=(Short &lhs, RValue<Short> rhs);
-	RValue<Short> operator+(RValue<Short> val);
-	RValue<Short> operator-(RValue<Short> val);
-	RValue<Short> operator~(RValue<Short> val);
-	RValue<Short> operator++(Short &val, int);   // Post-increment
-	const Short &operator++(Short &val);   // Pre-increment
-	RValue<Short> operator--(Short &val, int);   // Post-decrement
-	const Short &operator--(Short &val);   // Pre-decrement
-	RValue<Bool> operator<(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Bool> operator<=(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Bool> operator>(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Bool> operator>=(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Bool> operator!=(RValue<Short> lhs, RValue<Short> rhs);
-	RValue<Bool> operator==(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator+(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator-(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator*(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator/(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator%(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator&(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator|(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator^(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator<<(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator>>(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Short> operator+=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator-=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator*=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator/=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator%=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator&=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator|=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator^=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator<<=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator>>=(Short &lhs, RValue<Short> rhs);
+RValue<Short> operator+(RValue<Short> val);
+RValue<Short> operator-(RValue<Short> val);
+RValue<Short> operator~(RValue<Short> val);
+RValue<Short> operator++(Short &val, int);   // Post-increment
+const Short &operator++(Short &val);   // Pre-increment
+RValue<Short> operator--(Short &val, int);   // Post-decrement
+const Short &operator--(Short &val);   // Pre-decrement
+RValue<Bool> operator<(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Bool> operator<=(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Bool> operator>(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Bool> operator>=(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Bool> operator!=(RValue<Short> lhs, RValue<Short> rhs);
+RValue<Bool> operator==(RValue<Short> lhs, RValue<Short> rhs);
 
-	class UShort : public LValue<UShort>
-	{
-	public:
-		UShort(Argument<UShort> argument);
+class UShort : public LValue<UShort>
+{
+public:
+	UShort(Argument<UShort> argument);
 
-		explicit UShort(RValue<UInt> cast);
-		explicit UShort(RValue<Int> cast);
+	explicit UShort(RValue<UInt> cast);
+	explicit UShort(RValue<Int> cast);
 
-		UShort() = default;
-		UShort(unsigned short x);
-		UShort(RValue<UShort> rhs);
-		UShort(const UShort &rhs);
-		UShort(const Reference<UShort> &rhs);
+	UShort() = default;
+	UShort(unsigned short x);
+	UShort(RValue<UShort> rhs);
+	UShort(const UShort &rhs);
+	UShort(const Reference<UShort> &rhs);
 
-	//	RValue<UShort> operator=(unsigned short rhs);   // FIXME: Implement
-		RValue<UShort> operator=(RValue<UShort> rhs);
-		RValue<UShort> operator=(const UShort &rhs);
-		RValue<UShort> operator=(const Reference<UShort> &rhs);
+//	RValue<UShort> operator=(unsigned short rhs);   // FIXME: Implement
+	RValue<UShort> operator=(RValue<UShort> rhs);
+	RValue<UShort> operator=(const UShort &rhs);
+	RValue<UShort> operator=(const Reference<UShort> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<UShort> operator+(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator-(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator*(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator/(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator%(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator&(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator|(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator^(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator<<(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator>>(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<UShort> operator+=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator-=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator*=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator/=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator%=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator&=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator|=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator^=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator<<=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator>>=(UShort &lhs, RValue<UShort> rhs);
-	RValue<UShort> operator+(RValue<UShort> val);
-	RValue<UShort> operator-(RValue<UShort> val);
-	RValue<UShort> operator~(RValue<UShort> val);
-	RValue<UShort> operator++(UShort &val, int);   // Post-increment
-	const UShort &operator++(UShort &val);   // Pre-increment
-	RValue<UShort> operator--(UShort &val, int);   // Post-decrement
-	const UShort &operator--(UShort &val);   // Pre-decrement
-	RValue<Bool> operator<(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<Bool> operator<=(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<Bool> operator>(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<Bool> operator>=(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<Bool> operator!=(RValue<UShort> lhs, RValue<UShort> rhs);
-	RValue<Bool> operator==(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator+(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator-(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator*(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator/(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator%(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator&(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator|(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator^(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator<<(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator>>(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<UShort> operator+=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator-=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator*=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator/=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator%=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator&=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator|=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator^=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator<<=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator>>=(UShort &lhs, RValue<UShort> rhs);
+RValue<UShort> operator+(RValue<UShort> val);
+RValue<UShort> operator-(RValue<UShort> val);
+RValue<UShort> operator~(RValue<UShort> val);
+RValue<UShort> operator++(UShort &val, int);   // Post-increment
+const UShort &operator++(UShort &val);   // Pre-increment
+RValue<UShort> operator--(UShort &val, int);   // Post-decrement
+const UShort &operator--(UShort &val);   // Pre-decrement
+RValue<Bool> operator<(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<Bool> operator<=(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<Bool> operator>(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<Bool> operator>=(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<Bool> operator!=(RValue<UShort> lhs, RValue<UShort> rhs);
+RValue<Bool> operator==(RValue<UShort> lhs, RValue<UShort> rhs);
 
-	class Byte4 : public LValue<Byte4>
-	{
-	public:
-		explicit Byte4(RValue<Byte8> cast);
+class Byte4 : public LValue<Byte4>
+{
+public:
+	explicit Byte4(RValue<Byte8> cast);
 
-		Byte4() = default;
-	//	Byte4(int x, int y, int z, int w);
-	//	Byte4(RValue<Byte4> rhs);
-	//	Byte4(const Byte4 &rhs);
-		Byte4(const Reference<Byte4> &rhs);
+	Byte4() = default;
+//	Byte4(int x, int y, int z, int w);
+//	Byte4(RValue<Byte4> rhs);
+//	Byte4(const Byte4 &rhs);
+	Byte4(const Reference<Byte4> &rhs);
 
-	//	RValue<Byte4> operator=(RValue<Byte4> rhs);
-	//	RValue<Byte4> operator=(const Byte4 &rhs);
-	//	RValue<Byte4> operator=(const Reference<Byte4> &rhs);
+//	RValue<Byte4> operator=(RValue<Byte4> rhs);
+//	RValue<Byte4> operator=(const Byte4 &rhs);
+//	RValue<Byte4> operator=(const Reference<Byte4> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
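+
+// Commented-out declarations, here and throughout this header, appear to
+// enumerate the full operator set while marking overloads that are not
+// implemented for the type.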
 
 //	RValue<Byte4> operator+(RValue<Byte4> lhs, RValue<Byte4> rhs);
 //	RValue<Byte4> operator-(RValue<Byte4> lhs, RValue<Byte4> rhs);
@@ -556,21 +556,21 @@
 //	RValue<Byte4> operator--(Byte4 &val, int);   // Post-decrement
 //	const Byte4 &operator--(Byte4 &val);   // Pre-decrement
 
-	class SByte4 : public LValue<SByte4>
-	{
-	public:
-		SByte4() = default;
-	//	SByte4(int x, int y, int z, int w);
-	//	SByte4(RValue<SByte4> rhs);
-	//	SByte4(const SByte4 &rhs);
-	//	SByte4(const Reference<SByte4> &rhs);
+class SByte4 : public LValue<SByte4>
+{
+public:
+	SByte4() = default;
+//	SByte4(int x, int y, int z, int w);
+//	SByte4(RValue<SByte4> rhs);
+//	SByte4(const SByte4 &rhs);
+//	SByte4(const Reference<SByte4> &rhs);
 
-	//	RValue<SByte4> operator=(RValue<SByte4> rhs);
-	//	RValue<SByte4> operator=(const SByte4 &rhs);
-	//	RValue<SByte4> operator=(const Reference<SByte4> &rhs);
+//	RValue<SByte4> operator=(RValue<SByte4> rhs);
+//	RValue<SByte4> operator=(const SByte4 &rhs);
+//	RValue<SByte4> operator=(const Reference<SByte4> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
 //	RValue<SByte4> operator+(RValue<SByte4> lhs, RValue<SByte4> rhs);
 //	RValue<SByte4> operator-(RValue<SByte4> lhs, RValue<SByte4> rhs);
@@ -600,127 +600,127 @@
 //	RValue<SByte4> operator--(SByte4 &val, int);   // Post-decrement
 //	const SByte4 &operator--(SByte4 &val);   // Pre-decrement
 
-	class Byte8 : public LValue<Byte8>
-	{
-	public:
-		Byte8() = default;
-		Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7);
-		Byte8(RValue<Byte8> rhs);
-		Byte8(const Byte8 &rhs);
-		Byte8(const Reference<Byte8> &rhs);
+class Byte8 : public LValue<Byte8>
+{
+public:
+	Byte8() = default;
+	Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7);
+	Byte8(RValue<Byte8> rhs);
+	Byte8(const Byte8 &rhs);
+	Byte8(const Reference<Byte8> &rhs);
 
-		RValue<Byte8> operator=(RValue<Byte8> rhs);
-		RValue<Byte8> operator=(const Byte8 &rhs);
-		RValue<Byte8> operator=(const Reference<Byte8> &rhs);
+	RValue<Byte8> operator=(RValue<Byte8> rhs);
+	RValue<Byte8> operator=(const Byte8 &rhs);
+	RValue<Byte8> operator=(const Reference<Byte8> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Byte8> operator+(RValue<Byte8> lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator-(RValue<Byte8> lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator+(RValue<Byte8> lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator-(RValue<Byte8> lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator*(RValue<Byte8> lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator/(RValue<Byte8> lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator%(RValue<Byte8> lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator&(RValue<Byte8> lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator|(RValue<Byte8> lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator^(RValue<Byte8> lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator&(RValue<Byte8> lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator|(RValue<Byte8> lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator^(RValue<Byte8> lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator<<(RValue<Byte8> lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator>>(RValue<Byte8> lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator+=(Byte8 &lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator-=(Byte8 &lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator+=(Byte8 &lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator-=(Byte8 &lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator*=(Byte8 &lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator/=(Byte8 &lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator%=(Byte8 &lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator&=(Byte8 &lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator|=(Byte8 &lhs, RValue<Byte8> rhs);
-	RValue<Byte8> operator^=(Byte8 &lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator&=(Byte8 &lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator|=(Byte8 &lhs, RValue<Byte8> rhs);
+RValue<Byte8> operator^=(Byte8 &lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator<<=(Byte8 &lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator>>=(Byte8 &lhs, RValue<Byte8> rhs);
 //	RValue<Byte8> operator+(RValue<Byte8> val);
 //	RValue<Byte8> operator-(RValue<Byte8> val);
-	RValue<Byte8> operator~(RValue<Byte8> val);
+RValue<Byte8> operator~(RValue<Byte8> val);
 //	RValue<Byte8> operator++(Byte8 &val, int);   // Post-increment
 //	const Byte8 &operator++(Byte8 &val);   // Pre-increment
 //	RValue<Byte8> operator--(Byte8 &val, int);   // Post-decrement
 //	const Byte8 &operator--(Byte8 &val);   // Pre-decrement
 
-	RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y);
-	RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y);
-	RValue<Short4> Unpack(RValue<Byte4> x);
-	RValue<Short4> Unpack(RValue<Byte4> x, RValue<Byte4> y);
-	RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y);
-	RValue<Short4> UnpackHigh(RValue<Byte8> x, RValue<Byte8> y);
-	RValue<Int> SignMask(RValue<Byte8> x);
+RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Short4> Unpack(RValue<Byte4> x);
+RValue<Short4> Unpack(RValue<Byte4> x, RValue<Byte4> y);
+RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Short4> UnpackHigh(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Int> SignMask(RValue<Byte8> x);
 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y);
-	RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y);
 
-	class SByte8 : public LValue<SByte8>
-	{
-	public:
-		SByte8() = default;
-		SByte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7);
-		SByte8(RValue<SByte8> rhs);
-		SByte8(const SByte8 &rhs);
-		SByte8(const Reference<SByte8> &rhs);
+class SByte8 : public LValue<SByte8>
+{
+public:
+	SByte8() = default;
+	SByte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7);
+	SByte8(RValue<SByte8> rhs);
+	SByte8(const SByte8 &rhs);
+	SByte8(const Reference<SByte8> &rhs);
 
-		RValue<SByte8> operator=(RValue<SByte8> rhs);
-		RValue<SByte8> operator=(const SByte8 &rhs);
-		RValue<SByte8> operator=(const Reference<SByte8> &rhs);
+	RValue<SByte8> operator=(RValue<SByte8> rhs);
+	RValue<SByte8> operator=(const SByte8 &rhs);
+	RValue<SByte8> operator=(const Reference<SByte8> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<SByte8> operator+(RValue<SByte8> lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator-(RValue<SByte8> lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator+(RValue<SByte8> lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator-(RValue<SByte8> lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator*(RValue<SByte8> lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator/(RValue<SByte8> lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator%(RValue<SByte8> lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator&(RValue<SByte8> lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator|(RValue<SByte8> lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator^(RValue<SByte8> lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator&(RValue<SByte8> lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator|(RValue<SByte8> lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator^(RValue<SByte8> lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator<<(RValue<SByte8> lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator>>(RValue<SByte8> lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator+=(SByte8 &lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator-=(SByte8 &lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator+=(SByte8 &lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator-=(SByte8 &lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator*=(SByte8 &lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator/=(SByte8 &lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator%=(SByte8 &lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator&=(SByte8 &lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator|=(SByte8 &lhs, RValue<SByte8> rhs);
-	RValue<SByte8> operator^=(SByte8 &lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator&=(SByte8 &lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator|=(SByte8 &lhs, RValue<SByte8> rhs);
+RValue<SByte8> operator^=(SByte8 &lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator<<=(SByte8 &lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator>>=(SByte8 &lhs, RValue<SByte8> rhs);
 //	RValue<SByte8> operator+(RValue<SByte8> val);
 //	RValue<SByte8> operator-(RValue<SByte8> val);
-	RValue<SByte8> operator~(RValue<SByte8> val);
+RValue<SByte8> operator~(RValue<SByte8> val);
 //	RValue<SByte8> operator++(SByte8 &val, int);   // Post-increment
 //	const SByte8 &operator++(SByte8 &val);   // Pre-increment
 //	RValue<SByte8> operator--(SByte8 &val, int);   // Post-decrement
 //	const SByte8 &operator--(SByte8 &val);   // Pre-decrement
 
-	RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y);
-	RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y);
-	RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y);
-	RValue<Short4> UnpackHigh(RValue<SByte8> x, RValue<SByte8> y);
-	RValue<Int> SignMask(RValue<SByte8> x);
-	RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y);
-	RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y);
+RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y);
+RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y);
+RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y);
+RValue<Short4> UnpackHigh(RValue<SByte8> x, RValue<SByte8> y);
+RValue<Int> SignMask(RValue<SByte8> x);
+RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y);
+RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y);
 
-	class Byte16 : public LValue<Byte16>
-	{
-	public:
-		Byte16() = default;
-	//	Byte16(int x, int y, int z, int w);
-		Byte16(RValue<Byte16> rhs);
-		Byte16(const Byte16 &rhs);
-		Byte16(const Reference<Byte16> &rhs);
+class Byte16 : public LValue<Byte16>
+{
+public:
+	Byte16() = default;
+//	Byte16(int x, int y, int z, int w);
+	Byte16(RValue<Byte16> rhs);
+	Byte16(const Byte16 &rhs);
+	Byte16(const Reference<Byte16> &rhs);
 
-		RValue<Byte16> operator=(RValue<Byte16> rhs);
-		RValue<Byte16> operator=(const Byte16 &rhs);
-		RValue<Byte16> operator=(const Reference<Byte16> &rhs);
+	RValue<Byte16> operator=(RValue<Byte16> rhs);
+	RValue<Byte16> operator=(const Byte16 &rhs);
+	RValue<Byte16> operator=(const Reference<Byte16> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
 //	RValue<Byte16> operator+(RValue<Byte16> lhs, RValue<Byte16> rhs);
 //	RValue<Byte16> operator-(RValue<Byte16> lhs, RValue<Byte16> rhs);
@@ -750,21 +750,21 @@
 //	RValue<Byte16> operator--(Byte16 &val, int);   // Post-decrement
 //	const Byte16 &operator--(Byte16 &val);   // Pre-decrement
 
-	class SByte16 : public LValue<SByte16>
-	{
-	public:
-		SByte16() = default;
-	//	SByte16(int x, int y, int z, int w);
-	//	SByte16(RValue<SByte16> rhs);
-	//	SByte16(const SByte16 &rhs);
-	//	SByte16(const Reference<SByte16> &rhs);
+class SByte16 : public LValue<SByte16>
+{
+public:
+	SByte16() = default;
+//	SByte16(int x, int y, int z, int w);
+//	SByte16(RValue<SByte16> rhs);
+//	SByte16(const SByte16 &rhs);
+//	SByte16(const Reference<SByte16> &rhs);
 
-	//	RValue<SByte16> operator=(RValue<SByte16> rhs);
-	//	RValue<SByte16> operator=(const SByte16 &rhs);
-	//	RValue<SByte16> operator=(const Reference<SByte16> &rhs);
+//	RValue<SByte16> operator=(RValue<SByte16> rhs);
+//	RValue<SByte16> operator=(const SByte16 &rhs);
+//	RValue<SByte16> operator=(const Reference<SByte16> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
 //	RValue<SByte16> operator+(RValue<SByte16> lhs, RValue<SByte16> rhs);
 //	RValue<SByte16> operator-(RValue<SByte16> lhs, RValue<SByte16> rhs);
@@ -794,73 +794,73 @@
 //	RValue<SByte16> operator--(SByte16 &val, int);   // Post-decrement
 //	const SByte16 &operator--(SByte16 &val);   // Pre-decrement
 
-	class Short2 : public LValue<Short2>
-	{
-	public:
-		explicit Short2(RValue<Short4> cast);
+class Short2 : public LValue<Short2>
+{
+public:
+	explicit Short2(RValue<Short4> cast);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	class UShort2 : public LValue<UShort2>
-	{
-	public:
-		explicit UShort2(RValue<UShort4> cast);
+class UShort2 : public LValue<UShort2>
+{
+public:
+	explicit UShort2(RValue<UShort4> cast);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	class Short4 : public LValue<Short4>
-	{
-	public:
-		explicit Short4(RValue<Int> cast);
-		explicit Short4(RValue<Int4> cast);
-	//	explicit Short4(RValue<Float> cast);
-		explicit Short4(RValue<Float4> cast);
+class Short4 : public LValue<Short4>
+{
+public:
+	explicit Short4(RValue<Int> cast);
+	explicit Short4(RValue<Int4> cast);
+//	explicit Short4(RValue<Float> cast);
+	explicit Short4(RValue<Float4> cast);
 
-		Short4() = default;
-		Short4(short xyzw);
-		Short4(short x, short y, short z, short w);
-		Short4(RValue<Short4> rhs);
-		Short4(const Short4 &rhs);
-		Short4(const Reference<Short4> &rhs);
-		Short4(RValue<UShort4> rhs);
-		Short4(const UShort4 &rhs);
-		Short4(const Reference<UShort4> &rhs);
+	Short4() = default;
+	Short4(short xyzw);
+	Short4(short x, short y, short z, short w);
+	Short4(RValue<Short4> rhs);
+	Short4(const Short4 &rhs);
+	Short4(const Reference<Short4> &rhs);
+	Short4(RValue<UShort4> rhs);
+	Short4(const UShort4 &rhs);
+	Short4(const Reference<UShort4> &rhs);
 
-		RValue<Short4> operator=(RValue<Short4> rhs);
-		RValue<Short4> operator=(const Short4 &rhs);
-		RValue<Short4> operator=(const Reference<Short4> &rhs);
-		RValue<Short4> operator=(RValue<UShort4> rhs);
-		RValue<Short4> operator=(const UShort4 &rhs);
-		RValue<Short4> operator=(const Reference<UShort4> &rhs);
+	RValue<Short4> operator=(RValue<Short4> rhs);
+	RValue<Short4> operator=(const Short4 &rhs);
+	RValue<Short4> operator=(const Reference<Short4> &rhs);
+	RValue<Short4> operator=(RValue<UShort4> rhs);
+	RValue<Short4> operator=(const UShort4 &rhs);
+	RValue<Short4> operator=(const Reference<UShort4> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Short4> operator+(RValue<Short4> lhs, RValue<Short4> rhs);
-	RValue<Short4> operator-(RValue<Short4> lhs, RValue<Short4> rhs);
-	RValue<Short4> operator*(RValue<Short4> lhs, RValue<Short4> rhs);
+RValue<Short4> operator+(RValue<Short4> lhs, RValue<Short4> rhs);
+RValue<Short4> operator-(RValue<Short4> lhs, RValue<Short4> rhs);
+RValue<Short4> operator*(RValue<Short4> lhs, RValue<Short4> rhs);
 //	RValue<Short4> operator/(RValue<Short4> lhs, RValue<Short4> rhs);
 //	RValue<Short4> operator%(RValue<Short4> lhs, RValue<Short4> rhs);
-	RValue<Short4> operator&(RValue<Short4> lhs, RValue<Short4> rhs);
-	RValue<Short4> operator|(RValue<Short4> lhs, RValue<Short4> rhs);
-	RValue<Short4> operator^(RValue<Short4> lhs, RValue<Short4> rhs);
-	RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs);
-	RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs);
-	RValue<Short4> operator+=(Short4 &lhs, RValue<Short4> rhs);
-	RValue<Short4> operator-=(Short4 &lhs, RValue<Short4> rhs);
-	RValue<Short4> operator*=(Short4 &lhs, RValue<Short4> rhs);
+RValue<Short4> operator&(RValue<Short4> lhs, RValue<Short4> rhs);
+RValue<Short4> operator|(RValue<Short4> lhs, RValue<Short4> rhs);
+RValue<Short4> operator^(RValue<Short4> lhs, RValue<Short4> rhs);
+RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs);
+RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs);
+RValue<Short4> operator+=(Short4 &lhs, RValue<Short4> rhs);
+RValue<Short4> operator-=(Short4 &lhs, RValue<Short4> rhs);
+RValue<Short4> operator*=(Short4 &lhs, RValue<Short4> rhs);
 //	RValue<Short4> operator/=(Short4 &lhs, RValue<Short4> rhs);
 //	RValue<Short4> operator%=(Short4 &lhs, RValue<Short4> rhs);
-	RValue<Short4> operator&=(Short4 &lhs, RValue<Short4> rhs);
-	RValue<Short4> operator|=(Short4 &lhs, RValue<Short4> rhs);
-	RValue<Short4> operator^=(Short4 &lhs, RValue<Short4> rhs);
-	RValue<Short4> operator<<=(Short4 &lhs, unsigned char rhs);
-	RValue<Short4> operator>>=(Short4 &lhs, unsigned char rhs);
+RValue<Short4> operator&=(Short4 &lhs, RValue<Short4> rhs);
+RValue<Short4> operator|=(Short4 &lhs, RValue<Short4> rhs);
+RValue<Short4> operator^=(Short4 &lhs, RValue<Short4> rhs);
+RValue<Short4> operator<<=(Short4 &lhs, unsigned char rhs);
+RValue<Short4> operator>>=(Short4 &lhs, unsigned char rhs);
 //	RValue<Short4> operator+(RValue<Short4> val);
-	RValue<Short4> operator-(RValue<Short4> val);
-	RValue<Short4> operator~(RValue<Short4> val);
+RValue<Short4> operator-(RValue<Short4> val);
+RValue<Short4> operator~(RValue<Short4> val);
 //	RValue<Short4> operator++(Short4 &val, int);   // Post-increment
 //	const Short4 &operator++(Short4 &val);   // Pre-increment
 //	RValue<Short4> operator--(Short4 &val, int);   // Post-decrement
@@ -872,59 +872,59 @@
 //	RValue<Bool> operator!=(RValue<Short4> lhs, RValue<Short4> rhs);
 //	RValue<Bool> operator==(RValue<Short4> lhs, RValue<Short4> rhs);
 
-	RValue<Short4> RoundShort4(RValue<Float4> cast);
-	RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y);
-	RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y);
-	RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y);
-	RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y);
-	RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y);
-	RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y);
-	RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y);
-	RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y);
-	RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y);
-	RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y);
-	RValue<Short4> Swizzle(RValue<Short4> x, uint16_t select);
-	RValue<Short4> Insert(RValue<Short4> val, RValue<Short> element, int i);
-	RValue<Short> Extract(RValue<Short4> val, int i);
-	RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y);
-	RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> RoundShort4(RValue<Float4> cast);
+RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y);
+RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y);
+RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y);
+RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y);
+RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y);
+RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> Swizzle(RValue<Short4> x, uint16_t select);
+RValue<Short4> Insert(RValue<Short4> val, RValue<Short> element, int i);
+RValue<Short> Extract(RValue<Short4> val, int i);
+RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y);
 
-	class UShort4 : public LValue<UShort4>
-	{
-	public:
-		explicit UShort4(RValue<Int4> cast);
-		explicit UShort4(RValue<Float4> cast, bool saturate = false);
+class UShort4 : public LValue<UShort4>
+{
+public:
+	explicit UShort4(RValue<Int4> cast);
+	explicit UShort4(RValue<Float4> cast, bool saturate = false);
 
-		UShort4() = default;
-		UShort4(unsigned short xyzw);
-		UShort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
-		UShort4(RValue<UShort4> rhs);
-		UShort4(const UShort4 &rhs);
-		UShort4(const Reference<UShort4> &rhs);
-		UShort4(RValue<Short4> rhs);
-		UShort4(const Short4 &rhs);
-		UShort4(const Reference<Short4> &rhs);
+	UShort4() = default;
+	UShort4(unsigned short xyzw);
+	UShort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
+	UShort4(RValue<UShort4> rhs);
+	UShort4(const UShort4 &rhs);
+	UShort4(const Reference<UShort4> &rhs);
+	UShort4(RValue<Short4> rhs);
+	UShort4(const Short4 &rhs);
+	UShort4(const Reference<Short4> &rhs);
 
-		RValue<UShort4> operator=(RValue<UShort4> rhs);
-		RValue<UShort4> operator=(const UShort4 &rhs);
-		RValue<UShort4> operator=(const Reference<UShort4> &rhs);
-		RValue<UShort4> operator=(RValue<Short4> rhs);
-		RValue<UShort4> operator=(const Short4 &rhs);
-		RValue<UShort4> operator=(const Reference<Short4> &rhs);
+	RValue<UShort4> operator=(RValue<UShort4> rhs);
+	RValue<UShort4> operator=(const UShort4 &rhs);
+	RValue<UShort4> operator=(const Reference<UShort4> &rhs);
+	RValue<UShort4> operator=(RValue<Short4> rhs);
+	RValue<UShort4> operator=(const Short4 &rhs);
+	RValue<UShort4> operator=(const Reference<Short4> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<UShort4> operator+(RValue<UShort4> lhs, RValue<UShort4> rhs);
-	RValue<UShort4> operator-(RValue<UShort4> lhs, RValue<UShort4> rhs);
-	RValue<UShort4> operator*(RValue<UShort4> lhs, RValue<UShort4> rhs);
+RValue<UShort4> operator+(RValue<UShort4> lhs, RValue<UShort4> rhs);
+RValue<UShort4> operator-(RValue<UShort4> lhs, RValue<UShort4> rhs);
+RValue<UShort4> operator*(RValue<UShort4> lhs, RValue<UShort4> rhs);
 //	RValue<UShort4> operator/(RValue<UShort4> lhs, RValue<UShort4> rhs);
 //	RValue<UShort4> operator%(RValue<UShort4> lhs, RValue<UShort4> rhs);
-	RValue<UShort4> operator&(RValue<UShort4> lhs, RValue<UShort4> rhs);
-	RValue<UShort4> operator|(RValue<UShort4> lhs, RValue<UShort4> rhs);
-	RValue<UShort4> operator^(RValue<UShort4> lhs, RValue<UShort4> rhs);
-	RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs);
-	RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs);
+RValue<UShort4> operator&(RValue<UShort4> lhs, RValue<UShort4> rhs);
+RValue<UShort4> operator|(RValue<UShort4> lhs, RValue<UShort4> rhs);
+RValue<UShort4> operator^(RValue<UShort4> lhs, RValue<UShort4> rhs);
+RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs);
+RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs);
 //	RValue<UShort4> operator+=(UShort4 &lhs, RValue<UShort4> rhs);
 //	RValue<UShort4> operator-=(UShort4 &lhs, RValue<UShort4> rhs);
 //	RValue<UShort4> operator*=(UShort4 &lhs, RValue<UShort4> rhs);
@@ -933,51 +933,51 @@
 //	RValue<UShort4> operator&=(UShort4 &lhs, RValue<UShort4> rhs);
 //	RValue<UShort4> operator|=(UShort4 &lhs, RValue<UShort4> rhs);
 //	RValue<UShort4> operator^=(UShort4 &lhs, RValue<UShort4> rhs);
-	RValue<UShort4> operator<<=(UShort4 &lhs, unsigned char rhs);
-	RValue<UShort4> operator>>=(UShort4 &lhs, unsigned char rhs);
+RValue<UShort4> operator<<=(UShort4 &lhs, unsigned char rhs);
+RValue<UShort4> operator>>=(UShort4 &lhs, unsigned char rhs);
 //	RValue<UShort4> operator+(RValue<UShort4> val);
 //	RValue<UShort4> operator-(RValue<UShort4> val);
-	RValue<UShort4> operator~(RValue<UShort4> val);
+RValue<UShort4> operator~(RValue<UShort4> val);
 //	RValue<UShort4> operator++(UShort4 &val, int);   // Post-increment
 //	const UShort4 &operator++(UShort4 &val);   // Pre-increment
 //	RValue<UShort4> operator--(UShort4 &val, int);   // Post-decrement
 //	const UShort4 &operator--(UShort4 &val);   // Pre-decrement
 
-	RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y);
-	RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y);
-	RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y);
-	RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y);
-	RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y);
-	RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y);
+RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y);
+RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y);
+RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y);
+RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y);
+RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y);
+RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y);
 
-	class Short8 : public LValue<Short8>
-	{
-	public:
-		Short8() = default;
-		Short8(short c);
-		Short8(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7);
-		Short8(RValue<Short8> rhs);
-	//	Short8(const Short8 &rhs);
-		Short8(const Reference<Short8> &rhs);
-		Short8(RValue<Short4> lo, RValue<Short4> hi);
+class Short8 : public LValue<Short8>
+{
+public:
+	Short8() = default;
+	Short8(short c);
+	Short8(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7);
+	Short8(RValue<Short8> rhs);
+//	Short8(const Short8 &rhs);
+	Short8(const Reference<Short8> &rhs);
+	Short8(RValue<Short4> lo, RValue<Short4> hi);
 
-		RValue<Short8> operator=(RValue<Short8> rhs);
-		RValue<Short8> operator=(const Short8 &rhs);
-		RValue<Short8> operator=(const Reference<Short8> &rhs);
+	RValue<Short8> operator=(RValue<Short8> rhs);
+	RValue<Short8> operator=(const Short8 &rhs);
+	RValue<Short8> operator=(const Reference<Short8> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Short8> operator+(RValue<Short8> lhs, RValue<Short8> rhs);
+RValue<Short8> operator+(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Short8> operator-(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Short8> operator*(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Short8> operator/(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Short8> operator%(RValue<Short8> lhs, RValue<Short8> rhs);
-	RValue<Short8> operator&(RValue<Short8> lhs, RValue<Short8> rhs);
+RValue<Short8> operator&(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Short8> operator|(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Short8> operator^(RValue<Short8> lhs, RValue<Short8> rhs);
-	RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs);
-	RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs);
+RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs);
+RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs);
 //	RValue<Short8> operator<<(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Short8> operator>>(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Short8> operator+=(Short8 &lhs, RValue<Short8> rhs);
@@ -1004,41 +1004,41 @@
 //	RValue<Bool> operator!=(RValue<Short8> lhs, RValue<Short8> rhs);
 //	RValue<Bool> operator==(RValue<Short8> lhs, RValue<Short8> rhs);
 
-	RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y);
-	RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y);
-	RValue<Int4> Abs(RValue<Int4> x);
+RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y);
+RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y);
+RValue<Int4> Abs(RValue<Int4> x);
 
-	class UShort8 : public LValue<UShort8>
-	{
-	public:
-		UShort8() = default;
-		UShort8(unsigned short c);
-		UShort8(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7);
-		UShort8(RValue<UShort8> rhs);
-	//	UShort8(const UShort8 &rhs);
-		UShort8(const Reference<UShort8> &rhs);
-		UShort8(RValue<UShort4> lo, RValue<UShort4> hi);
+class UShort8 : public LValue<UShort8>
+{
+public:
+	UShort8() = default;
+	UShort8(unsigned short c);
+	UShort8(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7);
+	UShort8(RValue<UShort8> rhs);
+//	UShort8(const UShort8 &rhs);
+	UShort8(const Reference<UShort8> &rhs);
+	UShort8(RValue<UShort4> lo, RValue<UShort4> hi);
 
-		RValue<UShort8> operator=(RValue<UShort8> rhs);
-		RValue<UShort8> operator=(const UShort8 &rhs);
-		RValue<UShort8> operator=(const Reference<UShort8> &rhs);
+	RValue<UShort8> operator=(RValue<UShort8> rhs);
+	RValue<UShort8> operator=(const UShort8 &rhs);
+	RValue<UShort8> operator=(const Reference<UShort8> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<UShort8> operator+(RValue<UShort8> lhs, RValue<UShort8> rhs);
+RValue<UShort8> operator+(RValue<UShort8> lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator-(RValue<UShort8> lhs, RValue<UShort8> rhs);
-	RValue<UShort8> operator*(RValue<UShort8> lhs, RValue<UShort8> rhs);
+RValue<UShort8> operator*(RValue<UShort8> lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator/(RValue<UShort8> lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator%(RValue<UShort8> lhs, RValue<UShort8> rhs);
-	RValue<UShort8> operator&(RValue<UShort8> lhs, RValue<UShort8> rhs);
+RValue<UShort8> operator&(RValue<UShort8> lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator|(RValue<UShort8> lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator^(RValue<UShort8> lhs, RValue<UShort8> rhs);
-	RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs);
-	RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs);
+RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs);
+RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs);
 //	RValue<UShort8> operator<<(RValue<UShort8> lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator>>(RValue<UShort8> lhs, RValue<UShort8> rhs);
-	RValue<UShort8> operator+=(UShort8 &lhs, RValue<UShort8> rhs);
+RValue<UShort8> operator+=(UShort8 &lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator-=(UShort8 &lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator*=(UShort8 &lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator/=(UShort8 &lhs, RValue<UShort8> rhs);
@@ -1050,7 +1050,7 @@
 //	RValue<UShort8> operator>>=(UShort8 &lhs, RValue<UShort8> rhs);
 //	RValue<UShort8> operator+(RValue<UShort8> val);
 //	RValue<UShort8> operator-(RValue<UShort8> val);
-	RValue<UShort8> operator~(RValue<UShort8> val);
+RValue<UShort8> operator~(RValue<UShort8> val);
 //	RValue<UShort8> operator++(UShort8 &val, int);   // Post-increment
 //	const UShort8 &operator++(UShort8 &val);   // Pre-increment
 //	RValue<UShort8> operator--(UShort8 &val, int);   // Post-decrement
@@ -1062,124 +1062,124 @@
 //	RValue<Bool> operator!=(RValue<UShort8> lhs, RValue<UShort8> rhs);
 //	RValue<Bool> operator==(RValue<UShort8> lhs, RValue<UShort8> rhs);
 
-	RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7);
-	RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y);
+RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7);
+RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y);
 
-	class Int : public LValue<Int>
-	{
-	public:
-		Int(Argument<Int> argument);
+class Int : public LValue<Int>
+{
+public:
+	Int(Argument<Int> argument);
 
-		explicit Int(RValue<Byte> cast);
-		explicit Int(RValue<SByte> cast);
-		explicit Int(RValue<Short> cast);
-		explicit Int(RValue<UShort> cast);
-		explicit Int(RValue<Int2> cast);
-		explicit Int(RValue<Long> cast);
-		explicit Int(RValue<Float> cast);
+	explicit Int(RValue<Byte> cast);
+	explicit Int(RValue<SByte> cast);
+	explicit Int(RValue<Short> cast);
+	explicit Int(RValue<UShort> cast);
+	explicit Int(RValue<Int2> cast);
+	explicit Int(RValue<Long> cast);
+	explicit Int(RValue<Float> cast);
 
-		Int() = default;
-		Int(int x);
-		Int(RValue<Int> rhs);
-		Int(RValue<UInt> rhs);
-		Int(const Int &rhs);
-		Int(const UInt &rhs);
-		Int(const Reference<Int> &rhs);
-		Int(const Reference<UInt> &rhs);
+	Int() = default;
+	Int(int x);
+	Int(RValue<Int> rhs);
+	Int(RValue<UInt> rhs);
+	Int(const Int &rhs);
+	Int(const UInt &rhs);
+	Int(const Reference<Int> &rhs);
+	Int(const Reference<UInt> &rhs);
 
-		RValue<Int> operator=(int rhs);
-		RValue<Int> operator=(RValue<Int> rhs);
-		RValue<Int> operator=(RValue<UInt> rhs);
-		RValue<Int> operator=(const Int &rhs);
-		RValue<Int> operator=(const UInt &rhs);
-		RValue<Int> operator=(const Reference<Int> &rhs);
-		RValue<Int> operator=(const Reference<UInt> &rhs);
+	RValue<Int> operator=(int rhs);
+	RValue<Int> operator=(RValue<Int> rhs);
+	RValue<Int> operator=(RValue<UInt> rhs);
+	RValue<Int> operator=(const Int &rhs);
+	RValue<Int> operator=(const UInt &rhs);
+	RValue<Int> operator=(const Reference<Int> &rhs);
+	RValue<Int> operator=(const Reference<UInt> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Int> operator+(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator-(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator*(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator/(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator%(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator&(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator|(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator^(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator<<(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator>>(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Int> operator+=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator-=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator*=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator/=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator%=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator&=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator|=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator^=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator<<=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator>>=(Int &lhs, RValue<Int> rhs);
-	RValue<Int> operator+(RValue<Int> val);
-	RValue<Int> operator-(RValue<Int> val);
-	RValue<Int> operator~(RValue<Int> val);
-	RValue<Int> operator++(Int &val, int);   // Post-increment
-	const Int &operator++(Int &val);   // Pre-increment
-	RValue<Int> operator--(Int &val, int);   // Post-decrement
-	const Int &operator--(Int &val);   // Pre-decrement
-	RValue<Bool> operator<(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Bool> operator<=(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Bool> operator>(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Bool> operator>=(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Bool> operator!=(RValue<Int> lhs, RValue<Int> rhs);
-	RValue<Bool> operator==(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator+(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator-(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator*(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator/(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator%(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator&(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator|(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator^(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator<<(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator>>(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Int> operator+=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator-=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator*=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator/=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator%=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator&=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator|=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator^=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator<<=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator>>=(Int &lhs, RValue<Int> rhs);
+RValue<Int> operator+(RValue<Int> val);
+RValue<Int> operator-(RValue<Int> val);
+RValue<Int> operator~(RValue<Int> val);
+RValue<Int> operator++(Int &val, int);   // Post-increment
+const Int &operator++(Int &val);   // Pre-increment
+RValue<Int> operator--(Int &val, int);   // Post-decrement
+const Int &operator--(Int &val);   // Pre-decrement
+RValue<Bool> operator<(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Bool> operator<=(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Bool> operator>(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Bool> operator>=(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Bool> operator!=(RValue<Int> lhs, RValue<Int> rhs);
+RValue<Bool> operator==(RValue<Int> lhs, RValue<Int> rhs);
 
-	RValue<Int> Max(RValue<Int> x, RValue<Int> y);
-	RValue<Int> Min(RValue<Int> x, RValue<Int> y);
-	RValue<Int> Clamp(RValue<Int> x, RValue<Int> min, RValue<Int> max);
-	RValue<Int> RoundInt(RValue<Float> cast);
+RValue<Int> Max(RValue<Int> x, RValue<Int> y);
+RValue<Int> Min(RValue<Int> x, RValue<Int> y);
+RValue<Int> Clamp(RValue<Int> x, RValue<Int> min, RValue<Int> max);
+RValue<Int> RoundInt(RValue<Float> cast);
 
-	class Long : public LValue<Long>
-	{
-	public:
-	//	Long(Argument<Long> argument);
+class Long : public LValue<Long>
+{
+public:
+//	Long(Argument<Long> argument);
 
-	//	explicit Long(RValue<Short> cast);
-	//	explicit Long(RValue<UShort> cast);
-		explicit Long(RValue<Int> cast);
-		explicit Long(RValue<UInt> cast);
-	//	explicit Long(RValue<Float> cast);
+//	explicit Long(RValue<Short> cast);
+//	explicit Long(RValue<UShort> cast);
+	explicit Long(RValue<Int> cast);
+	explicit Long(RValue<UInt> cast);
+//	explicit Long(RValue<Float> cast);
 
-		Long() = default;
-	//	Long(qword x);
-		Long(RValue<Long> rhs);
-	//	Long(RValue<ULong> rhs);
-	//	Long(const Long &rhs);
-	//	Long(const Reference<Long> &rhs);
-	//	Long(const ULong &rhs);
-	//	Long(const Reference<ULong> &rhs);
+	Long() = default;
+//	Long(qword x);
+	Long(RValue<Long> rhs);
+//	Long(RValue<ULong> rhs);
+//	Long(const Long &rhs);
+//	Long(const Reference<Long> &rhs);
+//	Long(const ULong &rhs);
+//	Long(const Reference<ULong> &rhs);
 
-		RValue<Long> operator=(int64_t rhs);
-		RValue<Long> operator=(RValue<Long> rhs);
-	//	RValue<Long> operator=(RValue<ULong> rhs);
-		RValue<Long> operator=(const Long &rhs);
-		RValue<Long> operator=(const Reference<Long> &rhs);
-	//	RValue<Long> operator=(const ULong &rhs);
-	//	RValue<Long> operator=(const Reference<ULong> &rhs);
+	RValue<Long> operator=(int64_t rhs);
+	RValue<Long> operator=(RValue<Long> rhs);
+//	RValue<Long> operator=(RValue<ULong> rhs);
+	RValue<Long> operator=(const Long &rhs);
+	RValue<Long> operator=(const Reference<Long> &rhs);
+//	RValue<Long> operator=(const ULong &rhs);
+//	RValue<Long> operator=(const Reference<ULong> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Long> operator+(RValue<Long> lhs, RValue<Long> rhs);
-	RValue<Long> operator-(RValue<Long> lhs, RValue<Long> rhs);
-	RValue<Long> operator*(RValue<Long> lhs, RValue<Long> rhs);
+RValue<Long> operator+(RValue<Long> lhs, RValue<Long> rhs);
+RValue<Long> operator-(RValue<Long> lhs, RValue<Long> rhs);
+RValue<Long> operator*(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator/(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator%(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator&(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator|(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator^(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator<<(RValue<Long> lhs, RValue<Long> rhs);
-	RValue<Long> operator>>(RValue<Long> lhs, RValue<Long> rhs);
-	RValue<Long> operator+=(Long &lhs, RValue<Long> rhs);
-	RValue<Long> operator-=(Long &lhs, RValue<Long> rhs);
+RValue<Long> operator>>(RValue<Long> lhs, RValue<Long> rhs);
+RValue<Long> operator+=(Long &lhs, RValue<Long> rhs);
+RValue<Long> operator-=(Long &lhs, RValue<Long> rhs);
 //	RValue<Long> operator*=(Long &lhs, RValue<Long> rhs);
 //	RValue<Long> operator/=(Long &lhs, RValue<Long> rhs);
 //	RValue<Long> operator%=(Long &lhs, RValue<Long> rhs);
@@ -1203,133 +1203,133 @@
 //	RValue<Bool> operator==(RValue<Long> lhs, RValue<Long> rhs);
 
 //	RValue<Long> RoundLong(RValue<Float> cast);
-	RValue<Long> AddAtomic( RValue<Pointer<Long>> x, RValue<Long> y);
+RValue<Long> AddAtomic(RValue<Pointer<Long>> x, RValue<Long> y);
 
-	class UInt : public LValue<UInt>
-	{
-	public:
-		UInt(Argument<UInt> argument);
+class UInt : public LValue<UInt>
+{
+public:
+	UInt(Argument<UInt> argument);
 
-		explicit UInt(RValue<UShort> cast);
-		explicit UInt(RValue<Long> cast);
-		explicit UInt(RValue<Float> cast);
+	explicit UInt(RValue<UShort> cast);
+	explicit UInt(RValue<Long> cast);
+	explicit UInt(RValue<Float> cast);
 
-		UInt() = default;
-		UInt(int x);
-		UInt(unsigned int x);
-		UInt(RValue<UInt> rhs);
-		UInt(RValue<Int> rhs);
-		UInt(const UInt &rhs);
-		UInt(const Int &rhs);
-		UInt(const Reference<UInt> &rhs);
-		UInt(const Reference<Int> &rhs);
+	UInt() = default;
+	UInt(int x);
+	UInt(unsigned int x);
+	UInt(RValue<UInt> rhs);
+	UInt(RValue<Int> rhs);
+	UInt(const UInt &rhs);
+	UInt(const Int &rhs);
+	UInt(const Reference<UInt> &rhs);
+	UInt(const Reference<Int> &rhs);
 
-		RValue<UInt> operator=(unsigned int rhs);
-		RValue<UInt> operator=(RValue<UInt> rhs);
-		RValue<UInt> operator=(RValue<Int> rhs);
-		RValue<UInt> operator=(const UInt &rhs);
-		RValue<UInt> operator=(const Int &rhs);
-		RValue<UInt> operator=(const Reference<UInt> &rhs);
-		RValue<UInt> operator=(const Reference<Int> &rhs);
+	RValue<UInt> operator=(unsigned int rhs);
+	RValue<UInt> operator=(RValue<UInt> rhs);
+	RValue<UInt> operator=(RValue<Int> rhs);
+	RValue<UInt> operator=(const UInt &rhs);
+	RValue<UInt> operator=(const Int &rhs);
+	RValue<UInt> operator=(const Reference<UInt> &rhs);
+	RValue<UInt> operator=(const Reference<Int> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<UInt> operator+(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator-(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator*(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator/(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator%(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator&(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator|(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator^(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator<<(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator>>(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<UInt> operator+=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator-=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator*=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator/=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator%=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator&=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator|=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator^=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator<<=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator>>=(UInt &lhs, RValue<UInt> rhs);
-	RValue<UInt> operator+(RValue<UInt> val);
-	RValue<UInt> operator-(RValue<UInt> val);
-	RValue<UInt> operator~(RValue<UInt> val);
-	RValue<UInt> operator++(UInt &val, int);   // Post-increment
-	const UInt &operator++(UInt &val);   // Pre-increment
-	RValue<UInt> operator--(UInt &val, int);   // Post-decrement
-	const UInt &operator--(UInt &val);   // Pre-decrement
-	RValue<Bool> operator<(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<Bool> operator<=(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<Bool> operator>(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<Bool> operator>=(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<Bool> operator!=(RValue<UInt> lhs, RValue<UInt> rhs);
-	RValue<Bool> operator==(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator+(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator-(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator*(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator/(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator%(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator&(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator|(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator^(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator<<(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator>>(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<UInt> operator+=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator-=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator*=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator/=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator%=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator&=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator|=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator^=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator<<=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator>>=(UInt &lhs, RValue<UInt> rhs);
+RValue<UInt> operator+(RValue<UInt> val);
+RValue<UInt> operator-(RValue<UInt> val);
+RValue<UInt> operator~(RValue<UInt> val);
+RValue<UInt> operator++(UInt &val, int);   // Post-increment
+const UInt &operator++(UInt &val);   // Pre-increment
+RValue<UInt> operator--(UInt &val, int);   // Post-decrement
+const UInt &operator--(UInt &val);   // Pre-decrement
+RValue<Bool> operator<(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<Bool> operator<=(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<Bool> operator>(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<Bool> operator>=(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<Bool> operator!=(RValue<UInt> lhs, RValue<UInt> rhs);
+RValue<Bool> operator==(RValue<UInt> lhs, RValue<UInt> rhs);
 
-	RValue<UInt> Max(RValue<UInt> x, RValue<UInt> y);
-	RValue<UInt> Min(RValue<UInt> x, RValue<UInt> y);
-	RValue<UInt> Clamp(RValue<UInt> x, RValue<UInt> min, RValue<UInt> max);
+RValue<UInt> Max(RValue<UInt> x, RValue<UInt> y);
+RValue<UInt> Min(RValue<UInt> x, RValue<UInt> y);
+RValue<UInt> Clamp(RValue<UInt> x, RValue<UInt> min, RValue<UInt> max);
 
-	RValue<UInt> AddAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-	RValue<UInt> SubAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-	RValue<UInt> AndAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-	RValue<UInt> OrAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-	RValue<UInt> XorAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-	RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder);
-	RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder);
-	RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-	RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-	RValue<UInt> ExchangeAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-	RValue<UInt> CompareExchangeAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, RValue<UInt> compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal);
+RValue<UInt> AddAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
+RValue<UInt> SubAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
+RValue<UInt> AndAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
+RValue<UInt> OrAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
+RValue<UInt> XorAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
+RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder);
+RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder);
+RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
+RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
+RValue<UInt> ExchangeAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
+RValue<UInt> CompareExchangeAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, RValue<UInt> compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal);
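+
+// The atomics above take an explicit std::memory_order, mirroring C++11
+// <atomic>; each presumably performs its read-modify-write on the pointed-to
+// value and returns the value it read. MinAtomic/MaxAtomic are overloaded
+// for Int and UInt to select signed versus unsigned comparison.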
 
 //	RValue<UInt> RoundUInt(RValue<Float> cast);
 
-	class Int2 : public LValue<Int2>
-	{
-	public:
-	//	explicit Int2(RValue<Int> cast);
-		explicit Int2(RValue<Int4> cast);
+class Int2 : public LValue<Int2>
+{
+public:
+//	explicit Int2(RValue<Int> cast);
+	explicit Int2(RValue<Int4> cast);
 
-		Int2() = default;
-		Int2(int x, int y);
-		Int2(RValue<Int2> rhs);
-		Int2(const Int2 &rhs);
-		Int2(const Reference<Int2> &rhs);
-		Int2(RValue<Int> lo, RValue<Int> hi);
+	Int2() = default;
+	Int2(int x, int y);
+	Int2(RValue<Int2> rhs);
+	Int2(const Int2 &rhs);
+	Int2(const Reference<Int2> &rhs);
+	Int2(RValue<Int> lo, RValue<Int> hi);
 
-		RValue<Int2> operator=(RValue<Int2> rhs);
-		RValue<Int2> operator=(const Int2 &rhs);
-		RValue<Int2> operator=(const Reference<Int2> &rhs);
+	RValue<Int2> operator=(RValue<Int2> rhs);
+	RValue<Int2> operator=(const Int2 &rhs);
+	RValue<Int2> operator=(const Reference<Int2> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<Int2> operator+(RValue<Int2> lhs, RValue<Int2> rhs);
-	RValue<Int2> operator-(RValue<Int2> lhs, RValue<Int2> rhs);
+RValue<Int2> operator+(RValue<Int2> lhs, RValue<Int2> rhs);
+RValue<Int2> operator-(RValue<Int2> lhs, RValue<Int2> rhs);
 //	RValue<Int2> operator*(RValue<Int2> lhs, RValue<Int2> rhs);
 //	RValue<Int2> operator/(RValue<Int2> lhs, RValue<Int2> rhs);
 //	RValue<Int2> operator%(RValue<Int2> lhs, RValue<Int2> rhs);
-	RValue<Int2> operator&(RValue<Int2> lhs, RValue<Int2> rhs);
-	RValue<Int2> operator|(RValue<Int2> lhs, RValue<Int2> rhs);
-	RValue<Int2> operator^(RValue<Int2> lhs, RValue<Int2> rhs);
-	RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs);
-	RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs);
-	RValue<Int2> operator+=(Int2 &lhs, RValue<Int2> rhs);
-	RValue<Int2> operator-=(Int2 &lhs, RValue<Int2> rhs);
+RValue<Int2> operator&(RValue<Int2> lhs, RValue<Int2> rhs);
+RValue<Int2> operator|(RValue<Int2> lhs, RValue<Int2> rhs);
+RValue<Int2> operator^(RValue<Int2> lhs, RValue<Int2> rhs);
+RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs);
+RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs);
+RValue<Int2> operator+=(Int2 &lhs, RValue<Int2> rhs);
+RValue<Int2> operator-=(Int2 &lhs, RValue<Int2> rhs);
 //	RValue<Int2> operator*=(Int2 &lhs, RValue<Int2> rhs);
 //	RValue<Int2> operator/=(Int2 &lhs, RValue<Int2> rhs);
 //	RValue<Int2> operator%=(Int2 &lhs, RValue<Int2> rhs);
-	RValue<Int2> operator&=(Int2 &lhs, RValue<Int2> rhs);
-	RValue<Int2> operator|=(Int2 &lhs, RValue<Int2> rhs);
-	RValue<Int2> operator^=(Int2 &lhs, RValue<Int2> rhs);
-	RValue<Int2> operator<<=(Int2 &lhs, unsigned char rhs);
-	RValue<Int2> operator>>=(Int2 &lhs, unsigned char rhs);
+RValue<Int2> operator&=(Int2 &lhs, RValue<Int2> rhs);
+RValue<Int2> operator|=(Int2 &lhs, RValue<Int2> rhs);
+RValue<Int2> operator^=(Int2 &lhs, RValue<Int2> rhs);
+RValue<Int2> operator<<=(Int2 &lhs, unsigned char rhs);
+RValue<Int2> operator>>=(Int2 &lhs, unsigned char rhs);
 //	RValue<Int2> operator+(RValue<Int2> val);
 //	RValue<Int2> operator-(RValue<Int2> val);
-	RValue<Int2> operator~(RValue<Int2> val);
+RValue<Int2> operator~(RValue<Int2> val);
 //	RValue<Int2> operator++(Int2 &val, int);   // Post-increment
 //	const Int2 &operator++(Int2 &val);   // Pre-increment
 //	RValue<Int2> operator--(Int2 &val, int);   // Post-decrement
@@ -1342,50 +1342,50 @@
 //	RValue<Bool> operator==(RValue<Int2> lhs, RValue<Int2> rhs);
 
 //	RValue<Int2> RoundInt(RValue<Float4> cast);
-	RValue<Short4> UnpackLow(RValue<Int2> x, RValue<Int2> y);
-	RValue<Short4> UnpackHigh(RValue<Int2> x, RValue<Int2> y);
-	RValue<Int> Extract(RValue<Int2> val, int i);
-	RValue<Int2> Insert(RValue<Int2> val, RValue<Int> element, int i);
+RValue<Short4> UnpackLow(RValue<Int2> x, RValue<Int2> y);
+RValue<Short4> UnpackHigh(RValue<Int2> x, RValue<Int2> y);
+RValue<Int> Extract(RValue<Int2> val, int i);
+RValue<Int2> Insert(RValue<Int2> val, RValue<Int> element, int i);
 
-	class UInt2 : public LValue<UInt2>
-	{
-	public:
-		UInt2() = default;
-		UInt2(unsigned int x, unsigned int y);
-		UInt2(RValue<UInt2> rhs);
-		UInt2(const UInt2 &rhs);
-		UInt2(const Reference<UInt2> &rhs);
+class UInt2 : public LValue<UInt2>
+{
+public:
+	UInt2() = default;
+	UInt2(unsigned int x, unsigned int y);
+	UInt2(RValue<UInt2> rhs);
+	UInt2(const UInt2 &rhs);
+	UInt2(const Reference<UInt2> &rhs);
 
-		RValue<UInt2> operator=(RValue<UInt2> rhs);
-		RValue<UInt2> operator=(const UInt2 &rhs);
-		RValue<UInt2> operator=(const Reference<UInt2> &rhs);
+	RValue<UInt2> operator=(RValue<UInt2> rhs);
+	RValue<UInt2> operator=(const UInt2 &rhs);
+	RValue<UInt2> operator=(const Reference<UInt2> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	RValue<UInt2> operator+(RValue<UInt2> lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator-(RValue<UInt2> lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator+(RValue<UInt2> lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator-(RValue<UInt2> lhs, RValue<UInt2> rhs);
 //	RValue<UInt2> operator*(RValue<UInt2> lhs, RValue<UInt2> rhs);
 //	RValue<UInt2> operator/(RValue<UInt2> lhs, RValue<UInt2> rhs);
 //	RValue<UInt2> operator%(RValue<UInt2> lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator&(RValue<UInt2> lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator|(RValue<UInt2> lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator^(RValue<UInt2> lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs);
-	RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs);
-	RValue<UInt2> operator+=(UInt2 &lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator-=(UInt2 &lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator&(RValue<UInt2> lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator|(RValue<UInt2> lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator^(RValue<UInt2> lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs);
+RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs);
+RValue<UInt2> operator+=(UInt2 &lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator-=(UInt2 &lhs, RValue<UInt2> rhs);
 //	RValue<UInt2> operator*=(UInt2 &lhs, RValue<UInt2> rhs);
 //	RValue<UInt2> operator/=(UInt2 &lhs, RValue<UInt2> rhs);
 //	RValue<UInt2> operator%=(UInt2 &lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator&=(UInt2 &lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator|=(UInt2 &lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator^=(UInt2 &lhs, RValue<UInt2> rhs);
-	RValue<UInt2> operator<<=(UInt2 &lhs, unsigned char rhs);
-	RValue<UInt2> operator>>=(UInt2 &lhs, unsigned char rhs);
+RValue<UInt2> operator&=(UInt2 &lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator|=(UInt2 &lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator^=(UInt2 &lhs, RValue<UInt2> rhs);
+RValue<UInt2> operator<<=(UInt2 &lhs, unsigned char rhs);
+RValue<UInt2> operator>>=(UInt2 &lhs, unsigned char rhs);
 //	RValue<UInt2> operator+(RValue<UInt2> val);
 //	RValue<UInt2> operator-(RValue<UInt2> val);
-	RValue<UInt2> operator~(RValue<UInt2> val);
+RValue<UInt2> operator~(RValue<UInt2> val);
 //	RValue<UInt2> operator++(UInt2 &val, int);   // Post-increment
 //	const UInt2 &operator++(UInt2 &val);   // Pre-increment
 //	RValue<UInt2> operator--(UInt2 &val, int);   // Post-decrement
@@ -1398,516 +1398,516 @@
 //	RValue<Bool> operator==(RValue<UInt2> lhs, RValue<UInt2> rhs);
 
 //	RValue<UInt2> RoundInt(RValue<Float4> cast);
-	RValue<UInt> Extract(RValue<UInt2> val, int i);
-	RValue<UInt2> Insert(RValue<UInt2> val, RValue<UInt> element, int i);
+RValue<UInt> Extract(RValue<UInt2> val, int i);
+RValue<UInt2> Insert(RValue<UInt2> val, RValue<UInt> element, int i);
 
-	template<class T>
-	struct Scalar;
+template<class T>
+struct Scalar;
 
-	template<class Vector4>
-	struct XYZW;
+template<class Vector4>
+struct XYZW;
 
-	template<class Vector4, int T>
-	class Swizzle2
+template<class Vector4, int T>
+class Swizzle2
+{
+	friend Vector4;
+
+public:
+	operator RValue<Vector4>() const;
+
+private:
+	Vector4 *parent;
+};
+
+template<class Vector4, int T>
+class Swizzle4
+{
+public:
+	operator RValue<Vector4>() const;
+
+private:
+	Vector4 *parent;
+};
+
+template<class Vector4, int T>
+class SwizzleMask4
+{
+	friend XYZW<Vector4>;
+
+public:
+	operator RValue<Vector4>() const;
+
+	RValue<Vector4> operator=(RValue<Vector4> rhs);
+	RValue<Vector4> operator=(RValue<typename Scalar<Vector4>::Type> rhs);
+
+private:
+	Vector4 *parent;
+};
+
+template<>
+struct Scalar<Float4>
+{
+	using Type = Float;
+};
+
+template<>
+struct Scalar<Int4>
+{
+	using Type = Int;
+};
+
+template<>
+struct Scalar<UInt4>
+{
+	using Type = UInt;
+};
+
+template<class Vector4, int T>
+class SwizzleMask1
+{
+public:
+	operator RValue<typename Scalar<Vector4>::Type>() const;
+	operator RValue<Vector4>() const;
+
+	RValue<Vector4> operator=(float x);
+	RValue<Vector4> operator=(RValue<Vector4> rhs);
+	RValue<Vector4> operator=(RValue<typename Scalar<Vector4>::Type> rhs);
+
+private:
+	Vector4 *parent;
+};
+
+template<class Vector4, int T>
+class SwizzleMask2
+{
+	friend class Float4;
+
+public:
+	operator RValue<Vector4>() const;
+
+	RValue<Vector4> operator=(RValue<Vector4> rhs);
+
+private:
+	Float4 *parent;
+};
+
+template<class Vector4>
+struct XYZW
+{
+	friend Vector4;
+
+private:
+	XYZW(Vector4 *parent)
 	{
-		friend Vector4;
+		xyzw.parent = parent;
+	}
 
-	public:
-		operator RValue<Vector4>() const;
-
-	private:
-		Vector4 *parent;
-	};
-
-	template<class Vector4, int T>
-	class Swizzle4
+public:
+	union
 	{
-	public:
-		operator RValue<Vector4>() const;
-
-	private:
-		Vector4 *parent;
+		SwizzleMask1<Vector4, 0x0000> x;
+		SwizzleMask1<Vector4, 0x1111> y;
+		SwizzleMask1<Vector4, 0x2222> z;
+		SwizzleMask1<Vector4, 0x3333> w;
+		Swizzle2<Vector4, 0x0000>     xx;
+		Swizzle2<Vector4, 0x1000>     yx;
+		Swizzle2<Vector4, 0x2000>     zx;
+		Swizzle2<Vector4, 0x3000>     wx;
+		SwizzleMask2<Vector4, 0x0111> xy;
+		Swizzle2<Vector4, 0x1111>     yy;
+		Swizzle2<Vector4, 0x2111>     zy;
+		Swizzle2<Vector4, 0x3111>     wy;
+		SwizzleMask2<Vector4, 0x0222> xz;
+		SwizzleMask2<Vector4, 0x1222> yz;
+		Swizzle2<Vector4, 0x2222>     zz;
+		Swizzle2<Vector4, 0x3222>     wz;
+		SwizzleMask2<Vector4, 0x0333> xw;
+		SwizzleMask2<Vector4, 0x1333> yw;
+		SwizzleMask2<Vector4, 0x2333> zw;
+		Swizzle2<Vector4, 0x3333>     ww;
+		Swizzle4<Vector4, 0x0000>     xxx;
+		Swizzle4<Vector4, 0x1000>     yxx;
+		Swizzle4<Vector4, 0x2000>     zxx;
+		Swizzle4<Vector4, 0x3000>     wxx;
+		Swizzle4<Vector4, 0x0100>     xyx;
+		Swizzle4<Vector4, 0x1100>     yyx;
+		Swizzle4<Vector4, 0x2100>     zyx;
+		Swizzle4<Vector4, 0x3100>     wyx;
+		Swizzle4<Vector4, 0x0200>     xzx;
+		Swizzle4<Vector4, 0x1200>     yzx;
+		Swizzle4<Vector4, 0x2200>     zzx;
+		Swizzle4<Vector4, 0x3200>     wzx;
+		Swizzle4<Vector4, 0x0300>     xwx;
+		Swizzle4<Vector4, 0x1300>     ywx;
+		Swizzle4<Vector4, 0x2300>     zwx;
+		Swizzle4<Vector4, 0x3300>     wwx;
+		Swizzle4<Vector4, 0x0011>     xxy;
+		Swizzle4<Vector4, 0x1011>     yxy;
+		Swizzle4<Vector4, 0x2011>     zxy;
+		Swizzle4<Vector4, 0x3011>     wxy;
+		Swizzle4<Vector4, 0x0111>     xyy;
+		Swizzle4<Vector4, 0x1111>     yyy;
+		Swizzle4<Vector4, 0x2111>     zyy;
+		Swizzle4<Vector4, 0x3111>     wyy;
+		Swizzle4<Vector4, 0x0211>     xzy;
+		Swizzle4<Vector4, 0x1211>     yzy;
+		Swizzle4<Vector4, 0x2211>     zzy;
+		Swizzle4<Vector4, 0x3211>     wzy;
+		Swizzle4<Vector4, 0x0311>     xwy;
+		Swizzle4<Vector4, 0x1311>     ywy;
+		Swizzle4<Vector4, 0x2311>     zwy;
+		Swizzle4<Vector4, 0x3311>     wwy;
+		Swizzle4<Vector4, 0x0022>     xxz;
+		Swizzle4<Vector4, 0x1022>     yxz;
+		Swizzle4<Vector4, 0x2022>     zxz;
+		Swizzle4<Vector4, 0x3022>     wxz;
+		SwizzleMask4<Vector4, 0x0122> xyz;
+		Swizzle4<Vector4, 0x1122>     yyz;
+		Swizzle4<Vector4, 0x2122>     zyz;
+		Swizzle4<Vector4, 0x3122>     wyz;
+		Swizzle4<Vector4, 0x0222>     xzz;
+		Swizzle4<Vector4, 0x1222>     yzz;
+		Swizzle4<Vector4, 0x2222>     zzz;
+		Swizzle4<Vector4, 0x3222>     wzz;
+		Swizzle4<Vector4, 0x0322>     xwz;
+		Swizzle4<Vector4, 0x1322>     ywz;
+		Swizzle4<Vector4, 0x2322>     zwz;
+		Swizzle4<Vector4, 0x3322>     wwz;
+		Swizzle4<Vector4, 0x0033>     xxw;
+		Swizzle4<Vector4, 0x1033>     yxw;
+		Swizzle4<Vector4, 0x2033>     zxw;
+		Swizzle4<Vector4, 0x3033>     wxw;
+		SwizzleMask4<Vector4, 0x0133> xyw;
+		Swizzle4<Vector4, 0x1133>     yyw;
+		Swizzle4<Vector4, 0x2133>     zyw;
+		Swizzle4<Vector4, 0x3133>     wyw;
+		SwizzleMask4<Vector4, 0x0233> xzw;
+		SwizzleMask4<Vector4, 0x1233> yzw;
+		Swizzle4<Vector4, 0x2233>     zzw;
+		Swizzle4<Vector4, 0x3233>     wzw;
+		Swizzle4<Vector4, 0x0333>     xww;
+		Swizzle4<Vector4, 0x1333>     yww;
+		Swizzle4<Vector4, 0x2333>     zww;
+		Swizzle4<Vector4, 0x3333>     www;
+		Swizzle4<Vector4, 0x0000>     xxxx;
+		Swizzle4<Vector4, 0x1000>     yxxx;
+		Swizzle4<Vector4, 0x2000>     zxxx;
+		Swizzle4<Vector4, 0x3000>     wxxx;
+		Swizzle4<Vector4, 0x0100>     xyxx;
+		Swizzle4<Vector4, 0x1100>     yyxx;
+		Swizzle4<Vector4, 0x2100>     zyxx;
+		Swizzle4<Vector4, 0x3100>     wyxx;
+		Swizzle4<Vector4, 0x0200>     xzxx;
+		Swizzle4<Vector4, 0x1200>     yzxx;
+		Swizzle4<Vector4, 0x2200>     zzxx;
+		Swizzle4<Vector4, 0x3200>     wzxx;
+		Swizzle4<Vector4, 0x0300>     xwxx;
+		Swizzle4<Vector4, 0x1300>     ywxx;
+		Swizzle4<Vector4, 0x2300>     zwxx;
+		Swizzle4<Vector4, 0x3300>     wwxx;
+		Swizzle4<Vector4, 0x0010>     xxyx;
+		Swizzle4<Vector4, 0x1010>     yxyx;
+		Swizzle4<Vector4, 0x2010>     zxyx;
+		Swizzle4<Vector4, 0x3010>     wxyx;
+		Swizzle4<Vector4, 0x0110>     xyyx;
+		Swizzle4<Vector4, 0x1110>     yyyx;
+		Swizzle4<Vector4, 0x2110>     zyyx;
+		Swizzle4<Vector4, 0x3110>     wyyx;
+		Swizzle4<Vector4, 0x0210>     xzyx;
+		Swizzle4<Vector4, 0x1210>     yzyx;
+		Swizzle4<Vector4, 0x2210>     zzyx;
+		Swizzle4<Vector4, 0x3210>     wzyx;
+		Swizzle4<Vector4, 0x0310>     xwyx;
+		Swizzle4<Vector4, 0x1310>     ywyx;
+		Swizzle4<Vector4, 0x2310>     zwyx;
+		Swizzle4<Vector4, 0x3310>     wwyx;
+		Swizzle4<Vector4, 0x0020>     xxzx;
+		Swizzle4<Vector4, 0x1020>     yxzx;
+		Swizzle4<Vector4, 0x2020>     zxzx;
+		Swizzle4<Vector4, 0x3020>     wxzx;
+		Swizzle4<Vector4, 0x0120>     xyzx;
+		Swizzle4<Vector4, 0x1120>     yyzx;
+		Swizzle4<Vector4, 0x2120>     zyzx;
+		Swizzle4<Vector4, 0x3120>     wyzx;
+		Swizzle4<Vector4, 0x0220>     xzzx;
+		Swizzle4<Vector4, 0x1220>     yzzx;
+		Swizzle4<Vector4, 0x2220>     zzzx;
+		Swizzle4<Vector4, 0x3220>     wzzx;
+		Swizzle4<Vector4, 0x0320>     xwzx;
+		Swizzle4<Vector4, 0x1320>     ywzx;
+		Swizzle4<Vector4, 0x2320>     zwzx;
+		Swizzle4<Vector4, 0x3320>     wwzx;
+		Swizzle4<Vector4, 0x0030>     xxwx;
+		Swizzle4<Vector4, 0x1030>     yxwx;
+		Swizzle4<Vector4, 0x2030>     zxwx;
+		Swizzle4<Vector4, 0x3030>     wxwx;
+		Swizzle4<Vector4, 0x0130>     xywx;
+		Swizzle4<Vector4, 0x1130>     yywx;
+		Swizzle4<Vector4, 0x2130>     zywx;
+		Swizzle4<Vector4, 0x3130>     wywx;
+		Swizzle4<Vector4, 0x0230>     xzwx;
+		Swizzle4<Vector4, 0x1230>     yzwx;
+		Swizzle4<Vector4, 0x2230>     zzwx;
+		Swizzle4<Vector4, 0x3230>     wzwx;
+		Swizzle4<Vector4, 0x0330>     xwwx;
+		Swizzle4<Vector4, 0x1330>     ywwx;
+		Swizzle4<Vector4, 0x2330>     zwwx;
+		Swizzle4<Vector4, 0x3330>     wwwx;
+		Swizzle4<Vector4, 0x0001>     xxxy;
+		Swizzle4<Vector4, 0x1001>     yxxy;
+		Swizzle4<Vector4, 0x2001>     zxxy;
+		Swizzle4<Vector4, 0x3001>     wxxy;
+		Swizzle4<Vector4, 0x0101>     xyxy;
+		Swizzle4<Vector4, 0x1101>     yyxy;
+		Swizzle4<Vector4, 0x2101>     zyxy;
+		Swizzle4<Vector4, 0x3101>     wyxy;
+		Swizzle4<Vector4, 0x0201>     xzxy;
+		Swizzle4<Vector4, 0x1201>     yzxy;
+		Swizzle4<Vector4, 0x2201>     zzxy;
+		Swizzle4<Vector4, 0x3201>     wzxy;
+		Swizzle4<Vector4, 0x0301>     xwxy;
+		Swizzle4<Vector4, 0x1301>     ywxy;
+		Swizzle4<Vector4, 0x2301>     zwxy;
+		Swizzle4<Vector4, 0x3301>     wwxy;
+		Swizzle4<Vector4, 0x0011>     xxyy;
+		Swizzle4<Vector4, 0x1011>     yxyy;
+		Swizzle4<Vector4, 0x2011>     zxyy;
+		Swizzle4<Vector4, 0x3011>     wxyy;
+		Swizzle4<Vector4, 0x0111>     xyyy;
+		Swizzle4<Vector4, 0x1111>     yyyy;
+		Swizzle4<Vector4, 0x2111>     zyyy;
+		Swizzle4<Vector4, 0x3111>     wyyy;
+		Swizzle4<Vector4, 0x0211>     xzyy;
+		Swizzle4<Vector4, 0x1211>     yzyy;
+		Swizzle4<Vector4, 0x2211>     zzyy;
+		Swizzle4<Vector4, 0x3211>     wzyy;
+		Swizzle4<Vector4, 0x0311>     xwyy;
+		Swizzle4<Vector4, 0x1311>     ywyy;
+		Swizzle4<Vector4, 0x2311>     zwyy;
+		Swizzle4<Vector4, 0x3311>     wwyy;
+		Swizzle4<Vector4, 0x0021>     xxzy;
+		Swizzle4<Vector4, 0x1021>     yxzy;
+		Swizzle4<Vector4, 0x2021>     zxzy;
+		Swizzle4<Vector4, 0x3021>     wxzy;
+		Swizzle4<Vector4, 0x0121>     xyzy;
+		Swizzle4<Vector4, 0x1121>     yyzy;
+		Swizzle4<Vector4, 0x2121>     zyzy;
+		Swizzle4<Vector4, 0x3121>     wyzy;
+		Swizzle4<Vector4, 0x0221>     xzzy;
+		Swizzle4<Vector4, 0x1221>     yzzy;
+		Swizzle4<Vector4, 0x2221>     zzzy;
+		Swizzle4<Vector4, 0x3221>     wzzy;
+		Swizzle4<Vector4, 0x0321>     xwzy;
+		Swizzle4<Vector4, 0x1321>     ywzy;
+		Swizzle4<Vector4, 0x2321>     zwzy;
+		Swizzle4<Vector4, 0x3321>     wwzy;
+		Swizzle4<Vector4, 0x0031>     xxwy;
+		Swizzle4<Vector4, 0x1031>     yxwy;
+		Swizzle4<Vector4, 0x2031>     zxwy;
+		Swizzle4<Vector4, 0x3031>     wxwy;
+		Swizzle4<Vector4, 0x0131>     xywy;
+		Swizzle4<Vector4, 0x1131>     yywy;
+		Swizzle4<Vector4, 0x2131>     zywy;
+		Swizzle4<Vector4, 0x3131>     wywy;
+		Swizzle4<Vector4, 0x0231>     xzwy;
+		Swizzle4<Vector4, 0x1231>     yzwy;
+		Swizzle4<Vector4, 0x2231>     zzwy;
+		Swizzle4<Vector4, 0x3231>     wzwy;
+		Swizzle4<Vector4, 0x0331>     xwwy;
+		Swizzle4<Vector4, 0x1331>     ywwy;
+		Swizzle4<Vector4, 0x2331>     zwwy;
+		Swizzle4<Vector4, 0x3331>     wwwy;
+		Swizzle4<Vector4, 0x0002>     xxxz;
+		Swizzle4<Vector4, 0x1002>     yxxz;
+		Swizzle4<Vector4, 0x2002>     zxxz;
+		Swizzle4<Vector4, 0x3002>     wxxz;
+		Swizzle4<Vector4, 0x0102>     xyxz;
+		Swizzle4<Vector4, 0x1102>     yyxz;
+		Swizzle4<Vector4, 0x2102>     zyxz;
+		Swizzle4<Vector4, 0x3102>     wyxz;
+		Swizzle4<Vector4, 0x0202>     xzxz;
+		Swizzle4<Vector4, 0x1202>     yzxz;
+		Swizzle4<Vector4, 0x2202>     zzxz;
+		Swizzle4<Vector4, 0x3202>     wzxz;
+		Swizzle4<Vector4, 0x0302>     xwxz;
+		Swizzle4<Vector4, 0x1302>     ywxz;
+		Swizzle4<Vector4, 0x2302>     zwxz;
+		Swizzle4<Vector4, 0x3302>     wwxz;
+		Swizzle4<Vector4, 0x0012>     xxyz;
+		Swizzle4<Vector4, 0x1012>     yxyz;
+		Swizzle4<Vector4, 0x2012>     zxyz;
+		Swizzle4<Vector4, 0x3012>     wxyz;
+		Swizzle4<Vector4, 0x0112>     xyyz;
+		Swizzle4<Vector4, 0x1112>     yyyz;
+		Swizzle4<Vector4, 0x2112>     zyyz;
+		Swizzle4<Vector4, 0x3112>     wyyz;
+		Swizzle4<Vector4, 0x0212>     xzyz;
+		Swizzle4<Vector4, 0x1212>     yzyz;
+		Swizzle4<Vector4, 0x2212>     zzyz;
+		Swizzle4<Vector4, 0x3212>     wzyz;
+		Swizzle4<Vector4, 0x0312>     xwyz;
+		Swizzle4<Vector4, 0x1312>     ywyz;
+		Swizzle4<Vector4, 0x2312>     zwyz;
+		Swizzle4<Vector4, 0x3312>     wwyz;
+		Swizzle4<Vector4, 0x0022>     xxzz;
+		Swizzle4<Vector4, 0x1022>     yxzz;
+		Swizzle4<Vector4, 0x2022>     zxzz;
+		Swizzle4<Vector4, 0x3022>     wxzz;
+		Swizzle4<Vector4, 0x0122>     xyzz;
+		Swizzle4<Vector4, 0x1122>     yyzz;
+		Swizzle4<Vector4, 0x2122>     zyzz;
+		Swizzle4<Vector4, 0x3122>     wyzz;
+		Swizzle4<Vector4, 0x0222>     xzzz;
+		Swizzle4<Vector4, 0x1222>     yzzz;
+		Swizzle4<Vector4, 0x2222>     zzzz;
+		Swizzle4<Vector4, 0x3222>     wzzz;
+		Swizzle4<Vector4, 0x0322>     xwzz;
+		Swizzle4<Vector4, 0x1322>     ywzz;
+		Swizzle4<Vector4, 0x2322>     zwzz;
+		Swizzle4<Vector4, 0x3322>     wwzz;
+		Swizzle4<Vector4, 0x0032>     xxwz;
+		Swizzle4<Vector4, 0x1032>     yxwz;
+		Swizzle4<Vector4, 0x2032>     zxwz;
+		Swizzle4<Vector4, 0x3032>     wxwz;
+		Swizzle4<Vector4, 0x0132>     xywz;
+		Swizzle4<Vector4, 0x1132>     yywz;
+		Swizzle4<Vector4, 0x2132>     zywz;
+		Swizzle4<Vector4, 0x3132>     wywz;
+		Swizzle4<Vector4, 0x0232>     xzwz;
+		Swizzle4<Vector4, 0x1232>     yzwz;
+		Swizzle4<Vector4, 0x2232>     zzwz;
+		Swizzle4<Vector4, 0x3232>     wzwz;
+		Swizzle4<Vector4, 0x0332>     xwwz;
+		Swizzle4<Vector4, 0x1332>     ywwz;
+		Swizzle4<Vector4, 0x2332>     zwwz;
+		Swizzle4<Vector4, 0x3332>     wwwz;
+		Swizzle4<Vector4, 0x0003>     xxxw;
+		Swizzle4<Vector4, 0x1003>     yxxw;
+		Swizzle4<Vector4, 0x2003>     zxxw;
+		Swizzle4<Vector4, 0x3003>     wxxw;
+		Swizzle4<Vector4, 0x0103>     xyxw;
+		Swizzle4<Vector4, 0x1103>     yyxw;
+		Swizzle4<Vector4, 0x2103>     zyxw;
+		Swizzle4<Vector4, 0x3103>     wyxw;
+		Swizzle4<Vector4, 0x0203>     xzxw;
+		Swizzle4<Vector4, 0x1203>     yzxw;
+		Swizzle4<Vector4, 0x2203>     zzxw;
+		Swizzle4<Vector4, 0x3203>     wzxw;
+		Swizzle4<Vector4, 0x0303>     xwxw;
+		Swizzle4<Vector4, 0x1303>     ywxw;
+		Swizzle4<Vector4, 0x2303>     zwxw;
+		Swizzle4<Vector4, 0x3303>     wwxw;
+		Swizzle4<Vector4, 0x0013>     xxyw;
+		Swizzle4<Vector4, 0x1013>     yxyw;
+		Swizzle4<Vector4, 0x2013>     zxyw;
+		Swizzle4<Vector4, 0x3013>     wxyw;
+		Swizzle4<Vector4, 0x0113>     xyyw;
+		Swizzle4<Vector4, 0x1113>     yyyw;
+		Swizzle4<Vector4, 0x2113>     zyyw;
+		Swizzle4<Vector4, 0x3113>     wyyw;
+		Swizzle4<Vector4, 0x0213>     xzyw;
+		Swizzle4<Vector4, 0x1213>     yzyw;
+		Swizzle4<Vector4, 0x2213>     zzyw;
+		Swizzle4<Vector4, 0x3213>     wzyw;
+		Swizzle4<Vector4, 0x0313>     xwyw;
+		Swizzle4<Vector4, 0x1313>     ywyw;
+		Swizzle4<Vector4, 0x2313>     zwyw;
+		Swizzle4<Vector4, 0x3313>     wwyw;
+		Swizzle4<Vector4, 0x0023>     xxzw;
+		Swizzle4<Vector4, 0x1023>     yxzw;
+		Swizzle4<Vector4, 0x2023>     zxzw;
+		Swizzle4<Vector4, 0x3023>     wxzw;
+		SwizzleMask4<Vector4, 0x0123> xyzw;
+		Swizzle4<Vector4, 0x1123>     yyzw;
+		Swizzle4<Vector4, 0x2123>     zyzw;
+		Swizzle4<Vector4, 0x3123>     wyzw;
+		Swizzle4<Vector4, 0x0223>     xzzw;
+		Swizzle4<Vector4, 0x1223>     yzzw;
+		Swizzle4<Vector4, 0x2223>     zzzw;
+		Swizzle4<Vector4, 0x3223>     wzzw;
+		Swizzle4<Vector4, 0x0323>     xwzw;
+		Swizzle4<Vector4, 0x1323>     ywzw;
+		Swizzle4<Vector4, 0x2323>     zwzw;
+		Swizzle4<Vector4, 0x3323>     wwzw;
+		Swizzle4<Vector4, 0x0033>     xxww;
+		Swizzle4<Vector4, 0x1033>     yxww;
+		Swizzle4<Vector4, 0x2033>     zxww;
+		Swizzle4<Vector4, 0x3033>     wxww;
+		Swizzle4<Vector4, 0x0133>     xyww;
+		Swizzle4<Vector4, 0x1133>     yyww;
+		Swizzle4<Vector4, 0x2133>     zyww;
+		Swizzle4<Vector4, 0x3133>     wyww;
+		Swizzle4<Vector4, 0x0233>     xzww;
+		Swizzle4<Vector4, 0x1233>     yzww;
+		Swizzle4<Vector4, 0x2233>     zzww;
+		Swizzle4<Vector4, 0x3233>     wzww;
+		Swizzle4<Vector4, 0x0333>     xwww;
+		Swizzle4<Vector4, 0x1333>     ywww;
+		Swizzle4<Vector4, 0x2333>     zwww;
+		Swizzle4<Vector4, 0x3333>     wwww;
 	};
+};
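
The hex template argument appears to pack the component selection one nibble per destination lane, reading the digits left to right (0 = x, 1 = y, 2 = z, 3 = w), with shorter swizzles padded by repeating the last index (so xy is 0x0111 and xyz is 0x0122). The SwizzleMask members additionally provide operator= and are therefore writable, while plain Swizzle2/Swizzle4 members are read-only views. A usage sketch under those assumptions, using the Float4 type and Swizzle free function declared further down:

	Float4 v(1.0f, 2.0f, 3.0f, 4.0f);
	Float4 r = v.wzyx;              // Swizzle4<Float4, 0x3210>: lanes reversed
	Float4 s = v.xxxx;              // Swizzle4<Float4, 0x0000>: broadcast lane 0
	v.xyz = Float4(0.0f);           // SwizzleMask4<Float4, 0x0122>: writable
	Float4 t = Swizzle(v, 0x3210);  // free-function form of the same selection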
 
-	template<class Vector4, int T>
-	class SwizzleMask4
-	{
-		friend XYZW<Vector4>;
+class Int4 : public LValue<Int4>, public XYZW<Int4>
+{
+public:
+	explicit Int4(RValue<Byte4> cast);
+	explicit Int4(RValue<SByte4> cast);
+	explicit Int4(RValue<Float4> cast);
+	explicit Int4(RValue<Short4> cast);
+	explicit Int4(RValue<UShort4> cast);
 
-	public:
-		operator RValue<Vector4>() const;
+	Int4();
+	Int4(int xyzw);
+	Int4(int x, int yzw);
+	Int4(int x, int y, int zw);
+	Int4(int x, int y, int z, int w);
+	Int4(RValue<Int4> rhs);
+	Int4(const Int4 &rhs);
+	Int4(const Reference<Int4> &rhs);
+	Int4(RValue<UInt4> rhs);
+	Int4(const UInt4 &rhs);
+	Int4(const Reference<UInt4> &rhs);
+	Int4(RValue<Int2> lo, RValue<Int2> hi);
+	Int4(RValue<Int> rhs);
+	Int4(const Int &rhs);
+	Int4(const Reference<Int> &rhs);
 
-		RValue<Vector4> operator=(RValue<Vector4> rhs);
-		RValue<Vector4> operator=(RValue<typename Scalar<Vector4>::Type> rhs);
+	RValue<Int4> operator=(RValue<Int4> rhs);
+	RValue<Int4> operator=(const Int4 &rhs);
+	RValue<Int4> operator=(const Reference<Int4> &rhs);
 
-	private:
-		Vector4 *parent;
-	};
+	static Type *getType();
 
-	template<>
-	struct Scalar<Float4>
-	{
-		using Type = Float;
-	};
+private:
+	void constant(int x, int y, int z, int w);
+};
 
-	template<>
-	struct Scalar<Int4>
-	{
-		using Type = Int;
-	};
-
-	template<>
-	struct Scalar<UInt4>
-	{
-		using Type = UInt;
-	};
-
-	template<class Vector4, int T>
-	class SwizzleMask1
-	{
-	public:
-		operator RValue<typename Scalar<Vector4>::Type>() const;
-		operator RValue<Vector4>() const;
-
-		RValue<Vector4> operator=(float x);
-		RValue<Vector4> operator=(RValue<Vector4> rhs);
-		RValue<Vector4> operator=(RValue<typename Scalar<Vector4>::Type> rhs);
-
-	private:
-		Vector4 *parent;
-	};
-
-	template<class Vector4, int T>
-	class SwizzleMask2
-	{
-		friend class Float4;
-
-	public:
-		operator RValue<Vector4>() const;
-
-		RValue<Vector4> operator=(RValue<Vector4> rhs);
-
-	private:
-		Float4 *parent;
-	};
-
-	template<class Vector4>
-	struct XYZW
-	{
-		friend Vector4;
-
-	private:
-		XYZW(Vector4 *parent)
-		{
-			xyzw.parent = parent;
-		}
-
-	public:
-		union
-		{
-			SwizzleMask1<Vector4, 0x0000> x;
-			SwizzleMask1<Vector4, 0x1111> y;
-			SwizzleMask1<Vector4, 0x2222> z;
-			SwizzleMask1<Vector4, 0x3333> w;
-			Swizzle2<Vector4, 0x0000>     xx;
-			Swizzle2<Vector4, 0x1000>     yx;
-			Swizzle2<Vector4, 0x2000>     zx;
-			Swizzle2<Vector4, 0x3000>     wx;
-			SwizzleMask2<Vector4, 0x0111> xy;
-			Swizzle2<Vector4, 0x1111>     yy;
-			Swizzle2<Vector4, 0x2111>     zy;
-			Swizzle2<Vector4, 0x3111>     wy;
-			SwizzleMask2<Vector4, 0x0222> xz;
-			SwizzleMask2<Vector4, 0x1222> yz;
-			Swizzle2<Vector4, 0x2222>     zz;
-			Swizzle2<Vector4, 0x3222>     wz;
-			SwizzleMask2<Vector4, 0x0333> xw;
-			SwizzleMask2<Vector4, 0x1333> yw;
-			SwizzleMask2<Vector4, 0x2333> zw;
-			Swizzle2<Vector4, 0x3333>     ww;
-			Swizzle4<Vector4, 0x0000>     xxx;
-			Swizzle4<Vector4, 0x1000>     yxx;
-			Swizzle4<Vector4, 0x2000>     zxx;
-			Swizzle4<Vector4, 0x3000>     wxx;
-			Swizzle4<Vector4, 0x0100>     xyx;
-			Swizzle4<Vector4, 0x1100>     yyx;
-			Swizzle4<Vector4, 0x2100>     zyx;
-			Swizzle4<Vector4, 0x3100>     wyx;
-			Swizzle4<Vector4, 0x0200>     xzx;
-			Swizzle4<Vector4, 0x1200>     yzx;
-			Swizzle4<Vector4, 0x2200>     zzx;
-			Swizzle4<Vector4, 0x3200>     wzx;
-			Swizzle4<Vector4, 0x0300>     xwx;
-			Swizzle4<Vector4, 0x1300>     ywx;
-			Swizzle4<Vector4, 0x2300>     zwx;
-			Swizzle4<Vector4, 0x3300>     wwx;
-			Swizzle4<Vector4, 0x0011>     xxy;
-			Swizzle4<Vector4, 0x1011>     yxy;
-			Swizzle4<Vector4, 0x2011>     zxy;
-			Swizzle4<Vector4, 0x3011>     wxy;
-			Swizzle4<Vector4, 0x0111>     xyy;
-			Swizzle4<Vector4, 0x1111>     yyy;
-			Swizzle4<Vector4, 0x2111>     zyy;
-			Swizzle4<Vector4, 0x3111>     wyy;
-			Swizzle4<Vector4, 0x0211>     xzy;
-			Swizzle4<Vector4, 0x1211>     yzy;
-			Swizzle4<Vector4, 0x2211>     zzy;
-			Swizzle4<Vector4, 0x3211>     wzy;
-			Swizzle4<Vector4, 0x0311>     xwy;
-			Swizzle4<Vector4, 0x1311>     ywy;
-			Swizzle4<Vector4, 0x2311>     zwy;
-			Swizzle4<Vector4, 0x3311>     wwy;
-			Swizzle4<Vector4, 0x0022>     xxz;
-			Swizzle4<Vector4, 0x1022>     yxz;
-			Swizzle4<Vector4, 0x2022>     zxz;
-			Swizzle4<Vector4, 0x3022>     wxz;
-			SwizzleMask4<Vector4, 0x0122> xyz;
-			Swizzle4<Vector4, 0x1122>     yyz;
-			Swizzle4<Vector4, 0x2122>     zyz;
-			Swizzle4<Vector4, 0x3122>     wyz;
-			Swizzle4<Vector4, 0x0222>     xzz;
-			Swizzle4<Vector4, 0x1222>     yzz;
-			Swizzle4<Vector4, 0x2222>     zzz;
-			Swizzle4<Vector4, 0x3222>     wzz;
-			Swizzle4<Vector4, 0x0322>     xwz;
-			Swizzle4<Vector4, 0x1322>     ywz;
-			Swizzle4<Vector4, 0x2322>     zwz;
-			Swizzle4<Vector4, 0x3322>     wwz;
-			Swizzle4<Vector4, 0x0033>     xxw;
-			Swizzle4<Vector4, 0x1033>     yxw;
-			Swizzle4<Vector4, 0x2033>     zxw;
-			Swizzle4<Vector4, 0x3033>     wxw;
-			SwizzleMask4<Vector4, 0x0133> xyw;
-			Swizzle4<Vector4, 0x1133>     yyw;
-			Swizzle4<Vector4, 0x2133>     zyw;
-			Swizzle4<Vector4, 0x3133>     wyw;
-			SwizzleMask4<Vector4, 0x0233> xzw;
-			SwizzleMask4<Vector4, 0x1233> yzw;
-			Swizzle4<Vector4, 0x2233>     zzw;
-			Swizzle4<Vector4, 0x3233>     wzw;
-			Swizzle4<Vector4, 0x0333>     xww;
-			Swizzle4<Vector4, 0x1333>     yww;
-			Swizzle4<Vector4, 0x2333>     zww;
-			Swizzle4<Vector4, 0x3333>     www;
-			Swizzle4<Vector4, 0x0000>     xxxx;
-			Swizzle4<Vector4, 0x1000>     yxxx;
-			Swizzle4<Vector4, 0x2000>     zxxx;
-			Swizzle4<Vector4, 0x3000>     wxxx;
-			Swizzle4<Vector4, 0x0100>     xyxx;
-			Swizzle4<Vector4, 0x1100>     yyxx;
-			Swizzle4<Vector4, 0x2100>     zyxx;
-			Swizzle4<Vector4, 0x3100>     wyxx;
-			Swizzle4<Vector4, 0x0200>     xzxx;
-			Swizzle4<Vector4, 0x1200>     yzxx;
-			Swizzle4<Vector4, 0x2200>     zzxx;
-			Swizzle4<Vector4, 0x3200>     wzxx;
-			Swizzle4<Vector4, 0x0300>     xwxx;
-			Swizzle4<Vector4, 0x1300>     ywxx;
-			Swizzle4<Vector4, 0x2300>     zwxx;
-			Swizzle4<Vector4, 0x3300>     wwxx;
-			Swizzle4<Vector4, 0x0010>     xxyx;
-			Swizzle4<Vector4, 0x1010>     yxyx;
-			Swizzle4<Vector4, 0x2010>     zxyx;
-			Swizzle4<Vector4, 0x3010>     wxyx;
-			Swizzle4<Vector4, 0x0110>     xyyx;
-			Swizzle4<Vector4, 0x1110>     yyyx;
-			Swizzle4<Vector4, 0x2110>     zyyx;
-			Swizzle4<Vector4, 0x3110>     wyyx;
-			Swizzle4<Vector4, 0x0210>     xzyx;
-			Swizzle4<Vector4, 0x1210>     yzyx;
-			Swizzle4<Vector4, 0x2210>     zzyx;
-			Swizzle4<Vector4, 0x3210>     wzyx;
-			Swizzle4<Vector4, 0x0310>     xwyx;
-			Swizzle4<Vector4, 0x1310>     ywyx;
-			Swizzle4<Vector4, 0x2310>     zwyx;
-			Swizzle4<Vector4, 0x3310>     wwyx;
-			Swizzle4<Vector4, 0x0020>     xxzx;
-			Swizzle4<Vector4, 0x1020>     yxzx;
-			Swizzle4<Vector4, 0x2020>     zxzx;
-			Swizzle4<Vector4, 0x3020>     wxzx;
-			Swizzle4<Vector4, 0x0120>     xyzx;
-			Swizzle4<Vector4, 0x1120>     yyzx;
-			Swizzle4<Vector4, 0x2120>     zyzx;
-			Swizzle4<Vector4, 0x3120>     wyzx;
-			Swizzle4<Vector4, 0x0220>     xzzx;
-			Swizzle4<Vector4, 0x1220>     yzzx;
-			Swizzle4<Vector4, 0x2220>     zzzx;
-			Swizzle4<Vector4, 0x3220>     wzzx;
-			Swizzle4<Vector4, 0x0320>     xwzx;
-			Swizzle4<Vector4, 0x1320>     ywzx;
-			Swizzle4<Vector4, 0x2320>     zwzx;
-			Swizzle4<Vector4, 0x3320>     wwzx;
-			Swizzle4<Vector4, 0x0030>     xxwx;
-			Swizzle4<Vector4, 0x1030>     yxwx;
-			Swizzle4<Vector4, 0x2030>     zxwx;
-			Swizzle4<Vector4, 0x3030>     wxwx;
-			Swizzle4<Vector4, 0x0130>     xywx;
-			Swizzle4<Vector4, 0x1130>     yywx;
-			Swizzle4<Vector4, 0x2130>     zywx;
-			Swizzle4<Vector4, 0x3130>     wywx;
-			Swizzle4<Vector4, 0x0230>     xzwx;
-			Swizzle4<Vector4, 0x1230>     yzwx;
-			Swizzle4<Vector4, 0x2230>     zzwx;
-			Swizzle4<Vector4, 0x3230>     wzwx;
-			Swizzle4<Vector4, 0x0330>     xwwx;
-			Swizzle4<Vector4, 0x1330>     ywwx;
-			Swizzle4<Vector4, 0x2330>     zwwx;
-			Swizzle4<Vector4, 0x3330>     wwwx;
-			Swizzle4<Vector4, 0x0001>     xxxy;
-			Swizzle4<Vector4, 0x1001>     yxxy;
-			Swizzle4<Vector4, 0x2001>     zxxy;
-			Swizzle4<Vector4, 0x3001>     wxxy;
-			Swizzle4<Vector4, 0x0101>     xyxy;
-			Swizzle4<Vector4, 0x1101>     yyxy;
-			Swizzle4<Vector4, 0x2101>     zyxy;
-			Swizzle4<Vector4, 0x3101>     wyxy;
-			Swizzle4<Vector4, 0x0201>     xzxy;
-			Swizzle4<Vector4, 0x1201>     yzxy;
-			Swizzle4<Vector4, 0x2201>     zzxy;
-			Swizzle4<Vector4, 0x3201>     wzxy;
-			Swizzle4<Vector4, 0x0301>     xwxy;
-			Swizzle4<Vector4, 0x1301>     ywxy;
-			Swizzle4<Vector4, 0x2301>     zwxy;
-			Swizzle4<Vector4, 0x3301>     wwxy;
-			Swizzle4<Vector4, 0x0011>     xxyy;
-			Swizzle4<Vector4, 0x1011>     yxyy;
-			Swizzle4<Vector4, 0x2011>     zxyy;
-			Swizzle4<Vector4, 0x3011>     wxyy;
-			Swizzle4<Vector4, 0x0111>     xyyy;
-			Swizzle4<Vector4, 0x1111>     yyyy;
-			Swizzle4<Vector4, 0x2111>     zyyy;
-			Swizzle4<Vector4, 0x3111>     wyyy;
-			Swizzle4<Vector4, 0x0211>     xzyy;
-			Swizzle4<Vector4, 0x1211>     yzyy;
-			Swizzle4<Vector4, 0x2211>     zzyy;
-			Swizzle4<Vector4, 0x3211>     wzyy;
-			Swizzle4<Vector4, 0x0311>     xwyy;
-			Swizzle4<Vector4, 0x1311>     ywyy;
-			Swizzle4<Vector4, 0x2311>     zwyy;
-			Swizzle4<Vector4, 0x3311>     wwyy;
-			Swizzle4<Vector4, 0x0021>     xxzy;
-			Swizzle4<Vector4, 0x1021>     yxzy;
-			Swizzle4<Vector4, 0x2021>     zxzy;
-			Swizzle4<Vector4, 0x3021>     wxzy;
-			Swizzle4<Vector4, 0x0121>     xyzy;
-			Swizzle4<Vector4, 0x1121>     yyzy;
-			Swizzle4<Vector4, 0x2121>     zyzy;
-			Swizzle4<Vector4, 0x3121>     wyzy;
-			Swizzle4<Vector4, 0x0221>     xzzy;
-			Swizzle4<Vector4, 0x1221>     yzzy;
-			Swizzle4<Vector4, 0x2221>     zzzy;
-			Swizzle4<Vector4, 0x3221>     wzzy;
-			Swizzle4<Vector4, 0x0321>     xwzy;
-			Swizzle4<Vector4, 0x1321>     ywzy;
-			Swizzle4<Vector4, 0x2321>     zwzy;
-			Swizzle4<Vector4, 0x3321>     wwzy;
-			Swizzle4<Vector4, 0x0031>     xxwy;
-			Swizzle4<Vector4, 0x1031>     yxwy;
-			Swizzle4<Vector4, 0x2031>     zxwy;
-			Swizzle4<Vector4, 0x3031>     wxwy;
-			Swizzle4<Vector4, 0x0131>     xywy;
-			Swizzle4<Vector4, 0x1131>     yywy;
-			Swizzle4<Vector4, 0x2131>     zywy;
-			Swizzle4<Vector4, 0x3131>     wywy;
-			Swizzle4<Vector4, 0x0231>     xzwy;
-			Swizzle4<Vector4, 0x1231>     yzwy;
-			Swizzle4<Vector4, 0x2231>     zzwy;
-			Swizzle4<Vector4, 0x3231>     wzwy;
-			Swizzle4<Vector4, 0x0331>     xwwy;
-			Swizzle4<Vector4, 0x1331>     ywwy;
-			Swizzle4<Vector4, 0x2331>     zwwy;
-			Swizzle4<Vector4, 0x3331>     wwwy;
-			Swizzle4<Vector4, 0x0002>     xxxz;
-			Swizzle4<Vector4, 0x1002>     yxxz;
-			Swizzle4<Vector4, 0x2002>     zxxz;
-			Swizzle4<Vector4, 0x3002>     wxxz;
-			Swizzle4<Vector4, 0x0102>     xyxz;
-			Swizzle4<Vector4, 0x1102>     yyxz;
-			Swizzle4<Vector4, 0x2102>     zyxz;
-			Swizzle4<Vector4, 0x3102>     wyxz;
-			Swizzle4<Vector4, 0x0202>     xzxz;
-			Swizzle4<Vector4, 0x1202>     yzxz;
-			Swizzle4<Vector4, 0x2202>     zzxz;
-			Swizzle4<Vector4, 0x3202>     wzxz;
-			Swizzle4<Vector4, 0x0302>     xwxz;
-			Swizzle4<Vector4, 0x1302>     ywxz;
-			Swizzle4<Vector4, 0x2302>     zwxz;
-			Swizzle4<Vector4, 0x3302>     wwxz;
-			Swizzle4<Vector4, 0x0012>     xxyz;
-			Swizzle4<Vector4, 0x1012>     yxyz;
-			Swizzle4<Vector4, 0x2012>     zxyz;
-			Swizzle4<Vector4, 0x3012>     wxyz;
-			Swizzle4<Vector4, 0x0112>     xyyz;
-			Swizzle4<Vector4, 0x1112>     yyyz;
-			Swizzle4<Vector4, 0x2112>     zyyz;
-			Swizzle4<Vector4, 0x3112>     wyyz;
-			Swizzle4<Vector4, 0x0212>     xzyz;
-			Swizzle4<Vector4, 0x1212>     yzyz;
-			Swizzle4<Vector4, 0x2212>     zzyz;
-			Swizzle4<Vector4, 0x3212>     wzyz;
-			Swizzle4<Vector4, 0x0312>     xwyz;
-			Swizzle4<Vector4, 0x1312>     ywyz;
-			Swizzle4<Vector4, 0x2312>     zwyz;
-			Swizzle4<Vector4, 0x3312>     wwyz;
-			Swizzle4<Vector4, 0x0022>     xxzz;
-			Swizzle4<Vector4, 0x1022>     yxzz;
-			Swizzle4<Vector4, 0x2022>     zxzz;
-			Swizzle4<Vector4, 0x3022>     wxzz;
-			Swizzle4<Vector4, 0x0122>     xyzz;
-			Swizzle4<Vector4, 0x1122>     yyzz;
-			Swizzle4<Vector4, 0x2122>     zyzz;
-			Swizzle4<Vector4, 0x3122>     wyzz;
-			Swizzle4<Vector4, 0x0222>     xzzz;
-			Swizzle4<Vector4, 0x1222>     yzzz;
-			Swizzle4<Vector4, 0x2222>     zzzz;
-			Swizzle4<Vector4, 0x3222>     wzzz;
-			Swizzle4<Vector4, 0x0322>     xwzz;
-			Swizzle4<Vector4, 0x1322>     ywzz;
-			Swizzle4<Vector4, 0x2322>     zwzz;
-			Swizzle4<Vector4, 0x3322>     wwzz;
-			Swizzle4<Vector4, 0x0032>     xxwz;
-			Swizzle4<Vector4, 0x1032>     yxwz;
-			Swizzle4<Vector4, 0x2032>     zxwz;
-			Swizzle4<Vector4, 0x3032>     wxwz;
-			Swizzle4<Vector4, 0x0132>     xywz;
-			Swizzle4<Vector4, 0x1132>     yywz;
-			Swizzle4<Vector4, 0x2132>     zywz;
-			Swizzle4<Vector4, 0x3132>     wywz;
-			Swizzle4<Vector4, 0x0232>     xzwz;
-			Swizzle4<Vector4, 0x1232>     yzwz;
-			Swizzle4<Vector4, 0x2232>     zzwz;
-			Swizzle4<Vector4, 0x3232>     wzwz;
-			Swizzle4<Vector4, 0x0332>     xwwz;
-			Swizzle4<Vector4, 0x1332>     ywwz;
-			Swizzle4<Vector4, 0x2332>     zwwz;
-			Swizzle4<Vector4, 0x3332>     wwwz;
-			Swizzle4<Vector4, 0x0003>     xxxw;
-			Swizzle4<Vector4, 0x1003>     yxxw;
-			Swizzle4<Vector4, 0x2003>     zxxw;
-			Swizzle4<Vector4, 0x3003>     wxxw;
-			Swizzle4<Vector4, 0x0103>     xyxw;
-			Swizzle4<Vector4, 0x1103>     yyxw;
-			Swizzle4<Vector4, 0x2103>     zyxw;
-			Swizzle4<Vector4, 0x3103>     wyxw;
-			Swizzle4<Vector4, 0x0203>     xzxw;
-			Swizzle4<Vector4, 0x1203>     yzxw;
-			Swizzle4<Vector4, 0x2203>     zzxw;
-			Swizzle4<Vector4, 0x3203>     wzxw;
-			Swizzle4<Vector4, 0x0303>     xwxw;
-			Swizzle4<Vector4, 0x1303>     ywxw;
-			Swizzle4<Vector4, 0x2303>     zwxw;
-			Swizzle4<Vector4, 0x3303>     wwxw;
-			Swizzle4<Vector4, 0x0013>     xxyw;
-			Swizzle4<Vector4, 0x1013>     yxyw;
-			Swizzle4<Vector4, 0x2013>     zxyw;
-			Swizzle4<Vector4, 0x3013>     wxyw;
-			Swizzle4<Vector4, 0x0113>     xyyw;
-			Swizzle4<Vector4, 0x1113>     yyyw;
-			Swizzle4<Vector4, 0x2113>     zyyw;
-			Swizzle4<Vector4, 0x3113>     wyyw;
-			Swizzle4<Vector4, 0x0213>     xzyw;
-			Swizzle4<Vector4, 0x1213>     yzyw;
-			Swizzle4<Vector4, 0x2213>     zzyw;
-			Swizzle4<Vector4, 0x3213>     wzyw;
-			Swizzle4<Vector4, 0x0313>     xwyw;
-			Swizzle4<Vector4, 0x1313>     ywyw;
-			Swizzle4<Vector4, 0x2313>     zwyw;
-			Swizzle4<Vector4, 0x3313>     wwyw;
-			Swizzle4<Vector4, 0x0023>     xxzw;
-			Swizzle4<Vector4, 0x1023>     yxzw;
-			Swizzle4<Vector4, 0x2023>     zxzw;
-			Swizzle4<Vector4, 0x3023>     wxzw;
-			SwizzleMask4<Vector4, 0x0123> xyzw;
-			Swizzle4<Vector4, 0x1123>     yyzw;
-			Swizzle4<Vector4, 0x2123>     zyzw;
-			Swizzle4<Vector4, 0x3123>     wyzw;
-			Swizzle4<Vector4, 0x0223>     xzzw;
-			Swizzle4<Vector4, 0x1223>     yzzw;
-			Swizzle4<Vector4, 0x2223>     zzzw;
-			Swizzle4<Vector4, 0x3223>     wzzw;
-			Swizzle4<Vector4, 0x0323>     xwzw;
-			Swizzle4<Vector4, 0x1323>     ywzw;
-			Swizzle4<Vector4, 0x2323>     zwzw;
-			Swizzle4<Vector4, 0x3323>     wwzw;
-			Swizzle4<Vector4, 0x0033>     xxww;
-			Swizzle4<Vector4, 0x1033>     yxww;
-			Swizzle4<Vector4, 0x2033>     zxww;
-			Swizzle4<Vector4, 0x3033>     wxww;
-			Swizzle4<Vector4, 0x0133>     xyww;
-			Swizzle4<Vector4, 0x1133>     yyww;
-			Swizzle4<Vector4, 0x2133>     zyww;
-			Swizzle4<Vector4, 0x3133>     wyww;
-			Swizzle4<Vector4, 0x0233>     xzww;
-			Swizzle4<Vector4, 0x1233>     yzww;
-			Swizzle4<Vector4, 0x2233>     zzww;
-			Swizzle4<Vector4, 0x3233>     wzww;
-			Swizzle4<Vector4, 0x0333>     xwww;
-			Swizzle4<Vector4, 0x1333>     ywww;
-			Swizzle4<Vector4, 0x2333>     zwww;
-			Swizzle4<Vector4, 0x3333>     wwww;
-		};
-	};
-
-	class Int4 : public LValue<Int4>, public XYZW<Int4>
-	{
-	public:
-		explicit Int4(RValue<Byte4> cast);
-		explicit Int4(RValue<SByte4> cast);
-		explicit Int4(RValue<Float4> cast);
-		explicit Int4(RValue<Short4> cast);
-		explicit Int4(RValue<UShort4> cast);
-
-		Int4();
-		Int4(int xyzw);
-		Int4(int x, int yzw);
-		Int4(int x, int y, int zw);
-		Int4(int x, int y, int z, int w);
-		Int4(RValue<Int4> rhs);
-		Int4(const Int4 &rhs);
-		Int4(const Reference<Int4> &rhs);
-		Int4(RValue<UInt4> rhs);
-		Int4(const UInt4 &rhs);
-		Int4(const Reference<UInt4> &rhs);
-		Int4(RValue<Int2> lo, RValue<Int2> hi);
-		Int4(RValue<Int> rhs);
-		Int4(const Int &rhs);
-		Int4(const Reference<Int> &rhs);
-
-		RValue<Int4> operator=(RValue<Int4> rhs);
-		RValue<Int4> operator=(const Int4 &rhs);
-		RValue<Int4> operator=(const Reference<Int4> &rhs);
-
-		static Type *getType();
-
-	private:
-		void constant(int x, int y, int z, int w);
-	};
-
-	RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator-(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator*(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator/(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator%(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator&(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator|(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator^(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs);
-	RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs);
-	RValue<Int4> operator<<(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator>>(RValue<Int4> lhs, RValue<Int4> rhs);
-	RValue<Int4> operator+=(Int4 &lhs, RValue<Int4> rhs);
-	RValue<Int4> operator-=(Int4 &lhs, RValue<Int4> rhs);
-	RValue<Int4> operator*=(Int4 &lhs, RValue<Int4> rhs);
+RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator-(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator*(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator/(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator%(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator&(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator|(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator^(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs);
+RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs);
+RValue<Int4> operator<<(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator>>(RValue<Int4> lhs, RValue<Int4> rhs);
+RValue<Int4> operator+=(Int4 &lhs, RValue<Int4> rhs);
+RValue<Int4> operator-=(Int4 &lhs, RValue<Int4> rhs);
+RValue<Int4> operator*=(Int4 &lhs, RValue<Int4> rhs);
 //	RValue<Int4> operator/=(Int4 &lhs, RValue<Int4> rhs);
 //	RValue<Int4> operator%=(Int4 &lhs, RValue<Int4> rhs);
-	RValue<Int4> operator&=(Int4 &lhs, RValue<Int4> rhs);
-	RValue<Int4> operator|=(Int4 &lhs, RValue<Int4> rhs);
-	RValue<Int4> operator^=(Int4 &lhs, RValue<Int4> rhs);
-	RValue<Int4> operator<<=(Int4 &lhs, unsigned char rhs);
-	RValue<Int4> operator>>=(Int4 &lhs, unsigned char rhs);
-	RValue<Int4> operator+(RValue<Int4> val);
-	RValue<Int4> operator-(RValue<Int4> val);
-	RValue<Int4> operator~(RValue<Int4> val);
+RValue<Int4> operator&=(Int4 &lhs, RValue<Int4> rhs);
+RValue<Int4> operator|=(Int4 &lhs, RValue<Int4> rhs);
+RValue<Int4> operator^=(Int4 &lhs, RValue<Int4> rhs);
+RValue<Int4> operator<<=(Int4 &lhs, unsigned char rhs);
+RValue<Int4> operator>>=(Int4 &lhs, unsigned char rhs);
+RValue<Int4> operator+(RValue<Int4> val);
+RValue<Int4> operator-(RValue<Int4> val);
+RValue<Int4> operator~(RValue<Int4> val);
 //	RValue<Int4> operator++(Int4 &val, int);   // Post-increment
 //	const Int4 &operator++(Int4 &val);   // Pre-increment
 //	RValue<Int4> operator--(Int4 &val, int);   // Post-decrement
@@ -1919,92 +1919,92 @@
 //	RValue<Bool> operator!=(RValue<Int4> lhs, RValue<Int4> rhs);
 //	RValue<Bool> operator==(RValue<Int4> lhs, RValue<Int4> rhs);
 
-	inline RValue<Int4> operator+(RValue<Int> lhs, RValue<Int4> rhs)
-	{
-		return Int4(lhs) + rhs;
-	}
+inline RValue<Int4> operator+(RValue<Int> lhs, RValue<Int4> rhs)
+{
+	return Int4(lhs) + rhs;
+}
 
-	inline RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int> rhs)
-	{
-		return lhs + Int4(rhs);
-	}
+inline RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int> rhs)
+{
+	return lhs + Int4(rhs);
+}
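
These two overloads make mixed scalar/vector arithmetic read naturally: the Int4(RValue<Int>) constructor replicates the scalar across all four lanes before the vector add. For example:

	Int4 v(1, 2, 3, 4);
	Int s = 10;
	Int4 a = s + v;  // Int4(s) + v -> (11, 12, 13, 14)
	Int4 b = v + s;  // v + Int4(s) -> (11, 12, 13, 14)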
 
-	RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y);
-	RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y);
-	RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y);
-	RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y);
-	RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y);
-	RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y);
-	inline RValue<Int4> CmpGT(RValue<Int4> x, RValue<Int4> y) { return CmpNLE(x, y); }
-	inline RValue<Int4> CmpGE(RValue<Int4> x, RValue<Int4> y) { return CmpNLT(x, y); }
-	RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y);
-	RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y);
-	RValue<Int4> RoundInt(RValue<Float4> cast);
-	RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y);
-	RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y);
-	RValue<Int> Extract(RValue<Int4> val, int i);
-	RValue<Int4> Insert(RValue<Int4> val, RValue<Int> element, int i);
-	RValue<Int> SignMask(RValue<Int4> x);
-	RValue<Int4> Swizzle(RValue<Int4> x, uint16_t select);
-	RValue<Int4> Shuffle(RValue<Int4> x, RValue<Int4> y, uint16_t select);
-	RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y);
+inline RValue<Int4> CmpGT(RValue<Int4> x, RValue<Int4> y) { return CmpNLE(x, y); }
+inline RValue<Int4> CmpGE(RValue<Int4> x, RValue<Int4> y) { return CmpNLT(x, y); }
+RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> RoundInt(RValue<Float4> cast);
+RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y);
+RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y);
+RValue<Int> Extract(RValue<Int4> val, int i);
+RValue<Int4> Insert(RValue<Int4> val, RValue<Int> element, int i);
+RValue<Int> SignMask(RValue<Int4> x);
+RValue<Int4> Swizzle(RValue<Int4> x, uint16_t select);
+RValue<Int4> Shuffle(RValue<Int4> x, RValue<Int4> y, uint16_t select);
+RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y);
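
The comparison helpers follow the SSE convention of returning a per-lane mask (all ones for true, all zeros for false), which is why CmpGT and CmpGE can be exact aliases of CmpNLE and CmpNLT for integers. A sketch of mask-style selection under that assumption:

	Int4 x(1, 5, 3, 7);
	Int4 y(2, 5, 1, 9);
	Int4 gt = CmpGT(x, y);           // (0, 0, ~0, 0)
	Int4 m = (gt & x) | (~gt & y);   // lane-wise select: (2, 5, 3, 9), i.e. Max(x, y)
	Int bits = SignMask(gt);         // lane sign bits packed into the low bits -> 0b0100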
 
-	class UInt4 : public LValue<UInt4>, public XYZW<UInt4>
-	{
-	public:
-		explicit UInt4(RValue<Float4> cast);
+class UInt4 : public LValue<UInt4>, public XYZW<UInt4>
+{
+public:
+	explicit UInt4(RValue<Float4> cast);
 
-		UInt4();
-		UInt4(int xyzw);
-		UInt4(int x, int yzw);
-		UInt4(int x, int y, int zw);
-		UInt4(int x, int y, int z, int w);
-		UInt4(RValue<UInt4> rhs);
-		UInt4(const UInt4 &rhs);
-		UInt4(const Reference<UInt4> &rhs);
-		UInt4(RValue<Int4> rhs);
-		UInt4(const Int4 &rhs);
-		UInt4(const Reference<Int4> &rhs);
-		UInt4(RValue<UInt2> lo, RValue<UInt2> hi);
-		UInt4(RValue<UInt> rhs);
-		UInt4(const UInt &rhs);
-		UInt4(const Reference<UInt> &rhs);
+	UInt4();
+	UInt4(int xyzw);
+	UInt4(int x, int yzw);
+	UInt4(int x, int y, int zw);
+	UInt4(int x, int y, int z, int w);
+	UInt4(RValue<UInt4> rhs);
+	UInt4(const UInt4 &rhs);
+	UInt4(const Reference<UInt4> &rhs);
+	UInt4(RValue<Int4> rhs);
+	UInt4(const Int4 &rhs);
+	UInt4(const Reference<Int4> &rhs);
+	UInt4(RValue<UInt2> lo, RValue<UInt2> hi);
+	UInt4(RValue<UInt> rhs);
+	UInt4(const UInt &rhs);
+	UInt4(const Reference<UInt> &rhs);
 
-		RValue<UInt4> operator=(RValue<UInt4> rhs);
-		RValue<UInt4> operator=(const UInt4 &rhs);
-		RValue<UInt4> operator=(const Reference<UInt4> &rhs);
+	RValue<UInt4> operator=(RValue<UInt4> rhs);
+	RValue<UInt4> operator=(const UInt4 &rhs);
+	RValue<UInt4> operator=(const Reference<UInt4> &rhs);
 
-		static Type *getType();
+	static Type *getType();
 
-	private:
-		void constant(int x, int y, int z, int w);
-	};
+private:
+	void constant(int x, int y, int z, int w);
+};
 
-	RValue<UInt4> operator+(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator-(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator*(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator/(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator%(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator&(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator|(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator^(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs);
-	RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs);
-	RValue<UInt4> operator<<(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator>>(RValue<UInt4> lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator+=(UInt4 &lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator-=(UInt4 &lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator*=(UInt4 &lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator+(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator-(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator*(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator/(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator%(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator&(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator|(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator^(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs);
+RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs);
+RValue<UInt4> operator<<(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator>>(RValue<UInt4> lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator+=(UInt4 &lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator-=(UInt4 &lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator*=(UInt4 &lhs, RValue<UInt4> rhs);
 //	RValue<UInt4> operator/=(UInt4 &lhs, RValue<UInt4> rhs);
 //	RValue<UInt4> operator%=(UInt4 &lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator&=(UInt4 &lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator|=(UInt4 &lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator^=(UInt4 &lhs, RValue<UInt4> rhs);
-	RValue<UInt4> operator<<=(UInt4 &lhs, unsigned char rhs);
-	RValue<UInt4> operator>>=(UInt4 &lhs, unsigned char rhs);
-	RValue<UInt4> operator+(RValue<UInt4> val);
-	RValue<UInt4> operator-(RValue<UInt4> val);
-	RValue<UInt4> operator~(RValue<UInt4> val);
+RValue<UInt4> operator&=(UInt4 &lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator|=(UInt4 &lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator^=(UInt4 &lhs, RValue<UInt4> rhs);
+RValue<UInt4> operator<<=(UInt4 &lhs, unsigned char rhs);
+RValue<UInt4> operator>>=(UInt4 &lhs, unsigned char rhs);
+RValue<UInt4> operator+(RValue<UInt4> val);
+RValue<UInt4> operator-(RValue<UInt4> val);
+RValue<UInt4> operator~(RValue<UInt4> val);
 //	RValue<UInt4> operator++(UInt4 &val, int);   // Post-increment
 //	const UInt4 &operator++(UInt4 &val);   // Pre-increment
 //	RValue<UInt4> operator--(UInt4 &val, int);   // Post-decrement
@@ -2016,93 +2016,93 @@
 //	RValue<Bool> operator!=(RValue<UInt4> lhs, RValue<UInt4> rhs);
 //	RValue<Bool> operator==(RValue<UInt4> lhs, RValue<UInt4> rhs);
 
-	RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y);
-	RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y);
-	RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y);
-	RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y);
-	RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y);
-	RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y);
-	inline RValue<UInt4> CmpGT(RValue<UInt4> x, RValue<UInt4> y) { return CmpNLE(x, y); }
-	inline RValue<UInt4> CmpGE(RValue<UInt4> x, RValue<UInt4> y) { return CmpNLT(x, y); }
-	RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y);
-	RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y);
-	RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y);
-	RValue<UInt> Extract(RValue<UInt4> val, int i);
-	RValue<UInt4> Insert(RValue<UInt4> val, RValue<UInt> element, int i);
+RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y);
+inline RValue<UInt4> CmpGT(RValue<UInt4> x, RValue<UInt4> y) { return CmpNLE(x, y); }
+inline RValue<UInt4> CmpGE(RValue<UInt4> x, RValue<UInt4> y) { return CmpNLT(x, y); }
+RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt> Extract(RValue<UInt4> val, int i);
+RValue<UInt4> Insert(RValue<UInt4> val, RValue<UInt> element, int i);
 //	RValue<UInt4> RoundInt(RValue<Float4> cast);
-	RValue<UInt4> Swizzle(RValue<UInt4> x, uint16_t select);
-	RValue<UInt4> Shuffle(RValue<UInt4> x, RValue<UInt4> y, uint16_t select);
+RValue<UInt4> Swizzle(RValue<UInt4> x, uint16_t select);
+RValue<UInt4> Shuffle(RValue<UInt4> x, RValue<UInt4> y, uint16_t select);
 
-	class Half : public LValue<Half>
-	{
-	public:
-		explicit Half(RValue<Float> cast);
+class Half : public LValue<Half>
+{
+public:
+	explicit Half(RValue<Float> cast);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
-	class Float : public LValue<Float>
-	{
-	public:
-		explicit Float(RValue<Int> cast);
-		explicit Float(RValue<UInt> cast);
-		explicit Float(RValue<Half> cast);
+class Float : public LValue<Float>
+{
+public:
+	explicit Float(RValue<Int> cast);
+	explicit Float(RValue<UInt> cast);
+	explicit Float(RValue<Half> cast);
 
-		Float() = default;
-		Float(float x);
-		Float(RValue<Float> rhs);
-		Float(const Float &rhs);
-		Float(const Reference<Float> &rhs);
-		Float(Argument<Float> argument);
+	Float() = default;
+	Float(float x);
+	Float(RValue<Float> rhs);
+	Float(const Float &rhs);
+	Float(const Reference<Float> &rhs);
+	Float(Argument<Float> argument);
 
-		template<int T>
-		Float(const SwizzleMask1<Float4, T> &rhs);
+	template<int T>
+	Float(const SwizzleMask1<Float4, T> &rhs);
 
-	//	RValue<Float> operator=(float rhs);   // FIXME: Implement
-		RValue<Float> operator=(RValue<Float> rhs);
-		RValue<Float> operator=(const Float &rhs);
-		RValue<Float> operator=(const Reference<Float> &rhs);
+//	RValue<Float> operator=(float rhs);   // FIXME: Implement
+	RValue<Float> operator=(RValue<Float> rhs);
+	RValue<Float> operator=(const Float &rhs);
+	RValue<Float> operator=(const Reference<Float> &rhs);
 
-		template<int T>
-		RValue<Float> operator=(const SwizzleMask1<Float4, T> &rhs);
+	template<int T>
+	RValue<Float> operator=(const SwizzleMask1<Float4, T> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
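
Half is a storage-only 16-bit float: it declares no arithmetic operators, only conversions through Float. A round-trip sketch:

	Float f = 3.5f;
	Half h = Half(f);    // convert to fp16 storage
	Float g = Float(h);  // back to fp32; 3.5 is exactly representable in fp16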
 
-	RValue<Float> operator+(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Float> operator-(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Float> operator*(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Float> operator/(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Float> operator+=(Float &lhs, RValue<Float> rhs);
-	RValue<Float> operator-=(Float &lhs, RValue<Float> rhs);
-	RValue<Float> operator*=(Float &lhs, RValue<Float> rhs);
-	RValue<Float> operator/=(Float &lhs, RValue<Float> rhs);
-	RValue<Float> operator+(RValue<Float> val);
-	RValue<Float> operator-(RValue<Float> val);
-	RValue<Bool> operator<(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Bool> operator<=(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Bool> operator>(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Bool> operator>=(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Bool> operator!=(RValue<Float> lhs, RValue<Float> rhs);
-	RValue<Bool> operator==(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Float> operator+(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Float> operator-(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Float> operator*(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Float> operator/(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Float> operator+=(Float &lhs, RValue<Float> rhs);
+RValue<Float> operator-=(Float &lhs, RValue<Float> rhs);
+RValue<Float> operator*=(Float &lhs, RValue<Float> rhs);
+RValue<Float> operator/=(Float &lhs, RValue<Float> rhs);
+RValue<Float> operator+(RValue<Float> val);
+RValue<Float> operator-(RValue<Float> val);
+RValue<Bool> operator<(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Bool> operator<=(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Bool> operator>(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Bool> operator>=(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Bool> operator!=(RValue<Float> lhs, RValue<Float> rhs);
+RValue<Bool> operator==(RValue<Float> lhs, RValue<Float> rhs);
 
-	RValue<Float> Abs(RValue<Float> x);
-	RValue<Float> Max(RValue<Float> x, RValue<Float> y);
-	RValue<Float> Min(RValue<Float> x, RValue<Float> y);
-	RValue<Float> Rcp_pp(RValue<Float> val, bool exactAtPow2 = false);
-	RValue<Float> RcpSqrt_pp(RValue<Float> val);
-	RValue<Float> Sqrt(RValue<Float> x);
+RValue<Float> Abs(RValue<Float> x);
+RValue<Float> Max(RValue<Float> x, RValue<Float> y);
+RValue<Float> Min(RValue<Float> x, RValue<Float> y);
+RValue<Float> Rcp_pp(RValue<Float> val, bool exactAtPow2 = false);
+RValue<Float> RcpSqrt_pp(RValue<Float> val);
+RValue<Float> Sqrt(RValue<Float> x);
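
The _pp suffix presumably follows the shader-assembly convention for partial precision: faster, reduced-accuracy reciprocal and reciprocal-square-root approximations, with exactAtPow2 requesting an exact result when the input is a power of two. For instance:

	Float x = 3.0f;
	Float r = Rcp_pp(x);                  // approximately 1/3, reduced precision
	Float q = Rcp_pp(Float(4.0f), true);  // exactAtPow2: exactly 0.25 (assumed behavior)
	Float i = RcpSqrt_pp(x);              // approximately 1/sqrt(3)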
 
 //	RValue<Int4> IsInf(RValue<Float> x);
 //	RValue<Int4> IsNan(RValue<Float> x);
-	RValue<Float> Round(RValue<Float> x);
-	RValue<Float> Trunc(RValue<Float> x);
-	RValue<Float> Frac(RValue<Float> x);
-	RValue<Float> Floor(RValue<Float> x);
-	RValue<Float> Ceil(RValue<Float> x);
+RValue<Float> Round(RValue<Float> x);
+RValue<Float> Trunc(RValue<Float> x);
+RValue<Float> Frac(RValue<Float> x);
+RValue<Float> Floor(RValue<Float> x);
+RValue<Float> Ceil(RValue<Float> x);
 
-	// Trigonometric functions
-	// TODO: Currently unimplemented for Subzero.
+// Trigonometric functions
+// TODO: Currently unimplemented for Subzero.
 //	RValue<Float> Sin(RValue<Float> x);
 //	RValue<Float> Cos(RValue<Float> x);
 //	RValue<Float> Tan(RValue<Float> x);
@@ -2117,49 +2117,49 @@
 //	RValue<Float> Atanh(RValue<Float> x);
 //	RValue<Float> Atan2(RValue<Float> x, RValue<Float> y);
 
-	// Exponential functions
-	// TODO: Currently unimplemented for Subzero.
+// Exponential functions
+// TODO: Currently unimplemented for Subzero.
 //	RValue<Float> Pow(RValue<Float> x, RValue<Float> y);
 //	RValue<Float> Exp(RValue<Float> x);
 //	RValue<Float> Log(RValue<Float> x);
-	RValue<Float> Exp2(RValue<Float> x);
-	RValue<Float> Log2(RValue<Float> x);
+RValue<Float> Exp2(RValue<Float> x);
+RValue<Float> Log2(RValue<Float> x);
 
-	class Float2 : public LValue<Float2>
-	{
-	public:
-	//	explicit Float2(RValue<Byte2> cast);
-	//	explicit Float2(RValue<Short2> cast);
-	//	explicit Float2(RValue<UShort2> cast);
-	//	explicit Float2(RValue<Int2> cast);
-	//	explicit Float2(RValue<UInt2> cast);
-		explicit Float2(RValue<Float4> cast);
+class Float2 : public LValue<Float2>
+{
+public:
+//	explicit Float2(RValue<Byte2> cast);
+//	explicit Float2(RValue<Short2> cast);
+//	explicit Float2(RValue<UShort2> cast);
+//	explicit Float2(RValue<Int2> cast);
+//	explicit Float2(RValue<UInt2> cast);
+	explicit Float2(RValue<Float4> cast);
 
-		Float2() = default;
-	//	Float2(float x, float y);
-	//	Float2(RValue<Float2> rhs);
-	//	Float2(const Float2 &rhs);
-	//	Float2(const Reference<Float2> &rhs);
-	//	Float2(RValue<Float> rhs);
-	//	Float2(const Float &rhs);
-	//	Float2(const Reference<Float> &rhs);
+	Float2() = default;
+//	Float2(float x, float y);
+//	Float2(RValue<Float2> rhs);
+//	Float2(const Float2 &rhs);
+//	Float2(const Reference<Float2> &rhs);
+//	Float2(RValue<Float> rhs);
+//	Float2(const Float &rhs);
+//	Float2(const Reference<Float> &rhs);
 
-	//	template<int T>
-	//	Float2(const SwizzleMask1<T> &rhs);
+//	template<int T>
+//	Float2(const SwizzleMask1<T> &rhs);
 
-	//	RValue<Float2> operator=(float replicate);
-	//	RValue<Float2> operator=(RValue<Float2> rhs);
-	//	RValue<Float2> operator=(const Float2 &rhs);
-	//	RValue<Float2> operator=(const Reference<Float2> &rhs);
-	//	RValue<Float2> operator=(RValue<Float> rhs);
-	//	RValue<Float2> operator=(const Float &rhs);
-	//	RValue<Float2> operator=(const Reference<Float> &rhs);
+//	RValue<Float2> operator=(float replicate);
+//	RValue<Float2> operator=(RValue<Float2> rhs);
+//	RValue<Float2> operator=(const Float2 &rhs);
+//	RValue<Float2> operator=(const Reference<Float2> &rhs);
+//	RValue<Float2> operator=(RValue<Float> rhs);
+//	RValue<Float2> operator=(const Float &rhs);
+//	RValue<Float2> operator=(const Reference<Float> &rhs);
 
-	//	template<int T>
-	//	RValue<Float2> operator=(const SwizzleMask1<T> &rhs);
+//	template<int T>
+//	RValue<Float2> operator=(const SwizzleMask1<T> &rhs);
 
-		static Type *getType();
-	};
+	static Type *getType();
+};
 
 //	RValue<Float2> operator+(RValue<Float2> lhs, RValue<Float2> rhs);
 //	RValue<Float2> operator-(RValue<Float2> lhs, RValue<Float2> rhs);
@@ -2180,868 +2180,871 @@
 //	RValue<Float2> Swizzle(RValue<Float2> x, uint16_t select);
 //	RValue<Float2> Mask(Float2 &lhs, RValue<Float2> rhs, uint16_t select);
 
-	class Float4 : public LValue<Float4>, public XYZW<Float4>
+class Float4 : public LValue<Float4>, public XYZW<Float4>
+{
+public:
+	explicit Float4(RValue<Byte4> cast);
+	explicit Float4(RValue<SByte4> cast);
+	explicit Float4(RValue<Short4> cast);
+	explicit Float4(RValue<UShort4> cast);
+	explicit Float4(RValue<Int4> cast);
+	explicit Float4(RValue<UInt4> cast);
+
+	Float4();
+	Float4(float xyzw);
+	Float4(float x, float yzw);
+	Float4(float x, float y, float zw);
+	Float4(float x, float y, float z, float w);
+	Float4(RValue<Float4> rhs);
+	Float4(const Float4 &rhs);
+	Float4(const Reference<Float4> &rhs);
+	Float4(RValue<Float> rhs);
+	Float4(const Float &rhs);
+	Float4(const Reference<Float> &rhs);
+
+	template<int T>
+	Float4(const SwizzleMask1<Float4, T> &rhs);
+	template<int T>
+	Float4(const Swizzle4<Float4, T> &rhs);
+	template<int X, int Y>
+	Float4(const Swizzle2<Float4, X> &x, const Swizzle2<Float4, Y> &y);
+	template<int X, int Y>
+	Float4(const SwizzleMask2<Float4, X> &x, const Swizzle2<Float4, Y> &y);
+	template<int X, int Y>
+	Float4(const Swizzle2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y);
+	template<int X, int Y>
+	Float4(const SwizzleMask2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y);
+
+	RValue<Float4> operator=(float replicate);
+	RValue<Float4> operator=(RValue<Float4> rhs);
+	RValue<Float4> operator=(const Float4 &rhs);
+	RValue<Float4> operator=(const Reference<Float4> &rhs);
+	RValue<Float4> operator=(RValue<Float> rhs);
+	RValue<Float4> operator=(const Float &rhs);
+	RValue<Float4> operator=(const Reference<Float> &rhs);
+
+	template<int T>
+	RValue<Float4> operator=(const SwizzleMask1<Float4, T> &rhs);
+	template<int T>
+	RValue<Float4> operator=(const Swizzle4<Float4, T> &rhs);
+
+	static Type *getType();
+	static Float4 negative_inf();
+	static Float4 positive_inf();
+
+private:
+	void constant(float x, float y, float z, float w);
+	void infinity_constant(bool negative);
+};
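
The paired Swizzle2/SwizzleMask2 constructors splice two-lane halves from two sources, with the first argument presumably filling lanes 0-1 and the second lanes 2-3:

	Float4 a(1.0f, 2.0f, 3.0f, 4.0f);
	Float4 b(5.0f, 6.0f, 7.0f, 8.0f);
	Float4 c(a.xy, b.zw);  // (1, 2, 7, 8) under that assumption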
+
+RValue<Float4> operator+(RValue<Float4> lhs, RValue<Float4> rhs);
+RValue<Float4> operator-(RValue<Float4> lhs, RValue<Float4> rhs);
+RValue<Float4> operator*(RValue<Float4> lhs, RValue<Float4> rhs);
+RValue<Float4> operator/(RValue<Float4> lhs, RValue<Float4> rhs);
+RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs);
+RValue<Float4> operator+=(Float4 &lhs, RValue<Float4> rhs);
+RValue<Float4> operator-=(Float4 &lhs, RValue<Float4> rhs);
+RValue<Float4> operator*=(Float4 &lhs, RValue<Float4> rhs);
+RValue<Float4> operator/=(Float4 &lhs, RValue<Float4> rhs);
+RValue<Float4> operator%=(Float4 &lhs, RValue<Float4> rhs);
+RValue<Float4> operator+(RValue<Float4> val);
+RValue<Float4> operator-(RValue<Float4> val);
+
+RValue<Float4> Abs(RValue<Float4> x);
+RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y);
+RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y);
+RValue<Float4> Rcp_pp(RValue<Float4> val, bool exactAtPow2 = false);
+RValue<Float4> RcpSqrt_pp(RValue<Float4> val);
+RValue<Float4> Sqrt(RValue<Float4> x);
+RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i);
+RValue<Float> Extract(RValue<Float4> x, int i);
+RValue<Float4> Swizzle(RValue<Float4> x, uint16_t select);
+RValue<Float4> Shuffle(RValue<Float4> x, RValue<Float4> y, uint16_t select);
+RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, uint16_t imm);
+RValue<Float4> UnpackLow(RValue<Float4> x, RValue<Float4> y);
+RValue<Float4> UnpackHigh(RValue<Float4> x, RValue<Float4> y);
+RValue<Float4> Mask(Float4 &lhs, RValue<Float4> rhs, uint16_t select);
+RValue<Int> SignMask(RValue<Float4> x);
+
+// Ordered comparison functions
+RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y);
+inline RValue<Int4> CmpGT(RValue<Float4> x, RValue<Float4> y) { return CmpNLE(x, y); }
+inline RValue<Int4> CmpGE(RValue<Float4> x, RValue<Float4> y) { return CmpNLT(x, y); }
+
+// Unordered comparison functions
+RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y);
+RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y);
+inline RValue<Int4> CmpUGT(RValue<Float4> x, RValue<Float4> y) { return CmpUNLE(x, y); }
+inline RValue<Int4> CmpUGE(RValue<Float4> x, RValue<Float4> y) { return CmpUNLT(x, y); }
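
The ordered comparisons above evaluate to false on any lane where either operand is NaN, while the unordered (U-prefixed) variants evaluate to true there; the two families agree on all non-NaN lanes. A sketch, assuming IEEE semantics in the generated code:

	Float4 x = Float4(0.0f) / Float4(0.0f);  // NaN in every lane
	Float4 y(1.0f);
	Int4 gt = CmpGT(x, y);    // ordered:   all lanes 0
	Int4 ugt = CmpUGT(x, y);  // unordered: all lanes ~0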
+
+RValue<Int4> IsInf(RValue<Float4> x);
+RValue<Int4> IsNan(RValue<Float4> x);
+RValue<Float4> Round(RValue<Float4> x);
+RValue<Float4> Trunc(RValue<Float4> x);
+RValue<Float4> Frac(RValue<Float4> x);
+RValue<Float4> Floor(RValue<Float4> x);
+RValue<Float4> Ceil(RValue<Float4> x);
+
+// Trigonometric functions
+// TODO: Currently unimplemented for Subzero.
+RValue<Float4> Sin(RValue<Float4> x);
+RValue<Float4> Cos(RValue<Float4> x);
+RValue<Float4> Tan(RValue<Float4> x);
+RValue<Float4> Asin(RValue<Float4> x);
+RValue<Float4> Acos(RValue<Float4> x);
+RValue<Float4> Atan(RValue<Float4> x);
+RValue<Float4> Sinh(RValue<Float4> x);
+RValue<Float4> Cosh(RValue<Float4> x);
+RValue<Float4> Tanh(RValue<Float4> x);
+RValue<Float4> Asinh(RValue<Float4> x);
+RValue<Float4> Acosh(RValue<Float4> x);
+RValue<Float4> Atanh(RValue<Float4> x);
+RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y);
+
+// Exponential functions
+// TODO: Currently unimplemented for Subzero.
+RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
+RValue<Float4> Exp(RValue<Float4> x);
+RValue<Float4> Log(RValue<Float4> x);
+RValue<Float4> Exp2(RValue<Float4> x);
+RValue<Float4> Log2(RValue<Float4> x);
+
+// Bit manipulation functions
+// TODO: Currently unimplemented for Subzero.
+
+// Count leading zeros.
+// Returns 32 when: !isZeroUndef && x == 0.
+// Returns an undefined value when: isZeroUndef && x == 0.
+RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef);
+RValue<UInt4> Ctlz(RValue<UInt4> x, bool isZeroUndef);
+
+// Count trailing zeros.
+// Returns 32 when: !isZeroUndef && x == 0.
+// Returns an undefined value when: isZeroUndef && x == 0.
+RValue<UInt> Cttz(RValue<UInt> x, bool isZeroUndef);
+RValue<UInt4> Cttz(RValue<UInt4> x, bool isZeroUndef);
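+
+// Example usage (a minimal sketch; assumes CToReactorT maps 'unsigned int'
+// to UInt and that the returned routine object is directly callable):
+//
+//   FunctionT<unsigned int(unsigned int)> function;
+//   {
+//       UInt x = function.Arg<0>();
+//       Return(Ctlz(x, false));  // isZeroUndef = false, so Ctlz(0) == 32
+//   }
+//   auto routine = function("ctlz");
+//   unsigned int lz = routine(1u);  // 31 leading zero bits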
+
+template<class T>
+class Pointer : public LValue<Pointer<T>>
+{
+public:
+	template<class S>
+	Pointer(RValue<Pointer<S>> pointerS, int alignment = 1) : alignment(alignment)
 	{
-	public:
-		explicit Float4(RValue<Byte4> cast);
-		explicit Float4(RValue<SByte4> cast);
-		explicit Float4(RValue<Short4> cast);
-		explicit Float4(RValue<UShort4> cast);
-		explicit Float4(RValue<Int4> cast);
-		explicit Float4(RValue<UInt4> cast);
-
-		Float4();
-		Float4(float xyzw);
-		Float4(float x, float yzw);
-		Float4(float x, float y, float zw);
-		Float4(float x, float y, float z, float w);
-		Float4(RValue<Float4> rhs);
-		Float4(const Float4 &rhs);
-		Float4(const Reference<Float4> &rhs);
-		Float4(RValue<Float> rhs);
-		Float4(const Float &rhs);
-		Float4(const Reference<Float> &rhs);
-
-		template<int T>
-		Float4(const SwizzleMask1<Float4, T> &rhs);
-		template<int T>
-		Float4(const Swizzle4<Float4, T> &rhs);
-		template<int X, int Y>
-		Float4(const Swizzle2<Float4, X> &x, const Swizzle2<Float4, Y> &y);
-		template<int X, int Y>
-		Float4(const SwizzleMask2<Float4, X> &x, const Swizzle2<Float4, Y> &y);
-		template<int X, int Y>
-		Float4(const Swizzle2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y);
-		template<int X, int Y>
-		Float4(const SwizzleMask2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y);
-
-		RValue<Float4> operator=(float replicate);
-		RValue<Float4> operator=(RValue<Float4> rhs);
-		RValue<Float4> operator=(const Float4 &rhs);
-		RValue<Float4> operator=(const Reference<Float4> &rhs);
-		RValue<Float4> operator=(RValue<Float> rhs);
-		RValue<Float4> operator=(const Float &rhs);
-		RValue<Float4> operator=(const Reference<Float> &rhs);
-
-		template<int T>
-		RValue<Float4> operator=(const SwizzleMask1<Float4, T> &rhs);
-		template<int T>
-		RValue<Float4> operator=(const Swizzle4<Float4, T> &rhs);
-
-		static Type *getType();
-		static Float4 negative_inf();
-		static Float4 positive_inf();
-	private:
-		void constant(float x, float y, float z, float w);
-		void infinity_constant(bool negative);
-	};
-
-	RValue<Float4> operator+(RValue<Float4> lhs, RValue<Float4> rhs);
-	RValue<Float4> operator-(RValue<Float4> lhs, RValue<Float4> rhs);
-	RValue<Float4> operator*(RValue<Float4> lhs, RValue<Float4> rhs);
-	RValue<Float4> operator/(RValue<Float4> lhs, RValue<Float4> rhs);
-	RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs);
-	RValue<Float4> operator+=(Float4 &lhs, RValue<Float4> rhs);
-	RValue<Float4> operator-=(Float4 &lhs, RValue<Float4> rhs);
-	RValue<Float4> operator*=(Float4 &lhs, RValue<Float4> rhs);
-	RValue<Float4> operator/=(Float4 &lhs, RValue<Float4> rhs);
-	RValue<Float4> operator%=(Float4 &lhs, RValue<Float4> rhs);
-	RValue<Float4> operator+(RValue<Float4> val);
-	RValue<Float4> operator-(RValue<Float4> val);
-
-	RValue<Float4> Abs(RValue<Float4> x);
-	RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y);
-	RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y);
-	RValue<Float4> Rcp_pp(RValue<Float4> val, bool exactAtPow2 = false);
-	RValue<Float4> RcpSqrt_pp(RValue<Float4> val);
-	RValue<Float4> Sqrt(RValue<Float4> x);
-	RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i);
-	RValue<Float> Extract(RValue<Float4> x, int i);
-	RValue<Float4> Swizzle(RValue<Float4> x, uint16_t select);
-	RValue<Float4> Shuffle(RValue<Float4> x, RValue<Float4> y, uint16_t select);
-	RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, uint16_t imm);
-	RValue<Float4> UnpackLow(RValue<Float4> x, RValue<Float4> y);
-	RValue<Float4> UnpackHigh(RValue<Float4> x, RValue<Float4> y);
-	RValue<Float4> Mask(Float4 &lhs, RValue<Float4> rhs, uint16_t select);
-	RValue<Int> SignMask(RValue<Float4> x);
-
-	// Ordered comparison functions
-	RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y);
-	inline RValue<Int4> CmpGT(RValue<Float4> x, RValue<Float4> y) { return CmpNLE(x, y); }
-	inline RValue<Int4> CmpGE(RValue<Float4> x, RValue<Float4> y) { return CmpNLT(x, y); }
-
-	// Unordered comparison functions
-	RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y);
-	RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y);
-	inline RValue<Int4> CmpUGT(RValue<Float4> x, RValue<Float4> y) { return CmpUNLE(x, y); }
-	inline RValue<Int4> CmpUGE(RValue<Float4> x, RValue<Float4> y) { return CmpUNLT(x, y); }
-
-	RValue<Int4> IsInf(RValue<Float4> x);
-	RValue<Int4> IsNan(RValue<Float4> x);
-	RValue<Float4> Round(RValue<Float4> x);
-	RValue<Float4> Trunc(RValue<Float4> x);
-	RValue<Float4> Frac(RValue<Float4> x);
-	RValue<Float4> Floor(RValue<Float4> x);
-	RValue<Float4> Ceil(RValue<Float4> x);
-
-	// Trigonometric functions
-	// TODO: Currently unimplemented for Subzero.
-	RValue<Float4> Sin(RValue<Float4> x);
-	RValue<Float4> Cos(RValue<Float4> x);
-	RValue<Float4> Tan(RValue<Float4> x);
-	RValue<Float4> Asin(RValue<Float4> x);
-	RValue<Float4> Acos(RValue<Float4> x);
-	RValue<Float4> Atan(RValue<Float4> x);
-	RValue<Float4> Sinh(RValue<Float4> x);
-	RValue<Float4> Cosh(RValue<Float4> x);
-	RValue<Float4> Tanh(RValue<Float4> x);
-	RValue<Float4> Asinh(RValue<Float4> x);
-	RValue<Float4> Acosh(RValue<Float4> x);
-	RValue<Float4> Atanh(RValue<Float4> x);
-	RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y);
-
-	// Exponential functions
-	// TODO: Currently unimplemented for Subzero.
-	RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
-	RValue<Float4> Exp(RValue<Float4> x);
-	RValue<Float4> Log(RValue<Float4> x);
-	RValue<Float4> Exp2(RValue<Float4> x);
-	RValue<Float4> Log2(RValue<Float4> x);
-
-	// Bit Manipulation functions.
-	// TODO: Currently unimplemented for Subzero.
-
-	// Count leading zeros.
-	// Returns 32 when: !isZeroUndef && x == 0.
-	// Returns an undefined value when: isZeroUndef && x == 0.
-	RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef);
-	RValue<UInt4> Ctlz(RValue<UInt4> x, bool isZeroUndef);
-
-	// Count trailing zeros.
-	// Returns 32 when: !isZeroUndef && x == 0.
-	// Returns an undefined value when: isZeroUndef && x == 0.
-	RValue<UInt> Cttz(RValue<UInt> x, bool isZeroUndef);
-	RValue<UInt4> Cttz(RValue<UInt4> x, bool isZeroUndef);
-
-	template<class T>
-	class Pointer : public LValue<Pointer<T>>
-	{
-	public:
-		template<class S>
-		Pointer(RValue<Pointer<S>> pointerS, int alignment = 1) : alignment(alignment)
-		{
-			Value *pointerT = Nucleus::createBitCast(pointerS.value, Nucleus::getPointerType(T::getType()));
-			LValue<Pointer<T>>::storeValue(pointerT);
-		}
-
-		template<class S>
-		Pointer(const Pointer<S> &pointer, int alignment = 1) : alignment(alignment)
-		{
-			Value *pointerS = pointer.loadValue();
-			Value *pointerT = Nucleus::createBitCast(pointerS, Nucleus::getPointerType(T::getType()));
-			LValue<Pointer<T>>::storeValue(pointerT);
-		}
-
-		Pointer(Argument<Pointer<T>> argument);
-
-		Pointer();
-		Pointer(RValue<Pointer<T>> rhs);
-		Pointer(const Pointer<T> &rhs);
-		Pointer(const Reference<Pointer<T>> &rhs);
-		Pointer(std::nullptr_t);
-
-		RValue<Pointer<T>> operator=(RValue<Pointer<T>> rhs);
-		RValue<Pointer<T>> operator=(const Pointer<T> &rhs);
-		RValue<Pointer<T>> operator=(const Reference<Pointer<T>> &rhs);
-		RValue<Pointer<T>> operator=(std::nullptr_t);
-
-		Reference<T> operator*();
-		Reference<T> operator[](int index);
-		Reference<T> operator[](unsigned int index);
-		Reference<T> operator[](RValue<Int> index);
-		Reference<T> operator[](RValue<UInt> index);
-
-		static Type *getType();
-
-	private:
-		const int alignment;
-	};
-
-	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, int offset);
-	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<Int> offset);
-	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<UInt> offset);
-	RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, int offset);
-	RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<Int> offset);
-	RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<UInt> offset);
-
-	RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, int offset);
-	RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<Int> offset);
-	RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<UInt> offset);
-	RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, int offset);
-	RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<Int> offset);
-	RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<UInt> offset);
-
-	template <typename T>
-	RValue<Bool> operator==(const Pointer<T> &lhs, const Pointer<T> &rhs)
-	{
-		return RValue<Bool>(Nucleus::createPtrEQ(lhs.loadValue(), rhs.loadValue()));
+		Value *pointerT = Nucleus::createBitCast(pointerS.value, Nucleus::getPointerType(T::getType()));
+		LValue<Pointer<T>>::storeValue(pointerT);
 	}
 
-	template<typename T>
-	RValue<T> Load(RValue<Pointer<T>> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
+	template<class S>
+	Pointer(const Pointer<S> &pointer, int alignment = 1) : alignment(alignment)
 	{
-		return RValue<T>(Nucleus::createLoad(pointer.value, T::getType(), false, alignment, atomic, memoryOrder));
+		Value *pointerS = pointer.loadValue();
+		Value *pointerT = Nucleus::createBitCast(pointerS, Nucleus::getPointerType(T::getType()));
+		LValue<Pointer<T>>::storeValue(pointerT);
 	}
 
-	template<typename T>
-	RValue<T> Load(Pointer<T> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
-	{
-		return Load(RValue<Pointer<T>>(pointer), alignment, atomic, memoryOrder);
-	}
+	Pointer(Argument<Pointer<T>> argument);
 
-	// TODO: Use SIMD to template these.
-	RValue<Float4> MaskedLoad(RValue<Pointer<Float4>> base, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-	RValue<Int4> MaskedLoad(RValue<Pointer<Int4>> base, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-	void MaskedStore(RValue<Pointer<Float4>> base, RValue<Float4> val, RValue<Int4> mask, unsigned int alignment);
-	void MaskedStore(RValue<Pointer<Int4>> base, RValue<Int4> val, RValue<Int4> mask, unsigned int alignment);
+	Pointer();
+	Pointer(RValue<Pointer<T>> rhs);
+	Pointer(const Pointer<T> &rhs);
+	Pointer(const Reference<Pointer<T>> &rhs);
+	Pointer(std::nullptr_t);
 
-	RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-	RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-	void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
-	void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+	RValue<Pointer<T>> operator=(RValue<Pointer<T>> rhs);
+	RValue<Pointer<T>> operator=(const Pointer<T> &rhs);
+	RValue<Pointer<T>> operator=(const Reference<Pointer<T>> &rhs);
+	RValue<Pointer<T>> operator=(std::nullptr_t);
 
-	template<typename T>
-	void Store(RValue<T> value, RValue<Pointer<T>> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
-	{
-		Nucleus::createStore(value.value, pointer.value, T::getType(), false, alignment, atomic, memoryOrder);
-	}
+	Reference<T> operator*();
+	Reference<T> operator[](int index);
+	Reference<T> operator[](unsigned int index);
+	Reference<T> operator[](RValue<Int> index);
+	Reference<T> operator[](RValue<UInt> index);
 
-	template<typename T>
-	void Store(RValue<T> value, Pointer<T> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
-	{
-		Store(value, RValue<Pointer<T>>(pointer), alignment, atomic, memoryOrder);
-	}
+	static Type *getType();
 
-	template<typename T>
-	void Store(T value, Pointer<T> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
-	{
-		Store(RValue<T>(value), RValue<Pointer<T>>(pointer), alignment, atomic, memoryOrder);
-	}
+private:
+	const int alignment;
+};
 
-	// Fence adds a memory barrier that enforces ordering constraints on memory
-	// operations. memoryOrder can only be one of:
-	// std::memory_order_acquire, std::memory_order_release,
-	// std::memory_order_acq_rel, or std::memory_order_seq_cst.
-	void Fence(std::memory_order memoryOrder);
+RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, int offset);
+RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<Int> offset);
+RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<UInt> offset);
+RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, int offset);
+RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<Int> offset);
+RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<UInt> offset);
 
-	template<class T, int S = 1>
-	class Array : public LValue<T>
-	{
-	public:
-		Array(int size = S);
+RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, int offset);
+RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<Int> offset);
+RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<UInt> offset);
+RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, int offset);
+RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<Int> offset);
+RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<UInt> offset);
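+
+// Example (sketch; 'base' is a Pointer<Byte>, 'y' and 'pitch' are Int values,
+// using the Int arithmetic operators declared earlier in this header):
+//
+//   Pointer<Byte> row = base + y * pitch;  // byte-granular row addressing
+//   row += 16;                             // advance within the row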
 
-		Reference<T> operator[](int index);
-		Reference<T> operator[](unsigned int index);
-		Reference<T> operator[](RValue<Int> index);
-		Reference<T> operator[](RValue<UInt> index);
+template <typename T>
+RValue<Bool> operator==(const Pointer<T> &lhs, const Pointer<T> &rhs)
+{
+	return RValue<Bool>(Nucleus::createPtrEQ(lhs.loadValue(), rhs.loadValue()));
+}
 
-		// self() returns the this pointer to this Array object.
-		// This function exists because operator&() is overloaded by LValue<T>.
-		inline Array* self() { return this; }
-	};
+template<typename T>
+RValue<T> Load(RValue<Pointer<T>> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
+{
+	return RValue<T>(Nucleus::createLoad(pointer.value, T::getType(), false, alignment, atomic, memoryOrder));
+}
+
+template<typename T>
+RValue<T> Load(Pointer<T> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
+{
+	return Load(RValue<Pointer<T>>(pointer), alignment, atomic, memoryOrder);
+}
+
+// TODO: Use SIMD to template these.
+RValue<Float4> MaskedLoad(RValue<Pointer<Float4>> base, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+RValue<Int4> MaskedLoad(RValue<Pointer<Int4>> base, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+void MaskedStore(RValue<Pointer<Float4>> base, RValue<Float4> val, RValue<Int4> mask, unsigned int alignment);
+void MaskedStore(RValue<Pointer<Int4>> base, RValue<Int4> val, RValue<Int4> mask, unsigned int alignment);
+
+RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
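+
+// Example (sketch): gather one float per lane from 'base' (a Pointer<Float>)
+// at per-lane 'offsets', zeroing inactive lanes. The active-lane encoding of
+// 'mask' (all-ones per enabled lane) is an assumption here:
+//
+//   Float4 v = Gather(base, offsets, mask, sizeof(float), true /*zeroMaskedLanes*/);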
+
+template<typename T>
+void Store(RValue<T> value, RValue<Pointer<T>> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
+{
+	Nucleus::createStore(value.value, pointer.value, T::getType(), false, alignment, atomic, memoryOrder);
+}
+
+template<typename T>
+void Store(RValue<T> value, Pointer<T> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
+{
+	Store(value, RValue<Pointer<T>>(pointer), alignment, atomic, memoryOrder);
+}
+
+template<typename T>
+void Store(T value, Pointer<T> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
+{
+	Store(RValue<T>(value), RValue<Pointer<T>>(pointer), alignment, atomic, memoryOrder);
+}
+
+// Fence adds a memory barrier that enforces ordering constraints on memory
+// operations. memoryOrder can only be one of:
+// std::memory_order_acquire, std::memory_order_release,
+// std::memory_order_acq_rel, or std::memory_order_seq_cst.
+void Fence(std::memory_order memoryOrder);
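+
+// Example (sketch): a release store paired with an acquire load, using the
+// atomic Load/Store overloads declared above ('ptr' is a Pointer<Int>):
+//
+//   Store(value, ptr, sizeof(int), true /*atomic*/, std::memory_order_release);
+//   // ... in the consuming routine:
+//   Int v = Load(ptr, sizeof(int), true /*atomic*/, std::memory_order_acquire);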
+
+template<class T, int S = 1>
+class Array : public LValue<T>
+{
+public:
+	Array(int size = S);
+
+	Reference<T> operator[](int index);
+	Reference<T> operator[](unsigned int index);
+	Reference<T> operator[](RValue<Int> index);
+	Reference<T> operator[](RValue<UInt> index);
+
+	// self() returns a plain pointer to this Array object. This function
+	// exists because operator&() is overloaded by LValue<T>.
+	inline Array* self() { return this; }
+};
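+
+// Example (sketch): operator&() yields the address within the generated code,
+// so self() is the way to reach the host-side object:
+//
+//   Array<Float, 8> a;
+//   RValue<Pointer<Float>> p = &a;     // reactor address-of, via LValue<T>
+//   Array<Float, 8> *host = a.self();  // ordinary C++ pointer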
 
 //	RValue<Array<T>> operator++(Array<T> &val, int);   // Post-increment
 //	const Array<T> &operator++(Array<T> &val);   // Pre-increment
 //	RValue<Array<T>> operator--(Array<T> &val, int);   // Post-decrement
 //	const Array<T> &operator--(Array<T> &val);   // Pre-decrement
 
-	void branch(RValue<Bool> cmp, BasicBlock *bodyBB, BasicBlock *endBB);
+void branch(RValue<Bool> cmp, BasicBlock *bodyBB, BasicBlock *endBB);
 
-	// ValueOf returns a rr::Value* for the given C-type, RValue<T>, LValue<T>
-	// or Reference<T>.
-	template <typename T>
-	inline Value* ValueOf(const T &v)
-	{
-		return ReactorType<T>::cast(v).loadValue();
-	}
-
-	void Return();
-
-	template<class T>
-	void Return(const T &ret)
-	{
-		static_assert(CanBeUsedAsReturn< ReactorTypeT<T> >::value, "Unsupported type for Return()");
-		Nucleus::createRet(ValueOf<T>(ret));
-		// Place any unreachable instructions in an unreferenced block.
-		Nucleus::setInsertBlock(Nucleus::createBasicBlock());
-	}
-
-	// Generic template, leave undefined!
-	template<typename FunctionType>
-	class Function;
-
-	// Specialized for function types
-	template<typename Return, typename... Arguments>
-	class Function<Return(Arguments...)>
-	{
-		// Static assert that the function signature is valid.
-		static_assert(sizeof(AssertFunctionSignatureIsValid<Return(Arguments...)>) >= 0, "Invalid function signature");
-
-	public:
-		Function();
-
-		virtual ~Function();
-
-		template<int index>
-		Argument<typename std::tuple_element<index, std::tuple<Arguments...>>::type> Arg() const
-		{
-			Value *arg = Nucleus::getArgument(index);
-			return Argument<typename std::tuple_element<index, std::tuple<Arguments...>>::type>(arg);
-		}
-
-		std::shared_ptr<Routine> operator()(const char *name, ...);
-		std::shared_ptr<Routine> operator()(const Config::Edit &cfg, const char *name, ...);
-
-	protected:
-		Nucleus *core;
-		std::vector<Type*> arguments;
-	};
-
-	template<typename Return>
-	class Function<Return()> : public Function<Return(Void)>
-	{
-	};
-
-	// FunctionT accepts a C-style function type template argument, allowing it to return a type-safe RoutineT wrapper
-	template<typename FunctionType>
-	class FunctionT;
-
-	template<typename Return, typename... Arguments>
-	class FunctionT<Return(Arguments...)> : public Function<CToReactorT<Return>(CToReactorT<Arguments>...)>
-	{
-	public:
-		// Type of base class
-		using BaseType = Function<CToReactorT<Return>(CToReactorT<Arguments>...)>;
-
-		// Function type, e.g. void(int,float)
-		using CFunctionType = Return(Arguments...);
-
-		// Reactor function type, e.g. Void(Int, Float)
-		using ReactorFunctionType = CToReactorT<Return>(CToReactorT<Arguments>...);
-
-		// Returned RoutineT type
-		using RoutineType = RoutineT<CFunctionType>;
-
-		// Hide base implementations of operator()
-
-		RoutineType operator()(const char* name, ...)
-		{
-			return RoutineType(BaseType::operator()(name));
-		}
-
-		RoutineType operator()(const Config::Edit& cfg, const char* name, ...)
-		{
-			return RoutineType(BaseType::operator()(cfg, name));
-		}
-	};
-
-	RValue<Long> Ticks();
+// ValueOf returns an rr::Value* for the given C-type, RValue<T>, LValue<T>
+// or Reference<T>.
+template <typename T>
+inline Value* ValueOf(const T &v)
+{
+	return ReactorType<T>::cast(v).loadValue();
 }
 
-namespace rr
+void Return();
+
+template<class T>
+void Return(const T &ret)
 {
-	template<class T>
-	LValue<T>::LValue(int arraySize) : Variable(T::getType(), arraySize)
+	static_assert(CanBeUsedAsReturn< ReactorTypeT<T> >::value, "Unsupported type for Return()");
+	Nucleus::createRet(ValueOf<T>(ret));
+	// Place any unreachable instructions in an unreferenced block.
+	Nucleus::setInsertBlock(Nucleus::createBasicBlock());
+}
+
+// Generic template, leave undefined!
+template<typename FunctionType>
+class Function;
+
+// Specialized for function types
+template<typename Return, typename... Arguments>
+class Function<Return(Arguments...)>
+{
+	// Static assert that the function signature is valid.
+	static_assert(sizeof(AssertFunctionSignatureIsValid<Return(Arguments...)>) >= 0, "Invalid function signature");
+
+public:
+	Function();
+
+	virtual ~Function();
+
+	template<int index>
+	Argument<typename std::tuple_element<index, std::tuple<Arguments...>>::type> Arg() const
 	{
+		Value *arg = Nucleus::getArgument(index);
+		return Argument<typename std::tuple_element<index, std::tuple<Arguments...>>::type>(arg);
+	}
+
+	std::shared_ptr<Routine> operator()(const char *name, ...);
+	std::shared_ptr<Routine> operator()(const Config::Edit &cfg, const char *name, ...);
+
+protected:
+	Nucleus *core;
+	std::vector<Type*> arguments;
+};
+
+template<typename Return>
+class Function<Return()> : public Function<Return(Void)>
+{
+};
+
+// FunctionT accepts a C-style function type template argument, allowing it to return a type-safe RoutineT wrapper
+template<typename FunctionType>
+class FunctionT;
+
+template<typename Return, typename... Arguments>
+class FunctionT<Return(Arguments...)> : public Function<CToReactorT<Return>(CToReactorT<Arguments>...)>
+{
+public:
+	// Type of base class
+	using BaseType = Function<CToReactorT<Return>(CToReactorT<Arguments>...)>;
+
+	// Function type, e.g. void(int,float)
+	using CFunctionType = Return(Arguments...);
+
+	// Reactor function type, e.g. Void(Int, Float)
+	using ReactorFunctionType = CToReactorT<Return>(CToReactorT<Arguments>...);
+
+	// Returned RoutineT type
+	using RoutineType = RoutineT<CFunctionType>;
+
+	// Hide base implementations of operator()
+
+	RoutineType operator()(const char* name, ...)
+	{
+		return RoutineType(BaseType::operator()(name));
+	}
+
+	RoutineType operator()(const Config::Edit& cfg, const char* name, ...)
+	{
+		return RoutineType(BaseType::operator()(cfg, name));
+	}
+};
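+
+// Example usage (a minimal sketch; assumes CToReactorT maps 'int' to Int and
+// that RoutineT is callable with the C argument types):
+//
+//   FunctionT<int(int, int)> function;
+//   {
+//       Int a = function.Arg<0>();
+//       Int b = function.Arg<1>();
+//       Return(a + b);
+//   }
+//   auto routine = function("add");  // RoutineT<int(int, int)>
+//   int sum = routine(2, 3);         // type-safe call into the compiled code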
+
+RValue<Long> Ticks();
+
+}  // namespace rr
+
+/* Inline implementations */
+
+namespace rr {
+
+template<class T>
+LValue<T>::LValue(int arraySize) : Variable(T::getType(), arraySize)
+{
 #ifdef ENABLE_RR_DEBUG_INFO
-		materialize();
+	materialize();
 #endif // ENABLE_RR_DEBUG_INFO
-	}
+}
 
-	inline void Variable::materialize() const
+inline void Variable::materialize() const
+{
+	if(!address)
 	{
-		if(!address)
-		{
-			address = Nucleus::allocateStackVariable(type, arraySize);
-			RR_DEBUG_INFO_EMIT_VAR(address);
+		address = Nucleus::allocateStackVariable(type, arraySize);
+		RR_DEBUG_INFO_EMIT_VAR(address);
 
-			if(rvalue)
-			{
-				storeValue(rvalue);
-				rvalue = nullptr;
-			}
-		}
-	}
-
-	inline Value *Variable::loadValue() const
-	{
 		if(rvalue)
 		{
-			return rvalue;
+			storeValue(rvalue);
+			rvalue = nullptr;
 		}
+	}
+}
 
-		if(!address)
-		{
-			// TODO: Return undef instead.
-			materialize();
-		}
-
-		return Nucleus::createLoad(address, type, false, 0);
+inline Value *Variable::loadValue() const
+{
+	if(rvalue)
+	{
+		return rvalue;
 	}
 
-	inline Value *Variable::storeValue(Value *value) const
+	if(!address)
 	{
-		if(address)
-		{
-			return Nucleus::createStore(value, address, type, false, 0);
-		}
-
-		rvalue = value;
-
-		return value;
-	}
-
-	inline Value *Variable::getBaseAddress() const
-	{
+		// TODO: Return undef instead.
 		materialize();
-
-		return address;
 	}
 
-	inline Value *Variable::getElementPointer(Value *index, bool unsignedIndex) const
+	return Nucleus::createLoad(address, type, false, 0);
+}
+
+inline Value *Variable::storeValue(Value *value) const
+{
+	if(address)
 	{
-		return Nucleus::createGEP(getBaseAddress(), type, index, unsignedIndex);
+		return Nucleus::createStore(value, address, type, false, 0);
 	}
 
-	template<class T>
-	RValue<Pointer<T>> LValue<T>::operator&()
-	{
-		return RValue<Pointer<T>>(getBaseAddress());
-	}
+	rvalue = value;
 
-	template<class T>
-	Reference<T>::Reference(Value *pointer, int alignment) : alignment(alignment)
-	{
-		address = pointer;
-	}
+	return value;
+}
 
-	template<class T>
-	RValue<T> Reference<T>::operator=(RValue<T> rhs) const
-	{
-		Nucleus::createStore(rhs.value, address, T::getType(), false, alignment);
+inline Value *Variable::getBaseAddress() const
+{
+	materialize();
 
-		return rhs;
-	}
+	return address;
+}
 
-	template<class T>
-	RValue<T> Reference<T>::operator=(const Reference<T> &ref) const
-	{
-		Value *tmp = Nucleus::createLoad(ref.address, T::getType(), false, ref.alignment);
-		Nucleus::createStore(tmp, address, T::getType(), false, alignment);
+inline Value *Variable::getElementPointer(Value *index, bool unsignedIndex) const
+{
+	return Nucleus::createGEP(getBaseAddress(), type, index, unsignedIndex);
+}
 
-		return RValue<T>(tmp);
-	}
+template<class T>
+RValue<Pointer<T>> LValue<T>::operator&()
+{
+	return RValue<Pointer<T>>(getBaseAddress());
+}
 
-	template<class T>
-	RValue<T> Reference<T>::operator+=(RValue<T> rhs) const
-	{
-		return *this = *this + rhs;
-	}
+template<class T>
+Reference<T>::Reference(Value *pointer, int alignment) : alignment(alignment)
+{
+	address = pointer;
+}
 
-	template<class T>
-	Value *Reference<T>::loadValue() const
-	{
-		return Nucleus::createLoad(address, T::getType(), false, alignment);
-	}
+template<class T>
+RValue<T> Reference<T>::operator=(RValue<T> rhs) const
+{
+	Nucleus::createStore(rhs.value, address, T::getType(), false, alignment);
 
-	template<class T>
-	int Reference<T>::getAlignment() const
-	{
-		return alignment;
-	}
+	return rhs;
+}
+
+template<class T>
+RValue<T> Reference<T>::operator=(const Reference<T> &ref) const
+{
+	Value *tmp = Nucleus::createLoad(ref.address, T::getType(), false, ref.alignment);
+	Nucleus::createStore(tmp, address, T::getType(), false, alignment);
+
+	return RValue<T>(tmp);
+}
+
+template<class T>
+RValue<T> Reference<T>::operator+=(RValue<T> rhs) const
+{
+	return *this = *this + rhs;
+}
+
+template<class T>
+Value *Reference<T>::loadValue() const
+{
+	return Nucleus::createLoad(address, T::getType(), false, alignment);
+}
+
+template<class T>
+int Reference<T>::getAlignment() const
+{
+	return alignment;
+}
 
 #ifdef ENABLE_RR_DEBUG_INFO
-	template<class T>
-	RValue<T>::RValue(const RValue<T> &rvalue) : value(rvalue.value)
-	{
-		RR_DEBUG_INFO_EMIT_VAR(value);
-	}
+template<class T>
+RValue<T>::RValue(const RValue<T> &rvalue) : value(rvalue.value)
+{
+	RR_DEBUG_INFO_EMIT_VAR(value);
+}
 #endif // ENABLE_RR_DEBUG_INFO
 
-	template<class T>
-	RValue<T>::RValue(Value *rvalue)
-	{
-		assert(Nucleus::createBitCast(rvalue, T::getType()) == rvalue);   // Run-time type should match T, so bitcast is no-op.
+template<class T>
+RValue<T>::RValue(Value *rvalue)
+{
+	assert(Nucleus::createBitCast(rvalue, T::getType()) == rvalue);   // Run-time type should match T, so bitcast is no-op.
 
-		value = rvalue;
-		RR_DEBUG_INFO_EMIT_VAR(value);
-	}
+	value = rvalue;
+	RR_DEBUG_INFO_EMIT_VAR(value);
+}
 
-	template<class T>
-	RValue<T>::RValue(const T &lvalue)
-	{
-		value = lvalue.loadValue();
-		RR_DEBUG_INFO_EMIT_VAR(value);
-	}
+template<class T>
+RValue<T>::RValue(const T &lvalue)
+{
+	value = lvalue.loadValue();
+	RR_DEBUG_INFO_EMIT_VAR(value);
+}
 
-	template<class T>
-	RValue<T>::RValue(typename BoolLiteral<T>::type i)
-	{
-		value = Nucleus::createConstantBool(i);
-		RR_DEBUG_INFO_EMIT_VAR(value);
-	}
+template<class T>
+RValue<T>::RValue(typename BoolLiteral<T>::type i)
+{
+	value = Nucleus::createConstantBool(i);
+	RR_DEBUG_INFO_EMIT_VAR(value);
+}
 
-	template<class T>
-	RValue<T>::RValue(typename IntLiteral<T>::type i)
-	{
-		value = Nucleus::createConstantInt(i);
-		RR_DEBUG_INFO_EMIT_VAR(value);
-	}
+template<class T>
+RValue<T>::RValue(typename IntLiteral<T>::type i)
+{
+	value = Nucleus::createConstantInt(i);
+	RR_DEBUG_INFO_EMIT_VAR(value);
+}
 
-	template<class T>
-	RValue<T>::RValue(typename FloatLiteral<T>::type f)
-	{
-		value = Nucleus::createConstantFloat(f);
-		RR_DEBUG_INFO_EMIT_VAR(value);
-	}
+template<class T>
+RValue<T>::RValue(typename FloatLiteral<T>::type f)
+{
+	value = Nucleus::createConstantFloat(f);
+	RR_DEBUG_INFO_EMIT_VAR(value);
+}
 
-	template<class T>
-	RValue<T>::RValue(const Reference<T> &ref)
-	{
-		value = ref.loadValue();
-		RR_DEBUG_INFO_EMIT_VAR(value);
-	}
+template<class T>
+RValue<T>::RValue(const Reference<T> &ref)
+{
+	value = ref.loadValue();
+	RR_DEBUG_INFO_EMIT_VAR(value);
+}
 
-	template<class Vector4, int T>
-	Swizzle2<Vector4, T>::operator RValue<Vector4>() const
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *vector = parent->loadValue();
+template<class Vector4, int T>
+Swizzle2<Vector4, T>::operator RValue<Vector4>() const
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *vector = parent->loadValue();
 
-		return Swizzle(RValue<Vector4>(vector), T);
-	}
+	return Swizzle(RValue<Vector4>(vector), T);
+}
 
-	template<class Vector4, int T>
-	Swizzle4<Vector4, T>::operator RValue<Vector4>() const
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *vector = parent->loadValue();
+template<class Vector4, int T>
+Swizzle4<Vector4, T>::operator RValue<Vector4>() const
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *vector = parent->loadValue();
 
-		return Swizzle(RValue<Vector4>(vector), T);
-	}
+	return Swizzle(RValue<Vector4>(vector), T);
+}
 
-	template<class Vector4, int T>
-	SwizzleMask4<Vector4, T>::operator RValue<Vector4>() const
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *vector = parent->loadValue();
+template<class Vector4, int T>
+SwizzleMask4<Vector4, T>::operator RValue<Vector4>() const
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *vector = parent->loadValue();
 
-		return Swizzle(RValue<Vector4>(vector), T);
-	}
+	return Swizzle(RValue<Vector4>(vector), T);
+}
 
-	template<class Vector4, int T>
-	RValue<Vector4> SwizzleMask4<Vector4, T>::operator=(RValue<Vector4> rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return Mask(*parent, rhs, T);
-	}
+template<class Vector4, int T>
+RValue<Vector4> SwizzleMask4<Vector4, T>::operator=(RValue<Vector4> rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return Mask(*parent, rhs, T);
+}
 
-	template<class Vector4, int T>
-	RValue<Vector4> SwizzleMask4<Vector4, T>::operator=(RValue<typename Scalar<Vector4>::Type> rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return Mask(*parent, Vector4(rhs), T);
-	}
+template<class Vector4, int T>
+RValue<Vector4> SwizzleMask4<Vector4, T>::operator=(RValue<typename Scalar<Vector4>::Type> rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return Mask(*parent, Vector4(rhs), T);
+}
 
-	template<class Vector4, int T>
-	SwizzleMask1<Vector4, T>::operator RValue<typename Scalar<Vector4>::Type>() const   // FIXME: Call a non-template function
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return Extract(*parent, T & 0x3);
-	}
+template<class Vector4, int T>
+SwizzleMask1<Vector4, T>::operator RValue<typename Scalar<Vector4>::Type>() const   // FIXME: Call a non-template function
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return Extract(*parent, T & 0x3);
+}
 
-	template<class Vector4, int T>
-	SwizzleMask1<Vector4, T>::operator RValue<Vector4>() const
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *vector = parent->loadValue();
+template<class Vector4, int T>
+SwizzleMask1<Vector4, T>::operator RValue<Vector4>() const
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *vector = parent->loadValue();
 
-		return Swizzle(RValue<Vector4>(vector), T);
-	}
+	return Swizzle(RValue<Vector4>(vector), T);
+}
 
-	template<class Vector4, int T>
-	RValue<Vector4> SwizzleMask1<Vector4, T>::operator=(float x)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return *parent = Insert(*parent, Float(x), T & 0x3);
-	}
+template<class Vector4, int T>
+RValue<Vector4> SwizzleMask1<Vector4, T>::operator=(float x)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return *parent = Insert(*parent, Float(x), T & 0x3);
+}
 
-	template<class Vector4, int T>
-	RValue<Vector4> SwizzleMask1<Vector4, T>::operator=(RValue<Vector4> rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return Mask(*parent, Float4(rhs), T);
-	}
+template<class Vector4, int T>
+RValue<Vector4> SwizzleMask1<Vector4, T>::operator=(RValue<Vector4> rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return Mask(*parent, Float4(rhs), T);
+}
 
-	template<class Vector4, int T>
-	RValue<Vector4> SwizzleMask1<Vector4, T>::operator=(RValue<typename Scalar<Vector4>::Type> rhs)   // FIXME: Call a non-template function
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return *parent = Insert(*parent, rhs, T & 0x3);
-	}
+template<class Vector4, int T>
+RValue<Vector4> SwizzleMask1<Vector4, T>::operator=(RValue<typename Scalar<Vector4>::Type> rhs)   // FIXME: Call a non-template function
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return *parent = Insert(*parent, rhs, T & 0x3);
+}
 
-	template<class Vector4, int T>
-	SwizzleMask2<Vector4, T>::operator RValue<Vector4>() const
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *vector = parent->loadValue();
+template<class Vector4, int T>
+SwizzleMask2<Vector4, T>::operator RValue<Vector4>() const
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *vector = parent->loadValue();
 
-		return Swizzle(RValue<Float4>(vector), T);
-	}
+	return Swizzle(RValue<Float4>(vector), T);
+}
 
-	template<class Vector4, int T>
-	RValue<Vector4> SwizzleMask2<Vector4, T>::operator=(RValue<Vector4> rhs)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return Mask(*parent, Float4(rhs), T);
-	}
+template<class Vector4, int T>
+RValue<Vector4> SwizzleMask2<Vector4, T>::operator=(RValue<Vector4> rhs)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return Mask(*parent, Float4(rhs), T);
+}
 
-	template<int T>
-	Float::Float(const SwizzleMask1<Float4, T> &rhs)
-	{
-		*this = rhs.operator RValue<Float>();
-	}
+template<int T>
+Float::Float(const SwizzleMask1<Float4, T> &rhs)
+{
+	*this = rhs.operator RValue<Float>();
+}
 
-	template<int T>
-	RValue<Float> Float::operator=(const SwizzleMask1<Float4, T> &rhs)
-	{
-		return *this = rhs.operator RValue<Float>();
-	}
+template<int T>
+RValue<Float> Float::operator=(const SwizzleMask1<Float4, T> &rhs)
+{
+	return *this = rhs.operator RValue<Float>();
+}
 
-	template<int T>
-	Float4::Float4(const SwizzleMask1<Float4, T> &rhs) : XYZW(this)
-	{
-		*this = rhs.operator RValue<Float4>();
-	}
+template<int T>
+Float4::Float4(const SwizzleMask1<Float4, T> &rhs) : XYZW(this)
+{
+	*this = rhs.operator RValue<Float4>();
+}
 
-	template<int T>
-	Float4::Float4(const Swizzle4<Float4, T> &rhs) : XYZW(this)
-	{
-		*this = rhs.operator RValue<Float4>();
-	}
+template<int T>
+Float4::Float4(const Swizzle4<Float4, T> &rhs) : XYZW(this)
+{
+	*this = rhs.operator RValue<Float4>();
+}
 
-	template<int X, int Y>
-	Float4::Float4(const Swizzle2<Float4, X> &x, const Swizzle2<Float4, Y> &y) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		*this = ShuffleLowHigh(*x.parent, *y.parent, (uint16_t(X) & 0xFF00u) | (uint16_t(Y >> 8) & 0x00FFu));
-	}
+template<int X, int Y>
+Float4::Float4(const Swizzle2<Float4, X> &x, const Swizzle2<Float4, Y> &y) : XYZW(this)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	*this = ShuffleLowHigh(*x.parent, *y.parent, (uint16_t(X) & 0xFF00u) | (uint16_t(Y >> 8) & 0x00FFu));
+}
 
-	template<int X, int Y>
-	Float4::Float4(const SwizzleMask2<Float4, X> &x, const Swizzle2<Float4, Y> &y) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		*this = ShuffleLowHigh(*x.parent, *y.parent, (uint16_t(X) & 0xFF00u) | (uint16_t(Y >> 8) & 0x00FFu));
-	}
+template<int X, int Y>
+Float4::Float4(const SwizzleMask2<Float4, X> &x, const Swizzle2<Float4, Y> &y) : XYZW(this)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	*this = ShuffleLowHigh(*x.parent, *y.parent, (uint16_t(X) & 0xFF00u) | (uint16_t(Y >> 8) & 0x00FFu));
+}
 
-	template<int X, int Y>
-	Float4::Float4(const Swizzle2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		*this = ShuffleLowHigh(*x.parent, *y.parent, (uint16_t(X) & 0xFF00u) | (uint16_t(Y >> 8) & 0x00FFu));
-	}
+template<int X, int Y>
+Float4::Float4(const Swizzle2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y) : XYZW(this)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	*this = ShuffleLowHigh(*x.parent, *y.parent, (uint16_t(X) & 0xFF00u) | (uint16_t(Y >> 8) & 0x00FFu));
+}
 
-	template<int X, int Y>
-	Float4::Float4(const SwizzleMask2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y) : XYZW(this)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		*this = ShuffleLowHigh(*x.parent, *y.parent, (uint16_t(X) & 0xFF00u) | (uint16_t(Y >> 8) & 0x00FFu));
-	}
+template<int X, int Y>
+Float4::Float4(const SwizzleMask2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y) : XYZW(this)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	*this = ShuffleLowHigh(*x.parent, *y.parent, (uint16_t(X) & 0xFF00u) | (uint16_t(Y >> 8) & 0x00FFu));
+}
 
-	template<int T>
-	RValue<Float4> Float4::operator=(const SwizzleMask1<Float4, T> &rhs)
-	{
-		return *this = rhs.operator RValue<Float4>();
-	}
+template<int T>
+RValue<Float4> Float4::operator=(const SwizzleMask1<Float4, T> &rhs)
+{
+	return *this = rhs.operator RValue<Float4>();
+}
 
-	template<int T>
-	RValue<Float4> Float4::operator=(const Swizzle4<Float4, T> &rhs)
-	{
-		return *this = rhs.operator RValue<Float4>();
-	}
+template<int T>
+RValue<Float4> Float4::operator=(const Swizzle4<Float4, T> &rhs)
+{
+	return *this = rhs.operator RValue<Float4>();
+}
 
-	// Returns a reactor pointer to the fixed-address ptr.
-	RValue<Pointer<Byte>> ConstantPointer(void const * ptr);
+// Returns a reactor pointer to the fixed-address ptr.
+RValue<Pointer<Byte>> ConstantPointer(void const * ptr);
 
-	// Returns a reactor pointer to an immutable copy of the data of size bytes.
-	RValue<Pointer<Byte>> ConstantData(void const * data, size_t size);
+// Returns a reactor pointer to an immutable copy of the data of size bytes.
+RValue<Pointer<Byte>> ConstantData(void const * data, size_t size);
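+
+// Example (sketch): baking a host-side table into the routine as an immutable
+// copy:
+//
+//   static const float kWeights[4] = { 0.125f, 0.375f, 0.375f, 0.125f };
+//   Pointer<Byte> weights = ConstantData(kWeights, sizeof(kWeights));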
 
-	template<class T>
-	Pointer<T>::Pointer(Argument<Pointer<T>> argument) : alignment(1)
-	{
-		LValue<Pointer<T>>::storeValue(argument.value);
-	}
+template<class T>
+Pointer<T>::Pointer(Argument<Pointer<T>> argument) : alignment(1)
+{
+	LValue<Pointer<T>>::storeValue(argument.value);
+}
 
-	template<class T>
-	Pointer<T>::Pointer() : alignment(1) {}
+template<class T>
+Pointer<T>::Pointer() : alignment(1) {}
 
-	template<class T>
-	Pointer<T>::Pointer(RValue<Pointer<T>> rhs) : alignment(1)
-	{
-		LValue<Pointer<T>>::storeValue(rhs.value);
-	}
+template<class T>
+Pointer<T>::Pointer(RValue<Pointer<T>> rhs) : alignment(1)
+{
+	LValue<Pointer<T>>::storeValue(rhs.value);
+}
 
-	template<class T>
-	Pointer<T>::Pointer(const Pointer<T> &rhs) : alignment(rhs.alignment)
-	{
-		Value *value = rhs.loadValue();
-		LValue<Pointer<T>>::storeValue(value);
-	}
+template<class T>
+Pointer<T>::Pointer(const Pointer<T> &rhs) : alignment(rhs.alignment)
+{
+	Value *value = rhs.loadValue();
+	LValue<Pointer<T>>::storeValue(value);
+}
 
-	template<class T>
-	Pointer<T>::Pointer(const Reference<Pointer<T>> &rhs) : alignment(rhs.getAlignment())
-	{
-		Value *value = rhs.loadValue();
-		LValue<Pointer<T>>::storeValue(value);
-	}
+template<class T>
+Pointer<T>::Pointer(const Reference<Pointer<T>> &rhs) : alignment(rhs.getAlignment())
+{
+	Value *value = rhs.loadValue();
+	LValue<Pointer<T>>::storeValue(value);
+}
 
-	template<class T>
-	Pointer<T>::Pointer(std::nullptr_t) : alignment(1)
-	{
-		Value *value = Nucleus::createNullPointer(T::getType());
-		LValue<Pointer<T>>::storeValue(value);
-	}
+template<class T>
+Pointer<T>::Pointer(std::nullptr_t) : alignment(1)
+{
+	Value *value = Nucleus::createNullPointer(T::getType());
+	LValue<Pointer<T>>::storeValue(value);
+}
 
-	template<class T>
-	RValue<Pointer<T>> Pointer<T>::operator=(RValue<Pointer<T>> rhs)
-	{
-		LValue<Pointer<T>>::storeValue(rhs.value);
+template<class T>
+RValue<Pointer<T>> Pointer<T>::operator=(RValue<Pointer<T>> rhs)
+{
+	LValue<Pointer<T>>::storeValue(rhs.value);
 
-		return rhs;
-	}
+	return rhs;
+}
 
-	template<class T>
-	RValue<Pointer<T>> Pointer<T>::operator=(const Pointer<T> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		LValue<Pointer<T>>::storeValue(value);
+template<class T>
+RValue<Pointer<T>> Pointer<T>::operator=(const Pointer<T> &rhs)
+{
+	Value *value = rhs.loadValue();
+	LValue<Pointer<T>>::storeValue(value);
 
-		return RValue<Pointer<T>>(value);
-	}
+	return RValue<Pointer<T>>(value);
+}
 
-	template<class T>
-	RValue<Pointer<T>> Pointer<T>::operator=(const Reference<Pointer<T>> &rhs)
-	{
-		Value *value = rhs.loadValue();
-		LValue<Pointer<T>>::storeValue(value);
+template<class T>
+RValue<Pointer<T>> Pointer<T>::operator=(const Reference<Pointer<T>> &rhs)
+{
+	Value *value = rhs.loadValue();
+	LValue<Pointer<T>>::storeValue(value);
 
-		return RValue<Pointer<T>>(value);
-	}
+	return RValue<Pointer<T>>(value);
+}
 
-	template<class T>
-	RValue<Pointer<T>> Pointer<T>::operator=(std::nullptr_t)
-	{
-		Value *value = Nucleus::createNullPointer(T::getType());
-		LValue<Pointer<T>>::storeValue(value);
+template<class T>
+RValue<Pointer<T>> Pointer<T>::operator=(std::nullptr_t)
+{
+	Value *value = Nucleus::createNullPointer(T::getType());
+	LValue<Pointer<T>>::storeValue(value);
 
-		return RValue<Pointer<T>>(this);
-	}
+	return RValue<Pointer<T>>(value);
+}
 
-	template<class T>
-	Reference<T> Pointer<T>::operator*()
-	{
-		return Reference<T>(LValue<Pointer<T>>::loadValue(), alignment);
-	}
+template<class T>
+Reference<T> Pointer<T>::operator*()
+{
+	return Reference<T>(LValue<Pointer<T>>::loadValue(), alignment);
+}
 
-	template<class T>
-	Reference<T> Pointer<T>::operator[](int index)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *element = Nucleus::createGEP(LValue<Pointer<T>>::loadValue(), T::getType(), Nucleus::createConstantInt(index), false);
+template<class T>
+Reference<T> Pointer<T>::operator[](int index)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *element = Nucleus::createGEP(LValue<Pointer<T>>::loadValue(), T::getType(), Nucleus::createConstantInt(index), false);
 
-		return Reference<T>(element, alignment);
-	}
+	return Reference<T>(element, alignment);
+}
 
-	template<class T>
-	Reference<T> Pointer<T>::operator[](unsigned int index)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *element = Nucleus::createGEP(LValue<Pointer<T>>::loadValue(), T::getType(), Nucleus::createConstantInt(index), true);
+template<class T>
+Reference<T> Pointer<T>::operator[](unsigned int index)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *element = Nucleus::createGEP(LValue<Pointer<T>>::loadValue(), T::getType(), Nucleus::createConstantInt(index), true);
 
-		return Reference<T>(element, alignment);
-	}
+	return Reference<T>(element, alignment);
+}
 
-	template<class T>
-	Reference<T> Pointer<T>::operator[](RValue<Int> index)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *element = Nucleus::createGEP(LValue<Pointer<T>>::loadValue(), T::getType(), index.value, false);
+template<class T>
+Reference<T> Pointer<T>::operator[](RValue<Int> index)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *element = Nucleus::createGEP(LValue<Pointer<T>>::loadValue(), T::getType(), index.value, false);
 
-		return Reference<T>(element, alignment);
-	}
+	return Reference<T>(element, alignment);
+}
 
-	template<class T>
-	Reference<T> Pointer<T>::operator[](RValue<UInt> index)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *element = Nucleus::createGEP(LValue<Pointer<T>>::loadValue(), T::getType(), index.value, true);
+template<class T>
+Reference<T> Pointer<T>::operator[](RValue<UInt> index)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *element = Nucleus::createGEP(LValue<Pointer<T>>::loadValue(), T::getType(), index.value, true);
 
-		return Reference<T>(element, alignment);
-	}
+	return Reference<T>(element, alignment);
+}
 
-	template<class T>
-	Type *Pointer<T>::getType()
-	{
-		return Nucleus::getPointerType(T::getType());
-	}
+template<class T>
+Type *Pointer<T>::getType()
+{
+	return Nucleus::getPointerType(T::getType());
+}
 
-	template<class T, int S>
-	Array<T, S>::Array(int size) : LValue<T>(size)
-	{
-	}
+template<class T, int S>
+Array<T, S>::Array(int size) : LValue<T>(size)
+{
+}
 
-	template<class T, int S>
-	Reference<T> Array<T, S>::operator[](int index)
-	{
-		assert(index < this->arraySize);
-		Value *element = LValue<T>::getElementPointer(Nucleus::createConstantInt(index), false);
+template<class T, int S>
+Reference<T> Array<T, S>::operator[](int index)
+{
+	assert(index < this->arraySize);
+	Value *element = LValue<T>::getElementPointer(Nucleus::createConstantInt(index), false);
 
-		return Reference<T>(element);
-	}
+	return Reference<T>(element);
+}
 
-	template<class T, int S>
-	Reference<T> Array<T, S>::operator[](unsigned int index)
-	{
-		assert(index < static_cast<unsigned int>(this->arraySize));
-		Value *element = LValue<T>::getElementPointer(Nucleus::createConstantInt(index), true);
+template<class T, int S>
+Reference<T> Array<T, S>::operator[](unsigned int index)
+{
+	assert(index < static_cast<unsigned int>(this->arraySize));
+	Value *element = LValue<T>::getElementPointer(Nucleus::createConstantInt(index), true);
 
-		return Reference<T>(element);
-	}
+	return Reference<T>(element);
+}
 
-	template<class T, int S>
-	Reference<T> Array<T, S>::operator[](RValue<Int> index)
-	{
-		Value *element = LValue<T>::getElementPointer(index.value, false);
+template<class T, int S>
+Reference<T> Array<T, S>::operator[](RValue<Int> index)
+{
+	Value *element = LValue<T>::getElementPointer(index.value, false);
 
-		return Reference<T>(element);
-	}
+	return Reference<T>(element);
+}
 
-	template<class T, int S>
-	Reference<T> Array<T, S>::operator[](RValue<UInt> index)
-	{
-		Value *element = LValue<T>::getElementPointer(index.value, true);
+template<class T, int S>
+Reference<T> Array<T, S>::operator[](RValue<UInt> index)
+{
+	Value *element = LValue<T>::getElementPointer(index.value, true);
 
-		return Reference<T>(element);
-	}
+	return Reference<T>(element);
+}
 
 //	template<class T>
 //	RValue<Array<T>> operator++(Array<T> &val, int)
@@ -3067,404 +3070,411 @@
 //		// FIXME: Requires storing the address of the array
 //	}
 
-	template<class T>
-	RValue<T> IfThenElse(RValue<Bool> condition, RValue<T> ifTrue, RValue<T> ifFalse)
+template<class T>
+RValue<T> IfThenElse(RValue<Bool> condition, RValue<T> ifTrue, RValue<T> ifFalse)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<T>(Nucleus::createSelect(condition.value, ifTrue.value, ifFalse.value));
+}
+
+template<class T>
+RValue<T> IfThenElse(RValue<Bool> condition, const T &ifTrue, RValue<T> ifFalse)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *trueValue = ifTrue.loadValue();
+
+	return RValue<T>(Nucleus::createSelect(condition.value, trueValue, ifFalse.value));
+}
+
+template<class T>
+RValue<T> IfThenElse(RValue<Bool> condition, RValue<T> ifTrue, const T &ifFalse)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *falseValue = ifFalse.loadValue();
+
+	return RValue<T>(Nucleus::createSelect(condition.value, ifTrue.value, falseValue));
+}
+
+template<class T>
+RValue<T> IfThenElse(RValue<Bool> condition, const T &ifTrue, const T &ifFalse)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *trueValue = ifTrue.loadValue();
+	Value *falseValue = ifFalse.loadValue();
+
+	return RValue<T>(Nucleus::createSelect(condition.value, trueValue, falseValue));
+}
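+
+// Example (sketch; uses the scalar comparison operators declared earlier in
+// this header): a branchless maximum of two Int values:
+//
+//   Int m = IfThenElse(a > b, a, b);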
+
+template<typename Return, typename... Arguments>
+Function<Return(Arguments...)>::Function()
+{
+	core = new Nucleus();
+
+	Type *types[] = {Arguments::getType()...};
+	for(Type *type : types)
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<T>(Nucleus::createSelect(condition.value, ifTrue.value, ifFalse.value));
-	}
-
-	template<class T>
-	RValue<T> IfThenElse(RValue<Bool> condition, const T &ifTrue, RValue<T> ifFalse)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *trueValue = ifTrue.loadValue();
-
-		return RValue<T>(Nucleus::createSelect(condition.value, trueValue, ifFalse.value));
-	}
-
-	template<class T>
-	RValue<T> IfThenElse(RValue<Bool> condition, RValue<T> ifTrue, const T &ifFalse)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *falseValue = ifFalse.loadValue();
-
-		return RValue<T>(Nucleus::createSelect(condition.value, ifTrue.value, falseValue));
-	}
-
-	template<class T>
-	RValue<T> IfThenElse(RValue<Bool> condition, const T &ifTrue, const T &ifFalse)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *trueValue = ifTrue.loadValue();
-		Value *falseValue = ifFalse.loadValue();
-
-		return RValue<T>(Nucleus::createSelect(condition.value, trueValue, falseValue));
-	}
-
-	template<typename Return, typename... Arguments>
-	Function<Return(Arguments...)>::Function()
-	{
-		core = new Nucleus();
-
-		Type *types[] = {Arguments::getType()...};
-		for(Type *type : types)
+		if(type != Void::getType())
 		{
-			if(type != Void::getType())
-			{
-				arguments.push_back(type);
-			}
+			arguments.push_back(type);
 		}
-
-		Nucleus::createFunction(Return::getType(), arguments);
 	}
 
-	template<typename Return, typename... Arguments>
-	Function<Return(Arguments...)>::~Function()
+	Nucleus::createFunction(Return::getType(), arguments);
+}
+
+template<typename Return, typename... Arguments>
+Function<Return(Arguments...)>::~Function()
+{
+	delete core;
+}
+
+template<typename Return, typename... Arguments>
+std::shared_ptr<Routine> Function<Return(Arguments...)>::operator()(const char *name, ...)
+{
+	char fullName[1024 + 1];
+
+	va_list vararg;
+	va_start(vararg, name);
+	vsnprintf(fullName, 1024, name, vararg);
+	va_end(vararg);
+
+	return core->acquireRoutine(fullName, Config::Edit::None);
+}
+
+template<typename Return, typename... Arguments>
+std::shared_ptr<Routine> Function<Return(Arguments...)>::operator()(const Config::Edit &cfg, const char *name, ...)
+{
+	char fullName[1024 + 1];
+
+	va_list vararg;
+	va_start(vararg, name);
+	vsnprintf(fullName, 1024, name, vararg);
+	va_end(vararg);
+
+	return core->acquireRoutine(fullName, cfg);
+}
+
+template<class T, class S>
+RValue<T> ReinterpretCast(RValue<S> val)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<T>(Nucleus::createBitCast(val.value, T::getType()));
+}
+
+template<class T, class S>
+RValue<T> ReinterpretCast(const LValue<S> &var)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	Value *val = var.loadValue();
+
+	return RValue<T>(Nucleus::createBitCast(val, T::getType()));
+}
+
+template<class T, class S>
+RValue<T> ReinterpretCast(const Reference<S> &var)
+{
+	return ReinterpretCast<T>(RValue<S>(var));
+}
+
+template<class T>
+RValue<T> As(Value *val)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return RValue<T>(Nucleus::createBitCast(val, T::getType()));
+}
+
+template<class T, class S>
+RValue<T> As(RValue<S> val)
+{
+	return ReinterpretCast<T>(val);
+}
+
+template<class T, class S>
+RValue<T> As(const LValue<S> &var)
+{
+	return ReinterpretCast<T>(var);
+}
+
+template<class T, class S>
+RValue<T> As(const Reference<S> &val)
+{
+	return ReinterpretCast<T>(val);
+}
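+
+// Example (sketch): bit-level reinterpretation without value conversion,
+// e.g. to inspect the IEEE-754 encoding of each lane ('f' is a Float4):
+//
+//   Int4 bits = As<Int4>(f);  // equivalent to ReinterpretCast<Int4>(f)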
+
+// Calls the function pointer fptr with the given arguments, return type
+// and parameter types. Returns the call's return value if the function has
+// a non-void return type.
+Value* Call(RValue<Pointer<Byte>> fptr, Type* retTy, std::initializer_list<Value*> args, std::initializer_list<Type*> paramTys);
+
+template <typename F>
+class CallHelper {};
+
+template<typename Return, typename ... Arguments>
+class CallHelper<Return(Arguments...)>
+{
+public:
+	using RReturn = CToReactorT<Return>;
+
+	static inline RReturn Call(Return(fptr)(Arguments...), CToReactorT<Arguments>... args)
 	{
-		delete core;
+		return RValue<RReturn>(rr::Call(
+			ConstantPointer(reinterpret_cast<void*>(fptr)),
+			RReturn::getType(),
+			{ ValueOf(args) ... },
+			{ CToReactorT<Arguments>::getType() ... }));
 	}
 
-	template<typename Return, typename... Arguments>
-	std::shared_ptr<Routine> Function<Return(Arguments...)>::operator()(const char *name, ...)
+	static inline RReturn Call(Pointer<Byte> fptr, CToReactorT<Arguments>... args)
 	{
-		char fullName[1024 + 1];
+		return RValue<RReturn>(rr::Call(
+			fptr,
+			RReturn::getType(),
+			{ ValueOf(args) ... },
+			{ CToReactorT<Arguments>::getType() ... }));
+	}
+};
 
-		va_list vararg;
-		va_start(vararg, name);
-		vsnprintf(fullName, 1024, name, vararg);
-		va_end(vararg);
-
-		return core->acquireRoutine(fullName, Config::Edit::None);
+template<typename ... Arguments>
+class CallHelper<void(Arguments...)>
+{
+public:
+	static inline void Call(void(fptr)(Arguments...), CToReactorT<Arguments>... args)
+	{
+		rr::Call(ConstantPointer(reinterpret_cast<void*>(fptr)),
+			Void::getType(),
+			{ ValueOf(args) ... },
+			{ CToReactorT<Arguments>::getType() ... });
 	}
 
-	template<typename Return, typename... Arguments>
-	std::shared_ptr<Routine> Function<Return(Arguments...)>::operator()(const Config::Edit &cfg, const char *name, ...)
+	static inline void Call(Pointer<Byte> fptr, CToReactorT<Arguments>... args)
 	{
-		char fullName[1024 + 1];
-
-		va_list vararg;
-		va_start(vararg, name);
-		vsnprintf(fullName, 1024, name, vararg);
-		va_end(vararg);
-
-		return core->acquireRoutine(fullName, cfg);
+		rr::Call(fptr,
+			Void::getType(),
+			{ ValueOf(args) ... },
+			{ CToReactorT<Arguments>::getType() ... });
 	}
+};
 
-	template<class T, class S>
-	RValue<T> ReinterpretCast(RValue<S> val)
+template <typename T>
+inline ReactorTypeT<T> CastToReactor(const T& v) { return ReactorType<T>::cast(v); }
+
+// Calls the static function pointer fptr with the given arguments args.
+template<typename Return, typename ... CArgs, typename ... RArgs>
+inline CToReactorT<Return> Call(Return(fptr)(CArgs...), RArgs&&... args)
+{
+	return CallHelper<Return(CArgs...)>::Call(fptr, CastToReactor(std::forward<RArgs>(args))...);
+}
+
+// Calls the static function pointer fptr with the given arguments args.
+// Overload for calling functions with void return type.
+template<typename ... CArgs, typename ... RArgs>
+inline void Call(void(fptr)(CArgs...), RArgs&&... args)
+{
+	CallHelper<void(CArgs...)>::Call(fptr, CastToReactor(std::forward<RArgs>(args))...);
+}
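+
+// Example (sketch; assumes CToReactorT maps 'float' to Float): calling back
+// into a host function from generated code:
+//
+//   float clampf(float x) { return x < 0.0f ? 0.0f : x > 1.0f ? 1.0f : x; }
+//   // ... inside a routine body ('x' is a Float):
+//   Float y = Call(clampf, x);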
+
+// Calls the member function pointer fptr with the given arguments args.
+// object can be a Class*, or a Pointer<Byte>.
+template<typename Return, typename Class, typename C, typename ... CArgs, typename ... RArgs>
+inline CToReactorT<Return> Call(Return(Class::* fptr)(CArgs...), C&& object, RArgs&&... args)
+{
+	using Helper = CallHelper<Return(Class*, void*, CArgs...)>;
+	using fptrTy = decltype(fptr);
+
+	struct Static
 	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<T>(Nucleus::createBitCast(val.value, T::getType()));
-	}
-
-	template<class T, class S>
-	RValue<T> ReinterpretCast(const LValue<S> &var)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		Value *val = var.loadValue();
-
-		return RValue<T>(Nucleus::createBitCast(val, T::getType()));
-	}
-
-	template<class T, class S>
-	RValue<T> ReinterpretCast(const Reference<S> &var)
-	{
-		return ReinterpretCast<T>(RValue<S>(var));
-	}
-
-	template<class T>
-	RValue<T> As(Value *val)
-	{
-		RR_DEBUG_INFO_UPDATE_LOC();
-		return RValue<T>(Nucleus::createBitCast(val, T::getType()));
-	}
-
-	template<class T, class S>
-	RValue<T> As(RValue<S> val)
-	{
-		return ReinterpretCast<T>(val);
-	}
-
-	template<class T, class S>
-	RValue<T> As(const LValue<S> &var)
-	{
-		return ReinterpretCast<T>(var);
-	}
-
-	template<class T, class S>
-	RValue<T> As(const Reference<S> &val)
-	{
-		return ReinterpretCast<T>(val);
-	}
-
-	// Calls the function pointer fptr with the given arguments, return type
-	// and parameter types. Returns the call's return value if the function has
-	// a non-void return type.
-	Value* Call(RValue<Pointer<Byte>> fptr, Type* retTy, std::initializer_list<Value*> args, std::initializer_list<Type*> paramTys);
-
-	template <typename F>
-	class CallHelper {};
-
-	template<typename Return, typename ... Arguments>
-	class CallHelper<Return(Arguments...)>
-	{
-	public:
-		using RReturn = CToReactorT<Return>;
-
-		static inline RReturn Call(Return(fptr)(Arguments...), CToReactorT<Arguments>... args)
+		static inline Return Call(Class* object, void* fptrptr, CArgs... args)
 		{
-			return RValue<RReturn>(rr::Call(
-				ConstantPointer(reinterpret_cast<void*>(fptr)),
-				RReturn::getType(),
-				{ ValueOf(args) ... },
-				{ CToReactorT<Arguments>::getType() ... }));
-		}
-
-		static inline RReturn Call(Pointer<Byte> fptr, CToReactorT<Arguments>... args)
-		{
-			return RValue<RReturn>(rr::Call(
-				fptr,
-				RReturn::getType(),
-				{ ValueOf(args) ... },
-				{ CToReactorT<Arguments>::getType() ... }));
+			auto fptr = *reinterpret_cast<fptrTy*>(fptrptr);
+			return (object->*fptr)(std::forward<CArgs>(args)...);
 		}
 	};
 
-	template<typename ... Arguments>
-	class CallHelper<void(Arguments...)>
-	{
-	public:
-		static inline void Call(void(fptr)(Arguments...), CToReactorT<Arguments>... args)
-		{
-			rr::Call(ConstantPointer(reinterpret_cast<void*>(fptr)),
-				Void::getType(),
-				{ ValueOf(args) ... },
-				{ CToReactorT<Arguments>::getType() ... });
-		}
+	return Helper::Call(&Static::Call,
+	                    CastToReactor(object),
+	                    ConstantData(&fptr, sizeof(fptr)),
+	                    CastToReactor(std::forward<RArgs>(args))...);
+}
 
-		static inline void Call(Pointer<Byte> fptr, CToReactorT<Arguments>... args)
+// Calls the member function pointer fptr with the given arguments args.
+// Overload for calling functions with void return type.
+// object can be a Class*, or a Pointer<Byte>.
+template<typename Class, typename C, typename ... CArgs, typename ... RArgs>
+inline void Call(void(Class::* fptr)(CArgs...), C&& object, RArgs&&... args)
+{
+	using Helper = CallHelper<void(Class*, void*, CArgs...)>;
+	using fptrTy = decltype(fptr);
+
+	struct Static
+	{
+		static inline void Call(Class* object, void* fptrptr, CArgs... args)
 		{
-			rr::Call(fptr,
-				Void::getType(),
-				{ ValueOf(args) ... },
-				{ CToReactorT<Arguments>::getType() ... });
+			auto fptr = *reinterpret_cast<fptrTy*>(fptrptr);
+			(object->*fptr)(std::forward<CArgs>(args)...);
 		}
 	};
 
-	template <typename T>
-	inline ReactorTypeT<T> CastToReactor(const T& v) { return ReactorType<T>::cast(v); }
+	Helper::Call(&Static::Call,
+	             CastToReactor(object),
+	             ConstantData(&fptr, sizeof(fptr)),
+	             CastToReactor(std::forward<RArgs>(args))...);
+}
 
-	// Calls the static function pointer fptr with the given arguments args.
-	template<typename Return, typename ... CArgs, typename ... RArgs>
-	inline CToReactorT<Return> Call(Return(fptr)(CArgs...), RArgs&&... args)
+// Calls the Reactor function pointer fptr with the signature
+// FUNCTION_SIGNATURE and arguments.
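+// Illustrative: Call<void(int)>(fnPtr, Int(7)); note that this overload
+// discards any non-void return value.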
+template<typename FUNCTION_SIGNATURE, typename ... RArgs>
+inline void Call(Pointer<Byte> fptr, RArgs&& ... args)
+{
+	CallHelper<FUNCTION_SIGNATURE>::Call(fptr, CastToReactor(std::forward<RArgs>(args))...);
+}
+
+// Breakpoint emits an instruction that will cause the application to trap.
+// This can be used to stop an attached debugger at the given call.
+void Breakpoint();
+
+class ForData
+{
+public:
+	ForData(bool init) : loopOnce(init)
 	{
-		return CallHelper<Return(CArgs...)>::Call(fptr, CastToReactor(std::forward<RArgs>(args))...);
 	}
 
-	// Calls the static function pointer fptr with the given arguments args.
-	// Overload for calling functions with void return type.
-	template<typename ... CArgs, typename ... RArgs>
-	inline void Call(void(fptr)(CArgs...), RArgs&&... args)
+	operator bool()
 	{
-		CallHelper<void(CArgs...)>::Call(fptr, CastToReactor(std::forward<RArgs>(args))...);
+		return loopOnce;
 	}
 
-	// Calls the member function pointer fptr with the given arguments args.
-	// object can be a Class*, or a Pointer<Byte>.
-	template<typename Return, typename Class, typename C, typename ... CArgs, typename ... RArgs>
-	inline CToReactorT<Return> Call(Return(Class::* fptr)(CArgs...), C&& object, RArgs&&... args)
+	bool operator=(bool value)
 	{
-		using Helper = CallHelper<Return(Class*, void*, CArgs...)>;
-		using fptrTy = decltype(fptr);
-		struct Static {
-			static inline Return Call(Class* object, void* fptrptr, CArgs... args)
-			{
-				auto fptr = *reinterpret_cast<fptrTy*>(fptrptr);
-				return (object->*fptr)(std::forward<CArgs>(args)...);
-			}
-		};
-		return Helper::Call(&Static::Call,
-		                    CastToReactor(object),
-		                    ConstantData(&fptr, sizeof(fptr)),
-		                    CastToReactor(std::forward<RArgs>(args))...);
+		return loopOnce = value;
 	}
 
-	// Calls the member function pointer fptr with the given arguments args.
-	// Overload for calling functions with void return type.
-	// object can be a Class*, or a Pointer<Byte>.
-	template<typename Class, typename C, typename ... CArgs, typename ... RArgs>
-	inline void Call(void(Class::* fptr)(CArgs...), C&& object, RArgs&&... args)
+	bool setup()
 	{
-		using Helper = CallHelper<void(Class*, void*, CArgs...)>;
-		using fptrTy = decltype(fptr);
-		struct Static {
-			static inline void Call(Class* object, void* fptrptr, CArgs... args)
-			{
-				auto fptr = *reinterpret_cast<fptrTy*>(fptrptr);
-				(object->*fptr)(std::forward<CArgs>(args)...);
-			}
-		};
-		Helper::Call(&Static::Call,
-		             CastToReactor(object),
-		             ConstantData(&fptr, sizeof(fptr)),
-		             CastToReactor(std::forward<RArgs>(args))...);
-	}
-
-	// Calls the Reactor function pointer fptr with the signature
-	// FUNCTION_SIGNATURE and arguments.
-	template<typename FUNCTION_SIGNATURE, typename ... RArgs>
-	inline void Call(Pointer<Byte> fptr, RArgs&& ... args)
-	{
-		CallHelper<FUNCTION_SIGNATURE>::Call(fptr, CastToReactor(std::forward<RArgs>(args))...);
-	}
-
-	// Breakpoint emits an instruction that will cause the application to trap.
-	// This can be used to stop an attached debugger at the given call.
-	void Breakpoint();
-
-	class ForData
-	{
-	public:
-		ForData(bool init) : loopOnce(init)
+		RR_DEBUG_INFO_FLUSH();
+		if(Nucleus::getInsertBlock() != endBB)
 		{
-		}
+			testBB = Nucleus::createBasicBlock();
 
-		operator bool()
-		{
-			return loopOnce;
-		}
-
-		bool operator=(bool value)
-		{
-			return loopOnce = value;
-		}
-
-		bool setup()
-		{
-			RR_DEBUG_INFO_FLUSH();
-			if(Nucleus::getInsertBlock() != endBB)
-			{
-				testBB = Nucleus::createBasicBlock();
-
-				Nucleus::createBr(testBB);
-				Nucleus::setInsertBlock(testBB);
-
-				return true;
-			}
-
-			return false;
-		}
-
-		bool test(RValue<Bool> cmp)
-		{
-			BasicBlock *bodyBB = Nucleus::createBasicBlock();
-			endBB = Nucleus::createBasicBlock();
-
-			Nucleus::createCondBr(cmp.value, bodyBB, endBB);
-			Nucleus::setInsertBlock(bodyBB);
+			Nucleus::createBr(testBB);
+			Nucleus::setInsertBlock(testBB);
 
 			return true;
 		}
 
-		void end()
-		{
-			Nucleus::createBr(testBB);
-			Nucleus::setInsertBlock(endBB);
-		}
-
-	private:
-		BasicBlock *testBB = nullptr;
-		BasicBlock *endBB = nullptr;
-		bool loopOnce = true;
-	};
-
-	class IfElseData
-	{
-	public:
-		IfElseData(RValue<Bool> cmp) : iteration(0)
-		{
-			condition = cmp.value;
-
-			beginBB = Nucleus::getInsertBlock();
-			trueBB = Nucleus::createBasicBlock();
-			falseBB = nullptr;
-			endBB = Nucleus::createBasicBlock();
-
-			Nucleus::setInsertBlock(trueBB);
-		}
-
-		~IfElseData()
-		{
-			Nucleus::createBr(endBB);
-
-			Nucleus::setInsertBlock(beginBB);
-			Nucleus::createCondBr(condition, trueBB, falseBB ? falseBB : endBB);
-
-			Nucleus::setInsertBlock(endBB);
-		}
-
-		operator int()
-		{
-			return iteration;
-		}
-
-		IfElseData &operator++()
-		{
-			++iteration;
-
-			return *this;
-		}
-
-		void elseClause()
-		{
-			Nucleus::createBr(endBB);
-
-			falseBB = Nucleus::createBasicBlock();
-			Nucleus::setInsertBlock(falseBB);
-		}
-
-	private:
-		Value *condition;
-		BasicBlock *beginBB;
-		BasicBlock *trueBB;
-		BasicBlock *falseBB;
-		BasicBlock *endBB;
-		int iteration;
-	};
-
-	#define For(init, cond, inc) \
-	for(ForData for__ = true; for__; for__ = false) \
-	for(init; for__.setup() && for__.test(cond); inc, for__.end())
-
-	#define While(cond) For((void)0, cond, (void)0)
-
-	#define Do                                            \
-	{                                                     \
-		BasicBlock *body__ = Nucleus::createBasicBlock(); \
-		Nucleus::createBr(body__);                        \
-		Nucleus::setInsertBlock(body__);
-
-	#define Until(cond)                                     \
-		BasicBlock *end__ = Nucleus::createBasicBlock();    \
-		Nucleus::createCondBr((cond).value, end__, body__); \
-		Nucleus::setInsertBlock(end__);                     \
+		return false;
 	}
 
-	enum {IF_BLOCK__, ELSE_CLAUSE__, ELSE_BLOCK__, IFELSE_NUM__};
+	bool test(RValue<Bool> cmp)
+	{
+		BasicBlock *bodyBB = Nucleus::createBasicBlock();
+		endBB = Nucleus::createBasicBlock();
 
-	#define If(cond)                                                    \
-	for(IfElseData ifElse__(cond); ifElse__ < IFELSE_NUM__; ++ifElse__) \
-	if(ifElse__ == IF_BLOCK__)
+		Nucleus::createCondBr(cmp.value, bodyBB, endBB);
+		Nucleus::setInsertBlock(bodyBB);
 
-	#define Else                       \
-	else if(ifElse__ == ELSE_CLAUSE__) \
-	{                                  \
-		 ifElse__.elseClause();        \
-	}                                  \
-	else   // ELSE_BLOCK__
+		return true;
+	}
+
+	void end()
+	{
+		Nucleus::createBr(testBB);
+		Nucleus::setInsertBlock(endBB);
+	}
+
+private:
+	BasicBlock *testBB = nullptr;
+	BasicBlock *endBB = nullptr;
+	bool loopOnce = true;
+};
+
+class IfElseData
+{
+public:
+	IfElseData(RValue<Bool> cmp) : iteration(0)
+	{
+		condition = cmp.value;
+
+		beginBB = Nucleus::getInsertBlock();
+		trueBB = Nucleus::createBasicBlock();
+		falseBB = nullptr;
+		endBB = Nucleus::createBasicBlock();
+
+		Nucleus::setInsertBlock(trueBB);
+	}
+
+	~IfElseData()
+	{
+		Nucleus::createBr(endBB);
+
+		Nucleus::setInsertBlock(beginBB);
+		Nucleus::createCondBr(condition, trueBB, falseBB ? falseBB : endBB);
+
+		Nucleus::setInsertBlock(endBB);
+	}
+
+	operator int()
+	{
+		return iteration;
+	}
+
+	IfElseData &operator++()
+	{
+		++iteration;
+
+		return *this;
+	}
+
+	void elseClause()
+	{
+		Nucleus::createBr(endBB);
+
+		falseBB = Nucleus::createBasicBlock();
+		Nucleus::setInsertBlock(falseBB);
+	}
+
+private:
+	Value *condition;
+	BasicBlock *beginBB;
+	BasicBlock *trueBB;
+	BasicBlock *falseBB;
+	BasicBlock *endBB;
+	int iteration;
+};
+
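+// The macros below expand to nested for-statements that drive the helper
+// classes above: ForData emits the test/body/end blocks for For and While,
+// and IfElseData emits the condition/true/false/end blocks for If and Else.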
+#define For(init, cond, inc) \
+for(ForData for__ = true; for__; for__ = false) \
+for(init; for__.setup() && for__.test(cond); inc, for__.end())
+
+#define While(cond) For((void)0, cond, (void)0)
+
+#define Do                                            \
+{                                                     \
+	BasicBlock *body__ = Nucleus::createBasicBlock(); \
+	Nucleus::createBr(body__);                        \
+	Nucleus::setInsertBlock(body__);
+
+#define Until(cond)                                     \
+	BasicBlock *end__ = Nucleus::createBasicBlock();    \
+	Nucleus::createCondBr((cond).value, end__, body__); \
+	Nucleus::setInsertBlock(end__);                     \
 }
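+
+// Illustrative: Do { x += 1; } Until(x == 4) emits the body first and
+// branches back while the condition is false, like a C do-while loop with
+// an inverted condition.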
 
+enum {IF_BLOCK__, ELSE_CLAUSE__, ELSE_BLOCK__, IFELSE_NUM__};
+
+#define If(cond)                                                    \
+for(IfElseData ifElse__(cond); ifElse__ < IFELSE_NUM__; ++ifElse__) \
+if(ifElse__ == IF_BLOCK__)
+
+#define Else                       \
+else if(ifElse__ == ELSE_CLAUSE__) \
+{                                  \
+	ifElse__.elseClause();         \
+}                                  \
+else  // ELSE_BLOCK__
+
+}  // namespace rr
+
 #include "Traits.inl"
 
 #endif   // rr_Reactor_hpp
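
For reference, the typed Call<> helpers and the control-flow macros above compose as in the following sketch (illustrative only: `clampToZero` and the variable names are hypothetical, but the pattern mirrors the unit tests below):

	int clampToZero(int x) { return x > 0 ? x : 0; }

	FunctionT<int(int)> function;
	{
		Int a = function.Arg<0>();
		Int sum = 0;

		For(Int i = 0, i < 4, i++)
		{
			If(i < 2)
			{
				sum += a;
			}
			Else
			{
				sum += Call(clampToZero, a - 8);  // emits a call back into the host function
			}
		}

		Return(sum);
	}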
diff --git a/src/Reactor/ReactorUnitTests.cpp b/src/Reactor/ReactorUnitTests.cpp
index b97894b..afcb6e3 100644
--- a/src/Reactor/ReactorUnitTests.cpp
+++ b/src/Reactor/ReactorUnitTests.cpp
@@ -89,262 +89,241 @@
 
 TEST(ReactorUnitTests, Sample)
 {
+	FunctionT<int(int*, int)> function;
 	{
-		FunctionT<int(int*, int)> function;
+		Pointer<Int> p = function.Arg<0>();
+		Int x = p[-1];
+		Int y = function.Arg<1>();
+		Int z = 4;
+
+		For(Int i = 0, i < 10, i++)
 		{
-			Pointer<Int> p = function.Arg<0>();
-			Int x = p[-1];
-			Int y = function.Arg<1>();
-			Int z = 4;
-
-			For(Int i = 0, i < 10, i++)
-			{
-				z += (2 << i) - (i / 3);
-			}
-
-			Float4 v;
-			v.z = As<Float>(z);
-			z = As<Int>(Float(Float4(v.xzxx).y));
-
-			Int sum = x + y + z;
-
-			Return(sum);
+			z += (2 << i) - (i / 3);
 		}
 
-		auto routine = function("one");
+		Float4 v;
+		v.z = As<Float>(z);
+		z = As<Int>(Float(Float4(v.xzxx).y));
 
-		if(routine)
-		{
-			int one[2] = {1, 0};
-			int result = routine(&one[1], 2);
-			EXPECT_EQ(result, reference(&one[1], 2));
-		}
+		Int sum = x + y + z;
+
+		Return(sum);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		int one[2] = {1, 0};
+		int result = routine(&one[1], 2);
+		EXPECT_EQ(result, reference(&one[1], 2));
+	}
 }
 
 TEST(ReactorUnitTests, Uninitialized)
 {
+	FunctionT<int()> function;
 	{
-		FunctionT<int()> function;
+		Int a;
+		Int z = 4;
+		Int q;
+		Int c;
+		Int p;
+		Bool b;
+
+		q += q;
+
+		If(b)
 		{
-			Int a;
-			Int z = 4;
-			Int q;
-			Int c;
-			Int p;
-			Bool b;
-
-			q += q;
-
-			If(b)
-			{
-				c = p;
-			}
-
-			Return(a + z + q + c);
+			c = p;
 		}
 
-		auto routine = function("one");
-
-		if(routine)
-		{
-			int result = routine();
-			EXPECT_EQ(result, result);   // Anything is fine, just don't crash
-		}
+		Return(a + z + q + c);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		int result = routine();
+		EXPECT_EQ(result, result);   // Anything is fine, just don't crash
+	}
 }
 
 TEST(ReactorUnitTests, Unreachable)
 {
+	FunctionT<int(int)> function;
 	{
-		FunctionT<int(int)> function;
-		{
-			Int a = function.Arg<0>();
-			Int z = 4;
+		Int a = function.Arg<0>();
+		Int z = 4;
 
-			Return(a + z);
+		Return(a + z);
 
-			// Code beyond this point is unreachable but should not cause any
-			// compilation issues.
+		// Code beyond this point is unreachable but should not cause any
+		// compilation issues.
 
-			z += a;
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			int result = routine(16);
-			EXPECT_EQ(result, 20);
-		}
+		z += a;
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		int result = routine(16);
+		EXPECT_EQ(result, 20);
+	}
 }
 
 TEST(ReactorUnitTests, VariableAddress)
 {
+	FunctionT<int(int)> function;
 	{
-		FunctionT<int(int)> function;
-		{
-			Int a = function.Arg<0>();
-			Int z = 0;
-			Pointer<Int> p = &z;
-			*p = 4;
+		Int a = function.Arg<0>();
+		Int z = 0;
+		Pointer<Int> p = &z;
+		*p = 4;
 
-			Return(a + z);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			int result = routine(16);
-			EXPECT_EQ(result, 20);
-		}
+		Return(a + z);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		int result = routine(16);
+		EXPECT_EQ(result, 20);
+	}
 }
 
 TEST(ReactorUnitTests, SubVectorLoadStore)
 {
+	FunctionT<int(void*, void*)> function;
 	{
-		FunctionT<int(void*, void*)> function;
+		Pointer<Byte> in = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<1>();
+
+		*Pointer<Int4>(out + 16 * 0)   = *Pointer<Int4>(in + 16 * 0);
+		*Pointer<Short4>(out + 16 * 1) = *Pointer<Short4>(in + 16 * 1);
+		*Pointer<Byte8>(out + 16 * 2)  = *Pointer<Byte8>(in + 16 * 2);
+		*Pointer<Byte4>(out + 16 * 3)  = *Pointer<Byte4>(in + 16 * 3);
+		*Pointer<Short2>(out + 16 * 4) = *Pointer<Short2>(in + 16 * 4);
+
+		Return(0);
+	}
+
+	auto routine = function("one");
+
+	if(routine)
+	{
+		int8_t in[16 * 5] = {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
+		                     17, 18, 19, 20, 21, 22, 23, 24,  0,  0,  0,  0,  0,  0,  0,  0,
+		                     25, 26, 27, 28, 29, 30, 31, 32,  0,  0,  0,  0,  0,  0,  0,  0,
+		                     33, 34, 35, 36,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+		                     37, 38, 39, 40,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0};
+
+		int8_t out[16 * 5] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+
+		routine(in, out);
+
+		for(int row = 0; row < 5; row++)
 		{
-			Pointer<Byte> in = function.Arg<0>();
-			Pointer<Byte> out = function.Arg<1>();
-
-			*Pointer<Int4>(out + 16 * 0)   = *Pointer<Int4>(in + 16 * 0);
-			*Pointer<Short4>(out + 16 * 1) = *Pointer<Short4>(in + 16 * 1);
-			*Pointer<Byte8>(out + 16 * 2)  = *Pointer<Byte8>(in + 16 * 2);
-			*Pointer<Byte4>(out + 16 * 3)  = *Pointer<Byte4>(in + 16 * 3);
-			*Pointer<Short2>(out + 16 * 4) = *Pointer<Short2>(in + 16 * 4);
-
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			int8_t in[16 * 5] = {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
-			                     17, 18, 19, 20, 21, 22, 23, 24,  0,  0,  0,  0,  0,  0,  0,  0,
-			                     25, 26, 27, 28, 29, 30, 31, 32,  0,  0,  0,  0,  0,  0,  0,  0,
-			                     33, 34, 35, 36,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-			                     37, 38, 39, 40,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0};
-
-			int8_t out[16 * 5] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
-
-			routine(in, out);
-
-			for(int row = 0; row < 5; row++)
+			for(int col = 0; col < 16; col++)
 			{
-				for(int col = 0; col < 16; col++)
-				{
-					int i = row * 16 + col;
+				int i = row * 16 + col;
 
-					if(in[i] ==  0)
-					{
-						EXPECT_EQ(out[i], -1) << "Row " << row << " column " << col <<  " not left untouched.";
-					}
-					else
-					{
-						EXPECT_EQ(out[i], in[i]) << "Row " << row << " column " << col << " not equal to input.";
-					}
+				if(in[i] ==  0)
+				{
+					EXPECT_EQ(out[i], -1) << "Row " << row << " column " << col <<  " not left untouched.";
+				}
+				else
+				{
+					EXPECT_EQ(out[i], in[i]) << "Row " << row << " column " << col << " not equal to input.";
 				}
 			}
 		}
 	}
-
 }
 
 TEST(ReactorUnitTests, VectorConstant)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
+		Pointer<Byte> out = function.Arg<0>();
+
+		*Pointer<Int4>(out + 16 * 0) = Int4(0x04030201, 0x08070605, 0x0C0B0A09, 0x100F0E0D);
+		*Pointer<Short4>(out + 16 * 1) = Short4(0x1211, 0x1413, 0x1615, 0x1817);
+		*Pointer<Byte8>(out + 16 * 2) = Byte8(0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20);
+		*Pointer<Int2>(out + 16 * 3) = Int2(0x24232221, 0x28272625);
+
+		Return(0);
+	}
+
+	auto routine = function("one");
+
+	if(routine)
+	{
+		int8_t out[16 * 4] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+
+		int8_t exp[16 * 4] = {1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
+		                      17, 18, 19, 20, 21, 22, 23, 24, -1, -1, -1, -1, -1, -1, -1, -1,
+		                      25, 26, 27, 28, 29, 30, 31, 32, -1, -1, -1, -1, -1, -1, -1, -1,
+		                      33, 34, 35, 36, 37, 38, 39, 40, -1, -1, -1, -1, -1, -1, -1, -1};
+
+		routine(out);
+
+		for(int row = 0; row < 4; row++)
 		{
-			Pointer<Byte> out = function.Arg<0>();
-
-			*Pointer<Int4>(out + 16 * 0) = Int4(0x04030201, 0x08070605, 0x0C0B0A09, 0x100F0E0D);
-			*Pointer<Short4>(out + 16 * 1) = Short4(0x1211, 0x1413, 0x1615, 0x1817);
-			*Pointer<Byte8>(out + 16 * 2) = Byte8(0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20);
-			*Pointer<Int2>(out + 16 * 3) = Int2(0x24232221, 0x28272625);
-
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			int8_t out[16 * 4] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
-
-			int8_t exp[16 * 4] = {1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
-			                      17, 18, 19, 20, 21, 22, 23, 24, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      25, 26, 27, 28, 29, 30, 31, 32, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      33, 34, 35, 36, 37, 38, 39, 40, -1, -1, -1, -1, -1, -1, -1, -1};
-
-			routine(out);
-
-			for(int row = 0; row < 4; row++)
+			for(int col = 0; col < 16; col++)
 			{
-				for(int col = 0; col < 16; col++)
-				{
-					int i = row * 16 + col;
+				int i = row * 16 + col;
 
-					EXPECT_EQ(out[i], exp[i]);
-				}
+				EXPECT_EQ(out[i], exp[i]);
 			}
 		}
 	}
-
 }
 
 TEST(ReactorUnitTests, Concatenate)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
+		Pointer<Byte> out = function.Arg<0>();
+
+		*Pointer<Int4>(out + 16 * 0)   = Int4(Int2(0x04030201, 0x08070605), Int2(0x0C0B0A09, 0x100F0E0D));
+		*Pointer<Short8>(out + 16 * 1) = Short8(Short4(0x0201, 0x0403, 0x0605, 0x0807), Short4(0x0A09, 0x0C0B, 0x0E0D, 0x100F));
+
+		Return(0);
+	}
+
+	auto routine = function("one");
+
+	if(routine)
+	{
+		int8_t ref[16 * 5] = {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
+		                      1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16};
+
+		int8_t out[16 * 5] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+
+		routine(out);
+
+		for(int row = 0; row < 2; row++)
 		{
-			Pointer<Byte> out = function.Arg<0>();
-
-			*Pointer<Int4>(out + 16 * 0)   = Int4(Int2(0x04030201, 0x08070605), Int2(0x0C0B0A09, 0x100F0E0D));
-			*Pointer<Short8>(out + 16 * 1) = Short8(Short4(0x0201, 0x0403, 0x0605, 0x0807), Short4(0x0A09, 0x0C0B, 0x0E0D, 0x100F));
-
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			int8_t ref[16 * 5] = {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
-			                      1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16};
-
-			int8_t out[16 * 5] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-			                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
-
-			routine(out);
-
-			for(int row = 0; row < 2; row++)
+			for(int col = 0; col < 16; col++)
 			{
-				for(int col = 0; col < 16; col++)
-				{
-					int i = row * 16 + col;
+				int i = row * 16 + col;
 
-					EXPECT_EQ(out[i], ref[i]) << "Row " << row << " column " << col << " not equal to reference.";
-				}
+				EXPECT_EQ(out[i], ref[i]) << "Row " << row << " column " << col << " not equal to reference.";
 			}
 		}
 	}
-
 }
 
 TEST(ReactorUnitTests, Swizzle)
@@ -476,7 +455,6 @@
 			}
 		}
 	}
-
 }
 
 TEST(ReactorUnitTests, Blend)
@@ -581,231 +559,223 @@
 
 TEST(ReactorUnitTests, Branching)
 {
+	FunctionT<int()> function;
 	{
-		FunctionT<int()> function;
+		Int x = 0;
+
+		For(Int i = 0, i < 8, i++)
 		{
-			Int x = 0;
-
-			For(Int i = 0, i < 8, i++)
+			If(i < 2)
 			{
-				If(i < 2)
-				{
-					x += 1;
-				}
-				Else If(i < 4)
-				{
-					x += 10;
-				}
-				Else If(i < 6)
-				{
-					x += 100;
-				}
-				Else
-				{
-					x += 1000;
-				}
-
-				For(Int i = 0, i < 5, i++)
-					x += 10000;
+				x += 1;
+			}
+			Else If(i < 4)
+			{
+				x += 10;
+			}
+			Else If(i < 6)
+			{
+				x += 100;
+			}
+			Else
+			{
+				x += 1000;
 			}
 
-			For(Int i = 0, i < 10, i++)
-				for(int i = 0; i < 10; i++)
-					For(Int i = 0, i < 10, i++)
-					{
-						x += 1000000;
-					}
+			For(Int i = 0, i < 5, i++)
+				x += 10000;
+		}
 
-			For(Int i = 0, i < 2, i++)
-				If(x == 1000402222)
+		For(Int i = 0, i < 10, i++)
+			for(int i = 0; i < 10; i++)
+				For(Int i = 0, i < 10, i++)
 				{
-					If(x != 1000402222)
-						x += 1000000000;
+					x += 1000000;
 				}
-				Else
-					x = -5;
 
-			Return(x);
-		}
+		For(Int i = 0, i < 2, i++)
+			If(x == 1000402222)
+			{
+				If(x != 1000402222)
+					x += 1000000000;
+			}
+			Else
+				x = -5;
 
-		auto routine = function("one");
+		Return(x);
+	}
 
-		if(routine)
-		{
-			int result = routine();
+	auto routine = function("one");
 
-			EXPECT_EQ(result, 1000402222);
-		}
+	if(routine)
+	{
+		int result = routine();
+
+		EXPECT_EQ(result, 1000402222);
 	}
 
 }
 
 TEST(ReactorUnitTests, MinMax)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
-		{
-			Pointer<Byte> out = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<0>();
 
-			*Pointer<Float4>(out + 16 * 0) = Min(Float4(1.0f, 0.0f, -0.0f, +0.0f), Float4(0.0f, 1.0f, +0.0f, -0.0f));
-			*Pointer<Float4>(out + 16 * 1) = Max(Float4(1.0f, 0.0f, -0.0f, +0.0f), Float4(0.0f, 1.0f, +0.0f, -0.0f));
+		*Pointer<Float4>(out + 16 * 0) = Min(Float4(1.0f, 0.0f, -0.0f, +0.0f), Float4(0.0f, 1.0f, +0.0f, -0.0f));
+		*Pointer<Float4>(out + 16 * 1) = Max(Float4(1.0f, 0.0f, -0.0f, +0.0f), Float4(0.0f, 1.0f, +0.0f, -0.0f));
 
-			*Pointer<Int4>(out + 16 * 2) = Min(Int4(1, 0, -1, -0), Int4(0, 1, 0, +0));
-			*Pointer<Int4>(out + 16 * 3) = Max(Int4(1, 0, -1, -0), Int4(0, 1, 0, +0));
-			*Pointer<UInt4>(out + 16 * 4) = Min(UInt4(1, 0, -1, -0), UInt4(0, 1, 0, +0));
-			*Pointer<UInt4>(out + 16 * 5) = Max(UInt4(1, 0, -1, -0), UInt4(0, 1, 0, +0));
+		*Pointer<Int4>(out + 16 * 2) = Min(Int4(1, 0, -1, -0), Int4(0, 1, 0, +0));
+		*Pointer<Int4>(out + 16 * 3) = Max(Int4(1, 0, -1, -0), Int4(0, 1, 0, +0));
+		*Pointer<UInt4>(out + 16 * 4) = Min(UInt4(1, 0, -1, -0), UInt4(0, 1, 0, +0));
+		*Pointer<UInt4>(out + 16 * 5) = Max(UInt4(1, 0, -1, -0), UInt4(0, 1, 0, +0));
 
-			*Pointer<Short4>(out + 16 * 6) = Min(Short4(1, 0, -1, -0), Short4(0, 1, 0, +0));
-			*Pointer<Short4>(out + 16 * 7) = Max(Short4(1, 0, -1, -0), Short4(0, 1, 0, +0));
-			*Pointer<UShort4>(out + 16 * 8) = Min(UShort4(1, 0, -1, -0), UShort4(0, 1, 0, +0));
-			*Pointer<UShort4>(out + 16 * 9) = Max(UShort4(1, 0, -1, -0), UShort4(0, 1, 0, +0));
+		*Pointer<Short4>(out + 16 * 6) = Min(Short4(1, 0, -1, -0), Short4(0, 1, 0, +0));
+		*Pointer<Short4>(out + 16 * 7) = Max(Short4(1, 0, -1, -0), Short4(0, 1, 0, +0));
+		*Pointer<UShort4>(out + 16 * 8) = Min(UShort4(1, 0, -1, -0), UShort4(0, 1, 0, +0));
+		*Pointer<UShort4>(out + 16 * 9) = Max(UShort4(1, 0, -1, -0), UShort4(0, 1, 0, +0));
 
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			unsigned int out[10][4];
-
-			memset(&out, 0, sizeof(out));
-
-			routine(&out);
-
-			EXPECT_EQ(out[0][0], 0x00000000u);
-			EXPECT_EQ(out[0][1], 0x00000000u);
-			EXPECT_EQ(out[0][2], 0x00000000u);
-			EXPECT_EQ(out[0][3], 0x80000000u);
-
-			EXPECT_EQ(out[1][0], 0x3F800000u);
-			EXPECT_EQ(out[1][1], 0x3F800000u);
-			EXPECT_EQ(out[1][2], 0x00000000u);
-			EXPECT_EQ(out[1][3], 0x80000000u);
-
-			EXPECT_EQ(out[2][0], 0x00000000u);
-			EXPECT_EQ(out[2][1], 0x00000000u);
-			EXPECT_EQ(out[2][2], 0xFFFFFFFFu);
-			EXPECT_EQ(out[2][3], 0x00000000u);
-
-			EXPECT_EQ(out[3][0], 0x00000001u);
-			EXPECT_EQ(out[3][1], 0x00000001u);
-			EXPECT_EQ(out[3][2], 0x00000000u);
-			EXPECT_EQ(out[3][3], 0x00000000u);
-
-			EXPECT_EQ(out[4][0], 0x00000000u);
-			EXPECT_EQ(out[4][1], 0x00000000u);
-			EXPECT_EQ(out[4][2], 0x00000000u);
-			EXPECT_EQ(out[4][3], 0x00000000u);
-
-			EXPECT_EQ(out[5][0], 0x00000001u);
-			EXPECT_EQ(out[5][1], 0x00000001u);
-			EXPECT_EQ(out[5][2], 0xFFFFFFFFu);
-			EXPECT_EQ(out[5][3], 0x00000000u);
-
-			EXPECT_EQ(out[6][0], 0x00000000u);
-			EXPECT_EQ(out[6][1], 0x0000FFFFu);
-			EXPECT_EQ(out[6][2], 0x00000000u);
-			EXPECT_EQ(out[6][3], 0x00000000u);
-
-			EXPECT_EQ(out[7][0], 0x00010001u);
-			EXPECT_EQ(out[7][1], 0x00000000u);
-			EXPECT_EQ(out[7][2], 0x00000000u);
-			EXPECT_EQ(out[7][3], 0x00000000u);
-
-			EXPECT_EQ(out[8][0], 0x00000000u);
-			EXPECT_EQ(out[8][1], 0x00000000u);
-			EXPECT_EQ(out[8][2], 0x00000000u);
-			EXPECT_EQ(out[8][3], 0x00000000u);
-
-			EXPECT_EQ(out[9][0], 0x00010001u);
-			EXPECT_EQ(out[9][1], 0x0000FFFFu);
-			EXPECT_EQ(out[9][2], 0x00000000u);
-			EXPECT_EQ(out[9][3], 0x00000000u);
-		}
+		Return(0);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		unsigned int out[10][4];
+
+		memset(&out, 0, sizeof(out));
+
+		routine(&out);
+
+		EXPECT_EQ(out[0][0], 0x00000000u);
+		EXPECT_EQ(out[0][1], 0x00000000u);
+		EXPECT_EQ(out[0][2], 0x00000000u);
+		EXPECT_EQ(out[0][3], 0x80000000u);
+
+		EXPECT_EQ(out[1][0], 0x3F800000u);
+		EXPECT_EQ(out[1][1], 0x3F800000u);
+		EXPECT_EQ(out[1][2], 0x00000000u);
+		EXPECT_EQ(out[1][3], 0x80000000u);
+
+		EXPECT_EQ(out[2][0], 0x00000000u);
+		EXPECT_EQ(out[2][1], 0x00000000u);
+		EXPECT_EQ(out[2][2], 0xFFFFFFFFu);
+		EXPECT_EQ(out[2][3], 0x00000000u);
+
+		EXPECT_EQ(out[3][0], 0x00000001u);
+		EXPECT_EQ(out[3][1], 0x00000001u);
+		EXPECT_EQ(out[3][2], 0x00000000u);
+		EXPECT_EQ(out[3][3], 0x00000000u);
+
+		EXPECT_EQ(out[4][0], 0x00000000u);
+		EXPECT_EQ(out[4][1], 0x00000000u);
+		EXPECT_EQ(out[4][2], 0x00000000u);
+		EXPECT_EQ(out[4][3], 0x00000000u);
+
+		EXPECT_EQ(out[5][0], 0x00000001u);
+		EXPECT_EQ(out[5][1], 0x00000001u);
+		EXPECT_EQ(out[5][2], 0xFFFFFFFFu);
+		EXPECT_EQ(out[5][3], 0x00000000u);
+
+		EXPECT_EQ(out[6][0], 0x00000000u);
+		EXPECT_EQ(out[6][1], 0x0000FFFFu);
+		EXPECT_EQ(out[6][2], 0x00000000u);
+		EXPECT_EQ(out[6][3], 0x00000000u);
+
+		EXPECT_EQ(out[7][0], 0x00010001u);
+		EXPECT_EQ(out[7][1], 0x00000000u);
+		EXPECT_EQ(out[7][2], 0x00000000u);
+		EXPECT_EQ(out[7][3], 0x00000000u);
+
+		EXPECT_EQ(out[8][0], 0x00000000u);
+		EXPECT_EQ(out[8][1], 0x00000000u);
+		EXPECT_EQ(out[8][2], 0x00000000u);
+		EXPECT_EQ(out[8][3], 0x00000000u);
+
+		EXPECT_EQ(out[9][0], 0x00010001u);
+		EXPECT_EQ(out[9][1], 0x0000FFFFu);
+		EXPECT_EQ(out[9][2], 0x00000000u);
+		EXPECT_EQ(out[9][3], 0x00000000u);
+	}
 }
 
 TEST(ReactorUnitTests, NotNeg)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
-		{
-			Pointer<Byte> out = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<0>();
 
-			*Pointer<Int>(out + 16 * 0) = ~Int(0x55555555);
-			*Pointer<Short>(out + 16 * 1) = ~Short(0x5555);
-			*Pointer<Int4>(out + 16 * 2) = ~Int4(0x55555555, 0xAAAAAAAA, 0x00000000, 0xFFFFFFFF);
-			*Pointer<Short4>(out + 16 * 3) = ~Short4(0x5555, 0xAAAA, 0x0000, 0xFFFF);
+		*Pointer<Int>(out + 16 * 0) = ~Int(0x55555555);
+		*Pointer<Short>(out + 16 * 1) = ~Short(0x5555);
+		*Pointer<Int4>(out + 16 * 2) = ~Int4(0x55555555, 0xAAAAAAAA, 0x00000000, 0xFFFFFFFF);
+		*Pointer<Short4>(out + 16 * 3) = ~Short4(0x5555, 0xAAAA, 0x0000, 0xFFFF);
 
-			*Pointer<Int>(out + 16 * 4) = -Int(0x55555555);
-			*Pointer<Short>(out + 16 * 5) = -Short(0x5555);
-			*Pointer<Int4>(out + 16 * 6) = -Int4(0x55555555, 0xAAAAAAAA, 0x00000000, 0xFFFFFFFF);
-			*Pointer<Short4>(out + 16 * 7) = -Short4(0x5555, 0xAAAA, 0x0000, 0xFFFF);
+		*Pointer<Int>(out + 16 * 4) = -Int(0x55555555);
+		*Pointer<Short>(out + 16 * 5) = -Short(0x5555);
+		*Pointer<Int4>(out + 16 * 6) = -Int4(0x55555555, 0xAAAAAAAA, 0x00000000, 0xFFFFFFFF);
+		*Pointer<Short4>(out + 16 * 7) = -Short4(0x5555, 0xAAAA, 0x0000, 0xFFFF);
 
-			*Pointer<Float4>(out + 16 * 8) = -Float4(1.0f, -1.0f, 0.0f, -0.0f);
+		*Pointer<Float4>(out + 16 * 8) = -Float4(1.0f, -1.0f, 0.0f, -0.0f);
 
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			unsigned int out[10][4];
-
-			memset(&out, 0, sizeof(out));
-
-			routine(&out);
-
-			EXPECT_EQ(out[0][0], 0xAAAAAAAAu);
-			EXPECT_EQ(out[0][1], 0x00000000u);
-			EXPECT_EQ(out[0][2], 0x00000000u);
-			EXPECT_EQ(out[0][3], 0x00000000u);
-
-			EXPECT_EQ(out[1][0], 0x0000AAAAu);
-			EXPECT_EQ(out[1][1], 0x00000000u);
-			EXPECT_EQ(out[1][2], 0x00000000u);
-			EXPECT_EQ(out[1][3], 0x00000000u);
-
-			EXPECT_EQ(out[2][0], 0xAAAAAAAAu);
-			EXPECT_EQ(out[2][1], 0x55555555u);
-			EXPECT_EQ(out[2][2], 0xFFFFFFFFu);
-			EXPECT_EQ(out[2][3], 0x00000000u);
-
-			EXPECT_EQ(out[3][0], 0x5555AAAAu);
-			EXPECT_EQ(out[3][1], 0x0000FFFFu);
-			EXPECT_EQ(out[3][2], 0x00000000u);
-			EXPECT_EQ(out[3][3], 0x00000000u);
-
-			EXPECT_EQ(out[4][0], 0xAAAAAAABu);
-			EXPECT_EQ(out[4][1], 0x00000000u);
-			EXPECT_EQ(out[4][2], 0x00000000u);
-			EXPECT_EQ(out[4][3], 0x00000000u);
-
-			EXPECT_EQ(out[5][0], 0x0000AAABu);
-			EXPECT_EQ(out[5][1], 0x00000000u);
-			EXPECT_EQ(out[5][2], 0x00000000u);
-			EXPECT_EQ(out[5][3], 0x00000000u);
-
-			EXPECT_EQ(out[6][0], 0xAAAAAAABu);
-			EXPECT_EQ(out[6][1], 0x55555556u);
-			EXPECT_EQ(out[6][2], 0x00000000u);
-			EXPECT_EQ(out[6][3], 0x00000001u);
-
-			EXPECT_EQ(out[7][0], 0x5556AAABu);
-			EXPECT_EQ(out[7][1], 0x00010000u);
-			EXPECT_EQ(out[7][2], 0x00000000u);
-			EXPECT_EQ(out[7][3], 0x00000000u);
-
-			EXPECT_EQ(out[8][0], 0xBF800000u);
-			EXPECT_EQ(out[8][1], 0x3F800000u);
-			EXPECT_EQ(out[8][2], 0x80000000u);
-			EXPECT_EQ(out[8][3], 0x00000000u);
-		}
+		Return(0);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		unsigned int out[10][4];
+
+		memset(&out, 0, sizeof(out));
+
+		routine(&out);
+
+		EXPECT_EQ(out[0][0], 0xAAAAAAAAu);
+		EXPECT_EQ(out[0][1], 0x00000000u);
+		EXPECT_EQ(out[0][2], 0x00000000u);
+		EXPECT_EQ(out[0][3], 0x00000000u);
+
+		EXPECT_EQ(out[1][0], 0x0000AAAAu);
+		EXPECT_EQ(out[1][1], 0x00000000u);
+		EXPECT_EQ(out[1][2], 0x00000000u);
+		EXPECT_EQ(out[1][3], 0x00000000u);
+
+		EXPECT_EQ(out[2][0], 0xAAAAAAAAu);
+		EXPECT_EQ(out[2][1], 0x55555555u);
+		EXPECT_EQ(out[2][2], 0xFFFFFFFFu);
+		EXPECT_EQ(out[2][3], 0x00000000u);
+
+		EXPECT_EQ(out[3][0], 0x5555AAAAu);
+		EXPECT_EQ(out[3][1], 0x0000FFFFu);
+		EXPECT_EQ(out[3][2], 0x00000000u);
+		EXPECT_EQ(out[3][3], 0x00000000u);
+
+		EXPECT_EQ(out[4][0], 0xAAAAAAABu);
+		EXPECT_EQ(out[4][1], 0x00000000u);
+		EXPECT_EQ(out[4][2], 0x00000000u);
+		EXPECT_EQ(out[4][3], 0x00000000u);
+
+		EXPECT_EQ(out[5][0], 0x0000AAABu);
+		EXPECT_EQ(out[5][1], 0x00000000u);
+		EXPECT_EQ(out[5][2], 0x00000000u);
+		EXPECT_EQ(out[5][3], 0x00000000u);
+
+		EXPECT_EQ(out[6][0], 0xAAAAAAABu);
+		EXPECT_EQ(out[6][1], 0x55555556u);
+		EXPECT_EQ(out[6][2], 0x00000000u);
+		EXPECT_EQ(out[6][3], 0x00000001u);
+
+		EXPECT_EQ(out[7][0], 0x5556AAABu);
+		EXPECT_EQ(out[7][1], 0x00010000u);
+		EXPECT_EQ(out[7][2], 0x00000000u);
+		EXPECT_EQ(out[7][3], 0x00000000u);
+
+		EXPECT_EQ(out[8][0], 0xBF800000u);
+		EXPECT_EQ(out[8][1], 0x3F800000u);
+		EXPECT_EQ(out[8][2], 0x80000000u);
+		EXPECT_EQ(out[8][3], 0x00000000u);
+	}
 }
 
 TEST(ReactorUnitTests, FPtoUI)
@@ -848,375 +818,357 @@
 
 TEST(ReactorUnitTests, VectorCompare)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
-		{
-			Pointer<Byte> out = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<0>();
 
-			*Pointer<Int4>(out + 16 * 0) = CmpEQ(Float4(1.0f, 1.0f, -0.0f, +0.0f), Float4(0.0f, 1.0f, +0.0f, -0.0f));
-			*Pointer<Int4>(out + 16 * 1) = CmpEQ(Int4(1, 0, -1, -0), Int4(0, 1, 0, +0));
-			*Pointer<Byte8>(out + 16 * 2) = CmpEQ(SByte8(1, 2, 3, 4, 5, 6, 7, 8), SByte8(7, 6, 5, 4, 3, 2, 1, 0));
+		*Pointer<Int4>(out + 16 * 0) = CmpEQ(Float4(1.0f, 1.0f, -0.0f, +0.0f), Float4(0.0f, 1.0f, +0.0f, -0.0f));
+		*Pointer<Int4>(out + 16 * 1) = CmpEQ(Int4(1, 0, -1, -0), Int4(0, 1, 0, +0));
+		*Pointer<Byte8>(out + 16 * 2) = CmpEQ(SByte8(1, 2, 3, 4, 5, 6, 7, 8), SByte8(7, 6, 5, 4, 3, 2, 1, 0));
 
-			*Pointer<Int4>(out + 16 * 3) = CmpNLT(Float4(1.0f, 1.0f, -0.0f, +0.0f), Float4(0.0f, 1.0f, +0.0f, -0.0f));
-			*Pointer<Int4>(out + 16 * 4) = CmpNLT(Int4(1, 0, -1, -0), Int4(0, 1, 0, +0));
-			*Pointer<Byte8>(out + 16 * 5) = CmpGT(SByte8(1, 2, 3, 4, 5, 6, 7, 8), SByte8(7, 6, 5, 4, 3, 2, 1, 0));
+		*Pointer<Int4>(out + 16 * 3) = CmpNLT(Float4(1.0f, 1.0f, -0.0f, +0.0f), Float4(0.0f, 1.0f, +0.0f, -0.0f));
+		*Pointer<Int4>(out + 16 * 4) = CmpNLT(Int4(1, 0, -1, -0), Int4(0, 1, 0, +0));
+		*Pointer<Byte8>(out + 16 * 5) = CmpGT(SByte8(1, 2, 3, 4, 5, 6, 7, 8), SByte8(7, 6, 5, 4, 3, 2, 1, 0));
 
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			unsigned int out[6][4];
-
-			memset(&out, 0, sizeof(out));
-
-			routine(&out);
-
-			EXPECT_EQ(out[0][0], 0x00000000u);
-			EXPECT_EQ(out[0][1], 0xFFFFFFFFu);
-			EXPECT_EQ(out[0][2], 0xFFFFFFFFu);
-			EXPECT_EQ(out[0][3], 0xFFFFFFFFu);
-
-			EXPECT_EQ(out[1][0], 0x00000000u);
-			EXPECT_EQ(out[1][1], 0x00000000u);
-			EXPECT_EQ(out[1][2], 0x00000000u);
-			EXPECT_EQ(out[1][3], 0xFFFFFFFFu);
-
-			EXPECT_EQ(out[2][0], 0xFF000000u);
-			EXPECT_EQ(out[2][1], 0x00000000u);
-
-			EXPECT_EQ(out[3][0], 0xFFFFFFFFu);
-			EXPECT_EQ(out[3][1], 0xFFFFFFFFu);
-			EXPECT_EQ(out[3][2], 0xFFFFFFFFu);
-			EXPECT_EQ(out[3][3], 0xFFFFFFFFu);
-
-			EXPECT_EQ(out[4][0], 0xFFFFFFFFu);
-			EXPECT_EQ(out[4][1], 0x00000000u);
-			EXPECT_EQ(out[4][2], 0x00000000u);
-			EXPECT_EQ(out[4][3], 0xFFFFFFFFu);
-
-			EXPECT_EQ(out[5][0], 0x00000000u);
-			EXPECT_EQ(out[5][1], 0xFFFFFFFFu);
-		}
+		Return(0);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		unsigned int out[6][4];
+
+		memset(&out, 0, sizeof(out));
+
+		routine(&out);
+
+		EXPECT_EQ(out[0][0], 0x00000000u);
+		EXPECT_EQ(out[0][1], 0xFFFFFFFFu);
+		EXPECT_EQ(out[0][2], 0xFFFFFFFFu);
+		EXPECT_EQ(out[0][3], 0xFFFFFFFFu);
+
+		EXPECT_EQ(out[1][0], 0x00000000u);
+		EXPECT_EQ(out[1][1], 0x00000000u);
+		EXPECT_EQ(out[1][2], 0x00000000u);
+		EXPECT_EQ(out[1][3], 0xFFFFFFFFu);
+
+		EXPECT_EQ(out[2][0], 0xFF000000u);
+		EXPECT_EQ(out[2][1], 0x00000000u);
+
+		EXPECT_EQ(out[3][0], 0xFFFFFFFFu);
+		EXPECT_EQ(out[3][1], 0xFFFFFFFFu);
+		EXPECT_EQ(out[3][2], 0xFFFFFFFFu);
+		EXPECT_EQ(out[3][3], 0xFFFFFFFFu);
+
+		EXPECT_EQ(out[4][0], 0xFFFFFFFFu);
+		EXPECT_EQ(out[4][1], 0x00000000u);
+		EXPECT_EQ(out[4][2], 0x00000000u);
+		EXPECT_EQ(out[4][3], 0xFFFFFFFFu);
+
+		EXPECT_EQ(out[5][0], 0x00000000u);
+		EXPECT_EQ(out[5][1], 0xFFFFFFFFu);
+	}
 }
 
 TEST(ReactorUnitTests, SaturatedAddAndSubtract)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
-		{
-			Pointer<Byte> out = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<0>();
 
-			*Pointer<Byte8>(out + 8 * 0) =
-				AddSat(Byte8(1, 2, 3, 4, 5, 6, 7, 8),
-				       Byte8(7, 6, 5, 4, 3, 2, 1, 0));
-			*Pointer<Byte8>(out + 8 * 1) =
-				AddSat(Byte8(0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE),
-				       Byte8(7, 6, 5, 4, 3, 2, 1, 0));
-			*Pointer<Byte8>(out + 8 * 2) =
-				SubSat(Byte8(1, 2, 3, 4, 5, 6, 7, 8),
-				       Byte8(7, 6, 5, 4, 3, 2, 1, 0));
+		*Pointer<Byte8>(out + 8 * 0) =
+			AddSat(Byte8(1, 2, 3, 4, 5, 6, 7, 8),
+			       Byte8(7, 6, 5, 4, 3, 2, 1, 0));
+		*Pointer<Byte8>(out + 8 * 1) =
+			AddSat(Byte8(0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE),
+			       Byte8(7, 6, 5, 4, 3, 2, 1, 0));
+		*Pointer<Byte8>(out + 8 * 2) =
+			SubSat(Byte8(1, 2, 3, 4, 5, 6, 7, 8),
+			       Byte8(7, 6, 5, 4, 3, 2, 1, 0));
 
-			*Pointer<SByte8>(out + 8 * 3) =
-				AddSat(SByte8(1, 2, 3, 4, 5, 6, 7, 8),
-				       SByte8(7, 6, 5, 4, 3, 2, 1, 0));
-			*Pointer<SByte8>(out + 8 * 4) =
-				AddSat(SByte8(0x7E, 0x7E, 0x7E, 0x7E, 0x7E, 0x7E, 0x7E, 0x7E),
-				       SByte8(7, 6, 5, 4, 3, 2, 1, 0));
-			*Pointer<SByte8>(out + 8 * 5) =
-				AddSat(SByte8(0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88),
-				       SByte8(-7, -6, -5, -4, -3, -2, -1, -0));
-			*Pointer<SByte8>(out + 8 * 6) =
-				SubSat(SByte8(0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88),
-				       SByte8(7, 6, 5, 4, 3, 2, 1, 0));
+		*Pointer<SByte8>(out + 8 * 3) =
+			AddSat(SByte8(1, 2, 3, 4, 5, 6, 7, 8),
+			       SByte8(7, 6, 5, 4, 3, 2, 1, 0));
+		*Pointer<SByte8>(out + 8 * 4) =
+			AddSat(SByte8(0x7E, 0x7E, 0x7E, 0x7E, 0x7E, 0x7E, 0x7E, 0x7E),
+			       SByte8(7, 6, 5, 4, 3, 2, 1, 0));
+		*Pointer<SByte8>(out + 8 * 5) =
+			AddSat(SByte8(0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88),
+			       SByte8(-7, -6, -5, -4, -3, -2, -1, -0));
+		*Pointer<SByte8>(out + 8 * 6) =
+			SubSat(SByte8(0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88),
+			       SByte8(7, 6, 5, 4, 3, 2, 1, 0));
 
-			*Pointer<Short4>(out + 8 * 7) =
-				AddSat(Short4(1, 2, 3, 4), Short4(3, 2, 1, 0));
-			*Pointer<Short4>(out + 8 * 8) =
-				AddSat(Short4(0x7FFE, 0x7FFE, 0x7FFE, 0x7FFE),
-				       Short4(3, 2, 1, 0));
-			*Pointer<Short4>(out + 8 * 9) =
-				AddSat(Short4(0x8001, 0x8002, 0x8003, 0x8004),
-				       Short4(-3, -2, -1, -0));
-			*Pointer<Short4>(out + 8 * 10) =
-				SubSat(Short4(0x8001, 0x8002, 0x8003, 0x8004),
-				       Short4(3, 2, 1, 0));
+		*Pointer<Short4>(out + 8 * 7) =
+			AddSat(Short4(1, 2, 3, 4), Short4(3, 2, 1, 0));
+		*Pointer<Short4>(out + 8 * 8) =
+			AddSat(Short4(0x7FFE, 0x7FFE, 0x7FFE, 0x7FFE),
+			       Short4(3, 2, 1, 0));
+		*Pointer<Short4>(out + 8 * 9) =
+			AddSat(Short4(0x8001, 0x8002, 0x8003, 0x8004),
+			       Short4(-3, -2, -1, -0));
+		*Pointer<Short4>(out + 8 * 10) =
+			SubSat(Short4(0x8001, 0x8002, 0x8003, 0x8004),
+			       Short4(3, 2, 1, 0));
 
-			*Pointer<UShort4>(out + 8 * 11) =
-				AddSat(UShort4(1, 2, 3, 4), UShort4(3, 2, 1, 0));
-			*Pointer<UShort4>(out + 8 * 12) =
-				AddSat(UShort4(0xFFFE, 0xFFFE, 0xFFFE, 0xFFFE),
-				       UShort4(3, 2, 1, 0));
-			*Pointer<UShort4>(out + 8 * 13) =
-				SubSat(UShort4(1, 2, 3, 4), UShort4(3, 2, 1, 0));
+		*Pointer<UShort4>(out + 8 * 11) =
+			AddSat(UShort4(1, 2, 3, 4), UShort4(3, 2, 1, 0));
+		*Pointer<UShort4>(out + 8 * 12) =
+			AddSat(UShort4(0xFFFE, 0xFFFE, 0xFFFE, 0xFFFE),
+			       UShort4(3, 2, 1, 0));
+		*Pointer<UShort4>(out + 8 * 13) =
+			SubSat(UShort4(1, 2, 3, 4), UShort4(3, 2, 1, 0));
 
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			unsigned int out[14][2];
-
-			memset(&out, 0, sizeof(out));
-
-			routine(&out);
-
-			EXPECT_EQ(out[0][0], 0x08080808u);
-			EXPECT_EQ(out[0][1], 0x08080808u);
-
-			EXPECT_EQ(out[1][0], 0xFFFFFFFFu);
-			EXPECT_EQ(out[1][1], 0xFEFFFFFFu);
-
-			EXPECT_EQ(out[2][0], 0x00000000u);
-			EXPECT_EQ(out[2][1], 0x08060402u);
-
-			EXPECT_EQ(out[3][0], 0x08080808u);
-			EXPECT_EQ(out[3][1], 0x08080808u);
-
-			EXPECT_EQ(out[4][0], 0x7F7F7F7Fu);
-			EXPECT_EQ(out[4][1], 0x7E7F7F7Fu);
-
-			EXPECT_EQ(out[5][0], 0x80808080u);
-			EXPECT_EQ(out[5][1], 0x88868482u);
-
-			EXPECT_EQ(out[6][0], 0x80808080u);
-			EXPECT_EQ(out[6][1], 0x88868482u);
-
-			EXPECT_EQ(out[7][0], 0x00040004u);
-			EXPECT_EQ(out[7][1], 0x00040004u);
-
-			EXPECT_EQ(out[8][0], 0x7FFF7FFFu);
-			EXPECT_EQ(out[8][1], 0x7FFE7FFFu);
-
-			EXPECT_EQ(out[9][0], 0x80008000u);
-			EXPECT_EQ(out[9][1], 0x80048002u);
-
-			EXPECT_EQ(out[10][0], 0x80008000u);
-			EXPECT_EQ(out[10][1], 0x80048002u);
-
-			EXPECT_EQ(out[11][0], 0x00040004u);
-			EXPECT_EQ(out[11][1], 0x00040004u);
-
-			EXPECT_EQ(out[12][0], 0xFFFFFFFFu);
-			EXPECT_EQ(out[12][1], 0xFFFEFFFFu);
-
-			EXPECT_EQ(out[13][0], 0x00000000u);
-			EXPECT_EQ(out[13][1], 0x00040002u);
-		}
+		Return(0);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		unsigned int out[14][2];
+
+		memset(&out, 0, sizeof(out));
+
+		routine(&out);
+
+		EXPECT_EQ(out[0][0], 0x08080808u);
+		EXPECT_EQ(out[0][1], 0x08080808u);
+
+		EXPECT_EQ(out[1][0], 0xFFFFFFFFu);
+		EXPECT_EQ(out[1][1], 0xFEFFFFFFu);
+
+		EXPECT_EQ(out[2][0], 0x00000000u);
+		EXPECT_EQ(out[2][1], 0x08060402u);
+
+		EXPECT_EQ(out[3][0], 0x08080808u);
+		EXPECT_EQ(out[3][1], 0x08080808u);
+
+		EXPECT_EQ(out[4][0], 0x7F7F7F7Fu);
+		EXPECT_EQ(out[4][1], 0x7E7F7F7Fu);
+
+		EXPECT_EQ(out[5][0], 0x80808080u);
+		EXPECT_EQ(out[5][1], 0x88868482u);
+
+		EXPECT_EQ(out[6][0], 0x80808080u);
+		EXPECT_EQ(out[6][1], 0x88868482u);
+
+		EXPECT_EQ(out[7][0], 0x00040004u);
+		EXPECT_EQ(out[7][1], 0x00040004u);
+
+		EXPECT_EQ(out[8][0], 0x7FFF7FFFu);
+		EXPECT_EQ(out[8][1], 0x7FFE7FFFu);
+
+		EXPECT_EQ(out[9][0], 0x80008000u);
+		EXPECT_EQ(out[9][1], 0x80048002u);
+
+		EXPECT_EQ(out[10][0], 0x80008000u);
+		EXPECT_EQ(out[10][1], 0x80048002u);
+
+		EXPECT_EQ(out[11][0], 0x00040004u);
+		EXPECT_EQ(out[11][1], 0x00040004u);
+
+		EXPECT_EQ(out[12][0], 0xFFFFFFFFu);
+		EXPECT_EQ(out[12][1], 0xFFFEFFFFu);
+
+		EXPECT_EQ(out[13][0], 0x00000000u);
+		EXPECT_EQ(out[13][1], 0x00040002u);
+	}
 }
 
 TEST(ReactorUnitTests, Unpack)
 {
+	FunctionT<int(void*, void*)> function;
 	{
-		FunctionT<int(void*, void*)> function;
-		{
-			Pointer<Byte> in = function.Arg<0>();
-			Pointer<Byte> out = function.Arg<1>();
+		Pointer<Byte> in = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<1>();
 
-			Byte4 test_byte_a = *Pointer<Byte4>(in + 4 * 0);
-			Byte4 test_byte_b = *Pointer<Byte4>(in + 4 * 1);
+		Byte4 test_byte_a = *Pointer<Byte4>(in + 4 * 0);
+		Byte4 test_byte_b = *Pointer<Byte4>(in + 4 * 1);
 
-			*Pointer<Short4>(out + 8 * 0) =
-				Unpack(test_byte_a, test_byte_b);
+		*Pointer<Short4>(out + 8 * 0) =
+			Unpack(test_byte_a, test_byte_b);
 
-			*Pointer<Short4>(out + 8 * 1) = Unpack(test_byte_a);
+		*Pointer<Short4>(out + 8 * 1) = Unpack(test_byte_a);
 
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			unsigned int in[1][2];
-			unsigned int out[2][2];
-
-			memset(&out, 0, sizeof(out));
-
-			in[0][0] = 0xABCDEF12u;
-			in[0][1] = 0x34567890u;
-
-			routine(&in, &out);
-
-			EXPECT_EQ(out[0][0], 0x78EF9012u);
-			EXPECT_EQ(out[0][1], 0x34AB56CDu);
-
-			EXPECT_EQ(out[1][0], 0xEFEF1212u);
-			EXPECT_EQ(out[1][1], 0xABABCDCDu);
-		}
+		Return(0);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		unsigned int in[1][2];
+		unsigned int out[2][2];
+
+		memset(&out, 0, sizeof(out));
+
+		in[0][0] = 0xABCDEF12u;
+		in[0][1] = 0x34567890u;
+
+		routine(&in, &out);
+
+		EXPECT_EQ(out[0][0], 0x78EF9012u);
+		EXPECT_EQ(out[0][1], 0x34AB56CDu);
+
+		EXPECT_EQ(out[1][0], 0xEFEF1212u);
+		EXPECT_EQ(out[1][1], 0xABABCDCDu);
+	}
 }
 
 TEST(ReactorUnitTests, Pack)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
-		{
-			Pointer<Byte> out = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<0>();
 
-			*Pointer<SByte8>(out + 8 * 0) =
-				PackSigned(Short4(-1, -2, 1, 2),
-					   Short4(3, 4, -3, -4));
+		*Pointer<SByte8>(out + 8 * 0) =
+			PackSigned(Short4(-1, -2, 1, 2),
+			           Short4(3, 4, -3, -4));
 
-			*Pointer<Byte8>(out + 8 * 1) =
-				PackUnsigned(Short4(-1, -2, 1, 2),
-					     Short4(3, 4, -3, -4));
+		*Pointer<Byte8>(out + 8 * 1) =
+			PackUnsigned(Short4(-1, -2, 1, 2),
+			             Short4(3, 4, -3, -4));
 
-			*Pointer<Short8>(out + 8 * 2) =
-				PackSigned(Int4(-1, -2, 1, 2),
-					   Int4(3, 4, -3, -4));
+		*Pointer<Short8>(out + 8 * 2) =
+			PackSigned(Int4(-1, -2, 1, 2),
+			           Int4(3, 4, -3, -4));
 
-			*Pointer<UShort8>(out + 8 * 4) =
-				PackUnsigned(Int4(-1, -2, 1, 2),
-					     Int4(3, 4, -3, -4));
+		*Pointer<UShort8>(out + 8 * 4) =
+			PackUnsigned(Int4(-1, -2, 1, 2),
+			             Int4(3, 4, -3, -4));
 
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			unsigned int out[6][2];
-
-			memset(&out, 0, sizeof(out));
-
-			routine(&out);
-
-			EXPECT_EQ(out[0][0], 0x0201FEFFu);
-			EXPECT_EQ(out[0][1], 0xFCFD0403u);
-
-			EXPECT_EQ(out[1][0], 0x02010000u);
-			EXPECT_EQ(out[1][1], 0x00000403u);
-
-			EXPECT_EQ(out[2][0], 0xFFFEFFFFu);
-			EXPECT_EQ(out[2][1], 0x00020001u);
-
-			EXPECT_EQ(out[3][0], 0x00040003u);
-			EXPECT_EQ(out[3][1], 0xFFFCFFFDu);
-
-			EXPECT_EQ(out[4][0], 0x00000000u);
-			EXPECT_EQ(out[4][1], 0x00020001u);
-
-			EXPECT_EQ(out[5][0], 0x00040003u);
-			EXPECT_EQ(out[5][1], 0x00000000u);
-		}
+		Return(0);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		unsigned int out[6][2];
+
+		memset(&out, 0, sizeof(out));
+
+		routine(&out);
+
+		EXPECT_EQ(out[0][0], 0x0201FEFFu);
+		EXPECT_EQ(out[0][1], 0xFCFD0403u);
+
+		EXPECT_EQ(out[1][0], 0x02010000u);
+		EXPECT_EQ(out[1][1], 0x00000403u);
+
+		EXPECT_EQ(out[2][0], 0xFFFEFFFFu);
+		EXPECT_EQ(out[2][1], 0x00020001u);
+
+		EXPECT_EQ(out[3][0], 0x00040003u);
+		EXPECT_EQ(out[3][1], 0xFFFCFFFDu);
+
+		EXPECT_EQ(out[4][0], 0x00000000u);
+		EXPECT_EQ(out[4][1], 0x00020001u);
+
+		EXPECT_EQ(out[5][0], 0x00040003u);
+		EXPECT_EQ(out[5][1], 0x00000000u);
+	}
 }
 
 TEST(ReactorUnitTests, MulHigh)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
-		{
-			Pointer<Byte> out = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<0>();
 
-			*Pointer<Short4>(out + 16 * 0) =
-				MulHigh(Short4(0x01AA, 0x02DD, 0x03EE, 0xF422),
-				        Short4(0x01BB, 0x02CC, 0x03FF, 0xF411));
-			*Pointer<UShort4>(out + 16 * 1) =
-				MulHigh(UShort4(0x01AA, 0x02DD, 0x03EE, 0xF422),
-				        UShort4(0x01BB, 0x02CC, 0x03FF, 0xF411));
+		*Pointer<Short4>(out + 16 * 0) =
+			MulHigh(Short4(0x01AA, 0x02DD, 0x03EE, 0xF422),
+			        Short4(0x01BB, 0x02CC, 0x03FF, 0xF411));
+		*Pointer<UShort4>(out + 16 * 1) =
+			MulHigh(UShort4(0x01AA, 0x02DD, 0x03EE, 0xF422),
+			        UShort4(0x01BB, 0x02CC, 0x03FF, 0xF411));
 
-			*Pointer<Int4>(out + 16 * 2) =
-				MulHigh(Int4(0x000001AA, 0x000002DD, 0xC8000000, 0xF8000000),
-				        Int4(0x000001BB, 0x84000000, 0x000003EE, 0xD7000000));
-			*Pointer<UInt4>(out + 16 * 3) =
-				MulHigh(UInt4(0x000001AAu, 0x000002DDu, 0xC8000000u, 0xD8000000u),
-				        UInt4(0x000001BBu, 0x84000000u, 0x000003EEu, 0xD7000000u));
+		*Pointer<Int4>(out + 16 * 2) =
+			MulHigh(Int4(0x000001AA, 0x000002DD, 0xC8000000, 0xF8000000),
+			        Int4(0x000001BB, 0x84000000, 0x000003EE, 0xD7000000));
+		*Pointer<UInt4>(out + 16 * 3) =
+			MulHigh(UInt4(0x000001AAu, 0x000002DDu, 0xC8000000u, 0xD8000000u),
+			        UInt4(0x000001BBu, 0x84000000u, 0x000003EEu, 0xD7000000u));
 
-			*Pointer<Int4>(out + 16 * 4) =
-				MulHigh(Int4(0x7FFFFFFF, 0x7FFFFFFF, 0x80008000, 0xFFFFFFFF),
-				        Int4(0x7FFFFFFF, 0x80000000, 0x80008000, 0xFFFFFFFF));
-			*Pointer<UInt4>(out + 16 * 5) =
-				MulHigh(UInt4(0x7FFFFFFFu, 0x7FFFFFFFu, 0x80008000u, 0xFFFFFFFFu),
-				        UInt4(0x7FFFFFFFu, 0x80000000u, 0x80008000u, 0xFFFFFFFFu));
+		*Pointer<Int4>(out + 16 * 4) =
+			MulHigh(Int4(0x7FFFFFFF, 0x7FFFFFFF, 0x80008000, 0xFFFFFFFF),
+			        Int4(0x7FFFFFFF, 0x80000000, 0x80008000, 0xFFFFFFFF));
+		*Pointer<UInt4>(out + 16 * 5) =
+			MulHigh(UInt4(0x7FFFFFFFu, 0x7FFFFFFFu, 0x80008000u, 0xFFFFFFFFu),
+			        UInt4(0x7FFFFFFFu, 0x80000000u, 0x80008000u, 0xFFFFFFFFu));
 
-			// (U)Short8 variants currently unimplemented.
+		// (U)Short8 variants currently unimplemented.
 
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			unsigned int out[6][4];
-
-			memset(&out, 0, sizeof(out));
-
-			routine(&out);
-
-			EXPECT_EQ(out[0][0], 0x00080002u);
-			EXPECT_EQ(out[0][1], 0x008D000Fu);
-
-			EXPECT_EQ(out[1][0], 0x00080002u);
-			EXPECT_EQ(out[1][1], 0xE8C0000Fu);
-
-			EXPECT_EQ(out[2][0], 0x00000000u);
-			EXPECT_EQ(out[2][1], 0xFFFFFE9Cu);
-			EXPECT_EQ(out[2][2], 0xFFFFFF23u);
-			EXPECT_EQ(out[2][3], 0x01480000u);
-
-			EXPECT_EQ(out[3][0], 0x00000000u);
-			EXPECT_EQ(out[3][1], 0x00000179u);
-			EXPECT_EQ(out[3][2], 0x00000311u);
-			EXPECT_EQ(out[3][3], 0xB5680000u);
-
-			EXPECT_EQ(out[4][0], 0x3FFFFFFFu);
-			EXPECT_EQ(out[4][1], 0xC0000000u);
-			EXPECT_EQ(out[4][2], 0x3FFF8000u);
-			EXPECT_EQ(out[4][3], 0x00000000u);
-
-			EXPECT_EQ(out[5][0], 0x3FFFFFFFu);
-			EXPECT_EQ(out[5][1], 0x3FFFFFFFu);
-			EXPECT_EQ(out[5][2], 0x40008000u);
-			EXPECT_EQ(out[5][3], 0xFFFFFFFEu);
-		}
+		Return(0);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		unsigned int out[6][4];
+
+		memset(&out, 0, sizeof(out));
+
+		routine(&out);
+
+		EXPECT_EQ(out[0][0], 0x00080002u);
+		EXPECT_EQ(out[0][1], 0x008D000Fu);
+
+		EXPECT_EQ(out[1][0], 0x00080002u);
+		EXPECT_EQ(out[1][1], 0xE8C0000Fu);
+
+		EXPECT_EQ(out[2][0], 0x00000000u);
+		EXPECT_EQ(out[2][1], 0xFFFFFE9Cu);
+		EXPECT_EQ(out[2][2], 0xFFFFFF23u);
+		EXPECT_EQ(out[2][3], 0x01480000u);
+
+		EXPECT_EQ(out[3][0], 0x00000000u);
+		EXPECT_EQ(out[3][1], 0x00000179u);
+		EXPECT_EQ(out[3][2], 0x00000311u);
+		EXPECT_EQ(out[3][3], 0xB5680000u);
+
+		EXPECT_EQ(out[4][0], 0x3FFFFFFFu);
+		EXPECT_EQ(out[4][1], 0xC0000000u);
+		EXPECT_EQ(out[4][2], 0x3FFF8000u);
+		EXPECT_EQ(out[4][3], 0x00000000u);
+
+		EXPECT_EQ(out[5][0], 0x3FFFFFFFu);
+		EXPECT_EQ(out[5][1], 0x3FFFFFFFu);
+		EXPECT_EQ(out[5][2], 0x40008000u);
+		EXPECT_EQ(out[5][3], 0xFFFFFFFEu);
+	}
 }
 
 TEST(ReactorUnitTests, MulAdd)
 {
+	FunctionT<int(void*)> function;
 	{
-		FunctionT<int(void*)> function;
-		{
-			Pointer<Byte> out = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<0>();
 
-			*Pointer<Int2>(out + 8 * 0) =
-				MulAdd(Short4(0x1aa, 0x2dd, 0x3ee, 0xF422),
-				       Short4(0x1bb, 0x2cc, 0x3ff, 0xF411));
+		*Pointer<Int2>(out + 8 * 0) =
+			MulAdd(Short4(0x1aa, 0x2dd, 0x3ee, 0xF422),
+			       Short4(0x1bb, 0x2cc, 0x3ff, 0xF411));
 
-			// (U)Short8 variant is mentioned but unimplemented
-			Return(0);
-		}
-
-		auto routine = function("one");
-
-		if(routine)
-		{
-			unsigned int out[1][2];
-
-			memset(&out, 0, sizeof(out));
-
-			routine(&out);
-
-			EXPECT_EQ(out[0][0], 0x000AE34Au);
-			EXPECT_EQ(out[0][1], 0x009D5254u);
-		}
+		// (U)Short8 variant is mentioned but unimplemented
+		Return(0);
 	}
 
+	auto routine = function("one");
+
+	if(routine)
+	{
+		unsigned int out[1][2];
+
+		memset(&out, 0, sizeof(out));
+
+		routine(&out);
+
+		EXPECT_EQ(out[0][0], 0x000AE34Au);
+		EXPECT_EQ(out[0][1], 0x009D5254u);
+	}
 }
 
 TEST(ReactorUnitTests, PointersEqual)
@@ -1638,80 +1590,77 @@
 // It's necessary to inspect the registers in a debugger to actually verify.)
 TEST(ReactorUnitTests, PreserveXMMRegisters)
 {
+    FunctionT<void(void*, void*)> function;
     {
-        FunctionT<void(void*, void*)> function;
-        {
-            Pointer<Byte> in = function.Arg<0>();
-            Pointer<Byte> out = function.Arg<1>();
+        Pointer<Byte> in = function.Arg<0>();
+        Pointer<Byte> out = function.Arg<1>();
 
-            Float4 a = *Pointer<Float4>(in + 16 * 0);
-            Float4 b = *Pointer<Float4>(in + 16 * 1);
-            Float4 c = *Pointer<Float4>(in + 16 * 2);
-            Float4 d = *Pointer<Float4>(in + 16 * 3);
-            Float4 e = *Pointer<Float4>(in + 16 * 4);
-            Float4 f = *Pointer<Float4>(in + 16 * 5);
-            Float4 g = *Pointer<Float4>(in + 16 * 6);
-            Float4 h = *Pointer<Float4>(in + 16 * 7);
-            Float4 i = *Pointer<Float4>(in + 16 * 8);
-            Float4 j = *Pointer<Float4>(in + 16 * 9);
-            Float4 k = *Pointer<Float4>(in + 16 * 10);
-            Float4 l = *Pointer<Float4>(in + 16 * 11);
-            Float4 m = *Pointer<Float4>(in + 16 * 12);
-            Float4 n = *Pointer<Float4>(in + 16 * 13);
-            Float4 o = *Pointer<Float4>(in + 16 * 14);
-            Float4 p = *Pointer<Float4>(in + 16 * 15);
+        Float4 a = *Pointer<Float4>(in + 16 * 0);
+        Float4 b = *Pointer<Float4>(in + 16 * 1);
+        Float4 c = *Pointer<Float4>(in + 16 * 2);
+        Float4 d = *Pointer<Float4>(in + 16 * 3);
+        Float4 e = *Pointer<Float4>(in + 16 * 4);
+        Float4 f = *Pointer<Float4>(in + 16 * 5);
+        Float4 g = *Pointer<Float4>(in + 16 * 6);
+        Float4 h = *Pointer<Float4>(in + 16 * 7);
+        Float4 i = *Pointer<Float4>(in + 16 * 8);
+        Float4 j = *Pointer<Float4>(in + 16 * 9);
+        Float4 k = *Pointer<Float4>(in + 16 * 10);
+        Float4 l = *Pointer<Float4>(in + 16 * 11);
+        Float4 m = *Pointer<Float4>(in + 16 * 12);
+        Float4 n = *Pointer<Float4>(in + 16 * 13);
+        Float4 o = *Pointer<Float4>(in + 16 * 14);
+        Float4 p = *Pointer<Float4>(in + 16 * 15);
 
-            Float4 ab = a + b;
-            Float4 cd = c + d;
-            Float4 ef = e + f;
-            Float4 gh = g + h;
-            Float4 ij = i + j;
-            Float4 kl = k + l;
-            Float4 mn = m + n;
-            Float4 op = o + p;
+        Float4 ab = a + b;
+        Float4 cd = c + d;
+        Float4 ef = e + f;
+        Float4 gh = g + h;
+        Float4 ij = i + j;
+        Float4 kl = k + l;
+        Float4 mn = m + n;
+        Float4 op = o + p;
 
-            Float4 abcd = ab + cd;
-            Float4 efgh = ef + gh;
-            Float4 ijkl = ij + kl;
-            Float4 mnop = mn + op;
+        Float4 abcd = ab + cd;
+        Float4 efgh = ef + gh;
+        Float4 ijkl = ij + kl;
+        Float4 mnop = mn + op;
 
-            Float4 abcdefgh = abcd + efgh;
-            Float4 ijklmnop = ijkl + mnop;
-            Float4 sum = abcdefgh + ijklmnop;
-            *Pointer<Float4>(out) = sum;
-            Return();
-        }
-
-        auto routine = function("one");
-        assert(routine);
-
-        float input[64] = { 1.0f,  0.0f,   0.0f, 0.0f,
-                           -1.0f,  1.0f,  -1.0f, 0.0f,
-                            1.0f,  2.0f,  -2.0f, 0.0f,
-                           -1.0f,  3.0f,  -3.0f, 0.0f,
-                            1.0f,  4.0f,  -4.0f, 0.0f,
-                           -1.0f,  5.0f,  -5.0f, 0.0f,
-                            1.0f,  6.0f,  -6.0f, 0.0f,
-                           -1.0f,  7.0f,  -7.0f, 0.0f,
-                            1.0f,  8.0f,  -8.0f, 0.0f,
-                           -1.0f,  9.0f,  -9.0f, 0.0f,
-                            1.0f, 10.0f, -10.0f, 0.0f,
-                           -1.0f, 11.0f, -11.0f, 0.0f,
-                            1.0f, 12.0f, -12.0f, 0.0f,
-                           -1.0f, 13.0f, -13.0f, 0.0f,
-                            1.0f, 14.0f, -14.0f, 0.0f,
-                           -1.0f, 15.0f, -15.0f, 0.0f };
-
-        float result[4];
-
-        routine(input, result);
-
-        EXPECT_EQ(result[0], 0.0f);
-        EXPECT_EQ(result[1], 120.0f);
-        EXPECT_EQ(result[2], -120.0f);
-        EXPECT_EQ(result[3], 0.0f);
+        Float4 abcdefgh = abcd + efgh;
+        Float4 ijklmnop = ijkl + mnop;
+        Float4 sum = abcdefgh + ijklmnop;
+        *Pointer<Float4>(out) = sum;
+        Return();
     }
 
+    auto routine = function("one");
+    assert(routine);
+
+    float input[64] = { 1.0f,  0.0f,   0.0f, 0.0f,
+                       -1.0f,  1.0f,  -1.0f, 0.0f,
+                        1.0f,  2.0f,  -2.0f, 0.0f,
+                       -1.0f,  3.0f,  -3.0f, 0.0f,
+                        1.0f,  4.0f,  -4.0f, 0.0f,
+                       -1.0f,  5.0f,  -5.0f, 0.0f,
+                        1.0f,  6.0f,  -6.0f, 0.0f,
+                       -1.0f,  7.0f,  -7.0f, 0.0f,
+                        1.0f,  8.0f,  -8.0f, 0.0f,
+                       -1.0f,  9.0f,  -9.0f, 0.0f,
+                        1.0f, 10.0f, -10.0f, 0.0f,
+                       -1.0f, 11.0f, -11.0f, 0.0f,
+                        1.0f, 12.0f, -12.0f, 0.0f,
+                       -1.0f, 13.0f, -13.0f, 0.0f,
+                        1.0f, 14.0f, -14.0f, 0.0f,
+                       -1.0f, 15.0f, -15.0f, 0.0f };
+
+    float result[4];
+
+    routine(input, result);
+
+    EXPECT_EQ(result[0], 0.0f);
+    EXPECT_EQ(result[1], 120.0f);
+    EXPECT_EQ(result[2], -120.0f);
+    EXPECT_EQ(result[3], 0.0f);
 }
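
The expected results are simply the column sums of the sixteen input vectors.
A quick scalar check of that arithmetic (standalone, mirroring the input
pattern above):

	#include <cstdio>

	int main()
	{
		float sum[4] = {0.0f, 0.0f, 0.0f, 0.0f};

		for(int i = 0; i < 16; i++)
		{
			sum[0] += (i % 2 == 0) ? 1.0f : -1.0f;  // alternating +1/-1 sums to 0
			sum[1] += float(i);                     // 0 + 1 + ... + 15 = 120
			sum[2] -= float(i);                     // negated running sum: -120
		}                                           // sum[3] remains 0

		printf("%g %g %g %g\n", sum[0], sum[1], sum[2], sum[3]);  // 0 120 -120 0
		return 0;
	}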
 
 template <typename T>
diff --git a/src/Reactor/Routine.hpp b/src/Reactor/Routine.hpp
index 4e643fd..922d3ab 100644
--- a/src/Reactor/Routine.hpp
+++ b/src/Reactor/Routine.hpp
@@ -17,57 +17,58 @@
 
 #include <memory>
 
-namespace rr
+namespace rr {
+
+class Routine
 {
-	class Routine
+public:
+	Routine() = default;
+	virtual ~Routine() = default;
+
+	virtual const void *getEntry(int index = 0) const = 0;
+};
+
+// RoutineT is a type-safe wrapper around a Routine and its callable entry, returned by FunctionT
+template<typename FunctionType>
+class RoutineT;
+
+template<typename Return, typename... Arguments>
+class RoutineT<Return(Arguments...)>
+{
+public:
+	RoutineT() = default;
+
+	explicit RoutineT(const std::shared_ptr<Routine>& routine)
+		: routine(routine)
 	{
-	public:
-		Routine() = default;
-		virtual ~Routine() = default;
+		if(routine)
+		{
+			callable = reinterpret_cast<CallableType>(const_cast<void*>(routine->getEntry(0)));
+		}
+	}
 
-		virtual const void *getEntry(int index = 0) const = 0;
-	};
-
-	// RoutineT is a type-safe wrapper around a Routine and its callable entry, returned by FunctionT
-	template<typename FunctionType>
-	class RoutineT;
-
-	template<typename Return, typename... Arguments>
-	class RoutineT<Return(Arguments...)>
+	operator bool() const
 	{
-	public:
-		RoutineT() = default;
+		return callable != nullptr;
+	}
 
-		explicit RoutineT(const std::shared_ptr<Routine>& routine)
-			: routine(routine)
-		{
-			if (routine)
-			{
-				callable = reinterpret_cast<CallableType>(const_cast<void*>(routine->getEntry(0)));
-			}
-		}
+	template <typename... Args>
+	Return operator()(Args&&... args) const
+	{
+		return callable(std::forward<Args>(args)...);
+	}
 
-		operator bool() const
-		{
-			return callable != nullptr;
-		}
+	const void* getEntry() const
+	{
+		return reinterpret_cast<void*>(callable);
+	}
 
-		template <typename... Args>
-		Return operator()(Args&&... args) const
-		{
-			return callable(std::forward<Args>(args)...);
-		}
+private:
+	std::shared_ptr<Routine> routine;
+	using CallableType = Return(*)(Arguments...);
+	CallableType callable = nullptr;
+};
 
-		const void* getEntry() const
-		{
-			return reinterpret_cast<void*>(callable);
-		}
-
-	private:
-		std::shared_ptr<Routine> routine;
-		using CallableType = Return(*)(Arguments...);
-		CallableType callable = nullptr;
-	};
-}
+}  // namespace rr
 
 #endif   // rr_Routine_hpp
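
As exercised by the tests above, FunctionT hands back a RoutineT: operator bool
reports whether an entry point was resolved, and operator() forwards straight to
the JIT-compiled code. A minimal usage sketch (function body elided, "example"
is an arbitrary name):

	rr::FunctionT<int(void *)> function;
	{
		// ... emit the function body with Reactor ...
		rr::Return(0);
	}

	auto routine = function("example");  // RoutineT<int(void *)>

	if(routine)                          // entry successfully resolved
	{
		int result = routine(nullptr);   // calls the generated code
	}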
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 8b2d19c..8683862 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -54,1811 +54,1810 @@
 #include <limits>
 #include <iostream>
 
-namespace rr
+namespace rr { class ELFMemoryStreamer; }
+
+namespace {
+
+// Default configuration settings. Must be accessed under mutex lock.
+std::mutex defaultConfigLock;
+rr::Config &defaultConfig()
 {
-	class ELFMemoryStreamer;
+	// This uses a static in a function to avoid the cost of a global static
+	// initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
+	static rr::Config config = rr::Config::Edit()
+		.apply({});
+	return config;
 }
 
-namespace
+Ice::GlobalContext *context = nullptr;
+Ice::Cfg *function = nullptr;
+Ice::CfgNode *basicBlock = nullptr;
+Ice::CfgLocalAllocatorScope *allocator = nullptr;
+rr::ELFMemoryStreamer *routine = nullptr;
+
+std::mutex codegenMutex;
+
+Ice::ELFFileStreamer *elfFile = nullptr;
+Ice::Fdstream *out = nullptr;
+
+}  // anonymous namespace
+
+namespace {
+
+#if !defined(__i386__) && defined(_M_IX86)
+	#define __i386__ 1
+#endif
+
+#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
+	#define __x86_64__ 1
+#endif
+
+static Ice::OptLevel toIce(rr::Optimization::Level level)
 {
-	// Default configuration settings. Must be accessed under mutex lock.
-	std::mutex defaultConfigLock;
-	rr::Config &defaultConfig()
+	switch(level)
 	{
-		// This uses a static in a function to avoid the cost of a global static
-		// initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
-		static rr::Config config = rr::Config::Edit()
-			.apply({});
-		return config;
+	// Note that Opt_0 and Opt_1 are not implemented by Subzero
+	case rr::Optimization::Level::None:       return Ice::Opt_m1;
+	case rr::Optimization::Level::Less:       return Ice::Opt_m1;
+	case rr::Optimization::Level::Default:    return Ice::Opt_2;
+	case rr::Optimization::Level::Aggressive: return Ice::Opt_2;
+	default: UNREACHABLE("Unknown Optimization Level %d", int(level));
 	}
-
-	Ice::GlobalContext *context = nullptr;
-	Ice::Cfg *function = nullptr;
-	Ice::CfgNode *basicBlock = nullptr;
-	Ice::CfgLocalAllocatorScope *allocator = nullptr;
-	rr::ELFMemoryStreamer *routine = nullptr;
-
-	std::mutex codegenMutex;
-
-	Ice::ELFFileStreamer *elfFile = nullptr;
-	Ice::Fdstream *out = nullptr;
+	return Ice::Opt_2;
 }
 
-namespace
+class CPUID
 {
-	#if !defined(__i386__) && defined(_M_IX86)
-		#define __i386__ 1
-	#endif
+public:
+	const static bool ARM;
+	const static bool SSE4_1;
 
-	#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
-		#define __x86_64__ 1
-	#endif
-
-	static Ice::OptLevel toIce(rr::Optimization::Level level)
+private:
+	static void cpuid(int registers[4], int info)
 	{
-		switch (level)
-		{
-			// Note that Opt_0 and Opt_1 are not implemented by Subzero
-			case rr::Optimization::Level::None:       return Ice::Opt_m1;
-			case rr::Optimization::Level::Less:       return Ice::Opt_m1;
-			case rr::Optimization::Level::Default:    return Ice::Opt_2;
-			case rr::Optimization::Level::Aggressive: return Ice::Opt_2;
-			default: UNREACHABLE("Unknown Optimization Level %d", int(level));
-		}
-		return Ice::Opt_2;
+		#if defined(__i386__) || defined(__x86_64__)
+			#if defined(_WIN32)
+				__cpuid(registers, info);
+			#else
+				__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
+			#endif
+		#else
+			registers[0] = 0;
+			registers[1] = 0;
+			registers[2] = 0;
+			registers[3] = 0;
+		#endif
 	}
 
-	class CPUID
+	static bool detectARM()
 	{
-	public:
-		const static bool ARM;
-		const static bool SSE4_1;
+		#if defined(__arm__) || defined(__aarch64__)
+			return true;
+		#elif defined(__i386__) || defined(__x86_64__)
+			return false;
+		#elif defined(__mips__)
+			return false;
+		#else
+			#error "Unknown architecture"
+		#endif
+	}
 
-	private:
-		static void cpuid(int registers[4], int info)
-		{
-			#if defined(__i386__) || defined(__x86_64__)
-				#if defined(_WIN32)
-					__cpuid(registers, info);
-				#else
-					__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
-				#endif
-			#else
-				registers[0] = 0;
-				registers[1] = 0;
-				registers[2] = 0;
-				registers[3] = 0;
-			#endif
-		}
+	static bool detectSSE4_1()
+	{
+		#if defined(__i386__) || defined(__x86_64__)
+			int registers[4];
+			cpuid(registers, 1);
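+			// CPUID leaf 1 reports SSE4.1 support in ECX bit 19, hence the 0x00080000 mask.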
+			return (registers[2] & 0x00080000) != 0;
+		#else
+			return false;
+		#endif
+	}
+};
 
-		static bool detectARM()
-		{
-			#if defined(__arm__) || defined(__aarch64__)
-				return true;
-			#elif defined(__i386__) || defined(__x86_64__)
-				return false;
-			#elif defined(__mips__)
-				return false;
-			#else
-				#error "Unknown architecture"
-			#endif
-		}
+const bool CPUID::ARM = CPUID::detectARM();
+const bool CPUID::SSE4_1 = CPUID::detectSSE4_1();
+const bool emulateIntrinsics = false;
+const bool emulateMismatchedBitCast = CPUID::ARM;
 
-		static bool detectSSE4_1()
-		{
-			#if defined(__i386__) || defined(__x86_64__)
-				int registers[4];
-				cpuid(registers, 1);
-				return (registers[2] & 0x00080000) != 0;
-			#else
-				return false;
-			#endif
-		}
-	};
-
-	const bool CPUID::ARM = CPUID::detectARM();
-	const bool CPUID::SSE4_1 = CPUID::detectSSE4_1();
-	const bool emulateIntrinsics = false;
-	const bool emulateMismatchedBitCast = CPUID::ARM;
-
-	constexpr bool subzeroDumpEnabled = false;
-	constexpr bool subzeroEmitTextAsm = false;
+constexpr bool subzeroDumpEnabled = false;
+constexpr bool subzeroEmitTextAsm = false;
 
 #if !ALLOW_DUMP
-	static_assert(!subzeroDumpEnabled, "Compile Subzero with ALLOW_DUMP=1 for subzeroDumpEnabled");
-	static_assert(!subzeroEmitTextAsm, "Compile Subzero with ALLOW_DUMP=1 for subzeroEmitTextAsm");
+static_assert(!subzeroDumpEnabled, "Compile Subzero with ALLOW_DUMP=1 for subzeroDumpEnabled");
+static_assert(!subzeroEmitTextAsm, "Compile Subzero with ALLOW_DUMP=1 for subzeroEmitTextAsm");
 #endif
+
+}  // anonymous namespace
+
+namespace rr {
+
+const Capabilities Caps =
+{
+	false, // CoroutinesSupported
+};
+
+enum EmulatedType
+{
+	EmulatedShift = 16,
+	EmulatedV2 = 2 << EmulatedShift,
+	EmulatedV4 = 4 << EmulatedShift,
+	EmulatedV8 = 8 << EmulatedShift,
+	EmulatedBits = EmulatedV2 | EmulatedV4 | EmulatedV8,
+
+	Type_v2i32 = Ice::IceType_v4i32 | EmulatedV2,
+	Type_v4i16 = Ice::IceType_v8i16 | EmulatedV4,
+	Type_v2i16 = Ice::IceType_v8i16 | EmulatedV2,
+	Type_v8i8 =  Ice::IceType_v16i8 | EmulatedV8,
+	Type_v4i8 =  Ice::IceType_v16i8 | EmulatedV4,
+	Type_v2f32 = Ice::IceType_v4f32 | EmulatedV2,
+};
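+// For example, Type_v4i16 == IceType_v8i16 | (4 << EmulatedShift): T() below strips
+// EmulatedBits to recover the underlying Ice type for code generation, while
+// typeSize() uses the full value to report the emulated width (8 bytes in this case).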
+
+class Value : public Ice::Operand {};
+class SwitchCases : public Ice::InstSwitch {};
+class BasicBlock : public Ice::CfgNode {};
+
+Ice::Type T(Type *t)
+{
+	static_assert(static_cast<unsigned int>(Ice::IceType_NUM) < static_cast<unsigned int>(EmulatedBits), "Ice::Type overlaps with our emulated types!");
+	return (Ice::Type)(reinterpret_cast<std::intptr_t>(t) & ~EmulatedBits);
 }
 
-namespace rr
+Type *T(Ice::Type t)
 {
-	const Capabilities Caps =
+	return reinterpret_cast<Type*>(t);
+}
+
+Type *T(EmulatedType t)
+{
+	return reinterpret_cast<Type*>(t);
+}
+
+Value *V(Ice::Operand *v)
+{
+	return reinterpret_cast<Value*>(v);
+}
+
+BasicBlock *B(Ice::CfgNode *b)
+{
+	return reinterpret_cast<BasicBlock*>(b);
+}
+
+static size_t typeSize(Type *type)
+{
+	if(reinterpret_cast<std::intptr_t>(type) & EmulatedBits)
 	{
-		false, // CoroutinesSupported
-	};
-
-	enum EmulatedType
-	{
-		EmulatedShift = 16,
-		EmulatedV2 = 2 << EmulatedShift,
-		EmulatedV4 = 4 << EmulatedShift,
-		EmulatedV8 = 8 << EmulatedShift,
-		EmulatedBits = EmulatedV2 | EmulatedV4 | EmulatedV8,
-
-		Type_v2i32 = Ice::IceType_v4i32 | EmulatedV2,
-		Type_v4i16 = Ice::IceType_v8i16 | EmulatedV4,
-		Type_v2i16 = Ice::IceType_v8i16 | EmulatedV2,
-		Type_v8i8 =  Ice::IceType_v16i8 | EmulatedV8,
-		Type_v4i8 =  Ice::IceType_v16i8 | EmulatedV4,
-		Type_v2f32 = Ice::IceType_v4f32 | EmulatedV2,
-	};
-
-	class Value : public Ice::Operand {};
-	class SwitchCases : public Ice::InstSwitch {};
-	class BasicBlock : public Ice::CfgNode {};
-
-	Ice::Type T(Type *t)
-	{
-		static_assert(static_cast<unsigned int>(Ice::IceType_NUM) < static_cast<unsigned int>(EmulatedBits), "Ice::Type overlaps with our emulated types!");
-		return (Ice::Type)(reinterpret_cast<std::intptr_t>(t) & ~EmulatedBits);
-	}
-
-	Type *T(Ice::Type t)
-	{
-		return reinterpret_cast<Type*>(t);
-	}
-
-	Type *T(EmulatedType t)
-	{
-		return reinterpret_cast<Type*>(t);
-	}
-
-	Value *V(Ice::Operand *v)
-	{
-		return reinterpret_cast<Value*>(v);
-	}
-
-	BasicBlock *B(Ice::CfgNode *b)
-	{
-		return reinterpret_cast<BasicBlock*>(b);
-	}
-
-	static size_t typeSize(Type *type)
-	{
-		if(reinterpret_cast<std::intptr_t>(type) & EmulatedBits)
+		switch(reinterpret_cast<std::intptr_t>(type))
 		{
-			switch(reinterpret_cast<std::intptr_t>(type))
-			{
-			case Type_v2i32: return 8;
-			case Type_v4i16: return 8;
-			case Type_v2i16: return 4;
-			case Type_v8i8:  return 8;
-			case Type_v4i8:  return 4;
-			case Type_v2f32: return 8;
-			default: ASSERT(false);
-			}
+		case Type_v2i32: return 8;
+		case Type_v4i16: return 8;
+		case Type_v2i16: return 4;
+		case Type_v8i8:  return 8;
+		case Type_v4i8:  return 4;
+		case Type_v2f32: return 8;
+		default: ASSERT(false);
+		}
+	}
+
+	return Ice::typeWidthInBytes(T(type));
+}
+
+using ElfHeader = std::conditional<sizeof(void*) == 8, Elf64_Ehdr, Elf32_Ehdr>::type;
+using SectionHeader = std::conditional<sizeof(void*) == 8, Elf64_Shdr, Elf32_Shdr>::type;
+
+inline const SectionHeader *sectionHeader(const ElfHeader *elfHeader)
+{
+	return reinterpret_cast<const SectionHeader*>((intptr_t)elfHeader + elfHeader->e_shoff);
+}
+
+inline const SectionHeader *elfSection(const ElfHeader *elfHeader, int index)
+{
+	return &sectionHeader(elfHeader)[index];
+}
+
+static void *relocateSymbol(const ElfHeader *elfHeader, const Elf32_Rel &relocation, const SectionHeader &relocationTable)
+{
+	const SectionHeader *target = elfSection(elfHeader, relocationTable.sh_info);
+
+	uint32_t index = relocation.getSymbol();
+	int table = relocationTable.sh_link;
+	void *symbolValue = nullptr;
+
+	if(index != SHN_UNDEF)
+	{
+		if(table == SHN_UNDEF) return nullptr;
+		const SectionHeader *symbolTable = elfSection(elfHeader, table);
+
+		uint32_t symtab_entries = symbolTable->sh_size / symbolTable->sh_entsize;
+		if(index >= symtab_entries)
+		{
+			ASSERT(index < symtab_entries && "Symbol Index out of range");
+			return nullptr;
 		}
 
-		return Ice::typeWidthInBytes(T(type));
-	}
+		intptr_t symbolAddress = (intptr_t)elfHeader + symbolTable->sh_offset;
+		Elf32_Sym &symbol = ((Elf32_Sym*)symbolAddress)[index];
+		uint16_t section = symbol.st_shndx;
 
-	using ElfHeader = std::conditional<sizeof(void*) == 8, Elf64_Ehdr, Elf32_Ehdr>::type;
-	using SectionHeader = std::conditional<sizeof(void*) == 8, Elf64_Shdr, Elf32_Shdr>::type;
-
-	inline const SectionHeader *sectionHeader(const ElfHeader *elfHeader)
-	{
-		return reinterpret_cast<const SectionHeader*>((intptr_t)elfHeader + elfHeader->e_shoff);
-	}
-
-	inline const SectionHeader *elfSection(const ElfHeader *elfHeader, int index)
-	{
-		return &sectionHeader(elfHeader)[index];
-	}
-
-	static void *relocateSymbol(const ElfHeader *elfHeader, const Elf32_Rel &relocation, const SectionHeader &relocationTable)
-	{
-		const SectionHeader *target = elfSection(elfHeader, relocationTable.sh_info);
-
-		uint32_t index = relocation.getSymbol();
-		int table = relocationTable.sh_link;
-		void *symbolValue = nullptr;
-
-		if(index != SHN_UNDEF)
+		if(section != SHN_UNDEF && section < SHN_LORESERVE)
 		{
-			if(table == SHN_UNDEF) return nullptr;
-			const SectionHeader *symbolTable = elfSection(elfHeader, table);
-
-			uint32_t symtab_entries = symbolTable->sh_size / symbolTable->sh_entsize;
-			if(index >= symtab_entries)
-			{
-				ASSERT(index < symtab_entries && "Symbol Index out of range");
-				return nullptr;
-			}
-
-			intptr_t symbolAddress = (intptr_t)elfHeader + symbolTable->sh_offset;
-			Elf32_Sym &symbol = ((Elf32_Sym*)symbolAddress)[index];
-			uint16_t section = symbol.st_shndx;
-
-			if(section != SHN_UNDEF && section < SHN_LORESERVE)
-			{
-				const SectionHeader *target = elfSection(elfHeader, symbol.st_shndx);
-				symbolValue = reinterpret_cast<void*>((intptr_t)elfHeader + symbol.st_value + target->sh_offset);
-			}
-			else
-			{
-				return nullptr;
-			}
-		}
-
-		intptr_t address = (intptr_t)elfHeader + target->sh_offset;
-		unaligned_ptr<int32_t> patchSite = (int32_t*)(address + relocation.r_offset);
-
-		if(CPUID::ARM)
-		{
-			switch(relocation.getType())
-			{
-			case R_ARM_NONE:
-				// No relocation
-				break;
-			case R_ARM_MOVW_ABS_NC:
-				{
-					uint32_t thumb = 0;   // Calls to Thumb code not supported.
-					uint32_t lo = (uint32_t)(intptr_t)symbolValue | thumb;
-					*patchSite = (*patchSite & 0xFFF0F000) | ((lo & 0xF000) << 4) | (lo & 0x0FFF);
-				}
-				break;
-			case R_ARM_MOVT_ABS:
-				{
-					uint32_t hi = (uint32_t)(intptr_t)(symbolValue) >> 16;
-					*patchSite = (*patchSite & 0xFFF0F000) | ((hi & 0xF000) << 4) | (hi & 0x0FFF);
-				}
-				break;
-			default:
-				ASSERT(false && "Unsupported relocation type");
-				return nullptr;
-			}
+			const SectionHeader *target = elfSection(elfHeader, symbol.st_shndx);
+			symbolValue = reinterpret_cast<void*>((intptr_t)elfHeader + symbol.st_value + target->sh_offset);
 		}
 		else
 		{
-			switch(relocation.getType())
-			{
-			case R_386_NONE:
-				// No relocation
-				break;
-			case R_386_32:
-				*patchSite = (int32_t)((intptr_t)symbolValue + *patchSite);
-				break;
-			case R_386_PC32:
-				*patchSite = (int32_t)((intptr_t)symbolValue + *patchSite - (intptr_t)patchSite);
-				break;
-			default:
-				ASSERT(false && "Unsupported relocation type");
-				return nullptr;
-			}
+			return nullptr;
 		}
-
-		return symbolValue;
 	}
 
-	static void *relocateSymbol(const ElfHeader *elfHeader, const Elf64_Rela &relocation, const SectionHeader &relocationTable)
+	intptr_t address = (intptr_t)elfHeader + target->sh_offset;
+	unaligned_ptr<int32_t> patchSite = (int32_t*)(address + relocation.r_offset);
+
+	if(CPUID::ARM)
 	{
-		const SectionHeader *target = elfSection(elfHeader, relocationTable.sh_info);
-
-		uint32_t index = relocation.getSymbol();
-		int table = relocationTable.sh_link;
-		void *symbolValue = nullptr;
-
-		if(index != SHN_UNDEF)
-		{
-			if(table == SHN_UNDEF) return nullptr;
-			const SectionHeader *symbolTable = elfSection(elfHeader, table);
-
-			uint32_t symtab_entries = symbolTable->sh_size / symbolTable->sh_entsize;
-			if(index >= symtab_entries)
-			{
-				ASSERT(index < symtab_entries && "Symbol Index out of range");
-				return nullptr;
-			}
-
-			intptr_t symbolAddress = (intptr_t)elfHeader + symbolTable->sh_offset;
-			Elf64_Sym &symbol = ((Elf64_Sym*)symbolAddress)[index];
-			uint16_t section = symbol.st_shndx;
-
-			if(section != SHN_UNDEF && section < SHN_LORESERVE)
-			{
-				const SectionHeader *target = elfSection(elfHeader, symbol.st_shndx);
-				symbolValue = reinterpret_cast<void*>((intptr_t)elfHeader + symbol.st_value + target->sh_offset);
-			}
-			else
-			{
-				return nullptr;
-			}
-		}
-
-		intptr_t address = (intptr_t)elfHeader + target->sh_offset;
-		unaligned_ptr<int32_t> patchSite32 = (int32_t*)(address + relocation.r_offset);
-		unaligned_ptr<int64_t> patchSite64 = (int64_t*)(address + relocation.r_offset);
-
 		switch(relocation.getType())
 		{
-		case R_X86_64_NONE:
+		case R_ARM_NONE:
 			// No relocation
 			break;
-		case R_X86_64_64:
-			*patchSite64 = (int64_t)((intptr_t)symbolValue + *patchSite64 + relocation.r_addend);
+		case R_ARM_MOVW_ABS_NC:
+			{
+				uint32_t thumb = 0;   // Calls to Thumb code not supported.
+				uint32_t lo = (uint32_t)(intptr_t)symbolValue | thumb;
+				*patchSite = (*patchSite & 0xFFF0F000) | ((lo & 0xF000) << 4) | (lo & 0x0FFF);
+			}
 			break;
-		case R_X86_64_PC32:
-			*patchSite32 = (int32_t)((intptr_t)symbolValue + *patchSite32 - (intptr_t)patchSite32 + relocation.r_addend);
-			break;
-		case R_X86_64_32S:
-			*patchSite32 = (int32_t)((intptr_t)symbolValue + *patchSite32 + relocation.r_addend);
+		case R_ARM_MOVT_ABS:
+			{
+				uint32_t hi = (uint32_t)(intptr_t)(symbolValue) >> 16;
+				*patchSite = (*patchSite & 0xFFF0F000) | ((hi & 0xF000) << 4) | (hi & 0x0FFF);
+			}
 			break;
 		default:
 			ASSERT(false && "Unsupported relocation type");
 			return nullptr;
 		}
-
-		return symbolValue;
+	}
+	else
+	{
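+		// x86-32 uses REL entries: the addend is the value already stored at the patch
+		// site, so R_386_32 writes S + A and R_386_PC32 writes S + A - P (PC-relative).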
+		switch(relocation.getType())
+		{
+		case R_386_NONE:
+			// No relocation
+			break;
+		case R_386_32:
+			*patchSite = (int32_t)((intptr_t)symbolValue + *patchSite);
+			break;
+		case R_386_PC32:
+			*patchSite = (int32_t)((intptr_t)symbolValue + *patchSite - (intptr_t)patchSite);
+			break;
+		default:
+			ASSERT(false && "Unsupported relocation type");
+			return nullptr;
+		}
 	}
 
-	void *loadImage(uint8_t *const elfImage, size_t &codeSize)
-	{
-		ElfHeader *elfHeader = (ElfHeader*)elfImage;
+	return symbolValue;
+}
 
-		if(!elfHeader->checkMagic())
+static void *relocateSymbol(const ElfHeader *elfHeader, const Elf64_Rela &relocation, const SectionHeader &relocationTable)
+{
+	const SectionHeader *target = elfSection(elfHeader, relocationTable.sh_info);
+
+	uint32_t index = relocation.getSymbol();
+	int table = relocationTable.sh_link;
+	void *symbolValue = nullptr;
+
+	if(index != SHN_UNDEF)
+	{
+		if(table == SHN_UNDEF) return nullptr;
+		const SectionHeader *symbolTable = elfSection(elfHeader, table);
+
+		uint32_t symtab_entries = symbolTable->sh_size / symbolTable->sh_entsize;
+		if(index >= symtab_entries)
 		{
+			ASSERT(index < symtab_entries && "Symbol Index out of range");
 			return nullptr;
 		}
 
-		// Expect ELF bitness to match platform
-		ASSERT(sizeof(void*) == 8 ? elfHeader->getFileClass() == ELFCLASS64 : elfHeader->getFileClass() == ELFCLASS32);
-		#if defined(__i386__)
-			ASSERT(sizeof(void*) == 4 && elfHeader->e_machine == EM_386);
-		#elif defined(__x86_64__)
-			ASSERT(sizeof(void*) == 8 && elfHeader->e_machine == EM_X86_64);
-		#elif defined(__arm__)
-			ASSERT(sizeof(void*) == 4 && elfHeader->e_machine == EM_ARM);
-		#elif defined(__aarch64__)
-			ASSERT(sizeof(void*) == 8 && elfHeader->e_machine == EM_AARCH64);
-		#elif defined(__mips__)
-			ASSERT(sizeof(void*) == 4 && elfHeader->e_machine == EM_MIPS);
-		#else
-			#error "Unsupported platform"
-		#endif
+		intptr_t symbolAddress = (intptr_t)elfHeader + symbolTable->sh_offset;
+		Elf64_Sym &symbol = ((Elf64_Sym*)symbolAddress)[index];
+		uint16_t section = symbol.st_shndx;
 
-		SectionHeader *sectionHeader = (SectionHeader*)(elfImage + elfHeader->e_shoff);
-		void *entry = nullptr;
-
-		for(int i = 0; i < elfHeader->e_shnum; i++)
+		if(section != SHN_UNDEF && section < SHN_LORESERVE)
 		{
-			if(sectionHeader[i].sh_type == SHT_PROGBITS)
-			{
-				if(sectionHeader[i].sh_flags & SHF_EXECINSTR)
-				{
-					entry = elfImage + sectionHeader[i].sh_offset;
-					codeSize = sectionHeader[i].sh_size;
-				}
-			}
-			else if(sectionHeader[i].sh_type == SHT_REL)
-			{
-				ASSERT(sizeof(void*) == 4 && "UNIMPLEMENTED");   // Only expected/implemented for 32-bit code
+			const SectionHeader *target = elfSection(elfHeader, symbol.st_shndx);
+			symbolValue = reinterpret_cast<void*>((intptr_t)elfHeader + symbol.st_value + target->sh_offset);
+		}
+		else
+		{
+			return nullptr;
+		}
+	}
 
-				for(Elf32_Word index = 0; index < sectionHeader[i].sh_size / sectionHeader[i].sh_entsize; index++)
-				{
-					const Elf32_Rel &relocation = ((const Elf32_Rel*)(elfImage + sectionHeader[i].sh_offset))[index];
-					relocateSymbol(elfHeader, relocation, sectionHeader[i]);
-				}
-			}
-			else if(sectionHeader[i].sh_type == SHT_RELA)
-			{
-				ASSERT(sizeof(void*) == 8 && "UNIMPLEMENTED");   // Only expected/implemented for 64-bit code
+	intptr_t address = (intptr_t)elfHeader + target->sh_offset;
+	unaligned_ptr<int32_t> patchSite32 = (int32_t*)(address + relocation.r_offset);
+	unaligned_ptr<int64_t> patchSite64 = (int64_t*)(address + relocation.r_offset);
 
-				for(Elf32_Word index = 0; index < sectionHeader[i].sh_size / sectionHeader[i].sh_entsize; index++)
-				{
-					const Elf64_Rela &relocation = ((const Elf64_Rela*)(elfImage + sectionHeader[i].sh_offset))[index];
-					relocateSymbol(elfHeader, relocation, sectionHeader[i]);
-				}
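+	// x86-64 uses RELA entries: the explicit r_addend is applied on top of whatever is
+	// already stored at the patch site, and the PC32 case additionally subtracts the
+	// patch-site address to form a PC-relative value.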
+	switch(relocation.getType())
+	{
+	case R_X86_64_NONE:
+		// No relocation
+		break;
+	case R_X86_64_64:
+		*patchSite64 = (int64_t)((intptr_t)symbolValue + *patchSite64 + relocation.r_addend);
+		break;
+	case R_X86_64_PC32:
+		*patchSite32 = (int32_t)((intptr_t)symbolValue + *patchSite32 - (intptr_t)patchSite32 + relocation.r_addend);
+		break;
+	case R_X86_64_32S:
+		*patchSite32 = (int32_t)((intptr_t)symbolValue + *patchSite32 + relocation.r_addend);
+		break;
+	default:
+		ASSERT(false && "Unsupported relocation type");
+		return nullptr;
+	}
+
+	return symbolValue;
+}
+
+void *loadImage(uint8_t *const elfImage, size_t &codeSize)
+{
+	ElfHeader *elfHeader = (ElfHeader*)elfImage;
+
+	if(!elfHeader->checkMagic())
+	{
+		return nullptr;
+	}
+
+	// Expect ELF bitness to match platform
+	ASSERT(sizeof(void*) == 8 ? elfHeader->getFileClass() == ELFCLASS64 : elfHeader->getFileClass() == ELFCLASS32);
+	#if defined(__i386__)
+		ASSERT(sizeof(void*) == 4 && elfHeader->e_machine == EM_386);
+	#elif defined(__x86_64__)
+		ASSERT(sizeof(void*) == 8 && elfHeader->e_machine == EM_X86_64);
+	#elif defined(__arm__)
+		ASSERT(sizeof(void*) == 4 && elfHeader->e_machine == EM_ARM);
+	#elif defined(__aarch64__)
+		ASSERT(sizeof(void*) == 8 && elfHeader->e_machine == EM_AARCH64);
+	#elif defined(__mips__)
+		ASSERT(sizeof(void*) == 4 && elfHeader->e_machine == EM_MIPS);
+	#else
+		#error "Unsupported platform"
+	#endif
+
+	SectionHeader *sectionHeader = (SectionHeader*)(elfImage + elfHeader->e_shoff);
+	void *entry = nullptr;
+
+	for(int i = 0; i < elfHeader->e_shnum; i++)
+	{
+		if(sectionHeader[i].sh_type == SHT_PROGBITS)
+		{
+			if(sectionHeader[i].sh_flags & SHF_EXECINSTR)
+			{
+				entry = elfImage + sectionHeader[i].sh_offset;
+				codeSize = sectionHeader[i].sh_size;
 			}
 		}
+		else if(sectionHeader[i].sh_type == SHT_REL)
+		{
+			ASSERT(sizeof(void*) == 4 && "UNIMPLEMENTED");   // Only expected/implemented for 32-bit code
 
+			for(Elf32_Word index = 0; index < sectionHeader[i].sh_size / sectionHeader[i].sh_entsize; index++)
+			{
+				const Elf32_Rel &relocation = ((const Elf32_Rel*)(elfImage + sectionHeader[i].sh_offset))[index];
+				relocateSymbol(elfHeader, relocation, sectionHeader[i]);
+			}
+		}
+		else if(sectionHeader[i].sh_type == SHT_RELA)
+		{
+			ASSERT(sizeof(void*) == 8 && "UNIMPLEMENTED");   // Only expected/implemented for 64-bit code
+
+			for(Elf32_Word index = 0; index < sectionHeader[i].sh_size / sectionHeader[i].sh_entsize; index++)
+			{
+				const Elf64_Rela &relocation = ((const Elf64_Rela*)(elfImage + sectionHeader[i].sh_offset))[index];
+				relocateSymbol(elfHeader, relocation, sectionHeader[i]);
+			}
+		}
+	}
+
+	return entry;
+}
+
+template<typename T>
+struct ExecutableAllocator
+{
+	ExecutableAllocator() {}
+	template<class U> ExecutableAllocator(const ExecutableAllocator<U> &other) {}
+
+	using value_type = T;
+	using size_type = std::size_t;
+
+	T *allocate(size_type n)
+	{
+		return (T*)allocateExecutable(sizeof(T) * n);
+	}
+
+	void deallocate(T *p, size_type n)
+	{
+		deallocateExecutable(p, sizeof(T) * n);
+	}
+};
+
+class ELFMemoryStreamer : public Ice::ELFStreamer, public Routine
+{
+	ELFMemoryStreamer(const ELFMemoryStreamer &) = delete;
+	ELFMemoryStreamer &operator=(const ELFMemoryStreamer &) = delete;
+
+public:
+	ELFMemoryStreamer() : Routine()
+	{
+		position = 0;
+		buffer.reserve(0x1000);
+	}
+
+	~ELFMemoryStreamer() override
+	{
+		#if defined(_WIN32)
+			if(buffer.size() != 0)
+			{
+				DWORD exeProtection;
+				VirtualProtect(&buffer[0], buffer.size(), oldProtection, &exeProtection);
+			}
+		#endif
+	}
+
+	void write8(uint8_t Value) override
+	{
+		if(position == (uint64_t)buffer.size())
+		{
+			buffer.push_back(Value);
+			position++;
+		}
+		else if(position < (uint64_t)buffer.size())
+		{
+			buffer[position] = Value;
+			position++;
+		}
+		else ASSERT(false && "UNIMPLEMENTED");
+	}
+
+	void writeBytes(llvm::StringRef Bytes) override
+	{
+		std::size_t oldSize = buffer.size();
+		buffer.resize(oldSize + Bytes.size());
+		memcpy(&buffer[oldSize], Bytes.begin(), Bytes.size());
+		position += Bytes.size();
+	}
+
+	uint64_t tell() const override { return position; }
+
+	void seek(uint64_t Off) override { position = Off; }
+
+	const void* finalizeEntryBegin()
+	{
+		position = std::numeric_limits<std::size_t>::max();   // Can't stream more data after this
+
+		size_t codeSize = 0;
+		const void *entry = loadImage(&buffer[0], codeSize);
+
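+		// Flip the buffer from writable to read/execute and flush the instruction cache before the code is run.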
+#if defined(_WIN32)
+		VirtualProtect(&buffer[0], buffer.size(), PAGE_EXECUTE_READ, &oldProtection);
+		FlushInstructionCache(GetCurrentProcess(), NULL, 0);
+#else
+		mprotect(&buffer[0], buffer.size(), PROT_READ | PROT_EXEC);
+		__builtin___clear_cache((char*)entry, (char*)entry + codeSize);
+#endif
 		return entry;
 	}
 
-	template<typename T>
-	struct ExecutableAllocator
+	void setEntry(int index, const void* func)
 	{
-		ExecutableAllocator() {}
-		template<class U> ExecutableAllocator(const ExecutableAllocator<U> &other) {}
+		ASSERT(func);
+		funcs[index] = func;
+	}
 
-		using value_type = T;
-		using size_type = std::size_t;
-
-		T *allocate(size_type n)
-		{
-			return (T*)allocateExecutable(sizeof(T) * n);
-		}
-
-		void deallocate(T *p, size_type n)
-		{
-			deallocateExecutable(p, sizeof(T) * n);
-		}
-	};
-
-	class ELFMemoryStreamer : public Ice::ELFStreamer, public Routine
+	const void *getEntry(int index) const override
 	{
-		ELFMemoryStreamer(const ELFMemoryStreamer &) = delete;
-		ELFMemoryStreamer &operator=(const ELFMemoryStreamer &) = delete;
+		ASSERT(funcs[index]);
+		return funcs[index];
+	}
 
-	public:
-		ELFMemoryStreamer() : Routine()
-		{
-			position = 0;
-			buffer.reserve(0x1000);
-		}
+	const void* addConstantData(const void* data, size_t size)
+	{
+		auto buf = std::unique_ptr<uint8_t[]>(new uint8_t[size]);
+		memcpy(buf.get(), data, size);
+		auto ptr = buf.get();
+		constantData.emplace_back(std::move(buf));
+		return ptr;
+	}
 
-		~ELFMemoryStreamer() override
-		{
-			#if defined(_WIN32)
-				if(buffer.size() != 0)
-				{
-					DWORD exeProtection;
-					VirtualProtect(&buffer[0], buffer.size(), oldProtection, &exeProtection);
-				}
-			#endif
-		}
+private:
+	std::array<const void*, Nucleus::CoroutineEntryCount> funcs = {};
+	std::vector<uint8_t, ExecutableAllocator<uint8_t>> buffer;
+	std::size_t position;
+	std::vector<std::unique_ptr<uint8_t[]>> constantData;
 
-		void write8(uint8_t Value) override
+	#if defined(_WIN32)
+	DWORD oldProtection;
+	#endif
+};
+
+Nucleus::Nucleus()
+{
+	::codegenMutex.lock();   // Reactor is currently not thread safe
+
+	Ice::ClFlags &Flags = Ice::ClFlags::Flags;
+	Ice::ClFlags::getParsedClFlags(Flags);
+
+	#if defined(__arm__)
+		Flags.setTargetArch(Ice::Target_ARM32);
+		Flags.setTargetInstructionSet(Ice::ARM32InstructionSet_HWDivArm);
+	#elif defined(__mips__)
+		Flags.setTargetArch(Ice::Target_MIPS32);
+		Flags.setTargetInstructionSet(Ice::BaseInstructionSet);
+	#else   // x86
+		Flags.setTargetArch(sizeof(void*) == 8 ? Ice::Target_X8664 : Ice::Target_X8632);
+		Flags.setTargetInstructionSet(CPUID::SSE4_1 ? Ice::X86InstructionSet_SSE4_1 : Ice::X86InstructionSet_SSE2);
+	#endif
+	Flags.setOutFileType(Ice::FT_Elf);
+	Flags.setOptLevel(toIce(getDefaultConfig().getOptimization().getLevel()));
+	Flags.setApplicationBinaryInterface(Ice::ABI_Platform);
+	Flags.setVerbose(subzeroDumpEnabled ? Ice::IceV_Most : Ice::IceV_None);
+	Flags.setDisableHybridAssembly(true);
+
+	static llvm::raw_os_ostream cout(std::cout);
+	static llvm::raw_os_ostream cerr(std::cerr);
+
+	if(subzeroEmitTextAsm)
+	{
+		// Decorate text asm with liveness info
+		Flags.setDecorateAsm(true);
+	}
+
+	if(false)   // Write out to a file
+	{
+		std::error_code errorCode;
+		::out = new Ice::Fdstream("out.o", errorCode, llvm::sys::fs::F_None);
+		::elfFile = new Ice::ELFFileStreamer(*out);
+		::context = new Ice::GlobalContext(&cout, &cout, &cerr, elfFile);
+	}
+	else
+	{
+		ELFMemoryStreamer *elfMemory = new ELFMemoryStreamer();
+		::context = new Ice::GlobalContext(&cout, &cout, &cerr, elfMemory);
+		::routine = elfMemory;
+	}
+}
+
+Nucleus::~Nucleus()
+{
+	delete ::routine;
+
+	delete ::allocator;
+	delete ::function;
+	delete ::context;
+
+	delete ::elfFile;
+	delete ::out;
+
+	::codegenMutex.unlock();
+}
+
+void Nucleus::setDefaultConfig(const Config &cfg)
+{
+	std::unique_lock<std::mutex> lock(::defaultConfigLock);
+	::defaultConfig() = cfg;
+}
+
+void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
+{
+	std::unique_lock<std::mutex> lock(::defaultConfigLock);
+	auto &config = ::defaultConfig();
+	config = cfgEdit.apply(config);
+}
+
+Config Nucleus::getDefaultConfig()
+{
+	std::unique_lock<std::mutex> lock(::defaultConfigLock);
+	return ::defaultConfig();
+}
+
+std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
+{
+	if(subzeroDumpEnabled)
+	{
+		// Output dump strings immediately, rather than once the buffer is full. Useful for debugging.
+		context->getStrDump().SetUnbuffered();
+	}
+
+	if(basicBlock->getInsts().empty() || basicBlock->getInsts().back().getKind() != Ice::Inst::Ret)
+	{
+		createRetVoid();
+	}
+
+	::function->setFunctionName(Ice::GlobalString::createWithString(::context, name));
+
+	rr::optimize(::function);
+
+	::function->computeInOutEdges();
+	ASSERT(!::function->hasError());
+
+	::function->translate();
+	ASSERT(!::function->hasError());
+
+	auto globals = ::function->getGlobalInits();
+
+	if(globals && !globals->empty())
+	{
+		::context->getGlobals()->merge(globals.get());
+	}
+
+	::context->emitFileHeader();
+
+	if(subzeroEmitTextAsm)
+	{
+		::function->emit();
+	}
+
+	::function->emitIAS();
+	auto assembler = ::function->releaseAssembler();
+	auto objectWriter = ::context->getObjectWriter();
+	assembler->alignFunction();
+	objectWriter->writeFunctionCode(::function->getFunctionName(), false, assembler.get());
+	::context->lowerGlobals("last");
+	::context->lowerConstants();
+	::context->lowerJumpTables();
+	objectWriter->setUndefinedSyms(::context->getConstantExternSyms());
+	objectWriter->writeNonUserSections();
+
+	const void* entryBegin = ::routine->finalizeEntryBegin();
+	::routine->setEntry(Nucleus::CoroutineEntryBegin, entryBegin);
+
+	Routine *handoffRoutine = ::routine;
+	::routine = nullptr;
+
+	return std::shared_ptr<Routine>(handoffRoutine);
+}
+
+Value *Nucleus::allocateStackVariable(Type *t, int arraySize)
+{
+	Ice::Type type = T(t);
+	int typeSize = Ice::typeWidthInBytes(type);
+	int totalSize = typeSize * (arraySize ? arraySize : 1);
+
+	auto bytes = Ice::ConstantInteger32::create(::context, Ice::IceType_i32, totalSize);
+	auto address = ::function->makeVariable(T(getPointerType(t)));
+	auto alloca = Ice::InstAlloca::create(::function, address, bytes, typeSize);
+	::function->getEntryNode()->getInsts().push_front(alloca);
+
+	return V(address);
+}
+
+BasicBlock *Nucleus::createBasicBlock()
+{
+	return B(::function->makeNode());
+}
+
+BasicBlock *Nucleus::getInsertBlock()
+{
+	return B(::basicBlock);
+}
+
+void Nucleus::setInsertBlock(BasicBlock *basicBlock)
+{
+//	ASSERT(::basicBlock->getInsts().back().getTerminatorEdges().size() >= 0 && "Previous basic block must have a terminator");
+
+	Variable::materializeAll();
+
+	::basicBlock = basicBlock;
+}
+
+void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
+{
+	uint32_t sequenceNumber = 0;
+	::function = Ice::Cfg::create(::context, sequenceNumber).release();
+	::allocator = new Ice::CfgLocalAllocatorScope(::function);
+
+	for(Type *type : Params)
+	{
+		Ice::Variable *arg = ::function->makeVariable(T(type));
+		::function->addArg(arg);
+	}
+
+	Ice::CfgNode *node = ::function->makeNode();
+	::function->setEntryNode(node);
+	::basicBlock = node;
+}
+
+Value *Nucleus::getArgument(unsigned int index)
+{
+	return V(::function->getArgs()[index]);
+}
+
+void Nucleus::createRetVoid()
+{
+	// Code generated after this point is unreachable, so any variables
+	// being read can safely return an undefined value. We have to avoid
+	// materializing variables after the terminator ret instruction.
+	Variable::killUnmaterialized();
+
+	Ice::InstRet *ret = Ice::InstRet::create(::function);
+	::basicBlock->appendInst(ret);
+}
+
+void Nucleus::createRet(Value *v)
+{
+	// Code generated after this point is unreachable, so any variables
+	// being read can safely return an undefined value. We have to avoid
+	// materializing variables after the terminator ret instruction.
+	Variable::killUnmaterialized();
+
+	Ice::InstRet *ret = Ice::InstRet::create(::function, v);
+	::basicBlock->appendInst(ret);
+}
+
+void Nucleus::createBr(BasicBlock *dest)
+{
+	Variable::materializeAll();
+
+	auto br = Ice::InstBr::create(::function, dest);
+	::basicBlock->appendInst(br);
+}
+
+void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
+{
+	Variable::materializeAll();
+
+	auto br = Ice::InstBr::create(::function, cond, ifTrue, ifFalse);
+	::basicBlock->appendInst(br);
+}
+
+static bool isCommutative(Ice::InstArithmetic::OpKind op)
+{
+	switch(op)
+	{
+	case Ice::InstArithmetic::Add:
+	case Ice::InstArithmetic::Fadd:
+	case Ice::InstArithmetic::Mul:
+	case Ice::InstArithmetic::Fmul:
+	case Ice::InstArithmetic::And:
+	case Ice::InstArithmetic::Or:
+	case Ice::InstArithmetic::Xor:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static Value *createArithmetic(Ice::InstArithmetic::OpKind op, Value *lhs, Value *rhs)
+{
+	ASSERT(lhs->getType() == rhs->getType() || llvm::isa<Ice::Constant>(rhs));
+
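+	// If the left operand is a constant and the operation commutes, swap the operands
+	// so that the constant ends up on the right-hand side.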
+	bool swapOperands = llvm::isa<Ice::Constant>(lhs) && isCommutative(op);
+
+	Ice::Variable *result = ::function->makeVariable(lhs->getType());
+	Ice::InstArithmetic *arithmetic = Ice::InstArithmetic::create(::function, op, result, swapOperands ? rhs : lhs, swapOperands ? lhs : rhs);
+	::basicBlock->appendInst(arithmetic);
+
+	return V(result);
+}
+
+Value *Nucleus::createAdd(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Add, lhs, rhs);
+}
+
+Value *Nucleus::createSub(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Sub, lhs, rhs);
+}
+
+Value *Nucleus::createMul(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Mul, lhs, rhs);
+}
+
+Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Udiv, lhs, rhs);
+}
+
+Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Sdiv, lhs, rhs);
+}
+
+Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Fadd, lhs, rhs);
+}
+
+Value *Nucleus::createFSub(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Fsub, lhs, rhs);
+}
+
+Value *Nucleus::createFMul(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Fmul, lhs, rhs);
+}
+
+Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Fdiv, lhs, rhs);
+}
+
+Value *Nucleus::createURem(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Urem, lhs, rhs);
+}
+
+Value *Nucleus::createSRem(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Srem, lhs, rhs);
+}
+
+Value *Nucleus::createFRem(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Frem, lhs, rhs);
+}
+
+Value *Nucleus::createShl(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Shl, lhs, rhs);
+}
+
+Value *Nucleus::createLShr(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Lshr, lhs, rhs);
+}
+
+Value *Nucleus::createAShr(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Ashr, lhs, rhs);
+}
+
+Value *Nucleus::createAnd(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::And, lhs, rhs);
+}
+
+Value *Nucleus::createOr(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Or, lhs, rhs);
+}
+
+Value *Nucleus::createXor(Value *lhs, Value *rhs)
+{
+	return createArithmetic(Ice::InstArithmetic::Xor, lhs, rhs);
+}
+
+Value *Nucleus::createNeg(Value *v)
+{
+	return createSub(createNullValue(T(v->getType())), v);
+}
+
+Value *Nucleus::createFNeg(Value *v)
+{
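+	// Negating as (-0.0 - x) effectively flips just the sign bit; (0.0 - x) would map
+	// x == +0.0 to +0.0 instead of -0.0.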
+	double c[4] = {-0.0, -0.0, -0.0, -0.0};
+	Value *negativeZero = Ice::isVectorType(v->getType()) ?
+	                      createConstantVector(c, T(v->getType())) :
+	                      V(::context->getConstantFloat(-0.0f));
+
+	return createFSub(negativeZero, v);
+}
+
+Value *Nucleus::createNot(Value *v)
+{
+	if(Ice::isScalarIntegerType(v->getType()))
+	{
+		return createXor(v, V(::context->getConstantInt(v->getType(), -1)));
+	}
+	else   // Vector
+	{
+		int64_t c[16] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+		return createXor(v, createConstantVector(c, T(v->getType())));
+	}
+}
+
+Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int align, bool atomic, std::memory_order memoryOrder)
+{
+	ASSERT(!atomic);  // Unimplemented
+	ASSERT(memoryOrder == std::memory_order_relaxed);  // Unimplemented
+
+	int valueType = (int)reinterpret_cast<intptr_t>(type);
+	Ice::Variable *result = ::function->makeVariable(T(type));
+
+	if((valueType & EmulatedBits) && (align != 0))   // Narrow vector not stored on stack.
+	{
+		if(emulateIntrinsics)
 		{
-			if(position == (uint64_t)buffer.size())
+			if(typeSize(type) == 4)
 			{
-				buffer.push_back(Value);
-				position++;
+				auto pointer = RValue<Pointer<Byte>>(ptr);
+				Int x = *Pointer<Int>(pointer);
+
+				Int4 vector;
+				vector = Insert(vector, x, 0);
+
+				auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
+				::basicBlock->appendInst(bitcast);
 			}
-			else if(position < (uint64_t)buffer.size())
+			else if(typeSize(type) == 8)
 			{
-				buffer[position] = Value;
-				position++;
+				auto pointer = RValue<Pointer<Byte>>(ptr);
+				Int x = *Pointer<Int>(pointer);
+				Int y = *Pointer<Int>(pointer + 4);
+
+				Int4 vector;
+				vector = Insert(vector, x, 0);
+				vector = Insert(vector, y, 1);
+
+				auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
+				::basicBlock->appendInst(bitcast);
 			}
-			else ASSERT(false && "UNIMPLEMENTED");
+			else UNREACHABLE("typeSize(type): %d", int(typeSize(type)));
 		}
-
-		void writeBytes(llvm::StringRef Bytes) override
+		else
 		{
-			std::size_t oldSize = buffer.size();
-			buffer.resize(oldSize + Bytes.size());
-			memcpy(&buffer[oldSize], Bytes.begin(), Bytes.size());
-			position += Bytes.size();
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto load = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			load->addArg(ptr);
+			load->addArg(::context->getConstantInt32(typeSize(type)));
+			::basicBlock->appendInst(load);
 		}
+	}
+	else
+	{
+		auto load = Ice::InstLoad::create(::function, result, ptr, align);
+		::basicBlock->appendInst(load);
+	}
 
-		uint64_t tell() const override { return position; }
+	return V(result);
+}
 
-		void seek(uint64_t Off) override { position = Off; }
+Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int align, bool atomic, std::memory_order memoryOrder)
+{
+	ASSERT(!atomic);  // Unimplemented
+	ASSERT(memoryOrder == std::memory_order_relaxed);  // Unimplemented
 
-		const void* finalizeEntryBegin()
+	#if __has_feature(memory_sanitizer)
+		// Mark all (non-stack) memory writes as initialized by calling __msan_unpoison
+		if(align != 0)
 		{
-			position = std::numeric_limits<std::size_t>::max();   // Can't stream more data after this
-
-			size_t codeSize = 0;
-			const void *entry = loadImage(&buffer[0], codeSize);
-
-#if defined(_WIN32)
-			VirtualProtect(&buffer[0], buffer.size(), PAGE_EXECUTE_READ, &oldProtection);
-			FlushInstructionCache(GetCurrentProcess(), NULL, 0);
-#else
-			mprotect(&buffer[0], buffer.size(), PROT_READ | PROT_EXEC);
-			__builtin___clear_cache((char*)entry, (char*)entry + codeSize);
-#endif
-			return entry;
+			auto call = Ice::InstCall::create(::function, 2, nullptr, ::context->getConstantInt64(reinterpret_cast<intptr_t>(__msan_unpoison)), false);
+			call->addArg(ptr);
+			call->addArg(::context->getConstantInt64(typeSize(type)));
+			::basicBlock->appendInst(call);
 		}
+	#endif
 
-		void setEntry(int index, const void* func)
+	int valueType = (int)reinterpret_cast<intptr_t>(type);
+
+	if((valueType & EmulatedBits) && (align != 0))   // Narrow vector not stored on stack.
+	{
+		if(emulateIntrinsics)
 		{
-			ASSERT(func);
-			funcs[index] = func;
+			if(typeSize(type) == 4)
+			{
+				Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
+				auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
+				::basicBlock->appendInst(bitcast);
+
+				RValue<Int4> v(V(vector));
+
+				auto pointer = RValue<Pointer<Byte>>(ptr);
+				Int x = Extract(v, 0);
+				*Pointer<Int>(pointer) = x;
+			}
+			else if(typeSize(type) == 8)
+			{
+				Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
+				auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
+				::basicBlock->appendInst(bitcast);
+
+				RValue<Int4> v(V(vector));
+
+				auto pointer = RValue<Pointer<Byte>>(ptr);
+				Int x = Extract(v, 0);
+				*Pointer<Int>(pointer) = x;
+				Int y = Extract(v, 1);
+				*Pointer<Int>(pointer + 4) = y;
+			}
+			else UNREACHABLE("typeSize(type): %d", int(typeSize(type)));
 		}
-
-		const void *getEntry(int index) const override
+		else
 		{
-			ASSERT(funcs[index]);
-			return funcs[index];
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto store = Ice::InstIntrinsicCall::create(::function, 3, nullptr, target, intrinsic);
+			store->addArg(value);
+			store->addArg(ptr);
+			store->addArg(::context->getConstantInt32(typeSize(type)));
+			::basicBlock->appendInst(store);
 		}
+	}
+	else
+	{
+		ASSERT(value->getType() == T(type));
 
-		const void* addConstantData(const void* data, size_t size)
+		auto store = Ice::InstStore::create(::function, value, ptr, align);
+		::basicBlock->appendInst(store);
+	}
+
+	return value;
+}
+
+Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
+{
+	ASSERT(index->getType() == Ice::IceType_i32);
+
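+	// A constant index folds into a byte offset at compile time, e.g. index 3 on a
+	// 16-byte type becomes ptr + 48.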
+	if(auto *constant = llvm::dyn_cast<Ice::ConstantInteger32>(index))
+	{
+		int32_t offset = constant->getValue() * (int)typeSize(type);
+
+		if(offset == 0)
 		{
-			auto buf = std::unique_ptr<uint8_t[]>(new uint8_t[size]);
-			memcpy(buf.get(), data, size);
-			auto ptr = buf.get();
-			constantData.emplace_back(std::move(buf));
 			return ptr;
 		}
 
-	private:
-		std::array<const void*, Nucleus::CoroutineEntryCount> funcs = {};
-		std::vector<uint8_t, ExecutableAllocator<uint8_t>> buffer;
-		std::size_t position;
-		std::vector<std::unique_ptr<uint8_t[]>> constantData;
+		return createAdd(ptr, createConstantInt(offset));
+	}
 
-		#if defined(_WIN32)
-		DWORD oldProtection;
+	if(!Ice::isByteSizedType(T(type)))
+	{
+		index = createMul(index, createConstantInt((int)typeSize(type)));
+	}
+
+	if(sizeof(void*) == 8)
+	{
+		if(unsignedIndex)
+		{
+			index = createZExt(index, T(Ice::IceType_i64));
+		}
+		else
+		{
+			index = createSExt(index, T(Ice::IceType_i64));
+		}
+	}
+
+	return createAdd(ptr, index);
+}
+
+Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	UNIMPLEMENTED("createAtomicAdd");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	UNIMPLEMENTED("createAtomicSub");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	UNIMPLEMENTED("createAtomicAnd");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	UNIMPLEMENTED("createAtomicOr");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	UNIMPLEMENTED("createAtomicXor");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	UNIMPLEMENTED("createAtomicMin");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	UNIMPLEMENTED("createAtomicMax");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	UNIMPLEMENTED("createAtomicUMin");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	UNIMPLEMENTED("createAtomicUMax");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
+{
+	UNIMPLEMENTED("createAtomicExchange");
+	return nullptr;
+}
+
+Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
+{
+	UNIMPLEMENTED("createAtomicCompareExchange");
+	return nullptr;
+}
+
+static Value *createCast(Ice::InstCast::OpKind op, Value *v, Type *destType)
+{
+	if(v->getType() == T(destType))
+	{
+		return v;
+	}
+
+	Ice::Variable *result = ::function->makeVariable(T(destType));
+	Ice::InstCast *cast = Ice::InstCast::create(::function, op, result, v);
+	::basicBlock->appendInst(cast);
+
+	return V(result);
+}
+
+Value *Nucleus::createTrunc(Value *v, Type *destType)
+{
+	return createCast(Ice::InstCast::Trunc, v, destType);
+}
+
+Value *Nucleus::createZExt(Value *v, Type *destType)
+{
+	return createCast(Ice::InstCast::Zext, v, destType);
+}
+
+Value *Nucleus::createSExt(Value *v, Type *destType)
+{
+	return createCast(Ice::InstCast::Sext, v, destType);
+}
+
+Value *Nucleus::createFPToUI(Value *v, Type *destType)
+{
+	return createCast(Ice::InstCast::Fptoui, v, destType);
+}
+
+Value *Nucleus::createFPToSI(Value *v, Type *destType)
+{
+	return createCast(Ice::InstCast::Fptosi, v, destType);
+}
+
+Value *Nucleus::createSIToFP(Value *v, Type *destType)
+{
+	return createCast(Ice::InstCast::Sitofp, v, destType);
+}
+
+Value *Nucleus::createFPTrunc(Value *v, Type *destType)
+{
+	return createCast(Ice::InstCast::Fptrunc, v, destType);
+}
+
+Value *Nucleus::createFPExt(Value *v, Type *destType)
+{
+	return createCast(Ice::InstCast::Fpext, v, destType);
+}
+
+Value *Nucleus::createBitCast(Value *v, Type *destType)
+{
+	// Bitcasts must be between types of the same logical size, but with emulated narrow
+	// vectors we need to support casting between scalars and wide vectors. On platforms
+	// where such casts are not supported directly, we emulate them by writing the value
+	// to the stack and reading it back as the destination type.
+	if(emulateMismatchedBitCast)
+	{
+		if(!Ice::isVectorType(v->getType()) && Ice::isVectorType(T(destType)))
+		{
+			Value *address = allocateStackVariable(destType);
+			createStore(v, address, T(v->getType()));
+			return createLoad(address, destType);
+		}
+		else if(Ice::isVectorType(v->getType()) && !Ice::isVectorType(T(destType)))
+		{
+			Value *address = allocateStackVariable(T(v->getType()));
+			createStore(v, address, T(v->getType()));
+			return createLoad(address, destType);
+		}
+	}
+
+	return createCast(Ice::InstCast::Bitcast, v, destType);
+}
+
+static Value *createIntCompare(Ice::InstIcmp::ICond condition, Value *lhs, Value *rhs)
+{
+	ASSERT(lhs->getType() == rhs->getType());
+
+	auto result = ::function->makeVariable(Ice::isScalarIntegerType(lhs->getType()) ? Ice::IceType_i1 : lhs->getType());
+	auto cmp = Ice::InstIcmp::create(::function, condition, result, lhs, rhs);
+	::basicBlock->appendInst(cmp);
+
+	return V(result);
+}
+
+Value *Nucleus::createPtrEQ(Value *lhs, Value *rhs)
+{
+	return createIntCompare(Ice::InstIcmp::Eq, lhs, rhs);
+}
+
+Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
+{
+	return createIntCompare(Ice::InstIcmp::Eq, lhs, rhs);
+}
+
+Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
+{
+	return createIntCompare(Ice::InstIcmp::Ne, lhs, rhs);
+}
+
+Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
+{
+	return createIntCompare(Ice::InstIcmp::Ugt, lhs, rhs);
+}
+
+Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
+{
+	return createIntCompare(Ice::InstIcmp::Uge, lhs, rhs);
+}
+
+Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
+{
+	return createIntCompare(Ice::InstIcmp::Ult, lhs, rhs);
+}
+
+Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
+{
+	return createIntCompare(Ice::InstIcmp::Ule, lhs, rhs);
+}
+
+Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
+{
+	return createIntCompare(Ice::InstIcmp::Sgt, lhs, rhs);
+}
+
+Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
+{
+	return createIntCompare(Ice::InstIcmp::Sge, lhs, rhs);
+}
+
+Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
+{
+	return createIntCompare(Ice::InstIcmp::Slt, lhs, rhs);
+}
+
+Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
+{
+	return createIntCompare(Ice::InstIcmp::Sle, lhs, rhs);
+}
+
+static Value *createFloatCompare(Ice::InstFcmp::FCond condition, Value *lhs, Value *rhs)
+{
+	ASSERT(lhs->getType() == rhs->getType());
+	ASSERT(Ice::isScalarFloatingType(lhs->getType()) || lhs->getType() == Ice::IceType_v4f32);
+
+	auto result = ::function->makeVariable(Ice::isScalarFloatingType(lhs->getType()) ? Ice::IceType_i1 : Ice::IceType_v4i32);
+	auto cmp = Ice::InstFcmp::create(::function, condition, result, lhs, rhs);
+	::basicBlock->appendInst(cmp);
+
+	return V(result);
+}
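+// Vector float comparisons produce a v4i32 lane mask rather than a
+// floating-point result, matching the integer mask convention of
+// createIntCompare.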
+
+Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
+{
+	return createFloatCompare(Ice::InstFcmp::Oeq, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
+{
+	return createFloatCompare(Ice::InstFcmp::Ogt, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
+{
+	return createFloatCompare(Ice::InstFcmp::Oge, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
+{
+	return createFloatCompare(Ice::InstFcmp::Olt, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
+{
+	return createFloatCompare(Ice::InstFcmp::Ole, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
+{
+	return createFloatCompare(Ice::InstFcmp::One, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
+{
+	return createFloatCompare(Ice::InstFcmp::Ord, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
+{
+	return createFloatCompare(Ice::InstFcmp::Uno, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
+{
+	return createFloatCompare(Ice::InstFcmp::Ueq, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
+{
+	return createFloatCompare(Ice::InstFcmp::Ugt, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
+{
+	return createFloatCompare(Ice::InstFcmp::Uge, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
+{
+	return createFloatCompare(Ice::InstFcmp::Ult, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
+{
+	return createFloatCompare(Ice::InstFcmp::Ule, lhs, rhs);
+}
+
+Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
+{
+	return createFloatCompare(Ice::InstFcmp::Une, lhs, rhs);
+}
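+// The O* conditions above are 'ordered': false when either operand is NaN.
+// The U* conditions are 'unordered': true when either operand is NaN. For
+// example, FCmpOEQ(NaN, x) yields false while FCmpUEQ(NaN, x) yields true,
+// mirroring LLVM's fcmp predicates.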
+
+Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
+{
+	auto result = ::function->makeVariable(T(type));
+	auto extract = Ice::InstExtractElement::create(::function, result, vector, ::context->getConstantInt32(index));
+	::basicBlock->appendInst(extract);
+
+	return V(result);
+}
+
+Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
+{
+	auto result = ::function->makeVariable(vector->getType());
+	auto insert = Ice::InstInsertElement::create(::function, result, vector, element, ::context->getConstantInt32(index));
+	::basicBlock->appendInst(insert);
+
+	return V(result);
+}
+
+Value *Nucleus::createShuffleVector(Value *V1, Value *V2, const int *select)
+{
+	ASSERT(V1->getType() == V2->getType());
+
+	int size = Ice::typeNumElements(V1->getType());
+	auto result = ::function->makeVariable(V1->getType());
+	auto shuffle = Ice::InstShuffleVector::create(::function, result, V1, V2);
+
+	for(int i = 0; i < size; i++)
+	{
+		shuffle->addIndex(llvm::cast<Ice::ConstantInteger32>(::context->getConstantInt32(select[i])));
+	}
+
+	::basicBlock->appendInst(shuffle);
+
+	return V(result);
+}
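+// Example: for two 4-element vectors, select = {3, 2, 1, 0} reverses the
+// lanes of V1, while indices 4..7 would pick lanes from V2, following the
+// usual shufflevector convention of indexing into the concatenation of both
+// operands.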
+
+Value *Nucleus::createSelect(Value *C, Value *ifTrue, Value *ifFalse)
+{
+	ASSERT(ifTrue->getType() == ifFalse->getType());
+
+	auto result = ::function->makeVariable(ifTrue->getType());
+	auto *select = Ice::InstSelect::create(::function, result, C, ifTrue, ifFalse);
+	::basicBlock->appendInst(select);
+
+	return V(result);
+}
+
+SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
+{
+	auto switchInst = Ice::InstSwitch::create(::function, numCases, control, defaultBranch);
+	::basicBlock->appendInst(switchInst);
+
+	return reinterpret_cast<SwitchCases*>(switchInst);
+}
+
+void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
+{
+	switchCases->addBranch(label, label, branch);
+}
+
+void Nucleus::createUnreachable()
+{
+	Ice::InstUnreachable *unreachable = Ice::InstUnreachable::create(::function);
+	::basicBlock->appendInst(unreachable);
+}
+
+Type *Nucleus::getPointerType(Type *ElementType)
+{
+	if(sizeof(void*) == 8)
+	{
+		return T(Ice::IceType_i64);
+	}
+	else
+	{
+		return T(Ice::IceType_i32);
+	}
+}
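+// Pointers are modeled as plain integers of pointer width, so pointer
+// arithmetic elsewhere (e.g. createGEP) reduces to integer multiplies and
+// adds.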
+
+Value *Nucleus::createNullValue(Type *Ty)
+{
+	if(Ice::isVectorType(T(Ty)))
+	{
+		ASSERT(Ice::typeNumElements(T(Ty)) <= 16);
+		int64_t c[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+		return createConstantVector(c, Ty);
+	}
+	else
+	{
+		return V(::context->getConstantZero(T(Ty)));
+	}
+}
+
+Value *Nucleus::createConstantLong(int64_t i)
+{
+	return V(::context->getConstantInt64(i));
+}
+
+Value *Nucleus::createConstantInt(int i)
+{
+	return V(::context->getConstantInt32(i));
+}
+
+Value *Nucleus::createConstantInt(unsigned int i)
+{
+	return V(::context->getConstantInt32(i));
+}
+
+Value *Nucleus::createConstantBool(bool b)
+{
+	return V(::context->getConstantInt1(b));
+}
+
+Value *Nucleus::createConstantByte(signed char i)
+{
+	return V(::context->getConstantInt8(i));
+}
+
+Value *Nucleus::createConstantByte(unsigned char i)
+{
+	return V(::context->getConstantInt8(i));
+}
+
+Value *Nucleus::createConstantShort(short i)
+{
+	return V(::context->getConstantInt16(i));
+}
+
+Value *Nucleus::createConstantShort(unsigned short i)
+{
+	return V(::context->getConstantInt16(i));
+}
+
+Value *Nucleus::createConstantFloat(float x)
+{
+	return V(::context->getConstantFloat(x));
+}
+
+Value *Nucleus::createNullPointer(Type *Ty)
+{
+	return createNullValue(T(sizeof(void*) == 8 ? Ice::IceType_i64 : Ice::IceType_i32));
+}
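+// Note that the element type is ignored: a null pointer is simply a zero
+// integer of pointer width, consistent with getPointerType above.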
+
+Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
+{
+	const int vectorSize = 16;
+	ASSERT(Ice::typeWidthInBytes(T(type)) == vectorSize);
+	const int alignment = vectorSize;
+	auto globalPool = ::function->getGlobalPool();
+
+	const int64_t *i = constants;
+	const double *f = reinterpret_cast<const double*>(constants);
+	Ice::VariableDeclaration::DataInitializer *dataInitializer = nullptr;
+
+	switch((int)reinterpret_cast<intptr_t>(type))
+	{
+	case Ice::IceType_v4i32:
+	case Ice::IceType_v4i1:
+		{
+			const int initializer[4] = {(int)i[0], (int)i[1], (int)i[2], (int)i[3]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	case Ice::IceType_v4f32:
+		{
+			const float initializer[4] = {(float)f[0], (float)f[1], (float)f[2], (float)f[3]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	case Ice::IceType_v8i16:
+	case Ice::IceType_v8i1:
+		{
+			const short initializer[8] = {(short)i[0], (short)i[1], (short)i[2], (short)i[3], (short)i[4], (short)i[5], (short)i[6], (short)i[7]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	case Ice::IceType_v16i8:
+	case Ice::IceType_v16i1:
+		{
+			const char initializer[16] = {(char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7], (char)i[8], (char)i[9], (char)i[10], (char)i[11], (char)i[12], (char)i[13], (char)i[14], (char)i[15]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	case Type_v2i32:
+		{
+			const int initializer[4] = {(int)i[0], (int)i[1], (int)i[0], (int)i[1]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	case Type_v2f32:
+		{
+			const float initializer[4] = {(float)f[0], (float)f[1], (float)f[0], (float)f[1]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	case Type_v4i16:
+		{
+			const short initializer[8] = {(short)i[0], (short)i[1], (short)i[2], (short)i[3], (short)i[0], (short)i[1], (short)i[2], (short)i[3]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	case Type_v8i8:
+		{
+			const char initializer[16] = {(char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	case Type_v4i8:
+		{
+			const char initializer[16] = {(char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3]};
+			static_assert(sizeof(initializer) == vectorSize, "!");
+			dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
+		}
+		break;
+	default:
+		UNREACHABLE("Unknown constant vector type: %d", (int)reinterpret_cast<intptr_t>(type));
+	}
+
+	auto name = Ice::GlobalString::createWithoutString(::context);
+	auto *variableDeclaration = Ice::VariableDeclaration::create(globalPool);
+	variableDeclaration->setName(name);
+	variableDeclaration->setAlignment(alignment);
+	variableDeclaration->setIsConstant(true);
+	variableDeclaration->addInitializer(dataInitializer);
+
+	::function->addGlobal(variableDeclaration);
+
+	constexpr int32_t offset = 0;
+	Ice::Operand *ptr = ::context->getConstantSym(offset, name);
+
+	Ice::Variable *result = ::function->makeVariable(T(type));
+	auto load = Ice::InstLoad::create(::function, result, ptr, alignment);
+	::basicBlock->appendInst(load);
+
+	return V(result);
+}
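+// Note that the emulated narrow types (Type_v2i32, Type_v4i16, Type_v4i8, ...)
+// replicate their lanes above to fill the full 16-byte constant, so the load
+// of the 128-bit backing vector observes the intended values in every copy of
+// the narrow vector.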
+
+Value *Nucleus::createConstantVector(const double *constants, Type *type)
+{
+	return createConstantVector((const int64_t*)constants, type);
+}
+
+Type *Void::getType()
+{
+	return T(Ice::IceType_void);
+}
+
+Type *Bool::getType()
+{
+	return T(Ice::IceType_i1);
+}
+
+Type *Byte::getType()
+{
+	return T(Ice::IceType_i8);
+}
+
+Type *SByte::getType()
+{
+	return T(Ice::IceType_i8);
+}
+
+Type *Short::getType()
+{
+	return T(Ice::IceType_i16);
+}
+
+Type *UShort::getType()
+{
+	return T(Ice::IceType_i16);
+}
+
+Type *Byte4::getType()
+{
+	return T(Type_v4i8);
+}
+
+Type *SByte4::getType()
+{
+	return T(Type_v4i8);
+}
+
+namespace {
+
+RValue<Byte> SaturateUnsigned(RValue<Short> x)
+{
+	return Byte(IfThenElse(Int(x) > 0xFF, Int(0xFF), IfThenElse(Int(x) < 0, Int(0), Int(x))));
+}
+
+RValue<Byte> Extract(RValue<Byte8> val, int i)
+{
+	return RValue<Byte>(Nucleus::createExtractElement(val.value, Byte::getType(), i));
+}
+
+RValue<Byte8> Insert(RValue<Byte8> val, RValue<Byte> element, int i)
+{
+	return RValue<Byte8>(Nucleus::createInsertElement(val.value, element.value, i));
+}
+
+}  // anonymous namespace
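+// These helpers let the emulateIntrinsics paths below implement saturating
+// byte arithmetic one lane at a time; e.g. SaturateUnsigned clamps a sum of
+// 255 + 255 = 510 down to 255, and a difference of 0 - 255 up to 0.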
+
+RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
+{
+	if(emulateIntrinsics)
+	{
+		Byte8 result;
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto paddusb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		paddusb->addArg(x.value);
+		paddusb->addArg(y.value);
+		::basicBlock->appendInst(paddusb);
+
+		return RValue<Byte8>(V(result));
+	}
+}
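+// On x86 the intrinsic path is expected to lower to a single paddusb
+// instruction; the lane-by-lane path above is for targets without a
+// saturating-add intrinsic.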
+
+RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
+{
+	if(emulateIntrinsics)
+	{
+		Byte8 result;
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
+		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto psubusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		psubusw->addArg(x.value);
+		psubusw->addArg(y.value);
+		::basicBlock->appendInst(psubusw);
+
+		return RValue<Byte8>(V(result));
+	}
+}
+
+RValue<SByte> Extract(RValue<SByte8> val, int i)
+{
+	return RValue<SByte>(Nucleus::createExtractElement(val.value, SByte::getType(), i));
+}
+
+RValue<SByte8> Insert(RValue<SByte8> val, RValue<SByte> element, int i)
+{
+	return RValue<SByte8>(Nucleus::createInsertElement(val.value, element.value, i));
+}
+
+RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
+	{
+		SByte8 result;
+		result = Insert(result, Extract(lhs, 0) >> SByte(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> SByte(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) >> SByte(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) >> SByte(rhs), 3);
+		result = Insert(result, Extract(lhs, 4) >> SByte(rhs), 4);
+		result = Insert(result, Extract(lhs, 5) >> SByte(rhs), 5);
+		result = Insert(result, Extract(lhs, 6) >> SByte(rhs), 6);
+		result = Insert(result, Extract(lhs, 7) >> SByte(rhs), 7);
+
+		return result;
+	}
+	else
+	{
+		#if defined(__i386__) || defined(__x86_64__)
+			// SSE2 doesn't support byte vector shifts, so shift as shorts and recombine.
+			RValue<Short4> hi = (As<Short4>(lhs) >> rhs) & Short4(0xFF00u);
+			RValue<Short4> lo = As<Short4>(As<UShort4>((As<Short4>(lhs) << 8) >> rhs) >> 8);
+
+			return As<SByte8>(hi | lo);
+		#else
+			return RValue<SByte8>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
 		#endif
-	};
-
-	Nucleus::Nucleus()
-	{
-		::codegenMutex.lock();   // Reactor is currently not thread safe
-
-		Ice::ClFlags &Flags = Ice::ClFlags::Flags;
-		Ice::ClFlags::getParsedClFlags(Flags);
-
-		#if defined(__arm__)
-			Flags.setTargetArch(Ice::Target_ARM32);
-			Flags.setTargetInstructionSet(Ice::ARM32InstructionSet_HWDivArm);
-		#elif defined(__mips__)
-			Flags.setTargetArch(Ice::Target_MIPS32);
-			Flags.setTargetInstructionSet(Ice::BaseInstructionSet);
-		#else   // x86
-			Flags.setTargetArch(sizeof(void*) == 8 ? Ice::Target_X8664 : Ice::Target_X8632);
-			Flags.setTargetInstructionSet(CPUID::SSE4_1 ? Ice::X86InstructionSet_SSE4_1 : Ice::X86InstructionSet_SSE2);
-		#endif
-		Flags.setOutFileType(Ice::FT_Elf);
-		Flags.setOptLevel(toIce(getDefaultConfig().getOptimization().getLevel()));
-		Flags.setApplicationBinaryInterface(Ice::ABI_Platform);
-		Flags.setVerbose(subzeroDumpEnabled ? Ice::IceV_Most : Ice::IceV_None);
-		Flags.setDisableHybridAssembly(true);
-
-		static llvm::raw_os_ostream cout(std::cout);
-		static llvm::raw_os_ostream cerr(std::cerr);
-
-		if (subzeroEmitTextAsm)
-		{
-			// Decorate text asm with liveness info
-			Flags.setDecorateAsm(true);
-		}
-
-		if(false)   // Write out to a file
-		{
-			std::error_code errorCode;
-			::out = new Ice::Fdstream("out.o", errorCode, llvm::sys::fs::F_None);
-			::elfFile = new Ice::ELFFileStreamer(*out);
-			::context = new Ice::GlobalContext(&cout, &cout, &cerr, elfFile);
-		}
-		else
-		{
-			ELFMemoryStreamer *elfMemory = new ELFMemoryStreamer();
-			::context = new Ice::GlobalContext(&cout, &cout, &cerr, elfMemory);
-			::routine = elfMemory;
-		}
 	}
+}
 
-	Nucleus::~Nucleus()
+RValue<Int> SignMask(RValue<Byte8> x)
+{
+	if(emulateIntrinsics || CPUID::ARM)
 	{
-		delete ::routine;
-
-		delete ::allocator;
-		delete ::function;
-		delete ::context;
-
-		delete ::elfFile;
-		delete ::out;
-
-		::codegenMutex.unlock();
+		Byte8 xx = As<Byte8>(As<SByte8>(x) >> 7) & Byte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
+		return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));
 	}
-
-	void Nucleus::setDefaultConfig(const Config &cfg)
+	else
 	{
-		std::unique_lock<std::mutex> lock(::defaultConfigLock);
-		::defaultConfig() = cfg;
-	}
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		movmsk->addArg(x.value);
+		::basicBlock->appendInst(movmsk);
 
-	void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
-	{
-		std::unique_lock<std::mutex> lock(::defaultConfigLock);
-		auto &config = ::defaultConfig();
-		config = cfgEdit.apply(config);
+		return RValue<Int>(V(result)) & 0xFF;
 	}
-
-	Config Nucleus::getDefaultConfig()
-	{
-		std::unique_lock<std::mutex> lock(::defaultConfigLock);
-		return ::defaultConfig();
-	}
-
-	std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
-	{
-		if (subzeroDumpEnabled)
-		{
-			// Output dump strings immediately, rather than once buffer is full. Useful for debugging.
-			context->getStrDump().SetUnbuffered();
-		}
-
-		if(basicBlock->getInsts().empty() || basicBlock->getInsts().back().getKind() != Ice::Inst::Ret)
-		{
-			createRetVoid();
-		}
-
-		::function->setFunctionName(Ice::GlobalString::createWithString(::context, name));
-
-		rr::optimize(::function);
-
-		::function->computeInOutEdges();
-		ASSERT(!::function->hasError());
-
-		::function->translate();
-		ASSERT(!::function->hasError());
-
-		auto globals = ::function->getGlobalInits();
-
-		if(globals && !globals->empty())
-		{
-			::context->getGlobals()->merge(globals.get());
-		}
-
-		::context->emitFileHeader();
-
-		if (subzeroEmitTextAsm)
-		{
-			::function->emit();
-		}
-
-		::function->emitIAS();
-		auto assembler = ::function->releaseAssembler();
-		auto objectWriter = ::context->getObjectWriter();
-		assembler->alignFunction();
-		objectWriter->writeFunctionCode(::function->getFunctionName(), false, assembler.get());
-		::context->lowerGlobals("last");
-		::context->lowerConstants();
-		::context->lowerJumpTables();
-		objectWriter->setUndefinedSyms(::context->getConstantExternSyms());
-		objectWriter->writeNonUserSections();
-
-		const void* entryBegin = ::routine->finalizeEntryBegin();
-		::routine->setEntry(Nucleus::CoroutineEntryBegin, entryBegin);
-
-		Routine *handoffRoutine = ::routine;
-		::routine = nullptr;
-
-		return std::shared_ptr<Routine>(handoffRoutine);
-	}
-
-	Value *Nucleus::allocateStackVariable(Type *t, int arraySize)
-	{
-		Ice::Type type = T(t);
-		int typeSize = Ice::typeWidthInBytes(type);
-		int totalSize = typeSize * (arraySize ? arraySize : 1);
-
-		auto bytes = Ice::ConstantInteger32::create(::context, Ice::IceType_i32, totalSize);
-		auto address = ::function->makeVariable(T(getPointerType(t)));
-		auto alloca = Ice::InstAlloca::create(::function, address, bytes, typeSize);
-		::function->getEntryNode()->getInsts().push_front(alloca);
-
-		return V(address);
-	}
-
-	BasicBlock *Nucleus::createBasicBlock()
-	{
-		return B(::function->makeNode());
-	}
-
-	BasicBlock *Nucleus::getInsertBlock()
-	{
-		return B(::basicBlock);
-	}
-
-	void Nucleus::setInsertBlock(BasicBlock *basicBlock)
-	{
-	//	ASSERT(::basicBlock->getInsts().back().getTerminatorEdges().size() >= 0 && "Previous basic block must have a terminator");
-
-		Variable::materializeAll();
-
-		::basicBlock = basicBlock;
-	}
-
-	void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
-	{
-		uint32_t sequenceNumber = 0;
-		::function = Ice::Cfg::create(::context, sequenceNumber).release();
-		::allocator = new Ice::CfgLocalAllocatorScope(::function);
-
-		for(Type *type : Params)
-		{
-			Ice::Variable *arg = ::function->makeVariable(T(type));
-			::function->addArg(arg);
-		}
-
-		Ice::CfgNode *node = ::function->makeNode();
-		::function->setEntryNode(node);
-		::basicBlock = node;
-	}
-
-	Value *Nucleus::getArgument(unsigned int index)
-	{
-		return V(::function->getArgs()[index]);
-	}
-
-	void Nucleus::createRetVoid()
-	{
-		// Code generated after this point is unreachable, so any variables
-		// being read can safely return an undefined value. We have to avoid
-		// materializing variables after the terminator ret instruction.
-		Variable::killUnmaterialized();
-
-		Ice::InstRet *ret = Ice::InstRet::create(::function);
-		::basicBlock->appendInst(ret);
-	}
-
-	void Nucleus::createRet(Value *v)
-	{
-		// Code generated after this point is unreachable, so any variables
-		// being read can safely return an undefined value. We have to avoid
-		// materializing variables after the terminator ret instruction.
-		Variable::killUnmaterialized();
-
-		Ice::InstRet *ret = Ice::InstRet::create(::function, v);
-		::basicBlock->appendInst(ret);
-	}
-
-	void Nucleus::createBr(BasicBlock *dest)
-	{
-		Variable::materializeAll();
-
-		auto br = Ice::InstBr::create(::function, dest);
-		::basicBlock->appendInst(br);
-	}
-
-	void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
-	{
-		Variable::materializeAll();
-
-		auto br = Ice::InstBr::create(::function, cond, ifTrue, ifFalse);
-		::basicBlock->appendInst(br);
-	}
-
-	static bool isCommutative(Ice::InstArithmetic::OpKind op)
-	{
-		switch(op)
-		{
-		case Ice::InstArithmetic::Add:
-		case Ice::InstArithmetic::Fadd:
-		case Ice::InstArithmetic::Mul:
-		case Ice::InstArithmetic::Fmul:
-		case Ice::InstArithmetic::And:
-		case Ice::InstArithmetic::Or:
-		case Ice::InstArithmetic::Xor:
-			return true;
-		default:
-			return false;
-		}
-	}
-
-	static Value *createArithmetic(Ice::InstArithmetic::OpKind op, Value *lhs, Value *rhs)
-	{
-		ASSERT(lhs->getType() == rhs->getType() || llvm::isa<Ice::Constant>(rhs));
-
-		bool swapOperands = llvm::isa<Ice::Constant>(lhs) && isCommutative(op);
-
-		Ice::Variable *result = ::function->makeVariable(lhs->getType());
-		Ice::InstArithmetic *arithmetic = Ice::InstArithmetic::create(::function, op, result, swapOperands ? rhs : lhs, swapOperands ? lhs : rhs);
-		::basicBlock->appendInst(arithmetic);
-
-		return V(result);
-	}
-
-	Value *Nucleus::createAdd(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Add, lhs, rhs);
-	}
-
-	Value *Nucleus::createSub(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Sub, lhs, rhs);
-	}
-
-	Value *Nucleus::createMul(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Mul, lhs, rhs);
-	}
-
-	Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Udiv, lhs, rhs);
-	}
-
-	Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Sdiv, lhs, rhs);
-	}
-
-	Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Fadd, lhs, rhs);
-	}
-
-	Value *Nucleus::createFSub(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Fsub, lhs, rhs);
-	}
-
-	Value *Nucleus::createFMul(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Fmul, lhs, rhs);
-	}
-
-	Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Fdiv, lhs, rhs);
-	}
-
-	Value *Nucleus::createURem(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Urem, lhs, rhs);
-	}
-
-	Value *Nucleus::createSRem(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Srem, lhs, rhs);
-	}
-
-	Value *Nucleus::createFRem(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Frem, lhs, rhs);
-	}
-
-	Value *Nucleus::createShl(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Shl, lhs, rhs);
-	}
-
-	Value *Nucleus::createLShr(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Lshr, lhs, rhs);
-	}
-
-	Value *Nucleus::createAShr(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Ashr, lhs, rhs);
-	}
-
-	Value *Nucleus::createAnd(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::And, lhs, rhs);
-	}
-
-	Value *Nucleus::createOr(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Or, lhs, rhs);
-	}
-
-	Value *Nucleus::createXor(Value *lhs, Value *rhs)
-	{
-		return createArithmetic(Ice::InstArithmetic::Xor, lhs, rhs);
-	}
-
-	Value *Nucleus::createNeg(Value *v)
-	{
-		return createSub(createNullValue(T(v->getType())), v);
-	}
-
-	Value *Nucleus::createFNeg(Value *v)
-	{
-		double c[4] = {-0.0, -0.0, -0.0, -0.0};
-		Value *negativeZero = Ice::isVectorType(v->getType()) ?
-		                      createConstantVector(c, T(v->getType())) :
-		                      V(::context->getConstantFloat(-0.0f));
-
-		return createFSub(negativeZero, v);
-	}
-
-	Value *Nucleus::createNot(Value *v)
-	{
-		if(Ice::isScalarIntegerType(v->getType()))
-		{
-			return createXor(v, V(::context->getConstantInt(v->getType(), -1)));
-		}
-		else   // Vector
-		{
-			int64_t c[16] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
-			return createXor(v, createConstantVector(c, T(v->getType())));
-		}
-	}
-
-	Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int align, bool atomic, std::memory_order memoryOrder)
-	{
-		ASSERT(!atomic);  // Unimplemented
-		ASSERT(memoryOrder == std::memory_order_relaxed);  // Unimplemented
-
-		int valueType = (int)reinterpret_cast<intptr_t>(type);
-		Ice::Variable *result = ::function->makeVariable(T(type));
-
-		if((valueType & EmulatedBits) && (align != 0))   // Narrow vector not stored on stack.
-		{
-			if(emulateIntrinsics)
-			{
-				if(typeSize(type) == 4)
-				{
-					auto pointer = RValue<Pointer<Byte>>(ptr);
-					Int x = *Pointer<Int>(pointer);
-
-					Int4 vector;
-					vector = Insert(vector, x, 0);
-
-					auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
-					::basicBlock->appendInst(bitcast);
-				}
-				else if(typeSize(type) == 8)
-				{
-					auto pointer = RValue<Pointer<Byte>>(ptr);
-					Int x = *Pointer<Int>(pointer);
-					Int y = *Pointer<Int>(pointer + 4);
-
-					Int4 vector;
-					vector = Insert(vector, x, 0);
-					vector = Insert(vector, y, 1);
-
-					auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
-					::basicBlock->appendInst(bitcast);
-				}
-				else UNREACHABLE("typeSize(type): %d", int(typeSize(type)));
-			}
-			else
-			{
-				const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-				auto target = ::context->getConstantUndef(Ice::IceType_i32);
-				auto load = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-				load->addArg(ptr);
-				load->addArg(::context->getConstantInt32(typeSize(type)));
-				::basicBlock->appendInst(load);
-			}
-		}
-		else
-		{
-			auto load = Ice::InstLoad::create(::function, result, ptr, align);
-			::basicBlock->appendInst(load);
-		}
-
-		return V(result);
-	}
-
-	Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int align, bool atomic, std::memory_order memoryOrder)
-	{
-		ASSERT(!atomic);  // Unimplemented
-		ASSERT(memoryOrder == std::memory_order_relaxed);  // Unimplemented
-
-		#if __has_feature(memory_sanitizer)
-			// Mark all (non-stack) memory writes as initialized by calling __msan_unpoison
-			if(align != 0)
-			{
-				auto call = Ice::InstCall::create(::function, 2, nullptr, ::context->getConstantInt64(reinterpret_cast<intptr_t>(__msan_unpoison)), false);
-				call->addArg(ptr);
-				call->addArg(::context->getConstantInt64(typeSize(type)));
-				::basicBlock->appendInst(call);
-			}
-		#endif
-
-		int valueType = (int)reinterpret_cast<intptr_t>(type);
-
-		if((valueType & EmulatedBits) && (align != 0))   // Narrow vector not stored on stack.
-		{
-			if(emulateIntrinsics)
-			{
-				if(typeSize(type) == 4)
-				{
-					Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
-					auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
-					::basicBlock->appendInst(bitcast);
-
-					RValue<Int4> v(V(vector));
-
-					auto pointer = RValue<Pointer<Byte>>(ptr);
-					Int x = Extract(v, 0);
-					*Pointer<Int>(pointer) = x;
-				}
-				else if(typeSize(type) == 8)
-				{
-					Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
-					auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
-					::basicBlock->appendInst(bitcast);
-
-					RValue<Int4> v(V(vector));
-
-					auto pointer = RValue<Pointer<Byte>>(ptr);
-					Int x = Extract(v, 0);
-					*Pointer<Int>(pointer) = x;
-					Int y = Extract(v, 1);
-					*Pointer<Int>(pointer + 4) = y;
-				}
-				else UNREACHABLE("typeSize(type): %d", int(typeSize(type)));
-			}
-			else
-			{
-				const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
-				auto target = ::context->getConstantUndef(Ice::IceType_i32);
-				auto store = Ice::InstIntrinsicCall::create(::function, 3, nullptr, target, intrinsic);
-				store->addArg(value);
-				store->addArg(ptr);
-				store->addArg(::context->getConstantInt32(typeSize(type)));
-				::basicBlock->appendInst(store);
-			}
-		}
-		else
-		{
-			ASSERT(value->getType() == T(type));
-
-			auto store = Ice::InstStore::create(::function, value, ptr, align);
-			::basicBlock->appendInst(store);
-		}
-
-		return value;
-	}
-
-	Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
-	{
-		ASSERT(index->getType() == Ice::IceType_i32);
-
-		if(auto *constant = llvm::dyn_cast<Ice::ConstantInteger32>(index))
-		{
-			int32_t offset = constant->getValue() * (int)typeSize(type);
-
-			if(offset == 0)
-			{
-				return ptr;
-			}
-
-			return createAdd(ptr, createConstantInt(offset));
-		}
-
-		if(!Ice::isByteSizedType(T(type)))
-		{
-			index = createMul(index, createConstantInt((int)typeSize(type)));
-		}
-
-		if(sizeof(void*) == 8)
-		{
-			if(unsignedIndex)
-			{
-				index = createZExt(index, T(Ice::IceType_i64));
-			}
-			else
-			{
-				index = createSExt(index, T(Ice::IceType_i64));
-			}
-		}
-
-		return createAdd(ptr, index);
-	}
-
-	Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicAdd");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicSub");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicAnd");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicOr");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicXor");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicMin");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicMax");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicUMin");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicUMax");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
-	{
-		UNIMPLEMENTED("createAtomicExchange");
-		return nullptr;
-	}
-
-	Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
-	{
-		UNIMPLEMENTED("createAtomicCompareExchange");
-		return nullptr;
-	}
-
-	static Value *createCast(Ice::InstCast::OpKind op, Value *v, Type *destType)
-	{
-		if(v->getType() == T(destType))
-		{
-			return v;
-		}
-
-		Ice::Variable *result = ::function->makeVariable(T(destType));
-		Ice::InstCast *cast = Ice::InstCast::create(::function, op, result, v);
-		::basicBlock->appendInst(cast);
-
-		return V(result);
-	}
-
-	Value *Nucleus::createTrunc(Value *v, Type *destType)
-	{
-		return createCast(Ice::InstCast::Trunc, v, destType);
-	}
-
-	Value *Nucleus::createZExt(Value *v, Type *destType)
-	{
-		return createCast(Ice::InstCast::Zext, v, destType);
-	}
-
-	Value *Nucleus::createSExt(Value *v, Type *destType)
-	{
-		return createCast(Ice::InstCast::Sext, v, destType);
-	}
-
-	Value *Nucleus::createFPToUI(Value *v, Type *destType)
-	{
-		return createCast(Ice::InstCast::Fptoui, v, destType);
-	}
-
-	Value *Nucleus::createFPToSI(Value *v, Type *destType)
-	{
-		return createCast(Ice::InstCast::Fptosi, v, destType);
-	}
-
-	Value *Nucleus::createSIToFP(Value *v, Type *destType)
-	{
-		return createCast(Ice::InstCast::Sitofp, v, destType);
-	}
-
-	Value *Nucleus::createFPTrunc(Value *v, Type *destType)
-	{
-		return createCast(Ice::InstCast::Fptrunc, v, destType);
-	}
-
-	Value *Nucleus::createFPExt(Value *v, Type *destType)
-	{
-		return createCast(Ice::InstCast::Fpext, v, destType);
-	}
-
-	Value *Nucleus::createBitCast(Value *v, Type *destType)
-	{
-		// Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
-		// support for casting between scalars and wide vectors. For platforms where this is not supported,
-		// emulate them by writing to the stack and reading back as the destination type.
-		if(emulateMismatchedBitCast)
-		{
-			if(!Ice::isVectorType(v->getType()) && Ice::isVectorType(T(destType)))
-			{
-				Value *address = allocateStackVariable(destType);
-				createStore(v, address, T(v->getType()));
-				return createLoad(address, destType);
-			}
-			else if(Ice::isVectorType(v->getType()) && !Ice::isVectorType(T(destType)))
-			{
-				Value *address = allocateStackVariable(T(v->getType()));
-				createStore(v, address, T(v->getType()));
-				return createLoad(address, destType);
-			}
-		}
-
-		return createCast(Ice::InstCast::Bitcast, v, destType);
-	}
-
-	static Value *createIntCompare(Ice::InstIcmp::ICond condition, Value *lhs, Value *rhs)
-	{
-		ASSERT(lhs->getType() == rhs->getType());
-
-		auto result = ::function->makeVariable(Ice::isScalarIntegerType(lhs->getType()) ? Ice::IceType_i1 : lhs->getType());
-		auto cmp = Ice::InstIcmp::create(::function, condition, result, lhs, rhs);
-		::basicBlock->appendInst(cmp);
-
-		return V(result);
-	}
-
-	Value *Nucleus::createPtrEQ(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Eq, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Eq, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Ne, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Ugt, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Uge, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Ult, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Ule, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Sgt, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Sge, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Slt, lhs, rhs);
-	}
-
-	Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
-	{
-		return createIntCompare(Ice::InstIcmp::Sle, lhs, rhs);
-	}
-
-	static Value *createFloatCompare(Ice::InstFcmp::FCond condition, Value *lhs, Value *rhs)
-	{
-		ASSERT(lhs->getType() == rhs->getType());
-		ASSERT(Ice::isScalarFloatingType(lhs->getType()) || lhs->getType() == Ice::IceType_v4f32);
-
-		auto result = ::function->makeVariable(Ice::isScalarFloatingType(lhs->getType()) ? Ice::IceType_i1 : Ice::IceType_v4i32);
-		auto cmp = Ice::InstFcmp::create(::function, condition, result, lhs, rhs);
-		::basicBlock->appendInst(cmp);
-
-		return V(result);
-	}
-
-	Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Oeq, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Ogt, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Oge, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Olt, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Ole, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::One, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Ord, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Uno, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Ueq, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Ugt, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Uge, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Ult, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Ule, lhs, rhs);
-	}
-
-	Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
-	{
-		return createFloatCompare(Ice::InstFcmp::Une, lhs, rhs);
-	}
-
-	Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
-	{
-		auto result = ::function->makeVariable(T(type));
-		auto extract = Ice::InstExtractElement::create(::function, result, vector, ::context->getConstantInt32(index));
-		::basicBlock->appendInst(extract);
-
-		return V(result);
-	}
-
-	Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
-	{
-		auto result = ::function->makeVariable(vector->getType());
-		auto insert = Ice::InstInsertElement::create(::function, result, vector, element, ::context->getConstantInt32(index));
-		::basicBlock->appendInst(insert);
-
-		return V(result);
-	}
-
-	Value *Nucleus::createShuffleVector(Value *V1, Value *V2, const int *select)
-	{
-		ASSERT(V1->getType() == V2->getType());
-
-		int size = Ice::typeNumElements(V1->getType());
-		auto result = ::function->makeVariable(V1->getType());
-		auto shuffle = Ice::InstShuffleVector::create(::function, result, V1, V2);
-
-		for(int i = 0; i < size; i++)
-		{
-			shuffle->addIndex(llvm::cast<Ice::ConstantInteger32>(::context->getConstantInt32(select[i])));
-		}
-
-		::basicBlock->appendInst(shuffle);
-
-		return V(result);
-	}
-
-	Value *Nucleus::createSelect(Value *C, Value *ifTrue, Value *ifFalse)
-	{
-		ASSERT(ifTrue->getType() == ifFalse->getType());
-
-		auto result = ::function->makeVariable(ifTrue->getType());
-		auto *select = Ice::InstSelect::create(::function, result, C, ifTrue, ifFalse);
-		::basicBlock->appendInst(select);
-
-		return V(result);
-	}
-
-	SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
-	{
-		auto switchInst = Ice::InstSwitch::create(::function, numCases, control, defaultBranch);
-		::basicBlock->appendInst(switchInst);
-
-		return reinterpret_cast<SwitchCases*>(switchInst);
-	}
-
-	void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
-	{
-		switchCases->addBranch(label, label, branch);
-	}
-
-	void Nucleus::createUnreachable()
-	{
-		Ice::InstUnreachable *unreachable = Ice::InstUnreachable::create(::function);
-		::basicBlock->appendInst(unreachable);
-	}
-
-	Type *Nucleus::getPointerType(Type *ElementType)
-	{
-		if(sizeof(void*) == 8)
-		{
-			return T(Ice::IceType_i64);
-		}
-		else
-		{
-			return T(Ice::IceType_i32);
-		}
-	}
-
-	Value *Nucleus::createNullValue(Type *Ty)
-	{
-		if(Ice::isVectorType(T(Ty)))
-		{
-			ASSERT(Ice::typeNumElements(T(Ty)) <= 16);
-			int64_t c[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-			return createConstantVector(c, Ty);
-		}
-		else
-		{
-			return V(::context->getConstantZero(T(Ty)));
-		}
-	}
-
-	Value *Nucleus::createConstantLong(int64_t i)
-	{
-		return V(::context->getConstantInt64(i));
-	}
-
-	Value *Nucleus::createConstantInt(int i)
-	{
-		return V(::context->getConstantInt32(i));
-	}
-
-	Value *Nucleus::createConstantInt(unsigned int i)
-	{
-		return V(::context->getConstantInt32(i));
-	}
-
-	Value *Nucleus::createConstantBool(bool b)
-	{
-		return V(::context->getConstantInt1(b));
-	}
-
-	Value *Nucleus::createConstantByte(signed char i)
-	{
-		return V(::context->getConstantInt8(i));
-	}
-
-	Value *Nucleus::createConstantByte(unsigned char i)
-	{
-		return V(::context->getConstantInt8(i));
-	}
-
-	Value *Nucleus::createConstantShort(short i)
-	{
-		return V(::context->getConstantInt16(i));
-	}
-
-	Value *Nucleus::createConstantShort(unsigned short i)
-	{
-		return V(::context->getConstantInt16(i));
-	}
-
-	Value *Nucleus::createConstantFloat(float x)
-	{
-		return V(::context->getConstantFloat(x));
-	}
-
-	Value *Nucleus::createNullPointer(Type *Ty)
-	{
-		return createNullValue(T(sizeof(void*) == 8 ? Ice::IceType_i64 : Ice::IceType_i32));
-	}
-
-	Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
-	{
-		const int vectorSize = 16;
-		ASSERT(Ice::typeWidthInBytes(T(type)) == vectorSize);
-		const int alignment = vectorSize;
-		auto globalPool = ::function->getGlobalPool();
-
-		const int64_t *i = constants;
-		const double *f = reinterpret_cast<const double*>(constants);
-		Ice::VariableDeclaration::DataInitializer *dataInitializer = nullptr;
-
-		switch((int)reinterpret_cast<intptr_t>(type))
-		{
-		case Ice::IceType_v4i32:
-		case Ice::IceType_v4i1:
-			{
-				const int initializer[4] = {(int)i[0], (int)i[1], (int)i[2], (int)i[3]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		case Ice::IceType_v4f32:
-			{
-				const float initializer[4] = {(float)f[0], (float)f[1], (float)f[2], (float)f[3]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		case Ice::IceType_v8i16:
-		case Ice::IceType_v8i1:
-			{
-				const short initializer[8] = {(short)i[0], (short)i[1], (short)i[2], (short)i[3], (short)i[4], (short)i[5], (short)i[6], (short)i[7]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		case Ice::IceType_v16i8:
-		case Ice::IceType_v16i1:
-			{
-				const char initializer[16] = {(char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7], (char)i[8], (char)i[9], (char)i[10], (char)i[11], (char)i[12], (char)i[13], (char)i[14], (char)i[15]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		case Type_v2i32:
-			{
-				const int initializer[4] = {(int)i[0], (int)i[1], (int)i[0], (int)i[1]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		case Type_v2f32:
-			{
-				const float initializer[4] = {(float)f[0], (float)f[1], (float)f[0], (float)f[1]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		case Type_v4i16:
-			{
-				const short initializer[8] = {(short)i[0], (short)i[1], (short)i[2], (short)i[3], (short)i[0], (short)i[1], (short)i[2], (short)i[3]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		case Type_v8i8:
-			{
-				const char initializer[16] = {(char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		case Type_v4i8:
-			{
-				const char initializer[16] = {(char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3]};
-				static_assert(sizeof(initializer) == vectorSize, "!");
-				dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
-			}
-			break;
-		default:
-			UNREACHABLE("Unknown constant vector type: %d", (int)reinterpret_cast<intptr_t>(type));
-		}
-
-		auto name = Ice::GlobalString::createWithoutString(::context);
-		auto *variableDeclaration = Ice::VariableDeclaration::create(globalPool);
-		variableDeclaration->setName(name);
-		variableDeclaration->setAlignment(alignment);
-		variableDeclaration->setIsConstant(true);
-		variableDeclaration->addInitializer(dataInitializer);
-
-		::function->addGlobal(variableDeclaration);
-
-		constexpr int32_t offset = 0;
-		Ice::Operand *ptr = ::context->getConstantSym(offset, name);
-
-		Ice::Variable *result = ::function->makeVariable(T(type));
-		auto load = Ice::InstLoad::create(::function, result, ptr, alignment);
-		::basicBlock->appendInst(load);
-
-		return V(result);
-	}
-
-	Value *Nucleus::createConstantVector(const double *constants, Type *type)
-	{
-		return createConstantVector((const int64_t*)constants, type);
-	}
-
-	Type *Void::getType()
-	{
-		return T(Ice::IceType_void);
-	}
-
-	Type *Bool::getType()
-	{
-		return T(Ice::IceType_i1);
-	}
-
-	Type *Byte::getType()
-	{
-		return T(Ice::IceType_i8);
-	}
-
-	Type *SByte::getType()
-	{
-		return T(Ice::IceType_i8);
-	}
-
-	Type *Short::getType()
-	{
-		return T(Ice::IceType_i16);
-	}
-
-	Type *UShort::getType()
-	{
-		return T(Ice::IceType_i16);
-	}
-
-	Type *Byte4::getType()
-	{
-		return T(Type_v4i8);
-	}
-
-	Type *SByte4::getType()
-	{
-		return T(Type_v4i8);
-	}
-
-	namespace
-	{
-		RValue<Byte> SaturateUnsigned(RValue<Short> x)
-		{
-			return Byte(IfThenElse(Int(x) > 0xFF, Int(0xFF), IfThenElse(Int(x) < 0, Int(0), Int(x))));
-		}
-
-		RValue<Byte> Extract(RValue<Byte8> val, int i)
-		{
-			return RValue<Byte>(Nucleus::createExtractElement(val.value, Byte::getType(), i));
-		}
-
-		RValue<Byte8> Insert(RValue<Byte8> val, RValue<Byte> element, int i)
-		{
-			return RValue<Byte8>(Nucleus::createInsertElement(val.value, element.value, i));
-		}
-	}
-
-	RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
-	{
-		if(emulateIntrinsics)
-		{
-			Byte8 result;
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto paddusb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			paddusb->addArg(x.value);
-			paddusb->addArg(y.value);
-			::basicBlock->appendInst(paddusb);
-
-			return RValue<Byte8>(V(result));
-		}
-	}
-
-	RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
-	{
-		if(emulateIntrinsics)
-		{
-			Byte8 result;
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
-			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto psubusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			psubusw->addArg(x.value);
-			psubusw->addArg(y.value);
-			::basicBlock->appendInst(psubusw);
-
-			return RValue<Byte8>(V(result));
-		}
-	}
-
-	RValue<SByte> Extract(RValue<SByte8> val, int i)
-	{
-		return RValue<SByte>(Nucleus::createExtractElement(val.value, SByte::getType(), i));
-	}
-
-	RValue<SByte8> Insert(RValue<SByte8> val, RValue<SByte> element, int i)
-	{
-		return RValue<SByte8>(Nucleus::createInsertElement(val.value, element.value, i));
-	}
-
-	RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
-	{
-		if(emulateIntrinsics)
-		{
-			SByte8 result;
-			result = Insert(result, Extract(lhs, 0) >> SByte(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> SByte(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) >> SByte(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) >> SByte(rhs), 3);
-			result = Insert(result, Extract(lhs, 4) >> SByte(rhs), 4);
-			result = Insert(result, Extract(lhs, 5) >> SByte(rhs), 5);
-			result = Insert(result, Extract(lhs, 6) >> SByte(rhs), 6);
-			result = Insert(result, Extract(lhs, 7) >> SByte(rhs), 7);
-
-			return result;
-		}
-		else
-		{
-			#if defined(__i386__) || defined(__x86_64__)
-				// SSE2 doesn't support byte vector shifts, so shift as shorts and recombine.
-				RValue<Short4> hi = (As<Short4>(lhs) >> rhs) & Short4(0xFF00u);
-				RValue<Short4> lo = As<Short4>(As<UShort4>((As<Short4>(lhs) << 8) >> rhs) >> 8);
-
-				return As<SByte8>(hi | lo);
-			#else
-				return RValue<SByte8>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
-			#endif
-		}
-	}
-
-	RValue<Int> SignMask(RValue<Byte8> x)
-	{
-		if(emulateIntrinsics || CPUID::ARM)
-		{
-			Byte8 xx = As<Byte8>(As<SByte8>(x) >> 7) & Byte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
-			return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			movmsk->addArg(x.value);
-			::basicBlock->appendInst(movmsk);
-
-			return RValue<Int>(V(result)) & 0xFF;
-		}
-	}
+}
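+// The intrinsic path corresponds to an x86 pmovmskb-style sign-mask
+// extraction; the masked-OR fallback above computes the same 8-bit result one
+// lane at a time.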
 
 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
 //	{
 //		return RValue<Byte8>(createIntCompare(Ice::InstIcmp::Ugt, x.value, y.value));
 //	}
 
-	RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
-	{
-		return RValue<Byte8>(Nucleus::createICmpEQ(x.value, y.value));
-	}
+RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
+{
+	return RValue<Byte8>(Nucleus::createICmpEQ(x.value, y.value));
+}
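+// As with other vector comparisons, this yields a per-lane all-ones/all-zeros
+// byte mask rather than a single boolean.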
 
-	Type *Byte8::getType()
-	{
-		return T(Type_v8i8);
-	}
+Type *Byte8::getType()
+{
+	return T(Type_v8i8);
+}
 
 //	RValue<SByte8> operator<<(RValue<SByte8> lhs, unsigned char rhs)
 //	{
@@ -1870,886 +1869,886 @@
 //		return RValue<SByte8>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
 //	}
 
-	RValue<SByte> SaturateSigned(RValue<Short> x)
+RValue<SByte> SaturateSigned(RValue<Short> x)
+{
+	return SByte(IfThenElse(Int(x) > 0x7F, Int(0x7F), IfThenElse(Int(x) < -0x80, Int(0x80), Int(x))));
+}
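+// Note that Int(0x80) in the negative branch is intentional: truncation to
+// SByte reinterprets 0x80 as -128, the signed-byte minimum.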
+
+RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
+{
+	if(emulateIntrinsics)
 	{
-		return SByte(IfThenElse(Int(x) > 0x7F, Int(0x7F), IfThenElse(Int(x) < -0x80, Int(0x80), Int(x))));
-	}
+		SByte8 result;
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
 
-	RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
+		return result;
+	}
+	else
 	{
-		if(emulateIntrinsics)
-		{
-			SByte8 result;
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
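+		// Subzero intrinsic-call boilerplate: allocate a result variable, describe
+		// the intrinsic, create the call, attach its operands, and append it to the
+		// current basic block.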
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto paddsb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		paddsb->addArg(x.value);
+		paddsb->addArg(y.value);
+		::basicBlock->appendInst(paddsb);
 
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto paddsb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			paddsb->addArg(x.value);
-			paddsb->addArg(y.value);
-			::basicBlock->appendInst(paddsb);
-
-			return RValue<SByte8>(V(result));
-		}
+		return RValue<SByte8>(V(result));
 	}
+}
 
-	RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
+RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
+{
+	if(emulateIntrinsics)
 	{
-		if(emulateIntrinsics)
-		{
-			SByte8 result;
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
-			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
+		SByte8 result;
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
+		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
 
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto psubsb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			psubsb->addArg(x.value);
-			psubsb->addArg(y.value);
-			::basicBlock->appendInst(psubsb);
-
-			return RValue<SByte8>(V(result));
-		}
+		return result;
 	}
-
-	RValue<Int> SignMask(RValue<SByte8> x)
+	else
 	{
-		if(emulateIntrinsics || CPUID::ARM)
-		{
-			SByte8 xx = (x >> 7) & SByte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
-			return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			movmsk->addArg(x.value);
-			::basicBlock->appendInst(movmsk);
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto psubsb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		psubsb->addArg(x.value);
+		psubsb->addArg(y.value);
+		::basicBlock->appendInst(psubsb);
 
-			return RValue<Int>(V(result)) & 0xFF;
-		}
+		return RValue<SByte8>(V(result));
 	}
+}
 
-	RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
+RValue<Int> SignMask(RValue<SByte8> x)
+{
+	if(emulateIntrinsics || CPUID::ARM)
 	{
-		return RValue<Byte8>(createIntCompare(Ice::InstIcmp::Sgt, x.value, y.value));
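+		// Arithmetic shift by 7 smears each lane's sign bit across the byte; masking
+		// with distinct powers of two and OR-ing the lanes assembles the bitmask.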
+		SByte8 xx = (x >> 7) & SByte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
+		return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));
 	}
-
-	RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
+	else
 	{
-		return RValue<Byte8>(Nucleus::createICmpEQ(x.value, y.value));
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		movmsk->addArg(x.value);
+		::basicBlock->appendInst(movmsk);
+
+		return RValue<Int>(V(result)) & 0xFF;
 	}
+}
 
-	Type *SByte8::getType()
-	{
-		return T(Type_v8i8);
-	}
+RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
+{
+	return RValue<Byte8>(createIntCompare(Ice::InstIcmp::Sgt, x.value, y.value));
+}
 
-	Type *Byte16::getType()
-	{
-		return T(Ice::IceType_v16i8);
-	}
+RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
+{
+	return RValue<Byte8>(Nucleus::createICmpEQ(x.value, y.value));
+}
 
-	Type *SByte16::getType()
-	{
-		return T(Ice::IceType_v16i8);
-	}
+Type *SByte8::getType()
+{
+	return T(Type_v8i8);
+}
 
-	Type *Short2::getType()
-	{
-		return T(Type_v2i16);
-	}
+Type *Byte16::getType()
+{
+	return T(Ice::IceType_v16i8);
+}
 
-	Type *UShort2::getType()
-	{
-		return T(Type_v2i16);
-	}
+Type *SByte16::getType()
+{
+	return T(Ice::IceType_v16i8);
+}
 
-	Short4::Short4(RValue<Int4> cast)
-	{
-		int select[8] = {0, 2, 4, 6, 0, 2, 4, 6};
-		Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
-		Value *packed = Nucleus::createShuffleVector(short8, short8, select);
+Type *Short2::getType()
+{
+	return T(Type_v2i16);
+}
 
-		Value *int2 = RValue<Int2>(Int2(As<Int4>(packed))).value;
-		Value *short4 = Nucleus::createBitCast(int2, Short4::getType());
+Type *UShort2::getType()
+{
+	return T(Type_v2i16);
+}
 
-		storeValue(short4);
-	}
+Short4::Short4(RValue<Int4> cast)
+{
+	int select[8] = {0, 2, 4, 6, 0, 2, 4, 6};
+	Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
+	Value *packed = Nucleus::createShuffleVector(short8, short8, select);
+
+	Value *int2 = RValue<Int2>(Int2(As<Int4>(packed))).value;
+	Value *short4 = Nucleus::createBitCast(int2, Short4::getType());
+
+	storeValue(short4);
+}
 
 //	Short4::Short4(RValue<Float> cast)
 //	{
 //	}
 
-	Short4::Short4(RValue<Float4> cast)
+Short4::Short4(RValue<Float4> cast)
+{
+	UNIMPLEMENTED("Short4::Short4(RValue<Float4> cast)");
+}
+
+RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		UNIMPLEMENTED("Short4::Short4(RValue<Float4> cast)");
-	}
-
-	RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
-	{
-		if(emulateIntrinsics)
-		{
-			Short4 result;
-			result = Insert(result, Extract(lhs, 0) << Short(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) << Short(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) << Short(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) << Short(rhs), 3);
-
-			return result;
-		}
-		else
-		{
-			return RValue<Short4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
-	}
-
-	RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
-	{
-		if(emulateIntrinsics)
-		{
-			Short4 result;
-			result = Insert(result, Extract(lhs, 0) >> Short(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> Short(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) >> Short(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) >> Short(rhs), 3);
-
-			return result;
-		}
-		else
-		{
-			return RValue<Short4>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
-	}
-
-	RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
-		auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-		auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
-		::basicBlock->appendInst(select);
-
-		return RValue<Short4>(V(result));
-	}
-
-	RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
-		auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-		auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
-		::basicBlock->appendInst(select);
-
-		return RValue<Short4>(V(result));
-	}
-
-	RValue<Short> SaturateSigned(RValue<Int> x)
-	{
-		return Short(IfThenElse(x > 0x7FFF, Int(0x7FFF), IfThenElse(x < -0x8000, Int(0x8000), x)));
-	}
-
-	RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			Short4 result;
-			result = Insert(result, SaturateSigned(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
-			result = Insert(result, SaturateSigned(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
-			result = Insert(result, SaturateSigned(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
-			result = Insert(result, SaturateSigned(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto paddsw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			paddsw->addArg(x.value);
-			paddsw->addArg(y.value);
-			::basicBlock->appendInst(paddsw);
-
-			return RValue<Short4>(V(result));
-		}
-	}
-
-	RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			Short4 result;
-			result = Insert(result, SaturateSigned(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
-			result = Insert(result, SaturateSigned(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
-			result = Insert(result, SaturateSigned(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
-			result = Insert(result, SaturateSigned(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto psubsw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			psubsw->addArg(x.value);
-			psubsw->addArg(y.value);
-			::basicBlock->appendInst(psubsw);
-
-			return RValue<Short4>(V(result));
-		}
-	}
-
-	RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			Short4 result;
-			result = Insert(result, Short((Int(Extract(x, 0)) * Int(Extract(y, 0))) >> 16), 0);
-			result = Insert(result, Short((Int(Extract(x, 1)) * Int(Extract(y, 1))) >> 16), 1);
-			result = Insert(result, Short((Int(Extract(x, 2)) * Int(Extract(y, 2))) >> 16), 2);
-			result = Insert(result, Short((Int(Extract(x, 3)) * Int(Extract(y, 3))) >> 16), 3);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyHighSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto pmulhw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			pmulhw->addArg(x.value);
-			pmulhw->addArg(y.value);
-			::basicBlock->appendInst(pmulhw);
-
-			return RValue<Short4>(V(result));
-		}
-	}
-
-	RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			Int2 result;
-			result = Insert(result, Int(Extract(x, 0)) * Int(Extract(y, 0)) + Int(Extract(x, 1)) * Int(Extract(y, 1)), 0);
-			result = Insert(result, Int(Extract(x, 2)) * Int(Extract(y, 2)) + Int(Extract(x, 3)) * Int(Extract(y, 3)), 1);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyAddPairs, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto pmaddwd = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			pmaddwd->addArg(x.value);
-			pmaddwd->addArg(y.value);
-			::basicBlock->appendInst(pmaddwd);
-
-			return As<Int2>(V(result));
-		}
-	}
-
-	RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			SByte8 result;
-			result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
-			result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
-			result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
-			result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
-			result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
-			result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
-			result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
-			result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			pack->addArg(x.value);
-			pack->addArg(y.value);
-			::basicBlock->appendInst(pack);
-
-			return As<SByte8>(Swizzle(As<Int4>(V(result)), 0x0202));
-		}
-	}
-
-	RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			Byte8 result;
-			result = Insert(result, SaturateUnsigned(Extract(x, 0)), 0);
-			result = Insert(result, SaturateUnsigned(Extract(x, 1)), 1);
-			result = Insert(result, SaturateUnsigned(Extract(x, 2)), 2);
-			result = Insert(result, SaturateUnsigned(Extract(x, 3)), 3);
-			result = Insert(result, SaturateUnsigned(Extract(y, 0)), 4);
-			result = Insert(result, SaturateUnsigned(Extract(y, 1)), 5);
-			result = Insert(result, SaturateUnsigned(Extract(y, 2)), 6);
-			result = Insert(result, SaturateUnsigned(Extract(y, 3)), 7);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			pack->addArg(x.value);
-			pack->addArg(y.value);
-			::basicBlock->appendInst(pack);
-
-			return As<Byte8>(Swizzle(As<Int4>(V(result)), 0x0202));
-		}
-	}
-
-	RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
-	{
-		return RValue<Short4>(createIntCompare(Ice::InstIcmp::Sgt, x.value, y.value));
-	}
-
-	RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
-	{
-		return RValue<Short4>(Nucleus::createICmpEQ(x.value, y.value));
-	}
-
-	Type *Short4::getType()
-	{
-		return T(Type_v4i16);
-	}
-
-	UShort4::UShort4(RValue<Float4> cast, bool saturate)
-	{
-		if(saturate)
-		{
-			if(CPUID::SSE4_1)
-			{
-				// x86 produces 0x80000000 on 32-bit integer overflow/underflow.
-				// PackUnsigned takes care of 0x0000 saturation.
-				Int4 int4(Min(cast, Float4(0xFFFF)));
-				*this = As<UShort4>(PackUnsigned(int4, int4));
-			}
-			else if(CPUID::ARM)
-			{
-				// ARM saturates the 32-bit integer result on overflow/undeflow.
-				Int4 int4(cast);
-				*this = As<UShort4>(PackUnsigned(int4, int4));
-			}
-			else
-			{
-				*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
-			}
-		}
-		else
-		{
-			*this = Short4(Int4(cast));
-		}
-	}
-
-	RValue<UShort> Extract(RValue<UShort4> val, int i)
-	{
-		return RValue<UShort>(Nucleus::createExtractElement(val.value, UShort::getType(), i));
-	}
-
-	RValue<UShort4> Insert(RValue<UShort4> val, RValue<UShort> element, int i)
-	{
-		return RValue<UShort4>(Nucleus::createInsertElement(val.value, element.value, i));
-	}
-
-	RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
-	{
-		if(emulateIntrinsics)
-		{
-			UShort4 result;
-			result = Insert(result, Extract(lhs, 0) << UShort(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) << UShort(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) << UShort(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) << UShort(rhs), 3);
-
-			return result;
-		}
-		else
-		{
-			return RValue<UShort4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
-	}
-
-	RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
-	{
-		if(emulateIntrinsics)
-		{
-			UShort4 result;
-			result = Insert(result, Extract(lhs, 0) >> UShort(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> UShort(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) >> UShort(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) >> UShort(rhs), 3);
-
-			return result;
-		}
-		else
-		{
-			return RValue<UShort4>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
-	}
-
-	RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
-		auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-		auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
-		::basicBlock->appendInst(select);
-
-		return RValue<UShort4>(V(result));
-	}
-
-	RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
-		auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-		auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
-		::basicBlock->appendInst(select);
-
-		return RValue<UShort4>(V(result));
-	}
-
-	RValue<UShort> SaturateUnsigned(RValue<Int> x)
-	{
-		return UShort(IfThenElse(x > 0xFFFF, Int(0xFFFF), IfThenElse(x < 0, Int(0), x)));
-	}
-
-	RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			UShort4 result;
-			result = Insert(result, SaturateUnsigned(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
-			result = Insert(result, SaturateUnsigned(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
-			result = Insert(result, SaturateUnsigned(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
-			result = Insert(result, SaturateUnsigned(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto paddusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			paddusw->addArg(x.value);
-			paddusw->addArg(y.value);
-			::basicBlock->appendInst(paddusw);
-
-			return RValue<UShort4>(V(result));
-		}
-	}
-
-	RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			UShort4 result;
-			result = Insert(result, SaturateUnsigned(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
-			result = Insert(result, SaturateUnsigned(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
-			result = Insert(result, SaturateUnsigned(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
-			result = Insert(result, SaturateUnsigned(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto psubusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			psubusw->addArg(x.value);
-			psubusw->addArg(y.value);
-			::basicBlock->appendInst(psubusw);
-
-			return RValue<UShort4>(V(result));
-		}
-	}
-
-	RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			UShort4 result;
-			result = Insert(result, UShort((UInt(Extract(x, 0)) * UInt(Extract(y, 0))) >> 16), 0);
-			result = Insert(result, UShort((UInt(Extract(x, 1)) * UInt(Extract(y, 1))) >> 16), 1);
-			result = Insert(result, UShort((UInt(Extract(x, 2)) * UInt(Extract(y, 2))) >> 16), 2);
-			result = Insert(result, UShort((UInt(Extract(x, 3)) * UInt(Extract(y, 3))) >> 16), 3);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyHighUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto pmulhuw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			pmulhuw->addArg(x.value);
-			pmulhuw->addArg(y.value);
-			::basicBlock->appendInst(pmulhuw);
-
-			return RValue<UShort4>(V(result));
-		}
-	}
-
-	RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
-	{
-		// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
-
-		// Scalarized implementation.
-		Int4 result;
-		result = Insert(result, Int((Long(Extract(x, 0)) * Long(Extract(y, 0))) >> Long(Int(32))), 0);
-		result = Insert(result, Int((Long(Extract(x, 1)) * Long(Extract(y, 1))) >> Long(Int(32))), 1);
-		result = Insert(result, Int((Long(Extract(x, 2)) * Long(Extract(y, 2))) >> Long(Int(32))), 2);
-		result = Insert(result, Int((Long(Extract(x, 3)) * Long(Extract(y, 3))) >> Long(Int(32))), 3);
+		Short4 result;
+		result = Insert(result, Extract(lhs, 0) << Short(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) << Short(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) << Short(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) << Short(rhs), 3);
 
 		return result;
 	}
-
-	RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
+	else
 	{
-		// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
+		return RValue<Short4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+	}
+}
 
-		if(false)  // Partial product based implementation.
+RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
+	{
+		Short4 result;
+		result = Insert(result, Extract(lhs, 0) >> Short(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> Short(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) >> Short(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) >> Short(rhs), 3);
+
+		return result;
+	}
+	else
+	{
+		return RValue<Short4>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
+	}
+}
+
+RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
+{
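+	// Formed as a compare and select: condition = (x <= y), result = condition ? y : x.
+	// Min() below is the mirror image, comparing with Sgt instead.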
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
+	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+	auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<Short4>(V(result));
+}
+
+RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
+	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+	auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<Short4>(V(result));
+}
+
+RValue<Short> SaturateSigned(RValue<Int> x)
+{
+	return Short(IfThenElse(x > 0x7FFF, Int(0x7FFF), IfThenElse(x < -0x8000, Int(0x8000), x)));
+}
+
+RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
+{
+	if(emulateIntrinsics)
+	{
+		Short4 result;
+		result = Insert(result, SaturateSigned(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
+		result = Insert(result, SaturateSigned(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
+		result = Insert(result, SaturateSigned(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
+		result = Insert(result, SaturateSigned(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto paddsw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		paddsw->addArg(x.value);
+		paddsw->addArg(y.value);
+		::basicBlock->appendInst(paddsw);
+
+		return RValue<Short4>(V(result));
+	}
+}
+
+RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
+{
+	if(emulateIntrinsics)
+	{
+		Short4 result;
+		result = Insert(result, SaturateSigned(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
+		result = Insert(result, SaturateSigned(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
+		result = Insert(result, SaturateSigned(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
+		result = Insert(result, SaturateSigned(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto psubsw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		psubsw->addArg(x.value);
+		psubsw->addArg(y.value);
+		::basicBlock->appendInst(psubsw);
+
+		return RValue<Short4>(V(result));
+	}
+}
+
+RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
+{
+	if(emulateIntrinsics)
+	{
+		Short4 result;
+		result = Insert(result, Short((Int(Extract(x, 0)) * Int(Extract(y, 0))) >> 16), 0);
+		result = Insert(result, Short((Int(Extract(x, 1)) * Int(Extract(y, 1))) >> 16), 1);
+		result = Insert(result, Short((Int(Extract(x, 2)) * Int(Extract(y, 2))) >> 16), 2);
+		result = Insert(result, Short((Int(Extract(x, 3)) * Int(Extract(y, 3))) >> 16), 3);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyHighSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto pmulhw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		pmulhw->addArg(x.value);
+		pmulhw->addArg(y.value);
+		::basicBlock->appendInst(pmulhw);
+
+		return RValue<Short4>(V(result));
+	}
+}
+
+RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
+{
+	if(emulateIntrinsics)
+	{
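+		// Matches pmaddwd semantics: each 32-bit lane is the sum of two adjacent
+		// 16x16-bit products.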
+		Int2 result;
+		result = Insert(result, Int(Extract(x, 0)) * Int(Extract(y, 0)) + Int(Extract(x, 1)) * Int(Extract(y, 1)), 0);
+		result = Insert(result, Int(Extract(x, 2)) * Int(Extract(y, 2)) + Int(Extract(x, 3)) * Int(Extract(y, 3)), 1);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyAddPairs, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto pmaddwd = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		pmaddwd->addArg(x.value);
+		pmaddwd->addArg(y.value);
+		::basicBlock->appendInst(pmaddwd);
+
+		return As<Int2>(V(result));
+	}
+}
+
+RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
+{
+	if(emulateIntrinsics)
+	{
+		SByte8 result;
+		result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
+		result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
+		result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
+		result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
+		result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
+		result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
+		result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
+		result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		pack->addArg(x.value);
+		pack->addArg(y.value);
+		::basicBlock->appendInst(pack);
+
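+		// Each Short4 occupies the low four lanes of its v8i16, so the packed bytes
+		// land in dwords 0 and 2 of the v16i8 result; the swizzle gathers those two
+		// dwords into the low half.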
+		return As<SByte8>(Swizzle(As<Int4>(V(result)), 0x0202));
+	}
+}
+
+RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
+{
+	if(emulateIntrinsics)
+	{
+		Byte8 result;
+		result = Insert(result, SaturateUnsigned(Extract(x, 0)), 0);
+		result = Insert(result, SaturateUnsigned(Extract(x, 1)), 1);
+		result = Insert(result, SaturateUnsigned(Extract(x, 2)), 2);
+		result = Insert(result, SaturateUnsigned(Extract(x, 3)), 3);
+		result = Insert(result, SaturateUnsigned(Extract(y, 0)), 4);
+		result = Insert(result, SaturateUnsigned(Extract(y, 1)), 5);
+		result = Insert(result, SaturateUnsigned(Extract(y, 2)), 6);
+		result = Insert(result, SaturateUnsigned(Extract(y, 3)), 7);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		pack->addArg(x.value);
+		pack->addArg(y.value);
+		::basicBlock->appendInst(pack);
+
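+		// As in PackSigned above, gather dwords 0 and 2 (the meaningful halves) into
+		// the low 8 bytes.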
+		return As<Byte8>(Swizzle(As<Int4>(V(result)), 0x0202));
+	}
+}
+
+RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
+{
+	return RValue<Short4>(createIntCompare(Ice::InstIcmp::Sgt, x.value, y.value));
+}
+
+RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
+{
+	return RValue<Short4>(Nucleus::createICmpEQ(x.value, y.value));
+}
+
+Type *Short4::getType()
+{
+	return T(Type_v4i16);
+}
+
+UShort4::UShort4(RValue<Float4> cast, bool saturate)
+{
+	if(saturate)
+	{
+		if(CPUID::SSE4_1)
 		{
-			auto xh = x >> 16;
-			auto yh = y >> 16;
-			auto xl = x & UInt4(0x0000FFFF);
-			auto yl = y & UInt4(0x0000FFFF);
-			auto xlyh = xl * yh;
-			auto xhyl = xh * yl;
-			auto xlyhh = xlyh >> 16;
-			auto xhylh = xhyl >> 16;
-			auto xlyhl = xlyh & UInt4(0x0000FFFF);
-			auto xhyll = xhyl & UInt4(0x0000FFFF);
-			auto xlylh = (xl * yl) >> 16;
-			auto oflow = (xlyhl + xhyll + xlylh) >> 16;
-
-			return (xh * yh) + (xlyhh + xhylh) + oflow;
+			// x86 produces 0x80000000 on 32-bit integer overflow/underflow.
+			// PackUnsigned takes care of 0x0000 saturation.
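+			// Min() caps the input first: an out-of-range conversion yields 0x80000000,
+			// which is negative as a signed lane and would clamp to 0x0000, not 0xFFFF.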
+			Int4 int4(Min(cast, Float4(0xFFFF)));
+			*this = As<UShort4>(PackUnsigned(int4, int4));
 		}
-
-		// Scalarized implementation.
-		Int4 result;
-		result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 0))) * Long(UInt(Extract(As<Int4>(y), 0)))) >> Long(Int(32))), 0);
-		result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 1))) * Long(UInt(Extract(As<Int4>(y), 1)))) >> Long(Int(32))), 1);
-		result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 2))) * Long(UInt(Extract(As<Int4>(y), 2)))) >> Long(Int(32))), 2);
-		result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 3))) * Long(UInt(Extract(As<Int4>(y), 3)))) >> Long(Int(32))), 3);
-
-		return As<UInt4>(result);
-	}
-
-	RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		UNIMPLEMENTED("RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)");
-		return UShort4(0);
-	}
-
-	Type *UShort4::getType()
-	{
-		return T(Type_v4i16);
-	}
-
-	RValue<Short> Extract(RValue<Short8> val, int i)
-	{
-		return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));
-	}
-
-	RValue<Short8> Insert(RValue<Short8> val, RValue<Short> element, int i)
-	{
-		return RValue<Short8>(Nucleus::createInsertElement(val.value, element.value, i));
-	}
-
-	RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
-	{
-		if(emulateIntrinsics)
+		else if(CPUID::ARM)
 		{
-			Short8 result;
-			result = Insert(result, Extract(lhs, 0) << Short(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) << Short(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) << Short(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) << Short(rhs), 3);
-			result = Insert(result, Extract(lhs, 4) << Short(rhs), 4);
-			result = Insert(result, Extract(lhs, 5) << Short(rhs), 5);
-			result = Insert(result, Extract(lhs, 6) << Short(rhs), 6);
-			result = Insert(result, Extract(lhs, 7) << Short(rhs), 7);
-
-			return result;
+			// ARM saturates the 32-bit integer result on overflow/underflow.
+			Int4 int4(cast);
+			*this = As<UShort4>(PackUnsigned(int4, int4));
 		}
 		else
 		{
-			return RValue<Short8>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+			*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
 		}
 	}
-
-	RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
+	else
 	{
-		if(emulateIntrinsics)
-		{
-			Short8 result;
-			result = Insert(result, Extract(lhs, 0) >> Short(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> Short(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) >> Short(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) >> Short(rhs), 3);
-			result = Insert(result, Extract(lhs, 4) >> Short(rhs), 4);
-			result = Insert(result, Extract(lhs, 5) >> Short(rhs), 5);
-			result = Insert(result, Extract(lhs, 6) >> Short(rhs), 6);
-			result = Insert(result, Extract(lhs, 7) >> Short(rhs), 7);
+		*this = Short4(Int4(cast));
+	}
+}
 
-			return result;
-		}
-		else
-		{
-			return RValue<Short8>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+RValue<UShort> Extract(RValue<UShort4> val, int i)
+{
+	return RValue<UShort>(Nucleus::createExtractElement(val.value, UShort::getType(), i));
+}
+
+RValue<UShort4> Insert(RValue<UShort4> val, RValue<UShort> element, int i)
+{
+	return RValue<UShort4>(Nucleus::createInsertElement(val.value, element.value, i));
+}
+
+RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
+	{
+		UShort4 result;
+		result = Insert(result, Extract(lhs, 0) << UShort(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) << UShort(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) << UShort(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) << UShort(rhs), 3);
+
+		return result;
+	}
+	else
+	{
+		return RValue<UShort4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+	}
+}
+
+RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
+	{
+		UShort4 result;
+		result = Insert(result, Extract(lhs, 0) >> UShort(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> UShort(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) >> UShort(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) >> UShort(rhs), 3);
+
+		return result;
+	}
+	else
+	{
+		return RValue<UShort4>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
+	}
+}
+
+RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
+	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+	auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<UShort4>(V(result));
+}
+
+RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
+	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+	auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<UShort4>(V(result));
+}
+
+RValue<UShort> SaturateUnsigned(RValue<Int> x)
+{
+	return UShort(IfThenElse(x > 0xFFFF, Int(0xFFFF), IfThenElse(x < 0, Int(0), x)));
+}
+
+RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
+{
+	if(emulateIntrinsics)
+	{
+		UShort4 result;
+		result = Insert(result, SaturateUnsigned(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
+		result = Insert(result, SaturateUnsigned(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
+		result = Insert(result, SaturateUnsigned(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
+		result = Insert(result, SaturateUnsigned(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto paddusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		paddusw->addArg(x.value);
+		paddusw->addArg(y.value);
+		::basicBlock->appendInst(paddusw);
+
+		return RValue<UShort4>(V(result));
+	}
+}
+
+RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
+{
+	if(emulateIntrinsics)
+	{
+		UShort4 result;
+		result = Insert(result, SaturateUnsigned(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
+		result = Insert(result, SaturateUnsigned(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
+		result = Insert(result, SaturateUnsigned(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
+		result = Insert(result, SaturateUnsigned(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto psubusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		psubusw->addArg(x.value);
+		psubusw->addArg(y.value);
+		::basicBlock->appendInst(psubusw);
+
+		return RValue<UShort4>(V(result));
+	}
+}
+
+RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
+{
+	if(emulateIntrinsics)
+	{
+		UShort4 result;
+		result = Insert(result, UShort((UInt(Extract(x, 0)) * UInt(Extract(y, 0))) >> 16), 0);
+		result = Insert(result, UShort((UInt(Extract(x, 1)) * UInt(Extract(y, 1))) >> 16), 1);
+		result = Insert(result, UShort((UInt(Extract(x, 2)) * UInt(Extract(y, 2))) >> 16), 2);
+		result = Insert(result, UShort((UInt(Extract(x, 3)) * UInt(Extract(y, 3))) >> 16), 3);
+
+		return result;
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyHighUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto pmulhuw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		pmulhuw->addArg(x.value);
+		pmulhuw->addArg(y.value);
+		::basicBlock->appendInst(pmulhuw);
+
+		return RValue<UShort4>(V(result));
+	}
+}
+
+RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
+{
+	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
+
+	// Scalarized implementation.
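+	// Each pair of lanes is widened to Long so the full 64-bit product fits;
+	// the shift by 32 extracts its high half.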
+	Int4 result;
+	result = Insert(result, Int((Long(Extract(x, 0)) * Long(Extract(y, 0))) >> Long(Int(32))), 0);
+	result = Insert(result, Int((Long(Extract(x, 1)) * Long(Extract(y, 1))) >> Long(Int(32))), 1);
+	result = Insert(result, Int((Long(Extract(x, 2)) * Long(Extract(y, 2))) >> Long(Int(32))), 2);
+	result = Insert(result, Int((Long(Extract(x, 3)) * Long(Extract(y, 3))) >> Long(Int(32))), 3);
+
+	return result;
+}
+
+RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
+{
+	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
+
+	if(false)  // Partial-product-based implementation, kept disabled for reference.
+	{
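+		// With x = (xh << 16) + xl and y = (yh << 16) + yl, the full product is
+		// (xh*yh << 32) + ((xl*yh + xh*yl) << 16) + xl*yl. The high 32 bits are
+		// xh*yh plus the high halves of the cross terms, plus the carry out of the
+		// middle column, computed below as 'oflow'.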
+		auto xh = x >> 16;
+		auto yh = y >> 16;
+		auto xl = x & UInt4(0x0000FFFF);
+		auto yl = y & UInt4(0x0000FFFF);
+		auto xlyh = xl * yh;
+		auto xhyl = xh * yl;
+		auto xlyhh = xlyh >> 16;
+		auto xhylh = xhyl >> 16;
+		auto xlyhl = xlyh & UInt4(0x0000FFFF);
+		auto xhyll = xhyl & UInt4(0x0000FFFF);
+		auto xlylh = (xl * yl) >> 16;
+		auto oflow = (xlyhl + xhyll + xlylh) >> 16;
+
+		return (xh * yh) + (xlyhh + xhylh) + oflow;
 	}
 
-	RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
+	// Scalarized implementation.
+	Int4 result;
+	result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 0))) * Long(UInt(Extract(As<Int4>(y), 0)))) >> Long(Int(32))), 0);
+	result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 1))) * Long(UInt(Extract(As<Int4>(y), 1)))) >> Long(Int(32))), 1);
+	result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 2))) * Long(UInt(Extract(As<Int4>(y), 2)))) >> Long(Int(32))), 2);
+	result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 3))) * Long(UInt(Extract(As<Int4>(y), 3)))) >> Long(Int(32))), 3);
+
+	return As<UInt4>(result);
+}
+
+RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
+{
+	UNIMPLEMENTED("RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)");
+	return UShort4(0);
+}
+
+Type *UShort4::getType()
+{
+	return T(Type_v4i16);
+}
+
+RValue<Short> Extract(RValue<Short8> val, int i)
+{
+	return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));
+}
+
+RValue<Short8> Insert(RValue<Short8> val, RValue<Short> element, int i)
+{
+	return RValue<Short8>(Nucleus::createInsertElement(val.value, element.value, i));
+}
+
+RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		UNIMPLEMENTED("RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)");
-		return Int4(0);
-	}
+		Short8 result;
+		result = Insert(result, Extract(lhs, 0) << Short(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) << Short(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) << Short(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) << Short(rhs), 3);
+		result = Insert(result, Extract(lhs, 4) << Short(rhs), 4);
+		result = Insert(result, Extract(lhs, 5) << Short(rhs), 5);
+		result = Insert(result, Extract(lhs, 6) << Short(rhs), 6);
+		result = Insert(result, Extract(lhs, 7) << Short(rhs), 7);
 
-	RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
+		return result;
+	}
+	else
 	{
-		UNIMPLEMENTED("RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)");
-		return Short8(0);
+		return RValue<Short8>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	Type *Short8::getType()
+RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		return T(Ice::IceType_v8i16);
-	}
+		Short8 result;
+		result = Insert(result, Extract(lhs, 0) >> Short(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> Short(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) >> Short(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) >> Short(rhs), 3);
+		result = Insert(result, Extract(lhs, 4) >> Short(rhs), 4);
+		result = Insert(result, Extract(lhs, 5) >> Short(rhs), 5);
+		result = Insert(result, Extract(lhs, 6) >> Short(rhs), 6);
+		result = Insert(result, Extract(lhs, 7) >> Short(rhs), 7);
 
-	RValue<UShort> Extract(RValue<UShort8> val, int i)
+		return result;
+	}
+	else
 	{
-		return RValue<UShort>(Nucleus::createExtractElement(val.value, UShort::getType(), i));
+		return RValue<Short8>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	RValue<UShort8> Insert(RValue<UShort8> val, RValue<UShort> element, int i)
+RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
+{
+	UNIMPLEMENTED("RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)");
+	return Int4(0);
+}
+
+RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
+{
+	UNIMPLEMENTED("RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)");
+	return Short8(0);
+}
+
+Type *Short8::getType()
+{
+	return T(Ice::IceType_v8i16);
+}
+
+RValue<UShort> Extract(RValue<UShort8> val, int i)
+{
+	return RValue<UShort>(Nucleus::createExtractElement(val.value, UShort::getType(), i));
+}
+
+RValue<UShort8> Insert(RValue<UShort8> val, RValue<UShort> element, int i)
+{
+	return RValue<UShort8>(Nucleus::createInsertElement(val.value, element.value, i));
+}
+
+RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		return RValue<UShort8>(Nucleus::createInsertElement(val.value, element.value, i));
-	}
+		UShort8 result;
+		result = Insert(result, Extract(lhs, 0) << UShort(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) << UShort(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) << UShort(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) << UShort(rhs), 3);
+		result = Insert(result, Extract(lhs, 4) << UShort(rhs), 4);
+		result = Insert(result, Extract(lhs, 5) << UShort(rhs), 5);
+		result = Insert(result, Extract(lhs, 6) << UShort(rhs), 6);
+		result = Insert(result, Extract(lhs, 7) << UShort(rhs), 7);
 
-	RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
+		return result;
+	}
+	else
 	{
-		if(emulateIntrinsics)
-		{
-			UShort8 result;
-			result = Insert(result, Extract(lhs, 0) << UShort(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) << UShort(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) << UShort(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) << UShort(rhs), 3);
-			result = Insert(result, Extract(lhs, 4) << UShort(rhs), 4);
-			result = Insert(result, Extract(lhs, 5) << UShort(rhs), 5);
-			result = Insert(result, Extract(lhs, 6) << UShort(rhs), 6);
-			result = Insert(result, Extract(lhs, 7) << UShort(rhs), 7);
-
-			return result;
-		}
-		else
-		{
-			return RValue<UShort8>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return RValue<UShort8>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
+RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		if(emulateIntrinsics)
-		{
-			UShort8 result;
-			result = Insert(result, Extract(lhs, 0) >> UShort(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> UShort(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) >> UShort(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) >> UShort(rhs), 3);
-			result = Insert(result, Extract(lhs, 4) >> UShort(rhs), 4);
-			result = Insert(result, Extract(lhs, 5) >> UShort(rhs), 5);
-			result = Insert(result, Extract(lhs, 6) >> UShort(rhs), 6);
-			result = Insert(result, Extract(lhs, 7) >> UShort(rhs), 7);
+		UShort8 result;
+		result = Insert(result, Extract(lhs, 0) >> UShort(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> UShort(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) >> UShort(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) >> UShort(rhs), 3);
+		result = Insert(result, Extract(lhs, 4) >> UShort(rhs), 4);
+		result = Insert(result, Extract(lhs, 5) >> UShort(rhs), 5);
+		result = Insert(result, Extract(lhs, 6) >> UShort(rhs), 6);
+		result = Insert(result, Extract(lhs, 7) >> UShort(rhs), 7);
 
-			return result;
-		}
-		else
-		{
-			return RValue<UShort8>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return result;
 	}
-
-	RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)
+	else
 	{
-		UNIMPLEMENTED("RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)");
-		return UShort8(0);
+		return RValue<UShort8>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
-	{
-		UNIMPLEMENTED("RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)");
-		return UShort8(0);
-	}
+RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)
+{
+	UNIMPLEMENTED("RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)");
+	return UShort8(0);
+}
 
-	// FIXME: Implement as Shuffle(x, y, Select(i0, ..., i16)) and Shuffle(x, y, SELECT_PACK_REPEAT(element))
+RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
+{
+	UNIMPLEMENTED("RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)");
+	return UShort8(0);
+}
+
+// FIXME: Implement as Shuffle(x, y, Select(i0, ..., i16)) and Shuffle(x, y, SELECT_PACK_REPEAT(element))
 //	RValue<UShort8> PackRepeat(RValue<Byte16> x, RValue<Byte16> y, int element)
 //	{
 //		ASSERT(false && "UNIMPLEMENTED"); return RValue<UShort8>(V(nullptr));
 //	}
 
-	Type *UShort8::getType()
+Type *UShort8::getType()
+{
+	return T(Ice::IceType_v8i16);
+}
+
+RValue<Int> operator++(Int &val, int)   // Post-increment
+{
+	RValue<Int> res = val;
+	val += 1;
+	return res;
+}
+
+const Int &operator++(Int &val)   // Pre-increment
+{
+	val += 1;
+	return val;
+}
+
+RValue<Int> operator--(Int &val, int)   // Post-decrement
+{
+	RValue<Int> res = val;
+	val -= 1;
+	return res;
+}
+
+const Int &operator--(Int &val)   // Pre-decrement
+{
+	val -= 1;
+	return val;
+}
+
+RValue<Int> RoundInt(RValue<Float> cast)
+{
+	if(emulateIntrinsics || CPUID::ARM)
 	{
-		return T(Ice::IceType_v8i16);
+		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
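+		// 0x00C00000 is 12582912.0f = 1.5 * 2^23. Adding it (assuming round-to-nearest)
+		// yields a sum whose ULP is 1, so the fraction rounds away; the subtraction
+		// then restores the magnitude.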
+		return Int((cast + Float(0x00C00000)) - Float(0x00C00000));
 	}
-
-	RValue<Int> operator++(Int &val, int)   // Post-increment
+	else
 	{
-		RValue<Int> res = val;
-		val += 1;
-		return res;
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		nearbyint->addArg(cast.value);
+		::basicBlock->appendInst(nearbyint);
+
+		return RValue<Int>(V(result));
 	}
+}
 
-	const Int &operator++(Int &val)   // Pre-increment
-	{
-		val += 1;
-		return val;
-	}
+Type *Int::getType()
+{
+	return T(Ice::IceType_i32);
+}
 
-	RValue<Int> operator--(Int &val, int)   // Post-decrement
-	{
-		RValue<Int> res = val;
-		val -= 1;
-		return res;
-	}
+Type *Long::getType()
+{
+	return T(Ice::IceType_i64);
+}
 
-	const Int &operator--(Int &val)   // Pre-decrement
-	{
-		val -= 1;
-		return val;
-	}
+UInt::UInt(RValue<Float> cast)
+{
+	// Smallest positive value representable in UInt, but not in Int
+	const unsigned int ustart = 0x80000000u;
+	const float ustartf = float(ustart);
 
-	RValue<Int> RoundInt(RValue<Float> cast)
-	{
-		if(emulateIntrinsics || CPUID::ARM)
-		{
-			// Push the fractional part off the mantissa. Accurate up to +/-2^22.
-			return Int((cast + Float(0x00C00000)) - Float(0x00C00000));
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			nearbyint->addArg(cast.value);
-			::basicBlock->appendInst(nearbyint);
+	// If the value is negative, store 0, otherwise store the result of the conversion
+	storeValue((~(As<Int>(cast) >> 31) &
+	// Check if the value can be represented as an Int
+		IfThenElse(cast >= ustartf,
+	// If the value is too large, subtract ustart and re-add it after conversion.
+			As<Int>(As<UInt>(Int(cast - Float(ustartf))) + UInt(ustart)),
+	// Otherwise, just convert normally
+			Int(cast))).value);
+}
 
-			return RValue<Int>(V(result));
-		}
-	}
+RValue<UInt> operator++(UInt &val, int)   // Post-increment
+{
+	RValue<UInt> res = val;
+	val += 1;
+	return res;
+}
 
-	Type *Int::getType()
-	{
-		return T(Ice::IceType_i32);
-	}
+const UInt &operator++(UInt &val)   // Pre-increment
+{
+	val += 1;
+	return val;
+}
 
-	Type *Long::getType()
-	{
-		return T(Ice::IceType_i64);
-	}
+RValue<UInt> operator--(UInt &val, int)   // Post-decrement
+{
+	RValue<UInt> res = val;
+	val -= 1;
+	return res;
+}
 
-	UInt::UInt(RValue<Float> cast)
-	{
-		// Smallest positive value representable in UInt, but not in Int
-		const unsigned int ustart = 0x80000000u;
-		const float ustartf = float(ustart);
-
-		// If the value is negative, store 0, otherwise store the result of the conversion
-		storeValue((~(As<Int>(cast) >> 31) &
-		// Check if the value can be represented as an Int
-			IfThenElse(cast >= ustartf,
-		// If the value is too large, subtract ustart and re-add it after conversion.
-				As<Int>(As<UInt>(Int(cast - Float(ustartf))) + UInt(ustart)),
-		// Otherwise, just convert normally
-				Int(cast))).value);
-	}
-
-	RValue<UInt> operator++(UInt &val, int)   // Post-increment
-	{
-		RValue<UInt> res = val;
-		val += 1;
-		return res;
-	}
-
-	const UInt &operator++(UInt &val)   // Pre-increment
-	{
-		val += 1;
-		return val;
-	}
-
-	RValue<UInt> operator--(UInt &val, int)   // Post-decrement
-	{
-		RValue<UInt> res = val;
-		val -= 1;
-		return res;
-	}
-
-	const UInt &operator--(UInt &val)   // Pre-decrement
-	{
-		val -= 1;
-		return val;
-	}
+const UInt &operator--(UInt &val)   // Pre-decrement
+{
+	val -= 1;
+	return val;
+}
 
 //	RValue<UInt> RoundUInt(RValue<Float> cast)
 //	{
 //		ASSERT(false && "UNIMPLEMENTED"); return RValue<UInt>(V(nullptr));
 //	}
 
-	Type *UInt::getType()
-	{
-		return T(Ice::IceType_i32);
-	}
+Type *UInt::getType()
+{
+	return T(Ice::IceType_i32);
+}
 
 //	Int2::Int2(RValue<Int> cast)
 //	{
@@ -2765,1052 +2764,1052 @@
 //		storeValue(replicate);
 //	}
 
-	RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
+RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		if(emulateIntrinsics)
-		{
-			Int2 result;
-			result = Insert(result, Extract(lhs, 0) << Int(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) << Int(rhs), 1);
+		Int2 result;
+		result = Insert(result, Extract(lhs, 0) << Int(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) << Int(rhs), 1);
 
-			return result;
-		}
-		else
-		{
-			return RValue<Int2>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return result;
 	}
-
-	RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
+	else
 	{
-		if(emulateIntrinsics)
-		{
-			Int2 result;
-			result = Insert(result, Extract(lhs, 0) >> Int(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> Int(rhs), 1);
-
-			return result;
-		}
-		else
-		{
-			return RValue<Int2>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return RValue<Int2>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	Type *Int2::getType()
+RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		return T(Type_v2i32);
+		Int2 result;
+		result = Insert(result, Extract(lhs, 0) >> Int(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> Int(rhs), 1);
+
+		return result;
 	}
-
-	RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
+	else
 	{
-		if(emulateIntrinsics)
-		{
-			UInt2 result;
-			result = Insert(result, Extract(lhs, 0) << UInt(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) << UInt(rhs), 1);
-
-			return result;
-		}
-		else
-		{
-			return RValue<UInt2>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return RValue<Int2>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
+Type *Int2::getType()
+{
+	return T(Type_v2i32);
+}
+
+RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		if(emulateIntrinsics)
-		{
-			UInt2 result;
-			result = Insert(result, Extract(lhs, 0) >> UInt(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> UInt(rhs), 1);
+		UInt2 result;
+		result = Insert(result, Extract(lhs, 0) << UInt(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) << UInt(rhs), 1);
 
-			return result;
-		}
-		else
-		{
-			return RValue<UInt2>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return result;
 	}
-
-	Type *UInt2::getType()
+	else
 	{
-		return T(Type_v2i32);
+		return RValue<UInt2>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	Int4::Int4(RValue<Byte4> cast) : XYZW(this)
+RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		Value *x = Nucleus::createBitCast(cast.value, Int::getType());
-		Value *a = Nucleus::createInsertElement(loadValue(), x, 0);
+		UInt2 result;
+		result = Insert(result, Extract(lhs, 0) >> UInt(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> UInt(rhs), 1);
 
-		Value *e;
-		int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
-		Value *b = Nucleus::createBitCast(a, Byte16::getType());
-		Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Byte16::getType())), swizzle);
-
-		int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};
-		Value *d = Nucleus::createBitCast(c, Short8::getType());
-		e = Nucleus::createShuffleVector(d, V(Nucleus::createNullValue(Short8::getType())), swizzle2);
-
-		Value *f = Nucleus::createBitCast(e, Int4::getType());
-		storeValue(f);
+		return result;
 	}
-
-	Int4::Int4(RValue<SByte4> cast) : XYZW(this)
+	else
 	{
-		Value *x = Nucleus::createBitCast(cast.value, Int::getType());
-		Value *a = Nucleus::createInsertElement(loadValue(), x, 0);
-
-		int swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
-		Value *b = Nucleus::createBitCast(a, Byte16::getType());
-		Value *c = Nucleus::createShuffleVector(b, b, swizzle);
-
-		int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
-		Value *d = Nucleus::createBitCast(c, Short8::getType());
-		Value *e = Nucleus::createShuffleVector(d, d, swizzle2);
-
-		*this = As<Int4>(e) >> 24;
+		return RValue<UInt2>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	Int4::Int4(RValue<Short4> cast) : XYZW(this)
+Type *UInt2::getType()
+{
+	return T(Type_v2i32);
+}
+
+Int4::Int4(RValue<Byte4> cast) : XYZW(this)
+{
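+	// Zero-extends four bytes to four ints: interleave with zero bytes, then with zero shorts.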
+	Value *x = Nucleus::createBitCast(cast.value, Int::getType());
+	Value *a = Nucleus::createInsertElement(loadValue(), x, 0);
+
+	Value *e;
+	int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
+	Value *b = Nucleus::createBitCast(a, Byte16::getType());
+	Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Byte16::getType())), swizzle);
+
+	int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+	Value *d = Nucleus::createBitCast(c, Short8::getType());
+	e = Nucleus::createShuffleVector(d, V(Nucleus::createNullValue(Short8::getType())), swizzle2);
+
+	Value *f = Nucleus::createBitCast(e, Int4::getType());
+	storeValue(f);
+}
+
+Int4::Int4(RValue<SByte4> cast) : XYZW(this)
+{
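+	// Sign-extends by replicating each byte across its int lane, then arithmetic-shifting right by 24.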
+	Value *x = Nucleus::createBitCast(cast.value, Int::getType());
+	Value *a = Nucleus::createInsertElement(loadValue(), x, 0);
+
+	int swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
+	Value *b = Nucleus::createBitCast(a, Byte16::getType());
+	Value *c = Nucleus::createShuffleVector(b, b, swizzle);
+
+	int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
+	Value *d = Nucleus::createBitCast(c, Short8::getType());
+	Value *e = Nucleus::createShuffleVector(d, d, swizzle2);
+
+	*this = As<Int4>(e) >> 24;
+}
+
+Int4::Int4(RValue<Short4> cast) : XYZW(this)
+{
+	int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
+	Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);
+
+	*this = As<Int4>(c) >> 16;
+}
+
+Int4::Int4(RValue<UShort4> cast) : XYZW(this)
+{
+	int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+	Value *c = Nucleus::createShuffleVector(cast.value, Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
+	Value *d = Nucleus::createBitCast(c, Int4::getType());
+	storeValue(d);
+}
+
+Int4::Int4(RValue<Int> rhs) : XYZW(this)
+{
+	Value *vector = Nucleus::createBitCast(rhs.value, Int4::getType());
+
+	int swizzle[4] = {0, 0, 0, 0};
+	Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
+
+	storeValue(replicate);
+}
+
+RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
-		Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);
+		Int4 result;
+		result = Insert(result, Extract(lhs, 0) << Int(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) << Int(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) << Int(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) << Int(rhs), 3);
 
-		*this = As<Int4>(c) >> 16;
+		return result;
 	}
-
-	Int4::Int4(RValue<UShort4> cast) : XYZW(this)
+	else
 	{
-		int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
-		Value *c = Nucleus::createShuffleVector(cast.value, Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
-		Value *d = Nucleus::createBitCast(c, Int4::getType());
-		storeValue(d);
+		return RValue<Int4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	Int4::Int4(RValue<Int> rhs) : XYZW(this)
+RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		Value *vector = Nucleus::createBitCast(rhs.value, Int4::getType());
+		Int4 result;
+		result = Insert(result, Extract(lhs, 0) >> Int(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> Int(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) >> Int(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) >> Int(rhs), 3);
 
-		int swizzle[4] = {0, 0, 0, 0};
-		Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
-
-		storeValue(replicate);
+		return result;
 	}
-
-	RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
+	else
 	{
-		if(emulateIntrinsics)
-		{
-			Int4 result;
-			result = Insert(result, Extract(lhs, 0) << Int(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) << Int(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) << Int(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) << Int(rhs), 3);
-
-			return result;
-		}
-		else
-		{
-			return RValue<Int4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return RValue<Int4>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
+RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
+{
+	return RValue<Int4>(Nucleus::createICmpEQ(x.value, y.value));
+}
+
+RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
+{
+	return RValue<Int4>(Nucleus::createICmpSLT(x.value, y.value));
+}
+
+RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
+{
+	return RValue<Int4>(Nucleus::createICmpSLE(x.value, y.value));
+}
+
+RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
+{
+	return RValue<Int4>(Nucleus::createICmpNE(x.value, y.value));
+}
+
+RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
+{
+	return RValue<Int4>(Nucleus::createICmpSGE(x.value, y.value));
+}
+
+RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
+{
+	return RValue<Int4>(Nucleus::createICmpSGT(x.value, y.value));
+}
+
+RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
+{
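+	// Vector max expressed as compare-and-select: pick y wherever x <= y, else x.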
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
+	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
+	auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<Int4>(V(result));
+}
+
+RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
+	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
+	auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<Int4>(V(result));
+}
+
+RValue<Int4> RoundInt(RValue<Float4> cast)
+{
+	if(emulateIntrinsics || CPUID::ARM)
 	{
-		if(emulateIntrinsics)
-		{
-			Int4 result;
-			result = Insert(result, Extract(lhs, 0) >> Int(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> Int(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) >> Int(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) >> Int(rhs), 3);
-
-			return result;
-		}
-		else
-		{
-			return RValue<Int4>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
+		return Int4((cast + Float4(0x00C00000)) - Float4(0x00C00000));
 	}
-
-	RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
+	else
 	{
-		return RValue<Int4>(Nucleus::createICmpEQ(x.value, y.value));
-	}
-
-	RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
-	{
-		return RValue<Int4>(Nucleus::createICmpSLT(x.value, y.value));
-	}
-
-	RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
-	{
-		return RValue<Int4>(Nucleus::createICmpSLE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
-	{
-		return RValue<Int4>(Nucleus::createICmpNE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
-	{
-		return RValue<Int4>(Nucleus::createICmpSGE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
-	{
-		return RValue<Int4>(Nucleus::createICmpSGT(x.value, y.value));
-	}
-
-	RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
-		auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
-		auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
-		::basicBlock->appendInst(select);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		nearbyint->addArg(cast.value);
+		::basicBlock->appendInst(nearbyint);
 
 		return RValue<Int4>(V(result));
 	}
+}
 
-	RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
+RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
+{
+	if(emulateIntrinsics)
 	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
-		auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
+		Short8 result;
+		result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
+		result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
+		result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
+		result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
+		result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
+		result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
+		result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
+		result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
 
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
-		auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
-		::basicBlock->appendInst(select);
-
-		return RValue<Int4>(V(result));
+		return result;
 	}
-
-	RValue<Int4> RoundInt(RValue<Float4> cast)
+	else
 	{
-		if(emulateIntrinsics || CPUID::ARM)
-		{
-			// Push the fractional part off the mantissa. Accurate up to +/-2^22.
-			return Int4((cast + Float4(0x00C00000)) - Float4(0x00C00000));
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			nearbyint->addArg(cast.value);
-			::basicBlock->appendInst(nearbyint);
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		pack->addArg(x.value);
+		pack->addArg(y.value);
+		::basicBlock->appendInst(pack);
 
-			return RValue<Int4>(V(result));
-		}
+		return RValue<Short8>(V(result));
 	}
+}
 
-	RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
+RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
+{
+	if(emulateIntrinsics || !(CPUID::SSE4_1 || CPUID::ARM))
 	{
-		if(emulateIntrinsics)
-		{
-			Short8 result;
-			result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
-			result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
-			result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
-			result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
-			result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
-			result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
-			result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
-			result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
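+		// Fallback: clamp negative lanes to zero, bias by -0x8000 into signed short
+		// range, pack with signed saturation, then add 0x8000 back as an unsigned offset.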
+		RValue<Int4> sx = As<Int4>(x);
+		RValue<Int4> bx = (sx & ~(sx >> 31)) - Int4(0x8000);
 
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			pack->addArg(x.value);
-			pack->addArg(y.value);
-			::basicBlock->appendInst(pack);
+		RValue<Int4> sy = As<Int4>(y);
+		RValue<Int4> by = (sy & ~(sy >> 31)) - Int4(0x8000);
 
-			return RValue<Short8>(V(result));
-		}
+		return As<UShort8>(PackSigned(bx, by) + Short8(0x8000u));
 	}
-
-	RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
+	else
 	{
-		if(emulateIntrinsics || !(CPUID::SSE4_1 || CPUID::ARM))
-		{
-			RValue<Int4> sx = As<Int4>(x);
-			RValue<Int4> bx = (sx & ~(sx >> 31)) - Int4(0x8000);
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		pack->addArg(x.value);
+		pack->addArg(y.value);
+		::basicBlock->appendInst(pack);
 
-			RValue<Int4> sy = As<Int4>(y);
-			RValue<Int4> by = (sy & ~(sy >> 31)) - Int4(0x8000);
-
-			return As<UShort8>(PackSigned(bx, by) + Short8(0x8000u));
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			pack->addArg(x.value);
-			pack->addArg(y.value);
-			::basicBlock->appendInst(pack);
-
-			return RValue<UShort8>(V(result));
-		}
+		return RValue<UShort8>(V(result));
 	}
+}
 
-	RValue<Int> SignMask(RValue<Int4> x)
+RValue<Int> SignMask(RValue<Int4> x)
+{
+	if(emulateIntrinsics || CPUID::ARM)
 	{
-		if(emulateIntrinsics || CPUID::ARM)
-		{
-			Int4 xx = (x >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);
-			return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			movmsk->addArg(x.value);
-			::basicBlock->appendInst(movmsk);
-
-			return RValue<Int>(V(result));
-		}
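+		// Move each lane's sign bit to a distinct bit position, then OR the four lanes together.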
+		Int4 xx = (x >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);
+		return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);
 	}
-
-	Type *Int4::getType()
+	else
 	{
-		return T(Ice::IceType_v4i32);
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		movmsk->addArg(x.value);
+		::basicBlock->appendInst(movmsk);
+
+		return RValue<Int>(V(result));
 	}
+}
 
-	UInt4::UInt4(RValue<Float4> cast) : XYZW(this)
+Type *Int4::getType()
+{
+	return T(Ice::IceType_v4i32);
+}
+
+UInt4::UInt4(RValue<Float4> cast) : XYZW(this)
+{
+	// Smallest positive value representable in UInt, but not in Int
+	const unsigned int ustart = 0x80000000u;
+	const float ustartf = float(ustart);
+
+	// Check if the value can be represented as an Int
+	Int4 uiValue = CmpNLT(cast, Float4(ustartf));
+	// If the value is too large, subtract ustart and re-add it after conversion.
+	uiValue = (uiValue & As<Int4>(As<UInt4>(Int4(cast - Float4(ustartf))) + UInt4(ustart))) |
+	// Otherwise, just convert normally
+	          (~uiValue & Int4(cast));
+	// If the value is negative, store 0, otherwise store the result of the conversion
+	storeValue((~(As<Int4>(cast) >> 31) & uiValue).value);
+}
+
+UInt4::UInt4(RValue<UInt> rhs) : XYZW(this)
+{
+	Value *vector = Nucleus::createBitCast(rhs.value, UInt4::getType());
+
+	int swizzle[4] = {0, 0, 0, 0};
+	Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
+
+	storeValue(replicate);
+}
+
+RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		// Smallest positive value representable in UInt, but not in Int
-		const unsigned int ustart = 0x80000000u;
-		const float ustartf = float(ustart);
+		UInt4 result;
+		result = Insert(result, Extract(lhs, 0) << UInt(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) << UInt(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) << UInt(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) << UInt(rhs), 3);
 
-		// Check if the value can be represented as an Int
-		Int4 uiValue = CmpNLT(cast, Float4(ustartf));
-		// If the value is too large, subtract ustart and re-add it after conversion.
-		uiValue = (uiValue & As<Int4>(As<UInt4>(Int4(cast - Float4(ustartf))) + UInt4(ustart))) |
-		// Otherwise, just convert normally
-		          (~uiValue & Int4(cast));
-		// If the value is negative, store 0, otherwise store the result of the conversion
-		storeValue((~(As<Int4>(cast) >> 31) & uiValue).value);
+		return result;
 	}
-
-	UInt4::UInt4(RValue<UInt> rhs) : XYZW(this)
+	else
 	{
-		Value *vector = Nucleus::createBitCast(rhs.value, UInt4::getType());
-
-		int swizzle[4] = {0, 0, 0, 0};
-		Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
-
-		storeValue(replicate);
+		return RValue<UInt4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
+RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
+{
+	if(emulateIntrinsics)
 	{
-		if(emulateIntrinsics)
-		{
-			UInt4 result;
-			result = Insert(result, Extract(lhs, 0) << UInt(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) << UInt(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) << UInt(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) << UInt(rhs), 3);
+		UInt4 result;
+		result = Insert(result, Extract(lhs, 0) >> UInt(rhs), 0);
+		result = Insert(result, Extract(lhs, 1) >> UInt(rhs), 1);
+		result = Insert(result, Extract(lhs, 2) >> UInt(rhs), 2);
+		result = Insert(result, Extract(lhs, 3) >> UInt(rhs), 3);
 
-			return result;
-		}
-		else
-		{
-			return RValue<UInt4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return result;
 	}
-
-	RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
+	else
 	{
-		if(emulateIntrinsics)
-		{
-			UInt4 result;
-			result = Insert(result, Extract(lhs, 0) >> UInt(rhs), 0);
-			result = Insert(result, Extract(lhs, 1) >> UInt(rhs), 1);
-			result = Insert(result, Extract(lhs, 2) >> UInt(rhs), 2);
-			result = Insert(result, Extract(lhs, 3) >> UInt(rhs), 3);
-
-			return result;
-		}
-		else
-		{
-			return RValue<UInt4>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
-		}
+		return RValue<UInt4>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
 	}
+}
 
-	RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
+RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
+{
+	return RValue<UInt4>(Nucleus::createICmpEQ(x.value, y.value));
+}
+
+RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
+{
+	return RValue<UInt4>(Nucleus::createICmpULT(x.value, y.value));
+}
+
+RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
+{
+	return RValue<UInt4>(Nucleus::createICmpULE(x.value, y.value));
+}
+
+RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
+{
+	return RValue<UInt4>(Nucleus::createICmpNE(x.value, y.value));
+}
+
+RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
+{
+	return RValue<UInt4>(Nucleus::createICmpUGE(x.value, y.value));
+}
+
+RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
+{
+	return RValue<UInt4>(Nucleus::createICmpUGT(x.value, y.value));
+}
+
+RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
+	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
+	auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<UInt4>(V(result));
+}
+
+RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
+	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
+	auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<UInt4>(V(result));
+}
+
+Type *UInt4::getType()
+{
+	return T(Ice::IceType_v4i32);
+}
+
+Type *Half::getType()
+{
+	return T(Ice::IceType_i16);
+}
+
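+// The _pp suffix marks partial-precision variants; this backend uses full-precision
+// division and ignores the exactAtPow2 hint.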
+RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
+{
+	return 1.0f / x;
+}
+
+RValue<Float> RcpSqrt_pp(RValue<Float> x)
+{
+	return Rcp_pp(Sqrt(x));
+}
+
+RValue<Float> Sqrt(RValue<Float> x)
+{
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_f32);
+	const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+	auto target = ::context->getConstantUndef(Ice::IceType_i32);
+	auto sqrt = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+	sqrt->addArg(x.value);
+	::basicBlock->appendInst(sqrt);
+
+	return RValue<Float>(V(result));
+}
+
+RValue<Float> Round(RValue<Float> x)
+{
+	return Float4(Round(Float4(x))).x;
+}
+
+RValue<Float> Trunc(RValue<Float> x)
+{
+	return Float4(Trunc(Float4(x))).x;
+}
+
+RValue<Float> Frac(RValue<Float> x)
+{
+	return Float4(Frac(Float4(x))).x;
+}
+
+RValue<Float> Floor(RValue<Float> x)
+{
+	return Float4(Floor(Float4(x))).x;
+}
+
+RValue<Float> Ceil(RValue<Float> x)
+{
+	return Float4(Ceil(Float4(x))).x;
+}
+
+Type *Float::getType()
+{
+	return T(Ice::IceType_f32);
+}
+
+Type *Float2::getType()
+{
+	return T(Type_v2f32);
+}
+
+Float4::Float4(RValue<Float> rhs) : XYZW(this)
+{
+	Value *vector = Nucleus::createBitCast(rhs.value, Float4::getType());
+
+	int swizzle[4] = {0, 0, 0, 0};
+	Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
+
+	storeValue(replicate);
+}
+
+RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
+{
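+	// Ogt is an ordered compare: NaN lanes compare false, so y is selected for them.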
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
+	auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Ogt, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
+	auto select = Ice::InstSelect::create(::function, result, condition, x.value, y.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<Float4>(V(result));
+}
+
+RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
+{
+	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
+	auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Olt, condition, x.value, y.value);
+	::basicBlock->appendInst(cmp);
+
+	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
+	auto select = Ice::InstSelect::create(::function, result, condition, x.value, y.value);
+	::basicBlock->appendInst(select);
+
+	return RValue<Float4>(V(result));
+}
+
+RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
+{
+	return Float4(1.0f) / x;
+}
+
+RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
+{
+	return Rcp_pp(Sqrt(x));
+}
+
+RValue<Float4> Sqrt(RValue<Float4> x)
+{
+	if(emulateIntrinsics || CPUID::ARM)
 	{
-		return RValue<UInt4>(Nucleus::createICmpEQ(x.value, y.value));
+		Float4 result;
+		result.x = Sqrt(Float(Float4(x).x));
+		result.y = Sqrt(Float(Float4(x).y));
+		result.z = Sqrt(Float(Float4(x).z));
+		result.w = Sqrt(Float(Float4(x).w));
+
+		return result;
 	}
-
-	RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
+	else
 	{
-		return RValue<UInt4>(Nucleus::createICmpULT(x.value, y.value));
-	}
-
-	RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		return RValue<UInt4>(Nucleus::createICmpULE(x.value, y.value));
-	}
-
-	RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		return RValue<UInt4>(Nucleus::createICmpNE(x.value, y.value));
-	}
-
-	RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		return RValue<UInt4>(Nucleus::createICmpUGE(x.value, y.value));
-	}
-
-	RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		return RValue<UInt4>(Nucleus::createICmpUGT(x.value, y.value));
-	}
-
-	RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
-		auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
-		auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
-		::basicBlock->appendInst(select);
-
-		return RValue<UInt4>(V(result));
-	}
-
-	RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
-		auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
-		auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
-		::basicBlock->appendInst(select);
-
-		return RValue<UInt4>(V(result));
-	}
-
-	Type *UInt4::getType()
-	{
-		return T(Ice::IceType_v4i32);
-	}
-
-	Type *Half::getType()
-	{
-		return T(Ice::IceType_i16);
-	}
-
-	RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
-	{
-		return 1.0f / x;
-	}
-
-	RValue<Float> RcpSqrt_pp(RValue<Float> x)
-	{
-		return Rcp_pp(Sqrt(x));
-	}
-
-	RValue<Float> Sqrt(RValue<Float> x)
-	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_f32);
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
 		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
 		auto target = ::context->getConstantUndef(Ice::IceType_i32);
 		auto sqrt = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
 		sqrt->addArg(x.value);
 		::basicBlock->appendInst(sqrt);
 
-		return RValue<Float>(V(result));
-	}
-
-	RValue<Float> Round(RValue<Float> x)
-	{
-		return Float4(Round(Float4(x))).x;
-	}
-
-	RValue<Float> Trunc(RValue<Float> x)
-	{
-		return Float4(Trunc(Float4(x))).x;
-	}
-
-	RValue<Float> Frac(RValue<Float> x)
-	{
-		return Float4(Frac(Float4(x))).x;
-	}
-
-	RValue<Float> Floor(RValue<Float> x)
-	{
-		return Float4(Floor(Float4(x))).x;
-	}
-
-	RValue<Float> Ceil(RValue<Float> x)
-	{
-		return Float4(Ceil(Float4(x))).x;
-	}
-
-	Type *Float::getType()
-	{
-		return T(Ice::IceType_f32);
-	}
-
-	Type *Float2::getType()
-	{
-		return T(Type_v2f32);
-	}
-
-	Float4::Float4(RValue<Float> rhs) : XYZW(this)
-	{
-		Value *vector = Nucleus::createBitCast(rhs.value, Float4::getType());
-
-		int swizzle[4] = {0, 0, 0, 0};
-		Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
-
-		storeValue(replicate);
-	}
-
-	RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
-		auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Ogt, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
-		auto select = Ice::InstSelect::create(::function, result, condition, x.value, y.value);
-		::basicBlock->appendInst(select);
-
 		return RValue<Float4>(V(result));
 	}
-
-	RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
-	{
-		Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
-		auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Olt, condition, x.value, y.value);
-		::basicBlock->appendInst(cmp);
-
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
-		auto select = Ice::InstSelect::create(::function, result, condition, x.value, y.value);
-		::basicBlock->appendInst(select);
-
-		return RValue<Float4>(V(result));
-	}
-
-	RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
-	{
-		return Float4(1.0f) / x;
-	}
-
-	RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
-	{
-		return Rcp_pp(Sqrt(x));
-	}
-
-	RValue<Float4> Sqrt(RValue<Float4> x)
-	{
-		if(emulateIntrinsics || CPUID::ARM)
-		{
-			Float4 result;
-			result.x = Sqrt(Float(Float4(x).x));
-			result.y = Sqrt(Float(Float4(x).y));
-			result.z = Sqrt(Float(Float4(x).z));
-			result.w = Sqrt(Float(Float4(x).w));
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto sqrt = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			sqrt->addArg(x.value);
-			::basicBlock->appendInst(sqrt);
-
-			return RValue<Float4>(V(result));
-		}
-	}
-
-	RValue<Int> SignMask(RValue<Float4> x)
-	{
-		if(emulateIntrinsics || CPUID::ARM)
-		{
-			Int4 xx = (As<Int4>(x) >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);
-			return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			movmsk->addArg(x.value);
-			::basicBlock->appendInst(movmsk);
-
-			return RValue<Int>(V(result));
-		}
-	}
-
-	RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpOEQ(x.value, y.value));
-	}
-
-	RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpOLT(x.value, y.value));
-	}
-
-	RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpOLE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpONE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpOGE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpOGT(x.value, y.value));
-	}
-
-	RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpUEQ(x.value, y.value));
-	}
-
-	RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpULT(x.value, y.value));
-	}
-
-	RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpULE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpUNE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpUGE(x.value, y.value));
-	}
-
-	RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
-	{
-		return RValue<Int4>(Nucleus::createFCmpUGT(x.value, y.value));
-	}
-
-	RValue<Float4> Round(RValue<Float4> x)
-	{
-		if(emulateIntrinsics || CPUID::ARM)
-		{
-			// Push the fractional part off the mantissa. Accurate up to +/-2^22.
-			return (x + Float4(0x00C00000)) - Float4(0x00C00000);
-		}
-		else if(CPUID::SSE4_1)
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			round->addArg(x.value);
-			round->addArg(::context->getConstantInt32(0));
-			::basicBlock->appendInst(round);
-
-			return RValue<Float4>(V(result));
-		}
-		else
-		{
-			return Float4(RoundInt(x));
-		}
-	}
-
-	RValue<Float4> Trunc(RValue<Float4> x)
-	{
-		if(CPUID::SSE4_1)
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			round->addArg(x.value);
-			round->addArg(::context->getConstantInt32(3));
-			::basicBlock->appendInst(round);
-
-			return RValue<Float4>(V(result));
-		}
-		else
-		{
-			return Float4(Int4(x));
-		}
-	}
-
-	RValue<Float4> Frac(RValue<Float4> x)
-	{
-		Float4 frc;
-
-		if(CPUID::SSE4_1)
-		{
-			frc = x - Floor(x);
-		}
-		else
-		{
-			frc = x - Float4(Int4(x));   // Signed fractional part.
-
-			frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1, 1, 1, 1)));   // Add 1.0 if negative.
-		}
-
-		// x - floor(x) can be 1.0 for very small negative x.
-		// Clamp against the value just below 1.0.
-		return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
-	}
-
-	RValue<Float4> Floor(RValue<Float4> x)
-	{
-		if(CPUID::SSE4_1)
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			round->addArg(x.value);
-			round->addArg(::context->getConstantInt32(1));
-			::basicBlock->appendInst(round);
-
-			return RValue<Float4>(V(result));
-		}
-		else
-		{
-			return x - Frac(x);
-		}
-	}
-
-	RValue<Float4> Ceil(RValue<Float4> x)
-	{
-		if(CPUID::SSE4_1)
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			round->addArg(x.value);
-			round->addArg(::context->getConstantInt32(2));
-			::basicBlock->appendInst(round);
-
-			return RValue<Float4>(V(result));
-		}
-		else
-		{
-			return -Floor(-x);
-		}
-	}
-
-	Type *Float4::getType()
-	{
-		return T(Ice::IceType_v4f32);
-	}
-
-	RValue<Long> Ticks()
-	{
-		UNIMPLEMENTED("RValue<Long> Ticks()");
-		return Long(Int(0));
-	}
-
-	RValue<Pointer<Byte>> ConstantPointer(void const * ptr)
-	{
-		if (sizeof(void*) == 8)
-		{
-			return RValue<Pointer<Byte>>(V(::context->getConstantInt64(reinterpret_cast<intptr_t>(ptr))));
-		}
-		else
-		{
-			return RValue<Pointer<Byte>>(V(::context->getConstantInt32(reinterpret_cast<intptr_t>(ptr))));
-		}
-	}
-
-	RValue<Pointer<Byte>> ConstantData(void const * data, size_t size)
-	{
-		// TODO: Try to use Ice::VariableDeclaration::DataInitializer and
-		// getConstantSym instead of tagging data on the routine.
-		return ConstantPointer(::routine->addConstantData(data, size));
-	}
-
-	Value* Call(RValue<Pointer<Byte>> fptr, Type* retTy, std::initializer_list<Value*> args, std::initializer_list<Type*> argTys)
-	{
-		Ice::Variable *ret = nullptr;
-		if (retTy != nullptr)
-		{
-			ret = ::function->makeVariable(T(retTy));
-		}
-		auto call = Ice::InstCall::create(::function, args.size(), ret, V(fptr.value), false);
-		for (auto arg : args)
-		{
-			call->addArg(V(arg));
-		}
-		::basicBlock->appendInst(call);
-		return V(ret);
-	}
-
-	void Breakpoint()
-	{
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Trap, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto trap = Ice::InstIntrinsicCall::create(::function, 0, nullptr, target, intrinsic);
-		::basicBlock->appendInst(trap);
-	}
-
-	void Nucleus::createFence(std::memory_order memoryOrder) { UNIMPLEMENTED("Subzero createFence()"); }
-	Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes) { UNIMPLEMENTED("Subzero createMaskedLoad()"); return nullptr; }
-	void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment) { UNIMPLEMENTED("Subzero createMaskedStore()"); }
-
-	RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-	{
-		return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
-	}
-
-	RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-	{
-		return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
-	}
-
-	void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-	{
-		return emulated::Scatter(base, val, offsets, mask, alignment);
-	}
-
-	void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-	{
-		return emulated::Scatter(base, val, offsets, mask, alignment);
-	}
-
-	RValue<Float> Exp2(RValue<Float> x)
-	{
-		return emulated::Exp2(x);
-	}
-
-	RValue<Float> Log2(RValue<Float> x)
-	{
-		return emulated::Log2(x);
-	}
-
-	RValue<Float4> Sin(RValue<Float4> x)
-	{
-		return emulated::Sin(x);
-	}
-
-	RValue<Float4> Cos(RValue<Float4> x)
-	{
-		return emulated::Cos(x);
-	}
-
-	RValue<Float4> Tan(RValue<Float4> x)
-	{
-		return emulated::Tan(x);
-	}
-
-	RValue<Float4> Asin(RValue<Float4> x)
-	{
-		return emulated::Asin(x);
-	}
-
-	RValue<Float4> Acos(RValue<Float4> x)
-	{
-		return emulated::Acos(x);
-	}
-
-	RValue<Float4> Atan(RValue<Float4> x)
-	{
-		return emulated::Atan(x);
-	}
-
-	RValue<Float4> Sinh(RValue<Float4> x)
-	{
-		return emulated::Sinh(x);
-	}
-
-	RValue<Float4> Cosh(RValue<Float4> x)
-	{
-		return emulated::Cosh(x);
-	}
-
-	RValue<Float4> Tanh(RValue<Float4> x)
-	{
-		return emulated::Tanh(x);
-	}
-
-	RValue<Float4> Asinh(RValue<Float4> x)
-	{
-		return emulated::Asinh(x);
-	}
-
-	RValue<Float4> Acosh(RValue<Float4> x)
-	{
-		return emulated::Acosh(x);
-	}
-
-	RValue<Float4> Atanh(RValue<Float4> x)
-	{
-		return emulated::Atanh(x);
-	}
-
-	RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
-	{
-		return emulated::Atan2(x, y);
-	}
-
-	RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
-	{
-		return emulated::Pow(x, y);
-	}
-
-	RValue<Float4> Exp(RValue<Float4> x)
-	{
-		return emulated::Exp(x);
-	}
-
-	RValue<Float4> Log(RValue<Float4> x)
-	{
-		return emulated::Log(x);
-	}
-
-	RValue<Float4> Exp2(RValue<Float4> x)
-	{
-		return emulated::Exp2(x);
-	}
-
-	RValue<Float4> Log2(RValue<Float4> x)
-	{
-		return emulated::Log2(x);
-	}
-
-	RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef)
-	{
-		if (emulateIntrinsics)
-		{
-			UNIMPLEMENTED("Subzero Ctlz()"); return UInt(0);
-		}
-		else
-		{
-			Ice::Variable* result = ::function->makeVariable(Ice::IceType_i32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Ctlz, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto ctlz = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			ctlz->addArg(x.value);
-			::basicBlock->appendInst(ctlz);
-
-			return RValue<UInt>(V(result));
-		}
-	}
-
-	RValue<UInt4> Ctlz(RValue<UInt4> x, bool isZeroUndef)
-	{
-		if (emulateIntrinsics)
-		{
-			UNIMPLEMENTED("Subzero Ctlz()"); return UInt4(0);
-		}
-		else
-		{
-			// TODO: implement vectorized version in Subzero
-			UInt4 result;
-			result = Insert(result, Ctlz(Extract(x, 0), isZeroUndef), 0);
-			result = Insert(result, Ctlz(Extract(x, 1), isZeroUndef), 1);
-			result = Insert(result, Ctlz(Extract(x, 2), isZeroUndef), 2);
-			result = Insert(result, Ctlz(Extract(x, 3), isZeroUndef), 3);
-			return result;
-		}
-	}
-
-	RValue<UInt> Cttz(RValue<UInt> x, bool isZeroUndef)
-	{
-		if (emulateIntrinsics)
-		{
-			UNIMPLEMENTED("Subzero Cttz()"); return UInt(0);
-		}
-		else
-		{
-			Ice::Variable* result = ::function->makeVariable(Ice::IceType_i32);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Cttz, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto ctlz = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-			ctlz->addArg(x.value);
-			::basicBlock->appendInst(ctlz);
-
-			return RValue<UInt>(V(result));
-		}
-	}
-
-	RValue<UInt4> Cttz(RValue<UInt4> x, bool isZeroUndef)
-	{
-		if (emulateIntrinsics)
-		{
-			UNIMPLEMENTED("Subzero Cttz()"); return UInt4(0);
-		}
-		else
-		{
-			// TODO: implement vectorized version in Subzero
-			UInt4 result;
-			result = Insert(result, Cttz(Extract(x, 0), isZeroUndef), 0);
-			result = Insert(result, Cttz(Extract(x, 1), isZeroUndef), 1);
-			result = Insert(result, Cttz(Extract(x, 2), isZeroUndef), 2);
-			result = Insert(result, Cttz(Extract(x, 3), isZeroUndef), 3);
-			return result;
-		}
-	}
-
-	void EmitDebugLocation() {}
-	void EmitDebugVariable(Value* value) {}
-	void FlushDebug() {}
-
-	void Nucleus::createCoroutine(Type *YieldType, std::vector<Type*> &Params)
-	{
-		// Subzero currently only supports coroutines as functions (i.e. that do not yield)
-		createFunction(YieldType, Params);
-	}
-
-	static bool coroutineEntryAwaitStub(Nucleus::CoroutineHandle, void* yieldValue) { return false; }
-	static void coroutineEntryDestroyStub(Nucleus::CoroutineHandle) {}
-
-	std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
-	{
-		// acquireRoutine sets the CoroutineEntryBegin entry
-		auto coroutineEntry = acquireRoutine(name, cfgEdit);
-
-		// For now, set the await and destroy entries to stubs, until we add proper coroutine support to the Subzero backend
-		auto routine = std::static_pointer_cast<ELFMemoryStreamer>(coroutineEntry);
-		routine->setEntry(Nucleus::CoroutineEntryAwait, reinterpret_cast<const void*>(&coroutineEntryAwaitStub));
-		routine->setEntry(Nucleus::CoroutineEntryDestroy, reinterpret_cast<const void*>(&coroutineEntryDestroyStub));
-
-		return coroutineEntry;
-	}
-
-	void Nucleus::yield(Value* val) { UNIMPLEMENTED("Yield"); }
-
 }
+
+RValue<Int> SignMask(RValue<Float4> x)
+{
+	if(emulateIntrinsics || CPUID::ARM)
+	{
+		Int4 xx = (As<Int4>(x) >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);
+		return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		movmsk->addArg(x.value);
+		::basicBlock->appendInst(movmsk);
+
+		return RValue<Int>(V(result));
+	}
+}
+
+RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpOEQ(x.value, y.value));
+}
+
+RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpOLT(x.value, y.value));
+}
+
+RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpOLE(x.value, y.value));
+}
+
+RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpONE(x.value, y.value));
+}
+
+RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpOGE(x.value, y.value));
+}
+
+RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpOGT(x.value, y.value));
+}
+
+RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpUEQ(x.value, y.value));
+}
+
+RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpULT(x.value, y.value));
+}
+
+RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpULE(x.value, y.value));
+}
+
+RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpUNE(x.value, y.value));
+}
+
+RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpUGE(x.value, y.value));
+}
+
+RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
+{
+	return RValue<Int4>(Nucleus::createFCmpUGT(x.value, y.value));
+}
+
+RValue<Float4> Round(RValue<Float4> x)
+{
+	if(emulateIntrinsics || CPUID::ARM)
+	{
+		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
+		return (x + Float4(0x00C00000)) - Float4(0x00C00000);
+	}
+	else if(CPUID::SSE4_1)
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		round->addArg(x.value);
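+		// The immediate selects the SSE4.1 rounding mode: 0 = nearest, 1 = down, 2 = up, 3 = truncate.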
+		round->addArg(::context->getConstantInt32(0));
+		::basicBlock->appendInst(round);
+
+		return RValue<Float4>(V(result));
+	}
+	else
+	{
+		return Float4(RoundInt(x));
+	}
+}
+
+RValue<Float4> Trunc(RValue<Float4> x)
+{
+	if(CPUID::SSE4_1)
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		round->addArg(x.value);
+		round->addArg(::context->getConstantInt32(3));
+		::basicBlock->appendInst(round);
+
+		return RValue<Float4>(V(result));
+	}
+	else
+	{
+		return Float4(Int4(x));
+	}
+}
+
+RValue<Float4> Frac(RValue<Float4> x)
+{
+	Float4 frc;
+
+	if(CPUID::SSE4_1)
+	{
+		frc = x - Floor(x);
+	}
+	else
+	{
+		frc = x - Float4(Int4(x));   // Signed fractional part.
+
+		frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1, 1, 1, 1)));   // Add 1.0 if negative.
+	}
+
+	// x - floor(x) can be 1.0 for very small negative x.
+	// Clamp against the value just below 1.0.
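+	// (0x3F7FFFFF is the largest float less than 1.0: 0.99999994f.)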
+	return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
+}
+
+RValue<Float4> Floor(RValue<Float4> x)
+{
+	if(CPUID::SSE4_1)
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		round->addArg(x.value);
+		round->addArg(::context->getConstantInt32(1));
+		::basicBlock->appendInst(round);
+
+		return RValue<Float4>(V(result));
+	}
+	else
+	{
+		return x - Frac(x);
+	}
+}
+
+RValue<Float4> Ceil(RValue<Float4> x)
+{
+	if(CPUID::SSE4_1)
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+		round->addArg(x.value);
+		round->addArg(::context->getConstantInt32(2));
+		::basicBlock->appendInst(round);
+
+		return RValue<Float4>(V(result));
+	}
+	else
+	{
+		return -Floor(-x);
+	}
+}
+
+Type *Float4::getType()
+{
+	return T(Ice::IceType_v4f32);
+}
+
+RValue<Long> Ticks()
+{
+	UNIMPLEMENTED("RValue<Long> Ticks()");
+	return Long(Int(0));
+}
+
+RValue<Pointer<Byte>> ConstantPointer(void const * ptr)
+{
+	if (sizeof(void*) == 8)
+	{
+		return RValue<Pointer<Byte>>(V(::context->getConstantInt64(reinterpret_cast<intptr_t>(ptr))));
+	}
+	else
+	{
+		return RValue<Pointer<Byte>>(V(::context->getConstantInt32(reinterpret_cast<intptr_t>(ptr))));
+	}
+}
+
+RValue<Pointer<Byte>> ConstantData(void const * data, size_t size)
+{
+	// TODO: Try to use Ice::VariableDeclaration::DataInitializer and
+	// getConstantSym instead of tagging data on the routine.
+	return ConstantPointer(::routine->addConstantData(data, size));
+}
+
+Value* Call(RValue<Pointer<Byte>> fptr, Type* retTy, std::initializer_list<Value*> args, std::initializer_list<Type*> argTys)
+{
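+	// Note: argTys is not used here; Ice operands carry their own types, so
+	// only the argument values need to be forwarded.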
+	Ice::Variable *ret = nullptr;
+	if (retTy != nullptr)
+	{
+		ret = ::function->makeVariable(T(retTy));
+	}
+	auto call = Ice::InstCall::create(::function, args.size(), ret, V(fptr.value), false);
+	for (auto arg : args)
+	{
+		call->addArg(V(arg));
+	}
+	::basicBlock->appendInst(call);
+	return V(ret);
+}
+
+void Breakpoint()
+{
+	const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Trap, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+	auto target = ::context->getConstantUndef(Ice::IceType_i32);
+	auto trap = Ice::InstIntrinsicCall::create(::function, 0, nullptr, target, intrinsic);
+	::basicBlock->appendInst(trap);
+}
+
+void Nucleus::createFence(std::memory_order memoryOrder) { UNIMPLEMENTED("Subzero createFence()"); }
+Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes) { UNIMPLEMENTED("Subzero createMaskedLoad()"); return nullptr; }
+void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment) { UNIMPLEMENTED("Subzero createMaskedStore()"); }
+
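+// Gather/scatter and the transcendental functions below are not lowered to
+// backend intrinsics here; they all delegate to the shared emulated:: helpers.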
+RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+{
+	return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
+}
+
+RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+{
+	return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
+}
+
+void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+{
+	return emulated::Scatter(base, val, offsets, mask, alignment);
+}
+
+void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+{
+	return emulated::Scatter(base, val, offsets, mask, alignment);
+}
+
+RValue<Float> Exp2(RValue<Float> x)
+{
+	return emulated::Exp2(x);
+}
+
+RValue<Float> Log2(RValue<Float> x)
+{
+	return emulated::Log2(x);
+}
+
+RValue<Float4> Sin(RValue<Float4> x)
+{
+	return emulated::Sin(x);
+}
+
+RValue<Float4> Cos(RValue<Float4> x)
+{
+	return emulated::Cos(x);
+}
+
+RValue<Float4> Tan(RValue<Float4> x)
+{
+	return emulated::Tan(x);
+}
+
+RValue<Float4> Asin(RValue<Float4> x)
+{
+	return emulated::Asin(x);
+}
+
+RValue<Float4> Acos(RValue<Float4> x)
+{
+	return emulated::Acos(x);
+}
+
+RValue<Float4> Atan(RValue<Float4> x)
+{
+	return emulated::Atan(x);
+}
+
+RValue<Float4> Sinh(RValue<Float4> x)
+{
+	return emulated::Sinh(x);
+}
+
+RValue<Float4> Cosh(RValue<Float4> x)
+{
+	return emulated::Cosh(x);
+}
+
+RValue<Float4> Tanh(RValue<Float4> x)
+{
+	return emulated::Tanh(x);
+}
+
+RValue<Float4> Asinh(RValue<Float4> x)
+{
+	return emulated::Asinh(x);
+}
+
+RValue<Float4> Acosh(RValue<Float4> x)
+{
+	return emulated::Acosh(x);
+}
+
+RValue<Float4> Atanh(RValue<Float4> x)
+{
+	return emulated::Atanh(x);
+}
+
+RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
+{
+	return emulated::Atan2(x, y);
+}
+
+RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
+{
+	return emulated::Pow(x, y);
+}
+
+RValue<Float4> Exp(RValue<Float4> x)
+{
+	return emulated::Exp(x);
+}
+
+RValue<Float4> Log(RValue<Float4> x)
+{
+	return emulated::Log(x);
+}
+
+RValue<Float4> Exp2(RValue<Float4> x)
+{
+	return emulated::Exp2(x);
+}
+
+RValue<Float4> Log2(RValue<Float4> x)
+{
+	return emulated::Log2(x);
+}
+
+RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef)
+{
+	if (emulateIntrinsics)
+	{
+		UNIMPLEMENTED("Subzero Ctlz()"); return UInt(0);
+	}
+	else
+	{
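+		// Note: isZeroUndef is accepted for interface parity but is not
+		// forwarded; the intrinsic call takes the input as its only argument.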
+		Ice::Variable* result = ::function->makeVariable(Ice::IceType_i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Ctlz, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto ctlz = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		ctlz->addArg(x.value);
+		::basicBlock->appendInst(ctlz);
+
+		return RValue<UInt>(V(result));
+	}
+}
+
+RValue<UInt4> Ctlz(RValue<UInt4> x, bool isZeroUndef)
+{
+	if (emulateIntrinsics)
+	{
+		UNIMPLEMENTED("Subzero Ctlz()"); return UInt4(0);
+	}
+	else
+	{
+		// TODO: implement vectorized version in Subzero
+		UInt4 result;
+		result = Insert(result, Ctlz(Extract(x, 0), isZeroUndef), 0);
+		result = Insert(result, Ctlz(Extract(x, 1), isZeroUndef), 1);
+		result = Insert(result, Ctlz(Extract(x, 2), isZeroUndef), 2);
+		result = Insert(result, Ctlz(Extract(x, 3), isZeroUndef), 3);
+		return result;
+	}
+}
+
+RValue<UInt> Cttz(RValue<UInt> x, bool isZeroUndef)
+{
+	if (emulateIntrinsics)
+	{
+		UNIMPLEMENTED("Subzero Cttz()"); return UInt(0);
+	}
+	else
+	{
+		Ice::Variable* result = ::function->makeVariable(Ice::IceType_i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Cttz, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto cttz = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		cttz->addArg(x.value);
+		::basicBlock->appendInst(cttz);
+
+		return RValue<UInt>(V(result));
+	}
+}
+
+RValue<UInt4> Cttz(RValue<UInt4> x, bool isZeroUndef)
+{
+	if (emulateIntrinsics)
+	{
+		UNIMPLEMENTED("Subzero Cttz()"); return UInt4(0);
+	}
+	else
+	{
+		// TODO: implement vectorized version in Subzero
+		UInt4 result;
+		result = Insert(result, Cttz(Extract(x, 0), isZeroUndef), 0);
+		result = Insert(result, Cttz(Extract(x, 1), isZeroUndef), 1);
+		result = Insert(result, Cttz(Extract(x, 2), isZeroUndef), 2);
+		result = Insert(result, Cttz(Extract(x, 3), isZeroUndef), 3);
+		return result;
+	}
+}
+
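+// Debug-info hooks are no-ops in this backend.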
+void EmitDebugLocation() {}
+void EmitDebugVariable(Value* value) {}
+void FlushDebug() {}
+
+void Nucleus::createCoroutine(Type *YieldType, std::vector<Type*> &Params)
+{
+	// Subzero currently only supports coroutines as functions (i.e. that do not yield)
+	createFunction(YieldType, Params);
+}
+
+static bool coroutineEntryAwaitStub(Nucleus::CoroutineHandle, void* yieldValue) { return false; }
+static void coroutineEntryDestroyStub(Nucleus::CoroutineHandle) {}
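+// (The await stub always reports that no value was yielded, and destroy is a
+// no-op; this matches a coroutine that runs to completion on its first call.)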
+
+std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
+{
+	// acquireRoutine sets the CoroutineEntryBegin entry
+	auto coroutineEntry = acquireRoutine(name, cfgEdit);
+
+	// For now, set the await and destroy entries to stubs, until we add proper coroutine support to the Subzero backend
+	auto routine = std::static_pointer_cast<ELFMemoryStreamer>(coroutineEntry);
+	routine->setEntry(Nucleus::CoroutineEntryAwait, reinterpret_cast<const void*>(&coroutineEntryAwaitStub));
+	routine->setEntry(Nucleus::CoroutineEntryDestroy, reinterpret_cast<const void*>(&coroutineEntryDestroyStub));
+
+	return coroutineEntry;
+}
+
+void Nucleus::yield(Value* val) { UNIMPLEMENTED("Yield"); }
+
+}  // namespace rr
diff --git a/src/Reactor/Thread.cpp b/src/Reactor/Thread.cpp
index 7a9b07d..46f3550 100644
--- a/src/Reactor/Thread.cpp
+++ b/src/Reactor/Thread.cpp
@@ -14,78 +14,79 @@
 
 #include "Thread.hpp"
 
-namespace rr
+namespace rr {
+
+Thread::Thread(void (*threadFunction)(void *parameters), void *parameters)
 {
-	Thread::Thread(void (*threadFunction)(void *parameters), void *parameters)
-	{
-		Event init;
-		Entry entry = {threadFunction, parameters, &init};
-
-		#if defined(_WIN32)
-			handle = CreateThread(NULL, 1024 * 1024, startFunction, &entry, 0, NULL);
-		#else
-			pthread_create(&handle, NULL, startFunction, &entry);
-		#endif
-
-		init.wait();
-	}
-
-	Thread::~Thread()
-	{
-		join();   // Make threads exit before deleting them to not block here
-	}
-
-	void Thread::join()
-	{
-		if(!hasJoined)
-		{
-			#if defined(_WIN32)
-				WaitForSingleObject(handle, INFINITE);
-				CloseHandle(handle);
-			#else
-				pthread_join(handle, NULL);
-			#endif
-
-			hasJoined = true;
-		}
-	}
+	Event init;
+	Entry entry = {threadFunction, parameters, &init};
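+	// entry lives on this constructor's stack; startFunction copies it and
+	// then signals init, so the wait below keeps it alive long enough.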
 
 	#if defined(_WIN32)
-		unsigned long __stdcall Thread::startFunction(void *parameters)
-		{
-			Entry entry = *(Entry*)parameters;
-			entry.init->signal();
-			entry.threadFunction(entry.threadParameters);
-			return 0;
-		}
+		handle = CreateThread(NULL, 1024 * 1024, startFunction, &entry, 0, NULL);
 	#else
-		void *Thread::startFunction(void *parameters)
-		{
-			Entry entry = *(Entry*)parameters;
-			entry.init->signal();
-			entry.threadFunction(entry.threadParameters);
-			return nullptr;
-		}
+		pthread_create(&handle, NULL, startFunction, &entry);
 	#endif
 
-	Event::Event()
-	{
-		#if defined(_WIN32)
-			handle = CreateEvent(NULL, FALSE, FALSE, NULL);
-		#else
-			pthread_cond_init(&handle, NULL);
-			pthread_mutex_init(&mutex, NULL);
-			signaled = false;
-		#endif
-	}
+	init.wait();
+}
 
-	Event::~Event()
+Thread::~Thread()
+{
+	join();   // Note: make the thread function return before deleting the Thread, or this will block
+}
+
+void Thread::join()
+{
+	if(!hasJoined)
 	{
 		#if defined(_WIN32)
+			WaitForSingleObject(handle, INFINITE);
 			CloseHandle(handle);
 		#else
-			pthread_cond_destroy(&handle);
-			pthread_mutex_destroy(&mutex);
+			pthread_join(handle, NULL);
 		#endif
+
+		hasJoined = true;
 	}
 }
+
+#if defined(_WIN32)
+	unsigned long __stdcall Thread::startFunction(void *parameters)
+	{
+		Entry entry = *(Entry*)parameters;
+		entry.init->signal();
+		entry.threadFunction(entry.threadParameters);
+		return 0;
+	}
+#else
+	void *Thread::startFunction(void *parameters)
+	{
+		Entry entry = *(Entry*)parameters;
+		entry.init->signal();
+		entry.threadFunction(entry.threadParameters);
+		return nullptr;
+	}
+#endif
+
+Event::Event()
+{
+	#if defined(_WIN32)
+		handle = CreateEvent(NULL, FALSE, FALSE, NULL);
+	#else
+		pthread_cond_init(&handle, NULL);
+		pthread_mutex_init(&mutex, NULL);
+		signaled = false;
+	#endif
+}
+
+Event::~Event()
+{
+	#if defined(_WIN32)
+		CloseHandle(handle);
+	#else
+		pthread_cond_destroy(&handle);
+		pthread_mutex_destroy(&mutex);
+	#endif
+}
+
+}  // namespace rr
diff --git a/src/Reactor/Thread.hpp b/src/Reactor/Thread.hpp
index ed3bb55..7feb61e 100644
--- a/src/Reactor/Thread.hpp
+++ b/src/Reactor/Thread.hpp
@@ -43,296 +43,300 @@
 #include <atomic>
 #endif
 
-namespace rr
+namespace rr {
+
+class Event;
+
+class Thread
 {
-	class Event;
+public:
+	Thread(void (*threadFunction)(void *parameters), void *parameters);
 
-	class Thread
-	{
-	public:
-		Thread(void (*threadFunction)(void *parameters), void *parameters);
+	~Thread();
 
-		~Thread();
+	void join();
 
-		void join();
+	static void yield();
+	static void sleep(int milliseconds);
 
-		static void yield();
-		static void sleep(int milliseconds);
-
-		#if defined(_WIN32)
-			typedef DWORD LocalStorageKey;
-		#else
-			typedef pthread_key_t LocalStorageKey;
-		#endif
-
-		static LocalStorageKey allocateLocalStorageKey(void (*destructor)(void *storage) = free);
-		static void freeLocalStorageKey(LocalStorageKey key);
-		static void *allocateLocalStorage(LocalStorageKey key, size_t size);
-		static void *getLocalStorage(LocalStorageKey key);
-		static void freeLocalStorage(LocalStorageKey key);
-
-	private:
-		struct Entry
-		{
-			void (*const threadFunction)(void *parameters);
-			void *threadParameters;
-			Event *init;
-		};
-
-		#if defined(_WIN32)
-			static unsigned long __stdcall startFunction(void *parameters);
-			HANDLE handle;
-		#else
-			static void *startFunction(void *parameters);
-			pthread_t handle;
-		#endif
-
-		bool hasJoined = false;
-	};
-
-	class Event
-	{
-		friend class Thread;
-
-	public:
-		Event();
-
-		~Event();
-
-		void signal();
-		void wait();
-
-	private:
-		#if defined(_WIN32)
-			HANDLE handle;
-		#else
-			pthread_cond_t handle;
-			pthread_mutex_t mutex;
-			volatile bool signaled;
-		#endif
-	};
-
-	#if PERF_PROFILE
-	int64_t atomicExchange(int64_t volatile *target, int64_t value);
-	int atomicExchange(int volatile *target, int value);
+	#if defined(_WIN32)
+		typedef DWORD LocalStorageKey;
+	#else
+		typedef pthread_key_t LocalStorageKey;
 	#endif
 
-	int atomicIncrement(int volatile *value);
-	int atomicDecrement(int volatile *value);
-	int atomicAdd(int volatile *target, int value);
-	void nop();
+	static LocalStorageKey allocateLocalStorageKey(void (*destructor)(void *storage) = free);
+	static void freeLocalStorageKey(LocalStorageKey key);
+	static void *allocateLocalStorage(LocalStorageKey key, size_t size);
+	static void *getLocalStorage(LocalStorageKey key);
+	static void freeLocalStorage(LocalStorageKey key);
+
+private:
+	struct Entry
+	{
+		void (*const threadFunction)(void *parameters);
+		void *threadParameters;
+		Event *init;
+	};
+
+	#if defined(_WIN32)
+		static unsigned long __stdcall startFunction(void *parameters);
+		HANDLE handle;
+	#else
+		static void *startFunction(void *parameters);
+		pthread_t handle;
+	#endif
+
+	bool hasJoined = false;
+};
+
+class Event
+{
+	friend class Thread;
+
+public:
+	Event();
+
+	~Event();
+
+	void signal();
+	void wait();
+
+private:
+	#if defined(_WIN32)
+		HANDLE handle;
+	#else
+		pthread_cond_t handle;
+		pthread_mutex_t mutex;
+		volatile bool signaled;
+	#endif
+};
+
+#if PERF_PROFILE
+int64_t atomicExchange(int64_t volatile *target, int64_t value);
+int atomicExchange(int volatile *target, int value);
+#endif
+
+int atomicIncrement(int volatile *value);
+int atomicDecrement(int volatile *value);
+int atomicAdd(int volatile *target, int value);
+void nop();
+
+}  // namespace rr
+
+/* Inline implementation */
+
+namespace rr {
+
+inline void Thread::yield()
+{
+	#if defined(_WIN32)
+		Sleep(0);
+	#elif defined(__APPLE__)
+		pthread_yield_np();
+	#else
+		sched_yield();
+	#endif
 }
 
-namespace rr
+inline void Thread::sleep(int milliseconds)
 {
-	inline void Thread::yield()
+	#if defined(_WIN32)
+		Sleep(milliseconds);
+	#else
+		usleep(1000 * milliseconds);
+	#endif
+}
+
+inline Thread::LocalStorageKey Thread::allocateLocalStorageKey(void (*destructor)(void *storage))
+{
+	#if defined(_WIN32)
+		return TlsAlloc();
+	#else
+		LocalStorageKey key;
+		pthread_key_create(&key, destructor);
+		return key;
+	#endif
+}
+
+inline void Thread::freeLocalStorageKey(LocalStorageKey key)
+{
+	#if defined(_WIN32)
+		TlsFree(key);
+	#else
+		pthread_key_delete(key);   // Using an invalid key is an error but not undefined behavior.
+	#endif
+}
+
+inline void *Thread::allocateLocalStorage(LocalStorageKey key, size_t size)
+{
+	if(key == TLS_OUT_OF_INDEXES)
 	{
-		#if defined(_WIN32)
-			Sleep(0);
-		#elif defined(__APPLE__)
-			pthread_yield_np();
-		#else
-			sched_yield();
-		#endif
+		return nullptr;
 	}
 
-	inline void Thread::sleep(int milliseconds)
-	{
-		#if defined(_WIN32)
-			Sleep(milliseconds);
-		#else
-			usleep(1000 * milliseconds);
-		#endif
-	}
+	freeLocalStorage(key);
 
-	inline Thread::LocalStorageKey Thread::allocateLocalStorageKey(void (*destructor)(void *storage))
-	{
-		#if defined(_WIN32)
-			return TlsAlloc();
-		#else
-			LocalStorageKey key;
-			pthread_key_create(&key, destructor);
-			return key;
-		#endif
-	}
+	void *storage = malloc(size);
 
-	inline void Thread::freeLocalStorageKey(LocalStorageKey key)
-	{
-		#if defined(_WIN32)
-			TlsFree(key);
-		#else
-			pthread_key_delete(key);   // Using an invalid key is an error but not undefined behavior.
-		#endif
-	}
+	#if defined(_WIN32)
+		TlsSetValue(key, storage);
+	#else
+		pthread_setspecific(key, storage);
+	#endif
 
-	inline void *Thread::allocateLocalStorage(LocalStorageKey key, size_t size)
-	{
-		if(key == TLS_OUT_OF_INDEXES)
+	return storage;
+}
+
+inline void *Thread::getLocalStorage(LocalStorageKey key)
+{
+	#if defined(_WIN32)
+		return TlsGetValue(key);
+	#else
+		if(key == TLS_OUT_OF_INDEXES)   // Avoid undefined behavior.
 		{
 			return nullptr;
 		}
 
-		freeLocalStorage(key);
-
-		void *storage = malloc(size);
-
-		#if defined(_WIN32)
-			TlsSetValue(key, storage);
-		#else
-			pthread_setspecific(key, storage);
-		#endif
-
-		return storage;
-	}
-
-	inline void *Thread::getLocalStorage(LocalStorageKey key)
-	{
-		#if defined(_WIN32)
-			return TlsGetValue(key);
-		#else
-			if(key == TLS_OUT_OF_INDEXES)   // Avoid undefined behavior.
-			{
-				return nullptr;
-			}
-
-			return pthread_getspecific(key);
-		#endif
-	}
-
-	inline void Thread::freeLocalStorage(LocalStorageKey key)
-	{
-		free(getLocalStorage(key));
-
-		#if defined(_WIN32)
-			TlsSetValue(key, nullptr);
-		#else
-			pthread_setspecific(key, nullptr);
-		#endif
-	}
-
-	inline void Event::signal()
-	{
-		#if defined(_WIN32)
-			SetEvent(handle);
-		#else
-			pthread_mutex_lock(&mutex);
-			signaled = true;
-			pthread_cond_signal(&handle);
-			pthread_mutex_unlock(&mutex);
-		#endif
-	}
-
-	inline void Event::wait()
-	{
-		#if defined(_WIN32)
-			WaitForSingleObject(handle, INFINITE);
-		#else
-			pthread_mutex_lock(&mutex);
-			while(!signaled) pthread_cond_wait(&handle, &mutex);
-			signaled = false;
-			pthread_mutex_unlock(&mutex);
-		#endif
-	}
-
-	#if PERF_PROFILE
-	inline int64_t atomicExchange(volatile int64_t *target, int64_t value)
-	{
-		#if defined(_WIN32)
-			return InterlockedExchange64(target, value);
-		#else
-			int ret;
-			__asm__ __volatile__("lock; xchg8 %x0,(%x1)" : "=r" (ret) :"r" (target), "0" (value) : "memory" );
-			return ret;
-		#endif
-	}
-
-	inline int atomicExchange(volatile int *target, int value)
-	{
-		#if defined(_WIN32)
-			return InterlockedExchange((volatile long*)target, (long)value);
-		#else
-			int ret;
-			__asm__ __volatile__("lock; xchgl %x0,(%x1)" : "=r" (ret) :"r" (target), "0" (value) : "memory" );
-			return ret;
-		#endif
-	}
-	#endif
-
-	inline int atomicIncrement(volatile int *value)
-	{
-		#if defined(_WIN32)
-			return InterlockedIncrement((volatile long*)value);
-		#else
-			return __sync_add_and_fetch(value, 1);
-		#endif
-	}
-
-	inline int atomicDecrement(volatile int *value)
-	{
-		#if defined(_WIN32)
-			return InterlockedDecrement((volatile long*)value);
-		#else
-			return __sync_sub_and_fetch(value, 1);
-		#endif
-	}
-
-	inline int atomicAdd(volatile int* target, int value)
-	{
-		#if defined(_WIN32)
-			return InterlockedExchangeAdd((volatile long*)target, value) + value;
-		#else
-			return __sync_add_and_fetch(target, value);
-		#endif
-	}
-
-	inline void nop()
-	{
-		#if defined(_WIN32)
-			__nop();
-		#else
-			__asm__ __volatile__ ("nop");
-		#endif
-	}
-
-	#if USE_STD_ATOMIC
-		class AtomicInt
-		{
-		public:
-			AtomicInt() : ai() {}
-			AtomicInt(int i) : ai(i) {}
-
-			inline operator int() const { return ai.load(std::memory_order_acquire); }
-			inline void operator=(const AtomicInt& i) { ai.store(i.ai.load(std::memory_order_acquire), std::memory_order_release); }
-			inline void operator=(int i) { ai.store(i, std::memory_order_release); }
-			inline void operator--() { ai.fetch_sub(1, std::memory_order_acq_rel); }
-			inline void operator++() { ai.fetch_add(1, std::memory_order_acq_rel); }
-			inline int operator--(int) { return ai.fetch_sub(1, std::memory_order_acq_rel) - 1; }
-			inline int operator++(int) { return ai.fetch_add(1, std::memory_order_acq_rel) + 1; }
-			inline void operator-=(int i) { ai.fetch_sub(i, std::memory_order_acq_rel); }
-			inline void operator+=(int i) { ai.fetch_add(i, std::memory_order_acq_rel); }
-		private:
-			std::atomic<int> ai;
-		};
-	#else
-		class AtomicInt
-		{
-		public:
-			AtomicInt() {}
-			AtomicInt(int i) : vi(i) {}
-
-			inline operator int() const { return vi; } // Note: this isn't a guaranteed atomic operation
-			inline void operator=(const AtomicInt& i) { atomicExchange(&vi, i.vi); }
-			inline void operator=(int i) { atomicExchange(&vi, i); }
-			inline void operator--() { atomicDecrement(&vi); }
-			inline void operator++() { atomicIncrement(&vi); }
-			inline int operator--(int) { return atomicDecrement(&vi); }
-			inline int operator++(int) { return atomicIncrement(&vi); }
-			inline void operator-=(int i) { atomicAdd(&vi, -i); }
-			inline void operator+=(int i) { atomicAdd(&vi, i); }
-		private:
-			volatile int vi;
-		};
+		return pthread_getspecific(key);
 	#endif
 }
 
+inline void Thread::freeLocalStorage(LocalStorageKey key)
+{
+	free(getLocalStorage(key));
+
+	#if defined(_WIN32)
+		TlsSetValue(key, nullptr);
+	#else
+		pthread_setspecific(key, nullptr);
+	#endif
+}
+
+inline void Event::signal()
+{
+	#if defined(_WIN32)
+		SetEvent(handle);
+	#else
+		pthread_mutex_lock(&mutex);
+		signaled = true;
+		pthread_cond_signal(&handle);
+		pthread_mutex_unlock(&mutex);
+	#endif
+}
+
+inline void Event::wait()
+{
+	#if defined(_WIN32)
+		WaitForSingleObject(handle, INFINITE);
+	#else
+		pthread_mutex_lock(&mutex);
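+		// Loop to guard against spurious wakeups.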
+		while(!signaled) pthread_cond_wait(&handle, &mutex);
+		signaled = false;
+		pthread_mutex_unlock(&mutex);
+	#endif
+}
+
+#if PERF_PROFILE
+inline int64_t atomicExchange(volatile int64_t *target, int64_t value)
+{
+	#if defined(_WIN32)
+		return InterlockedExchange64(target, value);
+	#else
+		// xchg with a memory operand is implicitly locked.
+		int64_t ret = value;
+		__asm__ __volatile__("xchgq %0, %1" : "+r" (ret), "+m" (*target) : : "memory");
+		return ret;
+	#endif
+}
+
+inline int atomicExchange(volatile int *target, int value)
+{
+	#if defined(_WIN32)
+		return InterlockedExchange((volatile long*)target, (long)value);
+	#else
+		int ret = value;
+		__asm__ __volatile__("xchgl %0, %1" : "+r" (ret), "+m" (*target) : : "memory");
+		return ret;
+	#endif
+}
+#endif
+
+inline int atomicIncrement(volatile int *value)
+{
+	#if defined(_WIN32)
+		return InterlockedIncrement((volatile long*)value);
+	#else
+		return __sync_add_and_fetch(value, 1);
+	#endif
+}
+
+inline int atomicDecrement(volatile int *value)
+{
+	#if defined(_WIN32)
+		return InterlockedDecrement((volatile long*)value);
+	#else
+		return __sync_sub_and_fetch(value, 1);
+	#endif
+}
+
+inline int atomicAdd(volatile int* target, int value)
+{
+	#if defined(_WIN32)
+		return InterlockedExchangeAdd((volatile long*)target, value) + value;
+	#else
+		return __sync_add_and_fetch(target, value);
+	#endif
+}
+
+inline void nop()
+{
+	#if defined(_WIN32)
+		__nop();
+	#else
+		__asm__ __volatile__ ("nop");
+	#endif
+}
+
+#if USE_STD_ATOMIC
+	class AtomicInt
+	{
+	public:
+		AtomicInt() : ai() {}
+		AtomicInt(int i) : ai(i) {}
+
+		inline operator int() const { return ai.load(std::memory_order_acquire); }
+		inline void operator=(const AtomicInt& i) { ai.store(i.ai.load(std::memory_order_acquire), std::memory_order_release); }
+		inline void operator=(int i) { ai.store(i, std::memory_order_release); }
+		inline void operator--() { ai.fetch_sub(1, std::memory_order_acq_rel); }
+		inline void operator++() { ai.fetch_add(1, std::memory_order_acq_rel); }
+		inline int operator--(int) { return ai.fetch_sub(1, std::memory_order_acq_rel) - 1; }
+		inline int operator++(int) { return ai.fetch_add(1, std::memory_order_acq_rel) + 1; }
+		inline void operator-=(int i) { ai.fetch_sub(i, std::memory_order_acq_rel); }
+		inline void operator+=(int i) { ai.fetch_add(i, std::memory_order_acq_rel); }
+	private:
+		std::atomic<int> ai;
+	};
+#else
+	class AtomicInt
+	{
+	public:
+		AtomicInt() {}
+		AtomicInt(int i) : vi(i) {}
+
+		inline operator int() const { return vi; } // Note: this isn't a guaranteed atomic operation
+		inline void operator=(const AtomicInt& i) { atomicExchange(&vi, i.vi); }
+		inline void operator=(int i) { atomicExchange(&vi, i); }
+		inline void operator--() { atomicDecrement(&vi); }
+		inline void operator++() { atomicIncrement(&vi); }
+		inline int operator--(int) { return atomicDecrement(&vi); }
+		inline int operator++(int) { return atomicIncrement(&vi); }
+		inline void operator-=(int i) { atomicAdd(&vi, -i); }
+		inline void operator+=(int i) { atomicAdd(&vi, i); }
+	private:
+		volatile int vi;
+	};
+#endif
+
+}  // namespace rr
+
 #endif   // rr_Thread_hpp
diff --git a/src/Reactor/Traits.hpp b/src/Reactor/Traits.hpp
index d26abc7..53f36f0 100644
--- a/src/Reactor/Traits.hpp
+++ b/src/Reactor/Traits.hpp
@@ -22,216 +22,216 @@
 #undef Bool // b/127920555
 #endif
 
-namespace rr
+namespace rr {
+
+// Forward declarations
+class Value;
+
+class Void;
+class Bool;
+class Byte;
+class SByte;
+class Short;
+class UShort;
+class Int;
+class UInt;
+class Long;
+class Half;
+class Float;
+class Float4;
+
+template<class T> class Pointer;
+template<class T> class LValue;
+template<class T> class RValue;
+
+// enable_if_t is identical to C++14's std::enable_if_t.
+// std::enable_if_t was introduced in C++14, but Reactor must support
+// C++11.
+template<bool Condition, class TrueType = void>
+using enable_if_t = typename std::enable_if<Condition, TrueType>::type;
+
+// IsDefined<T>::value is true if T is a valid type, otherwise false.
+template <typename T, typename Enable = void>
+struct IsDefined
 {
-	// Forward declarations
-	class Value;
+	static constexpr bool value = false;
+};
 
-	class Void;
-	class Bool;
-	class Byte;
-	class SByte;
-	class Short;
-	class UShort;
-	class Int;
-	class UInt;
-	class Long;
-	class Half;
-	class Float;
-	class Float4;
+template <typename T>
+struct IsDefined<T, enable_if_t<(sizeof(T)>0)> >
+{
+	static constexpr bool value = true;
+};
 
-	template<class T> class Pointer;
-	template<class T> class LValue;
-	template<class T> class RValue;
+template <>
+struct IsDefined<void>
+{
+	static constexpr bool value = true;
+};
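+
+// For example, IsDefined<int>::value is true, while it is false for a
+// forward-declared (incomplete) class, because sizeof(T) fails substitution.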
 
-	// enabled_if_t is identical to C++14's std::enable_if_t.
-	// std::enable_if_t was introduced in C++14, but Reactor must support
-	// C++11.
-	template<bool Condition, class TrueType = void>
-	using enable_if_t = typename std::enable_if<Condition, TrueType>::type;
+// CToReactorT<T> resolves to the corresponding Reactor type for the given C
+// template type T.
+template<typename T, typename ENABLE = void> struct CToReactor;
+template<typename T> using CToReactorT = typename CToReactor<T>::type;
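+// For example, CToReactorT<int32_t> is Int, and CToReactorT<float*> is
+// Pointer<Float> via the pointer specialization below.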
 
-	// IsDefined<T>::value is true if T is a valid type, otherwise false.
-	template <typename T, typename Enable = void>
-	struct IsDefined
-	{
-		static constexpr bool value = false;
-	};
+// CToReactor specializations for POD types.
+template<> struct CToReactor<void>    	{ using type = Void; };
+template<> struct CToReactor<bool>    	{ using type = Bool;   static Bool   cast(bool);     };
+template<> struct CToReactor<uint8_t> 	{ using type = Byte;   static Byte   cast(uint8_t);  };
+template<> struct CToReactor<int8_t>  	{ using type = SByte;  static SByte  cast(int8_t);   };
+template<> struct CToReactor<int16_t> 	{ using type = Short;  static Short  cast(int16_t);  };
+template<> struct CToReactor<uint16_t>	{ using type = UShort; static UShort cast(uint16_t); };
+template<> struct CToReactor<int32_t> 	{ using type = Int;    static Int    cast(int32_t);  };
+template<> struct CToReactor<uint32_t>	{ using type = UInt;   static UInt   cast(uint32_t); };
+template<> struct CToReactor<float>   	{ using type = Float;  static Float  cast(float);    };
+template<> struct CToReactor<float[4]>	{ using type = Float4; static Float4 cast(float[4]); };
 
-	template <typename T>
-	struct IsDefined<T, enable_if_t<(sizeof(T)>0)> >
-	{
-		static constexpr bool value = true;
-	};
+// TODO: Long has no constructor that takes a uint64_t
+template<> struct CToReactor<uint64_t>	{ using type = Long;  /* static Long   cast(uint64_t); */ };
 
-	template <>
-	struct IsDefined<void>
-	{
-		static constexpr bool value = true;
-	};
+// HasReactorType<T>::value resolves to true iff there exists a
+// CToReactorT specialization for type T.
+template<typename T>
+using HasReactorType = IsDefined< CToReactorT<T> >;
 
-	// CToReactorT<T> resolves to the corresponding Reactor type for the given C
-	// template type T.
-	template<typename T, typename ENABLE = void> struct CToReactor;
-	template<typename T> using CToReactorT = typename CToReactor<T>::type;
+// CToReactorPtr<T>::type resolves to the corresponding Reactor Pointer<>
+// type for T*.
+// For T types that have a CToReactorT<> specialization,
+// CToReactorPtr<T>::type resolves to Pointer< CToReactorT<T> >, otherwise
+// CToReactorPtr<T>::type resolves to Pointer<Byte>.
+template<typename T, typename ENABLE = void> struct CToReactorPtr
+{
+	using type = Pointer<Byte>;
+	static inline type cast(const T* v); // implemented in Traits.inl
+};
 
-	// CToReactor specializations for POD types.
-	template<> struct CToReactor<void>    	{ using type = Void; };
-	template<> struct CToReactor<bool>    	{ using type = Bool;   static Bool   cast(bool);     };
-	template<> struct CToReactor<uint8_t> 	{ using type = Byte;   static Byte   cast(uint8_t);  };
-	template<> struct CToReactor<int8_t>  	{ using type = SByte;  static SByte  cast(int8_t);   };
-	template<> struct CToReactor<int16_t> 	{ using type = Short;  static Short  cast(int16_t);  };
-	template<> struct CToReactor<uint16_t>	{ using type = UShort; static UShort cast(uint16_t); };
-	template<> struct CToReactor<int32_t> 	{ using type = Int;    static Int    cast(int32_t);  };
-	template<> struct CToReactor<uint32_t>	{ using type = UInt;   static UInt   cast(uint32_t); };
-	template<> struct CToReactor<float>   	{ using type = Float;  static Float  cast(float);    };
-	template<> struct CToReactor<float[4]>	{ using type = Float4; static Float4 cast(float[4]); };
+// CToReactorPtr specialization for T types that have a CToReactorT<>
+// specialization.
+template<typename T> struct CToReactorPtr<T, enable_if_t< HasReactorType<T>::value > >
+{
+	using type = Pointer< CToReactorT<T> >;
+	static inline type cast(const T* v); // implemented in Traits.inl
+};
 
-	// TODO: Long has no constructor that takes a uint64_t
-	template<> struct CToReactor<uint64_t>	{ using type = Long;  /* static Long   cast(uint64_t); */ };
+// CToReactorPtr specialization for void*.
+// Maps to Pointer<Byte> instead of Pointer<Void>.
+template<> struct CToReactorPtr<void, void>
+{
+	using type = Pointer<Byte>;
+	static inline type cast(const void* v); // implemented in Traits.inl
+};
 
-	// HasReactorType<T>::value resolves to true iff there exists a
-	// CToReactorT specialization for type T.
-	template<typename T>
-	using HasReactorType = IsDefined< CToReactorT<T> >;
+// CToReactorPtr specialization for function pointer types.
+// Maps to Pointer<Byte>.
+// Drops the 'const' qualifier from the cast() method to avoid warnings
+// about const having no meaning for function types.
+template<typename T> struct CToReactorPtr<T, enable_if_t< std::is_function<T>::value > >
+{
+	using type = Pointer<Byte>;
+	static inline type cast(T* v); // implemented in Traits.inl
+};
 
-	// CToReactorPtr<T>::type resolves to the corresponding Reactor Pointer<>
-	// type for T*.
-	// For T types that have a CToReactorT<> specialization,
-	// CToReactorPtr<T>::type resolves to Pointer< CToReactorT<T> >, otherwise
-	// CToReactorPtr<T>::type resolves to Pointer<Byte>.
-	template<typename T, typename ENABLE = void> struct CToReactorPtr
-	{
-		using type = Pointer<Byte>;
-		static inline type cast(const T* v); // implemented in Traits.inl
-	};
+template<typename T> using CToReactorPtrT = typename CToReactorPtr<T>::type;
 
-	// CToReactorPtr specialization for T types that have a CToReactorT<>
-	// specialization.
-	template<typename T> struct CToReactorPtr<T, enable_if_t< HasReactorType<T>::value > >
-	{
-		using type = Pointer< CToReactorT<T> >;
-		static inline type cast(const T* v); // implemented in Traits.inl
-	};
+// CToReactor specialization for pointer types.
+// For T types that have a CToReactorT<> specialization,
+// CToReactorT<T*>::type resolves to Pointer< CToReactorT<T> >, otherwise
+// CToReactorT<T*>::type resolves to Pointer<Byte>.
+template<typename T>
+struct CToReactor<T, enable_if_t<std::is_pointer<T>::value> >
+{
+	using elem = typename std::remove_pointer<T>::type;
+	using type = CToReactorPtrT<elem>;
+	static inline type cast(T v); // implemented in Traits.inl
+};
 
-	// CToReactorPtr specialization for void*.
-	// Maps to Pointer<Byte> instead of Pointer<Void>.
-	template<> struct CToReactorPtr<void, void>
-	{
-		using type = Pointer<Byte>;
-		static inline type cast(const void* v); // implemented in Traits.inl
-	};
+// CToReactor specialization for enum types.
+template<typename T>
+struct CToReactor<T, enable_if_t<std::is_enum<T>::value> >
+{
+	using underlying = typename std::underlying_type<T>::type;
+	using type = CToReactorT<underlying>;
+	static type cast(T v); // implemented in Traits.inl
+};
 
-	// CToReactorPtr specialization for function pointer types.
-	// Maps to Pointer<Byte>.
-	// Drops the 'const' qualifier from the cast() method to avoid warnings
-	// about const having no meaning for function types.
-	template<typename T> struct CToReactorPtr<T, enable_if_t< std::is_function<T>::value > >
-	{
-		using type = Pointer<Byte>;
-		static inline type cast(T* v); // implemented in Traits.inl
-	};
+// IsRValue::value is true if T is of type RValue<X>, where X is any type.
+template <typename T, typename Enable = void> struct IsRValue { static constexpr bool value = false; };
+template <typename T> struct IsRValue<T, enable_if_t<IsDefined<typename T::rvalue_underlying_type>::value> > { static constexpr bool value = true; };
 
-	template<typename T> using CToReactorPtrT = typename CToReactorPtr<T>::type;
+// IsLValue::value is true if T is of, or derives from type LValue<T>.
+template <typename T> struct IsLValue { static constexpr bool value = std::is_base_of<LValue<T>, T>::value; };
 
-	// CToReactor specialization for pointer types.
-	// For T types that have a CToReactorT<> specialization,
-	// CToReactorT<T*>::type resolves to Pointer< CToReactorT<T> >, otherwise
-	// CToReactorT<T*>::type resolves to Pointer<Byte>.
-	template<typename T>
-	struct CToReactor<T, enable_if_t<std::is_pointer<T>::value> >
-	{
-		using elem = typename std::remove_pointer<T>::type;
-		using type = CToReactorPtrT<elem>;
-		static inline type cast(T v); // implemented in Traits.inl
-	};
+// IsReference::value is true if T is of type Reference<X>, where X is any type.
+template <typename T, typename Enable = void> struct IsReference { static constexpr bool value = false; };
+template <typename T> struct IsReference<T, enable_if_t<IsDefined<typename T::reference_underlying_type>::value> > { static constexpr bool value = true; };
 
-	// CToReactor specialization for enum types.
-	template<typename T>
-	struct CToReactor<T, enable_if_t<std::is_enum<T>::value> >
-	{
-		using underlying = typename std::underlying_type<T>::type;
-		using type = CToReactorT<underlying>;
-		static type cast(T v); // implemented in Traits.inl
-	};
-
-	// IsRValue::value is true if T is of type RValue<X>, where X is any type.
-	template <typename T, typename Enable = void> struct IsRValue { static constexpr bool value = false; };
-	template <typename T> struct IsRValue<T, enable_if_t<IsDefined<typename T::rvalue_underlying_type>::value> > { static constexpr bool value = true; };
-
-	// IsLValue::value is true if T is of, or derives from type LValue<T>.
-	template <typename T> struct IsLValue { static constexpr bool value = std::is_base_of<LValue<T>, T>::value; };
-
-	// IsReference::value is true if T is of type Reference<X>, where X is any type.
-	template <typename T, typename Enable = void> struct IsReference { static constexpr bool value = false; };
-	template <typename T> struct IsReference<T, enable_if_t<IsDefined<typename T::reference_underlying_type>::value> > { static constexpr bool value = true; };
-
-	// ReactorTypeT<T> returns the LValue Reactor type for T.
-	// T can be a C-type, RValue or LValue.
-	template<typename T, typename ENABLE = void> struct ReactorType;
-	template<typename T> using ReactorTypeT = typename ReactorType<T>::type;
-	template<typename T> struct ReactorType<T, enable_if_t<IsDefined<CToReactorT<T>>::value> >
-	{
-		using type = CToReactorT<T>;
-		static type cast(T v) { return CToReactor<T>::cast(v); }
-	};
-	template<typename T> struct ReactorType<T, enable_if_t<IsRValue<T>::value> >
-	{
-		using type = typename T::rvalue_underlying_type;
-		static type cast(T v) { return type(v); }
-	};
-	template<typename T> struct ReactorType<T, enable_if_t<IsLValue<T>::value> >
-	{
-		using type = T;
-		static type cast(T v) { return type(v); }
-	};
-	template<typename T> struct ReactorType<T, enable_if_t<IsReference<T>::value> >
-	{
-		using type = T;
-		static type cast(T v) { return type(v); }
-	};
+// ReactorTypeT<T> returns the LValue Reactor type for T.
+// T can be a C-type, RValue or LValue.
+template<typename T, typename ENABLE = void> struct ReactorType;
+template<typename T> using ReactorTypeT = typename ReactorType<T>::type;
+template<typename T> struct ReactorType<T, enable_if_t<IsDefined<CToReactorT<T>>::value> >
+{
+	using type = CToReactorT<T>;
+	static type cast(T v) { return CToReactor<T>::cast(v); }
+};
+template<typename T> struct ReactorType<T, enable_if_t<IsRValue<T>::value> >
+{
+	using type = typename T::rvalue_underlying_type;
+	static type cast(T v) { return type(v); }
+};
+template<typename T> struct ReactorType<T, enable_if_t<IsLValue<T>::value> >
+{
+	using type = T;
+	static type cast(T v) { return type(v); }
+};
+template<typename T> struct ReactorType<T, enable_if_t<IsReference<T>::value> >
+{
+	using type = T;
+	static type cast(T v) { return type(v); }
+};
 
 
-	// Reactor types that can be used as a return type for a function.
-	template <typename T> struct CanBeUsedAsReturn { static constexpr bool value = false; };
-	template <> struct CanBeUsedAsReturn<Void>     { static constexpr bool value = true; };
-	template <> struct CanBeUsedAsReturn<Int>      { static constexpr bool value = true; };
-	template <> struct CanBeUsedAsReturn<UInt>     { static constexpr bool value = true; };
-	template <> struct CanBeUsedAsReturn<Float>    { static constexpr bool value = true; };
-	template <typename T> struct CanBeUsedAsReturn<Pointer<T>> { static constexpr bool value = true; };
+// Reactor types that can be used as a return type for a function.
+template <typename T> struct CanBeUsedAsReturn { static constexpr bool value = false; };
+template <> struct CanBeUsedAsReturn<Void>     { static constexpr bool value = true; };
+template <> struct CanBeUsedAsReturn<Int>      { static constexpr bool value = true; };
+template <> struct CanBeUsedAsReturn<UInt>     { static constexpr bool value = true; };
+template <> struct CanBeUsedAsReturn<Float>    { static constexpr bool value = true; };
+template <typename T> struct CanBeUsedAsReturn<Pointer<T>> { static constexpr bool value = true; };
 
-	// Reactor types that can be used as a parameter types for a function.
-	template <typename T> struct CanBeUsedAsParameter { static constexpr bool value = false; };
-	template <> struct CanBeUsedAsParameter<Int>      { static constexpr bool value = true; };
-	template <> struct CanBeUsedAsParameter<UInt>     { static constexpr bool value = true; };
-	template <> struct CanBeUsedAsParameter<Float>    { static constexpr bool value = true; };
-	template <typename T> struct CanBeUsedAsParameter<Pointer<T>> { static constexpr bool value = true; };
+// Reactor types that can be used as parameter types for a function.
+template <typename T> struct CanBeUsedAsParameter { static constexpr bool value = false; };
+template <> struct CanBeUsedAsParameter<Int>      { static constexpr bool value = true; };
+template <> struct CanBeUsedAsParameter<UInt>     { static constexpr bool value = true; };
+template <> struct CanBeUsedAsParameter<Float>    { static constexpr bool value = true; };
+template <typename T> struct CanBeUsedAsParameter<Pointer<T>> { static constexpr bool value = true; };
 
-	// AssertParameterTypeIsValid statically asserts that all template parameter
-	// types can be used as a Reactor function parameter.
-	template<typename T, typename ... other>
-	struct AssertParameterTypeIsValid : AssertParameterTypeIsValid<other...>
-	{
-		static_assert(CanBeUsedAsParameter<T>::value, "Invalid parameter type");
-	};
-	template<typename T>
-	struct AssertParameterTypeIsValid<T>
-	{
-		static_assert(CanBeUsedAsParameter<T>::value, "Invalid parameter type");
-	};
+// AssertParameterTypeIsValid statically asserts that all template parameter
+// types can be used as a Reactor function parameter.
+template<typename T, typename ... other>
+struct AssertParameterTypeIsValid : AssertParameterTypeIsValid<other...>
+{
+	static_assert(CanBeUsedAsParameter<T>::value, "Invalid parameter type");
+};
+template<typename T>
+struct AssertParameterTypeIsValid<T>
+{
+	static_assert(CanBeUsedAsParameter<T>::value, "Invalid parameter type");
+};
 
-	// AssertFunctionSignatureIsValid statically asserts that the Reactor
-	// function signature is valid.
-	template<typename Return, typename... Arguments>
-	class AssertFunctionSignatureIsValid;
-	template<typename Return>
-	class AssertFunctionSignatureIsValid<Return(Void)> {};
-	template<typename Return, typename... Arguments>
-	class AssertFunctionSignatureIsValid<Return(Arguments...)>
-	{
-		static_assert(CanBeUsedAsReturn<Return>::value, "Invalid return type");
-		static_assert(sizeof(AssertParameterTypeIsValid<Arguments...>) >= 0, "");
-	};
+// AssertFunctionSignatureIsValid statically asserts that the Reactor
+// function signature is valid.
+template<typename Return, typename... Arguments>
+class AssertFunctionSignatureIsValid;
+template<typename Return>
+class AssertFunctionSignatureIsValid<Return(Void)> {};
+template<typename Return, typename... Arguments>
+class AssertFunctionSignatureIsValid<Return(Arguments...)>
+{
+	static_assert(CanBeUsedAsReturn<Return>::value, "Invalid return type");
+	static_assert(sizeof(AssertParameterTypeIsValid<Arguments...>) >= 0, "");
+};
 
-} // namespace rr
+}  // namespace rr
 
 #endif // rr_Traits_hpp
diff --git a/src/Reactor/Traits.inl b/src/Reactor/Traits.inl
index 2e10568..23a5941 100644
--- a/src/Reactor/Traits.inl
+++ b/src/Reactor/Traits.inl
@@ -15,55 +15,55 @@
 #ifndef rr_Traits_inl
 #define rr_Traits_inl
 
-namespace rr
+namespace rr {
+
+// Non-specialized implementation of CToReactorPtr::cast() defaults to
+// returning a ConstantPointer for v.
+template<typename T, typename ENABLE>
+Pointer<Byte> CToReactorPtr<T, ENABLE>::cast(const T* v)
 {
-	// Non-specialized implementation of CToReactorPtr::cast() defaults to
-	// returning a ConstantPointer for v.
-	template<typename T, typename ENABLE>
-	Pointer<Byte> CToReactorPtr<T, ENABLE>::cast(const T* v)
-	{
-		return ConstantPointer(v);
-	}
+	return ConstantPointer(v);
+}
 
-	// CToReactorPtr specialization for T types that have a CToReactorT<>
-	// specialization.
-	template<typename T>
-	Pointer<CToReactorT<T>>
-	CToReactorPtr<T, enable_if_t< HasReactorType<T>::value > >::cast(const T* v)
-	{
-		return type(v);
-	}
+// CToReactorPtr specialization for T types that have a CToReactorT<>
+// specialization.
+template<typename T>
+Pointer<CToReactorT<T>>
+CToReactorPtr<T, enable_if_t< HasReactorType<T>::value > >::cast(const T* v)
+{
+	return type(v);
+}
 
-	// CToReactorPtr specialization for void*.
-	Pointer<Byte> CToReactorPtr<void, void>::cast(const void* v)
-	{
-		return ConstantPointer(v);
-	}
+// CToReactorPtr specialization for void*.
+Pointer<Byte> CToReactorPtr<void, void>::cast(const void* v)
+{
+	return ConstantPointer(v);
+}
 
-	// CToReactorPtrT specialization for function pointer types.
-	template<typename T>
-	Pointer<Byte>
-	CToReactorPtr<T, enable_if_t< std::is_function<T>::value > >::cast(T* v)
-	{
-		return ConstantPointer(v);
-	}
+// CToReactorPtr specialization for function pointer types.
+template<typename T>
+Pointer<Byte>
+CToReactorPtr<T, enable_if_t< std::is_function<T>::value > >::cast(T* v)
+{
+	return ConstantPointer(v);
+}
 
-	// CToReactor specialization for pointer types.
-	template<typename T>
-	CToReactorPtrT<typename std::remove_pointer<T>::type>
-	CToReactor<T, enable_if_t<std::is_pointer<T>::value> >::cast(T v)
-	{
-		return CToReactorPtr<elem>::cast(v);
-	}
+// CToReactor specialization for pointer types.
+template<typename T>
+CToReactorPtrT<typename std::remove_pointer<T>::type>
+CToReactor<T, enable_if_t<std::is_pointer<T>::value> >::cast(T v)
+{
+	return CToReactorPtr<elem>::cast(v);
+}
 
-	// CToReactor specialization for enum types.
-	template<typename T>
-	CToReactorT<typename std::underlying_type<T>::type>
-	CToReactor<T, enable_if_t<std::is_enum<T>::value> >::cast(T v)
-	{
-		return CToReactor<underlying>::cast(v);
-	}
+// CToReactor specialization for enum types.
+template<typename T>
+CToReactorT<typename std::underlying_type<T>::type>
+CToReactor<T, enable_if_t<std::is_enum<T>::value> >::cast(T v)
+{
+	return CToReactor<underlying>::cast(v);
+}
 
-} // namespace rr
+}  // namespace rr
 
 #endif // rr_Traits_inl
diff --git a/src/Reactor/x86.hpp b/src/Reactor/x86.hpp
index 6d3e8e8..dd98173 100644
--- a/src/Reactor/x86.hpp
+++ b/src/Reactor/x86.hpp
@@ -17,93 +17,93 @@
 
 #include "Reactor.hpp"
 
-namespace rr
-{
-	namespace x86
-	{
-		RValue<Int> cvtss2si(RValue<Float> val);
-		RValue<Int4> cvtps2dq(RValue<Float4> val);
+namespace rr {
+namespace x86 {
 
-		RValue<Float> rcpss(RValue<Float> val);
-		RValue<Float> sqrtss(RValue<Float> val);
-		RValue<Float> rsqrtss(RValue<Float> val);
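+// These wrappers are named after the x86/SSE instructions they emit; callers
+// are expected to verify the corresponding CPU feature before using them.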
+RValue<Int> cvtss2si(RValue<Float> val);
+RValue<Int4> cvtps2dq(RValue<Float4> val);
 
-		RValue<Float4> rcpps(RValue<Float4> val);
-		RValue<Float4> sqrtps(RValue<Float4> val);
-		RValue<Float4> rsqrtps(RValue<Float4> val);
-		RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y);
-		RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y);
+RValue<Float> rcpss(RValue<Float> val);
+RValue<Float> sqrtss(RValue<Float> val);
+RValue<Float> rsqrtss(RValue<Float> val);
 
-		RValue<Float> roundss(RValue<Float> val, unsigned char imm);
-		RValue<Float> floorss(RValue<Float> val);
-		RValue<Float> ceilss(RValue<Float> val);
+RValue<Float4> rcpps(RValue<Float4> val);
+RValue<Float4> sqrtps(RValue<Float4> val);
+RValue<Float4> rsqrtps(RValue<Float4> val);
+RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y);
+RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y);
 
-		RValue<Float4> roundps(RValue<Float4> val, unsigned char imm);
-		RValue<Float4> floorps(RValue<Float4> val);
-		RValue<Float4> ceilps(RValue<Float4> val);
+RValue<Float> roundss(RValue<Float> val, unsigned char imm);
+RValue<Float> floorss(RValue<Float> val);
+RValue<Float> ceilss(RValue<Float> val);
 
-		RValue<Int4> pabsd(RValue<Int4> x);
+RValue<Float4> roundps(RValue<Float4> val, unsigned char imm);
+RValue<Float4> floorps(RValue<Float4> val);
+RValue<Float4> ceilps(RValue<Float4> val);
 
-		RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y);
-		RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y);
-		RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y);
-		RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y);
-		RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y);
-		RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y);
-		RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y);
-		RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Int4> pabsd(RValue<Int4> x);
 
-		RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y);
+RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y);
+RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y);
+RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y);
+RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y);
+RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y);
+RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y);
 
-		RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y);
-		RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y);
+RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y);
 
-		RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y);
-		RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y);
-		RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y);
-		RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y);
 
-		RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y);
-		RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y);
-		RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y);
-		RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y);
+RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y);
+RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y);
+RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y);
 
-		RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y);
+RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y);
+RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y);
+RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y);
+RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y);
 
-		RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y);
-		RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y);
-		RValue<Short4> psraw(RValue<Short4> x, unsigned char y);
-		RValue<Short8> psraw(RValue<Short8> x, unsigned char y);
-		RValue<Short4> psllw(RValue<Short4> x, unsigned char y);
-		RValue<Short8> psllw(RValue<Short8> x, unsigned char y);
-		RValue<Int2> pslld(RValue<Int2> x, unsigned char y);
-		RValue<Int4> pslld(RValue<Int4> x, unsigned char y);
-		RValue<Int2> psrad(RValue<Int2> x, unsigned char y);
-		RValue<Int4> psrad(RValue<Int4> x, unsigned char y);
-		RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y);
-		RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y);
+RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y);
 
-		RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y);
-		RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y);
-		RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y);
-		RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y);
+RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y);
+RValue<Short4> psraw(RValue<Short4> x, unsigned char y);
+RValue<Short8> psraw(RValue<Short8> x, unsigned char y);
+RValue<Short4> psllw(RValue<Short4> x, unsigned char y);
+RValue<Short8> psllw(RValue<Short8> x, unsigned char y);
+RValue<Int2> pslld(RValue<Int2> x, unsigned char y);
+RValue<Int4> pslld(RValue<Int4> x, unsigned char y);
+RValue<Int2> psrad(RValue<Int2> x, unsigned char y);
+RValue<Int4> psrad(RValue<Int4> x, unsigned char y);
+RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y);
+RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y);
 
-		RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y);
-		RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y);
-		RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y);
+RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y);
+RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y);
+RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y);
+RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y);
 
-		RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y);
-		RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y);
-		RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y);
+RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y);
+RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y);
+RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y);
 
-		RValue<Int> movmskps(RValue<Float4> x);
-		RValue<Int> pmovmskb(RValue<Byte8> x);
+RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y);
+RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y);
+RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y);
 
-		RValue<Int4> pmovzxbd(RValue<Byte16> x);
-		RValue<Int4> pmovsxbd(RValue<SByte16> x);
-		RValue<Int4> pmovzxwd(RValue<UShort8> x);
-		RValue<Int4> pmovsxwd(RValue<Short8> x);
-	}
-}
+RValue<Int> movmskps(RValue<Float4> x);
+RValue<Int> pmovmskb(RValue<Byte8> x);
+
+RValue<Int4> pmovzxbd(RValue<Byte16> x);
+RValue<Int4> pmovsxbd(RValue<SByte16> x);
+RValue<Int4> pmovzxwd(RValue<UShort8> x);
+RValue<Int4> pmovsxwd(RValue<Short8> x);
+
+}  // namespace x86
+}  // namespace rr
 
 #endif   // rr_x86_hpp
\ No newline at end of file
diff --git a/src/System/CPUID.cpp b/src/System/CPUID.cpp
index c080034..3c62828 100644
--- a/src/System/CPUID.cpp
+++ b/src/System/CPUID.cpp
@@ -27,275 +27,276 @@
 	#include <sys/types.h>
 #endif
 
-namespace sw
+namespace sw {
+
+bool CPUID::MMX = detectMMX();
+bool CPUID::CMOV = detectCMOV();
+bool CPUID::SSE = detectSSE();
+bool CPUID::SSE2 = detectSSE2();
+bool CPUID::SSE3 = detectSSE3();
+bool CPUID::SSSE3 = detectSSSE3();
+bool CPUID::SSE4_1 = detectSSE4_1();
+int CPUID::cores = detectCoreCount();
+int CPUID::affinity = detectAffinity();
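+
+// The detect* initializers above run once during static initialization; the
+// enable* flags below can then be adjusted at runtime through the setters.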
+
+bool CPUID::enableMMX = true;
+bool CPUID::enableCMOV = true;
+bool CPUID::enableSSE = true;
+bool CPUID::enableSSE2 = true;
+bool CPUID::enableSSE3 = true;
+bool CPUID::enableSSSE3 = true;
+bool CPUID::enableSSE4_1 = true;
+
+void CPUID::setEnableMMX(bool enable)
 {
-	bool CPUID::MMX = detectMMX();
-	bool CPUID::CMOV = detectCMOV();
-	bool CPUID::SSE = detectSSE();
-	bool CPUID::SSE2 = detectSSE2();
-	bool CPUID::SSE3 = detectSSE3();
-	bool CPUID::SSSE3 = detectSSSE3();
-	bool CPUID::SSE4_1 = detectSSE4_1();
-	int CPUID::cores = detectCoreCount();
-	int CPUID::affinity = detectAffinity();
+	enableMMX = enable;
 
-	bool CPUID::enableMMX = true;
-	bool CPUID::enableCMOV = true;
-	bool CPUID::enableSSE = true;
-	bool CPUID::enableSSE2 = true;
-	bool CPUID::enableSSE3 = true;
-	bool CPUID::enableSSSE3 = true;
-	bool CPUID::enableSSE4_1 = true;
-
-	void CPUID::setEnableMMX(bool enable)
+	if(!enableMMX)
 	{
-		enableMMX = enable;
-
-		if(!enableMMX)
-		{
-			enableSSE = false;
-			enableSSE2 = false;
-			enableSSE3 = false;
-			enableSSSE3 = false;
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableCMOV(bool enable)
-	{
-		enableCMOV = enable;
-
-		if(!CMOV)
-		{
-			enableSSE = false;
-			enableSSE2 = false;
-			enableSSE3 = false;
-			enableSSSE3 = false;
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableSSE(bool enable)
-	{
-		enableSSE = enable;
-
-		if(enableSSE)
-		{
-			enableMMX = true;
-			enableCMOV = true;
-		}
-		else
-		{
-			enableSSE2 = false;
-			enableSSE3 = false;
-			enableSSSE3 = false;
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableSSE2(bool enable)
-	{
-		enableSSE2 = enable;
-
-		if(enableSSE2)
-		{
-			enableMMX = true;
-			enableCMOV = true;
-			enableSSE = true;
-		}
-		else
-		{
-			enableSSE3 = false;
-			enableSSSE3 = false;
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableSSE3(bool enable)
-	{
-		enableSSE3 = enable;
-
-		if(enableSSE3)
-		{
-			enableMMX = true;
-			enableCMOV = true;
-			enableSSE = true;
-			enableSSE2 = true;
-		}
-		else
-		{
-			enableSSSE3 = false;
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableSSSE3(bool enable)
-	{
-		enableSSSE3 = enable;
-
-		if(enableSSSE3)
-		{
-			enableMMX = true;
-			enableCMOV = true;
-			enableSSE = true;
-			enableSSE2 = true;
-			enableSSE3 = true;
-		}
-		else
-		{
-			enableSSE4_1 = false;
-		}
-	}
-
-	void CPUID::setEnableSSE4_1(bool enable)
-	{
-		enableSSE4_1 = enable;
-
-		if(enableSSE4_1)
-		{
-			enableMMX = true;
-			enableCMOV = true;
-			enableSSE = true;
-			enableSSE2 = true;
-			enableSSE3 = true;
-			enableSSSE3 = true;
-		}
-	}
-
-	static void cpuid(int registers[4], int info)
-	{
-		#if defined(__i386__) || defined(__x86_64__)
-			#if defined(_WIN32)
-				__cpuid(registers, info);
-			#else
-				__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
-			#endif
-		#else
-			registers[0] = 0;
-			registers[1] = 0;
-			registers[2] = 0;
-			registers[3] = 0;
-		#endif
-	}
-
-	bool CPUID::detectMMX()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return MMX = (registers[3] & 0x00800000) != 0;
-	}
-
-	bool CPUID::detectCMOV()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return CMOV = (registers[3] & 0x00008000) != 0;
-	}
-
-	bool CPUID::detectSSE()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return SSE = (registers[3] & 0x02000000) != 0;
-	}
-
-	bool CPUID::detectSSE2()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return SSE2 = (registers[3] & 0x04000000) != 0;
-	}
-
-	bool CPUID::detectSSE3()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return SSE3 = (registers[2] & 0x00000001) != 0;
-	}
-
-	bool CPUID::detectSSSE3()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return SSSE3 = (registers[2] & 0x00000200) != 0;
-	}
-
-	bool CPUID::detectSSE4_1()
-	{
-		int registers[4];
-		cpuid(registers, 1);
-		return SSE4_1 = (registers[2] & 0x00080000) != 0;
-	}
-
-	int CPUID::detectCoreCount()
-	{
-		int cores = 0;
-
-		#if defined(_WIN32)
-			DWORD_PTR processAffinityMask = 1;
-			DWORD_PTR systemAffinityMask = 1;
-
-			GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask);
-
-			while(systemAffinityMask)
-			{
-				if(systemAffinityMask & 1)
-				{
-					cores++;
-				}
-
-				systemAffinityMask >>= 1;
-			}
-		#else
-			cores = sysconf(_SC_NPROCESSORS_ONLN);
-		#endif
-
-		if(cores < 1)  cores = 1;
-		if(cores > 16) cores = 16;
-
-		return cores;   // FIXME: Number of physical cores
-	}
-
-	int CPUID::detectAffinity()
-	{
-		int cores = 0;
-
-		#if defined(_WIN32)
-			DWORD_PTR processAffinityMask = 1;
-			DWORD_PTR systemAffinityMask = 1;
-
-			GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask);
-
-			while(processAffinityMask)
-			{
-				if(processAffinityMask & 1)
-				{
-					cores++;
-				}
-
-				processAffinityMask >>= 1;
-			}
-		#else
-			return detectCoreCount();   // FIXME: Assumes no affinity limitation
-		#endif
-
-		if(cores < 1)  cores = 1;
-		if(cores > 16) cores = 16;
-
-		return cores;
-	}
-
-	void CPUID::setFlushToZero(bool enable)
-	{
-		#if defined(_MSC_VER)
-			_controlfp(enable ? _DN_FLUSH : _DN_SAVE, _MCW_DN);
-		#else
-			// Unimplemented
-		#endif
-	}
-
-	void CPUID::setDenormalsAreZero(bool enable)
-	{
-		// Unimplemented
+		enableSSE = false;
+		enableSSE2 = false;
+		enableSSE3 = false;
+		enableSSSE3 = false;
+		enableSSE4_1 = false;
 	}
 }
+
+void CPUID::setEnableCMOV(bool enable)
+{
+	enableCMOV = enable;
+
+	if(!enableCMOV)
+	{
+		enableSSE = false;
+		enableSSE2 = false;
+		enableSSE3 = false;
+		enableSSSE3 = false;
+		enableSSE4_1 = false;
+	}
+}
+
+void CPUID::setEnableSSE(bool enable)
+{
+	enableSSE = enable;
+
+	if(enableSSE)
+	{
+		enableMMX = true;
+		enableCMOV = true;
+	}
+	else
+	{
+		enableSSE2 = false;
+		enableSSE3 = false;
+		enableSSSE3 = false;
+		enableSSE4_1 = false;
+	}
+}
+
+void CPUID::setEnableSSE2(bool enable)
+{
+	enableSSE2 = enable;
+
+	if(enableSSE2)
+	{
+		enableMMX = true;
+		enableCMOV = true;
+		enableSSE = true;
+	}
+	else
+	{
+		enableSSE3 = false;
+		enableSSSE3 = false;
+		enableSSE4_1 = false;
+	}
+}
+
+void CPUID::setEnableSSE3(bool enable)
+{
+	enableSSE3 = enable;
+
+	if(enableSSE3)
+	{
+		enableMMX = true;
+		enableCMOV = true;
+		enableSSE = true;
+		enableSSE2 = true;
+	}
+	else
+	{
+		enableSSSE3 = false;
+		enableSSE4_1 = false;
+	}
+}
+
+void CPUID::setEnableSSSE3(bool enable)
+{
+	enableSSSE3 = enable;
+
+	if(enableSSSE3)
+	{
+		enableMMX = true;
+		enableCMOV = true;
+		enableSSE = true;
+		enableSSE2 = true;
+		enableSSE3 = true;
+	}
+	else
+	{
+		enableSSE4_1 = false;
+	}
+}
+
+void CPUID::setEnableSSE4_1(bool enable)
+{
+	enableSSE4_1 = enable;
+
+	if(enableSSE4_1)
+	{
+		enableMMX = true;
+		enableCMOV = true;
+		enableSSE = true;
+		enableSSE2 = true;
+		enableSSE3 = true;
+		enableSSSE3 = true;
+	}
+}
+
+static void cpuid(int registers[4], int info)
+{
+	#if defined(__i386__) || defined(__x86_64__)
+		#if defined(_WIN32)
+			__cpuid(registers, info);
+		#else
+			__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
+		#endif
+	#else
+		registers[0] = 0;
+		registers[1] = 0;
+		registers[2] = 0;
+		registers[3] = 0;
+	#endif
+}
+
+bool CPUID::detectMMX()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return MMX = (registers[3] & 0x00800000) != 0;
+}
+
+bool CPUID::detectCMOV()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return CMOV = (registers[3] & 0x00008000) != 0;
+}
+
+bool CPUID::detectSSE()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return SSE = (registers[3] & 0x02000000) != 0;
+}
+
+bool CPUID::detectSSE2()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return SSE2 = (registers[3] & 0x04000000) != 0;
+}
+
+bool CPUID::detectSSE3()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return SSE3 = (registers[2] & 0x00000001) != 0;
+}
+
+bool CPUID::detectSSSE3()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return SSSE3 = (registers[2] & 0x00000200) != 0;
+}
+
+bool CPUID::detectSSE4_1()
+{
+	int registers[4];
+	cpuid(registers, 1);
+	return SSE4_1 = (registers[2] & 0x00080000) != 0;
+}
+
+int CPUID::detectCoreCount()
+{
+	int cores = 0;
+
+	#if defined(_WIN32)
+		DWORD_PTR processAffinityMask = 1;
+		DWORD_PTR systemAffinityMask = 1;
+
+		GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask);
+
+		while(systemAffinityMask)
+		{
+			if(systemAffinityMask & 1)
+			{
+				cores++;
+			}
+
+			systemAffinityMask >>= 1;
+		}
+	#else
+		cores = sysconf(_SC_NPROCESSORS_ONLN);
+	#endif
+
+	if(cores < 1)  cores = 1;
+	if(cores > 16) cores = 16;
+
+	return cores;   // FIXME: Number of physical cores
+}
+
+int CPUID::detectAffinity()
+{
+	int cores = 0;
+
+	#if defined(_WIN32)
+		DWORD_PTR processAffinityMask = 1;
+		DWORD_PTR systemAffinityMask = 1;
+
+		GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask);
+
+		while(processAffinityMask)
+		{
+			if(processAffinityMask & 1)
+			{
+				cores++;
+			}
+
+			processAffinityMask >>= 1;
+		}
+	#else
+		return detectCoreCount();   // FIXME: Assumes no affinity limitation
+	#endif
+
+	if(cores < 1)  cores = 1;
+	if(cores > 16) cores = 16;
+
+	return cores;
+}
+
+void CPUID::setFlushToZero(bool enable)
+{
+	#if defined(_MSC_VER)
+		_controlfp(enable ? _DN_FLUSH : _DN_SAVE, _MCW_DN);
+	#else
+		// Unimplemented
+	#endif
+}
+
+void CPUID::setDenormalsAreZero(bool enable)
+{
+	// Unimplemented
+}
+
+}  // namespace sw
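
For reference, the detect*() functions above all issue CPUID leaf 1 and test one
feature bit in EDX or ECX. A minimal standalone sketch of the same probe, assuming
an x86/x86-64 target; the bit masks are the same ones used above:

    #include <stdio.h>

    #if defined(_WIN32)
        #include <intrin.h>
    #endif

    static void cpuid(int registers[4], int info)
    {
        #if defined(_WIN32)
            __cpuid(registers, info);
        #else
            __asm volatile("cpuid" : "=a"(registers[0]), "=b"(registers[1]),
                                     "=c"(registers[2]), "=d"(registers[3]) : "a"(info));
        #endif
    }

    int main()
    {
        int registers[4];
        cpuid(registers, 1);   // Leaf 1: feature bits in ECX (registers[2]) and EDX (registers[3])

        printf("SSE2:   %s\n", (registers[3] & 0x04000000) ? "yes" : "no");   // EDX bit 26
        printf("SSE4.1: %s\n", (registers[2] & 0x00080000) ? "yes" : "no");   // ECX bit 19

        return 0;
    }
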
diff --git a/src/System/CPUID.hpp b/src/System/CPUID.hpp
index 3c21cd7..5fbb89f 100644
--- a/src/System/CPUID.hpp
+++ b/src/System/CPUID.hpp
@@ -15,123 +15,127 @@
 #ifndef sw_CPUID_hpp
 #define sw_CPUID_hpp
 
-namespace sw
+namespace sw {
+
+#if !defined(__i386__) && defined(_M_IX86)
+	#define __i386__ 1
+#endif
+
+#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
+	#define __x86_64__ 1
+#endif
+
+class CPUID
 {
-	#if !defined(__i386__) && defined(_M_IX86)
-		#define __i386__ 1
-	#endif
+public:
+	static bool supportsMMX();
+	static bool supportsCMOV();
+	static bool supportsMMX2();   // MMX instructions added by SSE: pshufw, pmulhuw, pmovmskb, pavgw/b, pextrw, pinsrw, pmaxsw/ub, etc.
+	static bool supportsSSE();
+	static bool supportsSSE2();
+	static bool supportsSSE3();
+	static bool supportsSSSE3();
+	static bool supportsSSE4_1();
+	static int coreCount();
+	static int processAffinity();
 
-	#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
-		#define __x86_64__ 1
-	#endif
+	static void setEnableMMX(bool enable);
+	static void setEnableCMOV(bool enable);
+	static void setEnableSSE(bool enable);
+	static void setEnableSSE2(bool enable);
+	static void setEnableSSE3(bool enable);
+	static void setEnableSSSE3(bool enable);
+	static void setEnableSSE4_1(bool enable);
 
-	class CPUID
-	{
-	public:
-		static bool supportsMMX();
-		static bool supportsCMOV();
-		static bool supportsMMX2();   // MMX instructions added by SSE: pshufw, pmulhuw, pmovmskb, pavgw/b, pextrw, pinsrw, pmaxsw/ub, etc.
-		static bool supportsSSE();
-		static bool supportsSSE2();
-		static bool supportsSSE3();
-		static bool supportsSSSE3();
-		static bool supportsSSE4_1();
-		static int coreCount();
-		static int processAffinity();
+	static void setFlushToZero(bool enable);        // Denormal results are written as zero
+	static void setDenormalsAreZero(bool enable);   // Denormal inputs are read as zero
 
-		static void setEnableMMX(bool enable);
-		static void setEnableCMOV(bool enable);
-		static void setEnableSSE(bool enable);
-		static void setEnableSSE2(bool enable);
-		static void setEnableSSE3(bool enable);
-		static void setEnableSSSE3(bool enable);
-		static void setEnableSSE4_1(bool enable);
+private:
+	static bool MMX;
+	static bool CMOV;
+	static bool SSE;
+	static bool SSE2;
+	static bool SSE3;
+	static bool SSSE3;
+	static bool SSE4_1;
+	static int cores;
+	static int affinity;
 
-		static void setFlushToZero(bool enable);        // Denormal results are written as zero
-		static void setDenormalsAreZero(bool enable);   // Denormal inputs are read as zero
+	static bool enableMMX;
+	static bool enableCMOV;
+	static bool enableSSE;
+	static bool enableSSE2;
+	static bool enableSSE3;
+	static bool enableSSSE3;
+	static bool enableSSE4_1;
 
-	private:
-		static bool MMX;
-		static bool CMOV;
-		static bool SSE;
-		static bool SSE2;
-		static bool SSE3;
-		static bool SSSE3;
-		static bool SSE4_1;
-		static int cores;
-		static int affinity;
+	static bool detectMMX();
+	static bool detectCMOV();
+	static bool detectSSE();
+	static bool detectSSE2();
+	static bool detectSSE3();
+	static bool detectSSSE3();
+	static bool detectSSE4_1();
+	static int detectCoreCount();
+	static int detectAffinity();
+};
 
-		static bool enableMMX;
-		static bool enableCMOV;
-		static bool enableSSE;
-		static bool enableSSE2;
-		static bool enableSSE3;
-		static bool enableSSSE3;
-		static bool enableSSE4_1;
+}  // namespace sw
 
-		static bool detectMMX();
-		static bool detectCMOV();
-		static bool detectSSE();
-		static bool detectSSE2();
-		static bool detectSSE3();
-		static bool detectSSSE3();
-		static bool detectSSE4_1();
-		static int detectCoreCount();
-		static int detectAffinity();
-	};
+/* Inline implementation */
+
+namespace sw {
+
+inline bool CPUID::supportsMMX()
+{
+	return MMX && enableMMX;
 }
 
-namespace sw
+inline bool CPUID::supportsCMOV()
 {
-	inline bool CPUID::supportsMMX()
-	{
-		return MMX && enableMMX;
-	}
-
-	inline bool CPUID::supportsCMOV()
-	{
-		return CMOV && enableCMOV;
-	}
-
-	inline bool CPUID::supportsMMX2()
-	{
-		return supportsSSE();   // Coincides with 64-bit integer vector instructions supported by SSE
-	}
-
-	inline bool CPUID::supportsSSE()
-	{
-		return SSE && enableSSE;
-	}
-
-	inline bool CPUID::supportsSSE2()
-	{
-		return SSE2 && enableSSE2;
-	}
-
-	inline bool CPUID::supportsSSE3()
-	{
-		return SSE3 && enableSSE3;
-	}
-
-	inline bool CPUID::supportsSSSE3()
-	{
-		return SSSE3 && enableSSSE3;
-	}
-
-	inline bool CPUID::supportsSSE4_1()
-	{
-		return SSE4_1 && enableSSE4_1;
-	}
-
-	inline int CPUID::coreCount()
-	{
-		return cores;
-	}
-
-	inline int CPUID::processAffinity()
-	{
-		return affinity;
-	}
+	return CMOV && enableCMOV;
 }
 
+inline bool CPUID::supportsMMX2()
+{
+	return supportsSSE();   // Coincides with 64-bit integer vector instructions supported by SSE
+}
+
+inline bool CPUID::supportsSSE()
+{
+	return SSE && enableSSE;
+}
+
+inline bool CPUID::supportsSSE2()
+{
+	return SSE2 && enableSSE2;
+}
+
+inline bool CPUID::supportsSSE3()
+{
+	return SSE3 && enableSSE3;
+}
+
+inline bool CPUID::supportsSSSE3()
+{
+	return SSSE3 && enableSSSE3;
+}
+
+inline bool CPUID::supportsSSE4_1()
+{
+	return SSE4_1 && enableSSE4_1;
+}
+
+inline int CPUID::coreCount()
+{
+	return cores;
+}
+
+inline int CPUID::processAffinity()
+{
+	return affinity;
+}
+
+}  // namespace sw
+
 #endif   // sw_CPUID_hpp
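
The inline supports*() accessors AND a detected capability with its runtime enable
flag, so a feature can be masked off for testing without re-probing the hardware.
A hypothetical usage sketch (selectCodePath() is made up for illustration):

    #include "CPUID.hpp"

    void selectCodePath()
    {
        // Force the fallback path, e.g. to reproduce a bug reported from a
        // machine without SSE4.1. This only clears the enable flag; the
        // detected capability itself is left untouched.
        sw::CPUID::setEnableSSE4_1(false);

        if(sw::CPUID::supportsSSE4_1())   // Now false even when the CPU has SSE4.1
        {
            // Vectorized path
        }
        else
        {
            // Scalar fallback
        }
    }
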
diff --git a/src/System/Configurator.cpp b/src/System/Configurator.cpp
index ead1d28..e544383 100644
--- a/src/System/Configurator.cpp
+++ b/src/System/Configurator.cpp
@@ -27,229 +27,230 @@
 #include <unistd.h>
 #endif
 
-namespace sw
+namespace sw {
+
+Configurator::Configurator(string iniPath)
 {
-	Configurator::Configurator(string iniPath)
-	{
-		path = iniPath;
+	path = iniPath;
 
-		readFile();
-	}
+	readFile();
+}
 
-	Configurator::~Configurator()
-	{
-	}
+Configurator::~Configurator()
+{
+}
 
-	bool Configurator::readFile()
+bool Configurator::readFile()
+{
+	#if defined(__unix__)
+		if(access(path.c_str(), R_OK) != 0)
+		{
+			return false;
+		}
+	#endif
+
+	fstream file(path.c_str(), ios::in);
+	if(file.fail()) return false;
+
+	string line;
+	string keyName;
+
+	while(getline(file, line))
 	{
-		#if defined(__unix__)
-			if(access(path.c_str(), R_OK) != 0)
+		if(line.length())
+		{
+			if(line[line.length() - 1] == '\r')
 			{
+				line = line.substr(0, line.length() - 1);
+			}
+
+			if(!isprint(line[0]))
+			{
+			//	printf("Failing on char %d\n", line[0]);
+				file.close();
 				return false;
 			}
-		#endif
 
-		fstream file(path.c_str(), ios::in);
-		if(file.fail()) return false;
+			string::size_type pLeft = line.find_first_of(";#[=");
 
-		string line;
-		string keyName;
-
-		while(getline(file, line))
-		{
-			if(line.length())
+			if(pLeft != string::npos)
 			{
-				if(line[line.length() - 1] == '\r')
+				switch(line[pLeft])
 				{
-					line = line.substr(0, line.length() - 1);
-				}
-
-				if(!isprint(line[0]))
-				{
-				//	printf("Failing on char %d\n", line[0]);
-					file.close();
-					return false;
-				}
-
-				string::size_type pLeft = line.find_first_of(";#[=");
-
-				if(pLeft != string::npos)
-				{
-					switch(line[pLeft])
+				case '[':
 					{
-					case '[':
-						{
-							string::size_type pRight = line.find_last_of("]");
+						string::size_type pRight = line.find_last_of("]");
 
-							if(pRight != string::npos && pRight > pLeft)
-							{
-								keyName = line.substr(pLeft + 1, pRight - pLeft - 1);
-								addKeyName(keyName);
-							}
-						}
-						break;
-					case '=':
+						if(pRight != string::npos && pRight > pLeft)
 						{
-							string valueName = line.substr(0, pLeft);
-							string value = line.substr(pLeft + 1);
-							addValue(keyName, valueName, value);
+							keyName = line.substr(pLeft + 1, pRight - pLeft - 1);
+							addKeyName(keyName);
 						}
-						break;
-					case ';':
-					case '#':
-						// Ignore comments
-						break;
 					}
+					break;
+				case '=':
+					{
+						string valueName = line.substr(0, pLeft);
+						string value = line.substr(pLeft + 1);
+						addValue(keyName, valueName, value);
+					}
+					break;
+				case ';':
+				case '#':
+					// Ignore comments
+					break;
 				}
 			}
 		}
-
-		file.close();
-
-		if(names.size())
-		{
-			return true;
-		}
-
-		return false;
 	}
 
-	void Configurator::writeFile(std::string title)
+	file.close();
+
+	if(names.size())
 	{
-		#if defined(__unix__)
-			if(access(path.c_str(), W_OK) != 0)
-			{
-				return;
-			}
-		#endif
-
-		fstream file(path.c_str(), ios::out);
-		if(file.fail()) return;
-
-		file << "; " << title << endl << endl;
-
-		for(unsigned int keyID = 0; keyID < sections.size(); keyID++)
-		{
-			file << "[" << names[keyID] << "]" << endl;
-
-			for(unsigned int valueID = 0; valueID < sections[keyID].names.size(); valueID++)
-			{
-				file << sections[keyID].names[valueID] << "=" << sections[keyID].values[valueID] << endl;
-			}
-
-			file << endl;
-		}
-
-		file.close();
+		return true;
 	}
 
-	int Configurator::findKey(string keyName) const
-	{
-		for(unsigned int keyID = 0; keyID < names.size(); keyID++)
+	return false;
+}
+
+void Configurator::writeFile(std::string title)
+{
+	#if defined(__unix__)
+		if(access(path.c_str(), W_OK) != 0)
 		{
-			if(names[keyID] == keyName)
-			{
-				return keyID;
-			}
+			return;
+		}
+	#endif
+
+	fstream file(path.c_str(), ios::out);
+	if(file.fail()) return;
+
+	file << "; " << title << endl << endl;
+
+	for(unsigned int keyID = 0; keyID < sections.size(); keyID++)
+	{
+		file << "[" << names[keyID] << "]" << endl;
+
+		for(unsigned int valueID = 0; valueID < sections[keyID].names.size(); valueID++)
+		{
+			file << sections[keyID].names[valueID] << "=" << sections[keyID].values[valueID] << endl;
 		}
 
+		file << endl;
+	}
+
+	file.close();
+}
+
+int Configurator::findKey(string keyName) const
+{
+	for(unsigned int keyID = 0; keyID < names.size(); keyID++)
+	{
+		if(names[keyID] == keyName)
+		{
+			return keyID;
+		}
+	}
+
+	return -1;
+}
+
+int Configurator::findValue(unsigned int keyID, string valueName) const
+{
+	if(!sections.size() || keyID >= sections.size())
+	{
 		return -1;
 	}
 
-	int Configurator::findValue(unsigned int keyID, string valueName) const
+	for(unsigned int valueID = 0; valueID < sections[keyID].names.size(); ++valueID)
 	{
-		if(!sections.size() || keyID >= sections.size())
+		if(sections[keyID].names[valueID] == valueName)
 		{
-			return -1;
-		}
-
-		for(unsigned int valueID = 0; valueID < sections[keyID].names.size(); ++valueID)
-		{
-			if(sections[keyID].names[valueID] == valueName)
-			{
-				return valueID;
-			}
-		}
-
-		return -1;
-	}
-
-	unsigned int Configurator::addKeyName(string keyName)
-	{
-		names.resize(names.size() + 1, keyName);
-		sections.resize(sections.size() + 1);
-		return (unsigned int)names.size() - 1;
-	}
-
-	void Configurator::addValue(string const keyName, string const valueName, string const value)
-	{
-		int keyID = findKey(keyName);
-
-		if(keyID == -1)
-		{
-			keyID = addKeyName(keyName);
-		}
-
-		int valueID = findValue(keyID, valueName);
-
-		if(valueID == -1)
-		{
-			sections[keyID].names.resize(sections[keyID].names.size() + 1, valueName);
-			sections[keyID].values.resize(sections[keyID].values.size() + 1, value);
-		}
-		else
-		{
-			sections[keyID].values[valueID] = value;
+			return valueID;
 		}
 	}
 
-	string Configurator::getValue(string keyName, string valueName, string defaultValue) const
-	{
-		int keyID = findKey(keyName);
-		if(keyID == -1) return defaultValue;
-		int valueID = findValue((unsigned int)keyID, valueName);
-		if(valueID == -1) return defaultValue;
+	return -1;
+}
 
-		return sections[keyID].values[valueID];
+unsigned int Configurator::addKeyName(string keyName)
+{
+	names.resize(names.size() + 1, keyName);
+	sections.resize(sections.size() + 1);
+	return (unsigned int)names.size() - 1;
+}
+
+void Configurator::addValue(string const keyName, string const valueName, string const value)
+{
+	int keyID = findKey(keyName);
+
+	if(keyID == -1)
+	{
+		keyID = addKeyName(keyName);
 	}
 
-	int Configurator::getInteger(string keyName, string valueName, int defaultValue) const
+	int valueID = findValue(keyID, valueName);
+
+	if(valueID == -1)
 	{
-		char svalue[256];
-
-		sprintf(svalue, "%d", defaultValue);
-
-		return atoi(getValue(keyName, valueName, svalue).c_str());
+		sections[keyID].names.resize(sections[keyID].names.size() + 1, valueName);
+		sections[keyID].values.resize(sections[keyID].values.size() + 1, value);
 	}
-
-	bool Configurator::getBoolean(string keyName, string valueName, bool defaultValue) const
+	else
 	{
-		return getInteger(keyName, valueName, (int)defaultValue) != 0;
-	}
-
-	double Configurator::getFloat(string keyName, string valueName, double defaultValue) const
-	{
-		char svalue[256];
-
-		sprintf(svalue, "%f", defaultValue);
-
-		return atof(getValue(keyName, valueName, svalue).c_str());
-	}
-
-	unsigned int Configurator::getFormatted(string keyName, string valueName, char *format,
-											void *v1, void *v2, void *v3, void *v4,
-											void *v5, void *v6, void *v7, void *v8,
-											void *v9, void *v10, void *v11, void *v12,
-											void *v13, void *v14, void *v15, void *v16)
-	{
-		string value = getValue(keyName, valueName);
-
-		if(!value.length()) return false;
-
-		unsigned int nVals = sscanf(value.c_str(), format,
-									v1, v2, v3, v4, v5, v6, v7, v8,
-									v9, v10, v11, v12, v13, v14, v15, v16);
-
-		return nVals;
+		sections[keyID].values[valueID] = value;
 	}
 }
+
+string Configurator::getValue(string keyName, string valueName, string defaultValue) const
+{
+	int keyID = findKey(keyName);
+	if(keyID == -1) return defaultValue;
+	int valueID = findValue((unsigned int)keyID, valueName);
+	if(valueID == -1) return defaultValue;
+
+	return sections[keyID].values[valueID];
+}
+
+int Configurator::getInteger(string keyName, string valueName, int defaultValue) const
+{
+	char svalue[256];
+
+	sprintf(svalue, "%d", defaultValue);
+
+	return atoi(getValue(keyName, valueName, svalue).c_str());
+}
+
+bool Configurator::getBoolean(string keyName, string valueName, bool defaultValue) const
+{
+	return getInteger(keyName, valueName, (int)defaultValue) != 0;
+}
+
+double Configurator::getFloat(string keyName, string valueName, double defaultValue) const
+{
+	char svalue[256];
+
+	sprintf(svalue, "%f", defaultValue);
+
+	return atof(getValue(keyName, valueName, svalue).c_str());
+}
+
+unsigned int Configurator::getFormatted(string keyName, string valueName, char *format,
+										void *v1, void *v2, void *v3, void *v4,
+										void *v5, void *v6, void *v7, void *v8,
+										void *v9, void *v10, void *v11, void *v12,
+										void *v13, void *v14, void *v15, void *v16)
+{
+	string value = getValue(keyName, valueName);
+
+	if(!value.length()) return 0;
+
+	unsigned int nVals = sscanf(value.c_str(), format,
+								v1, v2, v3, v4, v5, v6, v7, v8,
+								v9, v10, v11, v12, v13, v14, v15, v16);
+
+	return nVals;
+}
+
+}  // namespace sw
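
Configurator is a small INI reader: a '[section]' line selects the current key,
'name=value' lines add values to it, and lines starting with ';' or '#' are
comments. A usage sketch; the file name and its entries are hypothetical:

    #include "Configurator.hpp"

    void readSettings()
    {
        // Suppose "settings.ini" contains:
        //
        //   ; renderer options
        //   [Processor]
        //   DisableSSE4_1=1
        //   ThreadCount=4
        //
        sw::Configurator ini("settings.ini");

        int threadCount = ini.getInteger("Processor", "ThreadCount", 1);            // 4
        bool disableSSE4_1 = ini.getBoolean("Processor", "DisableSSE4_1", false);   // true
        double scale = ini.getFloat("Quality", "Scale", 1.0);   // No [Quality] section: default
    }
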
diff --git a/src/System/Configurator.hpp b/src/System/Configurator.hpp
index 6fd930c..9a27a39 100644
--- a/src/System/Configurator.hpp
+++ b/src/System/Configurator.hpp
@@ -20,47 +20,48 @@
 
 #include <stdlib.h>
 
-namespace sw
+namespace sw {
+
+class Configurator
 {
-	class Configurator
+public:
+	Configurator(std::string iniPath = "");
+
+	~Configurator();
+
+	std::string getValue(std::string sectionName, std::string valueName, std::string defaultValue = "") const;
+	int getInteger(std::string sectionName, std::string valueName, int defaultValue = 0) const;
+	bool getBoolean(std::string sectionName, std::string valueName, bool defaultValue = false) const;
+	double getFloat(std::string sectionName, std::string valueName, double defaultValue = 0.0) const;
+	unsigned int getFormatted(std::string sectionName, std::string valueName, char *format,
+	                          void *v1 = 0, void *v2 = 0, void *v3 = 0, void *v4 = 0,
+	                          void *v5 = 0, void *v6 = 0, void *v7 = 0, void *v8 = 0,
+	                          void *v9 = 0, void *v10 = 0, void *v11 = 0, void *v12 = 0,
+	                          void *v13 = 0, void *v14 = 0, void *v15 = 0, void *v16 = 0);
+
+	void addValue(std::string sectionName, std::string valueName, std::string value);
+
+	void writeFile(std::string title = "Configuration File");
+
+private:
+	bool readFile();
+
+	unsigned int addKeyName(std::string sectionName);
+	int findKey(std::string sectionName) const;
+	int findValue(unsigned int sectionID, std::string valueName) const;
+
+	std::string path;
+
+	struct Section
 	{
-	public:
-		Configurator(std::string iniPath = "");
-
-		~Configurator();
-
-		std::string getValue(std::string sectionName, std::string valueName, std::string defaultValue = "") const;
-		int getInteger(std::string sectionName, std::string valueName, int defaultValue = 0) const;
-		bool getBoolean(std::string sectionName, std::string valueName, bool defaultValue = false) const;
-		double getFloat(std::string sectionName, std::string valueName, double defaultValue = 0.0) const;
-		unsigned int getFormatted(std::string sectionName, std::string valueName, char *format,
-		                          void *v1 = 0, void *v2 = 0, void *v3 = 0, void *v4 = 0,
-		                          void *v5 = 0, void *v6 = 0, void *v7 = 0, void *v8 = 0,
-		                          void *v9 = 0, void *v10 = 0, void *v11 = 0, void *v12 = 0,
-		                          void *v13 = 0, void *v14 = 0, void *v15 = 0, void *v16 = 0);
-
-		void addValue(std::string sectionName, std::string valueName, std::string value);
-
-		void writeFile(std::string title = "Configuration File");
-
-	private:
-		bool readFile();
-
-		unsigned int addKeyName(std::string sectionName);
-		int findKey(std::string sectionName) const;
-		int findValue(unsigned int sectionID, std::string valueName) const;
-
-		std::string path;
-
-		struct Section
-		{
-			std::vector<std::string> names;
-			std::vector<std::string> values;
-		};
-
-		std::vector<Section> sections;
 		std::vector<std::string> names;
+		std::vector<std::string> values;
 	};
-}
+
+	std::vector<Section> sections;
+	std::vector<std::string> names;
+};
+
+}  // namespace sw
 
 #endif   // sw_Configurator_hpp
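
getFormatted() forwards up to 16 output pointers to sscanf, so one value can
encode a tuple. Note that the format parameter is a non-const char*, so a string
literal cannot be passed directly in standard C++. A sketch with made-up section
and value names:

    #include "Configurator.hpp"

    void readViewport(sw::Configurator &ini)
    {
        int x = 0, y = 0, width = 640, height = 480;

        char format[] = "%d,%d,%d,%d";   // Mutable buffer; the parameter is char*, not const char*

        // Parses e.g. "Viewport=0,0,1024,768"; returns the number of fields matched.
        unsigned int n = ini.getFormatted("Window", "Viewport", format,
                                          &x, &y, &width, &height);
        if(n != 4)
        {
            // Entry missing or malformed; keep the defaults.
        }
    }
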
diff --git a/src/System/Debug.cpp b/src/System/Debug.cpp
index acf469e..dd66d56 100644
--- a/src/System/Debug.cpp
+++ b/src/System/Debug.cpp
@@ -17,8 +17,8 @@
 #include <stdio.h>
 #include <stdarg.h>
 
-namespace sw
-{
+namespace sw {
+
 void trace(const char *format, ...)
 {
 	if(false)
@@ -36,4 +36,5 @@
 		}
 	}
 }
-}
+
+}  // namespace sw
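
trace() is a printf-style logger (its body is compiled out above via if(false)).
For reference, the standard va_list forwarding it relies on, sketched standalone;
the log file name is an assumption:

    #include <stdio.h>
    #include <stdarg.h>

    void trace(const char *format, ...)
    {
        FILE *file = fopen("debug.txt", "a");   // Hypothetical log file name

        if(file)
        {
            va_list vararg;
            va_start(vararg, format);
            vfprintf(file, format, vararg);   // Forward the variable arguments to vfprintf
            va_end(vararg);

            fclose(file);
        }
    }
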
diff --git a/src/System/Debug.hpp b/src/System/Debug.hpp
index 0c862d4..d26623c 100644
--- a/src/System/Debug.hpp
+++ b/src/System/Debug.hpp
@@ -25,10 +25,11 @@
 #undef min
 #undef max
 
-namespace sw
-{
+namespace sw {
+
 void trace(const char *format, ...);
 inline void trace() {}
+
 }
 
 #if !defined(NDEBUG) || defined(DCHECK_ALWAYS_ON)
diff --git a/src/System/Half.cpp b/src/System/Half.cpp
index cde8190..4f8f7cd 100644
--- a/src/System/Half.cpp
+++ b/src/System/Half.cpp
@@ -14,89 +14,90 @@
 
 #include "Half.hpp"
 
-namespace sw
+namespace sw {
+
+half::half(float fp32)
 {
-	half::half(float fp32)
+	unsigned int fp32i = *(unsigned int*)&fp32;
+	unsigned int sign = (fp32i & 0x80000000) >> 16;
+	unsigned int abs = fp32i & 0x7FFFFFFF;
+
+	if(abs > 0x47FFEFFF)   // Infinity
 	{
-		unsigned int fp32i = *(unsigned int*)&fp32;
-		unsigned int sign = (fp32i & 0x80000000) >> 16;
-		unsigned int abs = fp32i & 0x7FFFFFFF;
+		fp16i = sign | 0x7FFF;
+	}
+	else if(abs < 0x38800000)   // Denormal
+	{
+		unsigned int mantissa = (abs & 0x007FFFFF) | 0x00800000;
+		int e = 113 - (abs >> 23);
 
-		if(abs > 0x47FFEFFF)   // Infinity
+		if(e < 24)
 		{
-			fp16i = sign | 0x7FFF;
-		}
-		else if(abs < 0x38800000)   // Denormal
-		{
-			unsigned int mantissa = (abs & 0x007FFFFF) | 0x00800000;
-			int e = 113 - (abs >> 23);
-
-			if(e < 24)
-			{
-				abs = mantissa >> e;
-			}
-			else
-			{
-				abs = 0;
-			}
-
-			fp16i = sign | (abs + 0x00000FFF + ((abs >> 13) & 1)) >> 13;
+			abs = mantissa >> e;
 		}
 		else
 		{
-			fp16i = sign | (abs + 0xC8000000 + 0x00000FFF + ((abs >> 13) & 1)) >> 13;
-		}
-	}
-
-	half::operator float() const
-	{
-		unsigned int fp32i;
-
-		int s = (fp16i >> 15) & 0x00000001;
-		int e = (fp16i >> 10) & 0x0000001F;
-		int m =  fp16i        & 0x000003FF;
-
-		if(e == 0)
-		{
-			if(m == 0)
-			{
-				fp32i = s << 31;
-
-				return (float&)fp32i;
-			}
-			else
-			{
-				while(!(m & 0x00000400))
-				{
-					m <<= 1;
-					e -=  1;
-				}
-
-				e += 1;
-				m &= ~0x00000400;
-			}
+			abs = 0;
 		}
 
-		e = e + (127 - 15);
-		m = m << 13;
-
-		fp32i = (s << 31) | (e << 23) | m;
-
-		return (float&)fp32i;
+		fp16i = sign | (abs + 0x00000FFF + ((abs >> 13) & 1)) >> 13;
 	}
-
-	half &half::operator=(half h)
+	else
 	{
-		fp16i = h.fp16i;
-
-		return *this;
-	}
-
-
-	half &half::operator=(float f)
-	{
-		*this = half(f);
-
-		return *this;
+		fp16i = sign | (abs + 0xC8000000 + 0x00000FFF + ((abs >> 13) & 1)) >> 13;
 	}
 }
+
+half::operator float() const
+{
+	unsigned int fp32i;
+
+	int s = (fp16i >> 15) & 0x00000001;
+	int e = (fp16i >> 10) & 0x0000001F;
+	int m =  fp16i        & 0x000003FF;
+
+	if(e == 0)
+	{
+		if(m == 0)
+		{
+			fp32i = s << 31;
+
+			return (float&)fp32i;
+		}
+		else
+		{
+			while(!(m & 0x00000400))
+			{
+				m <<= 1;
+				e -=  1;
+			}
+
+			e += 1;
+			m &= ~0x00000400;
+		}
+	}
+
+	e = e + (127 - 15);
+	m = m << 13;
+
+	fp32i = (s << 31) | (e << 23) | m;
+
+	return (float&)fp32i;
+}
+
+half &half::operator=(half h)
+{
+	fp16i = h.fp16i;
+
+	return *this;
+}
+
+half &half::operator=(float f)
+{
+	*this = half(f);
+
+	return *this;
+}
+
+}  // namespace sw
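
half stores only a 16-bit pattern (1 sign, 5 exponent and 10 mantissa bits), so a
float-to-half round trip quantizes the value. A small sketch of the two
conversions defined above:

    #include "Half.hpp"
    #include <stdio.h>

    int main()
    {
        sw::half h(0.1f);        // Encode fp32 -> fp16, rounding to nearest
        float back = float(h);   // Decode fp16 -> fp32

        // 0.1f is not exactly representable in fp16; expect roughly
        // 0.0999755859375 back (error on the order of 2^-11 relative).
        printf("%.10f\n", back);

        return 0;
    }
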
diff --git a/src/System/Half.hpp b/src/System/Half.hpp
index e77224d..2831ca2 100644
--- a/src/System/Half.hpp
+++ b/src/System/Half.hpp
@@ -20,293 +20,294 @@
 #include <algorithm>
 #include <cmath>
 
-namespace sw
+namespace sw {
+
+class half
 {
-	class half
+public:
+	half() = default;
+	explicit half(float f);
+
+	operator float() const;
+
+	half &operator=(half h);
+	half &operator=(float f);
+
+private:
+	unsigned short fp16i;
+};
+
+inline half shortAsHalf(short s)
+{
+	union
 	{
-	public:
-		half() = default;
-		explicit half(float f);
+		half h;
+		short s;
+	} hs;
 
-		operator float() const;
+	hs.s = s;
 
-		half &operator=(half h);
-		half &operator=(float f);
+	return hs.h;
+}
 
-	private:
-		unsigned short fp16i;
-	};
+class RGB9E5
+{
+	unsigned int R : 9;
+	unsigned int G : 9;
+	unsigned int B : 9;
+	unsigned int E : 5;
 
-	inline half shortAsHalf(short s)
+public:
+	RGB9E5(float rgb[3]) : RGB9E5(rgb[0], rgb[1], rgb[2])
 	{
-		union
-		{
-			half h;
-			short s;
-		} hs;
-
-		hs.s = s;
-
-		return hs.h;
 	}
 
-	class RGB9E5
+	RGB9E5(float r, float g, float b)
 	{
-		unsigned int R : 9;
-		unsigned int G : 9;
-		unsigned int B : 9;
-		unsigned int E : 5;
+		// Vulkan 1.1.117 section 15.2.1 RGB to Shared Exponent Conversion
 
-	public:
-		RGB9E5(float rgb[3]) : RGB9E5(rgb[0], rgb[1], rgb[2])
-		{
-		}
+		// B is the exponent bias (15)
+		constexpr int g_sharedexp_bias = 15;
 
-		RGB9E5(float r, float g, float b)
-		{
-			// Vulkan 1.1.117 section 15.2.1 RGB to Shared Exponent Conversion
+		// N is the number of mantissa bits per component (9)
+		constexpr int g_sharedexp_mantissabits = 9;
 
-			// B is the exponent bias (15)
-			constexpr int g_sharedexp_bias = 15;
+		// Emax is the maximum allowed biased exponent value (31)
+		constexpr int g_sharedexp_maxexponent = 31;
 
-			// N is the number of mantissa bits per component (9)
-			constexpr int g_sharedexp_mantissabits = 9;
+		constexpr float g_sharedexp_max =
+			((static_cast<float>(1 << g_sharedexp_mantissabits) - 1) /
+				static_cast<float>(1 << g_sharedexp_mantissabits)) *
+			static_cast<float>(1 << (g_sharedexp_maxexponent - g_sharedexp_bias));
 
-			// Emax is the maximum allowed biased exponent value (31)
-			constexpr int g_sharedexp_maxexponent = 31;
+		// Clamp components to valid range. NaN becomes 0.
+		const float red_c =   std::min(!(r > 0) ? 0 : r, g_sharedexp_max);
+		const float green_c = std::min(!(g > 0) ? 0 : g, g_sharedexp_max);
+		const float blue_c =  std::min(!(b > 0) ? 0 : b, g_sharedexp_max);
 
-			constexpr float g_sharedexp_max =
-				((static_cast<float>(1 << g_sharedexp_mantissabits) - 1) /
-					static_cast<float>(1 << g_sharedexp_mantissabits)) *
-				static_cast<float>(1 << (g_sharedexp_maxexponent - g_sharedexp_bias));
+		// We're reducing the mantissa to 9 bits, so we must round up if the next
+		// bit is 1. In other words add 0.5 to the new mantissa's position and
+		// allow overflow into the exponent so we can scale correctly.
+		constexpr int half = 1 << (23 - g_sharedexp_mantissabits);
+		const float red_r = bit_cast<float>(bit_cast<int>(red_c) + half);
+		const float green_r = bit_cast<float>(bit_cast<int>(green_c) + half);
+		const float blue_r = bit_cast<float>(bit_cast<int>(blue_c) + half);
 
-			// Clamp components to valid range. NaN becomes 0.
-			const float red_c =   std::min(!(r > 0) ? 0 : r, g_sharedexp_max);
-			const float green_c = std::min(!(g > 0) ? 0 : g, g_sharedexp_max);
-			const float blue_c =  std::min(!(b > 0) ? 0 : b, g_sharedexp_max);
+		// The largest component determines the shared exponent. It can't be lower
+		// than 0 (after bias subtraction) so also limit to the minimum representable.
+		constexpr float min_s = 0.5f / (1 << g_sharedexp_bias);
+		float max_s = std::max(std::max(red_r, green_r), std::max(blue_r, min_s));
 
-			// We're reducing the mantissa to 9 bits, so we must round up if the next
-			// bit is 1. In other words add 0.5 to the new mantissa's position and
-			// allow overflow into the exponent so we can scale correctly.
-			constexpr int half = 1 << (23 - g_sharedexp_mantissabits);
-			const float red_r = bit_cast<float>(bit_cast<int>(red_c) + half);
-			const float green_r = bit_cast<float>(bit_cast<int>(green_c) + half);
-			const float blue_r = bit_cast<float>(bit_cast<int>(blue_c) + half);
+		// Obtain the reciprocal of the shared exponent by inverting the bits,
+		// and scale by the new mantissa's size. Note that the IEEE-754 single-precision
+		// format has an implicit leading 1, but this shared component format does not.
+		float scale = bit_cast<float>((bit_cast<int>(max_s) & 0x7F800000) ^ 0x7F800000) * (1 << (g_sharedexp_mantissabits - 2));
 
-			// The largest component determines the shared exponent. It can't be lower
-			// than 0 (after bias subtraction) so also limit to the mimimum representable.
-			constexpr float min_s = 0.5f / (1 << g_sharedexp_bias);
-			float max_s = std::max(std::max(red_r, green_r), std::max(blue_r, min_s));
+		R = static_cast<unsigned int>(round(red_c * scale));
+		G = static_cast<unsigned int>(round(green_c * scale));
+		B = static_cast<unsigned int>(round(blue_c * scale));
+		E = (bit_cast<unsigned int>(max_s) >> 23) - 127 + 15 + 1;
+	}
 
-			// Obtain the reciprocal of the shared exponent by inverting the bits,
-			// and scale by the new mantissa's size. Note that the IEEE-754 single-precision
-			// format has an implicit leading 1, but this shared component format does not.
-			float scale = bit_cast<float>((bit_cast<int>(max_s) & 0x7F800000) ^ 0x7F800000) * (1 << (g_sharedexp_mantissabits - 2));
-
-			R = static_cast<unsigned int>(round(red_c * scale));
-			G = static_cast<unsigned int>(round(green_c * scale));
-			B = static_cast<unsigned int>(round(blue_c * scale));
-			E = (bit_cast<unsigned int>(max_s) >> 23) - 127 + 15 + 1;
-		}
-
-		operator unsigned int() const
-		{
-			return *reinterpret_cast<const unsigned int*>(this);
-		}
-
-		void toRGB16F(half rgb[3]) const
-		{
-			constexpr int offset = 24;   // Exponent bias (15) + number of mantissa bits per component (9) = 24
-
-			const float factor = (1u << E) * (1.0f / (1 << offset));
-			rgb[0] = half(R * factor);
-			rgb[1] = half(G * factor);
-			rgb[2] = half(B * factor);
-		}
-	};
-
-	class R11G11B10F
+	operator unsigned int() const
 	{
-		unsigned int R : 11;
-		unsigned int G : 11;
-		unsigned int B : 10;
+		return *reinterpret_cast<const unsigned int*>(this);
+	}
 
-		static inline half float11ToFloat16(unsigned short fp11)
+	void toRGB16F(half rgb[3]) const
+	{
+		constexpr int offset = 24;   // Exponent bias (15) + number of mantissa bits per component (9) = 24
+
+		const float factor = (1u << E) * (1.0f / (1 << offset));
+		rgb[0] = half(R * factor);
+		rgb[1] = half(G * factor);
+		rgb[2] = half(B * factor);
+	}
+};
+
+class R11G11B10F
+{
+	unsigned int R : 11;
+	unsigned int G : 11;
+	unsigned int B : 10;
+
+	static inline half float11ToFloat16(unsigned short fp11)
+	{
+		return shortAsHalf(fp11 << 4);   // Sign bit 0
+	}
+
+	static inline half float10ToFloat16(unsigned short fp10)
+	{
+		return shortAsHalf(fp10 << 5);   // Sign bit 0
+	}
+
+	inline unsigned short float32ToFloat11(float fp32)
+	{
+		const unsigned int float32MantissaMask = 0x7FFFFF;
+		const unsigned int float32ExponentMask = 0x7F800000;
+		const unsigned int float32SignMask = 0x80000000;
+		const unsigned int float32ValueMask = ~float32SignMask;
+		const unsigned int float32ExponentFirstBit = 23;
+		const unsigned int float32ExponentBias = 127;
+
+		const unsigned short float11Max = 0x7BF;
+		const unsigned short float11MantissaMask = 0x3F;
+		const unsigned short float11ExponentMask = 0x7C0;
+		const unsigned short float11BitMask = 0x7FF;
+		const unsigned int float11ExponentBias = 14;
+
+		const unsigned int float32Maxfloat11 = 0x477E0000;
+		const unsigned int float32Minfloat11 = 0x38800000;
+
+		const unsigned int float32Bits = *reinterpret_cast<unsigned int*>(&fp32);
+		const bool float32Sign = (float32Bits & float32SignMask) == float32SignMask;
+
+		unsigned int float32Val = float32Bits & float32ValueMask;
+
+		if((float32Val & float32ExponentMask) == float32ExponentMask)
 		{
-			return shortAsHalf(fp11 << 4);   // Sign bit 0
-		}
-
-		static inline half float10ToFloat16(unsigned short fp10)
-		{
-			return shortAsHalf(fp10 << 5);   // Sign bit 0
-		}
-
-		inline unsigned short float32ToFloat11(float fp32)
-		{
-			const unsigned int float32MantissaMask = 0x7FFFFF;
-			const unsigned int float32ExponentMask = 0x7F800000;
-			const unsigned int float32SignMask = 0x80000000;
-			const unsigned int float32ValueMask = ~float32SignMask;
-			const unsigned int float32ExponentFirstBit = 23;
-			const unsigned int float32ExponentBias = 127;
-
-			const unsigned short float11Max = 0x7BF;
-			const unsigned short float11MantissaMask = 0x3F;
-			const unsigned short float11ExponentMask = 0x7C0;
-			const unsigned short float11BitMask = 0x7FF;
-			const unsigned int float11ExponentBias = 14;
-
-			const unsigned int float32Maxfloat11 = 0x477E0000;
-			const unsigned int float32Minfloat11 = 0x38800000;
-
-			const unsigned int float32Bits = *reinterpret_cast<unsigned int*>(&fp32);
-			const bool float32Sign = (float32Bits & float32SignMask) == float32SignMask;
-
-			unsigned int float32Val = float32Bits & float32ValueMask;
-
-			if((float32Val & float32ExponentMask) == float32ExponentMask)
+			// INF or NAN
+			if((float32Val & float32MantissaMask) != 0)
 			{
-				// INF or NAN
-				if((float32Val & float32MantissaMask) != 0)
-				{
-					return float11ExponentMask |
-						(((float32Val >> 17) | (float32Val >> 11) | (float32Val >> 6) | (float32Val)) &
-							float11MantissaMask);
-				}
-				else if(float32Sign)
-				{
-					// -INF is clamped to 0 since float11 is positive only
-					return 0;
-				}
-				else
-				{
-					return float11ExponentMask;
-				}
+				return float11ExponentMask |
+					(((float32Val >> 17) | (float32Val >> 11) | (float32Val >> 6) | (float32Val)) &
+						float11MantissaMask);
 			}
 			else if(float32Sign)
 			{
-				// float11 is positive only, so clamp to zero
+				// -INF is clamped to 0 since float11 is positive only
 				return 0;
 			}
-			else if(float32Val > float32Maxfloat11)
-			{
-				// The number is too large to be represented as a float11, set to max
-				return float11Max;
-			}
 			else
 			{
-				if(float32Val < float32Minfloat11)
-				{
-					// The number is too small to be represented as a normalized float11
-					// Convert it to a denormalized value.
-					const unsigned int shift = (float32ExponentBias - float11ExponentBias) -
-						(float32Val >> float32ExponentFirstBit);
-					float32Val =
-						((1 << float32ExponentFirstBit) | (float32Val & float32MantissaMask)) >> shift;
-				}
-				else
-				{
-					// Rebias the exponent to represent the value as a normalized float11
-					float32Val += 0xC8000000;
-				}
-
-				return ((float32Val + 0xFFFF + ((float32Val >> 17) & 1)) >> 17) & float11BitMask;
+				return float11ExponentMask;
 			}
 		}
-
-		inline unsigned short float32ToFloat10(float fp32)
+		else if(float32Sign)
 		{
-			const unsigned int float32MantissaMask = 0x7FFFFF;
-			const unsigned int float32ExponentMask = 0x7F800000;
-			const unsigned int float32SignMask = 0x80000000;
-			const unsigned int float32ValueMask = ~float32SignMask;
-			const unsigned int float32ExponentFirstBit = 23;
-			const unsigned int float32ExponentBias = 127;
-
-			const unsigned short float10Max = 0x3DF;
-			const unsigned short float10MantissaMask = 0x1F;
-			const unsigned short float10ExponentMask = 0x3E0;
-			const unsigned short float10BitMask = 0x3FF;
-			const unsigned int float10ExponentBias = 14;
-
-			const unsigned int float32Maxfloat10 = 0x477C0000;
-			const unsigned int float32Minfloat10 = 0x38800000;
-
-			const unsigned int float32Bits = *reinterpret_cast<unsigned int*>(&fp32);
-			const bool float32Sign = (float32Bits & float32SignMask) == float32SignMask;
-
-			unsigned int float32Val = float32Bits & float32ValueMask;
-
-			if((float32Val & float32ExponentMask) == float32ExponentMask)
+			// float11 is positive only, so clamp to zero
+			return 0;
+		}
+		else if(float32Val > float32Maxfloat11)
+		{
+			// The number is too large to be represented as a float11, set to max
+			return float11Max;
+		}
+		else
+		{
+			if(float32Val < float32Minfloat11)
 			{
-				// INF or NAN
-				if((float32Val & float32MantissaMask) != 0)
-				{
-					return float10ExponentMask |
-						(((float32Val >> 18) | (float32Val >> 13) | (float32Val >> 3) | (float32Val)) &
-							float10MantissaMask);
-				}
-				else if(float32Sign)
-				{
-					// -INF is clamped to 0 since float11 is positive only
-					return 0;
-				}
-				else
-				{
-					return float10ExponentMask;
-				}
+				// The number is too small to be represented as a normalized float11
+				// Convert it to a denormalized value.
+				const unsigned int shift = (float32ExponentBias - float11ExponentBias) -
+					(float32Val >> float32ExponentFirstBit);
+				float32Val =
+					((1 << float32ExponentFirstBit) | (float32Val & float32MantissaMask)) >> shift;
+			}
+			else
+			{
+				// Rebias the exponent to represent the value as a normalized float11
+				float32Val += 0xC8000000;
+			}
+
+			return ((float32Val + 0xFFFF + ((float32Val >> 17) & 1)) >> 17) & float11BitMask;
+		}
+	}
+
+	inline unsigned short float32ToFloat10(float fp32)
+	{
+		const unsigned int float32MantissaMask = 0x7FFFFF;
+		const unsigned int float32ExponentMask = 0x7F800000;
+		const unsigned int float32SignMask = 0x80000000;
+		const unsigned int float32ValueMask = ~float32SignMask;
+		const unsigned int float32ExponentFirstBit = 23;
+		const unsigned int float32ExponentBias = 127;
+
+		const unsigned short float10Max = 0x3DF;
+		const unsigned short float10MantissaMask = 0x1F;
+		const unsigned short float10ExponentMask = 0x3E0;
+		const unsigned short float10BitMask = 0x3FF;
+		const unsigned int float10ExponentBias = 14;
+
+		const unsigned int float32Maxfloat10 = 0x477C0000;
+		const unsigned int float32Minfloat10 = 0x38800000;
+
+		const unsigned int float32Bits = *reinterpret_cast<unsigned int*>(&fp32);
+		const bool float32Sign = (float32Bits & float32SignMask) == float32SignMask;
+
+		unsigned int float32Val = float32Bits & float32ValueMask;
+
+		if((float32Val & float32ExponentMask) == float32ExponentMask)
+		{
+			// INF or NAN
+			if((float32Val & float32MantissaMask) != 0)
+			{
+				return float10ExponentMask |
+					(((float32Val >> 18) | (float32Val >> 13) | (float32Val >> 3) | (float32Val)) &
+						float10MantissaMask);
 			}
 			else if(float32Sign)
 			{
-				// float10 is positive only, so clamp to zero
+				// -INF is clamped to 0 since float10 is positive only
 				return 0;
 			}
-			else if(float32Val > float32Maxfloat10)
-			{
-				// The number is too large to be represented as a float11, set to max
-				return float10Max;
-			}
 			else
 			{
-				if(float32Val < float32Minfloat10)
-				{
-					// The number is too small to be represented as a normalized float11
-					// Convert it to a denormalized value.
-					const unsigned int shift = (float32ExponentBias - float10ExponentBias) -
-						(float32Val >> float32ExponentFirstBit);
-					float32Val =
-						((1 << float32ExponentFirstBit) | (float32Val & float32MantissaMask)) >> shift;
-				}
-				else
-				{
-					// Rebias the exponent to represent the value as a normalized float11
-					float32Val += 0xC8000000;
-				}
-
-				return ((float32Val + 0x1FFFF + ((float32Val >> 18) & 1)) >> 18) & float10BitMask;
+				return float10ExponentMask;
 			}
 		}
-
-	public:
-		R11G11B10F(float rgb[3])
+		else if(float32Sign)
 		{
-			R = float32ToFloat11(rgb[0]);
-			G = float32ToFloat11(rgb[1]);
-			B = float32ToFloat10(rgb[2]);
+			// float10 is positive only, so clamp to zero
+			return 0;
 		}
-
-		operator unsigned int() const
+		else if(float32Val > float32Maxfloat10)
 		{
-			return *reinterpret_cast<const unsigned int*>(this);
+			// The number is too large to be represented as a float10, set to max
+			return float10Max;
 		}
-
-		void toRGB16F(half rgb[3]) const
+		else
 		{
-			rgb[0] = float11ToFloat16(R);
-			rgb[1] = float11ToFloat16(G);
-			rgb[2] = float10ToFloat16(B);
+			if(float32Val < float32Minfloat10)
+			{
+				// The number is too small to be represented as a normalized float10
+				// Convert it to a denormalized value.
+				const unsigned int shift = (float32ExponentBias - float10ExponentBias) -
+					(float32Val >> float32ExponentFirstBit);
+				float32Val =
+					((1 << float32ExponentFirstBit) | (float32Val & float32MantissaMask)) >> shift;
+			}
+			else
+			{
+				// Rebias the exponent to represent the value as a normalized float10
+				float32Val += 0xC8000000;
+			}
+
+			return ((float32Val + 0x1FFFF + ((float32Val >> 18) & 1)) >> 18) & float10BitMask;
 		}
-	};
-}
+	}
+
+public:
+	R11G11B10F(float rgb[3])
+	{
+		R = float32ToFloat11(rgb[0]);
+		G = float32ToFloat11(rgb[1]);
+		B = float32ToFloat10(rgb[2]);
+	}
+
+	operator unsigned int() const
+	{
+		return *reinterpret_cast<const unsigned int*>(this);
+	}
+
+	void toRGB16F(half rgb[3]) const
+	{
+		rgb[0] = float11ToFloat16(R);
+		rgb[1] = float11ToFloat16(G);
+		rgb[2] = float10ToFloat16(B);
+	}
+};
+
+}  // namespace sw
 
 #endif   // sw_Half_hpp
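
RGB9E5 packs three non-negative floats into 32 bits with a shared 5-bit exponent,
per the Vulkan rules cited in the constructor. A usage sketch of the packing and
unpacking paths:

    #include "Half.hpp"
    #include <stdio.h>

    int main()
    {
        // All three mantissas share the exponent of the largest component,
        // so components much smaller than the maximum lose precision.
        sw::RGB9E5 packed(1.0f, 0.5f, 0.03125f);
        unsigned int bits = packed;   // Raw 32-bit value, E in the top 5 bits

        sw::half rgb[3];
        packed.toRGB16F(rgb);   // Expand back to three half-precision values

        printf("0x%08X -> %f %f %f\n", bits, float(rgb[0]), float(rgb[1]), float(rgb[2]));

        return 0;
    }
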
diff --git a/src/System/Math.cpp b/src/System/Math.cpp
index 290d4ab..36d1df2 100644
--- a/src/System/Math.cpp
+++ b/src/System/Math.cpp
@@ -14,36 +14,37 @@
 
 #include "Math.hpp"
 
-namespace sw
+namespace sw {
+
+inline uint64_t FNV_1a(uint64_t hash, unsigned char data)
 {
-	inline uint64_t FNV_1a(uint64_t hash, unsigned char data)
-	{
-		return (hash ^ data) * 1099511628211;
-	}
-
-	uint64_t FNV_1a(const unsigned char *data, int size)
-	{
-		int64_t hash = 0xCBF29CE484222325;
-
-		for(int i = 0; i < size; i++)
-		{
-			hash = FNV_1a(hash, data[i]);
-		}
-
-		return hash;
-	}
-
-	unsigned char sRGB8toLinear8(unsigned char value)
-	{
-		static unsigned char sRGBtoLinearTable[256] = { 255 };
-		if(sRGBtoLinearTable[0] == 255)
-		{
-			for(int i = 0; i < 256; i++)
-			{
-				sRGBtoLinearTable[i] = static_cast<unsigned char>(sw::sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
-			}
-		}
-
-		return sRGBtoLinearTable[value];
-	}
+	return (hash ^ data) * 1099511628211;
 }
+
+uint64_t FNV_1a(const unsigned char *data, int size)
+{
+	uint64_t hash = 0xCBF29CE484222325;
+
+	for(int i = 0; i < size; i++)
+	{
+		hash = FNV_1a(hash, data[i]);
+	}
+
+	return hash;
+}
+
+unsigned char sRGB8toLinear8(unsigned char value)
+{
+	static unsigned char sRGBtoLinearTable[256] = { 255 };
+	if(sRGBtoLinearTable[0] == 255)
+	{
+		for(int i = 0; i < 256; i++)
+		{
+			sRGBtoLinearTable[i] = static_cast<unsigned char>(sw::sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
+		}
+	}
+
+	return sRGBtoLinearTable[value];
+}
+
+}  // namespace sw
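
FNV_1a() above is the 64-bit Fowler-Noll-Vo 1a hash: start from the offset basis
0xCBF29CE484222325, then XOR each byte in and multiply by the FNV prime
1099511628211 (0x100000001B3). A standalone sketch:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    uint64_t fnv1a(const unsigned char *data, int size)
    {
        uint64_t hash = 0xCBF29CE484222325u;   // 64-bit FNV offset basis

        for(int i = 0; i < size; i++)
        {
            hash = (hash ^ data[i]) * 1099511628211u;   // XOR the byte in, then multiply by the prime
        }

        return hash;
    }

    int main()
    {
        const char *key = "SwiftShader";
        printf("0x%016llX\n", (unsigned long long)fnv1a(
            reinterpret_cast<const unsigned char*>(key), (int)strlen(key)));
        return 0;
    }
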
diff --git a/src/System/Math.hpp b/src/System/Math.hpp
index efef5fd..f45f933 100644
--- a/src/System/Math.hpp
+++ b/src/System/Math.hpp
@@ -24,359 +24,360 @@
 	#include <intrin.h>
 #endif
 
-namespace sw
+namespace sw {
+
+using std::abs;
+
+#undef min
+#undef max
+
+template<class T>
+inline constexpr T max(T a, T b)
 {
-	using std::abs;
+	return a > b ? a : b;
+}
 
-	#undef min
-	#undef max
+template<class T>
+inline constexpr T min(T a, T b)
+{
+	return a < b ? a : b;
+}
 
-	template<class T>
-	inline T constexpr max(T a, T b)
+template<class T>
+inline constexpr T max(T a, T b, T c)
+{
+	return max(max(a, b), c);
+}
+
+template<class T>
+inline constexpr T min(T a, T b, T c)
+{
+	return min(min(a, b), c);
+}
+
+template<class T>
+inline constexpr T max(T a, T b, T c, T d)
+{
+	return max(max(a, b), max(c, d));
+}
+
+template<class T>
+inline constexpr T min(T a, T b, T c, T d)
+{
+	return min(min(a, b), min(c, d));
+}
+
+template <typename destType, typename sourceType>
+destType bit_cast(const sourceType &source)
+{
+	union
 	{
-		return a > b ? a : b;
+		sourceType s;
+		destType d;
+	} sd;
+	sd.s = source;
+	return sd.d;
+}
+
+inline int iround(float x)
+{
+	return (int)floor(x + 0.5f);
+//	return _mm_cvtss_si32(_mm_load_ss(&x));   // FIXME: Demands SSE support
+}
+
+inline int ifloor(float x)
+{
+	return (int)floor(x);
+}
+
+inline int ceilFix4(int x)
+{
+	return (x + 0xF) & 0xFFFFFFF0;
+}
+
+inline int ceilInt4(int x)
+{
+	return (x + 0xF) >> 4;
+}
+
+#define BITS(x)    ( \
+!!((x) & 0x80000000) + \
+!!((x) & 0xC0000000) + \
+!!((x) & 0xE0000000) + \
+!!((x) & 0xF0000000) + \
+!!((x) & 0xF8000000) + \
+!!((x) & 0xFC000000) + \
+!!((x) & 0xFE000000) + \
+!!((x) & 0xFF000000) + \
+!!((x) & 0xFF800000) + \
+!!((x) & 0xFFC00000) + \
+!!((x) & 0xFFE00000) + \
+!!((x) & 0xFFF00000) + \
+!!((x) & 0xFFF80000) + \
+!!((x) & 0xFFFC0000) + \
+!!((x) & 0xFFFE0000) + \
+!!((x) & 0xFFFF0000) + \
+!!((x) & 0xFFFF8000) + \
+!!((x) & 0xFFFFC000) + \
+!!((x) & 0xFFFFE000) + \
+!!((x) & 0xFFFFF000) + \
+!!((x) & 0xFFFFF800) + \
+!!((x) & 0xFFFFFC00) + \
+!!((x) & 0xFFFFFE00) + \
+!!((x) & 0xFFFFFF00) + \
+!!((x) & 0xFFFFFF80) + \
+!!((x) & 0xFFFFFFC0) + \
+!!((x) & 0xFFFFFFE0) + \
+!!((x) & 0xFFFFFFF0) + \
+!!((x) & 0xFFFFFFF8) + \
+!!((x) & 0xFFFFFFFC) + \
+!!((x) & 0xFFFFFFFE) + \
+!!((x) & 0xFFFFFFFF))
+
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+
+inline unsigned long log2i(int x)
+{
+	#if defined(_MSC_VER)
+		unsigned long y;
+		_BitScanReverse(&y, x);
+		return y;
+	#else
+		return 31 - __builtin_clz(x);
+	#endif
+}
+
+inline bool isPow2(int x)
+{
+	return (x & -x) == x;
+}
+
+template<class T>
+inline T clamp(T x, T a, T b)
+{
+	ASSERT(a <= b);
+	if(x < a) x = a;
+	if(x > b) x = b;
+
+	return x;
+}
+
+inline float clamp01(float x)
+{
+	return clamp(x, 0.0f, 1.0f);
+}
+
+// Bit-cast of a floating-point value into a two's complement integer representation.
+// This makes floating-point values comparable as integers.
+inline int32_t float_as_twos_complement(float f)
+{
+	// IEEE-754 floating-point numbers are sorted by magnitude in the same way as integers,
+	// except negative values are like one's complement integers. Convert them to two's complement.
+	int32_t i = bit_cast<int32_t>(f);
+	return (i < 0) ? (0x7FFFFFFFu - i) : i;
+}
+
+// 'Safe' clamping operation which always returns a value between min and max (inclusive).
+inline float clamp_s(float x, float min, float max)
+{
+	// NaN values can't be compared directly
+	if(float_as_twos_complement(x) < float_as_twos_complement(min)) x = min;
+	if(float_as_twos_complement(x) > float_as_twos_complement(max)) x = max;
+
+	return x;
+}
+
+inline int ceilPow2(int x)
+{
+	int i = 1;
+
+	while(i < x)
+	{
+		i <<= 1;
 	}
 
-	template<class T>
-	inline constexpr T min(T a, T b)
+	return i;
+}
+
+inline int floorDiv(int a, int b)
+{
+	return a / b + ((a % b) >> 31);
+}
+
+inline int floorMod(int a, int b)
+{
+	int r = a % b;
+	return r + ((r >> 31) & b);
+}
+
+inline int ceilDiv(int a, int b)
+{
+	return a / b - (-(a % b) >> 31);
+}
+
+inline int ceilMod(int a, int b)
+{
+	int r = a % b;
+	return r - ((-r >> 31) & b);
+}
+
+template<const int n>
+inline unsigned int unorm(float x)
+{
+	static const unsigned int max = 0xFFFFFFFF >> (32 - n);
+	static const float maxf = static_cast<float>(max);
+
+	if(x >= 1.0f)
 	{
-		return a < b ? a : b;
+		return max;
 	}
-
-	template<class T>
-	inline constexpr T max(T a, T b, T c)
+	else if(x <= 0.0f)
 	{
-		return max(max(a, b), c);
+		return 0;
 	}
-
-	template<class T>
-	inline constexpr T min(T a, T b, T c)
+	else
 	{
-		return min(min(a, b), c);
+		return static_cast<unsigned int>(maxf * x + 0.5f);
 	}
+}
 
-	template<class T>
-	inline constexpr T max(T a, T b, T c, T d)
+template<const int n>
+inline int snorm(float x)
+{
+	static const unsigned int min = 0x80000000 >> (32 - n);
+	static const unsigned int max = 0xFFFFFFFF >> (32 - n + 1);
+	static const float maxf = static_cast<float>(max);
+	static const unsigned int range = 0xFFFFFFFF >> (32 - n);
+
+	if(x >= 0.0f)
 	{
-		return max(max(a, b), max(c, d));
-	}
-
-	template<class T>
-	inline constexpr T min(T a, T b, T c, T d)
-	{
-		return min(min(a, b), min(c, d));
-	}
-
-	template <typename destType, typename sourceType>
-	destType bit_cast(const sourceType &source)
-	{
-		union
-		{
-			sourceType s;
-			destType d;
-		} sd;
-		sd.s = source;
-		return sd.d;
-	}
-
-	inline int iround(float x)
-	{
-		return (int)floor(x + 0.5f);
-	//	return _mm_cvtss_si32(_mm_load_ss(&x));   // FIXME: Demands SSE support
-	}
-
-	inline int ifloor(float x)
-	{
-		return (int)floor(x);
-	}
-
-	inline int ceilFix4(int x)
-	{
-		return (x + 0xF) & 0xFFFFFFF0;
-	}
-
-	inline int ceilInt4(int x)
-	{
-		return (x + 0xF) >> 4;
-	}
-
-	#define BITS(x)    ( \
-	!!((x) & 0x80000000) + \
-	!!((x) & 0xC0000000) + \
-	!!((x) & 0xE0000000) + \
-	!!((x) & 0xF0000000) + \
-	!!((x) & 0xF8000000) + \
-	!!((x) & 0xFC000000) + \
-	!!((x) & 0xFE000000) + \
-	!!((x) & 0xFF000000) + \
-	!!((x) & 0xFF800000) + \
-	!!((x) & 0xFFC00000) + \
-	!!((x) & 0xFFE00000) + \
-	!!((x) & 0xFFF00000) + \
-	!!((x) & 0xFFF80000) + \
-	!!((x) & 0xFFFC0000) + \
-	!!((x) & 0xFFFE0000) + \
-	!!((x) & 0xFFFF0000) + \
-	!!((x) & 0xFFFF8000) + \
-	!!((x) & 0xFFFFC000) + \
-	!!((x) & 0xFFFFE000) + \
-	!!((x) & 0xFFFFF000) + \
-	!!((x) & 0xFFFFF800) + \
-	!!((x) & 0xFFFFFC00) + \
-	!!((x) & 0xFFFFFE00) + \
-	!!((x) & 0xFFFFFF00) + \
-	!!((x) & 0xFFFFFF80) + \
-	!!((x) & 0xFFFFFFC0) + \
-	!!((x) & 0xFFFFFFE0) + \
-	!!((x) & 0xFFFFFFF0) + \
-	!!((x) & 0xFFFFFFF8) + \
-	!!((x) & 0xFFFFFFFC) + \
-	!!((x) & 0xFFFFFFFE) + \
-	!!((x) & 0xFFFFFFFF))
-
-	#define MAX(x, y) ((x) > (y) ? (x) : (y))
-	#define MIN(x, y) ((x) < (y) ? (x) : (y))
-
-	inline unsigned long log2i(int x)
-	{
-		#if defined(_MSC_VER)
-			unsigned long y;
-			_BitScanReverse(&y, x);
-			return y;
-		#else
-			return 31 - __builtin_clz(x);
-		#endif
-	}
-
-	inline bool isPow2(int x)
-	{
-		return (x & -x) == x;
-	}
-
-	template<class T>
-	inline T clamp(T x, T a, T b)
-	{
-		ASSERT(a <= b);
-		if(x < a) x = a;
-		if(x > b) x = b;
-
-		return x;
-	}
-
-	inline float clamp01(float x)
-	{
-		return clamp(x, 0.0f, 1.0f);
-	}
-
-	// Bit-cast of a floating-point value into a two's complement integer representation.
-	// This makes floating-point values comparable as integers.
-	inline int32_t float_as_twos_complement(float f)
-	{
-		// IEEE-754 floating-point numbers are sorted by magnitude in the same way as integers,
-		// except negative values are like one's complement integers. Convert them to two's complement.
-		int32_t i = bit_cast<int32_t>(f);
-		return (i < 0) ? (0x7FFFFFFFu - i) : i;
-	}
-
-	// 'Safe' clamping operation which always returns a value between min and max (inclusive).
-	inline float clamp_s(float x, float min, float max)
-	{
-		// NaN values can't be compared directly
-		if(float_as_twos_complement(x) < float_as_twos_complement(min)) x = min;
-		if(float_as_twos_complement(x) > float_as_twos_complement(max)) x = max;
-
-		return x;
-	}
-
-	inline int ceilPow2(int x)
-	{
-		int i = 1;
-
-		while(i < x)
-		{
-			i <<= 1;
-		}
-
-		return i;
-	}
-
-	inline int floorDiv(int a, int b)
-	{
-		return a / b + ((a % b) >> 31);
-	}
-
-	inline int floorMod(int a, int b)
-	{
-		int r = a % b;
-		return r + ((r >> 31) & b);
-	}
-
-	inline int ceilDiv(int a, int b)
-	{
-		return a / b - (-(a % b) >> 31);
-	}
-
-	inline int ceilMod(int a, int b)
-	{
-		int r = a % b;
-		return r - ((-r >> 31) & b);
-	}
-
-	template<const int n>
-	inline unsigned int unorm(float x)
-	{
-		static const unsigned int max = 0xFFFFFFFF >> (32 - n);
-		static const float maxf = static_cast<float>(max);
-
 		if(x >= 1.0f)
 		{
 			return max;
 		}
-		else if(x <= 0.0f)
+		else
 		{
-			return 0;
+			return static_cast<int>(maxf * x + 0.5f);
+		}
+	}
+	else
+	{
+		if(x <= -1.0f)
+		{
+			return min;
 		}
 		else
 		{
-			return static_cast<unsigned int>(maxf * x + 0.5f);
+			return static_cast<int>(maxf * x - 0.5f) & range;
 		}
 	}
+}
 
-	template<const int n>
-	inline int snorm(float x)
+template<const int n>
+inline unsigned int ucast(float x)
+{
+	static const unsigned int max = 0xFFFFFFFF >> (32 - n);
+	static const float maxf = static_cast<float>(max);
+
+	if(x >= maxf)
 	{
-		static const unsigned int min = 0x80000000 >> (32 - n);
-		static const unsigned int max = 0xFFFFFFFF >> (32 - n + 1);
-		static const float maxf = static_cast<float>(max);
-		static const unsigned int range = 0xFFFFFFFF >> (32 - n);
-
-		if(x >= 0.0f)
-		{
-			if(x >= 1.0f)
-			{
-				return max;
-			}
-			else
-			{
-				return static_cast<int>(maxf * x + 0.5f);
-			}
-		}
-		else
-		{
-			if(x <= -1.0f)
-			{
-				return min;
-			}
-			else
-			{
-				return static_cast<int>(maxf * x - 0.5f) & range;
-			}
-		}
+		return max;
 	}
-
-	template<const int n>
-	inline unsigned int ucast(float x)
+	else if(x <= 0.0f)
 	{
-		static const unsigned int max = 0xFFFFFFFF >> (32 - n);
-		static const float maxf = static_cast<float>(max);
+		return 0;
+	}
+	else
+	{
+		return static_cast<unsigned int>(x + 0.5f);
+	}
+}
 
+template<const int n>
+inline int scast(float x)
+{
+	static const unsigned int min = 0x80000000 >> (32 - n);
+	static const unsigned int max = 0xFFFFFFFF >> (32 - n + 1);
+	static const float maxf = static_cast<float>(max);
+	static const float minf = static_cast<float>(min);
+	static const unsigned int range = 0xFFFFFFFF >> (32 - n);
+
+	if(x > 0.0f)
+	{
 		if(x >= maxf)
 		{
 			return max;
 		}
-		else if(x <= 0.0f)
+		else
 		{
-			return 0;
+			return static_cast<int>(x + 0.5f);
+		}
+	}
+	else
+	{
+		if(x <= -minf)
+		{
+			return min;
 		}
 		else
 		{
-			return static_cast<unsigned int>(x + 0.5f);
+			return static_cast<int>(x - 0.5f) & range;
 		}
 	}
-
-	template<const int n>
-	inline int scast(float x)
-	{
-		static const unsigned int min = 0x80000000 >> (32 - n);
-		static const unsigned int max = 0xFFFFFFFF >> (32 - n + 1);
-		static const float maxf = static_cast<float>(max);
-		static const float minf = static_cast<float>(min);
-		static const unsigned int range = 0xFFFFFFFF >> (32 - n);
-
-		if(x > 0.0f)
-		{
-			if(x >= maxf)
-			{
-				return max;
-			}
-			else
-			{
-				return static_cast<int>(x + 0.5f);
-			}
-		}
-		else
-		{
-			if(x <= -minf)
-			{
-				return min;
-			}
-			else
-			{
-				return static_cast<int>(x - 0.5f) & range;
-			}
-		}
-	}
-
-	inline float sRGBtoLinear(float c)
-	{
-		if(c <= 0.04045f)
-		{
-			return c * 0.07739938f;   // 1.0f / 12.92f;
-		}
-		else
-		{
-			return powf((c + 0.055f) * 0.9478673f, 2.4f);   // 1.0f / 1.055f
-		}
-	}
-
-	inline float linearToSRGB(float c)
-	{
-		if(c <= 0.0031308f)
-		{
-			return c * 12.92f;
-		}
-		else
-		{
-			return 1.055f * powf(c, 0.4166667f) - 0.055f;   // 1.0f / 2.4f
-		}
-	}
-
-	unsigned char sRGB8toLinear8(unsigned char value);
-
-	uint64_t FNV_1a(const unsigned char *data, int size);   // Fowler-Noll-Vo hash function
-
-	// Round up to the next multiple of alignment
-	template<typename T>
-	inline T align(T value, unsigned int alignment)
-	{
-		return ((value + alignment - 1) / alignment) * alignment;
-	}
-
-	template<unsigned int alignment, typename T>
-	inline T align(T value)
-	{
-		return ((value + alignment - 1) / alignment) * alignment;
-	}
-
-	inline int clampToSignedInt(unsigned int x)
-	{
-		return static_cast<int>(min(x, 0x7FFFFFFFu));
-	}
-
-	// Convert floating value v to fixed point with p digits after the decimal point
-	constexpr int toFixedPoint(float v, int p) {
-		return static_cast<int>(v * (1 << p));
-	}
 }
 
+inline float sRGBtoLinear(float c)
+{
+	if(c <= 0.04045f)
+	{
+		return c * 0.07739938f;   // 1.0f / 12.92f;
+	}
+	else
+	{
+		return powf((c + 0.055f) * 0.9478673f, 2.4f);   // 1.0f / 1.055f
+	}
+}
+
+inline float linearToSRGB(float c)
+{
+	if(c <= 0.0031308f)
+	{
+		return c * 12.92f;
+	}
+	else
+	{
+		return 1.055f * powf(c, 0.4166667f) - 0.055f;   // 1.0f / 2.4f
+	}
+}
+
+unsigned char sRGB8toLinear8(unsigned char value);
+
+uint64_t FNV_1a(const unsigned char *data, int size);   // Fowler-Noll-Vo hash function
+
+// Round up to the next multiple of alignment
+template<typename T>
+inline T align(T value, unsigned int alignment)
+{
+	return ((value + alignment - 1) / alignment) * alignment;
+}
+
+template<unsigned int alignment, typename T>
+inline T align(T value)
+{
+	return ((value + alignment - 1) / alignment) * alignment;
+}
+
+inline int clampToSignedInt(unsigned int x)
+{
+	return static_cast<int>(min(x, 0x7FFFFFFFu));
+}
+
+// Convert floating value v to fixed point with p digits after the decimal point
+constexpr int toFixedPoint(float v, int p) {
+	return static_cast<int>(v * (1 << p));
+}
+
+}  // namespace sw
+
 #endif   // sw_Math_hpp
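
For context on the Math.hpp helpers above: floorDiv and floorMod compute
floored (round-toward-negative-infinity) division branchlessly, relying on
the arithmetic right shift of a negative remainder. A minimal standalone
sketch of the behavior, assuming b > 0 and an arithmetic right shift of
signed ints (true for the supported toolchains):

#include <cassert>
#include <cstdio>

static int floorDiv(int a, int b) { return a / b + ((a % b) >> 31); }
static int floorMod(int a, int b) { int r = a % b; return r + ((r >> 31) & b); }

int main()
{
	assert(floorDiv(-7, 2) == -4 && floorMod(-7, 2) == 1);   // plain '/' and '%' give -3 and -1
	assert(floorDiv(7, 2) == 3 && floorMod(7, 2) == 1);      // matches '/' and '%' for non-negative a
	assert(floorDiv(-7, 2) * 2 + floorMod(-7, 2) == -7);     // invariant: b * div + mod == a
	puts("ok");
	return 0;
}
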
diff --git a/src/System/Memory.cpp b/src/System/Memory.cpp
index e045254..e637a55 100644
--- a/src/System/Memory.cpp
+++ b/src/System/Memory.cpp
@@ -40,10 +40,10 @@
 #define __x86__
 #endif
 
-namespace sw
-{
-namespace
-{
+namespace sw {
+
+namespace {
+
 struct Allocation
 {
 //	size_t bytes;
@@ -86,6 +86,7 @@
 		return aligned;
 	#endif
 }
+
 }  // anonymous namespace
 
 size_t memoryPageSize()
@@ -160,4 +161,5 @@
 		}
 	#endif
 }
-}
+
+}  // namespace sw
diff --git a/src/System/Memory.hpp b/src/System/Memory.hpp
index 0e8d188..d1e7871 100644
--- a/src/System/Memory.hpp
+++ b/src/System/Memory.hpp
@@ -18,8 +18,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-namespace sw
-{
+namespace sw {
+
 size_t memoryPageSize();
 
 void *allocate(size_t bytes, size_t alignment = 16);
@@ -27,6 +27,7 @@
 
 void clear(uint16_t *memory, uint16_t element, size_t count);
 void clear(uint32_t *memory, uint32_t element, size_t count);
-}
+
+}  // namespace sw
 
 #endif   // Memory_hpp
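
A small usage sketch of the sw memory interface declared above (allocate and
deallocate are this header's names; the 4 KiB size and 64-byte alignment are
illustrative choices only):

#include "System/Memory.hpp"

void scratchExample()
{
	void *scratch = sw::allocate(4096, 64);   // alignment defaults to 16 when omitted
	// ... fill and use the buffer ...
	sw::deallocate(scratch);
}
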
diff --git a/src/System/Socket.cpp b/src/System/Socket.cpp
index b098031..1989a94 100644
--- a/src/System/Socket.cpp
+++ b/src/System/Socket.cpp
@@ -23,88 +23,89 @@
 	#include <sys/select.h>
 #endif
 
-namespace sw
+namespace sw {
+
+Socket::Socket(SOCKET socket) : socket(socket)
 {
-	Socket::Socket(SOCKET socket) : socket(socket)
+}
+
+Socket::Socket(const char *address, const char *port)
+{
+	#if defined(_WIN32)
+		socket = INVALID_SOCKET;
+	#else
+		socket = -1;
+	#endif
+
+	addrinfo hints = {};
+	hints.ai_family = AF_INET;
+	hints.ai_socktype = SOCK_STREAM;
+	hints.ai_protocol = IPPROTO_TCP;
+	hints.ai_flags = AI_PASSIVE;
+
+	addrinfo *info = 0;
+	getaddrinfo(address, port, &hints, &info);
+
+	if(info)
 	{
-	}
-
-	Socket::Socket(const char *address, const char *port)
-	{
-		#if defined(_WIN32)
-			socket = INVALID_SOCKET;
-		#else
-			socket = -1;
-		#endif
-
-		addrinfo hints = {};
-		hints.ai_family = AF_INET;
-		hints.ai_socktype = SOCK_STREAM;
-		hints.ai_protocol = IPPROTO_TCP;
-		hints.ai_flags = AI_PASSIVE;
-
-		addrinfo *info = 0;
-		getaddrinfo(address, port, &hints, &info);
-
-		if(info)
-		{
-			socket = ::socket(info->ai_family, info->ai_socktype, info->ai_protocol);
-			bind(socket, info->ai_addr, (int)info->ai_addrlen);
-		}
-	}
-
-	Socket::~Socket()
-	{
-		#if defined(_WIN32)
-			closesocket(socket);
-		#else
-			close(socket);
-		#endif
-	}
-
-	void Socket::listen(int backlog)
-	{
-		::listen(socket, backlog);
-	}
-
-	bool Socket::select(int us)
-	{
-		fd_set sockets;
-		FD_ZERO(&sockets);
-		FD_SET(socket, &sockets);
-
-		timeval timeout = {us / 1000000, us % 1000000};
-
-		return ::select(FD_SETSIZE, &sockets, 0, 0, &timeout) >= 1;
-	}
-
-	Socket *Socket::accept()
-	{
-		return new Socket(::accept(socket, 0, 0));
-	}
-
-	int Socket::receive(char *buffer, int length)
-	{
-		return recv(socket, buffer, length, 0);
-	}
-
-	void Socket::send(const char *buffer, int length)
-	{
-		::send(socket, buffer, length, 0);
-	}
-
-	void Socket::startup()
-	{
-		#if defined(_WIN32)
-			WSADATA winsockData;
-			WSAStartup(MAKEWORD(2, 2), &winsockData);
-		#endif
-	}
-
-	void Socket::cleanup()
-	{
-		#if defined(_WIN32)
-			WSACleanup();
-		#endif
+		socket = ::socket(info->ai_family, info->ai_socktype, info->ai_protocol);
+		bind(socket, info->ai_addr, (int)info->ai_addrlen);
 	}
 }
+
+Socket::~Socket()
+{
+	#if defined(_WIN32)
+		closesocket(socket);
+	#else
+		close(socket);
+	#endif
+}
+
+void Socket::listen(int backlog)
+{
+	::listen(socket, backlog);
+}
+
+bool Socket::select(int us)
+{
+	fd_set sockets;
+	FD_ZERO(&sockets);
+	FD_SET(socket, &sockets);
+
+	timeval timeout = {us / 1000000, us % 1000000};
+
+	return ::select(FD_SETSIZE, &sockets, 0, 0, &timeout) >= 1;
+}
+
+Socket *Socket::accept()
+{
+	return new Socket(::accept(socket, 0, 0));
+}
+
+int Socket::receive(char *buffer, int length)
+{
+	return recv(socket, buffer, length, 0);
+}
+
+void Socket::send(const char *buffer, int length)
+{
+	::send(socket, buffer, length, 0);
+}
+
+void Socket::startup()
+{
+	#if defined(_WIN32)
+		WSADATA winsockData;
+		WSAStartup(MAKEWORD(2, 2), &winsockData);
+	#endif
+}
+
+void Socket::cleanup()
+{
+	#if defined(_WIN32)
+		WSACleanup();
+	#endif
+}
+
+}  // namespace sw
diff --git a/src/System/Socket.hpp b/src/System/Socket.hpp
index b6b9abd..efb5062 100644
--- a/src/System/Socket.hpp
+++ b/src/System/Socket.hpp
@@ -22,28 +22,29 @@
 	typedef int SOCKET;
 #endif
 
-namespace sw
+namespace sw {
+
+class Socket
 {
-	class Socket
-	{
-	public:
-		Socket(SOCKET socket);
-		Socket(const char *address, const char *port);
-		~Socket();
+public:
+	Socket(SOCKET socket);
+	Socket(const char *address, const char *port);
+	~Socket();
 
-		void listen(int backlog = 1);
-		bool select(int us);
-		Socket *accept();
-		
-		int receive(char *buffer, int length);
-		void send(const char *buffer, int length);
+	void listen(int backlog = 1);
+	bool select(int us);
+	Socket *accept();
 
-		static void startup();
-		static void cleanup();
+	int receive(char *buffer, int length);
+	void send(const char *buffer, int length);
 
-	private:
-		SOCKET socket;
-	};
+	static void startup();
+	static void cleanup();
+
+private:
+	SOCKET socket;
+};
+
 }
 
 #endif   // sw_Socket_hpp
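
A hedged usage sketch of the sw::Socket wrapper above (the port number and
the echo logic are hypothetical; the calls themselves are the ones declared
in this header):

#include "System/Socket.hpp"

void pollOnce()
{
	sw::Socket::startup();                    // WSAStartup on Windows, no-op elsewhere
	sw::Socket server("localhost", "19937");  // resolved and bound via getaddrinfo
	server.listen();

	if(server.select(100000))                 // wait up to 100 ms for a connection
	{
		sw::Socket *client = server.accept();
		char buffer[256];
		int n = client->receive(buffer, sizeof(buffer));
		if(n > 0)
		{
			client->send(buffer, n);          // echo the payload back
		}
		delete client;
	}

	sw::Socket::cleanup();
}
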
diff --git a/src/System/Synchronization.hpp b/src/System/Synchronization.hpp
index 5af65e2..1a8c585 100644
--- a/src/System/Synchronization.hpp
+++ b/src/System/Synchronization.hpp
@@ -28,8 +28,7 @@
 #include <mutex>
 #include <queue>
 
-namespace sw
-{
+namespace sw {
 
 // TaskEvents is an interface for notifying when tasks begin and end.
 // Tasks can be nested and/or overlapping.
@@ -191,6 +190,6 @@
 	return queue.size();
 }
 
-} // namespace sw
+}  // namespace sw
 
 #endif // sw_Synchronization_hpp
diff --git a/src/System/Timer.cpp b/src/System/Timer.cpp
index db0ba4a..7bd7ba5 100644
--- a/src/System/Timer.cpp
+++ b/src/System/Timer.cpp
@@ -35,65 +35,66 @@
 	#endif
 #endif
 
-namespace sw
+namespace sw {
+
+Timer::Timer()
 {
-	Timer::Timer()
-	{
-	}
-
-	Timer::~Timer()
-	{
-	}
-
-	double Timer::seconds()
-	{
-		#if defined(_WIN32)
-			return (double)counter() / (double)frequency();
-		#else
-			timeval t;
-			gettimeofday(&t, 0);
-			return (double)t.tv_sec + (double)t.tv_usec * 1.0e-6;
-		#endif
-	}
-
-	int64_t Timer::ticks()
-	{
-		#if defined(_WIN32)
-			#if defined(_M_ARM64)
-				return _ReadStatusReg(ARM64_PMCCNTR_EL0);
-			#else
-				return __rdtsc();
-			#endif
-		#elif defined(__i386__) || defined(__x86_64__)
-			int64_t tsc;
-			__asm volatile("rdtsc": "=A" (tsc));
-			return tsc;
-		#else
-			return 0;
-		#endif
-	}
-
-	int64_t Timer::counter()
-	{
-		#if defined(_WIN32)
-			int64_t counter;
-			QueryPerformanceCounter((LARGE_INTEGER*)&counter);
-			return counter;
-		#else
-			timeval t;
-			gettimeofday(&t, 0);
-			return t.tv_sec * 1000000 + t.tv_usec;
-		#endif
-	}
-
-	int64_t Timer::frequency()
-	{
-		#if defined(_WIN32)
-			int64_t frequency;
-			QueryPerformanceFrequency((LARGE_INTEGER*)&frequency);
-			return frequency;
-		#else
-			return 1000000;   // gettimeofday uses microsecond resolution
-		#endif
-	}
 }
+
+Timer::~Timer()
+{
+}
+
+double Timer::seconds()
+{
+	#if defined(_WIN32)
+		return (double)counter() / (double)frequency();
+	#else
+		timeval t;
+		gettimeofday(&t, 0);
+		return (double)t.tv_sec + (double)t.tv_usec * 1.0e-6;
+	#endif
+}
+
+int64_t Timer::ticks()
+{
+	#if defined(_WIN32)
+		#if defined(_M_ARM64)
+			return _ReadStatusReg(ARM64_PMCCNTR_EL0);
+		#else
+			return __rdtsc();
+		#endif
+	#elif defined(__i386__) || defined(__x86_64__)
+		int64_t tsc;
+		__asm volatile("rdtsc": "=A" (tsc));
+		return tsc;
+	#else
+		return 0;
+	#endif
+}
+
+int64_t Timer::counter()
+{
+	#if defined(_WIN32)
+		int64_t counter;
+		QueryPerformanceCounter((LARGE_INTEGER*)&counter);
+		return counter;
+	#else
+		timeval t;
+		gettimeofday(&t, 0);
+		return t.tv_sec * 1000000 + t.tv_usec;
+	#endif
+}
+
+int64_t Timer::frequency()
+{
+	#if defined(_WIN32)
+		int64_t frequency;
+		QueryPerformanceFrequency((LARGE_INTEGER*)&frequency);
+		return frequency;
+	#else
+		return 1000000;   // gettimeofday uses microsecond resolution
+	#endif
+}
+
+}  // namespace sw
diff --git a/src/System/Timer.hpp b/src/System/Timer.hpp
index 977c877..a2f687b 100644
--- a/src/System/Timer.hpp
+++ b/src/System/Timer.hpp
@@ -17,21 +17,22 @@
 
 #include "Types.hpp"
 
-namespace sw
+namespace sw {
+
+class Timer
 {
-	class Timer
-	{
-	public:
-		Timer();
+public:
+	Timer();
 
-		~Timer();
+	~Timer();
 
-		static double seconds();
-		static int64_t ticks();
+	static double seconds();
+	static int64_t ticks();
 
-		static int64_t counter();
-		static int64_t frequency();
-	};
-}
+	static int64_t counter();
+	static int64_t frequency();
+};
+
+}  // namespace sw
 
 #endif   // sw_Timer_hpp
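
The static Timer interface above is typically used by differencing
seconds(); a minimal sketch, with the measured workload elided:

#include "System/Timer.hpp"

#include <cstdio>

void timeWorkload()
{
	double start = sw::Timer::seconds();
	// ... workload under measurement ...
	double elapsed = sw::Timer::seconds() - start;
	printf("elapsed: %.6f s\n", elapsed);   // QueryPerformanceCounter or gettimeofday based
}
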
diff --git a/src/System/Types.hpp b/src/System/Types.hpp
index 70c084d..734efb0 100644
--- a/src/System/Types.hpp
+++ b/src/System/Types.hpp
@@ -42,102 +42,103 @@
 	#define ALIGN(bytes, type) type __attribute__((aligned(bytes)))
 #endif
 
-namespace sw
+namespace sw {
+
+typedef ALIGN(1, uint8_t) byte;
+typedef ALIGN(2, uint16_t) word;
+typedef ALIGN(4, uint32_t) dword;
+typedef ALIGN(8, uint64_t) qword;
+typedef ALIGN(16, uint64_t) qword2[2];
+typedef ALIGN(4, uint8_t) byte4[4];
+typedef ALIGN(8, uint8_t) byte8[8];
+typedef ALIGN(16, uint8_t) byte16[16];
+typedef ALIGN(8, uint16_t) word4[4];
+typedef ALIGN(8, uint32_t) dword2[2];
+typedef ALIGN(16, uint32_t) dword4[4];
+typedef ALIGN(16, uint64_t) xword[2];
+
+typedef ALIGN(1, int8_t) sbyte;
+typedef ALIGN(4, int8_t) sbyte4[4];
+typedef ALIGN(8, int8_t) sbyte8[8];
+typedef ALIGN(16, int8_t) sbyte16[16];
+typedef ALIGN(8, short) short4[4];
+typedef ALIGN(8, unsigned short) ushort4[4];
+typedef ALIGN(16, short) short8[8];
+typedef ALIGN(16, unsigned short) ushort8[8];
+typedef ALIGN(8, int) int2[2];
+typedef ALIGN(8, unsigned int) uint2[2];
+typedef ALIGN(16, unsigned int) uint4[4];
+
+typedef ALIGN(8, float) float2[2];
+
+ALIGN(16, struct int4
 {
-	typedef ALIGN(1, uint8_t) byte;
-	typedef ALIGN(2, uint16_t) word;
-	typedef ALIGN(4, uint32_t) dword;
-	typedef ALIGN(8, uint64_t) qword;
-	typedef ALIGN(16, uint64_t) qword2[2];
-	typedef ALIGN(4, uint8_t) byte4[4];
-	typedef ALIGN(8, uint8_t) byte8[8];
-	typedef ALIGN(16, uint8_t) byte16[16];
-	typedef ALIGN(8, uint16_t) word4[4];
-	typedef ALIGN(8, uint32_t) dword2[2];
-	typedef ALIGN(16, uint32_t) dword4[4];
-	typedef ALIGN(16, uint64_t) xword[2];
+	int x;
+	int y;
+	int z;
+	int w;
 
-	typedef ALIGN(1, int8_t) sbyte;
-	typedef ALIGN(4, int8_t) sbyte4[4];
-	typedef ALIGN(8, int8_t) sbyte8[8];
-	typedef ALIGN(16, int8_t) sbyte16[16];
-	typedef ALIGN(8, short) short4[4];
-	typedef ALIGN(8, unsigned short) ushort4[4];
-	typedef ALIGN(16, short) short8[8];
-	typedef ALIGN(16, unsigned short) ushort8[8];
-	typedef ALIGN(8, int) int2[2];
-	typedef ALIGN(8, unsigned int) uint2[2];
-	typedef ALIGN(16, unsigned int) uint4[4];
-
-	typedef ALIGN(8, float) float2[2];
-
-	ALIGN(16, struct int4
+	int &operator[](int i)
 	{
-		int x;
-		int y;
-		int z;
-		int w;
-
-		int &operator[](int i)
-		{
-			return (&x)[i];
-		}
-
-		const int &operator[](int i) const
-		{
-			return (&x)[i];
-		}
-
-		bool operator!=(const int4 &rhs)
-		{
-			return x != rhs.x || y != rhs.y || z != rhs.z || w != rhs.w;
-		}
-
-		bool operator==(const int4 &rhs)
-		{
-			return x == rhs.x && y == rhs.y && z == rhs.z && w == rhs.w;
-		}
-	});
-
-	ALIGN(16, struct float4
-	{
-		float x;
-		float y;
-		float z;
-		float w;
-
-		float &operator[](int i)
-		{
-			return (&x)[i];
-		}
-
-		const float &operator[](int i) const
-		{
-			return (&x)[i];
-		}
-
-		bool operator!=(const float4 &rhs)
-		{
-			return x != rhs.x || y != rhs.y || z != rhs.z || w != rhs.w;
-		}
-
-		bool operator==(const float4 &rhs)
-		{
-			return x == rhs.x && y == rhs.y && z == rhs.z && w == rhs.w;
-		}
-	});
-
-	inline constexpr float4 vector(float x, float y, float z, float w)
-	{
-		return { x, y, z, w };
+		return (&x)[i];
 	}
 
-	inline constexpr float4 replicate(float f)
+	const int &operator[](int i) const
 	{
-		return vector(f, f, f, f);
+		return (&x)[i];
 	}
 
-	#define OFFSET(s,m) (int)(size_t)&reinterpret_cast<const volatile char&>((((s*)0)->m))
+	bool operator!=(const int4 &rhs)
+	{
+		return x != rhs.x || y != rhs.y || z != rhs.z || w != rhs.w;
+	}
+
+	bool operator==(const int4 &rhs)
+	{
+		return x == rhs.x && y == rhs.y && z == rhs.z && w == rhs.w;
+	}
+});
+
+ALIGN(16, struct float4
+{
+	float x;
+	float y;
+	float z;
+	float w;
+
+	float &operator[](int i)
+	{
+		return (&x)[i];
+	}
+
+	const float &operator[](int i) const
+	{
+		return (&x)[i];
+	}
+
+	bool operator!=(const float4 &rhs)
+	{
+		return x != rhs.x || y != rhs.y || z != rhs.z || w != rhs.w;
+	}
+
+	bool operator==(const float4 &rhs)
+	{
+		return x == rhs.x && y == rhs.y && z == rhs.z && w == rhs.w;
+	}
+});
+
+inline constexpr float4 vector(float x, float y, float z, float w)
+{
+	return { x, y, z, w };
 }
 
+inline constexpr float4 replicate(float f)
+{
+	return vector(f, f, f, f);
+}
+
+#define OFFSET(s,m) (int)(size_t)&reinterpret_cast<const volatile char&>((((s*)0)->m))
+
+}  // namespace sw
+
 #endif   // sw_Types_hpp
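
A sketch of the aggregate vector types above in use (lerp4 is an
illustrative helper, not taken from the source):

#include "System/Types.hpp"

sw::float4 lerp4(const sw::float4 &a, const sw::float4 &b, float t)
{
	sw::float4 r = sw::replicate(0.0f);
	for(int i = 0; i < 4; i++)
	{
		r[i] = a[i] + (b[i] - a[i]) * t;   // operator[] indexes x, y, z, w contiguously
	}
	return r;
}
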
diff --git a/src/Vulkan/Debug/EventListener.hpp b/src/Vulkan/Debug/EventListener.hpp
index 6d96066..8de0dfd 100644
--- a/src/Vulkan/Debug/EventListener.hpp
+++ b/src/Vulkan/Debug/EventListener.hpp
@@ -15,10 +15,8 @@
 #ifndef VK_DEBUG_EVENT_LISTENER_HPP_
 #define VK_DEBUG_EVENT_LISTENER_HPP_
 
-namespace vk
-{
-namespace dbg
-{
+namespace vk {
+namespace dbg {
 
 class Thread;
 
diff --git a/src/Vulkan/Debug/File.cpp b/src/Vulkan/Debug/File.cpp
index 2185a0b..8fc0c54 100644
--- a/src/Vulkan/Debug/File.cpp
+++ b/src/Vulkan/Debug/File.cpp
@@ -17,8 +17,7 @@
 #include <mutex>
 #include <unordered_set>
 
-namespace
-{
+namespace {
 
 ////////////////////////////////////////////////////////////////////////////////
 // FileBase
@@ -104,10 +103,8 @@
 
 }  // anonymous namespace
 
-namespace vk
-{
-namespace dbg
-{
+namespace vk {
+namespace dbg {
 
 std::shared_ptr<File> File::createVirtual(ID id, std::string name, std::string source)
 {
diff --git a/src/Vulkan/Debug/File.hpp b/src/Vulkan/Debug/File.hpp
index d67316d..9472c6b 100644
--- a/src/Vulkan/Debug/File.hpp
+++ b/src/Vulkan/Debug/File.hpp
@@ -20,10 +20,8 @@
 #include <memory>
 #include <string>
 
-namespace vk
-{
-namespace dbg
-{
+namespace vk {
+namespace dbg {
 
 class File
 {
diff --git a/src/Vulkan/Debug/ID.hpp b/src/Vulkan/Debug/ID.hpp
index 709baf5..29ebd2e 100644
--- a/src/Vulkan/Debug/ID.hpp
+++ b/src/Vulkan/Debug/ID.hpp
@@ -17,10 +17,8 @@
 
 #include <functional>  // std::hash
 
-namespace vk
-{
-namespace dbg
-{
+namespace vk {
+namespace dbg {
 
 // ID is a strongly-typed identifier backed by a int.
 // The template parameter T is not actually used by the implementation of
@@ -51,8 +49,8 @@
 }  // namespace dbg
 }  // namespace vk
 
-namespace std
-{
+namespace std {
+
 // std::hash implementation for vk::dbg::ID<T>
 template <typename T>
 struct hash<vk::dbg::ID<T> >
@@ -62,6 +60,7 @@
 		return std::hash<int>()(id.value());
 	}
 };
+
 }  // namespace std
 
 #endif  // VK_DEBUG_ID_HPP_
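
The std::hash specialization above lets the strongly-typed IDs key standard
containers directly; a sketch, assuming ID is constructible from an int and
equality-comparable (it is documented as "backed by a int"):

#include "Vulkan/Debug/ID.hpp"
#include "Vulkan/Debug/Thread.hpp"

#include <string>
#include <unordered_map>

void nameThreads()
{
	using ThreadID = vk::dbg::ID<vk::dbg::Thread>;
	std::unordered_map<ThreadID, std::string> names;   // hashes via std::hash<int> on id.value()
	names[ThreadID(1)] = "main";
}
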
diff --git a/src/Vulkan/Debug/Location.hpp b/src/Vulkan/Debug/Location.hpp
index fbc66dc..9b3d883 100644
--- a/src/Vulkan/Debug/Location.hpp
+++ b/src/Vulkan/Debug/Location.hpp
@@ -17,10 +17,8 @@
 
 #include <memory>
 
-namespace vk
-{
-namespace dbg
-{
+namespace vk {
+namespace dbg {
 
 class File;
 
diff --git a/src/Vulkan/Debug/Thread.cpp b/src/Vulkan/Debug/Thread.cpp
index b61c5d9..b2c0088 100644
--- a/src/Vulkan/Debug/Thread.cpp
+++ b/src/Vulkan/Debug/Thread.cpp
@@ -18,10 +18,8 @@
 #include "EventListener.hpp"
 #include "File.hpp"
 
-namespace vk
-{
-namespace dbg
-{
+namespace vk {
+namespace dbg {
 
 Thread::Thread(ID id, Context* ctx) :
     id(id),
diff --git a/src/Vulkan/Debug/Thread.hpp b/src/Vulkan/Debug/Thread.hpp
index 8199d67..a2b63a9 100644
--- a/src/Vulkan/Debug/Thread.hpp
+++ b/src/Vulkan/Debug/Thread.hpp
@@ -25,10 +25,8 @@
 #include <string>
 #include <vector>
 
-namespace vk
-{
-namespace dbg
-{
+namespace vk {
+namespace dbg {
 
 class File;
 class VariableContainer;
diff --git a/src/Vulkan/Debug/Type.cpp b/src/Vulkan/Debug/Type.cpp
index fd9643e..f4049c3 100644
--- a/src/Vulkan/Debug/Type.cpp
+++ b/src/Vulkan/Debug/Type.cpp
@@ -14,10 +14,8 @@
 
 #include "Type.hpp"
 
-namespace vk
-{
-namespace dbg
-{
+namespace vk {
+namespace dbg {
 
 // clang-format off
 std::shared_ptr<Type> TypeOf<bool>::get()              { static auto ty = std::make_shared<Type>(Kind::Bool); return ty; }
diff --git a/src/Vulkan/Debug/Type.hpp b/src/Vulkan/Debug/Type.hpp
index d73fd72..07ceaf1 100644
--- a/src/Vulkan/Debug/Type.hpp
+++ b/src/Vulkan/Debug/Type.hpp
@@ -20,10 +20,8 @@
 #include <cstdint>
 #include <string>
 
-namespace vk
-{
-namespace dbg
-{
+namespace vk {
+namespace dbg {
 
 class VariableContainer;
 class Value;
diff --git a/src/Vulkan/Debug/Value.cpp b/src/Vulkan/Debug/Value.cpp
index db7f1ca..02ad6f9 100644
--- a/src/Vulkan/Debug/Value.cpp
+++ b/src/Vulkan/Debug/Value.cpp
@@ -16,10 +16,8 @@
 #include "Value.hpp"
 #include "Variable.hpp"
 
-namespace vk
-{
-namespace dbg
-{
+namespace vk {
+namespace dbg {
 
 const FormatFlags FormatFlags::Default = {
 	"[",                   // listPrefix
diff --git a/src/Vulkan/Debug/Value.hpp b/src/Vulkan/Debug/Value.hpp
index c45b33c..69353d1 100644
--- a/src/Vulkan/Debug/Value.hpp
+++ b/src/Vulkan/Debug/Value.hpp
@@ -18,10 +18,8 @@
 #include <memory>
 #include <string>
 
-namespace vk
-{
-namespace dbg
-{
+namespace vk {
+namespace dbg {
 
 class Type;
 
diff --git a/src/Vulkan/Debug/Variable.hpp b/src/Vulkan/Debug/Variable.hpp
index 15b50b3..f24ca47 100644
--- a/src/Vulkan/Debug/Variable.hpp
+++ b/src/Vulkan/Debug/Variable.hpp
@@ -25,10 +25,8 @@
 #include <unordered_map>
 #include <vector>
 
-namespace vk
-{
-namespace dbg
-{
+namespace vk {
+namespace dbg {
 
 // Variable is a named value.
 struct Variable
diff --git a/src/Vulkan/Debug/WeakMap.hpp b/src/Vulkan/Debug/WeakMap.hpp
index 984ca05..0019c16 100644
--- a/src/Vulkan/Debug/WeakMap.hpp
+++ b/src/Vulkan/Debug/WeakMap.hpp
@@ -18,10 +18,8 @@
 #include <map>
 #include <memory>
 
-namespace vk
-{
-namespace dbg
-{
+namespace vk {
+namespace dbg {
 
 // WeakMap is an associative container of keys of type K to values of type
 // std::weak_ptr<V>.
diff --git a/src/Vulkan/VkBuffer.cpp b/src/Vulkan/VkBuffer.cpp
index f8d1211..ce14bf3 100644
--- a/src/Vulkan/VkBuffer.cpp
+++ b/src/Vulkan/VkBuffer.cpp
@@ -18,8 +18,7 @@
 
 #include <cstring>
 
-namespace vk
-{
+namespace vk {
 
 Buffer::Buffer(const VkBufferCreateInfo* pCreateInfo, void* mem) :
 	flags(pCreateInfo->flags), size(pCreateInfo->size), usage(pCreateInfo->usage),
@@ -140,4 +139,4 @@
 	return reinterpret_cast<uint8_t*>(getOffsetPointer(size + 1));
 }
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkBuffer.hpp b/src/Vulkan/VkBuffer.hpp
index 23c42e1..7dc1005 100644
--- a/src/Vulkan/VkBuffer.hpp
+++ b/src/Vulkan/VkBuffer.hpp
@@ -17,8 +17,7 @@
 
 #include "VkObject.hpp"
 
-namespace vk
-{
+namespace vk {
 
 class DeviceMemory;
 
@@ -59,6 +58,6 @@
 	return Buffer::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_BUFFER_HPP_
diff --git a/src/Vulkan/VkBufferView.cpp b/src/Vulkan/VkBufferView.cpp
index 0f4848d..085acfe 100644
--- a/src/Vulkan/VkBufferView.cpp
+++ b/src/Vulkan/VkBufferView.cpp
@@ -16,8 +16,7 @@
 #include "VkBuffer.hpp"
 #include "VkFormat.h"
 
-namespace vk
-{
+namespace vk {
 
 BufferView::BufferView(const VkBufferViewCreateInfo* pCreateInfo, void* mem) :
     buffer(vk::Cast(pCreateInfo->buffer)), format(pCreateInfo->format), offset(pCreateInfo->offset)
@@ -37,4 +36,4 @@
     return buffer->getOffsetPointer(offset);
 }
 
-}
\ No newline at end of file
+}  // namespace vk
\ No newline at end of file
diff --git a/src/Vulkan/VkBufferView.hpp b/src/Vulkan/VkBufferView.hpp
index 45a87e9..bf20a6a 100644
--- a/src/Vulkan/VkBufferView.hpp
+++ b/src/Vulkan/VkBufferView.hpp
@@ -19,8 +19,7 @@
 #include "VkFormat.h"
 #include "VkImageView.hpp"
 
-namespace vk
-{
+namespace vk {
 
 class Buffer;
 
@@ -52,6 +51,6 @@
 	return BufferView::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_BUFFER_VIEW_HPP_
diff --git a/src/Vulkan/VkCommandBuffer.cpp b/src/Vulkan/VkCommandBuffer.cpp
index d8c1930..e025b4b 100644
--- a/src/Vulkan/VkCommandBuffer.cpp
+++ b/src/Vulkan/VkCommandBuffer.cpp
@@ -1737,4 +1737,4 @@
 	}
 }
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkCommandBuffer.hpp b/src/Vulkan/VkCommandBuffer.hpp
index 01664e4..d4d3ec1 100644
--- a/src/Vulkan/VkCommandBuffer.hpp
+++ b/src/Vulkan/VkCommandBuffer.hpp
@@ -23,15 +23,15 @@
 #include <memory>
 #include <vector>
 
-namespace sw
-{
-	class Context;
-	class Renderer;
-	class TaskEvents;
-}
+namespace sw {
 
-namespace vk
-{
+class Context;
+class Renderer;
+class TaskEvents;
+
+}  // namespace sw
+
+namespace vk {
 
 class Buffer;
 class Event;
@@ -206,6 +206,6 @@
 	return DispatchableCommandBuffer::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_COMMAND_BUFFER_HPP_
diff --git a/src/Vulkan/VkCommandPool.cpp b/src/Vulkan/VkCommandPool.cpp
index 17934ea..9cb1603 100644
--- a/src/Vulkan/VkCommandPool.cpp
+++ b/src/Vulkan/VkCommandPool.cpp
@@ -18,8 +18,7 @@
 #include <algorithm>
 #include <new>
 
-namespace vk
-{
+namespace vk {
 
 CommandPool::CommandPool(const VkCommandPoolCreateInfo* pCreateInfo, void* mem)
 {
@@ -112,4 +111,4 @@
 	// TODO (b/119827933): Optimize memory usage here
 }
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkCommandPool.hpp b/src/Vulkan/VkCommandPool.hpp
index e07a248..0f4c130 100644
--- a/src/Vulkan/VkCommandPool.hpp
+++ b/src/Vulkan/VkCommandPool.hpp
@@ -18,8 +18,7 @@
 #include "VkObject.hpp"
 #include <set>
 
-namespace vk
-{
+namespace vk {
 
 class CommandPool : public Object<CommandPool, VkCommandPool>
 {
@@ -43,6 +42,6 @@
 	return CommandPool::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_COMMAND_POOL_HPP_
diff --git a/src/Vulkan/VkConfig.h b/src/Vulkan/VkConfig.h
index f2014c8..1a39e0d 100644
--- a/src/Vulkan/VkConfig.h
+++ b/src/Vulkan/VkConfig.h
@@ -19,8 +19,7 @@
 
 #include <Vulkan/VulkanPlatform.h>
 
-namespace vk
-{
+namespace vk {
 
 // Note: Constant array initialization requires a string literal.
 //       constexpr char* or char[] does not work for that purpose.
@@ -82,7 +81,7 @@
 constexpr float SUBPIXEL_PRECISION_FACTOR = static_cast<float>(1 << SUBPIXEL_PRECISION_BITS);
 constexpr int SUBPIXEL_PRECISION_MASK = 0xFFFFFFFF >> (32 - SUBPIXEL_PRECISION_BITS);
 
-}
+}  // namespace vk
 
 #if defined(__linux__) || defined(__ANDROID__)
 #define SWIFTSHADER_EXTERNAL_MEMORY_OPAQUE_FD        1
diff --git a/src/Vulkan/VkDebug.cpp b/src/Vulkan/VkDebug.cpp
index 37beb7a..7b7cf85 100644
--- a/src/Vulkan/VkDebug.cpp
+++ b/src/Vulkan/VkDebug.cpp
@@ -83,10 +83,9 @@
 #endif
 }
 
-}
+}  // anonymous namespace
 
-namespace vk
-{
+namespace vk {
 
 void tracev(const char *format, va_list args)
 {
@@ -164,4 +163,4 @@
 	}
 }
 
-}
+}  // namespace vk
diff --git a/src/Vulkan/VkDebug.hpp b/src/Vulkan/VkDebug.hpp
index 95b4ff9..40462ed 100644
--- a/src/Vulkan/VkDebug.hpp
+++ b/src/Vulkan/VkDebug.hpp
@@ -32,22 +32,23 @@
 #define CHECK_PRINTF_ARGS
 #endif
 
-namespace vk
-{
-	// Outputs text to the debugging log
-	void trace(const char *format, ...) CHECK_PRINTF_ARGS;
-	inline void trace() {}
+namespace vk {
 
-	// Outputs text to the debugging log and prints to stderr.
-	void warn(const char *format, ...) CHECK_PRINTF_ARGS;
-	inline void warn() {}
+// Outputs text to the debugging log
+void trace(const char *format, ...) CHECK_PRINTF_ARGS;
+inline void trace() {}
 
-	// Outputs the message to the debugging log and stderr, and calls abort().
-	void abort(const char *format, ...) CHECK_PRINTF_ARGS;
+// Outputs text to the debugging log and prints to stderr.
+void warn(const char *format, ...) CHECK_PRINTF_ARGS;
+inline void warn() {}
 
-	// Outputs text to the debugging log, and asserts once if a debugger is attached.
-	void trace_assert(const char *format, ...) CHECK_PRINTF_ARGS;
-}
+// Outputs the message to the debugging log and stderr, and calls abort().
+void abort(const char *format, ...) CHECK_PRINTF_ARGS;
+
+// Outputs text to the debugging log, and asserts once if a debugger is attached.
+void trace_assert(const char *format, ...) CHECK_PRINTF_ARGS;
+
+}  // namespace vk
 
 // A macro to output a trace of a function call and its arguments to the
 // debugging log. Disabled if SWIFTSHADER_DISABLE_TRACE is defined.
diff --git a/src/Vulkan/VkDescriptorPool.cpp b/src/Vulkan/VkDescriptorPool.cpp
index 79b46cc..3c9b2dd 100644
--- a/src/Vulkan/VkDescriptorPool.cpp
+++ b/src/Vulkan/VkDescriptorPool.cpp
@@ -33,10 +33,9 @@
 	return reinterpret_cast<uint8_t*>(vk::Cast(descriptorSet));
 }
 
-}
+}  // anonymous namespace
 
-namespace vk
-{
+namespace vk {
 
 DescriptorPool::DescriptorPool(const VkDescriptorPoolCreateInfo* pCreateInfo, void* mem) :
 	pool(static_cast<uint8_t*>(mem)),
@@ -230,4 +229,4 @@
 	return totalFreeSize;
 }
 
-} // namespace vk
\ No newline at end of file
+}  // namespace vk
\ No newline at end of file
diff --git a/src/Vulkan/VkDescriptorPool.hpp b/src/Vulkan/VkDescriptorPool.hpp
index 8c8a600..c222018 100644
--- a/src/Vulkan/VkDescriptorPool.hpp
+++ b/src/Vulkan/VkDescriptorPool.hpp
@@ -18,46 +18,46 @@
 #include "VkObject.hpp"
 #include <set>
 
-namespace vk
+namespace vk {
+
+class DescriptorPool : public Object<DescriptorPool, VkDescriptorPool>
 {
-	class DescriptorPool : public Object<DescriptorPool, VkDescriptorPool>
+public:
+	DescriptorPool(const VkDescriptorPoolCreateInfo* pCreateInfo, void* mem);
+	void destroy(const VkAllocationCallbacks* pAllocator);
+
+	static size_t ComputeRequiredAllocationSize(const VkDescriptorPoolCreateInfo* pCreateInfo);
+
+	VkResult allocateSets(uint32_t descriptorSetCount, const VkDescriptorSetLayout* pSetLayouts, VkDescriptorSet* pDescriptorSets);
+	void freeSets(uint32_t descriptorSetCount, const VkDescriptorSet* pDescriptorSets);
+	VkResult reset();
+
+private:
+	VkResult allocateSets(size_t* sizes, uint32_t numAllocs, VkDescriptorSet* pDescriptorSets);
+	uint8_t* findAvailableMemory(size_t size);
+	void freeSet(const VkDescriptorSet descriptorSet);
+	size_t computeTotalFreeSize() const;
+
+	struct Node
 	{
-	public:
-		DescriptorPool(const VkDescriptorPoolCreateInfo* pCreateInfo, void* mem);
-		void destroy(const VkAllocationCallbacks* pAllocator);
+		Node(uint8_t* set, size_t size) : set(set), size(size) {}
+		bool operator<(const Node& node) const { return set < node.set; }
+		bool operator==(const uint8_t* other) const { return set == other; }
 
-		static size_t ComputeRequiredAllocationSize(const VkDescriptorPoolCreateInfo* pCreateInfo);
-
-		VkResult allocateSets(uint32_t descriptorSetCount, const VkDescriptorSetLayout* pSetLayouts, VkDescriptorSet* pDescriptorSets);
-		void freeSets(uint32_t descriptorSetCount, const VkDescriptorSet* pDescriptorSets);
-		VkResult reset();
-
-	private:
-		VkResult allocateSets(size_t* sizes, uint32_t numAllocs, VkDescriptorSet* pDescriptorSets);
-		uint8_t* findAvailableMemory(size_t size);
-		void freeSet(const VkDescriptorSet descriptorSet);
-		size_t computeTotalFreeSize() const;
-
-		struct Node
-		{
-			Node(uint8_t* set, size_t size) : set(set), size(size) {}
-			bool operator<(const Node& node) const { return set < node.set; }
-			bool operator==(const uint8_t* other) const { return set == other; }
-
-			uint8_t* set = nullptr;
-			size_t size = 0;
-		};
-		std::set<Node> nodes;
-
-		uint8_t* pool = nullptr;
-		size_t poolSize = 0;
+		uint8_t* set = nullptr;
+		size_t size = 0;
 	};
+	std::set<Node> nodes;
 
-	static inline DescriptorPool* Cast(VkDescriptorPool object)
-	{
-		return DescriptorPool::Cast(object);
-	}
+	uint8_t* pool = nullptr;
+	size_t poolSize = 0;
+};
 
-} // namespace vk
+static inline DescriptorPool* Cast(VkDescriptorPool object)
+{
+	return DescriptorPool::Cast(object);
+}
+
+}  // namespace vk
 
 #endif // VK_DESCRIPTOR_POOL_HPP_
diff --git a/src/Vulkan/VkDescriptorSet.hpp b/src/Vulkan/VkDescriptorSet.hpp
index fc50148..89486b5 100644
--- a/src/Vulkan/VkDescriptorSet.hpp
+++ b/src/Vulkan/VkDescriptorSet.hpp
@@ -20,35 +20,35 @@
 #include <array>
 #include <memory>
 
-namespace vk
+namespace vk {
+
+class DescriptorSetLayout;
+
+struct alignas(16) DescriptorSetHeader
 {
-	class DescriptorSetLayout;
+	DescriptorSetLayout* layout;
+};
 
-	struct alignas(16) DescriptorSetHeader
+class alignas(16) DescriptorSet
+{
+public:
+	static inline DescriptorSet* Cast(VkDescriptorSet object)
 	{
-		DescriptorSetLayout* layout;
-	};
-
-	class alignas(16) DescriptorSet
-	{
-	public:
-		static inline DescriptorSet* Cast(VkDescriptorSet object)
-		{
-			return static_cast<DescriptorSet*>(static_cast<void*>(object));
-		}
-
-		using Bindings = std::array<vk::DescriptorSet*, vk::MAX_BOUND_DESCRIPTOR_SETS>;
-		using DynamicOffsets = std::array<uint32_t, vk::MAX_DESCRIPTOR_SET_COMBINED_BUFFERS_DYNAMIC>;
-
-		DescriptorSetHeader header;
-		alignas(16) uint8_t data[1];
-	};
-
-	inline DescriptorSet* Cast(VkDescriptorSet object)
-	{
-		return DescriptorSet::Cast(object);
+		return static_cast<DescriptorSet*>(static_cast<void*>(object));
 	}
 
-} // namespace vk
+	using Bindings = std::array<vk::DescriptorSet*, vk::MAX_BOUND_DESCRIPTOR_SETS>;
+	using DynamicOffsets = std::array<uint32_t, vk::MAX_DESCRIPTOR_SET_COMBINED_BUFFERS_DYNAMIC>;
+
+	DescriptorSetHeader header;
+	alignas(16) uint8_t data[1];
+};
+
+inline DescriptorSet* Cast(VkDescriptorSet object)
+{
+	return DescriptorSet::Cast(object);
+}
+
+}  // namespace vk
 
 #endif // VK_DESCRIPTOR_SET_HPP_
diff --git a/src/Vulkan/VkDescriptorSetLayout.cpp b/src/Vulkan/VkDescriptorSetLayout.cpp
index b294ce8..6c6de49 100644
--- a/src/Vulkan/VkDescriptorSetLayout.cpp
+++ b/src/Vulkan/VkDescriptorSetLayout.cpp
@@ -24,8 +24,7 @@
 #include <algorithm>
 #include <cstring>
 
-namespace
-{
+namespace {
 
 static bool UsesImmutableSamplers(const VkDescriptorSetLayoutBinding& binding)
 {
@@ -34,10 +33,9 @@
 	        (binding.pImmutableSamplers != nullptr));
 }
 
-}
+}  // anonymous namespace
 
-namespace vk
-{
+namespace vk {
 
 DescriptorSetLayout::DescriptorSetLayout(const VkDescriptorSetLayoutCreateInfo* pCreateInfo, void* mem) :
 	flags(pCreateInfo->flags), bindingCount(pCreateInfo->bindingCount), bindings(reinterpret_cast<VkDescriptorSetLayoutBinding*>(mem))
@@ -646,4 +644,4 @@
 	memcpy(memToWrite, memToRead, writeSize);
 }
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkDescriptorSetLayout.hpp b/src/Vulkan/VkDescriptorSetLayout.hpp
index 81ada71..9a7b333 100644
--- a/src/Vulkan/VkDescriptorSetLayout.hpp
+++ b/src/Vulkan/VkDescriptorSetLayout.hpp
@@ -21,8 +21,7 @@
 #include "Vulkan/VkImageView.hpp"
 #include "Device/Sampler.hpp"
 
-namespace vk
-{
+namespace vk {
 
 class DescriptorSet;
 class Device;
@@ -146,6 +145,6 @@
 	return DescriptorSetLayout::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_DESCRIPTOR_SET_LAYOUT_HPP_
diff --git a/src/Vulkan/VkDescriptorUpdateTemplate.cpp b/src/Vulkan/VkDescriptorUpdateTemplate.cpp
index e70ad73..3a83120 100644
--- a/src/Vulkan/VkDescriptorUpdateTemplate.cpp
+++ b/src/Vulkan/VkDescriptorUpdateTemplate.cpp
@@ -17,33 +17,34 @@
 #include "VkDescriptorSetLayout.hpp"
 #include <cstring>
 
-namespace vk
+namespace vk {
+
+DescriptorUpdateTemplate::DescriptorUpdateTemplate(const VkDescriptorUpdateTemplateCreateInfo* pCreateInfo, void* mem) :
+	descriptorUpdateEntryCount(pCreateInfo->descriptorUpdateEntryCount),
+	descriptorUpdateEntries(reinterpret_cast<VkDescriptorUpdateTemplateEntry*>(mem)),
+	descriptorSetLayout(vk::Cast(pCreateInfo->descriptorSetLayout))
 {
-	DescriptorUpdateTemplate::DescriptorUpdateTemplate(const VkDescriptorUpdateTemplateCreateInfo* pCreateInfo, void* mem) :
-		descriptorUpdateEntryCount(pCreateInfo->descriptorUpdateEntryCount),
-		descriptorUpdateEntries(reinterpret_cast<VkDescriptorUpdateTemplateEntry*>(mem)),
-		descriptorSetLayout(vk::Cast(pCreateInfo->descriptorSetLayout))
+	for(uint32_t i = 0; i < descriptorUpdateEntryCount; i++)
 	{
-		for(uint32_t i = 0; i < descriptorUpdateEntryCount; i++)
-		{
-			descriptorUpdateEntries[i] = pCreateInfo->pDescriptorUpdateEntries[i];
-		}
+		descriptorUpdateEntries[i] = pCreateInfo->pDescriptorUpdateEntries[i];
 	}
+}
 
-	size_t DescriptorUpdateTemplate::ComputeRequiredAllocationSize(const VkDescriptorUpdateTemplateCreateInfo* info)
+size_t DescriptorUpdateTemplate::ComputeRequiredAllocationSize(const VkDescriptorUpdateTemplateCreateInfo* info)
+{
+	return info->descriptorUpdateEntryCount * sizeof(VkDescriptorUpdateTemplateEntry);
+}
+
+void DescriptorUpdateTemplate::updateDescriptorSet(Device* device, VkDescriptorSet vkDescriptorSet, const void* pData)
+{
+
+	DescriptorSet* descriptorSet = vk::Cast(vkDescriptorSet);
+
+	for(uint32_t i = 0; i < descriptorUpdateEntryCount; i++)
 	{
-		return info->descriptorUpdateEntryCount * sizeof(VkDescriptorUpdateTemplateEntry);
+		DescriptorSetLayout::WriteDescriptorSet(device, descriptorSet, descriptorUpdateEntries[i],
+												reinterpret_cast<char const *>(pData));
 	}
+}
 
-	void DescriptorUpdateTemplate::updateDescriptorSet(Device* device, VkDescriptorSet vkDescriptorSet, const void* pData)
-	{
-
-		DescriptorSet* descriptorSet = vk::Cast(vkDescriptorSet);
-
-		for(uint32_t i = 0; i < descriptorUpdateEntryCount; i++)
-		{
-			DescriptorSetLayout::WriteDescriptorSet(device, descriptorSet, descriptorUpdateEntries[i],
-													reinterpret_cast<char const *>(pData));
-		}
-	}
-}
\ No newline at end of file
+}  // namespace vk
\ No newline at end of file
diff --git a/src/Vulkan/VkDescriptorUpdateTemplate.hpp b/src/Vulkan/VkDescriptorUpdateTemplate.hpp
index 90a8b96..0884b1e 100644
--- a/src/Vulkan/VkDescriptorUpdateTemplate.hpp
+++ b/src/Vulkan/VkDescriptorUpdateTemplate.hpp
@@ -1,47 +1,47 @@
-// Copyright 2018 The SwiftShader Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef VK_DESCRIPTOR_UPDATE_TEMPLATE_HPP_
-#define VK_DESCRIPTOR_UPDATE_TEMPLATE_HPP_
-
-#include "VkObject.hpp"
-
-namespace vk
-{
-	class DescriptorSetLayout;
-	class Device;
-
-	class DescriptorUpdateTemplate : public Object<DescriptorUpdateTemplate, VkDescriptorUpdateTemplate>
-	{
-	public:
-		DescriptorUpdateTemplate(const VkDescriptorUpdateTemplateCreateInfo* pCreateInfo, void* mem);
-
-		static size_t ComputeRequiredAllocationSize(const VkDescriptorUpdateTemplateCreateInfo* info);
-
-		void updateDescriptorSet(Device* device, VkDescriptorSet descriptorSet, const void* pData);
-
-	private:
-		uint32_t                              descriptorUpdateEntryCount = 0;
-		VkDescriptorUpdateTemplateEntry*      descriptorUpdateEntries = nullptr;
-		DescriptorSetLayout*                  descriptorSetLayout = nullptr;
-	};
-
-	static inline DescriptorUpdateTemplate* Cast(VkDescriptorUpdateTemplate object)
-	{
-		return DescriptorUpdateTemplate::Cast(object);
-	}
-
-} // namespace vk
-
-#endif // VK_DESCRIPTOR_UPDATE_TEMPLATE_HPP_
+// Copyright 2018 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef VK_DESCRIPTOR_UPDATE_TEMPLATE_HPP_
+#define VK_DESCRIPTOR_UPDATE_TEMPLATE_HPP_
+
+#include "VkObject.hpp"
+
+namespace vk {
+
+class DescriptorSetLayout;
+class Device;
+
+class DescriptorUpdateTemplate : public Object<DescriptorUpdateTemplate, VkDescriptorUpdateTemplate>
+{
+public:
+	DescriptorUpdateTemplate(const VkDescriptorUpdateTemplateCreateInfo* pCreateInfo, void* mem);
+
+	static size_t ComputeRequiredAllocationSize(const VkDescriptorUpdateTemplateCreateInfo* info);
+
+	void updateDescriptorSet(Device* device, VkDescriptorSet descriptorSet, const void* pData);
+
+private:
+	uint32_t                              descriptorUpdateEntryCount = 0;
+	VkDescriptorUpdateTemplateEntry*      descriptorUpdateEntries = nullptr;
+	DescriptorSetLayout*                  descriptorSetLayout = nullptr;
+};
+
+static inline DescriptorUpdateTemplate* Cast(VkDescriptorUpdateTemplate object)
+{
+	return DescriptorUpdateTemplate::Cast(object);
+}
+
+}  // namespace vk
+
+#endif // VK_DESCRIPTOR_UPDATE_TEMPLATE_HPP_
diff --git a/src/Vulkan/VkDestroy.h b/src/Vulkan/VkDestroy.h
index 114e2d0..7acd062 100644
--- a/src/Vulkan/VkDestroy.h
+++ b/src/Vulkan/VkDestroy.h
@@ -39,8 +39,7 @@
 
 #include <type_traits>
 
-namespace vk
-{
+namespace vk {
 
 // Because Vulkan uses optional allocation callbacks, we use them in a custom
 // placement new operator in the VkObjectBase class for simplicity.
@@ -66,4 +65,4 @@
 	}
 }
 
-}
+}  // namespace vk
diff --git a/src/Vulkan/VkDevice.cpp b/src/Vulkan/VkDevice.cpp
index e1f454e..a8fcd9c 100644
--- a/src/Vulkan/VkDevice.cpp
+++ b/src/Vulkan/VkDevice.cpp
@@ -25,16 +25,16 @@
 #include <climits>
 #include <new> // Must #include this to use "placement new"
 
-namespace
+namespace {
+
+std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> now()
 {
-	std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> now()
-	{
-		return std::chrono::time_point_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now());
-	}
+	return std::chrono::time_point_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now());
 }
 
-namespace vk
-{
+}  // anonymous namespace
+
+namespace vk {
 
 std::shared_ptr<rr::Routine> Device::SamplingRoutineCache::query(const vk::Device::SamplingRoutineCache::Key& key) const
 {
@@ -264,4 +264,4 @@
 	return samplingRoutineCacheMutex;
 }
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkDevice.hpp b/src/Vulkan/VkDevice.hpp
index 4aeb14c..24882e0 100644
--- a/src/Vulkan/VkDevice.hpp
+++ b/src/Vulkan/VkDevice.hpp
@@ -21,18 +21,10 @@
 #include <memory>
 #include <mutex>
 
-namespace marl
-{
-	class Scheduler;
-}
+namespace marl { class Scheduler; }
+namespace sw { class Blitter; }
 
-namespace sw
-{
-	class Blitter;
-}
-
-namespace vk
-{
+namespace vk {
 
 class PhysicalDevice;
 class Queue;
@@ -131,6 +123,6 @@
 	return static_cast<std::size_t>(hash);  // Truncates to 32-bits on 32-bit platforms.
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_DEVICE_HPP_
diff --git a/src/Vulkan/VkDeviceMemory.cpp b/src/Vulkan/VkDeviceMemory.cpp
index aeadf15..faaa4b5 100644
--- a/src/Vulkan/VkDeviceMemory.cpp
+++ b/src/Vulkan/VkDeviceMemory.cpp
@@ -16,8 +16,7 @@
 
 #include "VkConfig.h"
 
-namespace vk
-{
+namespace vk {
 
 // Base abstract interface for a device memory implementation.
 class DeviceMemory::ExternalBase
@@ -86,7 +85,6 @@
 class DeviceMemoryHostExternalBase : public DeviceMemory::ExternalBase
 {
 public:
-
     // Does not support any external memory type at all.
 	static const VkExternalMemoryHandleTypeFlagBits typeFlagBit = (VkExternalMemoryHandleTypeFlagBits)0;
 
@@ -129,8 +127,7 @@
 #  endif
 #endif
 
-namespace vk
-{
+namespace vk {
 
 static void findTraits(const VkMemoryAllocateInfo* pAllocateInfo,
 					   ExternalMemoryTraits*       pTraits)
@@ -231,4 +228,4 @@
 }
 #endif
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkDeviceMemory.hpp b/src/Vulkan/VkDeviceMemory.hpp
index 230f0c9..d529ec9 100644
--- a/src/Vulkan/VkDeviceMemory.hpp
+++ b/src/Vulkan/VkDeviceMemory.hpp
@@ -18,8 +18,7 @@
 #include "VkConfig.h"
 #include "VkObject.hpp"
 
-namespace vk
-{
+namespace vk {
 
 class DeviceMemory : public Object<DeviceMemory, VkDeviceMemory>
 {
@@ -61,6 +60,6 @@
 }
 
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_DEVICE_MEMORY_HPP_
diff --git a/src/Vulkan/VkEvent.hpp b/src/Vulkan/VkEvent.hpp
index 57901ef..3304558 100644
--- a/src/Vulkan/VkEvent.hpp
+++ b/src/Vulkan/VkEvent.hpp
@@ -19,8 +19,7 @@
 #include <condition_variable>
 #include <mutex>
 
-namespace vk
-{
+namespace vk {
 
 class Event : public Object<Event, VkEvent>
 {
@@ -73,6 +72,6 @@
 	return Event::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_EVENT_HPP_
diff --git a/src/Vulkan/VkFence.hpp b/src/Vulkan/VkFence.hpp
index 586d42f..af8becd 100644
--- a/src/Vulkan/VkFence.hpp
+++ b/src/Vulkan/VkFence.hpp
@@ -22,8 +22,7 @@
 #include "marl/event.h"
 #include "marl/waitgroup.h"
 
-namespace vk
-{
+namespace vk {
 
 class Fence : public Object<Fence, VkFence>, public sw::TaskEvents
 {
@@ -88,6 +87,6 @@
 	return Fence::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_FENCE_HPP_
diff --git a/src/Vulkan/VkFormat.cpp b/src/Vulkan/VkFormat.cpp
index 8e07ea4..f454d94 100644
--- a/src/Vulkan/VkFormat.cpp
+++ b/src/Vulkan/VkFormat.cpp
@@ -16,8 +16,7 @@
 #include "VkDebug.hpp"
 #include "System/Math.hpp"
 
-namespace vk
-{
+namespace vk {
 
 bool Format::isUnsignedNormalized() const
 {
@@ -2231,4 +2230,4 @@
 	return false;
 }
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkFormat.h b/src/Vulkan/VkFormat.h
index 77b9ad5..486bf50 100644
--- a/src/Vulkan/VkFormat.h
+++ b/src/Vulkan/VkFormat.h
@@ -17,13 +17,9 @@
 
 #include <Vulkan/VulkanPlatform.h>
 
-namespace sw
-{
-	struct float4;
-}
+namespace sw { struct float4; }
 
-namespace vk
-{
+namespace vk {
 
 class Format
 {
@@ -76,6 +72,6 @@
 	VkFormat format = VK_FORMAT_UNDEFINED;
 };
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_FORMAT_UTILS_HPP_
\ No newline at end of file
diff --git a/src/Vulkan/VkFramebuffer.cpp b/src/Vulkan/VkFramebuffer.cpp
index 7c3617f..33be291 100644
--- a/src/Vulkan/VkFramebuffer.cpp
+++ b/src/Vulkan/VkFramebuffer.cpp
@@ -18,8 +18,7 @@
 #include <algorithm>
 #include <memory.h>
 
-namespace vk
-{
+namespace vk {
 
 Framebuffer::Framebuffer(const VkFramebufferCreateInfo* pCreateInfo, void* mem) :
 	attachmentCount(pCreateInfo->attachmentCount),
@@ -151,4 +150,4 @@
 	return pCreateInfo->attachmentCount * sizeof(void*);
 }
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkFramebuffer.hpp b/src/Vulkan/VkFramebuffer.hpp
index ce9b16a..a4c50bb 100644
--- a/src/Vulkan/VkFramebuffer.hpp
+++ b/src/Vulkan/VkFramebuffer.hpp
@@ -17,8 +17,7 @@
 
 #include "VkObject.hpp"
 
-namespace vk
-{
+namespace vk {
 
 class ImageView;
 class RenderPass;
@@ -49,6 +48,6 @@
 	return Framebuffer::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_FRAMEBUFFER_HPP_
diff --git a/src/Vulkan/VkGetProcAddress.cpp b/src/Vulkan/VkGetProcAddress.cpp
index 5613da2..46d19cd 100644
--- a/src/Vulkan/VkGetProcAddress.cpp
+++ b/src/Vulkan/VkGetProcAddress.cpp
@@ -25,8 +25,7 @@
 #include <vulkan/vk_android_native_buffer.h>
 #endif
 
-namespace vk
-{
+namespace vk {
 
 #define MAKE_VULKAN_GLOBAL_ENTRY(aFunction) { #aFunction, reinterpret_cast<PFN_vkVoidFunction>(aFunction) }
 static const std::unordered_map<std::string, PFN_vkVoidFunction> globalFunctionPointers =
@@ -441,7 +440,7 @@
 	return nullptr;
 }
 
-}
+}  // namespace vk
 
 #ifdef __ANDROID__
 
diff --git a/src/Vulkan/VkGetProcAddress.h b/src/Vulkan/VkGetProcAddress.h
index 27562ff..6eb017f 100644
--- a/src/Vulkan/VkGetProcAddress.h
+++ b/src/Vulkan/VkGetProcAddress.h
@@ -17,8 +17,7 @@
 
 #include <Vulkan/VulkanPlatform.h>
 
-namespace vk
-{
+namespace vk {
 
 class Device;
 class Instance;
@@ -26,6 +25,6 @@
 PFN_vkVoidFunction GetInstanceProcAddr(Instance* instance, const char* pName);
 PFN_vkVoidFunction GetDeviceProcAddr(Device* device, const char* pName);
 
-}
+}  // namespace vk
 
 #endif // VK_UTILS_HPP_
\ No newline at end of file
diff --git a/src/Vulkan/VkImage.cpp b/src/Vulkan/VkImage.cpp
index 0b7224f..e5d38e1 100644
--- a/src/Vulkan/VkImage.cpp
+++ b/src/Vulkan/VkImage.cpp
@@ -24,38 +24,38 @@
 #include "System/GrallocAndroid.hpp"
 #endif
 
-namespace
+namespace {
+
+ETC_Decoder::InputType GetInputType(const vk::Format& format)
 {
-	ETC_Decoder::InputType GetInputType(const vk::Format& format)
+	switch(format)
 	{
-		switch(format)
-		{
-		case VK_FORMAT_EAC_R11_UNORM_BLOCK:
-			return ETC_Decoder::ETC_R_UNSIGNED;
-		case VK_FORMAT_EAC_R11_SNORM_BLOCK:
-			return ETC_Decoder::ETC_R_SIGNED;
-		case VK_FORMAT_EAC_R11G11_UNORM_BLOCK:
-			return ETC_Decoder::ETC_RG_UNSIGNED;
-		case VK_FORMAT_EAC_R11G11_SNORM_BLOCK:
-			return ETC_Decoder::ETC_RG_SIGNED;
-		case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK:
-		case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK:
-			return ETC_Decoder::ETC_RGB;
-		case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK:
-		case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK:
-			return ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA;
-		case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK:
-		case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK:
-			return ETC_Decoder::ETC_RGBA;
-		default:
-			UNIMPLEMENTED("format: %d", int(format));
-			return ETC_Decoder::ETC_RGBA;
-		}
+	case VK_FORMAT_EAC_R11_UNORM_BLOCK:
+		return ETC_Decoder::ETC_R_UNSIGNED;
+	case VK_FORMAT_EAC_R11_SNORM_BLOCK:
+		return ETC_Decoder::ETC_R_SIGNED;
+	case VK_FORMAT_EAC_R11G11_UNORM_BLOCK:
+		return ETC_Decoder::ETC_RG_UNSIGNED;
+	case VK_FORMAT_EAC_R11G11_SNORM_BLOCK:
+		return ETC_Decoder::ETC_RG_SIGNED;
+	case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK:
+	case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK:
+		return ETC_Decoder::ETC_RGB;
+	case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK:
+	case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK:
+		return ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA;
+	case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK:
+	case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK:
+		return ETC_Decoder::ETC_RGBA;
+	default:
+		UNIMPLEMENTED("format: %d", int(format));
+		return ETC_Decoder::ETC_RGBA;
 	}
 }
 
-namespace vk
-{
+}  // anonymous namespace
+
+namespace vk {
 
 Image::Image(const VkImageCreateInfo* pCreateInfo, void* mem, Device *device) :
 	device(device),
@@ -992,4 +992,4 @@
 	}
 }
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkImage.hpp b/src/Vulkan/VkImage.hpp
index f717b99..bc67d7e 100644
--- a/src/Vulkan/VkImage.hpp
+++ b/src/Vulkan/VkImage.hpp
@@ -22,8 +22,7 @@
 #include <vulkan/vk_android_native_buffer.h> // For VkSwapchainImageUsageFlagsANDROID and buffer_handle_t
 #endif
 
-namespace vk
-{
+namespace vk {
 
 class Buffer;
 class Device;
@@ -135,6 +134,6 @@
 	return Image::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_IMAGE_HPP_
diff --git a/src/Vulkan/VkImageView.cpp b/src/Vulkan/VkImageView.cpp
index 3a169b8..27c2f38 100644
--- a/src/Vulkan/VkImageView.cpp
+++ b/src/Vulkan/VkImageView.cpp
@@ -16,43 +16,43 @@
 #include "VkImage.hpp"
 #include <System/Math.hpp>
 
-namespace
+namespace {
+
+VkComponentMapping ResolveComponentMapping(VkComponentMapping m, vk::Format format)
 {
-	VkComponentMapping ResolveComponentMapping(VkComponentMapping m, vk::Format format)
-	{
-		m = vk::ResolveIdentityMapping(m);
+	m = vk::ResolveIdentityMapping(m);
 
-		// Replace non-present components with zero/one swizzles so that the sampler
-		// will give us correct interactions between channel replacement and texel replacement,
-		// where we've had to invent new channels behind the app's back (eg transparent decompression
-		// of ETC2 RGB -> BGRA8)
-		VkComponentSwizzle table[] = {
-			VK_COMPONENT_SWIZZLE_IDENTITY,
-			VK_COMPONENT_SWIZZLE_ZERO,
-			VK_COMPONENT_SWIZZLE_ONE,
-			VK_COMPONENT_SWIZZLE_R,
-			format.componentCount() < 2 ? VK_COMPONENT_SWIZZLE_ZERO : VK_COMPONENT_SWIZZLE_G,
-			format.componentCount() < 3 ? VK_COMPONENT_SWIZZLE_ZERO : VK_COMPONENT_SWIZZLE_B,
-			format.componentCount() < 4 ? VK_COMPONENT_SWIZZLE_ONE : VK_COMPONENT_SWIZZLE_A,
-		};
+	// Replace non-present components with zero/one swizzles so that the sampler
+	// will give us correct interactions between channel replacement and texel replacement,
+	// where we've had to invent new channels behind the app's back (eg transparent decompression
+	// of ETC2 RGB -> BGRA8)
+	VkComponentSwizzle table[] = {
+		VK_COMPONENT_SWIZZLE_IDENTITY,
+		VK_COMPONENT_SWIZZLE_ZERO,
+		VK_COMPONENT_SWIZZLE_ONE,
+		VK_COMPONENT_SWIZZLE_R,
+		format.componentCount() < 2 ? VK_COMPONENT_SWIZZLE_ZERO : VK_COMPONENT_SWIZZLE_G,
+		format.componentCount() < 3 ? VK_COMPONENT_SWIZZLE_ZERO : VK_COMPONENT_SWIZZLE_B,
+		format.componentCount() < 4 ? VK_COMPONENT_SWIZZLE_ONE : VK_COMPONENT_SWIZZLE_A,
+	};
 
-		return {table[m.r], table[m.g], table[m.b], table[m.a]};
-	}
-
-	VkImageSubresourceRange ResolveRemainingLevelsLayers(VkImageSubresourceRange range, const vk::Image *image)
-	{
-		return {
-			range.aspectMask,
-			range.baseMipLevel,
-			(range.levelCount == VK_REMAINING_MIP_LEVELS) ? (image->getMipLevels() - range.baseMipLevel) : range.levelCount,
-			range.baseArrayLayer,
-			(range.layerCount == VK_REMAINING_ARRAY_LAYERS) ? (image->getArrayLayers() - range.baseArrayLayer) : range.layerCount,
-		};
-	}
+	return {table[m.r], table[m.g], table[m.b], table[m.a]};
 }
 
-namespace vk
+VkImageSubresourceRange ResolveRemainingLevelsLayers(VkImageSubresourceRange range, const vk::Image *image)
 {
+	return {
+		range.aspectMask,
+		range.baseMipLevel,
+		(range.levelCount == VK_REMAINING_MIP_LEVELS) ? (image->getMipLevels() - range.baseMipLevel) : range.levelCount,
+		range.baseArrayLayer,
+		(range.layerCount == VK_REMAINING_ARRAY_LAYERS) ? (image->getArrayLayers() - range.baseArrayLayer) : range.layerCount,
+	};
+}
+
+}  // anonymous namespace
+
+namespace vk {
 
 std::atomic<uint32_t> ImageView::nextID(1);
 
@@ -298,4 +298,4 @@
 	return getImage(usage)->getTexelPointer(offset, imageSubresourceLayers);
 }
 
-}
+}  // namespace vk
diff --git a/src/Vulkan/VkImageView.hpp b/src/Vulkan/VkImageView.hpp
index 8563520..86adedd 100644
--- a/src/Vulkan/VkImageView.hpp
+++ b/src/Vulkan/VkImageView.hpp
@@ -22,8 +22,8 @@
 
 #include <atomic>
 
-namespace vk
-{
+namespace vk {
+
 class SamplerYcbcrConversion;
 
 class ImageView : public Object<ImageView, VkImageView>
@@ -111,6 +111,6 @@
 	return ImageView::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_IMAGE_VIEW_HPP_
diff --git a/src/Vulkan/VkInstance.cpp b/src/Vulkan/VkInstance.cpp
index f93dc7f..70dee28 100644
--- a/src/Vulkan/VkInstance.cpp
+++ b/src/Vulkan/VkInstance.cpp
@@ -15,8 +15,7 @@
 #include "VkInstance.hpp"
 #include "VkDestroy.h"
 
-namespace vk
-{
+namespace vk {
 
 Instance::Instance(const VkInstanceCreateInfo* pCreateInfo, void* mem, VkPhysicalDevice physicalDevice)
 	: physicalDevice(physicalDevice)
@@ -69,4 +68,4 @@
 	return VK_SUCCESS;
 }
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkInstance.hpp b/src/Vulkan/VkInstance.hpp
index 10eac6d..0234d9c 100644
--- a/src/Vulkan/VkInstance.hpp
+++ b/src/Vulkan/VkInstance.hpp
@@ -17,8 +17,7 @@
 
 #include "VkObject.hpp"
 
-namespace vk
-{
+namespace vk {
 
 class Instance
 {
@@ -45,6 +44,6 @@
 	return DispatchableInstance::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_INSTANCE_HPP_
diff --git a/src/Vulkan/VkMemory.cpp b/src/Vulkan/VkMemory.cpp
index 769b0db..128e648 100644
--- a/src/Vulkan/VkMemory.cpp
+++ b/src/Vulkan/VkMemory.cpp
@@ -19,8 +19,7 @@
 #include "VkMemory.h"
 #include "System/Memory.hpp"
 
-namespace vk
-{
+namespace vk {
 
 void* allocate(size_t count, size_t alignment, const VkAllocationCallbacks* pAllocator, VkSystemAllocationScope allocationScope)
 {
@@ -34,6 +33,6 @@
 	pAllocator ? pAllocator->pfnFree(pAllocator->pUserData, ptr) : sw::deallocate(ptr);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_OBJECT_HPP_
diff --git a/src/Vulkan/VkMemory.h b/src/Vulkan/VkMemory.h
index bbc6006..e6f51b7 100644
--- a/src/Vulkan/VkMemory.h
+++ b/src/Vulkan/VkMemory.h
@@ -17,8 +17,7 @@
 
 #include <Vulkan/VulkanPlatform.h>
 
-namespace vk
-{
+namespace vk {
 
 void* allocate(size_t count, size_t alignment, const VkAllocationCallbacks* pAllocator,
                VkSystemAllocationScope allocationScope = VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
@@ -30,6 +29,6 @@
 	return static_cast<T*>(allocate(count, alignof(T), pAllocator, T::GetAllocationScope()));
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_MEMORY_HPP_
diff --git a/src/Vulkan/VkObject.hpp b/src/Vulkan/VkObject.hpp
index b401af5..105cc94 100644
--- a/src/Vulkan/VkObject.hpp
+++ b/src/Vulkan/VkObject.hpp
@@ -23,8 +23,7 @@
 #include <Vulkan/VulkanPlatform.h>
 #include <vulkan/vk_icd.h>
 
-namespace vk
-{
+namespace vk {
 
 template<typename T, typename VkT>
 static inline T* VkTtoT(VkT vkObject)
@@ -166,6 +165,6 @@
 	}
 };
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_OBJECT_HPP_
diff --git a/src/Vulkan/VkPhysicalDevice.cpp b/src/Vulkan/VkPhysicalDevice.cpp
index 5a32b2f..5dac6ff 100644
--- a/src/Vulkan/VkPhysicalDevice.cpp
+++ b/src/Vulkan/VkPhysicalDevice.cpp
@@ -20,8 +20,7 @@
 #include <limits>
 #include <cstring>
 
-namespace vk
-{
+namespace vk {
 
 static void setExternalMemoryProperties(VkExternalMemoryHandleTypeFlagBits handleType, VkExternalMemoryProperties* properties)
 {
@@ -873,4 +872,4 @@
 	return properties;
 }
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkPhysicalDevice.hpp b/src/Vulkan/VkPhysicalDevice.hpp
index a7faecf..c4007cc 100644
--- a/src/Vulkan/VkPhysicalDevice.hpp
+++ b/src/Vulkan/VkPhysicalDevice.hpp
@@ -22,8 +22,7 @@
 #include <vulkan/vk_android_native_buffer.h>
 #endif
 
-namespace vk
-{
+namespace vk {
 
 class PhysicalDevice
 {
@@ -89,6 +88,6 @@
 	return DispatchablePhysicalDevice::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_PHYSICAL_DEVICE_HPP_
diff --git a/src/Vulkan/VkPipeline.cpp b/src/Vulkan/VkPipeline.cpp
index 5baa9dd..f72d229 100644
--- a/src/Vulkan/VkPipeline.cpp
+++ b/src/Vulkan/VkPipeline.cpp
@@ -29,8 +29,7 @@
 
 #include <iostream>
 
-namespace
-{
+namespace {
 
 sw::StreamType getStreamType(VkFormat format)
 {
@@ -240,8 +239,7 @@
 
 } // anonymous namespace
 
-namespace vk
-{
+namespace vk {
 
 Pipeline::Pipeline(PipelineLayout const *layout, const Device *device)
 	: layout(layout),
@@ -696,4 +694,4 @@
 		groupCountX, groupCountY, groupCountZ);
 }
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkPipeline.hpp b/src/Vulkan/VkPipeline.hpp
index 52cfc90..1d50226 100644
--- a/src/Vulkan/VkPipeline.hpp
+++ b/src/Vulkan/VkPipeline.hpp
@@ -21,14 +21,14 @@
 #include "Device/Renderer.hpp"
 #include <memory>
 
-namespace sw
-{
-	class ComputeProgram;
-	class SpirvShader;
-}
+namespace sw {
 
-namespace vk
-{
+class ComputeProgram;
+class SpirvShader;
+
+}  // namespace sw
+
+namespace vk {
 
 class PipelineCache;
 class PipelineLayout;
@@ -145,6 +145,6 @@
 	return Pipeline::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_PIPELINE_HPP_
diff --git a/src/Vulkan/VkPipelineCache.cpp b/src/Vulkan/VkPipelineCache.cpp
index f9fe9cf..40fa5c3 100644
--- a/src/Vulkan/VkPipelineCache.cpp
+++ b/src/Vulkan/VkPipelineCache.cpp
@@ -15,8 +15,7 @@
 #include "VkPipelineCache.hpp"
 #include <cstring>
 
-namespace vk
-{
+namespace vk {
 
 PipelineCache::SpirvShaderKey::SpecializationInfo::SpecializationInfo(const VkSpecializationInfo* specializationInfo)
 {
@@ -250,4 +249,4 @@
 	computePrograms[key] = computeProgram;
 }
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkPipelineCache.hpp b/src/Vulkan/VkPipelineCache.hpp
index 7e701ed..efe5563 100644
--- a/src/Vulkan/VkPipelineCache.hpp
+++ b/src/Vulkan/VkPipelineCache.hpp
@@ -25,14 +25,14 @@
 #include <string>
 #include <vector>
 
-namespace sw
-{
-	class ComputeProgram;
-	class SpirvShader;
-}
+namespace sw {
 
-namespace vk
-{
+class ComputeProgram;
+class SpirvShader;
+
+}  // namespace sw
+
+namespace vk {
 
 class PipelineLayout;
 class RenderPass;
@@ -145,6 +145,6 @@
 	return PipelineCache::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_PIPELINE_CACHE_HPP_
diff --git a/src/Vulkan/VkPipelineLayout.cpp b/src/Vulkan/VkPipelineLayout.cpp
index da0d3ae..6d42c34 100644
--- a/src/Vulkan/VkPipelineLayout.cpp
+++ b/src/Vulkan/VkPipelineLayout.cpp
@@ -15,8 +15,7 @@
 #include "VkPipelineLayout.hpp"
 #include <cstring>
 
-namespace vk
-{
+namespace vk {
 
 PipelineLayout::PipelineLayout(const VkPipelineLayoutCreateInfo* pCreateInfo, void* mem)
 	: setLayoutCount(pCreateInfo->setLayoutCount), pushConstantRangeCount(pCreateInfo->pushConstantRangeCount)
@@ -40,12 +39,9 @@
 	uint32_t dynamicOffsetBase = 0;
 	for (uint32_t i = 0; i < setLayoutCount; i++)
 	{
-		uint32_t dynamicDescriptorCount = setLayouts[i]->getDynamicDescriptorCount();
-
-		ASSERT_OR_RETURN((dynamicOffsetBase + dynamicDescriptorCount) <= MAX_DESCRIPTOR_SET_COMBINED_BUFFERS_DYNAMIC);
-
-		dynamicOffsetBases[i] = dynamicOffsetBase;
-
+		uint32_t dynamicDescriptorCount = setLayouts[i]->getDynamicDescriptorCount();
+		ASSERT_OR_RETURN((dynamicOffsetBase + dynamicDescriptorCount) <= MAX_DESCRIPTOR_SET_COMBINED_BUFFERS_DYNAMIC);
+		dynamicOffsetBases[i] = dynamicOffsetBase;
 		dynamicOffsetBase += dynamicDescriptorCount;
 	}
 }
@@ -76,4 +75,4 @@
 	return dynamicOffsetBases[descriptorSet];
 }
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkPipelineLayout.hpp b/src/Vulkan/VkPipelineLayout.hpp
index f450c5d..7821396 100644
--- a/src/Vulkan/VkPipelineLayout.hpp
+++ b/src/Vulkan/VkPipelineLayout.hpp
@@ -17,8 +17,7 @@
 
 #include "VkDescriptorSetLayout.hpp"
 
-namespace vk
-{
+namespace vk {
 
 class PipelineLayout : public Object<PipelineLayout, VkPipelineLayout>
 {
@@ -48,6 +47,6 @@
 	return PipelineLayout::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_PIPELINE_LAYOUT_HPP_
diff --git a/src/Vulkan/VkQueryPool.cpp b/src/Vulkan/VkQueryPool.cpp
index 5aa9b08..6c4f975 100644
--- a/src/Vulkan/VkQueryPool.cpp
+++ b/src/Vulkan/VkQueryPool.cpp
@@ -18,201 +18,207 @@
 #include <cstring>
 #include <new>
 
-namespace vk
+namespace vk {
+
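+// The 'finished' event uses manual-reset mode: once signaled it stays signaled until reset() calls clear().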
+Query::Query() : finished(marl::Event::Mode::Manual), state(UNAVAILABLE), type(INVALID_TYPE), value(0) {}
+
+void Query::reset()
 {
-	Query::Query() : finished(marl::Event::Mode::Manual), state(UNAVAILABLE), type(INVALID_TYPE), value(0) {}
+	finished.clear();
+	auto prevState = state.exchange(UNAVAILABLE);
+	ASSERT(prevState != ACTIVE);
+	type = INVALID_TYPE;
+	value = 0;
+}
 
-	void Query::reset()
+void Query::prepare(VkQueryType ty)
+{
+	auto prevState = state.exchange(ACTIVE);
+	ASSERT(prevState == UNAVAILABLE);
+	type = ty;
+}
+
+void Query::start()
+{
+	ASSERT(state == ACTIVE);
+	wg.add();
+}
+
+void Query::finish()
+{
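+	// wg.done() returns true once every start() has completed; only then is the query marked FINISHED.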
+	if (wg.done())
 	{
-		finished.clear();
-		auto prevState = state.exchange(UNAVAILABLE);
-		ASSERT(prevState != ACTIVE);
-		type = INVALID_TYPE;
-		value = 0;
+		auto prevState = state.exchange(FINISHED);
+		ASSERT(prevState == ACTIVE);
+		finished.signal();
+	}
+}
+
+Query::Data Query::getData() const
+{
+	Data out;
+	out.state = state;
+	out.value = value;
+	return out;
+}
+
+VkQueryType Query::getType() const
+{
+	return type;
+}
+
+void Query::wait()
+{
+	finished.wait();
+}
+
+void Query::set(int64_t v)
+{
+	value = v;
+}
+
+void Query::add(int64_t v)
+{
+	value += v;
+}
+
+QueryPool::QueryPool(const VkQueryPoolCreateInfo* pCreateInfo, void* mem) :
+	pool(reinterpret_cast<Query*>(mem)), type(pCreateInfo->queryType),
+	count(pCreateInfo->queryCount)
+{
+	// According to the Vulkan spec, section 34.1. Features:
+	// "pipelineStatisticsQuery specifies whether the pipeline statistics
+	//  queries are supported. If this feature is not enabled, queries of
+	//  type VK_QUERY_TYPE_PIPELINE_STATISTICS cannot be created, and
+	//  none of the VkQueryPipelineStatisticFlagBits bits can be set in the
+	//  pipelineStatistics member of the VkQueryPoolCreateInfo structure."
+	if(type == VK_QUERY_TYPE_PIPELINE_STATISTICS)
+	{
+		UNIMPLEMENTED("pCreateInfo->queryType");
 	}
 
-	void Query::prepare(VkQueryType ty)
+	// Construct all queries
+	for(uint32_t i = 0; i < count; i++)
 	{
-		auto prevState = state.exchange(ACTIVE);
-		ASSERT(prevState == UNAVAILABLE);
-		type = ty;
+		new (&pool[i]) Query();
 	}
+}
 
-	void Query::start()
-	{
-		ASSERT(state == ACTIVE);
-		wg.add();
-	}
+void QueryPool::destroy(const VkAllocationCallbacks* pAllocator)
+{
+	vk::deallocate(pool, pAllocator);
+}
 
-	void Query::finish()
+size_t QueryPool::ComputeRequiredAllocationSize(const VkQueryPoolCreateInfo* pCreateInfo)
+{
+	return sizeof(Query) * pCreateInfo->queryCount;
+}
+
+VkResult QueryPool::getResults(uint32_t firstQuery, uint32_t queryCount, size_t dataSize,
+                               void* pData, VkDeviceSize stride, VkQueryResultFlags flags) const
+{
+	// dataSize must be large enough to contain the result of each query
+	ASSERT(static_cast<size_t>(stride * queryCount) <= dataSize);
+
+	// The sum of firstQuery and queryCount must be less than or equal to the number of queries
+	ASSERT((firstQuery + queryCount) <= count);
+
+	VkResult result = VK_SUCCESS;
+	uint8_t* data = static_cast<uint8_t*>(pData);
+	for(uint32_t i = firstQuery; i < (firstQuery + queryCount); i++, data += stride)
 	{
-		if (wg.done())
+		// If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are both not set
+		// then no result values are written to pData for queries that are in the
+		// unavailable state at the time of the call, and vkGetQueryPoolResults returns
+		// VK_NOT_READY. However, availability state is still written to pData for those
+		// queries if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
+		auto &query = pool[i];
+
+		if(flags & VK_QUERY_RESULT_WAIT_BIT) // Must wait for query to finish
 		{
-			auto prevState = state.exchange(FINISHED);
-			ASSERT(prevState == ACTIVE);
-			finished.signal();
-		}
-	}
-
-	Query::Data Query::getData() const
-	{
-		Data out;
-		out.state = state;
-		out.value = value;
-		return out;
-	}
-
-	VkQueryType Query::getType() const
-	{
-		return type;
-	}
-
-	void Query::wait()
-	{
-		finished.wait();
-	}
-
-	void Query::set(int64_t v)
-	{
-		value = v;
-	}
-
-	void Query::add(int64_t v)
-	{
-		value += v;
-	}
-
-	QueryPool::QueryPool(const VkQueryPoolCreateInfo* pCreateInfo, void* mem) :
-		pool(reinterpret_cast<Query*>(mem)), type(pCreateInfo->queryType),
-		count(pCreateInfo->queryCount)
-	{
-		// According to the Vulkan spec, section 34.1. Features:
-		// "pipelineStatisticsQuery specifies whether the pipeline statistics
-		//  queries are supported. If this feature is not enabled, queries of
-		//  type VK_QUERY_TYPE_PIPELINE_STATISTICS cannot be created, and
-		//  none of the VkQueryPipelineStatisticFlagBits bits can be set in the
-		//  pipelineStatistics member of the VkQueryPoolCreateInfo structure."
-		if(type == VK_QUERY_TYPE_PIPELINE_STATISTICS)
-		{
-			UNIMPLEMENTED("pCreateInfo->queryType");
+			query.wait();
 		}
 
-		// Construct all queries
-		for(uint32_t i = 0; i < count; i++)
+		const auto current = query.getData();
+
+		bool writeResult = true;
+		if(current.state == Query::ACTIVE)
 		{
-			new (&pool[i]) Query();
+			result = VK_NOT_READY;
+			writeResult = (flags & VK_QUERY_RESULT_PARTIAL_BIT); // Allow writing partial results
 		}
-	}
 
-	void QueryPool::destroy(const VkAllocationCallbacks* pAllocator)
-	{
-		vk::deallocate(pool, pAllocator);
-	}
-
-	size_t QueryPool::ComputeRequiredAllocationSize(const VkQueryPoolCreateInfo* pCreateInfo)
-	{
-		return sizeof(Query) * pCreateInfo->queryCount;
-	}
-
-	VkResult QueryPool::getResults(uint32_t firstQuery, uint32_t queryCount, size_t dataSize,
-	                               void* pData, VkDeviceSize stride, VkQueryResultFlags flags) const
-	{
-		// dataSize must be large enough to contain the result of each query
-		ASSERT(static_cast<size_t>(stride * queryCount) <= dataSize);
-
-		// The sum of firstQuery and queryCount must be less than or equal to the number of queries
-		ASSERT((firstQuery + queryCount) <= count);
-
-		VkResult result = VK_SUCCESS;
-		uint8_t* data = static_cast<uint8_t*>(pData);
-		for(uint32_t i = firstQuery; i < (firstQuery + queryCount); i++, data += stride)
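+		// Each result is written as a 64-bit or 32-bit value, as selected by VK_QUERY_RESULT_64_BIT.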
+		if(flags & VK_QUERY_RESULT_64_BIT)
 		{
-			// If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are both not set
-			// then no result values are written to pData for queries that are in the
-			// unavailable state at the time of the call, and vkGetQueryPoolResults returns
-			// VK_NOT_READY. However, availability state is still written to pData for those
-			// queries if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
-			auto &query = pool[i];
-
-			if(flags & VK_QUERY_RESULT_WAIT_BIT) // Must wait for query to finish
+			uint64_t* result64 = reinterpret_cast<uint64_t*>(data);
+			if(writeResult)
 			{
-				query.wait();
+				result64[0] = current.value;
 			}
-
-			const auto current = query.getData();
-
-			bool writeResult = true;
-			if(current.state == Query::ACTIVE)
+			if(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) // Output query availability
 			{
-				result = VK_NOT_READY;
-				writeResult = (flags & VK_QUERY_RESULT_PARTIAL_BIT); // Allow writing partial results
-			}
-
-			if(flags & VK_QUERY_RESULT_64_BIT)
-			{
-				uint64_t* result64 = reinterpret_cast<uint64_t*>(data);
-				if(writeResult)
-				{
-					result64[0] = current.value;
-				}
-				if(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) // Output query availablity
-				{
-					result64[1] = current.state;
-				}
-			}
-			else
-			{
-				uint32_t* result32 = reinterpret_cast<uint32_t*>(data);
-				if(writeResult)
-				{
-					result32[0] = static_cast<uint32_t>(current.value);
-				}
-				if(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) // Output query availablity
-				{
-					result32[1] = current.state;
-				}
+				result64[1] = current.state;
 			}
 		}
-
-		return result;
-	}
-
-	void QueryPool::begin(uint32_t query, VkQueryControlFlags flags)
-	{
-		ASSERT(query < count);
-
-		if(flags != 0)
+		else
 		{
-			UNIMPLEMENTED("flags");
-		}
-
-		pool[query].prepare(type);
-		pool[query].start();
-	}
-
-	void QueryPool::end(uint32_t query)
-	{
-		ASSERT(query < count);
-		pool[query].finish();
-	}
-
-	void QueryPool::reset(uint32_t firstQuery, uint32_t queryCount)
-	{
-		// The sum of firstQuery and queryCount must be less than or equal to the number of queries
-		ASSERT((firstQuery + queryCount) <= count);
-
-		for(uint32_t i = firstQuery; i < (firstQuery + queryCount); i++)
-		{
-			pool[i].reset();
+			uint32_t* result32 = reinterpret_cast<uint32_t*>(data);
+			if(writeResult)
+			{
+				result32[0] = static_cast<uint32_t>(current.value);
+			}
+			if(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) // Output query availability
+			{
+				result32[1] = current.state;
+			}
 		}
 	}
 
-	void QueryPool::writeTimestamp(uint32_t query)
-	{
-		ASSERT(query < count);
-		ASSERT(type == VK_QUERY_TYPE_TIMESTAMP);
+	return result;
+}
 
-		pool[query].set(std::chrono::time_point_cast<std::chrono::nanoseconds>(
-			std::chrono::system_clock::now()).time_since_epoch().count());
+void QueryPool::begin(uint32_t query, VkQueryControlFlags flags)
+{
+	ASSERT(query < count);
+
+	if(flags != 0)
+	{
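+		// VK_QUERY_CONTROL_PRECISE_BIT is the only defined flag, and it is not supported yet.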
+		UNIMPLEMENTED("flags");
 	}
-} // namespace vk
+
+	pool[query].prepare(type);
+	pool[query].start();
+}
+
+void QueryPool::end(uint32_t query)
+{
+	ASSERT(query < count);
+	pool[query].finish();
+}
+
+void QueryPool::reset(uint32_t firstQuery, uint32_t queryCount)
+{
+	// The sum of firstQuery and queryCount must be less than or equal to the number of queries
+	ASSERT((firstQuery + queryCount) <= count);
+
+	for(uint32_t i = firstQuery; i < (firstQuery + queryCount); i++)
+	{
+		pool[i].reset();
+	}
+}
+
+void QueryPool::writeTimestamp(uint32_t query)
+{
+	ASSERT(query < count);
+	ASSERT(type == VK_QUERY_TYPE_TIMESTAMP);
+
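+	// Store the current time as a nanosecond count since the system clock's epoch.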
+	pool[query].set(std::chrono::time_point_cast<std::chrono::nanoseconds>(
+		std::chrono::system_clock::now()).time_since_epoch().count());
+}
+
+}  // namespace vk
diff --git a/src/Vulkan/VkQueryPool.hpp b/src/Vulkan/VkQueryPool.hpp
index 5ad3115..6e887b1 100644
--- a/src/Vulkan/VkQueryPool.hpp
+++ b/src/Vulkan/VkQueryPool.hpp
@@ -24,8 +24,7 @@
 #include <condition_variable>
 #include <mutex>
 
-namespace vk
-{
+namespace vk {
 
 class Query
 {
@@ -119,6 +118,6 @@
 	return QueryPool::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_QUERY_POOL_HPP_
diff --git a/src/Vulkan/VkQueue.cpp b/src/Vulkan/VkQueue.cpp
index 25e30e9..4c5f798 100644
--- a/src/Vulkan/VkQueue.cpp
+++ b/src/Vulkan/VkQueue.cpp
@@ -26,8 +26,7 @@
 
 #include <cstring>
 
-namespace
-{
+namespace {
 
 VkSubmitInfo* DeepCopySubmitInfo(uint32_t submitCount, const VkSubmitInfo* pSubmits)
 {
@@ -74,10 +73,9 @@
 	return submits;
 }
 
-} // anonymous namespace
+}  // anonymous namespace
 
-namespace vk
-{
+namespace vk {
 
 Queue::Queue(Device* device, marl::Scheduler *scheduler) : device(device)
 {
@@ -240,4 +238,4 @@
 }
 #endif
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkQueue.hpp b/src/Vulkan/VkQueue.hpp
index cf1f955..e5c600e 100644
--- a/src/Vulkan/VkQueue.hpp
+++ b/src/Vulkan/VkQueue.hpp
@@ -22,19 +22,16 @@
 
 #include "System/Synchronization.hpp"
 
-namespace marl
-{
-	class Scheduler;
-}
+namespace marl { class Scheduler; }
 
-namespace sw
-{
-	class Context;
-	class Renderer;
-}
+namespace sw {
 
-namespace vk
-{
+class Context;
+class Renderer;
+
+}  // namespace sw
+
+namespace vk {
 
 class Device;
 class Fence;
@@ -85,6 +82,6 @@
 	return reinterpret_cast<Queue*>(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_QUEUE_HPP_
diff --git a/src/Vulkan/VkRenderPass.cpp b/src/Vulkan/VkRenderPass.cpp
index 2a360c0..27de7a6 100644
--- a/src/Vulkan/VkRenderPass.cpp
+++ b/src/Vulkan/VkRenderPass.cpp
@@ -16,8 +16,7 @@
 #include "VkStringify.hpp"
 #include <cstring>
 
-namespace vk
-{
+namespace vk {
 
 RenderPass::RenderPass(const VkRenderPassCreateInfo* pCreateInfo, void* mem) :
 	attachmentCount(pCreateInfo->attachmentCount),
@@ -221,4 +220,4 @@
 		attachmentViewMasks[attachment] |= viewMasks[subpass];
 }
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkRenderPass.hpp b/src/Vulkan/VkRenderPass.hpp
index 338c8aa..70cf2a8 100644
--- a/src/Vulkan/VkRenderPass.hpp
+++ b/src/Vulkan/VkRenderPass.hpp
@@ -19,8 +19,7 @@
 
 #include <vector>
 
-namespace vk
-{
+namespace vk {
 
 class RenderPass : public Object<RenderPass, VkRenderPass>
 {
@@ -101,6 +100,6 @@
 	return RenderPass::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_RENDER_PASS_HPP_
\ No newline at end of file
diff --git a/src/Vulkan/VkSampler.cpp b/src/Vulkan/VkSampler.cpp
index d80f699..c19515b 100644
--- a/src/Vulkan/VkSampler.cpp
+++ b/src/Vulkan/VkSampler.cpp
@@ -15,9 +15,8 @@
 
 #include "VkSampler.hpp"
 
-namespace vk
-{
+namespace vk {
 
 std::atomic<uint32_t> Sampler::nextID(1);
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkSampler.hpp b/src/Vulkan/VkSampler.hpp
index af9da6d..042a7c7 100644
--- a/src/Vulkan/VkSampler.hpp
+++ b/src/Vulkan/VkSampler.hpp
@@ -22,8 +22,7 @@
 
 #include <atomic>
 
-namespace vk
-{
+namespace vk {
 
 class Sampler : public Object<Sampler, VkSampler>
 {
@@ -124,6 +123,6 @@
 	return SamplerYcbcrConversion::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_SAMPLER_HPP_
\ No newline at end of file
diff --git a/src/Vulkan/VkSemaphore.cpp b/src/Vulkan/VkSemaphore.cpp
index bb1878f..eb7231e 100644
--- a/src/Vulkan/VkSemaphore.cpp
+++ b/src/Vulkan/VkSemaphore.cpp
@@ -36,8 +36,7 @@
 #include <mutex>
 #include <utility>
 
-namespace vk
-{
+namespace vk {
 
 // An implementation of VkSemaphore based on Marl primitives.
 class Semaphore::Impl
diff --git a/src/Vulkan/VkSemaphore.hpp b/src/Vulkan/VkSemaphore.hpp
index 53ad4c2..742fc8a 100644
--- a/src/Vulkan/VkSemaphore.hpp
+++ b/src/Vulkan/VkSemaphore.hpp
@@ -22,8 +22,7 @@
 #include <zircon/types.h>
 #endif
 
-namespace vk
-{
+namespace vk {
 
 class Semaphore : public Object<Semaphore, VkSemaphore>
 {
@@ -64,6 +63,6 @@
 	return Semaphore::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_SEMAPHORE_HPP_
diff --git a/src/Vulkan/VkSemaphoreExternalFuchsia.hpp b/src/Vulkan/VkSemaphoreExternalFuchsia.hpp
index 9e3e23b..4da9d39 100644
--- a/src/Vulkan/VkSemaphoreExternalFuchsia.hpp
+++ b/src/Vulkan/VkSemaphoreExternalFuchsia.hpp
@@ -24,8 +24,7 @@
 // VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TEMP_ZIRCON_EVENT_BIT_FUCHSIA
 // which is not official yet but used by Fuchsia at the moment.
 
-namespace vk
-{
+namespace vk {
 
 class Semaphore::External {
 public:
diff --git a/src/Vulkan/VkSemaphoreExternalLinux.hpp b/src/Vulkan/VkSemaphoreExternalLinux.hpp
index 89ab2dc..9ba6c65 100644
--- a/src/Vulkan/VkSemaphoreExternalLinux.hpp
+++ b/src/Vulkan/VkSemaphoreExternalLinux.hpp
@@ -127,8 +127,7 @@
 	bool signaled = false;
 };
 
-namespace vk
-{
+namespace vk {
 
 class Semaphore::External {
 public:
diff --git a/src/Vulkan/VkSemaphoreExternalNone.hpp b/src/Vulkan/VkSemaphoreExternalNone.hpp
index d056e6c..1b64003 100644
--- a/src/Vulkan/VkSemaphoreExternalNone.hpp
+++ b/src/Vulkan/VkSemaphoreExternalNone.hpp
@@ -15,8 +15,7 @@
 #ifndef VK_SEMAPHORE_EXTERNAL_NONE_H_
 #define VK_SEMAPHORE_EXTERNAL_NONE_H_
 
-namespace vk
-{
+namespace vk {
 
 // Empty external semaphore implementation.
 class Semaphore::External {
diff --git a/src/Vulkan/VkShaderModule.cpp b/src/Vulkan/VkShaderModule.cpp
index ce5e831..78a5ed4 100644
--- a/src/Vulkan/VkShaderModule.cpp
+++ b/src/Vulkan/VkShaderModule.cpp
@@ -16,8 +16,7 @@
 
 #include <cstring>
 
-namespace vk
-{
+namespace vk {
 
 std::atomic<uint32_t> ShaderModule::serialCounter(1);    // Start at 1, 0 is invalid shader.
 
@@ -38,4 +37,4 @@
 	return pCreateInfo->codeSize;
 }
 
-} // namespace vk
+}  // namespace vk
diff --git a/src/Vulkan/VkShaderModule.hpp b/src/Vulkan/VkShaderModule.hpp
index ba30b59..c7e5ff9 100644
--- a/src/Vulkan/VkShaderModule.hpp
+++ b/src/Vulkan/VkShaderModule.hpp
@@ -20,13 +20,9 @@
 #include <atomic>
 #include <vector>
 
-namespace rr
-{
-	class Routine;
-}
+namespace rr { class Routine; }
 
-namespace vk
-{
+namespace vk {
 
 class ShaderModule : public Object<ShaderModule, VkShaderModule>
 {
@@ -55,6 +51,6 @@
 	return ShaderModule::Cast(object);
 }
 
-} // namespace vk
+}  // namespace vk
 
 #endif // VK_SHADER_MODULE_HPP_
diff --git a/src/WSI/VkSurfaceKHR.cpp b/src/WSI/VkSurfaceKHR.cpp
index e1f44e9..438d559 100644
--- a/src/WSI/VkSurfaceKHR.cpp
+++ b/src/WSI/VkSurfaceKHR.cpp
@@ -40,8 +40,7 @@
 
 }
 
-namespace vk
-{
+namespace vk {
 
 VkResult PresentImage::allocateImage(VkDevice device, const VkImageCreateInfo& createInfo)
 {
@@ -218,4 +217,4 @@
 	return VK_SUCCESS;
 }
 
-}
\ No newline at end of file
+}  // namespace vk
\ No newline at end of file
diff --git a/src/WSI/VkSurfaceKHR.hpp b/src/WSI/VkSurfaceKHR.hpp
index f14d98b..d43b243 100644
--- a/src/WSI/VkSurfaceKHR.hpp
+++ b/src/WSI/VkSurfaceKHR.hpp
@@ -21,8 +21,7 @@
 
 #include <vector>
 
-namespace vk
-{
+namespace vk {
 
 enum PresentImageStatus
 {
@@ -105,6 +104,6 @@
 	return SurfaceKHR::Cast(object);
 }
 
-}
+}  // namespace vk
 
 #endif //SWIFTSHADER_VKSURFACEKHR_HPP_
diff --git a/src/WSI/VkSwapchainKHR.cpp b/src/WSI/VkSwapchainKHR.cpp
index ba97ebc..c318b13 100644
--- a/src/WSI/VkSwapchainKHR.cpp
+++ b/src/WSI/VkSwapchainKHR.cpp
@@ -22,8 +22,7 @@
 #include <algorithm>
 #include <cstring>
 
-namespace vk
-{
+namespace vk {
 
 SwapchainKHR::SwapchainKHR(const VkSwapchainCreateInfoKHR *pCreateInfo, void *mem) :
 	surface(vk::Cast(pCreateInfo->surface)),
@@ -214,4 +213,4 @@
 	return result;
 }
 
-}
\ No newline at end of file
+}  // namespace vk
\ No newline at end of file
diff --git a/src/WSI/VkSwapchainKHR.hpp b/src/WSI/VkSwapchainKHR.hpp
index 1bfdb93..3487cd9 100644
--- a/src/WSI/VkSwapchainKHR.hpp
+++ b/src/WSI/VkSwapchainKHR.hpp
@@ -22,8 +22,7 @@
 
 #include <vector>
 
-namespace vk
-{
+namespace vk {
 
 class Fence;
 class Semaphore;
@@ -63,6 +62,6 @@
 	return SwapchainKHR::Cast(object);
 }
 
-}
+}  // namespace vk
 
 #endif //SWIFTSHADER_VKSWAPCHAINKHR_HPP