Merge commit 'dc209330cece9e680b7ae0677835e513f91c99f2' into cloud-android-current-release

Merges in changes from the master branch. These include eliminating the
generation of MMX instructions, emulating Subzero intrinsics for ARM,
and a floating-point filtering extension using glHint.

Change-Id: I98eadb7c8b48284f39b8f2933f31427f38bb286a
diff --git a/BUILD.gn b/BUILD.gn
index 52a78e8..7530b91 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -30,46 +30,28 @@
       "_CRT_SECURE_NO_DEPRECATE",
       "NOMINMAX",
       "_WINDLL",
+      "NO_SANITIZE_FUNCTION=",
+      "ANGLE_DISABLE_TRACE",
     ]
-
-    if (is_debug) {
-      cflags += [ "/RTC1" ]  # Run-Time Error Checks
-    } else {
-      defines += [ "ANGLE_DISABLE_TRACE" ]
-    }
   } else {
     cflags = [
       "-std=c++11",
-      "-Wall",
       "-fno-exceptions",
       "-fno-operator-names",
+      "-ffunction-sections",
+      "-fdata-sections",
+      "-fomit-frame-pointer",
+      "-Os",
     ]
 
     defines += [
       "__STDC_CONSTANT_MACROS",
       "__STDC_LIMIT_MACROS",
+      "NO_SANITIZE_FUNCTION=__attribute__((no_sanitize(\"function\")))",
+      "ANGLE_DISABLE_TRACE",
+      "NDEBUG",
     ]
 
-    if (is_debug) {
-      cflags += [
-        "-g",
-        "-g3",
-      ]
-    } else {  # Release
-      # All Release builds use function/data sections to make the shared libraries smaller
-      cflags += [
-        "-ffunction-sections",
-        "-fdata-sections",
-        "-fomit-frame-pointer",
-        "-Os",
-      ]
-
-      defines += [
-        "ANGLE_DISABLE_TRACE",
-        "NDEBUG",
-      ]
-    }
-
     if (target_cpu == "x64") {  # 64 bit version
       cflags += [
         "-m64",
@@ -89,11 +71,10 @@
         "-Wl,--hash-style=both",
         "-Wl,--gc-sections",
       ]
+
       # A bug in the gold linker prevents using ICF on 32-bit (crbug.com/729532)
       if (use_gold && target_cpu == "x86") {
-        ldflags += [
-          "-Wl,--icf=none",
-        ]
+        ldflags += [ "-Wl,--icf=none" ]
       }
     }
   }
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0841d8d..619111f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,41 +2,6 @@
 
 project(SwiftShader C CXX)
 
-if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "The type of build: Debug Release MinSizeRel RelWithDebInfo." FORCE)
-endif()
-set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS Debug Release MinSizeRel RelWithDebInfo)
-
-option(BUILD_EGL "Build the EGL library" 1)
-if(WIN32)
-    option(BUILD_GL32 "Build the OpenGL32 library" 1)
-endif()
-option(BUILD_GLESv2 "Build the OpenGL ES 2 library" 1)
-option(BUILD_GLES_CM "Build the OpenGL ES 1.1 library" 1)
-
-option(USE_GROUP_SOURCES "Group the source files in a folder tree for Visual Studio" 1)
-
-option(BUILD_SAMPLES "Build sample programs" 1)
-option(BUILD_TESTS "Build test programs" 1)
-
-set(REACTOR_BACKEND "LLVM" CACHE STRING "JIT compiler back-end used by Reactor")
-set_property(CACHE REACTOR_BACKEND PROPERTY STRINGS LLVM Subzero)
-
-# LLVM disallows calling cmake . from the main LLVM dir, the reason is that
-# it builds header files that could overwrite the orignal ones. Here we
-# want to include LLVM as a subdirectory and even though it wouldn't cause
-# the problem, if cmake . is called from the main dir, the condition that
-# LLVM checkes, "CMAKE_SOURCE_DIR == CMAKE_BINARY_DIR" will be true. So we
-# disallow it ourselves too to. In addition if there are remining CMakeFiles
-# and CMakeCache in the directory, cmake .. from a subdirectory will still
-# try to build from the main directory so we instruct users to delete these
-# files when they get the error.
-if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR)
-    message(FATAL_ERROR "In source builds are not allowed by LLVM, please create a build/ directory and build from there. You may have to delete the CMakeCache.txt file and CMakeFiles directory that are next to the CMakeLists.txt.")
-endif()
-
-set_property(GLOBAL PROPERTY USE_FOLDERS ON)
-
 ###########################################################
 # Detect system
 ###########################################################
@@ -66,6 +31,51 @@
 set(CMAKE_MACOSX_RPATH ON)
 
 ###########################################################
+# Options
+###########################################################
+
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "The type of build: Debug Release MinSizeRel RelWithDebInfo." FORCE)
+endif()
+set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS Debug Release MinSizeRel RelWithDebInfo)
+
+option(BUILD_EGL "Build the EGL library" 1)
+if(WIN32)
+    option(BUILD_GL32 "Build the OpenGL32 library" 1)
+endif()
+option(BUILD_GLESv2 "Build the OpenGL ES 2 library" 1)
+option(BUILD_GLES_CM "Build the OpenGL ES 1.1 library" 1)
+
+option(USE_GROUP_SOURCES "Group the source files in a folder tree for Visual Studio" 1)
+
+option(BUILD_SAMPLES "Build sample programs" 1)
+option(BUILD_TESTS "Build test programs" 1)
+
+if(ARCH STREQUAL "arm")
+    set(DEFAULT_REACTOR_BACKEND "Subzero")
+else()
+    set(DEFAULT_REACTOR_BACKEND "LLVM")
+endif()
+# Expand the default with ${...}; a bare name would be cached as a literal string.
+set(REACTOR_BACKEND "${DEFAULT_REACTOR_BACKEND}" CACHE STRING "JIT compiler back-end used by Reactor")
+set_property(CACHE REACTOR_BACKEND PROPERTY STRINGS LLVM Subzero)
+
+# LLVM disallows calling cmake . from the main LLVM dir, the reason is that
+# it builds header files that could overwrite the original ones. Here we
+# want to include LLVM as a subdirectory and even though it wouldn't cause
+# the problem, if cmake . is called from the main dir, the condition that
+# LLVM checks, "CMAKE_SOURCE_DIR == CMAKE_BINARY_DIR" will be true. So we
+# disallow it ourselves too. In addition if there are remaining CMakeFiles
+# and CMakeCache in the directory, cmake .. from a subdirectory will still
+# try to build from the main directory so we instruct users to delete these
+# files when they get the error.
+if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR)
+    message(FATAL_ERROR "In source builds are not allowed by LLVM, please create a build/ directory and build from there. You may have to delete the CMakeCache.txt file and CMakeFiles directory that are next to the CMakeLists.txt.")
+endif()
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+###########################################################
 # Convenience macros
 ###########################################################
 
@@ -854,6 +864,7 @@
     INCLUDE_DIRECTORIES "${COMMON_INCLUDE_DIR}"
     POSITION_INDEPENDENT_CODE 1
     FOLDER "Core"
+    COMPILE_DEFINITIONS "NO_SANITIZE_FUNCTION=;"
 )
 target_link_libraries(SwiftShader ${OS_LIBS})
 
@@ -892,7 +903,7 @@
     set_target_properties(libEGL PROPERTIES
         INCLUDE_DIRECTORIES "${OPENGL_INCLUDE_DIR}"
         FOLDER "OpenGL"
-        COMPILE_DEFINITIONS "EGL_EGLEXT_PROTOTYPES; EGLAPI=;"
+        COMPILE_DEFINITIONS "EGL_EGLEXT_PROTOTYPES; EGLAPI=; NO_SANITIZE_FUNCTION=;"
         PREFIX ""
     )
     set_target_export_map(libEGL ${SOURCE_DIR}/OpenGL/libEGL)
@@ -922,7 +933,7 @@
     set_target_properties(libGLESv2 PROPERTIES
         INCLUDE_DIRECTORIES "${OPENGL_INCLUDE_DIR}"
         FOLDER "OpenGL"
-        COMPILE_DEFINITIONS "GL_GLEXT_PROTOTYPES; GL_API=; GL_APICALL=;"
+        COMPILE_DEFINITIONS "GL_GLEXT_PROTOTYPES; GL_API=; GL_APICALL=; NO_SANITIZE_FUNCTION=;"
         PREFIX ""
     )
     set_target_export_map(libGLESv2 ${SOURCE_DIR}/OpenGL/libGLESv2)
diff --git a/DEPS b/DEPS
deleted file mode 100644
index 43434a2..0000000
--- a/DEPS
+++ /dev/null
@@ -1,16 +0,0 @@
-# This file is used to manage SwiftShader's dependencies in the Chromium src
-# repo. It is used by gclient to determine what version of each dependency to
-# check out, and where.
-
-use_relative_paths = True
-
-vars = {
-  'chromium_git': 'https://chromium.googlesource.com',
-  # Current revision of subzero.
-  'subzero_revision': 'fb705a6d55003b2c32772ae49e25b0babcff5acc',
-}
-
-deps = {
-  'third_party/pnacl-subzero':
-    Var('chromium_git') + '/native_client/pnacl-subzero@' +  Var('subzero_revision'),
-}
diff --git a/README.md b/README.md
index ac2edf2..ff0b9de 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,6 @@
-# SwiftShader [![Build Status](https://travis-ci.org/google/swiftshader.svg?branch=master)](https://travis-ci.org/google/swiftshader) [![Build status](https://ci.appveyor.com/api/projects/status/yrmyvb34j22jg1uj?svg=true)](https://ci.appveyor.com/project/c0d1f1ed/swiftshader)

+# SwiftShader

 

------------------------------------------------------------------------------------------------------------

-

-Introduction

-------------

+[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![Build Status](https://travis-ci.org/google/swiftshader.svg?branch=master)](https://travis-ci.org/google/swiftshader) [![Build status](https://ci.appveyor.com/api/projects/status/yrmyvb34j22jg1uj?svg=true)](https://ci.appveyor.com/project/c0d1f1ed/swiftshader)

 

 SwiftShader is a high-performance CPU-based implementation of the OpenGL ES and Direct3D 9 graphics APIs<sup>1</sup><sup>2</sup>. Its goal is to provide hardware independence for advanced 3D graphics.

 

diff --git a/extensions/CHROMIUM_texture_filtering_hint.txt b/extensions/CHROMIUM_texture_filtering_hint.txt
new file mode 100644
index 0000000..38af6e0
--- /dev/null
+++ b/extensions/CHROMIUM_texture_filtering_hint.txt
@@ -0,0 +1,85 @@
+Name
+
+    CHROMIUM_texture_filtering_hint
+
+Name Strings
+
+    GL_CHROMIUM_texture_filtering_hint
+
+Contributors
+
+    Alexis Hetu, Google Inc.
+    Nicolas Capens, Google Inc.
+    Shannon Woods, Google Inc.
+
+Contact
+
+    Alexis Hetu, Google Inc. (sugoi 'at' chromium 'dot' org)
+
+Version
+
+    Last Modified Date: July 18, 2017
+
+Dependencies
+
+    This extension is written against the OpenGL ES 2.0 specification.
+
+    OpenGL ES 2.0 is required.
+
+Overview
+
+    This extension defines a way to request high precision texture filtering
+    using a new value to Hint.
+
+    When this extension is enabled, TEXTURE_FILTERING_HINT_CHROMIUM can be used
+    by the implementation as a means to distinguish between a performance
+    focused implementation, using FASTEST, or a precision focused
+    implementation, using NICEST.
+
+    Like other hints, either option is spec compliant and the behavior of
+    DONT_CARE is implementation specific.
+
+New Tokens
+
+    Accepted by the <pname> parameter of GetIntegerv, GetFloatv and GetBooleanv
+    and by the <target> parameter of Hint:
+
+    TEXTURE_FILTERING_HINT_CHROMIUM      0x8AF0
+
+New Procedures and Functions
+
+    None.
+
+Errors
+
+    None.
+
+New State
+
+    None.
+
+Issues
+
+    1) When does the hint take effect?
+
+       At the time of the next draw call, and all subsequent draw calls.
+  
+    2) Does the first draw call after the filtering hint is changed use the
+       updated filtering method?
+
+       Yes
+ 
+    3) Can I switch it back and forth between every draw call, multiple times
+       during a single frame?
+
+       Yes
+ 
+    4) Do program objects which were created before the filtering hint was
+       changed and which contain sampling instructions use the filtering method
+       from when they were created, or the method at the time of draw call?
+
+       At the time of draw call.
+
+Revision History
+
+    2/7/2014    Documented the extension
diff --git a/include/EGL/eglplatform.h b/include/EGL/eglplatform.h
index 1284089..a3b7234 100644
--- a/include/EGL/eglplatform.h
+++ b/include/EGL/eglplatform.h
@@ -85,8 +85,7 @@
 
 #elif defined(__ANDROID__) || defined(ANDROID)
 
-#include <android/native_window.h>
-
+struct ANativeWindow;
 struct egl_native_pixmap_t;
 
 typedef struct ANativeWindow*           EGLNativeWindowType;
diff --git a/src/Android.mk b/src/Android.mk
index eac01d9..ec3fc96 100644
--- a/src/Android.mk
+++ b/src/Android.mk
@@ -108,12 +108,16 @@
 	-Wno-implicit-exception-spec-mismatch \
 	-Wno-overloaded-virtual \
 	-Wno-non-virtual-dtor \
+	-Wno-attributes \
+	-Wno-unknown-attributes \
+	-Wno-unknown-warning-option \
 	-fno-operator-names \
 	-msse2 \
 	-D__STDC_CONSTANT_MACROS \
 	-D__STDC_LIMIT_MACROS \
 	-DANDROID_PLATFORM_SDK_VERSION=$(PLATFORM_SDK_VERSION) \
-	-std=c++11
+	-std=c++11 \
+	-DNO_SANITIZE_FUNCTION=
 
 ifneq (16,${PLATFORM_SDK_VERSION})
 COMMON_CFLAGS += -Xclang -fuse-init-array
diff --git a/src/Common/BUILD.gn b/src/Common/BUILD.gn
index 2fd4885..6b23321 100644
--- a/src/Common/BUILD.gn
+++ b/src/Common/BUILD.gn
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import("../swiftshader.gni")
+
 # Need a separate config to ensure the warnings are added to the end.
 config("swiftshader_common_private_config") {
   if (is_win) {
@@ -21,11 +23,10 @@
     ]
   } else {
     cflags = [ "-msse2" ]
-    defines = [ "LOG_TAG=\"swiftshader_common\"" ]
   }
 }
 
-source_set("swiftshader_common") {
+swiftshader_source_set("swiftshader_common") {
   sources = [
     "CPUID.cpp",
     "Configurator.cpp",
@@ -39,5 +40,5 @@
     "Timer.cpp",
   ]
 
-  configs += [ ":swiftshader_common_private_config" ]
+  configs = [ ":swiftshader_common_private_config" ]
 }
diff --git a/src/Common/DebugAndroid.hpp b/src/Common/DebugAndroid.hpp
index ac937e0..6dfb61d 100644
--- a/src/Common/DebugAndroid.hpp
+++ b/src/Common/DebugAndroid.hpp
@@ -16,6 +16,7 @@
 #define DebugAndroid_hpp
 
 #include <cutils/log.h>
+#include <cassert>
 
 // On Android Virtual Devices we heavily depend on logging, even in
 // production builds. We do this because AVDs are components of larger
diff --git a/src/D3D8/Direct3DDevice8.cpp b/src/D3D8/Direct3DDevice8.cpp
index 6294fbb..7f6e769 100644
--- a/src/D3D8/Direct3DDevice8.cpp
+++ b/src/D3D8/Direct3DDevice8.cpp
@@ -365,7 +365,7 @@
 
 		for(unsigned int i = 0; i < count; i++)
 		{
-			sw::SliceRect clearRect(rects[i].x1, rects[i].y1, rects[i].x2, rects[i].y2, 0);
+			sw::Rect clearRect(rects[i].x1, rects[i].y1, rects[i].x2, rects[i].y2);
 
 			clearRect.clip(viewport.X, viewport.Y, viewport.X + viewport.Width, viewport.Y + viewport.Height);
 
diff --git a/src/D3D9/Direct3DDevice9.cpp b/src/D3D9/Direct3DDevice9.cpp
index 9b68c47..4be7955 100644
--- a/src/D3D9/Direct3DDevice9.cpp
+++ b/src/D3D9/Direct3DDevice9.cpp
@@ -396,7 +396,7 @@
 
 		for(unsigned int i = 0; i < count; i++)
 		{
-			sw::SliceRect clearRect(rects[i].x1, rects[i].y1, rects[i].x2, rects[i].y2, 0);
+			sw::Rect clearRect(rects[i].x1, rects[i].y1, rects[i].x2, rects[i].y2);
 
 			clearRect.clip(viewport.X, viewport.Y, viewport.X + viewport.Width, viewport.Y + viewport.Height);
 
diff --git a/src/Main/BUILD.gn b/src/Main/BUILD.gn
index 150d559..dd85696 100644
--- a/src/Main/BUILD.gn
+++ b/src/Main/BUILD.gn
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import("../swiftshader.gni")
+
 # Need a separate config to ensure the warnings are added to the end.
 config("swiftshader_main_private_config") {
   if (is_win) {
@@ -28,11 +30,12 @@
     }
   } else {
     cflags = [ "-msse2" ]
-    defines = [ "LOG_TAG=\"swiftshader_main\"" ]
+    defines =
+        [ "NO_SANITIZE_FUNCTION=__attribute__((no_sanitize(\"function\")))" ]
   }
 }
 
-source_set("swiftshader_main") {
+swiftshader_source_set("swiftshader_main") {
   deps = [
     "../Common:swiftshader_common",
   ]
@@ -59,11 +62,10 @@
   }
 
   if (is_win) {
-    configs -= [ "//build/config/win:unicode" ]
     libs = [ "dxguid.lib" ]  # For FrameBufferDD
   }
 
-  configs += [ ":swiftshader_main_private_config" ]
+  configs = [ ":swiftshader_main_private_config" ]
 
   include_dirs = [
     "..",
diff --git a/src/Main/FrameBufferAndroid.cpp b/src/Main/FrameBufferAndroid.cpp
index 7340921..49957c8 100644
--- a/src/Main/FrameBufferAndroid.cpp
+++ b/src/Main/FrameBufferAndroid.cpp
@@ -15,6 +15,7 @@
 #include "FrameBufferAndroid.hpp"
 #include "GrallocAndroid.hpp"
 
+#include <system/window.h>
 #include <cutils/log.h>
 
 namespace sw
diff --git a/src/Main/FrameBufferAndroid.hpp b/src/Main/FrameBufferAndroid.hpp
index 7e34ea2..4400188 100644
--- a/src/Main/FrameBufferAndroid.hpp
+++ b/src/Main/FrameBufferAndroid.hpp
@@ -18,8 +18,8 @@
 #include "Main/FrameBuffer.hpp"
 #include "Common/Debug.hpp"
 
-#include <hardware/gralloc.h>
-#include <system/window.h>
+struct ANativeWindow;
+struct ANativeWindowBuffer;
 
 namespace sw
 {
diff --git a/src/Main/FrameBufferX11.cpp b/src/Main/FrameBufferX11.cpp
index 12b83e4..a065198 100644
--- a/src/Main/FrameBufferX11.cpp
+++ b/src/Main/FrameBufferX11.cpp
@@ -123,7 +123,7 @@
 
 	void FrameBufferX11::unlock()
 	{
-		locked = 0;
+		locked = nullptr;
 	}
 
 	void FrameBufferX11::blit(void *source, const Rect *sourceRect, const Rect *destRect, Format sourceFormat, size_t sourceStride)
@@ -143,7 +143,7 @@
 	}
 }
 
-sw::FrameBuffer *createFrameBuffer(void *display, Window window, int width, int height)
+NO_SANITIZE_FUNCTION sw::FrameBuffer *createFrameBuffer(void *display, Window window, int width, int height)
 {
 	return new sw::FrameBufferX11((::Display*)display, window, width, height);
 }
diff --git a/src/OpenGL/common/BUILD.gn b/src/OpenGL/common/BUILD.gn
index cb58ab8..9cc22bf 100644
--- a/src/OpenGL/common/BUILD.gn
+++ b/src/OpenGL/common/BUILD.gn
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import("../../swiftshader.gni")
+
 # Need a separate config to ensure the warnings are added to the end.
 config("swiftshader_opengl_common_private_config") {
   if (is_win) {
@@ -20,12 +22,10 @@
       "/wd4324",  # structure was padded due to alignment specifier
       "/wd5030",  # attribute is not recognized
     ]
-  } else {
-    defines = [ "LOG_TAG=\"swiftshader_opengl_common\"" ]
   }
 }
 
-source_set("swiftshader_opengl_common") {
+swiftshader_source_set("swiftshader_opengl_common") {
   sources = [
     "Image.cpp",
     "MatrixStack.cpp",
@@ -33,7 +33,7 @@
     "debug.cpp",
   ]
 
-  configs += [ ":swiftshader_opengl_common_private_config" ]
+  configs = [ ":swiftshader_opengl_common_private_config" ]
 
   include_dirs = [
     "..",
diff --git a/src/OpenGL/common/Image.cpp b/src/OpenGL/common/Image.cpp
index d3fe20e..f75f92f 100644
--- a/src/OpenGL/common/Image.cpp
+++ b/src/OpenGL/common/Image.cpp
@@ -263,10 +263,10 @@
 		for(int x = 0; x < width; x++)
 		{
 			unsigned int rgba = source1010102U[x];
-			dest16U[4 * x + 0] = (rgba & 0x00000FFC) >> 2;
-			dest16U[4 * x + 1] = (rgba & 0x003FF000) >> 12;
-			dest16U[4 * x + 2] = (rgba & 0xFFC00000) >> 22;
-			dest16U[4 * x + 3] = (rgba & 0x00000003);
+			dest16U[4 * x + 0] = (rgba & 0x000003FF);
+			dest16U[4 * x + 1] = (rgba & 0x000FFC00) >> 10;
+			dest16U[4 * x + 2] = (rgba & 0x3FF00000) >> 20;
+			dest16U[4 * x + 3] = (rgba & 0xC0000000) >> 30;
 		}
 	}
 
diff --git a/src/OpenGL/common/Image.hpp b/src/OpenGL/common/Image.hpp
index c2e7f53..e13b19e 100644
--- a/src/OpenGL/common/Image.hpp
+++ b/src/OpenGL/common/Image.hpp
@@ -22,7 +22,6 @@
 #include <GLES2/gl2ext.h>
 
 #if defined(__ANDROID__)
-#include <hardware/gralloc.h>
 #include <system/window.h>
 #include "../../Common/GrallocAndroid.hpp"
 #include "../../Common/DebugAndroid.hpp"
diff --git a/src/OpenGL/common/Object.cpp b/src/OpenGL/common/Object.cpp
index 1a4a7c8..b4d84c0 100644
--- a/src/OpenGL/common/Object.cpp
+++ b/src/OpenGL/common/Object.cpp
@@ -23,6 +23,7 @@
 namespace gl
 {
 #ifndef NDEBUG
+sw::MutexLock Object::instances_mutex;
 std::set<Object*> Object::instances;
 #endif
 
@@ -31,6 +32,7 @@
 	referenceCount = 0;
 
 	#ifndef NDEBUG
+		LockGuard instances_lock(instances_mutex);
 		instances.insert(this);
 	#endif
 }
@@ -40,6 +42,7 @@
 	ASSERT(referenceCount == 0);
 
 	#ifndef NDEBUG
+		LockGuard instances_lock(instances_mutex);
 		ASSERT(instances.find(this) != instances.end());   // Check for double deletion
 		instances.erase(this);
 	#endif
@@ -89,6 +92,7 @@
 {
 	~ObjectLeakCheck()
 	{
+		LockGuard instances_lock(Object::instances_mutex);
 		ASSERT(Object::instances.empty());   // Check for GL object leak at termination
 	}
 };
diff --git a/src/OpenGL/common/Object.hpp b/src/OpenGL/common/Object.hpp
index c6243ac..7d9a8fe 100644
--- a/src/OpenGL/common/Object.hpp
+++ b/src/OpenGL/common/Object.hpp
@@ -20,6 +20,7 @@
 #define gl_Object_hpp
 
 #include "common/debug.h"
+#include "Common/MutexLock.hpp"
 
 #include <set>
 
@@ -51,6 +52,7 @@
 
 #ifndef NDEBUG
 public:
+	static sw::MutexLock instances_mutex;
 	static std::set<Object*> instances;   // For leak checking
 #endif
 };
diff --git a/src/OpenGL/compiler/Android.mk b/src/OpenGL/compiler/Android.mk
index 3916255..5bca1fe 100644
--- a/src/OpenGL/compiler/Android.mk
+++ b/src/OpenGL/compiler/Android.mk
@@ -20,6 +20,9 @@
 	-Wno-unused-parameter \
 	-Wno-implicit-exception-spec-mismatch \
 	-Wno-overloaded-virtual \
+	-Wno-attributes \
+	-Wno-unknown-attributes \
+	-Wno-unknown-warning-option \
 	-fno-operator-names \
 	-msse2 \
 	-D__STDC_CONSTANT_MACROS \
diff --git a/src/OpenGL/compiler/BUILD.gn b/src/OpenGL/compiler/BUILD.gn
index 5da9390..3341e1e 100644
--- a/src/OpenGL/compiler/BUILD.gn
+++ b/src/OpenGL/compiler/BUILD.gn
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import("../../swiftshader.gni")
+
 # Need a separate config to ensure the warnings are added to the end.
 config("swiftshader_opengl_compiler_private_config") {
   if (is_win) {
@@ -28,7 +30,6 @@
     }
   } else {
     cflags = [ "-Wno-sign-compare" ]
-    defines = [ "LOG_TAG=\"swiftshader_opengl_compiler\"" ]
 
     if (!is_debug) {
       cflags += [ "-Wno-unused-variable" ]  # local variable is initialized but not referenced (variables only used in ASSERTS)
@@ -36,7 +37,7 @@
   }
 }
 
-source_set("swiftshader_opengl_compiler") {
+swiftshader_source_set("swiftshader_opengl_compiler") {
   deps = [
     "preprocessor:swiftshader_opengl_preprocessor",
   ]
@@ -73,7 +74,7 @@
     sources += [ "ossource_win.cpp" ]
   }
 
-  configs += [ ":swiftshader_opengl_compiler_private_config" ]
+  configs = [ ":swiftshader_opengl_compiler_private_config" ]
 
   include_dirs = [
     "..",
diff --git a/src/OpenGL/compiler/BaseTypes.h b/src/OpenGL/compiler/BaseTypes.h
index 58c0856..01f9948 100644
--- a/src/OpenGL/compiler/BaseTypes.h
+++ b/src/OpenGL/compiler/BaseTypes.h
@@ -369,6 +369,7 @@
 	EvqPosition,
 	EvqPointSize,
 	EvqInstanceID,
+	EvqVertexID,
 
 	// built-ins read by fragment shader
 	EvqFragCoord,
@@ -446,6 +447,7 @@
 	case EvqPosition:       return "Position";       break;
 	case EvqPointSize:      return "PointSize";      break;
 	case EvqInstanceID:     return "InstanceID";     break;
+	case EvqVertexID:       return "VertexID";       break;
 	case EvqFragCoord:      return "FragCoord";      break;
 	case EvqFrontFacing:    return "FrontFacing";    break;
 	case EvqFragColor:      return "FragColor";      break;
diff --git a/src/OpenGL/compiler/Initialize.cpp b/src/OpenGL/compiler/Initialize.cpp
index 1948a57..c374531 100644
--- a/src/OpenGL/compiler/Initialize.cpp
+++ b/src/OpenGL/compiler/Initialize.cpp
@@ -471,6 +471,7 @@
 		symbolTable.insert(COMMON_BUILTINS, *new TVariable(NewPoolTString("gl_Position"), TType(EbtFloat, EbpHigh, EvqPosition,    4)));
 		symbolTable.insert(COMMON_BUILTINS, *new TVariable(NewPoolTString("gl_PointSize"), TType(EbtFloat, EbpMedium, EvqPointSize,   1)));
 		symbolTable.insert(ESSL3_BUILTINS, *new TVariable(NewPoolTString("gl_InstanceID"), TType(EbtInt, EbpHigh, EvqInstanceID, 1)));
+		symbolTable.insert(ESSL3_BUILTINS, *new TVariable(NewPoolTString("gl_VertexID"), TType(EbtInt, EbpHigh, EvqVertexID, 1)));
 		break;
 	default: assert(false && "Language not supported");
 	}
diff --git a/src/OpenGL/compiler/OutputASM.cpp b/src/OpenGL/compiler/OutputASM.cpp
index 8a84692..1c70f6c 100644
--- a/src/OpenGL/compiler/OutputASM.cpp
+++ b/src/OpenGL/compiler/OutputASM.cpp
@@ -950,6 +950,7 @@
 			break;
 		case EOpVectorLogicalNot: if(visit == PostVisit) emit(sw::Shader::OPCODE_NOT, result, arg); break;
 		case EOpLogicalNot:       if(visit == PostVisit) emit(sw::Shader::OPCODE_NOT, result, arg); break;
+		case EOpBitwiseNot:       if(visit == PostVisit) emit(sw::Shader::OPCODE_NOT, result, arg); break;
 		case EOpPostIncrement:
 			if(visit == PostVisit)
 			{
@@ -2554,6 +2555,7 @@
 		case EvqPosition:            return sw::Shader::PARAMETER_OUTPUT;
 		case EvqPointSize:           return sw::Shader::PARAMETER_OUTPUT;
 		case EvqInstanceID:          return sw::Shader::PARAMETER_MISCTYPE;
+		case EvqVertexID:            return sw::Shader::PARAMETER_MISCTYPE;
 		case EvqFragCoord:           return sw::Shader::PARAMETER_MISCTYPE;
 		case EvqFrontFacing:         return sw::Shader::PARAMETER_MISCTYPE;
 		case EvqPointCoord:          return sw::Shader::PARAMETER_INPUT;
@@ -2606,9 +2608,10 @@
 		case EvqConstReadOnly:       return temporaryRegister(operand);
 		case EvqPosition:            return varyingRegister(operand);
 		case EvqPointSize:           return varyingRegister(operand);
-		case EvqInstanceID:          vertexShader->declareInstanceId(); return 0;
-		case EvqFragCoord:           pixelShader->declareVPos();  return 0;
-		case EvqFrontFacing:         pixelShader->declareVFace(); return 1;
+		case EvqInstanceID:          vertexShader->declareInstanceId(); return sw::Shader::InstanceIDIndex;
+		case EvqVertexID:            vertexShader->declareVertexId(); return sw::Shader::VertexIDIndex;
+		case EvqFragCoord:           pixelShader->declareVPos();  return sw::Shader::VPosIndex;
+		case EvqFrontFacing:         pixelShader->declareVFace(); return sw::Shader::VFaceIndex;
 		case EvqPointCoord:          return varyingRegister(operand);
 		case EvqFragColor:           return 0;
 		case EvqFragData:            return fragmentOutputRegister(operand);
diff --git a/src/OpenGL/compiler/ParseHelper.cpp b/src/OpenGL/compiler/ParseHelper.cpp
index 83f58ce..7cca42c 100644
--- a/src/OpenGL/compiler/ParseHelper.cpp
+++ b/src/OpenGL/compiler/ParseHelper.cpp
@@ -406,6 +406,7 @@
 	case EvqFrontFacing:    message = "can't modify gl_FrontFacing"; break;
 	case EvqPointCoord:     message = "can't modify gl_PointCoord";  break;
 	case EvqInstanceID:     message = "can't modify gl_InstanceID";  break;
+	case EvqVertexID:       message = "can't modify gl_VertexID";    break;
 	default:
 
 		//
diff --git a/src/OpenGL/compiler/SymbolTable.cpp b/src/OpenGL/compiler/SymbolTable.cpp
index 89ad4d1..b2e48e8 100644
--- a/src/OpenGL/compiler/SymbolTable.cpp
+++ b/src/OpenGL/compiler/SymbolTable.cpp
@@ -27,8 +27,8 @@
 #include <limits.h>
 #include <algorithm>
 
-#if defined(_MSC_VER)
-#define snprintf _snprintf
+#if defined(_MSC_VER) && _MSC_VER < 1900   // pre-VS2015 only: newer MSVC runtimes provide snprintf

+#define snprintf _snprintf

 #endif
 
 int TSymbolTableLevel::uniqueId = 0;
diff --git a/src/OpenGL/compiler/preprocessor/BUILD.gn b/src/OpenGL/compiler/preprocessor/BUILD.gn
index fadd37e..7c8c2e4 100644
--- a/src/OpenGL/compiler/preprocessor/BUILD.gn
+++ b/src/OpenGL/compiler/preprocessor/BUILD.gn
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import("../../../swiftshader.gni")
+
 # Need a separate config to ensure the warnings are added to the end.
 config("swiftshader_opengl_preprocessor_private_config") {
   if (is_win) {
@@ -20,12 +22,10 @@
       "/wd4267",  # conversion from size_t to int/unsigned int (in autogenerated code)
       "/wd4702",  # unreachable code (in autogenerated code)
     ]
-  } else {
-    defines = [ "LOG_TAG=\"swiftshader_opengl_compiler\"" ]
   }
 }
 
-source_set("swiftshader_opengl_preprocessor") {
+swiftshader_source_set("swiftshader_opengl_preprocessor") {
   sources = [
     "Diagnostics.cpp",
     "DirectiveHandler.cpp",
@@ -40,5 +40,5 @@
     "Tokenizer.cpp",
   ]
 
-  configs += [ ":swiftshader_opengl_preprocessor_private_config" ]
+  configs = [ ":swiftshader_opengl_preprocessor_private_config" ]
 }
diff --git a/src/OpenGL/libEGL/Android.mk b/src/OpenGL/libEGL/Android.mk
index 8026c7b..9317879 100644
--- a/src/OpenGL/libEGL/Android.mk
+++ b/src/OpenGL/libEGL/Android.mk
@@ -8,7 +8,11 @@
 	-Wno-unused-parameter \
 	-Wno-implicit-exception-spec-mismatch \
 	-Wno-overloaded-virtual \
-	-DANDROID_PLATFORM_SDK_VERSION=$(PLATFORM_SDK_VERSION)
+	-Wno-attributes \
+	-Wno-unknown-attributes \
+	-Wno-unknown-warning-option \
+	-DANDROID_PLATFORM_SDK_VERSION=$(PLATFORM_SDK_VERSION) \
+	-DNO_SANITIZE_FUNCTION=
 
 ifneq (16,${PLATFORM_SDK_VERSION})
 COMMON_CFLAGS += -Xclang -fuse-init-array
diff --git a/src/OpenGL/libEGL/BUILD.gn b/src/OpenGL/libEGL/BUILD.gn
index 0ce3a8f..543c9cc 100644
--- a/src/OpenGL/libEGL/BUILD.gn
+++ b/src/OpenGL/libEGL/BUILD.gn
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import("../../swiftshader.gni")
+
 # Need a separate config to ensure the warnings are added to the end.
 config("swiftshader_libEGL_private_config") {
   defines = [ "EGL_EGLEXT_PROTOTYPES" ]
@@ -23,24 +25,18 @@
       "/wd5030",  # attribute is not recognized
     ]
 
-    defines += [
-      "EGLAPI=",
-      "LIBEGL_EXPORTS",
-    ]
+    defines += [ "EGLAPI=" ]
   } else {
     cflags = [ "-Wno-sign-compare" ]
     if (!is_clang) {
       cflags += [ "-Wno-unused-but-set-variable" ]
     }
 
-    defines += [
-      "LOG_TAG=\"swiftshader_libEGL\"",
-      "EGLAPI=__attribute__((visibility(\"default\")))",
-    ]
+    defines += [ "EGLAPI=__attribute__((visibility(\"default\"))) __attribute__((no_sanitize(\"function\")))" ]
   }
 }
 
-shared_library("swiftshader_libEGL") {
+swiftshader_shared_library("swiftshader_libEGL") {
   if (!is_mac) {
     output_name = "libEGL"
     output_dir = "$root_out_dir/swiftshader"
@@ -62,10 +58,6 @@
     "resource.h",
   ]
 
-  if (is_debug) {
-    sources += [ "../common/debug.cpp" ]
-  }
-
   if (is_mac) {
     sources += [ "OSXUtils.mm" ]
     libs = [
@@ -74,7 +66,6 @@
     ]
     ldflags = [ "-Wl,-install_name,@rpath/libswiftshader_libEGL.dylib" ]
   } else if (is_win) {
-    configs -= [ "//build/config/win:unicode" ]
     ldflags = [ "/DEF:" + rebase_path("libGLESv2.def", root_build_dir) ]
   } else if (is_linux) {
     sources += [ "../../Main/libX11.cpp" ]
@@ -82,12 +73,7 @@
         [ "-Wl,--version-script=" + rebase_path("exports.map", root_build_dir) ]
   }
 
-  configs -= [ "//build/config/compiler:chromium_code" ]
-  configs += [
-    "//build/config/compiler:no_chromium_code",
-    "//third_party/swiftshader:swiftshader_config",
-    ":swiftshader_libEGL_private_config",
-  ]
+  configs = [ ":swiftshader_libEGL_private_config" ]
 
   include_dirs = [
     "../../../include",
diff --git a/src/OpenGL/libEGL/Display.cpp b/src/OpenGL/libEGL/Display.cpp
index b08fa65..0ae67bd 100644
--- a/src/OpenGL/libEGL/Display.cpp
+++ b/src/OpenGL/libEGL/Display.cpp
@@ -676,7 +676,10 @@
 			if(fd != -1)
 			{
 				struct fb_var_screeninfo info;
-				if(ioctl(fd, FBIOGET_VSCREENINFO, &info) >= 0)
+				int io = ioctl(fd, FBIOGET_VSCREENINFO, &info);
+				close(fd);
+
+				if(io >= 0)
 				{
 					switch(info.bits_per_pixel)
 					{
@@ -716,8 +719,6 @@
 						UNIMPLEMENTED();
 					}
 				}
-
-				close(fd);
 			}
 		}
 
diff --git a/src/OpenGL/libEGL/exports.map b/src/OpenGL/libEGL/exports.map
index 487457e..8455dc9 100644
--- a/src/OpenGL/libEGL/exports.map
+++ b/src/OpenGL/libEGL/exports.map
@@ -1,5 +1,6 @@
 {
 global:
+	# EGL core functions
 	eglBindAPI;
 	eglBindTexImage;
 	eglChooseConfig;
@@ -46,9 +47,13 @@
 	eglClientWaitSyncKHR;
 	eglGetSyncAttribKHR;
 
+	# Table of function pointers to disambiguate between libraries
 	libEGL_swiftshader;
 
-local:
-    *;
-};
+	# Type-strings and type-infos required by sanitizers
+	_ZTS*;
+	_ZTI*;
 
+local:
+	*;
+};
diff --git a/src/OpenGL/libEGL/libEGL.vcxproj b/src/OpenGL/libEGL/libEGL.vcxproj
index 01ef9cb..032077b 100644
--- a/src/OpenGL/libEGL/libEGL.vcxproj
+++ b/src/OpenGL/libEGL/libEGL.vcxproj
@@ -119,7 +119,7 @@
     <ClCompile>

       <Optimization>Disabled</Optimization>

       <AdditionalIncludeDirectories>$(ProjectDir)/..;$(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;EGLAPI=;EGL_EGLEXT_PROTOTYPES;_DEBUG;_WINDOWS;_USRDLL;LIBEGL_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;DEBUGGER_WAIT_DIALOG;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;EGLAPI=;EGL_EGLEXT_PROTOTYPES;NO_SANITIZE_FUNCTION=;_DEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;DEBUGGER_WAIT_DIALOG;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <MinimalRebuild>true</MinimalRebuild>

       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>

       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>

@@ -151,7 +151,7 @@
     <ClCompile>

       <Optimization>Disabled</Optimization>

       <AdditionalIncludeDirectories>$(ProjectDir)/..;$(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;EGLAPI=;EGL_EGLEXT_PROTOTYPES;_DEBUG;_WINDOWS;_USRDLL;LIBEGL_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;DEBUGGER_WAIT_DIALOG;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;EGLAPI=;EGL_EGLEXT_PROTOTYPES;NO_SANITIZE_FUNCTION=;_DEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;DEBUGGER_WAIT_DIALOG;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>

       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>

       <PrecompiledHeader>

@@ -185,7 +185,7 @@
       <Optimization>MaxSpeed</Optimization>

       <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>

       <AdditionalIncludeDirectories>$(ProjectDir)/..;$(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;EGLAPI=;EGL_EGLEXT_PROTOTYPES;NDEBUG;_WINDOWS;_USRDLL;LIBEGL_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;EGLAPI=;EGL_EGLEXT_PROTOTYPES;NO_SANITIZE_FUNCTION=;NDEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <PrecompiledHeader>

       </PrecompiledHeader>

@@ -217,7 +217,7 @@
       <Optimization>MaxSpeed</Optimization>

       <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>

       <AdditionalIncludeDirectories>$(ProjectDir)/..;$(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;EGLAPI=;EGL_EGLEXT_PROTOTYPES;NDEBUG;_WINDOWS;_USRDLL;LIBEGL_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;EGLAPI=;EGL_EGLEXT_PROTOTYPES;NO_SANITIZE_FUNCTION=;NDEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <PrecompiledHeader>

       </PrecompiledHeader>

@@ -251,7 +251,7 @@
       <Optimization>MaxSpeed</Optimization>

       <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>

       <AdditionalIncludeDirectories>$(ProjectDir)/..;$(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;EGLAPI=;EGL_EGLEXT_PROTOTYPES;NDEBUG;_WINDOWS;_USRDLL;LIBEGL_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;EGLAPI=;EGL_EGLEXT_PROTOTYPES;NO_SANITIZE_FUNCTION=;NDEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <PrecompiledHeader>

       </PrecompiledHeader>

@@ -283,7 +283,7 @@
       <Optimization>MaxSpeed</Optimization>

       <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>

       <AdditionalIncludeDirectories>$(ProjectDir)/..;$(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;EGLAPI=;EGL_EGLEXT_PROTOTYPES;NDEBUG;_WINDOWS;_USRDLL;LIBEGL_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;EGLAPI=;EGL_EGLEXT_PROTOTYPES;NO_SANITIZE_FUNCTION=;NDEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <PrecompiledHeader>

       </PrecompiledHeader>

diff --git a/src/OpenGL/libEGL/main.cpp b/src/OpenGL/libEGL/main.cpp
index 7f024d0..3a8d1ce 100644
--- a/src/OpenGL/libEGL/main.cpp
+++ b/src/OpenGL/libEGL/main.cpp
@@ -229,7 +229,7 @@
 	current->context = ctx;
 }
 
-egl::Context *getCurrentContext()
+NO_SANITIZE_FUNCTION egl::Context *getCurrentContext()
 {
 	Current *current = getCurrent();
 
diff --git a/src/OpenGL/libGL/Device.cpp b/src/OpenGL/libGL/Device.cpp
index f75b060..736bb99 100644
--- a/src/OpenGL/libGL/Device.cpp
+++ b/src/OpenGL/libGL/Device.cpp
@@ -201,7 +201,7 @@
 			return;
 		}
 
-		sw::SliceRect clearRect = renderTarget->getRect();
+		sw::Rect clearRect = renderTarget->getRect();
 
 		if(scissorEnable)
 		{
@@ -225,7 +225,7 @@
 		}
 
 		z = clamp01(z);
-		sw::SliceRect clearRect = depthStencil->getRect();
+		sw::Rect clearRect = depthStencil->getRect();
 
 		if(scissorEnable)
 		{
@@ -242,7 +242,7 @@
 			return;
 		}
 
-		sw::SliceRect clearRect = depthStencil->getRect();
+		sw::Rect clearRect = depthStencil->getRect();
 
 		if(scissorEnable)
 		{
diff --git a/src/OpenGL/libGLES_CM/Android.mk b/src/OpenGL/libGLES_CM/Android.mk
index 25cef87..8576661 100644
--- a/src/OpenGL/libGLES_CM/Android.mk
+++ b/src/OpenGL/libGLES_CM/Android.mk
@@ -14,6 +14,9 @@
 	-Wno-unused-parameter \
 	-Wno-implicit-exception-spec-mismatch \
 	-Wno-overloaded-virtual \
+	-Wno-attributes \
+	-Wno-unknown-attributes \
+	-Wno-unknown-warning-option \
 	-DANDROID_PLATFORM_SDK_VERSION=$(PLATFORM_SDK_VERSION)
 
 ifneq (16,${PLATFORM_SDK_VERSION})
diff --git a/src/OpenGL/libGLES_CM/Device.cpp b/src/OpenGL/libGLES_CM/Device.cpp
index cb95d0c..26f53bc 100644
--- a/src/OpenGL/libGLES_CM/Device.cpp
+++ b/src/OpenGL/libGLES_CM/Device.cpp
@@ -172,7 +172,7 @@
 		rgba[2] = blue;
 		rgba[3] = alpha;
 
-		sw::SliceRect clearRect = renderTarget->getRect();
+		sw::Rect clearRect = renderTarget->getRect();
 
 		if(scissorEnable)
 		{
@@ -190,7 +190,7 @@
 		}
 
 		z = clamp01(z);
-		sw::SliceRect clearRect = depthBuffer->getRect();
+		sw::Rect clearRect = depthBuffer->getRect();
 
 		if(scissorEnable)
 		{
@@ -207,7 +207,7 @@
 			return;
 		}
 
-		sw::SliceRect clearRect = stencilBuffer->getRect();
+		sw::Rect clearRect = stencilBuffer->getRect();
 
 		if(scissorEnable)
 		{
diff --git a/src/OpenGL/libGLES_CM/libGLES_CM.vcxproj b/src/OpenGL/libGLES_CM/libGLES_CM.vcxproj
index f809b28..7d2a496 100644
--- a/src/OpenGL/libGLES_CM/libGLES_CM.vcxproj
+++ b/src/OpenGL/libGLES_CM/libGLES_CM.vcxproj
@@ -125,7 +125,7 @@
     <ClCompile>

       <Optimization>Disabled</Optimization>

       <AdditionalIncludeDirectories>$(SolutionDir)\src;$(ProjectDir)/..;$(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;EGLAPI=;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;_DEBUG;_WINDOWS;_USRDLL;LIBGLES_CM_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;EGLAPI=;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;_DEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <MinimalRebuild>true</MinimalRebuild>

       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>

       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>

@@ -154,7 +154,7 @@
     <ClCompile>

       <Optimization>Disabled</Optimization>

       <AdditionalIncludeDirectories>$(SolutionDir)\src;$(ProjectDir)/..;$(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;EGLAPI=;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;_DEBUG;_WINDOWS;_USRDLL;LIBGLES_CM_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;EGLAPI=;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;_DEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>

       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>

       <PrecompiledHeader>

@@ -185,7 +185,7 @@
       <Optimization>Full</Optimization>

       <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>

       <AdditionalIncludeDirectories>$(ProjectDir)/..;$(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;EGLAPI=;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NDEBUG;_WINDOWS;_USRDLL;LIBGLES_CM_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;EGLAPI=;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NDEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <PrecompiledHeader>

       </PrecompiledHeader>

@@ -221,7 +221,7 @@
       <Optimization>Full</Optimization>

       <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>

       <AdditionalIncludeDirectories>$(ProjectDir)/..;$(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;EGLAPI=;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NDEBUG;_WINDOWS;_USRDLL;LIBGLES_CM_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;EGLAPI=;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NDEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <PrecompiledHeader>

       </PrecompiledHeader>

@@ -259,7 +259,7 @@
       <Optimization>Full</Optimization>

       <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>

       <AdditionalIncludeDirectories>$(ProjectDir)/..; $(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;EGLAPI=;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NDEBUG;_WINDOWS;_USRDLL;LIBGLES_CM_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;EGLAPI=;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NDEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <PrecompiledHeader>

       </PrecompiledHeader>

@@ -293,7 +293,7 @@
       <Optimization>Full</Optimization>

       <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>

       <AdditionalIncludeDirectories>$(ProjectDir)/..; $(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;EGLAPI=;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NDEBUG;_WINDOWS;_USRDLL;LIBGLES_CM_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;EGLAPI=;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NDEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <PrecompiledHeader>

       </PrecompiledHeader>

diff --git a/src/OpenGL/libGLESv2/Android.mk b/src/OpenGL/libGLESv2/Android.mk
index 3357d2a..4ae7952 100644
--- a/src/OpenGL/libGLESv2/Android.mk
+++ b/src/OpenGL/libGLESv2/Android.mk
@@ -13,7 +13,11 @@
 	-Wno-unused-parameter \
 	-Wno-implicit-exception-spec-mismatch \
 	-Wno-overloaded-virtual \
-	-DANDROID_PLATFORM_SDK_VERSION=$(PLATFORM_SDK_VERSION)
+	-Wno-attributes \
+	-Wno-unknown-attributes \
+	-Wno-unknown-warning-option \
+	-DANDROID_PLATFORM_SDK_VERSION=$(PLATFORM_SDK_VERSION) \
+	-DNO_SANITIZE_FUNCTION=
 
 ifneq (16,${PLATFORM_SDK_VERSION})
 COMMON_CFLAGS += -Xclang -fuse-init-array
diff --git a/src/OpenGL/libGLESv2/BUILD.gn b/src/OpenGL/libGLESv2/BUILD.gn
index 102d825..a917cfd 100644
--- a/src/OpenGL/libGLESv2/BUILD.gn
+++ b/src/OpenGL/libGLESv2/BUILD.gn
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import("../../swiftshader.gni")
+
 # Need a separate config to ensure the warnings are added to the end.
 config("swiftshader_libGLESv2_private_config") {
   defines = [
@@ -26,10 +28,7 @@
       "/wd5030",  # attribute is not recognized
     ]
 
-    defines += [
-      "GL_APICALL=",
-      "LIBGLESV2_EXPORTS",
-    ]
+    defines += [ "GL_APICALL=" ]
 
     if (is_clang) {
       defines += [
@@ -44,15 +43,14 @@
     }
 
     defines += [
-      "LOG_TAG=\"swiftshader_libGLESv2\"",
       "__STDC_CONSTANT_MACROS",
       "__STDC_LIMIT_MACROS",
-      "GL_APICALL=__attribute__((visibility(\"default\")))",
+      "GL_APICALL=__attribute__((visibility(\"default\"))) __attribute__((no_sanitize(\"function\")))",
     ]
   }
 }
 
-shared_library("swiftshader_libGLESv2") {
+swiftshader_shared_library("swiftshader_libGLESv2") {
   if (!is_mac) {
     output_name = "libGLESv2"
     output_dir = "$root_out_dir/swiftshader"
@@ -91,7 +89,6 @@
   ]
 
   if (is_win) {
-    configs -= [ "//build/config/win:unicode" ]
     ldflags = [ "/DEF:" + rebase_path("libGLESv2.def", root_build_dir) ]
   } else if (is_mac) {
     ldflags = [ "-Wl,-install_name,@rpath/libswiftshader_libGLESv2.dylib" ]
@@ -100,12 +97,7 @@
         [ "-Wl,--version-script=" + rebase_path("exports.map", root_build_dir) ]
   }
 
-  configs -= [ "//build/config/compiler:chromium_code" ]
-  configs += [
-    "//build/config/compiler:no_chromium_code",
-    "//third_party/swiftshader:swiftshader_config",
-    ":swiftshader_libGLESv2_private_config",
-  ]
+  configs = [ ":swiftshader_libGLESv2_private_config" ]
 
   include_dirs = [
     "../../../include",
diff --git a/src/OpenGL/libGLESv2/Context.cpp b/src/OpenGL/libGLESv2/Context.cpp
index 84196ac..3cd11af 100644
--- a/src/OpenGL/libGLESv2/Context.cpp
+++ b/src/OpenGL/libGLESv2/Context.cpp
@@ -100,6 +100,7 @@
 	mState.rasterizerDiscardEnabled = false;
 	mState.generateMipmapHint = GL_DONT_CARE;
 	mState.fragmentShaderDerivativeHint = GL_DONT_CARE;
+	mState.textureFilteringHint = GL_DONT_CARE;
 
 	mState.lineWidth = 1.0f;
 
@@ -245,6 +246,11 @@
 	mState.pixelPackBuffer = nullptr;
 	mState.pixelUnpackBuffer = nullptr;
 	mState.genericUniformBuffer = nullptr;
+
+	for(int i = 0; i < MAX_UNIFORM_BUFFER_BINDINGS; i++) {
+		mState.uniformBuffers[i].set(nullptr, 0, 0);
+	}
+
 	mState.renderbuffer = nullptr;
 
 	for(int i = 0; i < MAX_COMBINED_TEXTURE_IMAGE_UNITS; ++i)
@@ -677,6 +683,11 @@
 	// Ignore for now. It is valid for implementations to ignore hint.
 }
 
+void Context::setTextureFilteringHint(GLenum hint)
+{
+	mState.textureFilteringHint = hint;
+}
+
 void Context::setViewportParams(GLint x, GLint y, GLsizei width, GLsizei height)
 {
 	mState.viewportX = x;
@@ -1885,6 +1896,7 @@
 	case GL_UNPACK_ALIGNMENT:                 *params = mState.unpackInfo.alignment;          return true;
 	case GL_GENERATE_MIPMAP_HINT:             *params = mState.generateMipmapHint;            return true;
 	case GL_FRAGMENT_SHADER_DERIVATIVE_HINT_OES: *params = mState.fragmentShaderDerivativeHint; return true;
+	case GL_TEXTURE_FILTERING_HINT_CHROMIUM:  *params = mState.textureFilteringHint;          return true;
 	case GL_ACTIVE_TEXTURE:                   *params = (mState.activeSampler + GL_TEXTURE0); return true;
 	case GL_STENCIL_FUNC:                     *params = mState.stencilFunc;                   return true;
 	case GL_STENCIL_REF:                      *params = mState.stencilRef;                    return true;
@@ -2420,6 +2432,7 @@
 	case GL_UNPACK_ALIGNMENT:
 	case GL_GENERATE_MIPMAP_HINT:
 	case GL_FRAGMENT_SHADER_DERIVATIVE_HINT_OES:
+	case GL_TEXTURE_FILTERING_HINT_CHROMIUM:
 	case GL_RED_BITS:
 	case GL_GREEN_BITS:
 	case GL_BLUE_BITS:
@@ -2976,9 +2989,9 @@
 		mAppliedProgramSerial = programObject->getSerial();
 	}
 
-	programObject->applyTransformFeedback(getTransformFeedback());
-	programObject->applyUniformBuffers(mState.uniformBuffers);
-	programObject->applyUniforms();
+	programObject->applyTransformFeedback(device, getTransformFeedback());
+	programObject->applyUniformBuffers(device, mState.uniformBuffers);
+	programObject->applyUniforms(device);
 }
 
 void Context::applyTextures()
@@ -3053,6 +3066,7 @@
 				device->setTextureFilter(samplerType, samplerIndex, es2sw::ConvertTextureFilter(minFilter, magFilter, maxAnisotropy));
 				device->setMipmapFilter(samplerType, samplerIndex, es2sw::ConvertMipMapFilter(minFilter));
 				device->setMaxAnisotropy(samplerType, samplerIndex, maxAnisotropy);
+				device->setHighPrecisionFiltering(samplerType, samplerIndex, mState.textureFilteringHint == GL_NICEST);
 
 				applyTexture(samplerType, samplerIndex, texture);
 			}
@@ -3315,7 +3329,7 @@
 
 		if(colorbuffer)
 		{
-			sw::SliceRect clearRect = colorbuffer->getRect();
+			sw::Rect clearRect = colorbuffer->getRect();
 
 			if(mState.scissorTestEnabled)
 			{
@@ -3354,7 +3368,7 @@
 		if(depthbuffer)
 		{
 			float depth = clamp01(value);
-			sw::SliceRect clearRect = depthbuffer->getRect();
+			sw::Rect clearRect = depthbuffer->getRect();
 
 			if(mState.scissorTestEnabled)
 			{
@@ -3378,7 +3392,7 @@
 		if(stencilbuffer)
 		{
 			unsigned char stencil = value < 0 ? 0 : static_cast<unsigned char>(value & 0x000000FF);
-			sw::SliceRect clearRect = stencilbuffer->getRect();
+			sw::Rect clearRect = stencilbuffer->getRect();
 
 			if(mState.scissorTestEnabled)
 			{
@@ -4300,6 +4314,7 @@
 		"GL_OES_texture_half_float_linear",
 		"GL_OES_texture_npot",
 		"GL_OES_texture_3D",
+		"GL_OES_vertex_half_float",
 		"GL_EXT_blend_minmax",
 		"GL_EXT_color_buffer_half_float",
 		"GL_EXT_draw_buffers",
@@ -4319,6 +4334,7 @@
 		"GL_ANGLE_texture_compression_dxt3",
 		"GL_ANGLE_texture_compression_dxt5",
 #endif
+		"GL_CHROMIUM_texture_filtering_hint",
 		"GL_NV_fence",
 		"GL_NV_framebuffer_blit",
 		"GL_NV_read_depth",
@@ -4355,6 +4371,14 @@
 			{
 				extensionsCat += std::string(extension) + " ";
 			}
+
+			if(clientVersion >= 3)
+			{
+				for(const char *extension : es3extensions)
+				{
+					extensionsCat += std::string(extension) + " ";
+				}
+			}
 		}
 
 		return (const GLubyte*)extensionsCat.c_str();
@@ -4377,7 +4401,7 @@
 
 }
 
-egl::Context *es2CreateContext(egl::Display *display, const egl::Context *shareContext, int clientVersion, const egl::Config *config)
+NO_SANITIZE_FUNCTION egl::Context *es2CreateContext(egl::Display *display, const egl::Context *shareContext, int clientVersion, const egl::Config *config)
 {
 	ASSERT(!shareContext || shareContext->getClientVersion() == clientVersion);   // Should be checked by eglCreateContext
 	return new es2::Context(display, static_cast<const es2::Context*>(shareContext), clientVersion, config);
diff --git a/src/OpenGL/libGLESv2/Context.h b/src/OpenGL/libGLESv2/Context.h
index bbe6ddd..16e0aa2 100644
--- a/src/OpenGL/libGLESv2/Context.h
+++ b/src/OpenGL/libGLESv2/Context.h
@@ -156,6 +156,8 @@
 #endif
 };
 
+const GLenum GL_TEXTURE_FILTERING_HINT_CHROMIUM = 0x8AF0;
+
 const GLint NUM_COMPRESSED_TEXTURE_FORMATS = sizeof(compressedTextureFormats) / sizeof(compressedTextureFormats[0]);
 
 const GLint multisampleCount[] = {4, 2, 1};
@@ -376,6 +378,7 @@
 
 	GLenum generateMipmapHint;
 	GLenum fragmentShaderDerivativeHint;
+	GLenum textureFilteringHint;
 
 	GLint viewportX;
 	GLint viewportY;
@@ -489,6 +492,7 @@
 
 	void setGenerateMipmapHint(GLenum hint);
 	void setFragmentShaderDerivativeHint(GLenum hint);
+	void setTextureFilteringHint(GLenum hint);
 
 	void setViewportParams(GLint x, GLint y, GLsizei width, GLsizei height);
 
diff --git a/src/OpenGL/libGLESv2/Device.cpp b/src/OpenGL/libGLESv2/Device.cpp
index 8b8f016..53c794c 100644
--- a/src/OpenGL/libGLESv2/Device.cpp
+++ b/src/OpenGL/libGLESv2/Device.cpp
@@ -202,18 +202,14 @@
 		{
 			if(renderTarget[i])
 			{
-				sw::SliceRect clearRect = renderTarget[i]->getRect();
+				sw::Rect clearRect = renderTarget[i]->getRect();
 
 				if(scissorEnable)
 				{
 					clearRect.clip(scissorRect.x0, scissorRect.y0, scissorRect.x1, scissorRect.y1);
 				}
 
-				int depth = sw::max(renderTarget[i]->getDepth(), 1);
-				for(clearRect.slice = 0; clearRect.slice < depth; clearRect.slice++)
-				{
-					clear(rgba, FORMAT_A32B32G32R32F, renderTarget[i], clearRect, rgbaMask);
-				}
+				clear(rgba, FORMAT_A32B32G32R32F, renderTarget[i], clearRect, rgbaMask);
 			}
 		}
 	}
@@ -226,7 +222,7 @@
 		}
 
 		z = clamp01(z);
-		sw::SliceRect clearRect = depthBuffer->getRect();
+		sw::Rect clearRect = depthBuffer->getRect();
 
 		if(scissorEnable)
 		{
@@ -243,7 +239,7 @@
 			return;
 		}
 
-		sw::SliceRect clearRect = stencilBuffer->getRect();
+		sw::Rect clearRect = stencilBuffer->getRect();
 
 		if(scissorEnable)
 		{
diff --git a/src/OpenGL/libGLESv2/Program.cpp b/src/OpenGL/libGLESv2/Program.cpp
index 400da5d..24d1f3c 100644
--- a/src/OpenGL/libGLESv2/Program.cpp
+++ b/src/OpenGL/libGLESv2/Program.cpp
@@ -151,8 +151,6 @@
 
 	Program::Program(ResourceManager *manager, GLuint handle) : serial(issueSerial()), resourceManager(manager), handle(handle)
 	{
-		device = getDevice();
-
 		fragmentShader = 0;
 		vertexShader = 0;
 		pixelBinary = 0;
@@ -1064,7 +1062,7 @@
 	}
 
 	// Applies all the uniforms set for this program object to the device
-	void Program::applyUniforms()
+	void Program::applyUniforms(Device *device)
 	{
 		GLint numUniforms = static_cast<GLint>(uniformIndex.size());
 		for(GLint location = 0; location < numUniforms; location++)
@@ -1086,23 +1084,23 @@
 
 				switch(targetUniform->type)
 				{
-				case GL_BOOL:       applyUniform1bv(location, size, b);       break;
-				case GL_BOOL_VEC2:  applyUniform2bv(location, size, b);       break;
-				case GL_BOOL_VEC3:  applyUniform3bv(location, size, b);       break;
-				case GL_BOOL_VEC4:  applyUniform4bv(location, size, b);       break;
-				case GL_FLOAT:      applyUniform1fv(location, size, f);       break;
-				case GL_FLOAT_VEC2: applyUniform2fv(location, size, f);       break;
-				case GL_FLOAT_VEC3: applyUniform3fv(location, size, f);       break;
-				case GL_FLOAT_VEC4: applyUniform4fv(location, size, f);       break;
-				case GL_FLOAT_MAT2:   applyUniformMatrix2fv(location, size, f);   break;
-				case GL_FLOAT_MAT2x3: applyUniformMatrix2x3fv(location, size, f); break;
-				case GL_FLOAT_MAT2x4: applyUniformMatrix2x4fv(location, size, f); break;
-				case GL_FLOAT_MAT3x2: applyUniformMatrix3x2fv(location, size, f); break;
-				case GL_FLOAT_MAT3:   applyUniformMatrix3fv(location, size, f);   break;
-				case GL_FLOAT_MAT3x4: applyUniformMatrix3x4fv(location, size, f); break;
-				case GL_FLOAT_MAT4x2: applyUniformMatrix4x2fv(location, size, f); break;
-				case GL_FLOAT_MAT4x3: applyUniformMatrix4x3fv(location, size, f); break;
-				case GL_FLOAT_MAT4:   applyUniformMatrix4fv(location, size, f);   break;
+				case GL_BOOL:       applyUniform1bv(device, location, size, b);       break;
+				case GL_BOOL_VEC2:  applyUniform2bv(device, location, size, b);       break;
+				case GL_BOOL_VEC3:  applyUniform3bv(device, location, size, b);       break;
+				case GL_BOOL_VEC4:  applyUniform4bv(device, location, size, b);       break;
+				case GL_FLOAT:      applyUniform1fv(device, location, size, f);       break;
+				case GL_FLOAT_VEC2: applyUniform2fv(device, location, size, f);       break;
+				case GL_FLOAT_VEC3: applyUniform3fv(device, location, size, f);       break;
+				case GL_FLOAT_VEC4: applyUniform4fv(device, location, size, f);       break;
+				case GL_FLOAT_MAT2:   applyUniformMatrix2fv(device, location, size, f);   break;
+				case GL_FLOAT_MAT2x3: applyUniformMatrix2x3fv(device, location, size, f); break;
+				case GL_FLOAT_MAT2x4: applyUniformMatrix2x4fv(device, location, size, f); break;
+				case GL_FLOAT_MAT3x2: applyUniformMatrix3x2fv(device, location, size, f); break;
+				case GL_FLOAT_MAT3:   applyUniformMatrix3fv(device, location, size, f);   break;
+				case GL_FLOAT_MAT3x4: applyUniformMatrix3x4fv(device, location, size, f); break;
+				case GL_FLOAT_MAT4x2: applyUniformMatrix4x2fv(device, location, size, f); break;
+				case GL_FLOAT_MAT4x3: applyUniformMatrix4x3fv(device, location, size, f); break;
+				case GL_FLOAT_MAT4:   applyUniformMatrix4fv(device, location, size, f);   break;
 				case GL_SAMPLER_2D:
 				case GL_SAMPLER_CUBE:
 				case GL_SAMPLER_EXTERNAL_OES:
@@ -1119,14 +1117,14 @@
 				case GL_UNSIGNED_INT_SAMPLER_3D:
 				case GL_INT_SAMPLER_2D_ARRAY:
 				case GL_UNSIGNED_INT_SAMPLER_2D_ARRAY:
-				case GL_INT:        applyUniform1iv(location, size, i);       break;
-				case GL_INT_VEC2:   applyUniform2iv(location, size, i);       break;
-				case GL_INT_VEC3:   applyUniform3iv(location, size, i);       break;
-				case GL_INT_VEC4:   applyUniform4iv(location, size, i);       break;
-				case GL_UNSIGNED_INT:      applyUniform1uiv(location, size, ui); break;
-				case GL_UNSIGNED_INT_VEC2: applyUniform2uiv(location, size, ui); break;
-				case GL_UNSIGNED_INT_VEC3: applyUniform3uiv(location, size, ui); break;
-				case GL_UNSIGNED_INT_VEC4: applyUniform4uiv(location, size, ui); break;
+				case GL_INT:        applyUniform1iv(device, location, size, i);       break;
+				case GL_INT_VEC2:   applyUniform2iv(device, location, size, i);       break;
+				case GL_INT_VEC3:   applyUniform3iv(device, location, size, i);       break;
+				case GL_INT_VEC4:   applyUniform4iv(device, location, size, i);       break;
+				case GL_UNSIGNED_INT:      applyUniform1uiv(device, location, size, ui); break;
+				case GL_UNSIGNED_INT_VEC2: applyUniform2uiv(device, location, size, ui); break;
+				case GL_UNSIGNED_INT_VEC3: applyUniform3uiv(device, location, size, ui); break;
+				case GL_UNSIGNED_INT_VEC4: applyUniform4uiv(device, location, size, ui); break;
 				default:
 					UNREACHABLE(targetUniform->type);
 				}
@@ -1136,7 +1134,7 @@
 		}
 	}
 
-	void Program::applyUniformBuffers(BufferBinding* uniformBuffers)
+	void Program::applyUniformBuffers(Device *device, BufferBinding* uniformBuffers)
 	{
 		GLint vertexUniformBuffers[MAX_UNIFORM_BUFFER_BINDINGS];
 		GLint fragmentUniformBuffers[MAX_UNIFORM_BUFFER_BINDINGS];
@@ -1189,7 +1187,7 @@
 		}
 	}
 
-	void Program::applyTransformFeedback(TransformFeedback* transformFeedback)
+	void Program::applyTransformFeedback(Device *device, TransformFeedback* transformFeedback)
 	{
 		// Make sure the flags will fit in a 64 bit unsigned int variable
 		ASSERT(sw::max<int>(MAX_TRANSFORM_FEEDBACK_SEPARATE_ATTRIBS, sw::MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS) <= 64);
@@ -1930,7 +1928,7 @@
 		return true;
 	}
 
-	bool Program::applyUniform(GLint location, float* data)
+	bool Program::applyUniform(Device *device, GLint location, float* data)
 	{
 		Uniform *targetUniform = uniforms[uniformIndex[location].index];
 
@@ -1947,7 +1945,7 @@
 		return true;
 	}
 
-	bool Program::applyUniform1bv(GLint location, GLsizei count, const GLboolean *v)
+	bool Program::applyUniform1bv(Device *device, GLint location, GLsizei count, const GLboolean *v)
 	{
 		int vector[MAX_UNIFORM_VECTORS][4];
 
@@ -1961,10 +1959,10 @@
 			v += 1;
 		}
 
-		return applyUniform(location, (float*)vector);
+		return applyUniform(device, location, (float*)vector);
 	}
 
-	bool Program::applyUniform2bv(GLint location, GLsizei count, const GLboolean *v)
+	bool Program::applyUniform2bv(Device *device, GLint location, GLsizei count, const GLboolean *v)
 	{
 		int vector[MAX_UNIFORM_VECTORS][4];
 
@@ -1978,10 +1976,10 @@
 			v += 2;
 		}
 
-		return applyUniform(location, (float*)vector);
+		return applyUniform(device, location, (float*)vector);
 	}
 
-	bool Program::applyUniform3bv(GLint location, GLsizei count, const GLboolean *v)
+	bool Program::applyUniform3bv(Device *device, GLint location, GLsizei count, const GLboolean *v)
 	{
 		int vector[MAX_UNIFORM_VECTORS][4];
 
@@ -1995,10 +1993,10 @@
 			v += 3;
 		}
 
-		return applyUniform(location, (float*)vector);
+		return applyUniform(device, location, (float*)vector);
 	}
 
-	bool Program::applyUniform4bv(GLint location, GLsizei count, const GLboolean *v)
+	bool Program::applyUniform4bv(Device *device, GLint location, GLsizei count, const GLboolean *v)
 	{
 		int vector[MAX_UNIFORM_VECTORS][4];
 
@@ -2012,10 +2010,10 @@
 			v += 4;
 		}
 
-		return applyUniform(location, (float*)vector);
+		return applyUniform(device, location, (float*)vector);
 	}
 
-	bool Program::applyUniform1fv(GLint location, GLsizei count, const GLfloat *v)
+	bool Program::applyUniform1fv(Device *device, GLint location, GLsizei count, const GLfloat *v)
 	{
 		float vector[MAX_UNIFORM_VECTORS][4];
 
@@ -2029,10 +2027,10 @@
 			v += 1;
 		}
 
-		return applyUniform(location, (float*)vector);
+		return applyUniform(device, location, (float*)vector);
 	}
 
-	bool Program::applyUniform2fv(GLint location, GLsizei count, const GLfloat *v)
+	bool Program::applyUniform2fv(Device *device, GLint location, GLsizei count, const GLfloat *v)
 	{
 		float vector[MAX_UNIFORM_VECTORS][4];
 
@@ -2046,10 +2044,10 @@
 			v += 2;
 		}
 
-		return applyUniform(location, (float*)vector);
+		return applyUniform(device, location, (float*)vector);
 	}
 
-	bool Program::applyUniform3fv(GLint location, GLsizei count, const GLfloat *v)
+	bool Program::applyUniform3fv(Device *device, GLint location, GLsizei count, const GLfloat *v)
 	{
 		float vector[MAX_UNIFORM_VECTORS][4];
 
@@ -2063,15 +2061,15 @@
 			v += 3;
 		}
 
-		return applyUniform(location, (float*)vector);
+		return applyUniform(device, location, (float*)vector);
 	}
 
-	bool Program::applyUniform4fv(GLint location, GLsizei count, const GLfloat *v)
+	bool Program::applyUniform4fv(Device *device, GLint location, GLsizei count, const GLfloat *v)
 	{
-		return applyUniform(location, (float*)v);
+		return applyUniform(device, location, (float*)v);
 	}
 
-	bool Program::applyUniformMatrix2fv(GLint location, GLsizei count, const GLfloat *value)
+	bool Program::applyUniformMatrix2fv(Device *device, GLint location, GLsizei count, const GLfloat *value)
 	{
 		float matrix[(MAX_UNIFORM_VECTORS + 1) / 2][2][4];
 
@@ -2083,10 +2081,10 @@
 			value += 4;
 		}
 
-		return applyUniform(location, (float*)matrix);
+		return applyUniform(device, location, (float*)matrix);
 	}
 
-	bool Program::applyUniformMatrix2x3fv(GLint location, GLsizei count, const GLfloat *value)
+	bool Program::applyUniformMatrix2x3fv(Device *device, GLint location, GLsizei count, const GLfloat *value)
 	{
 		float matrix[(MAX_UNIFORM_VECTORS + 1) / 2][2][4];
 
@@ -2098,10 +2096,10 @@
 			value += 6;
 		}
 
-		return applyUniform(location, (float*)matrix);
+		return applyUniform(device, location, (float*)matrix);
 	}
 
-	bool Program::applyUniformMatrix2x4fv(GLint location, GLsizei count, const GLfloat *value)
+	bool Program::applyUniformMatrix2x4fv(Device *device, GLint location, GLsizei count, const GLfloat *value)
 	{
 		float matrix[(MAX_UNIFORM_VECTORS + 1) / 2][2][4];
 
@@ -2113,10 +2111,10 @@
 			value += 8;
 		}
 
-		return applyUniform(location, (float*)matrix);
+		return applyUniform(device, location, (float*)matrix);
 	}
 
-	bool Program::applyUniformMatrix3fv(GLint location, GLsizei count, const GLfloat *value)
+	bool Program::applyUniformMatrix3fv(Device *device, GLint location, GLsizei count, const GLfloat *value)
 	{
 		float matrix[(MAX_UNIFORM_VECTORS + 2) / 3][3][4];
 
@@ -2129,10 +2127,10 @@
 			value += 9;
 		}
 
-		return applyUniform(location, (float*)matrix);
+		return applyUniform(device, location, (float*)matrix);
 	}
 
-	bool Program::applyUniformMatrix3x2fv(GLint location, GLsizei count, const GLfloat *value)
+	bool Program::applyUniformMatrix3x2fv(Device *device, GLint location, GLsizei count, const GLfloat *value)
 	{
 		float matrix[(MAX_UNIFORM_VECTORS + 2) / 3][3][4];
 
@@ -2145,10 +2143,10 @@
 			value += 6;
 		}
 
-		return applyUniform(location, (float*)matrix);
+		return applyUniform(device, location, (float*)matrix);
 	}
 
-	bool Program::applyUniformMatrix3x4fv(GLint location, GLsizei count, const GLfloat *value)
+	bool Program::applyUniformMatrix3x4fv(Device *device, GLint location, GLsizei count, const GLfloat *value)
 	{
 		float matrix[(MAX_UNIFORM_VECTORS + 2) / 3][3][4];
 
@@ -2161,15 +2159,15 @@
 			value += 12;
 		}
 
-		return applyUniform(location, (float*)matrix);
+		return applyUniform(device, location, (float*)matrix);
 	}
 
-	bool Program::applyUniformMatrix4fv(GLint location, GLsizei count, const GLfloat *value)
+	bool Program::applyUniformMatrix4fv(Device *device, GLint location, GLsizei count, const GLfloat *value)
 	{
-		return applyUniform(location, (float*)value);
+		return applyUniform(device, location, (float*)value);
 	}
 
-	bool Program::applyUniformMatrix4x2fv(GLint location, GLsizei count, const GLfloat *value)
+	bool Program::applyUniformMatrix4x2fv(Device *device, GLint location, GLsizei count, const GLfloat *value)
 	{
 		float matrix[(MAX_UNIFORM_VECTORS + 3) / 4][4][4];
 
@@ -2183,10 +2181,10 @@
 			value += 8;
 		}
 
-		return applyUniform(location, (float*)matrix);
+		return applyUniform(device, location, (float*)matrix);
 	}
 
-	bool Program::applyUniformMatrix4x3fv(GLint location, GLsizei count, const GLfloat *value)
+	bool Program::applyUniformMatrix4x3fv(Device *device, GLint location, GLsizei count, const GLfloat *value)
 	{
 		float matrix[(MAX_UNIFORM_VECTORS + 3) / 4][4][4];
 
@@ -2200,10 +2198,10 @@
 			value += 12;
 		}
 
-		return applyUniform(location, (float*)matrix);
+		return applyUniform(device, location, (float*)matrix);
 	}
 
-	bool Program::applyUniform1iv(GLint location, GLsizei count, const GLint *v)
+	bool Program::applyUniform1iv(Device *device, GLint location, GLsizei count, const GLint *v)
 	{
 		GLint vector[MAX_UNIFORM_VECTORS][4];
 
@@ -2248,13 +2246,13 @@
 		}
 		else
 		{
-			return applyUniform(location, (float*)vector);
+			return applyUniform(device, location, (float*)vector);
 		}
 
 		return true;
 	}
 
-	bool Program::applyUniform2iv(GLint location, GLsizei count, const GLint *v)
+	bool Program::applyUniform2iv(Device *device, GLint location, GLsizei count, const GLint *v)
 	{
 		GLint vector[MAX_UNIFORM_VECTORS][4];
 
@@ -2268,10 +2266,10 @@
 			v += 2;
 		}
 
-		return applyUniform(location, (float*)vector);
+		return applyUniform(device, location, (float*)vector);
 	}
 
-	bool Program::applyUniform3iv(GLint location, GLsizei count, const GLint *v)
+	bool Program::applyUniform3iv(Device *device, GLint location, GLsizei count, const GLint *v)
 	{
 		GLint vector[MAX_UNIFORM_VECTORS][4];
 
@@ -2285,10 +2283,10 @@
 			v += 3;
 		}
 
-		return applyUniform(location, (float*)vector);
+		return applyUniform(device, location, (float*)vector);
 	}
 
-	bool Program::applyUniform4iv(GLint location, GLsizei count, const GLint *v)
+	bool Program::applyUniform4iv(Device *device, GLint location, GLsizei count, const GLint *v)
 	{
 		GLint vector[MAX_UNIFORM_VECTORS][4];
 
@@ -2302,10 +2300,10 @@
 			v += 4;
 		}
 
-		return applyUniform(location, (float*)vector);
+		return applyUniform(device, location, (float*)vector);
 	}
 
-	bool Program::applyUniform1uiv(GLint location, GLsizei count, const GLuint *v)
+	bool Program::applyUniform1uiv(Device *device, GLint location, GLsizei count, const GLuint *v)
 	{
 		GLuint vector[MAX_UNIFORM_VECTORS][4];
 
@@ -2350,13 +2348,13 @@
 		}
 		else
 		{
-			return applyUniform(location, (float*)vector);
+			return applyUniform(device, location, (float*)vector);
 		}
 
 		return true;
 	}
 
-	bool Program::applyUniform2uiv(GLint location, GLsizei count, const GLuint *v)
+	bool Program::applyUniform2uiv(Device *device, GLint location, GLsizei count, const GLuint *v)
 	{
 		GLuint vector[MAX_UNIFORM_VECTORS][4];
 
@@ -2370,10 +2368,10 @@
 			v += 2;
 		}
 
-		return applyUniform(location, (float*)vector);
+		return applyUniform(device, location, (float*)vector);
 	}
 
-	bool Program::applyUniform3uiv(GLint location, GLsizei count, const GLuint *v)
+	bool Program::applyUniform3uiv(Device *device, GLint location, GLsizei count, const GLuint *v)
 	{
 		GLuint vector[MAX_UNIFORM_VECTORS][4];
 
@@ -2387,10 +2385,10 @@
 			v += 3;
 		}
 
-		return applyUniform(location, (float*)vector);
+		return applyUniform(device, location, (float*)vector);
 	}
 
-	bool Program::applyUniform4uiv(GLint location, GLsizei count, const GLuint *v)
+	bool Program::applyUniform4uiv(Device *device, GLint location, GLsizei count, const GLuint *v)
 	{
 		GLuint vector[MAX_UNIFORM_VECTORS][4];
 
@@ -2404,7 +2402,7 @@
 			v += 4;
 		}
 
-		return applyUniform(location, (float*)vector);
+		return applyUniform(device, location, (float*)vector);
 	}
 
 	void Program::appendToInfoLog(const char *format, ...)
@@ -2875,7 +2873,7 @@
 		return orphaned;
 	}
 
-	void Program::validate()
+	void Program::validate(Device* device)
 	{
 		resetInfoLog();
 
@@ -2886,7 +2884,7 @@
 		}
 		else
 		{
-			applyUniforms();
+			applyUniforms(device);
 			if(!validateSamplers(true))
 			{
 				validated = false;
diff --git a/src/OpenGL/libGLESv2/Program.h b/src/OpenGL/libGLESv2/Program.h
index ff4bc5f..56f7df7 100644
--- a/src/OpenGL/libGLESv2/Program.h
+++ b/src/OpenGL/libGLESv2/Program.h
@@ -172,9 +172,9 @@
 		bool getUniformuiv(GLint location, GLsizei *bufSize, GLuint *params);
 
 		void dirtyAllUniforms();
-		void applyUniforms();
-		void applyUniformBuffers(BufferBinding* uniformBuffers);
-		void applyTransformFeedback(TransformFeedback* transformFeedback);
+		void applyUniforms(Device *device);
+		void applyUniformBuffers(Device *device, BufferBinding* uniformBuffers);
+		void applyTransformFeedback(Device *device, TransformFeedback* transformFeedback);
 
 		void link();
 		bool isLinked() const;
@@ -207,7 +207,7 @@
 		void flagForDeletion();
 		bool isFlaggedForDeletion() const;
 
-		void validate();
+		void validate(Device* device);
 		bool validateSamplers(bool logErrors);
 		bool isValidated() const;
 
@@ -232,32 +232,32 @@
 		bool areMatchingUniformBlocks(const glsl::UniformBlock &block1, const glsl::UniformBlock &block2, const Shader *shader1, const Shader *shader2);
 		bool defineUniform(GLenum shader, GLenum type, GLenum precision, const std::string &_name, unsigned int arraySize, int registerIndex, const Uniform::BlockInfo& blockInfo);
 		bool defineUniformBlock(const Shader *shader, const glsl::UniformBlock &block);
-		bool applyUniform(GLint location, float* data);
-		bool applyUniform1bv(GLint location, GLsizei count, const GLboolean *v);
-		bool applyUniform2bv(GLint location, GLsizei count, const GLboolean *v);
-		bool applyUniform3bv(GLint location, GLsizei count, const GLboolean *v);
-		bool applyUniform4bv(GLint location, GLsizei count, const GLboolean *v);
-		bool applyUniform1fv(GLint location, GLsizei count, const GLfloat *v);
-		bool applyUniform2fv(GLint location, GLsizei count, const GLfloat *v);
-		bool applyUniform3fv(GLint location, GLsizei count, const GLfloat *v);
-		bool applyUniform4fv(GLint location, GLsizei count, const GLfloat *v);
-		bool applyUniformMatrix2fv(GLint location, GLsizei count, const GLfloat *value);
-		bool applyUniformMatrix2x3fv(GLint location, GLsizei count, const GLfloat *value);
-		bool applyUniformMatrix2x4fv(GLint location, GLsizei count, const GLfloat *value);
-		bool applyUniformMatrix3fv(GLint location, GLsizei count, const GLfloat *value);
-		bool applyUniformMatrix3x2fv(GLint location, GLsizei count, const GLfloat *value);
-		bool applyUniformMatrix3x4fv(GLint location, GLsizei count, const GLfloat *value);
-		bool applyUniformMatrix4fv(GLint location, GLsizei count, const GLfloat *value);
-		bool applyUniformMatrix4x2fv(GLint location, GLsizei count, const GLfloat *value);
-		bool applyUniformMatrix4x3fv(GLint location, GLsizei count, const GLfloat *value);
-		bool applyUniform1iv(GLint location, GLsizei count, const GLint *v);
-		bool applyUniform2iv(GLint location, GLsizei count, const GLint *v);
-		bool applyUniform3iv(GLint location, GLsizei count, const GLint *v);
-		bool applyUniform4iv(GLint location, GLsizei count, const GLint *v);
-		bool applyUniform1uiv(GLint location, GLsizei count, const GLuint *v);
-		bool applyUniform2uiv(GLint location, GLsizei count, const GLuint *v);
-		bool applyUniform3uiv(GLint location, GLsizei count, const GLuint *v);
-		bool applyUniform4uiv(GLint location, GLsizei count, const GLuint *v);
+		bool applyUniform(Device *device, GLint location, float* data);
+		bool applyUniform1bv(Device *device, GLint location, GLsizei count, const GLboolean *v);
+		bool applyUniform2bv(Device *device, GLint location, GLsizei count, const GLboolean *v);
+		bool applyUniform3bv(Device *device, GLint location, GLsizei count, const GLboolean *v);
+		bool applyUniform4bv(Device *device, GLint location, GLsizei count, const GLboolean *v);
+		bool applyUniform1fv(Device *device, GLint location, GLsizei count, const GLfloat *v);
+		bool applyUniform2fv(Device *device, GLint location, GLsizei count, const GLfloat *v);
+		bool applyUniform3fv(Device *device, GLint location, GLsizei count, const GLfloat *v);
+		bool applyUniform4fv(Device *device, GLint location, GLsizei count, const GLfloat *v);
+		bool applyUniformMatrix2fv(Device *device, GLint location, GLsizei count, const GLfloat *value);
+		bool applyUniformMatrix2x3fv(Device *device, GLint location, GLsizei count, const GLfloat *value);
+		bool applyUniformMatrix2x4fv(Device *device, GLint location, GLsizei count, const GLfloat *value);
+		bool applyUniformMatrix3fv(Device *device, GLint location, GLsizei count, const GLfloat *value);
+		bool applyUniformMatrix3x2fv(Device *device, GLint location, GLsizei count, const GLfloat *value);
+		bool applyUniformMatrix3x4fv(Device *device, GLint location, GLsizei count, const GLfloat *value);
+		bool applyUniformMatrix4fv(Device *device, GLint location, GLsizei count, const GLfloat *value);
+		bool applyUniformMatrix4x2fv(Device *device, GLint location, GLsizei count, const GLfloat *value);
+		bool applyUniformMatrix4x3fv(Device *device, GLint location, GLsizei count, const GLfloat *value);
+		bool applyUniform1iv(Device *device, GLint location, GLsizei count, const GLint *v);
+		bool applyUniform2iv(Device *device, GLint location, GLsizei count, const GLint *v);
+		bool applyUniform3iv(Device *device, GLint location, GLsizei count, const GLint *v);
+		bool applyUniform4iv(Device *device, GLint location, GLsizei count, const GLint *v);
+		bool applyUniform1uiv(Device *device, GLint location, GLsizei count, const GLuint *v);
+		bool applyUniform2uiv(Device *device, GLint location, GLsizei count, const GLuint *v);
+		bool applyUniform3uiv(Device *device, GLint location, GLsizei count, const GLuint *v);
+		bool applyUniform4uiv(Device *device, GLint location, GLsizei count, const GLuint *v);
 
 		bool setUniformfv(GLint location, GLsizei count, const GLfloat *v, int numElements);
 		bool setUniformMatrixfv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value, GLenum type);
@@ -270,7 +270,6 @@
 		static unsigned int issueSerial();
 
 	private:
-		es2::Device *device;
 		FragmentShader *fragmentShader;
 		VertexShader *vertexShader;
 
diff --git a/src/OpenGL/libGLESv2/Texture.cpp b/src/OpenGL/libGLESv2/Texture.cpp
index bf83472..f20b030 100644
--- a/src/OpenGL/libGLESv2/Texture.cpp
+++ b/src/OpenGL/libGLESv2/Texture.cpp
@@ -1975,7 +1975,7 @@
 
 }
 
-egl::Image *createBackBuffer(int width, int height, sw::Format format, int multiSampleDepth)
+NO_SANITIZE_FUNCTION egl::Image *createBackBuffer(int width, int height, sw::Format format, int multiSampleDepth)
 {
 	if(width > es2::IMPLEMENTATION_MAX_RENDERBUFFER_SIZE || height > es2::IMPLEMENTATION_MAX_RENDERBUFFER_SIZE)
 	{
@@ -1986,7 +1986,7 @@
 	return egl::Image::create(width, height, format, multiSampleDepth, false);
 }
 
-egl::Image *createDepthStencil(int width, int height, sw::Format format, int multiSampleDepth)
+NO_SANITIZE_FUNCTION egl::Image *createDepthStencil(int width, int height, sw::Format format, int multiSampleDepth)
 {
 	if(width > es2::IMPLEMENTATION_MAX_RENDERBUFFER_SIZE || height > es2::IMPLEMENTATION_MAX_RENDERBUFFER_SIZE)
 	{
diff --git a/src/OpenGL/libGLESv2/VertexDataManager.cpp b/src/OpenGL/libGLESv2/VertexDataManager.cpp
index 2fff628..59062f7 100644
--- a/src/OpenGL/libGLESv2/VertexDataManager.cpp
+++ b/src/OpenGL/libGLESv2/VertexDataManager.cpp
@@ -193,6 +193,7 @@
 				case GL_FIXED:          translated[i].type = sw::STREAMTYPE_FIXED;  break;
 				case GL_FLOAT:          translated[i].type = sw::STREAMTYPE_FLOAT;  break;
 				case GL_HALF_FLOAT:     translated[i].type = sw::STREAMTYPE_HALF;   break;
+				case GL_HALF_FLOAT_OES: translated[i].type = sw::STREAMTYPE_HALF;   break;
 				case GL_INT_2_10_10_10_REV:          translated[i].type = sw::STREAMTYPE_2_10_10_10_INT;  break;
 				case GL_UNSIGNED_INT_2_10_10_10_REV: translated[i].type = sw::STREAMTYPE_2_10_10_10_UINT; break;
 				default: UNREACHABLE(attrib.mType); translated[i].type = sw::STREAMTYPE_FLOAT;  break;
diff --git a/src/OpenGL/libGLESv2/exports.map b/src/OpenGL/libGLESv2/exports.map
index 8238564..adc4ff5 100644
--- a/src/OpenGL/libGLESv2/exports.map
+++ b/src/OpenGL/libGLESv2/exports.map
@@ -1,172 +1,279 @@
 {
 global:
-    glActiveTexture;
-    glAttachShader;
-    glBindAttribLocation;
-    glBindBuffer;
-    glBindFramebuffer;
-    glBindRenderbuffer;
-    glBindTexture;
-    glBlendColor;
-    glBlendEquation;
-    glBlendEquationSeparate;
-    glBlendFunc;
-    glBlendFuncSeparate;
-    glBufferData;
-    glBufferSubData;
-    glCheckFramebufferStatus;
-    glClear;
-    glClearColor;
-    glClearDepthf;
-    glClearStencil;
-    glColorMask;
-    glCompileShader;
-    glCompressedTexImage2D;
-    glCompressedTexSubImage2D;
-    glCopyTexImage2D;
-    glCopyTexSubImage2D;
-    glCreateProgram;
-    glCreateShader;
-    glCullFace;
-    glDeleteBuffers;
-    glDeleteFramebuffers;
-    glDeleteProgram;
-    glDeleteRenderbuffers;
-    glDeleteShader;
-    glDeleteTextures;
-    glDepthFunc;
-    glDepthMask;
-    glDepthRangef;
-    glDetachShader;
-    glDisable;
-    glDisableVertexAttribArray;
-    glDrawArrays;
-    glDrawElements;
-    glEnable;
-    glEnableVertexAttribArray;
-    glFinish;
-    glFlush;
-    glFramebufferRenderbuffer;
-    glFramebufferTexture2D;
-    glFrontFace;
-    glGenBuffers;
-    glGenFramebuffers;
-    glGenRenderbuffers;
-    glGenTextures;
-    glGenerateMipmap;
-    glGetActiveAttrib;
-    glGetActiveUniform;
-    glGetAttachedShaders;
-    glGetAttribLocation;
-    glGetBooleanv;
-    glGetBufferParameteriv;
-    glGetError;
-    glGetFloatv;
-    glGetFramebufferAttachmentParameteriv;
-    glGetIntegerv;
-    glGetProgramInfoLog;
-    glGetProgramiv;
-    glGetRenderbufferParameteriv;
-    glGetShaderInfoLog;
-    glGetShaderPrecisionFormat;
-    glGetShaderSource;
-    glGetShaderiv;
-    glGetString;
-    glGetTexParameterfv;
-    glGetTexParameteriv;
-    glGetUniformLocation;
-    glGetUniformfv;
-    glGetUniformiv;
-    glGetVertexAttribPointerv;
-    glGetVertexAttribfv;
-    glGetVertexAttribiv;
-    glHint;
-    glIsBuffer;
-    glIsEnabled;
-    glIsFramebuffer;
-    glIsProgram;
-    glIsRenderbuffer;
-    glIsShader;
-    glIsTexture;
-    glLineWidth;
-    glLinkProgram;
-    glPixelStorei;
-    glPolygonOffset;
-    glReadPixels;
-    glReleaseShaderCompiler;
-    glRenderbufferStorage;
-    glSampleCoverage;
-    glScissor;
-    glShaderBinary;
-    glShaderSource;
-    glStencilFunc;
-    glStencilFuncSeparate;
-    glStencilMask;
-    glStencilMaskSeparate;
-    glStencilOp;
-    glStencilOpSeparate;
-    glTexImage2D;
-    glTexParameterf;
-    glTexParameterfv;
-    glTexParameteri;
-    glTexParameteriv;
-    glTexSubImage2D;
-    glUniform1f;
-    glUniform1fv;
-    glUniform1i;
-    glUniform1iv;
-    glUniform2f;
-    glUniform2fv;
-    glUniform2i;
-    glUniform2iv;
-    glUniform3f;
-    glUniform3fv;
-    glUniform3i;
-    glUniform3iv;
-    glUniform4f;
-    glUniform4fv;
-    glUniform4i;
-    glUniform4iv;
-    glUniformMatrix2fv;
-    glUniformMatrix3fv;
-    glUniformMatrix4fv;
-    glUseProgram;
-    glValidateProgram;
-    glVertexAttrib1f;
-    glVertexAttrib1fv;
-    glVertexAttrib2f;
-    glVertexAttrib2fv;
-    glVertexAttrib3f;
-    glVertexAttrib3fv;
-    glVertexAttrib4f;
-    glVertexAttrib4fv;
-    glVertexAttribPointer;
-    glViewport;
+	# OpenGL ES 2.0 core functions
+	glActiveTexture;
+	glAttachShader;
+	glBindAttribLocation;
+	glBindBuffer;
+	glBindFramebuffer;
+	glBindRenderbuffer;
+	glBindTexture;
+	glBlendColor;
+	glBlendEquation;
+	glBlendEquationSeparate;
+	glBlendFunc;
+	glBlendFuncSeparate;
+	glBufferData;
+	glBufferSubData;
+	glCheckFramebufferStatus;
+	glClear;
+	glClearColor;
+	glClearDepthf;
+	glClearStencil;
+	glColorMask;
+	glCompileShader;
+	glCompressedTexImage2D;
+	glCompressedTexSubImage2D;
+	glCopyTexImage2D;
+	glCopyTexSubImage2D;
+	glCreateProgram;
+	glCreateShader;
+	glCullFace;
+	glDeleteBuffers;
+	glDeleteFramebuffers;
+	glDeleteProgram;
+	glDeleteRenderbuffers;
+	glDeleteShader;
+	glDeleteTextures;
+	glDepthFunc;
+	glDepthMask;
+	glDepthRangef;
+	glDetachShader;
+	glDisable;
+	glDisableVertexAttribArray;
+	glDrawArrays;
+	glDrawElements;
+	glEnable;
+	glEnableVertexAttribArray;
+	glFinish;
+	glFlush;
+	glFramebufferRenderbuffer;
+	glFramebufferTexture2D;
+	glFrontFace;
+	glGenBuffers;
+	glGenFramebuffers;
+	glGenRenderbuffers;
+	glGenTextures;
+	glGenerateMipmap;
+	glGetActiveAttrib;
+	glGetActiveUniform;
+	glGetAttachedShaders;
+	glGetAttribLocation;
+	glGetBooleanv;
+	glGetBufferParameteriv;
+	glGetError;
+	glGetFloatv;
+	glGetFramebufferAttachmentParameteriv;
+	glGetIntegerv;
+	glGetProgramInfoLog;
+	glGetProgramiv;
+	glGetRenderbufferParameteriv;
+	glGetShaderInfoLog;
+	glGetShaderPrecisionFormat;
+	glGetShaderSource;
+	glGetShaderiv;
+	glGetString;
+	glGetTexParameterfv;
+	glGetTexParameteriv;
+	glGetUniformLocation;
+	glGetUniformfv;
+	glGetUniformiv;
+	glGetVertexAttribPointerv;
+	glGetVertexAttribfv;
+	glGetVertexAttribiv;
+	glHint;
+	glIsBuffer;
+	glIsEnabled;
+	glIsFramebuffer;
+	glIsProgram;
+	glIsRenderbuffer;
+	glIsShader;
+	glIsTexture;
+	glLineWidth;
+	glLinkProgram;
+	glPixelStorei;
+	glPolygonOffset;
+	glReadPixels;
+	glReleaseShaderCompiler;
+	glRenderbufferStorage;
+	glSampleCoverage;
+	glScissor;
+	glShaderBinary;
+	glShaderSource;
+	glStencilFunc;
+	glStencilFuncSeparate;
+	glStencilMask;
+	glStencilMaskSeparate;
+	glStencilOp;
+	glStencilOpSeparate;
+	glTexImage2D;
+	glTexParameterf;
+	glTexParameterfv;
+	glTexParameteri;
+	glTexParameteriv;
+	glTexSubImage2D;
+	glUniform1f;
+	glUniform1fv;
+	glUniform1i;
+	glUniform1iv;
+	glUniform2f;
+	glUniform2fv;
+	glUniform2i;
+	glUniform2iv;
+	glUniform3f;
+	glUniform3fv;
+	glUniform3i;
+	glUniform3iv;
+	glUniform4f;
+	glUniform4fv;
+	glUniform4i;
+	glUniform4iv;
+	glUniformMatrix2fv;
+	glUniformMatrix3fv;
+	glUniformMatrix4fv;
+	glUseProgram;
+	glValidateProgram;
+	glVertexAttrib1f;
+	glVertexAttrib1fv;
+	glVertexAttrib2f;
+	glVertexAttrib2fv;
+	glVertexAttrib3f;
+	glVertexAttrib3fv;
+	glVertexAttrib4f;
+	glVertexAttrib4fv;
+	glVertexAttribPointer;
+	glViewport;
 
-    # Extensions
-    glTexImage3DOES;
-    glBlitFramebufferANGLE;
-    glRenderbufferStorageMultisampleANGLE;
-    glDeleteFencesNV;
-    glFinishFenceNV;
-    glGenFencesNV;
-    glGetFenceivNV;
-    glIsFenceNV;
-    glSetFenceNV;
-    glTestFenceNV;
-    glGetGraphicsResetStatusEXT;
-    glReadnPixelsEXT;
-    glGetnUniformfvEXT;
-    glGetnUniformivEXT;
-    glGenQueriesEXT;
-    glDeleteQueriesEXT;
-    glIsQueryEXT;
-    glBeginQueryEXT;
-    glEndQueryEXT;
-    glGetQueryivEXT;
-    glGetQueryObjectuivEXT;
-    glEGLImageTargetTexture2DOES;
-    glEGLImageTargetRenderbufferStorageOES;
+	# OpenGL ES 3.0 core functions
+	glReadBuffer;
+	glDrawRangeElements;
+	glTexImage3D;
+	glTexSubImage3D;
+	glCopyTexSubImage3D;
+	glCompressedTexImage3D;
+	glCompressedTexSubImage3D;
+	glGenQueries;
+	glDeleteQueries;
+	glIsQuery;
+	glBeginQuery;
+	glEndQuery;
+	glGetQueryiv;
+	glGetQueryObjectuiv;
+	glUnmapBuffer;
+	glGetBufferPointerv;
+	glDrawBuffers;
+	glUniformMatrix2x3fv;
+	glUniformMatrix3x2fv;
+	glUniformMatrix2x4fv;
+	glUniformMatrix4x2fv;
+	glUniformMatrix3x4fv;
+	glUniformMatrix4x3fv;
+	glBlitFramebuffer;
+	glRenderbufferStorageMultisample;
+	glFramebufferTextureLayer;
+	glMapBufferRange;
+	glFlushMappedBufferRange;
+	glBindVertexArray;
+	glDeleteVertexArrays;
+	glGenVertexArrays;
+	glIsVertexArray;
+	glGetIntegeri_v;
+	glBeginTransformFeedback;
+	glEndTransformFeedback;
+	glBindBufferRange;
+	glBindBufferBase;
+	glTransformFeedbackVaryings;
+	glGetTransformFeedbackVarying;
+	glVertexAttribIPointer;
+	glGetVertexAttribIiv;
+	glGetVertexAttribIuiv;
+	glVertexAttribI4i;
+	glVertexAttribI4ui;
+	glVertexAttribI4iv;
+	glVertexAttribI4uiv;
+	glGetUniformuiv;
+	glGetFragDataLocation;
+	glUniform1ui;
+	glUniform2ui;
+	glUniform3ui;
+	glUniform4ui;
+	glUniform1uiv;
+	glUniform2uiv;
+	glUniform3uiv;
+	glUniform4uiv;
+	glClearBufferiv;
+	glClearBufferuiv;
+	glClearBufferfv;
+	glClearBufferfi;
+	glGetStringi;
+	glCopyBufferSubData;
+	glGetUniformIndices;
+	glGetActiveUniformsiv;
+	glGetUniformBlockIndex;
+	glGetActiveUniformBlockiv;
+	glGetActiveUniformBlockName;
+	glUniformBlockBinding;
+	glDrawArraysInstanced;
+	glDrawElementsInstanced;
+	glFenceSync;
+	glIsSync;
+	glDeleteSync;
+	glClientWaitSync;
+	glWaitSync;
+	glGetInteger64v;
+	glGetSynciv;
+	glGetInteger64i_v;
+	glGetBufferParameteri64v;
+	glGenSamplers;
+	glDeleteSamplers;
+	glIsSampler;
+	glBindSampler;
+	glSamplerParameteri;
+	glSamplerParameteriv;
+	glSamplerParameterf;
+	glSamplerParameterfv;
+	glGetSamplerParameteriv;
+	glGetSamplerParameterfv;
+	glVertexAttribDivisor;
+	glBindTransformFeedback;
+	glDeleteTransformFeedbacks;
+	glGenTransformFeedbacks;
+	glIsTransformFeedback;
+	glPauseTransformFeedback;
+	glResumeTransformFeedback;
+	glGetProgramBinary;
+	glProgramBinary;
+	glProgramParameteri;
+	glInvalidateFramebuffer;
+	glInvalidateSubFramebuffer;
+	glTexStorage2D;
+	glTexStorage3D;
+	glGetInternalformativ;
+
+	# Extensions
+	glTexImage3DOES;
+	glBlitFramebufferANGLE;
+	glRenderbufferStorageMultisampleANGLE;
+	glDeleteFencesNV;
+	glFinishFenceNV;
+	glGenFencesNV;
+	glGetFenceivNV;
+	glIsFenceNV;
+	glSetFenceNV;
+	glTestFenceNV;
+	glGetGraphicsResetStatusEXT;
+	glReadnPixelsEXT;
+	glGetnUniformfvEXT;
+	glGetnUniformivEXT;
+	glGenQueriesEXT;
+	glDeleteQueriesEXT;
+	glIsQueryEXT;
+	glBeginQueryEXT;
+	glEndQueryEXT;
+	glGetQueryivEXT;
+	glGetQueryObjectuivEXT;
+	glEGLImageTargetTexture2DOES;
+	glEGLImageTargetRenderbufferStorageOES;
 	glIsRenderbufferOES;
 	glBindRenderbufferOES;
 	glDeleteRenderbuffersOES;
@@ -184,117 +291,15 @@
 	glGenerateMipmapOES;
 	glDrawBuffersEXT;
 
-    # GLES 3.0 Functions
-    glReadBuffer;
-    glDrawRangeElements;
-    glTexImage3D;
-    glTexSubImage3D;
-    glCopyTexSubImage3D;
-    glCompressedTexImage3D;
-    glCompressedTexSubImage3D;
-    glGenQueries;
-    glDeleteQueries;
-    glIsQuery;
-    glBeginQuery;
-    glEndQuery;
-    glGetQueryiv;
-    glGetQueryObjectuiv;
-    glUnmapBuffer;
-    glGetBufferPointerv;
-    glDrawBuffers;
-    glUniformMatrix2x3fv;
-    glUniformMatrix3x2fv;
-    glUniformMatrix2x4fv;
-    glUniformMatrix4x2fv;
-    glUniformMatrix3x4fv;
-    glUniformMatrix4x3fv;
-    glBlitFramebuffer;
-    glRenderbufferStorageMultisample;
-    glFramebufferTextureLayer;
-    glMapBufferRange;
-    glFlushMappedBufferRange;
-    glBindVertexArray;
-    glDeleteVertexArrays;
-    glGenVertexArrays;
-    glIsVertexArray;
-    glGetIntegeri_v;
-    glBeginTransformFeedback;
-    glEndTransformFeedback;
-    glBindBufferRange;
-    glBindBufferBase;
-    glTransformFeedbackVaryings;
-    glGetTransformFeedbackVarying;
-    glVertexAttribIPointer;
-    glGetVertexAttribIiv;
-    glGetVertexAttribIuiv;
-    glVertexAttribI4i;
-    glVertexAttribI4ui;
-    glVertexAttribI4iv;
-    glVertexAttribI4uiv;
-    glGetUniformuiv;
-    glGetFragDataLocation;
-    glUniform1ui;
-    glUniform2ui;
-    glUniform3ui;
-    glUniform4ui;
-    glUniform1uiv;
-    glUniform2uiv;
-    glUniform3uiv;
-    glUniform4uiv;
-    glClearBufferiv;
-    glClearBufferuiv;
-    glClearBufferfv;
-    glClearBufferfi;
-    glGetStringi;
-    glCopyBufferSubData;
-    glGetUniformIndices;
-    glGetActiveUniformsiv;
-    glGetUniformBlockIndex;
-    glGetActiveUniformBlockiv;
-    glGetActiveUniformBlockName;
-    glUniformBlockBinding;
-    glDrawArraysInstanced;
-    glDrawElementsInstanced;
-    glFenceSync;
-    glIsSync;
-    glDeleteSync;
-    glClientWaitSync;
-    glWaitSync;
-    glGetInteger64v;
-    glGetSynciv;
-    glGetInteger64i_v;
-    glGetBufferParameteri64v;
-    glGenSamplers;
-    glDeleteSamplers;
-    glIsSampler;
-    glBindSampler;
-    glSamplerParameteri;
-    glSamplerParameteriv;
-    glSamplerParameterf;
-    glSamplerParameterfv;
-    glGetSamplerParameteriv;
-    glGetSamplerParameterfv;
-    glVertexAttribDivisor;
-    glBindTransformFeedback;
-    glDeleteTransformFeedbacks;
-    glGenTransformFeedbacks;
-    glIsTransformFeedback;
-    glPauseTransformFeedback;
-    glResumeTransformFeedback;
-    glGetProgramBinary;
-    glProgramBinary;
-    glProgramParameteri;
-    glInvalidateFramebuffer;
-    glInvalidateSubFramebuffer;
-    glTexStorage2D;
-    glTexStorage3D;
-    glGetInternalformativ;
+	# Table of function pointers to disambiguate between libraries
+	libGLESv2_swiftshader;
 
-    libGLESv2_swiftshader;
+	# Type-strings and type-infos required by sanitizers
+	_ZTS*;
+	_ZTI*;
 
-    Register;
+	Register;
 
 local:
-    *;
+	*;
 };
-
diff --git a/src/OpenGL/libGLESv2/libGLESv2.cpp b/src/OpenGL/libGLESv2/libGLESv2.cpp
index 8be9056..f486982 100644
--- a/src/OpenGL/libGLESv2/libGLESv2.cpp
+++ b/src/OpenGL/libGLESv2/libGLESv2.cpp
@@ -4247,6 +4247,9 @@
 	case GL_FRAGMENT_SHADER_DERIVATIVE_HINT_OES:
 		if(context) context->setFragmentShaderDerivativeHint(mode);
 		break;
+	case GL_TEXTURE_FILTERING_HINT_CHROMIUM:
+		if(context) context->setTextureFilteringHint(mode);
+		break;
 	default:
 		return error(GL_INVALID_ENUM);
 	}
@@ -5970,7 +5973,7 @@
 			}
 		}
 
-		programObject->validate();
+		programObject->validate(context->getDevice());
 	}
 }
 
@@ -6143,6 +6146,7 @@
 	case GL_UNSIGNED_SHORT:
 	case GL_FIXED:
 	case GL_FLOAT:
+	case GL_HALF_FLOAT_OES:   // GL_OES_vertex_half_float
 		break;
 	case GL_INT_2_10_10_10_REV:
 	case GL_UNSIGNED_INT_2_10_10_10_REV:
@@ -6852,7 +6856,7 @@
 
 }
 
-extern "C" __eglMustCastToProperFunctionPointerType es2GetProcAddress(const char *procname)
+extern "C" NO_SANITIZE_FUNCTION __eglMustCastToProperFunctionPointerType es2GetProcAddress(const char *procname)
 {
 	struct Extension
 	{
diff --git a/src/OpenGL/libGLESv2/libGLESv2.vcxproj b/src/OpenGL/libGLESv2/libGLESv2.vcxproj
index bbb16a9..bae560e 100644
--- a/src/OpenGL/libGLESv2/libGLESv2.vcxproj
+++ b/src/OpenGL/libGLESv2/libGLESv2.vcxproj
@@ -125,7 +125,7 @@
     <ClCompile>

       <Optimization>Disabled</Optimization>

       <AdditionalIncludeDirectories>$(SolutionDir)\src;$(ProjectDir)/..;$(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;_DEBUG;_WINDOWS;_USRDLL;LIBGLESV2_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NO_SANITIZE_FUNCTION=;_DEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <MinimalRebuild>true</MinimalRebuild>

       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>

       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>

@@ -156,7 +156,7 @@
     <ClCompile>

       <Optimization>Disabled</Optimization>

       <AdditionalIncludeDirectories>$(SolutionDir)\src;$(ProjectDir)/..;$(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;_DEBUG;_WINDOWS;_USRDLL;LIBGLESV2_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NO_SANITIZE_FUNCTION=;_DEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>

       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>

       <PrecompiledHeader>

@@ -189,7 +189,7 @@
       <Optimization>Full</Optimization>

       <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>

       <AdditionalIncludeDirectories>$(ProjectDir)/..;$(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NDEBUG;_WINDOWS;_USRDLL;LIBGLESV2_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NO_SANITIZE_FUNCTION=;NDEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <PrecompiledHeader>

       </PrecompiledHeader>

@@ -227,7 +227,7 @@
       <Optimization>Full</Optimization>

       <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>

       <AdditionalIncludeDirectories>$(ProjectDir)/..;$(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NDEBUG;_WINDOWS;_USRDLL;LIBGLESV2_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NO_SANITIZE_FUNCTION=;NDEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <PrecompiledHeader>

       </PrecompiledHeader>

@@ -267,7 +267,7 @@
       <Optimization>Full</Optimization>

       <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>

       <AdditionalIncludeDirectories>$(ProjectDir)/..; $(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NDEBUG;_WINDOWS;_USRDLL;LIBGLESV2_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NO_SANITIZE_FUNCTION=;NDEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <PrecompiledHeader>

       </PrecompiledHeader>

@@ -303,7 +303,7 @@
       <Optimization>Full</Optimization>

       <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>

       <AdditionalIncludeDirectories>$(ProjectDir)/..; $(ProjectDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NDEBUG;_WINDOWS;_USRDLL;LIBGLESV2_EXPORTS;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;GL_API=;GL_APICALL=;GL_GLEXT_PROTOTYPES;NO_SANITIZE_FUNCTION=;NDEBUG;_WINDOWS;_USRDLL;_CRT_SECURE_NO_DEPRECATE;NOMINMAX;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <PrecompiledHeader>

       </PrecompiledHeader>

diff --git a/src/OpenGL/libGLESv2/utilities.cpp b/src/OpenGL/libGLESv2/utilities.cpp
index d5b2b28..4257b69 100644
--- a/src/OpenGL/libGLESv2/utilities.cpp
+++ b/src/OpenGL/libGLESv2/utilities.cpp
@@ -1087,6 +1087,7 @@
 		case GL_RGBA16I:
 		case GL_RGBA32I:
 		case GL_RGBA32UI:
+		case GL_R11F_G11F_B10F:
 			return clientVersion >= 3;
 		case GL_DEPTH_COMPONENT24:
 		case GL_DEPTH_COMPONENT32_OES:
diff --git a/src/Reactor/BUILD.gn b/src/Reactor/BUILD.gn
index 5fa683e..4bc15c5 100644
--- a/src/Reactor/BUILD.gn
+++ b/src/Reactor/BUILD.gn
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import("../swiftshader.gni")
+
 declare_args() {
   # Currently, Subzero is not used by default
   # LLVM is still the default backend
@@ -73,6 +75,7 @@
       "/wd4146",
       "/wd4245",  # conversion from int to unsigned int (llvm)
       "/wd4267",
+      "/wd4291",
       "/wd4310",
       "/wd4334",
       "/wd4389",
@@ -110,6 +113,8 @@
       "/wd4146",
       "/wd4245",  # conversion from int to unsigned int (llvm)
       "/wd4267",
+      "/wd4291",
+      "/wd4309",
       "/wd4702",
       "/wd4800",
     ]
@@ -146,7 +151,6 @@
     ]
 
     defines = [
-      "LOG_TAG=\"swiftshader_reactor\"",
       "__STDC_CONSTANT_MACROS",
       "__STDC_LIMIT_MACROS",
     ]
@@ -154,7 +158,7 @@
 }
 
 if (use_swiftshader_with_subzero) {
-  source_set("swiftshader_subzero") {
+  swiftshader_source_set("swiftshader_subzero") {
     subzero_dir = "../../third_party/subzero"
     subzero_llvm_dir = "../../third_party/llvm-subzero"
 
@@ -241,17 +245,14 @@
       ]
     }
 
-    if (is_win) {
-      configs -= [ "//build/config/win:unicode" ]
-    }
-    configs += [
+    configs = [
       ":swiftshader_subzero_common_private_config",
       ":swiftshader_subzero_private_config",
     ]
   }
 }
 
-source_set("swiftshader_reactor") {
+swiftshader_source_set("swiftshader_reactor") {
   deps = [
     "../OpenGL/common:swiftshader_opengl_common",
   ]
@@ -268,7 +269,7 @@
       "SubzeroReactor.cpp",
     ]
 
-    configs += [
+    configs = [
       ":swiftshader_subzero_common_private_config",
       ":swiftshader_reactor_with_subzero_private_config",
     ]
@@ -281,7 +282,7 @@
       "LLVMRoutineManager.cpp",
     ]
 
-    configs += [ ":swiftshader_reactor_private_config" ]
+    configs = [ ":swiftshader_reactor_private_config" ]
 
     include_dirs = [
       "..",
@@ -289,8 +290,4 @@
       "../../third_party/LLVM/include/",
     ]
   }
-
-  if (is_win) {
-    configs -= [ "//build/config/win:unicode" ]
-  }
 }
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index dda5c04..c66e5a5 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -76,20 +76,54 @@
 
 namespace sw
 {
-	using namespace llvm;
-
 	Optimization optimization[10] = {InstructionCombining, Disabled};
 
-	class Type : public llvm::Type {};
+	enum EmulatedType
+	{
+		Type_v2i32,
+		Type_v4i16,
+		Type_v2i16,
+		Type_v8i8,
+		Type_v4i8,
+		Type_v2f32,
+		EmulatedTypeCount
+	};
+
 	class Value : public llvm::Value {};
 	class SwitchCases : public llvm::SwitchInst {};
 	class BasicBlock : public llvm::BasicBlock {};
 
+	// Maps a Reactor type handle to the LLVM type that backs it.
+	llvm::Type *T(Type *t)
+	{
+		const uintptr_t tag = reinterpret_cast<uintptr_t>(t);
+		if(tag >= EmulatedTypeCount)
+		{
+			return reinterpret_cast<llvm::Type*>(t);   // Not an emulated tag: the handle is an llvm::Type pointer.
+		}
+
+		switch(tag)   // Use 128-bit vectors to implement logically shorter ones.
+		{
+		case Type_v2i32: return T(Int4::getType());
+		case Type_v4i16:
+		case Type_v2i16: return T(Short8::getType());
+		case Type_v8i8:
+		case Type_v4i8:  return T(Byte16::getType());
+		case Type_v2f32: return T(Float4::getType());
+		default: assert(false); return reinterpret_cast<llvm::Type*>(t);
+		}
+	}
+
 	inline Type *T(llvm::Type *t)
 	{
 		return reinterpret_cast<Type*>(t);
 	}
 
+	Type *T(EmulatedType t)
+	{
+		return reinterpret_cast<Type*>(t);
+	}
+
 	inline Value *V(llvm::Value *t)
 	{
 		return reinterpret_cast<Value*>(t);
@@ -105,19 +139,39 @@
 		return reinterpret_cast<BasicBlock*>(t);
 	}
 
+	// Returns the size in bytes that Reactor attributes to a type.
+	static size_t typeSize(Type *type)
+	{
+		const uintptr_t tag = reinterpret_cast<uintptr_t>(type);
+		if(tag >= EmulatedTypeCount)
+		{
+			return T(type)->getPrimitiveSizeInBits() / 8;   // Real LLVM type: ask LLVM.
+		}
+		switch(tag)   // Emulated vectors report their logical size, not that of the 128-bit vector backing them.
+		{
+		case Type_v2i32:
+		case Type_v4i16:
+		case Type_v8i8:
+		case Type_v2f32: return 8;
+		case Type_v2i16:
+		case Type_v4i8:  return 4;
+		default: assert(false); return T(type)->getPrimitiveSizeInBits() / 8;
+		}
+	}
+
 	Nucleus::Nucleus()
 	{
 		::codegenMutex.lock();   // Reactor and LLVM are currently not thread safe
 
-		InitializeNativeTarget();
-		JITEmitDebugInfo = false;
+		llvm::InitializeNativeTarget();
+		llvm::JITEmitDebugInfo = false;
 
 		if(!::context)
 		{
-			::context = new LLVMContext();
+			::context = new llvm::LLVMContext();
 		}
 
-		::module = new Module("", *::context);
+		::module = new llvm::Module("", *::context);
 		::routineManager = new LLVMRoutineManager();
 
 		#if defined(__x86_64__)
@@ -126,7 +180,7 @@
 			const char *architecture = "x86";
 		#endif
 
-		SmallVector<std::string, 1> MAttrs;
+		llvm::SmallVector<std::string, 1> MAttrs;
 		MAttrs.push_back(CPUID::supportsMMX()    ? "+mmx"   : "-mmx");
 		MAttrs.push_back(CPUID::supportsCMOV()   ? "+cmov"  : "-cmov");
 		MAttrs.push_back(CPUID::supportsSSE()    ? "+sse"   : "-sse");
@@ -136,12 +190,12 @@
 		MAttrs.push_back(CPUID::supportsSSE4_1() ? "+sse41" : "-sse41");
 
 		std::string error;
-		TargetMachine *targetMachine = EngineBuilder::selectTarget(::module, architecture, "", MAttrs, Reloc::Default, CodeModel::JITDefault, &error);
-		::executionEngine = JIT::createJIT(::module, 0, ::routineManager, CodeGenOpt::Aggressive, true, targetMachine);
+		llvm::TargetMachine *targetMachine = llvm::EngineBuilder::selectTarget(::module, architecture, "", MAttrs, llvm::Reloc::Default, llvm::CodeModel::JITDefault, &error);
+		::executionEngine = llvm::JIT::createJIT(::module, 0, ::routineManager, llvm::CodeGenOpt::Aggressive, true, targetMachine);
 
 		if(!::builder)
 		{
-			::builder = new IRBuilder<>(*::context);
+			::builder = new llvm::IRBuilder<>(*::context);
 
 			#if defined(_WIN32)
 				HMODULE CodeAnalyst = LoadLibrary("CAJitNtfyLib.dll");
@@ -181,14 +235,14 @@
 			}
 			else
 			{
-				createRet(V(UndefValue::get(type)));
+				createRet(V(llvm::UndefValue::get(type)));
 			}
 		}
 
 		if(false)
 		{
 			std::string error;
-			raw_fd_ostream file("llvm-dump-unopt.txt", error);
+			llvm::raw_fd_ostream file("llvm-dump-unopt.txt", error);
 			::module->print(file, 0);
 		}
 
@@ -200,7 +254,7 @@
 		if(false)
 		{
 			std::string error;
-			raw_fd_ostream file("llvm-dump-opt.txt", error);
+			llvm::raw_fd_ostream file("llvm-dump-opt.txt", error);
 			::module->print(file, 0);
 		}
 
@@ -217,33 +271,33 @@
 
 	void Nucleus::optimize()
 	{
-		static PassManager *passManager = nullptr;
+		static llvm::PassManager *passManager = nullptr;
 
 		if(!passManager)
 		{
-			passManager = new PassManager();
+			passManager = new llvm::PassManager();
 
-			UnsafeFPMath = true;
-		//	NoInfsFPMath = true;
-		//	NoNaNsFPMath = true;
+			llvm::UnsafeFPMath = true;
+		//	llvm::NoInfsFPMath = true;
+		//	llvm::NoNaNsFPMath = true;
 
-			passManager->add(new TargetData(*::executionEngine->getTargetData()));
-			passManager->add(createScalarReplAggregatesPass());
+			passManager->add(new llvm::TargetData(*::executionEngine->getTargetData()));
+			passManager->add(llvm::createScalarReplAggregatesPass());
 
 			for(int pass = 0; pass < 10 && optimization[pass] != Disabled; pass++)
 			{
 				switch(optimization[pass])
 				{
-				case Disabled:                                                                 break;
-				case CFGSimplification:    passManager->add(createCFGSimplificationPass());    break;
-				case LICM:                 passManager->add(createLICMPass());                 break;
-				case AggressiveDCE:        passManager->add(createAggressiveDCEPass());        break;
-				case GVN:                  passManager->add(createGVNPass());                  break;
-				case InstructionCombining: passManager->add(createInstructionCombiningPass()); break;
-				case Reassociate:          passManager->add(createReassociatePass());          break;
-				case DeadStoreElimination: passManager->add(createDeadStoreEliminationPass()); break;
-				case SCCP:                 passManager->add(createSCCPPass());                 break;
-				case ScalarReplAggregates: passManager->add(createScalarReplAggregatesPass()); break;
+				case Disabled:                                                                       break;
+				case CFGSimplification:    passManager->add(llvm::createCFGSimplificationPass());    break;
+				case LICM:                 passManager->add(llvm::createLICMPass());                 break;
+				case AggressiveDCE:        passManager->add(llvm::createAggressiveDCEPass());        break;
+				case GVN:                  passManager->add(llvm::createGVNPass());                  break;
+				case InstructionCombining: passManager->add(llvm::createInstructionCombiningPass()); break;
+				case Reassociate:          passManager->add(llvm::createReassociatePass());          break;
+				case DeadStoreElimination: passManager->add(llvm::createDeadStoreEliminationPass()); break;
+				case SCCP:                 passManager->add(llvm::createSCCPPass());                 break;
+				case ScalarReplAggregates: passManager->add(llvm::createScalarReplAggregatesPass()); break;
 				default:
 					assert(false);
 				}
@@ -258,15 +312,15 @@
 		// Need to allocate it in the entry block for mem2reg to work
 		llvm::BasicBlock &entryBlock = ::function->getEntryBlock();
 
-		Instruction *declaration;
+		llvm::Instruction *declaration;
 
 		if(arraySize)
 		{
-			declaration = new AllocaInst(type, Nucleus::createConstantInt(arraySize));
+			declaration = new llvm::AllocaInst(T(type), Nucleus::createConstantInt(arraySize));
 		}
 		else
 		{
-			declaration = new AllocaInst(type, (Value*)0);
+			declaration = new llvm::AllocaInst(T(type), (Value*)nullptr);
 		}
 
 		entryBlock.getInstList().push_front(declaration);
@@ -292,7 +346,7 @@
 
 	void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
 	{
-		llvm::FunctionType *functionType = llvm::FunctionType::get(ReturnType, T(Params), false);
+		llvm::FunctionType *functionType = llvm::FunctionType::get(T(ReturnType), T(Params), false);
 		::function = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, "", ::module);
 		::function->setCallingConv(llvm::CallingConv::C);
 
@@ -314,15 +368,11 @@
 
 	void Nucleus::createRetVoid()
 	{
-		x86::emms();
-
 		::builder->CreateRetVoid();
 	}
 
 	void Nucleus::createRet(Value *v)
 	{
-		x86::emms();
-
 		::builder->CreateRet(v);
 	}
 
@@ -441,73 +491,154 @@
 		return V(::builder->CreateNot(v));
 	}
 
-	Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int align)
+	Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment)
 	{
-		assert(ptr->getType()->getContainedType(0) == type);
-		return V(::builder->Insert(new LoadInst(ptr, "", isVolatile, align)));
+		uintptr_t t = reinterpret_cast<uintptr_t>(type);
+		if(t < EmulatedTypeCount)
+		{
+			switch(t)
+			{
+			case Type_v2i32:
+			case Type_v4i16:
+			case Type_v8i8:
+			case Type_v2f32:
+				return createBitCast(createInsertElement(V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2))), createLoad(createBitCast(ptr, Pointer<Long>::getType()), Long::getType(), isVolatile, alignment), 0), T(T(type)));
+			case Type_v2i16:
+			case Type_v4i8:
+				if(alignment != 0)   // Not a local variable (all vectors are 128-bit).
+				{
+					Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2)));
+					Value *i = V(createLoad(createBitCast(ptr, Pointer<Int>::getType()), Int::getType(), isVolatile, alignment));
+					i = createZExt(i, Long::getType());
+					Value *v = V(createInsertElement(u, i, 0));
+					return createBitCast(v, T(T(type)));
+				}
+				break;
+			default:
+				assert(false);
+			}
+		}
+
+		assert(ptr->getType()->getContainedType(0) == T(type));
+		return V(::builder->Insert(new llvm::LoadInst(ptr, "", isVolatile, alignment)));
 	}
 
-	Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int align)
+	Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment)
 	{
-		assert(ptr->getType()->getContainedType(0) == type);
-		::builder->Insert(new StoreInst(value, ptr, isVolatile, align));
+		uintptr_t t = reinterpret_cast<uintptr_t>(type);
+		if(t < EmulatedTypeCount)
+		{
+			switch(t)
+			{
+			case Type_v2i32:
+			case Type_v4i16:
+			case Type_v8i8:
+			case Type_v2f32:
+				createStore(createExtractElement(createBitCast(value, T(llvm::VectorType::get(T(Long::getType()), 2))), Long::getType(), 0), createBitCast(ptr, Pointer<Long>::getType()), Long::getType(), isVolatile, alignment);
+				return value;
+			case Type_v2i16:
+			case Type_v4i8:
+				if(alignment != 0)   // Not a local variable (all vectors are 128-bit).
+				{
+					createStore(createExtractElement(createBitCast(value, Int4::getType()), Int::getType(), 0), createBitCast(ptr, Pointer<Int>::getType()), Int::getType(), isVolatile, alignment);
+					return value;
+				}
+				break;
+			default:
+				assert(false);
+			}
+		}
+
+		assert(ptr->getType()->getContainedType(0) == T(type));
+		::builder->Insert(new llvm::StoreInst(value, ptr, isVolatile, alignment));
 		return value;
 	}
 
 	Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
 	{
-		if(unsignedIndex && sizeof(void*) == 8)
+		if(sizeof(void*) == 8)   // Widen the index to pointer width, honoring its signedness.
 		{
-			index = createZExt(index, Long::getType());
+			index = unsignedIndex ? createZExt(index, Long::getType()) : createSExt(index, Long::getType());
+		}
+
+		assert(ptr->getType()->getContainedType(0) == T(type));
+
+		// typeSize() is 0 for LLVM types without a primitive size (pointers, i1), which would make
+		// every index address element 0. Real LLVM types are strided correctly by GEP itself, so
+		// only emulated vectors, whose logical size is smaller than the 128-bit vector backing
+		// them, need manual byte addressing.
+		if(reinterpret_cast<uintptr_t>(type) >= EmulatedTypeCount)
+		{
+			return V(::builder->CreateGEP(ptr, index));
 		}
 
-		assert(ptr->getType()->getContainedType(0) == type);
-		return V(::builder->CreateGEP(ptr, index));
+		// Emulated vector: scale the index to a byte offset, step a byte pointer, then cast back.
+		Value *offset = createMul(index, (sizeof(void*) == 8) ? createConstantLong((int64_t)typeSize(type)) : createConstantInt((int)typeSize(type)));
+		Value *bytePointer = createBitCast(ptr, T(llvm::PointerType::get(T(Byte::getType()), 0)));
+		return createBitCast(V(::builder->CreateGEP(bytePointer, offset)), T(llvm::PointerType::get(T(type), 0)));
 	}
 
 	Value *Nucleus::createAtomicAdd(Value *ptr, Value *value)
 	{
-		return V(::builder->CreateAtomicRMW(AtomicRMWInst::Add, ptr, value, SequentiallyConsistent));
+		return V(::builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, ptr, value, llvm::SequentiallyConsistent));
 	}
 
 	Value *Nucleus::createTrunc(Value *v, Type *destType)
 	{
-		return V(::builder->CreateTrunc(v, destType));
+		return V(::builder->CreateTrunc(v, T(destType)));
 	}
 
 	Value *Nucleus::createZExt(Value *v, Type *destType)
 	{
-		return V(::builder->CreateZExt(v, destType));
+		return V(::builder->CreateZExt(v, T(destType)));
 	}
 
 	Value *Nucleus::createSExt(Value *v, Type *destType)
 	{
-		return V(::builder->CreateSExt(v, destType));
+		return V(::builder->CreateSExt(v, T(destType)));
 	}
 
 	Value *Nucleus::createFPToSI(Value *v, Type *destType)
 	{
-		return V(::builder->CreateFPToSI(v, destType));
+		return V(::builder->CreateFPToSI(v, T(destType)));
 	}
 
 	Value *Nucleus::createSIToFP(Value *v, Type *destType)
 	{
-		return V(::builder->CreateSIToFP(v, destType));
+		return V(::builder->CreateSIToFP(v, T(destType)));
 	}
 
 	Value *Nucleus::createFPTrunc(Value *v, Type *destType)
 	{
-		return V(::builder->CreateFPTrunc(v, destType));
+		return V(::builder->CreateFPTrunc(v, T(destType)));
 	}
 
 	Value *Nucleus::createFPExt(Value *v, Type *destType)
 	{
-		return V(::builder->CreateFPExt(v, destType));
+		return V(::builder->CreateFPExt(v, T(destType)));
 	}
 
 	Value *Nucleus::createBitCast(Value *v, Type *destType)
 	{
-		return V(::builder->CreateBitCast(v, destType));
+		// Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
+		// support for casting between scalars and wide vectors. Emulate them by writing to the stack and
+		// reading back as the destination type.
+		if(!v->getType()->isVectorTy() && T(destType)->isVectorTy())
+		{
+			Value *readAddress = allocateStackVariable(destType);
+			Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(v->getType(), 0)));
+			createStore(v, writeAddress, T(v->getType()));
+			return createLoad(readAddress, destType);
+		}
+		else if(v->getType()->isVectorTy() && !T(destType)->isVectorTy())
+		{
+			Value *writeAddress = allocateStackVariable(T(v->getType()));
+			createStore(v, writeAddress, T(v->getType()));
+			Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
+			return createLoad(readAddress, destType);
+		}
+
+		return V(::builder->CreateBitCast(v, T(destType)));
 	}
 
 	Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
@@ -632,7 +763,7 @@
 
 	Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
 	{
-		assert(vector->getType()->getContainedType(0) == type);
+		assert(vector->getType()->getContainedType(0) == T(type));
 		return V(::builder->CreateExtractElement(vector, createConstantInt(index)));
 	}
 
@@ -650,7 +781,7 @@
 
 		for(int i = 0; i < size; i++)
 		{
-			swizzle[i] = llvm::ConstantInt::get(Type::getInt32Ty(*::context), select[i]);
+			swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*::context), select[i]);
 		}
 
 		llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(swizzle, size));
@@ -670,7 +801,7 @@
 
 	void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
 	{
-		switchCases->addCase(llvm::ConstantInt::get(Type::getInt32Ty(*::context), label, true), branch);
+		switchCases->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*::context), label, true), branch);
 	}
 
 	void Nucleus::createUnreachable()
@@ -713,74 +844,74 @@
 
 	Type *Nucleus::getPointerType(Type *ElementType)
 	{
-		return T(llvm::PointerType::get(ElementType, 0));
+		return T(llvm::PointerType::get(T(ElementType), 0));
 	}
 
 	Value *Nucleus::createNullValue(Type *Ty)
 	{
-		return V(llvm::Constant::getNullValue(Ty));
+		return V(llvm::Constant::getNullValue(T(Ty)));
 	}
 
 	Value *Nucleus::createConstantLong(int64_t i)
 	{
-		return V(llvm::ConstantInt::get(Type::getInt64Ty(*::context), i, true));
+		return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(*::context), i, true));
 	}
 
 	Value *Nucleus::createConstantInt(int i)
 	{
-		return V(llvm::ConstantInt::get(Type::getInt32Ty(*::context), i, true));
+		return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*::context), i, true));
 	}
 
 	Value *Nucleus::createConstantInt(unsigned int i)
 	{
-		return V(llvm::ConstantInt::get(Type::getInt32Ty(*::context), i, false));
+		return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*::context), i, false));
 	}
 
 	Value *Nucleus::createConstantBool(bool b)
 	{
-		return V(llvm::ConstantInt::get(Type::getInt1Ty(*::context), b));
+		return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(*::context), b));
 	}
 
 	Value *Nucleus::createConstantByte(signed char i)
 	{
-		return V(llvm::ConstantInt::get(Type::getInt8Ty(*::context), i, true));
+		return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*::context), i, true));
 	}
 
 	Value *Nucleus::createConstantByte(unsigned char i)
 	{
-		return V(llvm::ConstantInt::get(Type::getInt8Ty(*::context), i, false));
+		return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*::context), i, false));
 	}
 
 	Value *Nucleus::createConstantShort(short i)
 	{
-		return V(llvm::ConstantInt::get(Type::getInt16Ty(*::context), i, true));
+		return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*::context), i, true));
 	}
 
 	Value *Nucleus::createConstantShort(unsigned short i)
 	{
-		return V(llvm::ConstantInt::get(Type::getInt16Ty(*::context), i, false));
+		return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*::context), i, false));
 	}
 
 	Value *Nucleus::createConstantFloat(float x)
 	{
-		return V(llvm::ConstantFP::get(Float::getType(), x));
+		return V(llvm::ConstantFP::get(T(Float::getType()), x));
 	}
 
 	Value *Nucleus::createNullPointer(Type *Ty)
 	{
-		return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(Ty, 0)));
+		return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
 	}
 
 	Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
 	{
-		assert(llvm::isa<VectorType>(type));
-		const int numConstants = llvm::cast<VectorType>(type)->getNumElements();
+		assert(llvm::isa<llvm::VectorType>(T(type)));
+		const int numConstants = llvm::cast<llvm::VectorType>(T(type))->getNumElements();   // Lanes of the underlying LLVM vector (128-bit wide for emulated types).
 		assert(numConstants <= 16);
 		llvm::Constant *constantVector[16];
-
+		const int numProvided = (reinterpret_cast<uintptr_t>(type) < EmulatedTypeCount) ? (int)(typeSize(type) * 8 / T(type)->getContainedType(0)->getPrimitiveSizeInBits()) : numConstants;   // Logical lanes, i.e. how many constants the caller supplied.
 		for(int i = 0; i < numConstants; i++)
 		{
-			constantVector[i] = llvm::ConstantInt::get(type->getContainedType(0), constants[i]);
+			constantVector[i] = llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numProvided]);   // Wrap around so padding lanes never read past the caller's array.
 		}
 
 		return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numConstants)));
@@ -788,14 +919,14 @@
 
 	Value *Nucleus::createConstantVector(const double *constants, Type *type)
 	{
-		assert(llvm::isa<VectorType>(type));
-		const int numConstants = llvm::cast<VectorType>(type)->getNumElements();
+		assert(llvm::isa<llvm::VectorType>(T(type)));
+		const int numConstants = llvm::cast<llvm::VectorType>(T(type))->getNumElements();   // Lanes of the underlying LLVM vector (128-bit wide for emulated types).
 		assert(numConstants <= 8);
 		llvm::Constant *constantVector[8];
-
+		const int numProvided = (reinterpret_cast<uintptr_t>(type) < EmulatedTypeCount) ? (int)(typeSize(type) * 8 / T(type)->getContainedType(0)->getPrimitiveSizeInBits()) : numConstants;   // Logical lanes, i.e. how many constants the caller supplied.
 		for(int i = 0; i < numConstants; i++)
 		{
-			constantVector[i] = llvm::ConstantFP::get(type->getContainedType(0), constants[i]);
+			constantVector[i] = llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numProvided]);   // Wrap around so padding lanes never read past the caller's array.
 		}
 
 		return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numConstants)));
@@ -806,17 +937,6 @@
 		return T(llvm::Type::getVoidTy(*::context));
 	}
 
-	class MMX : public LValue<MMX>
-	{
-	public:
-		static Type *getType();
-	};
-
-	Type *MMX::getType()
-	{
-		return T(llvm::Type::getX86_MMXTy(*::context));
-	}
-
 	Bool::Bool(Argument<Bool> argument)
 	{
 		storeValue(argument.value);
@@ -1894,7 +2014,7 @@
 
 	Byte4::Byte4(RValue<Byte8> cast)
 	{
-		storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), Int::getType()));
+		storeValue(Nucleus::createBitCast(cast.value, getType()));
 	}
 
 	Byte4::Byte4(const Reference<Byte4> &rhs)
@@ -1905,28 +2025,18 @@
 
 	Type *Byte4::getType()
 	{
-		#if 0
-			return T(VectorType::get(Byte::getType(), 4));
-		#else
-			return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
-		#endif
+		return T(Type_v4i8);
 	}
 
 	Type *SByte4::getType()
 	{
-		#if 0
-			return T(VectorType::get(SByte::getType(), 4));
-		#else
-			return Int::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
-		#endif
+		return T(Type_v4i8);
 	}
 
 	Byte8::Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
 	{
 		int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
-		Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Byte::getType(), 8))));
-
-		storeValue(Nucleus::createBitCast(vector, getType()));
+		storeValue(Nucleus::createConstantVector(constantVector, getType()));
 	}
 
 	Byte8::Byte8(RValue<Byte8> rhs)
@@ -1971,26 +2081,12 @@
 
 	RValue<Byte8> operator+(RValue<Byte8> lhs, RValue<Byte8> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::paddb(lhs, rhs);
-		}
-		else
-		{
-			return RValue<Byte8>(Nucleus::createAdd(lhs.value, rhs.value));
-		}
+		return RValue<Byte8>(Nucleus::createAdd(lhs.value, rhs.value));
 	}
 
 	RValue<Byte8> operator-(RValue<Byte8> lhs, RValue<Byte8> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::psubb(lhs, rhs);
-		}
-		else
-		{
-			return RValue<Byte8>(Nucleus::createSub(lhs.value, rhs.value));
-		}
+		return RValue<Byte8>(Nucleus::createSub(lhs.value, rhs.value));
 	}
 
 //	RValue<Byte8> operator*(RValue<Byte8> lhs, RValue<Byte8> rhs)
@@ -2010,38 +2106,17 @@
 
 	RValue<Byte8> operator&(RValue<Byte8> lhs, RValue<Byte8> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<Byte8>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
-		}
-		else
-		{
-			return RValue<Byte8>(Nucleus::createAnd(lhs.value, rhs.value));
-		}
+		return RValue<Byte8>(Nucleus::createAnd(lhs.value, rhs.value));
 	}
 
 	RValue<Byte8> operator|(RValue<Byte8> lhs, RValue<Byte8> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<Byte8>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
-		}
-		else
-		{
-			return RValue<Byte8>(Nucleus::createOr(lhs.value, rhs.value));
-		}
+		return RValue<Byte8>(Nucleus::createOr(lhs.value, rhs.value));
 	}
 
 	RValue<Byte8> operator^(RValue<Byte8> lhs, RValue<Byte8> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<Byte8>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
-		}
-		else
-		{
-			return RValue<Byte8>(Nucleus::createXor(lhs.value, rhs.value));
-		}
+		return RValue<Byte8>(Nucleus::createXor(lhs.value, rhs.value));
 	}
 
 //	RValue<Byte8> operator<<(RValue<Byte8> lhs, unsigned char rhs)
@@ -2116,14 +2191,7 @@
 
 	RValue<Byte8> operator~(RValue<Byte8> val)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return val ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-		}
-		else
-		{
-			return RValue<Byte8>(Nucleus::createNot(val.value));
-		}
+		return RValue<Byte8>(Nucleus::createNot(val.value));
 	}
 
 	RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
@@ -2138,48 +2206,26 @@
 
 	RValue<Short4> Unpack(RValue<Byte4> x)
 	{
-		Value *int2 = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), x.value, 0);
-		Value *byte8 = Nucleus::createBitCast(int2, Byte8::getType());
-
-		return UnpackLow(RValue<Byte8>(byte8), RValue<Byte8>(byte8));
+		int shuffle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};   // Real type is v16i8
+		return As<Short4>(Nucleus::createShuffleVector(x.value, x.value, shuffle));
 	}
 
 	RValue<Short4> Unpack(RValue<Byte4> x, RValue<Byte4> y)
 	{
-		Value *xx = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), x.value, 0);
-		Value *yy = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), y.value, 0);
-
-		return UnpackLow(As<Byte8>(xx), As<Byte8>(yy));
+		return UnpackLow(As<Byte8>(x), As<Byte8>(y));
 	}
 
 	RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::punpcklbw(x, y);
-		}
-		else
-		{
-			int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
-			Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
-
-			return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
-		}
+		int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
+		return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
 	}
 
 	RValue<Short4> UnpackHigh(RValue<Byte8> x, RValue<Byte8> y)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::punpckhbw(x, y);
-		}
-		else
-		{
-			int shuffle[8] = {4, 12, 5, 13, 6, 14, 7, 15};
-			Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
-
-			return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
-		}
+		int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
+		auto lowHigh = RValue<Byte16>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+		return As<Short4>(Swizzle(As<Int4>(lowHigh), 0xEE));
 	}
 
 	RValue<Int> SignMask(RValue<Byte8> x)
@@ -2199,20 +2245,13 @@
 
 	Type *Byte8::getType()
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return MMX::getType();
-		}
-		else
-		{
-			return T(VectorType::get(Byte::getType(), 8));
-		}
+		return T(Type_v8i8);
 	}
 
 	SByte8::SByte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
 	{
 		int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
-		Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(SByte::getType(), 8))));
+		Value *vector = V(Nucleus::createConstantVector(constantVector, getType()));
 
 		storeValue(Nucleus::createBitCast(vector, getType()));
 	}
@@ -2259,26 +2298,12 @@
 
 	RValue<SByte8> operator+(RValue<SByte8> lhs, RValue<SByte8> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<SByte8>(x86::paddb(As<Byte8>(lhs), As<Byte8>(rhs)));
-		}
-		else
-		{
-			return RValue<SByte8>(Nucleus::createAdd(lhs.value, rhs.value));
-		}
+		return RValue<SByte8>(Nucleus::createAdd(lhs.value, rhs.value));
 	}
 
 	RValue<SByte8> operator-(RValue<SByte8> lhs, RValue<SByte8> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<SByte8>(x86::psubb(As<Byte8>(lhs), As<Byte8>(rhs)));
-		}
-		else
-		{
-			return RValue<SByte8>(Nucleus::createSub(lhs.value, rhs.value));
-		}
+		return RValue<SByte8>(Nucleus::createSub(lhs.value, rhs.value));
 	}
 
 //	RValue<SByte8> operator*(RValue<SByte8> lhs, RValue<SByte8> rhs)
@@ -2383,14 +2408,7 @@
 
 	RValue<SByte8> operator~(RValue<SByte8> val)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return val ^ SByte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-		}
-		else
-		{
-			return RValue<SByte8>(Nucleus::createNot(val.value));
-		}
+		return RValue<SByte8>(Nucleus::createNot(val.value));
 	}
 
 	RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
@@ -2405,32 +2423,15 @@
 
 	RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<Short4>(x86::punpcklbw(As<Byte8>(x), As<Byte8>(y)));
-		}
-		else
-		{
-			int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
-			Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
-
-			return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
-		}
+		int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
+		return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
 	}
 
 	RValue<Short4> UnpackHigh(RValue<SByte8> x, RValue<SByte8> y)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<Short4>(x86::punpckhbw(As<Byte8>(x), As<Byte8>(y)));
-		}
-		else
-		{
-			int shuffle[8] = {4, 12, 5, 13, 6, 14, 7, 15};
-			Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
-
-			return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
-		}
+		int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
+		auto lowHigh = RValue<Byte16>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+		return As<Short4>(Swizzle(As<Int4>(lowHigh), 0xEE));
 	}
 
 	RValue<Int> SignMask(RValue<SByte8> x)
@@ -2450,14 +2451,7 @@
 
 	Type *SByte8::getType()
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return MMX::getType();
-		}
-		else
-		{
-			return T(VectorType::get(SByte::getType(), 8));
-		}
+		return T(Type_v8i8);
 	}
 
 	Byte16::Byte16(RValue<Byte16> rhs)
@@ -2502,92 +2496,51 @@
 
 	Type *Byte16::getType()
 	{
-		return T(VectorType::get(Byte::getType(), 16));
+		return T(llvm::VectorType::get(T(Byte::getType()), 16));
 	}
 
 	Type *SByte16::getType()
 	{
-		return T( VectorType::get(SByte::getType(), 16));
+		return T(llvm::VectorType::get(T(SByte::getType()), 16));
 	}
 
 	Short2::Short2(RValue<Short4> cast)
 	{
-		storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), UInt::getType()));
+		storeValue(Nucleus::createBitCast(cast.value, getType()));
 	}
 
 	Type *Short2::getType()
 	{
-		#if 0
-			return T(VectorType::get(Short::getType(), 2));
-		#else
-			return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
-		#endif
+		return T(Type_v2i16);
 	}
 
 	UShort2::UShort2(RValue<UShort4> cast)
 	{
-		storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), UInt::getType()));
+		storeValue(Nucleus::createBitCast(cast.value, getType()));
 	}
 
 	Type *UShort2::getType()
 	{
-		#if 0
-			return T(VectorType::get(UShort::getType(), 2));
-		#else
-			return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
-		#endif
+		return T(Type_v2i16);
 	}
 
 	Short4::Short4(RValue<Int> cast)
 	{
-		Value *extend = Nucleus::createZExt(cast.value, Long::getType());
-		Value *swizzle = Swizzle(RValue<Short4>(extend), 0x00).value;
+		Value *vector = loadValue();
+		Value *element = Nucleus::createTrunc(cast.value, Short::getType());
+		Value *insert = Nucleus::createInsertElement(vector, element, 0);
+		Value *swizzle = Swizzle(RValue<Short4>(insert), 0x00).value;
 
 		storeValue(swizzle);
 	}
 
 	Short4::Short4(RValue<Int4> cast)
 	{
+		int select[8] = {0, 2, 4, 6, 0, 2, 4, 6};
 		Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
 
-		#if 0   // FIXME: Check codegen (pshuflw phshufhw pshufd)
-			Constant *pack[8];
-			pack[0] = Nucleus::createConstantInt(0);
-			pack[1] = Nucleus::createConstantInt(2);
-			pack[2] = Nucleus::createConstantInt(4);
-			pack[3] = Nucleus::createConstantInt(6);
-
-			Value *short4 = Nucleus::createShuffleVector(short8, short8, Nucleus::createConstantVector(pack, 4));
-		#else
-			Value *packed;
-
-			// FIXME: Use Swizzle<Short8>
-			if(!CPUID::supportsSSSE3())
-			{
-				int pshuflw[8] = {0, 2, 0, 2, 4, 5, 6, 7};
-				int pshufhw[8] = {0, 1, 2, 3, 4, 6, 4, 6};
-
-				Value *shuffle1 = Nucleus::createShuffleVector(short8, short8, pshuflw);
-				Value *shuffle2 = Nucleus::createShuffleVector(shuffle1, shuffle1, pshufhw);
-				Value *int4 = Nucleus::createBitCast(shuffle2, Int4::getType());
-				packed = createSwizzle4(int4, 0x88);
-			}
-			else
-			{
-				int pshufb[16] = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
-				Value *byte16 = Nucleus::createBitCast(cast.value, Byte16::getType());
-				packed = Nucleus::createShuffleVector(byte16, byte16, pshufb);
-			}
-
-			#if 0   // FIXME: No optimal instruction selection
-				Value *qword2 = Nucleus::createBitCast(packed, T(VectorType::get(Long::getType(), 2)));
-				Value *element = Nucleus::createExtractElement(qword2, 0);
-				Value *short4 = Nucleus::createBitCast(element, Short4::getType());
-			#else   // FIXME: Requires SSE
-				Value *int2 = RValue<Int2>(Int2(RValue<Int4>(packed))).value;
-				Value *short4 = Nucleus::createBitCast(int2, Short4::getType());
-			#endif
-		#endif
+		Value *packed = Nucleus::createShuffleVector(short8, short8, select);
+		Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value;
 
 		storeValue(short4);
 	}
@@ -2607,17 +2560,13 @@
 	Short4::Short4(short xyzw)
 	{
 		int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
-		Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Short::getType(), 4))));
-
-		storeValue(Nucleus::createBitCast(vector, getType()));
+		storeValue(Nucleus::createConstantVector(constantVector, getType()));
 	}
 
 	Short4::Short4(short x, short y, short z, short w)
 	{
 		int64_t constantVector[4] = {x, y, z, w};
-		Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Short::getType(), 4))));
-
-		storeValue(Nucleus::createBitCast(vector, getType()));
+		storeValue(Nucleus::createConstantVector(constantVector, getType()));
 	}
 
 	Short4::Short4(RValue<Short4> rhs)
@@ -2700,38 +2649,17 @@
 
 	RValue<Short4> operator+(RValue<Short4> lhs, RValue<Short4> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::paddw(lhs, rhs);
-		}
-		else
-		{
-			return RValue<Short4>(Nucleus::createAdd(lhs.value, rhs.value));
-		}
+		return RValue<Short4>(Nucleus::createAdd(lhs.value, rhs.value));
 	}
 
 	RValue<Short4> operator-(RValue<Short4> lhs, RValue<Short4> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::psubw(lhs, rhs);
-		}
-		else
-		{
-			return RValue<Short4>(Nucleus::createSub(lhs.value, rhs.value));
-		}
+		return RValue<Short4>(Nucleus::createSub(lhs.value, rhs.value));
 	}
 
 	RValue<Short4> operator*(RValue<Short4> lhs, RValue<Short4> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::pmullw(lhs, rhs);
-		}
-		else
-		{
-			return RValue<Short4>(Nucleus::createMul(lhs.value, rhs.value));
-		}
+		return RValue<Short4>(Nucleus::createMul(lhs.value, rhs.value));
 	}
 
 //	RValue<Short4> operator/(RValue<Short4> lhs, RValue<Short4> rhs)
@@ -2746,38 +2674,17 @@
 
 	RValue<Short4> operator&(RValue<Short4> lhs, RValue<Short4> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::pand(lhs, rhs);
-		}
-		else
-		{
-			return RValue<Short4>(Nucleus::createAnd(lhs.value, rhs.value));
-		}
+		return RValue<Short4>(Nucleus::createAnd(lhs.value, rhs.value));
 	}
 
 	RValue<Short4> operator|(RValue<Short4> lhs, RValue<Short4> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::por(lhs, rhs);
-		}
-		else
-		{
-			return RValue<Short4>(Nucleus::createOr(lhs.value, rhs.value));
-		}
+		return RValue<Short4>(Nucleus::createOr(lhs.value, rhs.value));
 	}
 
 	RValue<Short4> operator^(RValue<Short4> lhs, RValue<Short4> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::pxor(lhs, rhs);
-		}
-		else
-		{
-			return RValue<Short4>(Nucleus::createXor(lhs.value, rhs.value));
-		}
+		return RValue<Short4>(Nucleus::createXor(lhs.value, rhs.value));
 	}
 
 	RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
@@ -2851,34 +2758,18 @@
 
 	RValue<Short4> operator-(RValue<Short4> val)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return Short4(0, 0, 0, 0) - val;
-		}
-		else
-		{
-			return RValue<Short4>(Nucleus::createNeg(val.value));
-		}
+		return RValue<Short4>(Nucleus::createNeg(val.value));
 	}
 
 	RValue<Short4> operator~(RValue<Short4> val)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return val ^ Short4(0xFFFFu, 0xFFFFu, 0xFFFFu, 0xFFFFu);
-		}
-		else
-		{
-			return RValue<Short4>(Nucleus::createNot(val.value));
-		}
+		return RValue<Short4>(Nucleus::createNot(val.value));
 	}
 
 	RValue<Short4> RoundShort4(RValue<Float4> cast)
 	{
-		RValue<Int4> v4i32 = x86::cvtps2dq(cast);
-		RValue<Short8> v8i16 = x86::packssdw(v4i32, v4i32);
-
-		return As<Short4>(Int2(As<Int4>(v8i16)));
+		RValue<Int4> int4 = RoundInt(cast);
+		return As<Short4>(Pack(int4, int4));
 	}
 
 	RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
@@ -2913,73 +2804,50 @@
 
 	RValue<SByte8> Pack(RValue<Short4> x, RValue<Short4> y)
 	{
-		return x86::packsswb(x, y);
+		auto result = x86::packsswb(x, y);
+
+		return As<SByte8>(Swizzle(As<Int4>(result), 0x88));
 	}
 
 	RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::punpcklwd(x, y);
-		}
-		else
-		{
-			int shuffle[4] = {0, 4, 1, 5};
-			Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
-
-			return RValue<Int2>(Nucleus::createBitCast(packed, Int2::getType()));
-		}
+		int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};   // Real type is v8i16
+		return As<Int2>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
 	}
 
 	RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::punpckhwd(x, y);
-		}
-		else
-		{
-			int shuffle[4] = {2, 6, 3, 7};
-			Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
-
-			return RValue<Int2>(Nucleus::createBitCast(packed, Int2::getType()));
-		}
+		int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};   // Real type is v8i16
+		auto lowHigh = RValue<Short8>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+		return As<Int2>(Swizzle(As<Int4>(lowHigh), 0xEE));
 	}
 
 	RValue<Short4> Swizzle(RValue<Short4> x, unsigned char select)
 	{
-		if(CPUID::supportsMMX2())
+		// Real type is v8i16
+		int shuffle[8] =
 		{
-			return x86::pshufw(x, select);
-		}
-		else
-		{
-			return RValue<Short4>(createSwizzle4(x.value, select));
-		}
+			(select >> 0) & 0x03,
+			(select >> 2) & 0x03,
+			(select >> 4) & 0x03,
+			(select >> 6) & 0x03,
+			(select >> 0) & 0x03,
+			(select >> 2) & 0x03,
+			(select >> 4) & 0x03,
+			(select >> 6) & 0x03,
+		};
+
+		return As<Short4>(Nucleus::createShuffleVector(x.value, x.value, shuffle));
 	}
 
 	RValue<Short4> Insert(RValue<Short4> val, RValue<Short> element, int i)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::pinsrw(val, Int(element), i);
-		}
-		else
-		{
-			return RValue<Short4>(Nucleus::createInsertElement(val.value, element.value, i));
-		}
+		return RValue<Short4>(Nucleus::createInsertElement(val.value, element.value, i));
 	}
 
 	RValue<Short> Extract(RValue<Short4> val, int i)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return Short(x86::pextrw(val, i));
-		}
-		else
-		{
-			return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));
-		}
+		return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));
 	}
 
 	RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
@@ -2994,14 +2862,7 @@
 
 	Type *Short4::getType()
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return MMX::getType();
-		}
-		else
-		{
-			return T(VectorType::get(Short::getType(), 4));
-		}
+		return T(Type_v4i16);
 	}
 
 	UShort4::UShort4(RValue<Int4> cast)
@@ -3011,50 +2872,34 @@
 
 	UShort4::UShort4(RValue<Float4> cast, bool saturate)
 	{
-		Float4 sat;
-
 		if(saturate)
 		{
 			if(CPUID::supportsSSE4_1())
 			{
-				sat = Min(cast, Float4(0xFFFF));   // packusdw takes care of 0x0000 saturation
+				Int4 int4(Min(cast, Float4(0xFFFF)));   // packusdw takes care of 0x0000 saturation
+				*this = As<Short4>(Pack(As<UInt4>(int4), As<UInt4>(int4)));
 			}
 			else
 			{
-				sat = Max(Min(cast, Float4(0xFFFF)), Float4(0x0000));
+				*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
 			}
 		}
 		else
 		{
-			sat = cast;
-		}
-
-		Int4 int4(sat);
-
-		if(!saturate || !CPUID::supportsSSE4_1())
-		{
-			*this = Short4(int4);
-		}
-		else
-		{
-			*this = As<Short4>(Int2(As<Int4>(x86::packusdw(int4, int4))));
+			*this = Short4(Int4(cast));
 		}
 	}
 
 	UShort4::UShort4(unsigned short xyzw)
 	{
 		int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
-		Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UShort::getType(), 4))));
-
-		storeValue(Nucleus::createBitCast(vector, getType()));
+		storeValue(Nucleus::createConstantVector(constantVector, getType()));
 	}
 
 	UShort4::UShort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
 	{
 		int64_t constantVector[4] = {x, y, z, w};
-		Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UShort::getType(), 4))));
-
-		storeValue(Nucleus::createBitCast(vector, getType()));
+		storeValue(Nucleus::createConstantVector(constantVector, getType()));
 	}
 
 	UShort4::UShort4(RValue<UShort4> rhs)
@@ -3139,74 +2984,32 @@
 
 	RValue<UShort4> operator+(RValue<UShort4> lhs, RValue<UShort4> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<UShort4>(x86::paddw(As<Short4>(lhs), As<Short4>(rhs)));
-		}
-		else
-		{
-			return RValue<UShort4>(Nucleus::createAdd(lhs.value, rhs.value));
-		}
+		return RValue<UShort4>(Nucleus::createAdd(lhs.value, rhs.value));
 	}
 
 	RValue<UShort4> operator-(RValue<UShort4> lhs, RValue<UShort4> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<UShort4>(x86::psubw(As<Short4>(lhs), As<Short4>(rhs)));
-		}
-		else
-		{
-			return RValue<UShort4>(Nucleus::createSub(lhs.value, rhs.value));
-		}
+		return RValue<UShort4>(Nucleus::createSub(lhs.value, rhs.value));
 	}
 
 	RValue<UShort4> operator*(RValue<UShort4> lhs, RValue<UShort4> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<UShort4>(x86::pmullw(As<Short4>(lhs), As<Short4>(rhs)));
-		}
-		else
-		{
-			return RValue<UShort4>(Nucleus::createMul(lhs.value, rhs.value));
-		}
+		return RValue<UShort4>(Nucleus::createMul(lhs.value, rhs.value));
 	}
 
 	RValue<UShort4> operator&(RValue<UShort4> lhs, RValue<UShort4> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<UShort4>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
-		}
-		else
-		{
-			return RValue<UShort4>(Nucleus::createAnd(lhs.value, rhs.value));
-		}
+		return RValue<UShort4>(Nucleus::createAnd(lhs.value, rhs.value));
 	}
 
 	RValue<UShort4> operator|(RValue<UShort4> lhs, RValue<UShort4> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<UShort4>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
-		}
-		else
-		{
-			return RValue<UShort4>(Nucleus::createOr(lhs.value, rhs.value));
-		}
+		return RValue<UShort4>(Nucleus::createOr(lhs.value, rhs.value));
 	}
 
 	RValue<UShort4> operator^(RValue<UShort4> lhs, RValue<UShort4> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<UShort4>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
-		}
-		else
-		{
-			return RValue<UShort4>(Nucleus::createXor(lhs.value, rhs.value));
-		}
+		return RValue<UShort4>(Nucleus::createXor(lhs.value, rhs.value));
 	}
 
 	RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
@@ -3235,14 +3038,7 @@
 
 	RValue<UShort4> operator~(RValue<UShort4> val)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<UShort4>(As<Short4>(val) ^ Short4(0xFFFFu, 0xFFFFu, 0xFFFFu, 0xFFFFu));
-		}
-		else
-		{
-			return RValue<UShort4>(Nucleus::createNot(val.value));
-		}
+		return RValue<UShort4>(Nucleus::createNot(val.value));
 	}
 
 	RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
@@ -3277,19 +3073,14 @@
 
 	RValue<Byte8> Pack(RValue<UShort4> x, RValue<UShort4> y)
 	{
-		return x86::packuswb(x, y);
+		auto result = x86::packuswb(x, y);
+
+		return As<Byte8>(Swizzle(As<Int4>(result), 0x88));
 	}
 
 	Type *UShort4::getType()
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return MMX::getType();
-		}
-		else
-		{
-			return T(VectorType::get(UShort::getType(), 4));
-		}
+		return T(Type_v4i16);
 	}
 
 	Short8::Short8(short c)
@@ -3317,15 +3108,10 @@
 
 	Short8::Short8(RValue<Short4> lo, RValue<Short4> hi)
 	{
-		Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
-		Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
+		int shuffle[8] = {0, 1, 2, 3, 8, 9, 10, 11};   // Real type is v8i16
+		Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
 
-		Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
-		long2 = Nucleus::createInsertElement(long2, loLong, 0);
-		long2 = Nucleus::createInsertElement(long2, hiLong, 1);
-		Value *short8 = Nucleus::createBitCast(long2, Short8::getType());
-
-		storeValue(short8);
+		storeValue(packed);
 	}
 
 	RValue<Short8> operator+(RValue<Short8> lhs, RValue<Short8> rhs)
@@ -3355,15 +3141,8 @@
 
 	RValue<Int4> Abs(RValue<Int4> x)
 	{
-		if(CPUID::supportsSSSE3())
-		{
-			return x86::pabsd(x);
-		}
-		else
-		{
-			Int4 mask = (x >> 31);
-			return (mask ^ x) - mask;
-		}
+		auto negative = x >> 31;
+		return (x ^ negative) - negative;
 	}
 
 	RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
@@ -3373,7 +3152,7 @@
 
 	Type *Short8::getType()
 	{
-		return T(VectorType::get(Short::getType(), 8));
+		return T(llvm::VectorType::get(T(Short::getType()), 8));
 	}
 
 	UShort8::UShort8(unsigned short c)
@@ -3401,15 +3180,10 @@
 
 	UShort8::UShort8(RValue<UShort4> lo, RValue<UShort4> hi)
 	{
-		Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
-		Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
+		int shuffle[8] = {0, 1, 2, 3, 8, 9, 10, 11};   // Real type is v8i16
+		Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
 
-		Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
-		long2 = Nucleus::createInsertElement(long2, loLong, 0);
-		long2 = Nucleus::createInsertElement(long2, hiLong, 1);
-		Value *short8 = Nucleus::createBitCast(long2, Short8::getType());
-
-		storeValue(short8);
+		storeValue(packed);
 	}
 
 	RValue<UShort8> UShort8::operator=(RValue<UShort8> rhs)
@@ -3506,7 +3280,7 @@
 
 	Type *UShort8::getType()
 	{
-		return T(VectorType::get(UShort::getType(), 8));
+		return T(llvm::VectorType::get(T(UShort::getType()), 8));
 	}
 
 	Int::Int(Argument<Int> argument)
@@ -4290,19 +4064,13 @@
 
 	Int2::Int2(RValue<Int4> cast)
 	{
-		Value *long2 = Nucleus::createBitCast(cast.value, T(VectorType::get(Long::getType(), 2)));
-		Value *element = Nucleus::createExtractElement(long2, Long::getType(), 0);
-		Value *int2 = Nucleus::createBitCast(element, Int2::getType());
-
-		storeValue(int2);
+		storeValue(Nucleus::createBitCast(cast.value, getType()));
 	}
 
 	Int2::Int2(int x, int y)
 	{
 		int64_t constantVector[2] = {x, y};
-		Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Int::getType(), 2))));
-
-		storeValue(Nucleus::createBitCast(vector, getType()));
+		storeValue(Nucleus::createConstantVector(constantVector, getType()));
 	}
 
 	Int2::Int2(RValue<Int2> rhs)
@@ -4324,26 +4092,10 @@
 
 	Int2::Int2(RValue<Int> lo, RValue<Int> hi)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			// movd mm0, lo
-			// movd mm1, hi
-			// punpckldq mm0, mm1
+		int shuffle[4] = {0, 4, 1, 5};
+		Value *packed = Nucleus::createShuffleVector(Int4(lo).loadValue(), Int4(hi).loadValue(), shuffle);
 
-			Value *loLong = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), lo.value, 0);
-			loLong = Nucleus::createInsertElement(loLong, V(ConstantInt::get(Int::getType(), 0)), 1);
-			Value *hiLong = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), hi.value, 0);
-			hiLong = Nucleus::createInsertElement(hiLong, V(ConstantInt::get(Int::getType(), 0)), 1);
-
-			storeValue(As<Int2>(UnpackLow(As<Int2>(loLong), As<Int2>(hiLong))).value);
-		}
-		else
-		{
-			int shuffle[2] = {0, 1};
-			Value *packed = Nucleus::createShuffleVector(Nucleus::createBitCast(lo.value, T(VectorType::get(Int::getType(), 1))), Nucleus::createBitCast(hi.value, T(VectorType::get(Int::getType(), 1))), shuffle);
-
-			storeValue(Nucleus::createBitCast(packed, Int2::getType()));
-		}
+		storeValue(Nucleus::createBitCast(packed, Int2::getType()));
 	}
 
 	RValue<Int2> Int2::operator=(RValue<Int2> rhs)
@@ -4371,26 +4123,12 @@
 
 	RValue<Int2> operator+(RValue<Int2> lhs, RValue<Int2> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::paddd(lhs, rhs);
-		}
-		else
-		{
-			return RValue<Int2>(Nucleus::createAdd(lhs.value, rhs.value));
-		}
+		return RValue<Int2>(Nucleus::createAdd(lhs.value, rhs.value));
 	}
 
 	RValue<Int2> operator-(RValue<Int2> lhs, RValue<Int2> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::psubd(lhs, rhs);
-		}
-		else
-		{
-			return RValue<Int2>(Nucleus::createSub(lhs.value, rhs.value));
-		}
+		return RValue<Int2>(Nucleus::createSub(lhs.value, rhs.value));
 	}
 
 //	RValue<Int2> operator*(RValue<Int2> lhs, RValue<Int2> rhs)
@@ -4410,38 +4148,17 @@
 
 	RValue<Int2> operator&(RValue<Int2> lhs, RValue<Int2> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<Int2>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
-		}
-		else
-		{
-			return RValue<Int2>(Nucleus::createAnd(lhs.value, rhs.value));
-		}
+		return RValue<Int2>(Nucleus::createAnd(lhs.value, rhs.value));
 	}
 
 	RValue<Int2> operator|(RValue<Int2> lhs, RValue<Int2> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<Int2>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
-		}
-		else
-		{
-			return RValue<Int2>(Nucleus::createOr(lhs.value, rhs.value));
-		}
+		return RValue<Int2>(Nucleus::createOr(lhs.value, rhs.value));
 	}
 
 	RValue<Int2> operator^(RValue<Int2> lhs, RValue<Int2> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<Int2>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
-		}
-		else
-		{
-			return RValue<Int2>(Nucleus::createXor(lhs.value, rhs.value));
-		}
+		return RValue<Int2>(Nucleus::createXor(lhs.value, rhs.value));
 	}
 
 	RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
@@ -4520,90 +4237,41 @@
 
 	RValue<Int2> operator~(RValue<Int2> val)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return val ^ Int2(0xFFFFFFFF, 0xFFFFFFFF);
-		}
-		else
-		{
-			return RValue<Int2>(Nucleus::createNot(val.value));
-		}
+		return RValue<Int2>(Nucleus::createNot(val.value));
 	}
 
 	RValue<Short4> UnpackLow(RValue<Int2> x, RValue<Int2> y)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::punpckldq(x, y);
-		}
-		else
-		{
-			int shuffle[2] = {0, 2};
-			Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
-
-			return As<Short4>(packed);
-		}
+		int shuffle[4] = {0, 4, 1, 5};   // Real type is v4i32
+		return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
 	}
 
 	RValue<Short4> UnpackHigh(RValue<Int2> x, RValue<Int2> y)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return x86::punpckhdq(x, y);
-		}
-		else
-		{
-			int shuffle[2] = {1, 3};
-			Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
-
-			return As<Short4>(packed);
-		}
+		int shuffle[4] = {0, 4, 1, 5};   // Real type is v4i32
+		auto lowHigh = RValue<Int4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+		return As<Short4>(Swizzle(lowHigh, 0xEE));
 	}
 
 	RValue<Int> Extract(RValue<Int2> val, int i)
 	{
-		if(false)   // FIXME: LLVM does not generate optimal code
-		{
-			return RValue<Int>(Nucleus::createExtractElement(val.value, Int::getType(), i));
-		}
-		else
-		{
-			if(i == 0)
-			{
-				return RValue<Int>(Nucleus::createExtractElement(Nucleus::createBitCast(val.value, T(VectorType::get(Int::getType(), 2))), Int::getType(), 0));
-			}
-			else
-			{
-				Int2 val2 = As<Int2>(UnpackHigh(val, val));
-
-				return Extract(val2, 0);
-			}
-		}
+		return RValue<Int>(Nucleus::createExtractElement(val.value, Int::getType(), i));
 	}
 
 	RValue<Int2> Insert(RValue<Int2> val, RValue<Int> element, int i)
 	{
-		return RValue<Int2>(Nucleus::createBitCast(Nucleus::createInsertElement(Nucleus::createBitCast(val.value, T(VectorType::get(Int::getType(), 2))), element.value, i), Int2::getType()));
+		return RValue<Int2>(Nucleus::createInsertElement(val.value, element.value, i));
 	}
 
 	Type *Int2::getType()
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return MMX::getType();
-		}
-		else
-		{
-			return T(VectorType::get(Int::getType(), 2));
-		}
+		return T(Type_v2i32);
 	}
 
 	UInt2::UInt2(unsigned int x, unsigned int y)
 	{
 		int64_t constantVector[2] = {x, y};
-		Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UInt::getType(), 2))));
-
-		storeValue(Nucleus::createBitCast(vector, getType()));
+		storeValue(Nucleus::createConstantVector(constantVector, getType()));
 	}
 
 	UInt2::UInt2(RValue<UInt2> rhs)
@@ -4648,26 +4316,12 @@
 
 	RValue<UInt2> operator+(RValue<UInt2> lhs, RValue<UInt2> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<UInt2>(x86::paddd(As<Int2>(lhs), As<Int2>(rhs)));
-		}
-		else
-		{
-			return RValue<UInt2>(Nucleus::createAdd(lhs.value, rhs.value));
-		}
+		return RValue<UInt2>(Nucleus::createAdd(lhs.value, rhs.value));
 	}
 
 	RValue<UInt2> operator-(RValue<UInt2> lhs, RValue<UInt2> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<UInt2>(x86::psubd(As<Int2>(lhs), As<Int2>(rhs)));
-		}
-		else
-		{
-			return RValue<UInt2>(Nucleus::createSub(lhs.value, rhs.value));
-		}
+		return RValue<UInt2>(Nucleus::createSub(lhs.value, rhs.value));
 	}
 
 //	RValue<UInt2> operator*(RValue<UInt2> lhs, RValue<UInt2> rhs)
@@ -4687,38 +4341,17 @@
 
 	RValue<UInt2> operator&(RValue<UInt2> lhs, RValue<UInt2> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<UInt2>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
-		}
-		else
-		{
-			return RValue<UInt2>(Nucleus::createAnd(lhs.value, rhs.value));
-		}
+		return RValue<UInt2>(Nucleus::createAnd(lhs.value, rhs.value));
 	}
 
 	RValue<UInt2> operator|(RValue<UInt2> lhs, RValue<UInt2> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<UInt2>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
-		}
-		else
-		{
-			return RValue<UInt2>(Nucleus::createOr(lhs.value, rhs.value));
-		}
+		return RValue<UInt2>(Nucleus::createOr(lhs.value, rhs.value));
 	}
 
 	RValue<UInt2> operator^(RValue<UInt2> lhs, RValue<UInt2> rhs)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return As<UInt2>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
-		}
-		else
-		{
-			return RValue<UInt2>(Nucleus::createXor(lhs.value, rhs.value));
-		}
+		return RValue<UInt2>(Nucleus::createXor(lhs.value, rhs.value));
 	}
 
 	RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
@@ -4797,81 +4430,52 @@
 
 	RValue<UInt2> operator~(RValue<UInt2> val)
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return val ^ UInt2(0xFFFFFFFF, 0xFFFFFFFF);
-		}
-		else
-		{
-			return RValue<UInt2>(Nucleus::createNot(val.value));
-		}
+		return RValue<UInt2>(Nucleus::createNot(val.value));
 	}
 
 	Type *UInt2::getType()
 	{
-		if(CPUID::supportsMMX2())
-		{
-			return MMX::getType();
-		}
-		else
-		{
-			return T(VectorType::get(UInt::getType(), 2));
-		}
+		return T(Type_v2i32);
 	}
 
 	Int4::Int4(RValue<Byte4> cast)
 	{
-		Value *x = Nucleus::createBitCast(cast.value, Int::getType());
-		Value *a = Nucleus::createInsertElement(V(UndefValue::get(Int4::getType())), x, 0);
-
-		Value *e;
-
-		if (CPUID::supportsSSE4_1())
+		if(CPUID::supportsSSE4_1())
 		{
-			e = x86::pmovzxbd(RValue<Int4>(a)).value;
+			*this = x86::pmovzxbd(As<Byte16>(cast));
 		}
 		else
 		{
 			int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
-			Value *b = Nucleus::createBitCast(a, Byte16::getType());
-			Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Byte16::getType())), swizzle);
+			Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
+			Value *b = Nucleus::createShuffleVector(a, V(Nucleus::createNullValue(Byte16::getType())), swizzle);
 
 			int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};
-			Value *d = Nucleus::createBitCast(c, Short8::getType());
-			e = Nucleus::createShuffleVector(d, V(Nucleus::createNullValue(Short8::getType())), swizzle2);
-		}
+			Value *c = Nucleus::createBitCast(b, Short8::getType());
+			Value *d = Nucleus::createShuffleVector(c, V(Nucleus::createNullValue(Short8::getType())), swizzle2);
 
-		Value *f = Nucleus::createBitCast(e, Int4::getType());
-		storeValue(f);
+			*this = As<Int4>(d);
+		}
 	}
 
 	Int4::Int4(RValue<SByte4> cast)
 	{
-		Value *x = Nucleus::createBitCast(cast.value, Int::getType());
-		Value *a = Nucleus::createInsertElement(V(UndefValue::get(Int4::getType())), x, 0);
-
-		Value *g;
-
-		if (CPUID::supportsSSE4_1())
+		if(CPUID::supportsSSE4_1())
 		{
-			g = x86::pmovsxbd(RValue<Int4>(a)).value;
+			*this = x86::pmovsxbd(As<SByte16>(cast));
 		}
 		else
 		{
-			int	swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
-			Value *b = Nucleus::createBitCast(a, Byte16::getType());
-			Value *c = Nucleus::createShuffleVector(b, b, swizzle);
+			int swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
+			Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
+			Value *b = Nucleus::createShuffleVector(a, a, swizzle);
 
 			int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
-			Value *d = Nucleus::createBitCast(c, Short8::getType());
-			Value *e = Nucleus::createShuffleVector(d, d, swizzle2);
+			Value *c = Nucleus::createBitCast(b, Short8::getType());
+			Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
 
-			Value *f = Nucleus::createBitCast(e, Int4::getType());
-			//	g = Nucleus::createAShr(f, Nucleus::createConstantInt(24));
-			g = x86::psrad(RValue<Int4>(f), 24).value;
+			*this = As<Int4>(d) >> 24;
 		}
-
-		storeValue(g);
 	}
 
 	Int4::Int4(RValue<Float4> cast)
@@ -4883,51 +4487,29 @@
 
 	Int4::Int4(RValue<Short4> cast)
 	{
-		Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
-		Value *element = Nucleus::createBitCast(cast.value, Long::getType());
-		long2 = Nucleus::createInsertElement(long2, element, 0);
-		RValue<Int4> vector = RValue<Int4>(Nucleus::createBitCast(long2, Int4::getType()));
-
 		if(CPUID::supportsSSE4_1())
 		{
-			storeValue(x86::pmovsxwd(vector).value);
+			*this = x86::pmovsxwd(As<Short8>(cast));
 		}
 		else
 		{
-			Value *b = Nucleus::createBitCast(vector.value, Short8::getType());
-
 			int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
-			Value *c = Nucleus::createShuffleVector(b, b, swizzle);
-			Value *d = Nucleus::createBitCast(c, Int4::getType());
-			storeValue(d);
-
-			// Each Short is packed into each Int in the (Short | Short) format.
-			// Shifting by 16 will retrieve the original Short value.
-			// Shifting an Int will propagate the sign bit, which will work
-			// for both positive and negative values of a Short.
-			*this >>= 16;
+			Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);
+			*this = As<Int4>(c) >> 16;
 		}
 	}
 
 	Int4::Int4(RValue<UShort4> cast)
 	{
-		Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
-		Value *element = Nucleus::createBitCast(cast.value, Long::getType());
-		long2 = Nucleus::createInsertElement(long2, element, 0);
-		RValue<Int4> vector = RValue<Int4>(Nucleus::createBitCast(long2, Int4::getType()));
-
 		if(CPUID::supportsSSE4_1())
 		{
-			storeValue(x86::pmovzxwd(RValue<Int4>(vector)).value);
+			*this = x86::pmovzxwd(As<UShort8>(cast));
 		}
 		else
 		{
-			Value *b = Nucleus::createBitCast(vector.value, Short8::getType());
-
 			int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
-			Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Short8::getType())), swizzle);
-			Value *d = Nucleus::createBitCast(c, Int4::getType());
-			storeValue(d);
+			Value *c = Nucleus::createShuffleVector(cast.value, Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
+			*this = As<Int4>(c);
 		}
 	}
 
@@ -4993,15 +4575,10 @@
 
 	Int4::Int4(RValue<Int2> lo, RValue<Int2> hi)
 	{
-		Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
-		Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
+		int shuffle[4] = {0, 1, 4, 5};   // Real type is v4i32
+		Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
 
-		Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
-		long2 = Nucleus::createInsertElement(long2, loLong, 0);
-		long2 = Nucleus::createInsertElement(long2, hiLong, 1);
-		Value *int4 = Nucleus::createBitCast(long2, Int4::getType());
-
-		storeValue(int4);
+		storeValue(packed);
 	}
 
 	Int4::Int4(RValue<Int> rhs)
@@ -5270,7 +4847,7 @@
 
 	Type *Int4::getType()
 	{
-		return T(VectorType::get(Int::getType(), 4));
+		return T(llvm::VectorType::get(T(Int::getType()), 4));
 	}
 
 	UInt4::UInt4(RValue<Float4> cast)
@@ -5354,15 +4931,10 @@
 
 	UInt4::UInt4(RValue<UInt2> lo, RValue<UInt2> hi)
 	{
-		Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
-		Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
+		int shuffle[4] = {0, 1, 4, 5};   // Real type is v4i32
+		Value *packed = Nucleus::createShuffleVector(lo.value, hi.value, shuffle);
 
-		Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
-		long2 = Nucleus::createInsertElement(long2, loLong, 0);
-		long2 = Nucleus::createInsertElement(long2, hiLong, 1);
-		Value *uint4 = Nucleus::createBitCast(long2, Int4::getType());
-
-		storeValue(uint4);
+		storeValue(packed);
 	}
 
 	RValue<UInt4> UInt4::operator=(RValue<UInt4> rhs)
@@ -5585,7 +5157,7 @@
 
 	Type *UInt4::getType()
 	{
-		return T(VectorType::get(UInt::getType(), 4));
+		return T(llvm::VectorType::get(T(UInt::getType()), 4));
 	}
 
 	Float::Float(RValue<Int> cast)
@@ -5595,6 +5167,14 @@
 		storeValue(integer);
 	}
 
+	Float::Float(RValue<UInt> cast)
+	{
+		RValue<Float> result = Float(Int(cast & UInt(0x7FFFFFFF))) +
+		                       As<Float>((As<Int>(cast) >> 31) & As<Int>(Float(0x80000000u)));
+
+		storeValue(result.value);
+	}
+
 	Float::Float(float x)
 	{
 		storeValue(Nucleus::createConstantFloat(x));
@@ -5826,74 +5406,26 @@
 
 	Float2::Float2(RValue<Float4> cast)
 	{
-		Value *int64x2 = Nucleus::createBitCast(cast.value, T(VectorType::get(Long::getType(), 2)));
-		Value *int64 = Nucleus::createExtractElement(int64x2, Long::getType(), 0);
-		Value *float2 = Nucleus::createBitCast(int64, Float2::getType());
-
-		storeValue(float2);
+		storeValue(Nucleus::createBitCast(cast.value, getType()));
 	}
 
 	Type *Float2::getType()
 	{
-		return T(VectorType::get(Float::getType(), 2));
+		return T(Type_v2f32);
 	}
 
 	Float4::Float4(RValue<Byte4> cast) : FloatXYZW(this)
 	{
-		#if 0
-			Value *xyzw = Nucleus::createUIToFP(cast.value, Float4::getType());   // FIXME: Crashes
-		#elif 0
-			Value *vector = loadValue();
-
-			Value *i8x = Nucleus::createExtractElement(cast.value, 0);
-			Value *f32x = Nucleus::createUIToFP(i8x, Float::getType());
-			Value *x = Nucleus::createInsertElement(vector, f32x, 0);
-
-			Value *i8y = Nucleus::createExtractElement(cast.value, V(Nucleus::createConstantInt(1)));
-			Value *f32y = Nucleus::createUIToFP(i8y, Float::getType());
-			Value *xy = Nucleus::createInsertElement(x, f32y, V(Nucleus::createConstantInt(1)));
-
-			Value *i8z = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(2));
-			Value *f32z = Nucleus::createUIToFP(i8z, Float::getType());
-			Value *xyz = Nucleus::createInsertElement(xy, f32z, Nucleus::createConstantInt(2));
-
-			Value *i8w = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(3));
-			Value *f32w = Nucleus::createUIToFP(i8w, Float::getType());
-			Value *xyzw = Nucleus::createInsertElement(xyz, f32w, Nucleus::createConstantInt(3));
-		#else
-			Value *a = Int4(cast).loadValue();
-			Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
-		#endif
+		Value *a = Int4(cast).loadValue();
+		Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
 
 		storeValue(xyzw);
 	}
 
 	Float4::Float4(RValue<SByte4> cast) : FloatXYZW(this)
 	{
-		#if 0
-			Value *xyzw = Nucleus::createSIToFP(cast.value, Float4::getType());   // FIXME: Crashes
-		#elif 0
-			Value *vector = loadValue();
-
-			Value *i8x = Nucleus::createExtractElement(cast.value, 0);
-			Value *f32x = Nucleus::createSIToFP(i8x, Float::getType());
-			Value *x = Nucleus::createInsertElement(vector, f32x, 0);
-
-			Value *i8y = Nucleus::createExtractElement(cast.value, V(Nucleus::createConstantInt(1)));
-			Value *f32y = Nucleus::createSIToFP(i8y, Float::getType());
-			Value *xy = Nucleus::createInsertElement(x, f32y, V(Nucleus::createConstantInt(1)));
-
-			Value *i8z = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(2));
-			Value *f32z = Nucleus::createSIToFP(i8z, Float::getType());
-			Value *xyz = Nucleus::createInsertElement(xy, f32z, Nucleus::createConstantInt(2));
-
-			Value *i8w = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(3));
-			Value *f32w = Nucleus::createSIToFP(i8w, Float::getType());
-			Value *xyzw = Nucleus::createInsertElement(xyz, f32w, Nucleus::createConstantInt(3));
-		#else
-			Value *a = Int4(cast).loadValue();
-			Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
-		#endif
+		Value *a = Int4(cast).loadValue();
+		Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
 
 		storeValue(xyzw);
 	}
@@ -6102,7 +5634,7 @@
 		int64_t constantVector[4] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
 		Value *result = Nucleus::createAnd(vector, V(Nucleus::createConstantVector(constantVector, Int4::getType())));
 
-		return RValue<Float4>(Nucleus::createBitCast(result, Float4::getType()));
+		return As<Float4>(result);
 	}
 
 	RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
@@ -6139,9 +5671,9 @@
 		return x86::sqrtps(x);
 	}
 
-	RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i)
+	RValue<Float4> Insert(RValue<Float4> x, RValue<Float> element, int i)
 	{
-		return RValue<Float4>(Nucleus::createInsertElement(val.value, element.value, i));
+		return RValue<Float4>(Nucleus::createInsertElement(x.value, element.value, i));
 	}
 
 	RValue<Float> Extract(RValue<Float4> x, int i)
@@ -6182,10 +5714,10 @@
 	RValue<Float4> Mask(Float4 &lhs, RValue<Float4> rhs, unsigned char select)
 	{
 		Value *vector = lhs.loadValue();
-		Value *shuffle = createMask4(vector, rhs.value, select);
-		lhs.storeValue(shuffle);
+		Value *result = createMask4(vector, rhs.value, select);
+		lhs.storeValue(result);
 
-		return RValue<Float4>(shuffle);
+		return RValue<Float4>(result);
 	}
 
 	RValue<Int> SignMask(RValue<Float4> x)
@@ -6249,22 +5781,28 @@
 		}
 		else
 		{
-			return Float4(Int4(x));   // Rounded toward zero
+			return Float4(Int4(x));
 		}
 	}
 
 	RValue<Float4> Frac(RValue<Float4> x)
 	{
+		Float4 frc;
+
 		if(CPUID::supportsSSE4_1())
 		{
-			return x - x86::floorps(x);
+			frc = x - Floor(x);
 		}
 		else
 		{
-			Float4 frc = x - Float4(Int4(x));   // Signed fractional part
+			frc = x - Float4(Int4(x));   // Signed fractional part.
 
-			return frc + As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1, 1, 1, 1)));
+			frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f)));   // Add 1.0 if negative.
 		}
+
+		// x - floor(x) can be 1.0 for very small negative x.
+		// Clamp against the value just below 1.0.
+		return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
 	}
 
 	RValue<Float4> Floor(RValue<Float4> x)
@@ -6293,12 +5831,12 @@
 
 	Type *Float4::getType()
 	{
-		return T(VectorType::get(Float::getType(), 4));
+		return T(llvm::VectorType::get(T(Float::getType()), 4));
 	}
 
 	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, int offset)
 	{
-		return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), V(Nucleus::createConstantInt(offset)), false));
+		return lhs + RValue<Int>(Nucleus::createConstantInt(offset));
 	}
 
 	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
@@ -6378,7 +5916,7 @@
 
 	RValue<Long> Ticks()
 	{
-		llvm::Function *rdtsc = Intrinsic::getDeclaration(::module, Intrinsic::readcyclecounter);
+		llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::readcyclecounter);
 
 		return RValue<Long>(V(::builder->CreateCall(rdtsc)));
 	}
@@ -6390,7 +5928,7 @@
 	{
 		RValue<Int> cvtss2si(RValue<Float> val)
 		{
-			llvm::Function *cvtss2si = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvtss2si);
+			llvm::Function *cvtss2si = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_cvtss2si);
 
 			Float4 vector;
 			vector.x = val;
@@ -6398,104 +5936,80 @@
 			return RValue<Int>(V(::builder->CreateCall(cvtss2si, RValue<Float4>(vector).value)));
 		}
 
-		RValue<Int2> cvtps2pi(RValue<Float4> val)
-		{
-			llvm::Function *cvtps2pi = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvtps2pi);
-
-			return RValue<Int2>(V(::builder->CreateCall(cvtps2pi, val.value)));
-		}
-
-		RValue<Int2> cvttps2pi(RValue<Float4> val)
-		{
-			llvm::Function *cvttps2pi = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvttps2pi);
-
-			return RValue<Int2>(V(::builder->CreateCall(cvttps2pi, val.value)));
-		}
-
 		RValue<Int4> cvtps2dq(RValue<Float4> val)
 		{
-			if(CPUID::supportsSSE2())
-			{
-				llvm::Function *cvtps2dq = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_cvtps2dq);
+			llvm::Function *cvtps2dq = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_cvtps2dq);
 
-				return RValue<Int4>(V(::builder->CreateCall(cvtps2dq, val.value)));
-			}
-			else
-			{
-				Int2 lo = x86::cvtps2pi(val);
-				Int2 hi = x86::cvtps2pi(Swizzle(val, 0xEE));
-
-				return Int4(lo, hi);
-			}
+			return RValue<Int4>(V(::builder->CreateCall(cvtps2dq, val.value)));
 		}
 
 		RValue<Float> rcpss(RValue<Float> val)
 		{
-			llvm::Function *rcpss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rcp_ss);
+			llvm::Function *rcpss = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_rcp_ss);
 
-			Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
+			Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
 
 			return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(rcpss, vector)), Float::getType(), 0));
 		}
 
 		RValue<Float> sqrtss(RValue<Float> val)
 		{
-			llvm::Function *sqrtss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_sqrt_ss);
+			llvm::Function *sqrtss = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_sqrt_ss);
 
-			Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
+			Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
 
 			return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(sqrtss, vector)), Float::getType(), 0));
 		}
 
 		RValue<Float> rsqrtss(RValue<Float> val)
 		{
-			llvm::Function *rsqrtss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rsqrt_ss);
+			llvm::Function *rsqrtss = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_rsqrt_ss);
 
-			Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
+			Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
 
 			return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(rsqrtss, vector)), Float::getType(), 0));
 		}
 
 		RValue<Float4> rcpps(RValue<Float4> val)
 		{
-			llvm::Function *rcpps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rcp_ps);
+			llvm::Function *rcpps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_rcp_ps);
 
 			return RValue<Float4>(V(::builder->CreateCall(rcpps, val.value)));
 		}
 
 		RValue<Float4> sqrtps(RValue<Float4> val)
 		{
-			llvm::Function *sqrtps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_sqrt_ps);
+			llvm::Function *sqrtps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_sqrt_ps);
 
 			return RValue<Float4>(V(::builder->CreateCall(sqrtps, val.value)));
 		}
 
 		RValue<Float4> rsqrtps(RValue<Float4> val)
 		{
-			llvm::Function *rsqrtps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rsqrt_ps);
+			llvm::Function *rsqrtps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_rsqrt_ps);
 
 			return RValue<Float4>(V(::builder->CreateCall(rsqrtps, val.value)));
 		}
 
 		RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
 		{
-			llvm::Function *maxps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_max_ps);
+			llvm::Function *maxps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_max_ps);
 
 			return RValue<Float4>(V(::builder->CreateCall2(maxps, x.value, y.value)));
 		}
 
 		RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
 		{
-			llvm::Function *minps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_min_ps);
+			llvm::Function *minps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_min_ps);
 
 			return RValue<Float4>(V(::builder->CreateCall2(minps, x.value, y.value)));
 		}
 
 		RValue<Float> roundss(RValue<Float> val, unsigned char imm)
 		{
-			llvm::Function *roundss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_round_ss);
+			llvm::Function *roundss = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_round_ss);
 
-			Value *undef = V(UndefValue::get(Float4::getType()));
+			Value *undef = V(llvm::UndefValue::get(T(Float4::getType())));
 			Value *vector = Nucleus::createInsertElement(undef, val.value, 0);
 
 			return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall3(roundss, undef, vector, V(Nucleus::createConstantInt(imm)))), Float::getType(), 0));
@@ -6513,7 +6027,7 @@
 
 		RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
 		{
-			llvm::Function *roundps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_round_ps);
+			llvm::Function *roundps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_round_ps);
 
 			return RValue<Float4>(V(::builder->CreateCall2(roundps, val.value, V(Nucleus::createConstantInt(imm)))));
 		}
@@ -6528,397 +6042,151 @@
 			return roundps(val, 2);
 		}
 
-		RValue<Float4> cmpps(RValue<Float4> x, RValue<Float4> y, unsigned char imm)
-		{
-			llvm::Function *cmpps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cmp_ps);
-
-			return RValue<Float4>(V(::builder->CreateCall3(cmpps, x.value, y.value, V(Nucleus::createConstantByte(imm)))));
-		}
-
-		RValue<Float4> cmpeqps(RValue<Float4> x, RValue<Float4> y)
-		{
-			return cmpps(x, y, 0);
-		}
-
-		RValue<Float4> cmpltps(RValue<Float4> x, RValue<Float4> y)
-		{
-			return cmpps(x, y, 1);
-		}
-
-		RValue<Float4> cmpleps(RValue<Float4> x, RValue<Float4> y)
-		{
-			return cmpps(x, y, 2);
-		}
-
-		RValue<Float4> cmpunordps(RValue<Float4> x, RValue<Float4> y)
-		{
-			return cmpps(x, y, 3);
-		}
-
-		RValue<Float4> cmpneqps(RValue<Float4> x, RValue<Float4> y)
-		{
-			return cmpps(x, y, 4);
-		}
-
-		RValue<Float4> cmpnltps(RValue<Float4> x, RValue<Float4> y)
-		{
-			return cmpps(x, y, 5);
-		}
-
-		RValue<Float4> cmpnleps(RValue<Float4> x, RValue<Float4> y)
-		{
-			return cmpps(x, y, 6);
-		}
-
-		RValue<Float4> cmpordps(RValue<Float4> x, RValue<Float4> y)
-		{
-			return cmpps(x, y, 7);
-		}
-
-		RValue<Float> cmpss(RValue<Float> x, RValue<Float> y, unsigned char imm)
-		{
-			llvm::Function *cmpss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cmp_ss);
-
-			Value *vector1 = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), x.value, 0);
-			Value *vector2 = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), y.value, 0);
-
-			return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall3(cmpss, vector1, vector2, V(Nucleus::createConstantByte(imm)))), Float::getType(), 0));
-		}
-
-		RValue<Float> cmpeqss(RValue<Float> x, RValue<Float> y)
-		{
-			return cmpss(x, y, 0);
-		}
-
-		RValue<Float> cmpltss(RValue<Float> x, RValue<Float> y)
-		{
-			return cmpss(x, y, 1);
-		}
-
-		RValue<Float> cmpless(RValue<Float> x, RValue<Float> y)
-		{
-			return cmpss(x, y, 2);
-		}
-
-		RValue<Float> cmpunordss(RValue<Float> x, RValue<Float> y)
-		{
-			return cmpss(x, y, 3);
-		}
-
-		RValue<Float> cmpneqss(RValue<Float> x, RValue<Float> y)
-		{
-			return cmpss(x, y, 4);
-		}
-
-		RValue<Float> cmpnltss(RValue<Float> x, RValue<Float> y)
-		{
-			return cmpss(x, y, 5);
-		}
-
-		RValue<Float> cmpnless(RValue<Float> x, RValue<Float> y)
-		{
-			return cmpss(x, y, 6);
-		}
-
-		RValue<Float> cmpordss(RValue<Float> x, RValue<Float> y)
-		{
-			return cmpss(x, y, 7);
-		}
-
 		RValue<Int4> pabsd(RValue<Int4> x)
 		{
-			llvm::Function *pabsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_ssse3_pabs_d_128);
+			llvm::Function *pabsd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_ssse3_pabs_d_128);
 
 			return RValue<Int4>(V(::builder->CreateCall(pabsd, x.value)));
 		}
 
 		RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
 		{
-			llvm::Function *paddsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padds_w);
+			llvm::Function *paddsw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_padds_w);
 
-			return As<Short4>(V(::builder->CreateCall2(paddsw, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<Short4>(V(::builder->CreateCall2(paddsw, x.value, y.value)));
 		}
 
 		RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
 		{
-			llvm::Function *psubsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubs_w);
+			llvm::Function *psubsw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psubs_w);
 
-			return As<Short4>(V(::builder->CreateCall2(psubsw, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<Short4>(V(::builder->CreateCall2(psubsw, x.value, y.value)));
 		}
 
 		RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
 		{
-			llvm::Function *paddusw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_paddus_w);
+			llvm::Function *paddusw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_paddus_w);
 
-			return As<UShort4>(V(::builder->CreateCall2(paddusw, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<UShort4>(V(::builder->CreateCall2(paddusw, x.value, y.value)));
 		}
 
 		RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
 		{
-			llvm::Function *psubusw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubus_w);
+			llvm::Function *psubusw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psubus_w);
 
-			return As<UShort4>(V(::builder->CreateCall2(psubusw, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<UShort4>(V(::builder->CreateCall2(psubusw, x.value, y.value)));
 		}
 
 		RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
 		{
-			llvm::Function *paddsb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padds_b);
+			llvm::Function *paddsb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_padds_b);
 
-			return As<SByte8>(V(::builder->CreateCall2(paddsb, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<SByte8>(V(::builder->CreateCall2(paddsb, x.value, y.value)));
 		}
 
 		RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
 		{
-			llvm::Function *psubsb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubs_b);
+			llvm::Function *psubsb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psubs_b);
 
-			return As<SByte8>(V(::builder->CreateCall2(psubsb, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<SByte8>(V(::builder->CreateCall2(psubsb, x.value, y.value)));
 		}
 
 		RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
 		{
-			llvm::Function *paddusb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_paddus_b);
+			llvm::Function *paddusb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_paddus_b);
 
-			return As<Byte8>(V(::builder->CreateCall2(paddusb, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<Byte8>(V(::builder->CreateCall2(paddusb, x.value, y.value)));
 		}
 
 		RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
 		{
-			llvm::Function *psubusb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubus_b);
+			llvm::Function *psubusb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psubus_b);
 
-			return As<Byte8>(V(::builder->CreateCall2(psubusb, As<MMX>(x).value, As<MMX>(y).value)));
-		}
-
-		RValue<Short4> paddw(RValue<Short4> x, RValue<Short4> y)
-		{
-			llvm::Function *paddw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_w);
-
-			return As<Short4>(V(::builder->CreateCall2(paddw, As<MMX>(x).value, As<MMX>(y).value)));
-		}
-
-		RValue<Short4> psubw(RValue<Short4> x, RValue<Short4> y)
-		{
-			llvm::Function *psubw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_w);
-
-			return As<Short4>(V(::builder->CreateCall2(psubw, As<MMX>(x).value, As<MMX>(y).value)));
-		}
-
-		RValue<Short4> pmullw(RValue<Short4> x, RValue<Short4> y)
-		{
-			llvm::Function *pmullw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmull_w);
-
-			return As<Short4>(V(::builder->CreateCall2(pmullw, As<MMX>(x).value, As<MMX>(y).value)));
-		}
-
-		RValue<Short4> pand(RValue<Short4> x, RValue<Short4> y)
-		{
-			llvm::Function *pand = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pand);
-
-			return As<Short4>(V(::builder->CreateCall2(pand, As<MMX>(x).value, As<MMX>(y).value)));
-		}
-
-		RValue<Short4> por(RValue<Short4> x, RValue<Short4> y)
-		{
-			llvm::Function *por = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_por);
-
-			return As<Short4>(V(::builder->CreateCall2(por, As<MMX>(x).value, As<MMX>(y).value)));
-		}
-
-		RValue<Short4> pxor(RValue<Short4> x, RValue<Short4> y)
-		{
-			llvm::Function *pxor = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pxor);
-
-			return As<Short4>(V(::builder->CreateCall2(pxor, As<MMX>(x).value, As<MMX>(y).value)));
-		}
-
-		RValue<Short4> pshufw(RValue<Short4> x, unsigned char y)
-		{
-			llvm::Function *pshufw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_pshuf_w);
-
-			return As<Short4>(V(::builder->CreateCall2(pshufw, As<MMX>(x).value, V(Nucleus::createConstantByte(y)))));
-		}
-
-		RValue<Int2> punpcklwd(RValue<Short4> x, RValue<Short4> y)
-		{
-			llvm::Function *punpcklwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpcklwd);
-
-			return As<Int2>(V(::builder->CreateCall2(punpcklwd, As<MMX>(x).value, As<MMX>(y).value)));
-		}
-
-		RValue<Int2> punpckhwd(RValue<Short4> x, RValue<Short4> y)
-		{
-			llvm::Function *punpckhwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhwd);
-
-			return As<Int2>(V(::builder->CreateCall2(punpckhwd, As<MMX>(x).value, As<MMX>(y).value)));
-		}
-
-		RValue<Short4> pinsrw(RValue<Short4> x, RValue<Int> y, unsigned int i)
-		{
-			llvm::Function *pinsrw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pinsr_w);
-
-			return As<Short4>(V(::builder->CreateCall3(pinsrw, As<MMX>(x).value, y.value, V(Nucleus::createConstantInt(i)))));
-		}
-
-		RValue<Int> pextrw(RValue<Short4> x, unsigned int i)
-		{
-			llvm::Function *pextrw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pextr_w);
-
-			return RValue<Int>(V(::builder->CreateCall2(pextrw, As<MMX>(x).value, V(Nucleus::createConstantInt(i)))));
-		}
-
-		RValue<Short4> punpckldq(RValue<Int2> x, RValue<Int2> y)
-		{
-			llvm::Function *punpckldq = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckldq);
-
-			return As<Short4>(V(::builder->CreateCall2(punpckldq, As<MMX>(x).value, As<MMX>(y).value)));
-		}
-
-		RValue<Short4> punpckhdq(RValue<Int2> x, RValue<Int2> y)
-		{
-			llvm::Function *punpckhdq = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhdq);
-
-			return As<Short4>(V(::builder->CreateCall2(punpckhdq, As<MMX>(x).value, As<MMX>(y).value)));
-		}
-
-		RValue<Short4> punpcklbw(RValue<Byte8> x, RValue<Byte8> y)
-		{
-			llvm::Function *punpcklbw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpcklbw);
-
-			return As<Short4>(V(::builder->CreateCall2(punpcklbw, As<MMX>(x).value, As<MMX>(y).value)));
-		}
-
-		RValue<Short4> punpckhbw(RValue<Byte8> x, RValue<Byte8> y)
-		{
-			llvm::Function *punpckhbw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhbw);
-
-			return As<Short4>(V(::builder->CreateCall2(punpckhbw, As<MMX>(x).value, As<MMX>(y).value)));
-		}
-
-		RValue<Byte8> paddb(RValue<Byte8> x, RValue<Byte8> y)
-		{
-			llvm::Function *paddb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_b);
-
-			return As<Byte8>(V(::builder->CreateCall2(paddb, As<MMX>(x).value, As<MMX>(y).value)));
-		}
-
-		RValue<Byte8> psubb(RValue<Byte8> x, RValue<Byte8> y)
-		{
-			llvm::Function *psubb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_b);
-
-			return As<Byte8>(V(::builder->CreateCall2(psubb, As<MMX>(x).value, As<MMX>(y).value)));
-		}
-
-		RValue<Int2> paddd(RValue<Int2> x, RValue<Int2> y)
-		{
-			llvm::Function *paddd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_d);
-
-			return As<Int2>(V(::builder->CreateCall2(paddd, As<MMX>(x).value, As<MMX>(y).value)));
-		}
-
-		RValue<Int2> psubd(RValue<Int2> x, RValue<Int2> y)
-		{
-			llvm::Function *psubd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_d);
-
-			return As<Int2>(V(::builder->CreateCall2(psubd, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<Byte8>(V(::builder->CreateCall2(psubusb, x.value, y.value)));
 		}
 
 		RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
 		{
-			llvm::Function *pavgw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pavg_w);
+			llvm::Function *pavgw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pavg_w);
 
-			return As<UShort4>(V(::builder->CreateCall2(pavgw, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<UShort4>(V(::builder->CreateCall2(pavgw, x.value, y.value)));
 		}
 
 		RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
 		{
-			llvm::Function *pmaxsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmaxs_w);
+			llvm::Function *pmaxsw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmaxs_w);
 
-			return As<Short4>(V(::builder->CreateCall2(pmaxsw, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<Short4>(V(::builder->CreateCall2(pmaxsw, x.value, y.value)));
 		}
 
 		RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
 		{
-			llvm::Function *pminsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmins_w);
+			llvm::Function *pminsw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmins_w);
 
-			return As<Short4>(V(::builder->CreateCall2(pminsw, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<Short4>(V(::builder->CreateCall2(pminsw, x.value, y.value)));
 		}
 
 		RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
 		{
-			llvm::Function *pcmpgtw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpgt_w);
+			llvm::Function *pcmpgtw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pcmpgt_w);
 
-			return As<Short4>(V(::builder->CreateCall2(pcmpgtw, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<Short4>(V(::builder->CreateCall2(pcmpgtw, x.value, y.value)));
 		}
 
 		RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
 		{
-			llvm::Function *pcmpeqw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpeq_w);
+			llvm::Function *pcmpeqw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pcmpeq_w);
 
-			return As<Short4>(V(::builder->CreateCall2(pcmpeqw, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<Short4>(V(::builder->CreateCall2(pcmpeqw, x.value, y.value)));
 		}
 
 		RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
 		{
-			llvm::Function *pcmpgtb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpgt_b);
+			llvm::Function *pcmpgtb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pcmpgt_b);
 
-			return As<Byte8>(V(::builder->CreateCall2(pcmpgtb, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<Byte8>(V(::builder->CreateCall2(pcmpgtb, x.value, y.value)));
 		}
 
 		RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
 		{
-			llvm::Function *pcmpeqb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpeq_b);
+			llvm::Function *pcmpeqb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pcmpeq_b);
 
-			return As<Byte8>(V(::builder->CreateCall2(pcmpeqb, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<Byte8>(V(::builder->CreateCall2(pcmpeqb, x.value, y.value)));
 		}
 
 		RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
 		{
-			llvm::Function *packssdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packssdw);
+			llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_packssdw_128);
 
-			return As<Short4>(V(::builder->CreateCall2(packssdw, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<Short4>(V(::builder->CreateCall2(packssdw, x.value, y.value)));
 		}
 
 		RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
 		{
-			if(CPUID::supportsSSE2())
-			{
-				llvm::Function *packssdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_packssdw_128);
+			llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_packssdw_128);
 
-				return RValue<Short8>(V(::builder->CreateCall2(packssdw, x.value, y.value)));
-			}
-			else
-			{
-				Int2 loX = Int2(x);
-				Int2 hiX = Int2(Swizzle(x, 0xEE));
-
-				Int2 loY = Int2(y);
-				Int2 hiY = Int2(Swizzle(y, 0xEE));
-
-				Short4 lo = x86::packssdw(loX, hiX);
-				Short4 hi = x86::packssdw(loY, hiY);
-
-				return Short8(lo, hi);
-			}
+			return RValue<Short8>(V(::builder->CreateCall2(packssdw, x.value, y.value)));
 		}
 
 		RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
 		{
-			llvm::Function *packsswb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packsswb);
+			llvm::Function *packsswb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_packsswb_128);
 
-			return As<SByte8>(V(::builder->CreateCall2(packsswb, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<SByte8>(V(::builder->CreateCall2(packsswb, x.value, y.value)));
 		}
 
 		RValue<Byte8> packuswb(RValue<UShort4> x, RValue<UShort4> y)
 		{
-			llvm::Function *packuswb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packuswb);
+			llvm::Function *packuswb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_packuswb_128);
 
-			return As<Byte8>(V(::builder->CreateCall2(packuswb, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<Byte8>(V(::builder->CreateCall2(packuswb, x.value, y.value)));
 		}
 
 		RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
 		{
 			if(CPUID::supportsSSE4_1())
 			{
-				llvm::Function *packusdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_packusdw);
+				llvm::Function *packusdw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_packusdw);
 
 				return RValue<UShort8>(V(::builder->CreateCall2(packusdw, x.value, y.value)));
 			}
@@ -6933,264 +6201,198 @@
 
 		RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
 		{
-			llvm::Function *psrlw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrli_w);
+			llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrli_w);
 
-			return As<UShort4>(V(::builder->CreateCall2(psrlw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
+			return As<UShort4>(V(::builder->CreateCall2(psrlw, x.value, V(Nucleus::createConstantInt(y)))));
 		}
 
 		RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
 		{
-			llvm::Function *psrlw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrli_w);
+			llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrli_w);
 
 			return RValue<UShort8>(V(::builder->CreateCall2(psrlw, x.value, V(Nucleus::createConstantInt(y)))));
 		}
 
 		RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
 		{
-			llvm::Function *psraw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrai_w);
+			llvm::Function *psraw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrai_w);
 
-			return As<Short4>(V(::builder->CreateCall2(psraw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
+			return As<Short4>(V(::builder->CreateCall2(psraw, x.value, V(Nucleus::createConstantInt(y)))));
 		}
 
 		RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
 		{
-			llvm::Function *psraw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrai_w);
+			llvm::Function *psraw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrai_w);
 
 			return RValue<Short8>(V(::builder->CreateCall2(psraw, x.value, V(Nucleus::createConstantInt(y)))));
 		}
 
 		RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
 		{
-			llvm::Function *psllw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pslli_w);
+			llvm::Function *psllw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pslli_w);
 
-			return As<Short4>(V(::builder->CreateCall2(psllw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
+			return As<Short4>(V(::builder->CreateCall2(psllw, x.value, V(Nucleus::createConstantInt(y)))));
 		}
 
 		RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
 		{
-			llvm::Function *psllw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pslli_w);
+			llvm::Function *psllw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pslli_w);
 
 			return RValue<Short8>(V(::builder->CreateCall2(psllw, x.value, V(Nucleus::createConstantInt(y)))));
 		}
 
 		RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
 		{
-			llvm::Function *pslld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pslli_d);
+			llvm::Function *pslld = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pslli_d);
 
-			return As<Int2>(V(::builder->CreateCall2(pslld, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
+			return As<Int2>(V(::builder->CreateCall2(pslld, x.value, V(Nucleus::createConstantInt(y)))));
 		}
 
 		RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
 		{
-			if(CPUID::supportsSSE2())
-			{
-				llvm::Function *pslld = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pslli_d);
+			llvm::Function *pslld = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pslli_d);
 
-				return RValue<Int4>(V(::builder->CreateCall2(pslld, x.value, V(Nucleus::createConstantInt(y)))));
-			}
-			else
-			{
-				Int2 lo = Int2(x);
-				Int2 hi = Int2(Swizzle(x, 0xEE));
-
-				lo = x86::pslld(lo, y);
-				hi = x86::pslld(hi, y);
-
-				return Int4(lo, hi);
-			}
+			return RValue<Int4>(V(::builder->CreateCall2(pslld, x.value, V(Nucleus::createConstantInt(y)))));
 		}
 
 		RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
 		{
-			llvm::Function *psrad = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrai_d);
+			llvm::Function *psrad = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrai_d);
 
-			return As<Int2>(V(::builder->CreateCall2(psrad, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
+			return As<Int2>(V(::builder->CreateCall2(psrad, x.value, V(Nucleus::createConstantInt(y)))));
 		}
 
 		RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
 		{
-			if(CPUID::supportsSSE2())
-			{
-				llvm::Function *psrad = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrai_d);
+			llvm::Function *psrad = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrai_d);
 
-				return RValue<Int4>(V(::builder->CreateCall2(psrad, x.value, V(Nucleus::createConstantInt(y)))));
-			}
-			else
-			{
-				Int2 lo = Int2(x);
-				Int2 hi = Int2(Swizzle(x, 0xEE));
-
-				lo = x86::psrad(lo, y);
-				hi = x86::psrad(hi, y);
-
-				return Int4(lo, hi);
-			}
+			return RValue<Int4>(V(::builder->CreateCall2(psrad, x.value, V(Nucleus::createConstantInt(y)))));
 		}
 
 		RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
 		{
-			llvm::Function *psrld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrli_d);
+			llvm::Function *psrld = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrli_d);
 
-			return As<UInt2>(V(::builder->CreateCall2(psrld, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
+			return As<UInt2>(V(::builder->CreateCall2(psrld, x.value, V(Nucleus::createConstantInt(y)))));
 		}
 
 		RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
 		{
-			if(CPUID::supportsSSE2())
-			{
-				llvm::Function *psrld = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrli_d);
+			llvm::Function *psrld = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrli_d);
 
-				return RValue<UInt4>(V(::builder->CreateCall2(psrld, x.value, V(Nucleus::createConstantInt(y)))));
-			}
-			else
-			{
-				UInt2 lo = As<UInt2>(Int2(As<Int4>(x)));
-				UInt2 hi = As<UInt2>(Int2(Swizzle(As<Int4>(x), 0xEE)));
-
-				lo = x86::psrld(lo, y);
-				hi = x86::psrld(hi, y);
-
-				return UInt4(lo, hi);
-			}
+			return RValue<UInt4>(V(::builder->CreateCall2(psrld, x.value, V(Nucleus::createConstantInt(y)))));
 		}
 
 		RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
 		{
-			llvm::Function *pmaxsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmaxsd);
+			llvm::Function *pmaxsd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmaxsd);
 
 			return RValue<Int4>(V(::builder->CreateCall2(pmaxsd, x.value, y.value)));
 		}
 
 		RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
 		{
-			llvm::Function *pminsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pminsd);
+			llvm::Function *pminsd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pminsd);
 
 			return RValue<Int4>(V(::builder->CreateCall2(pminsd, x.value, y.value)));
 		}
 
 		RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
 		{
-			llvm::Function *pmaxud = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmaxud);
+			llvm::Function *pmaxud = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmaxud);
 
 			return RValue<UInt4>(V(::builder->CreateCall2(pmaxud, x.value, y.value)));
 		}
 
 		RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
 		{
-			llvm::Function *pminud = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pminud);
+			llvm::Function *pminud = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pminud);
 
 			return RValue<UInt4>(V(::builder->CreateCall2(pminud, x.value, y.value)));
 		}
 
 		RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
 		{
-			llvm::Function *pmulhw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmulh_w);
+			llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmulh_w);
 
-			return As<Short4>(V(::builder->CreateCall2(pmulhw, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<Short4>(V(::builder->CreateCall2(pmulhw, x.value, y.value)));
 		}
 
 		RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
 		{
-			llvm::Function *pmulhuw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmulhu_w);
+			llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmulhu_w);
 
-			return As<UShort4>(V(::builder->CreateCall2(pmulhuw, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<UShort4>(V(::builder->CreateCall2(pmulhuw, x.value, y.value)));
 		}
 
 		RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
 		{
-			llvm::Function *pmaddwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmadd_wd);
+			llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmadd_wd);
 
-			return As<Int2>(V(::builder->CreateCall2(pmaddwd, As<MMX>(x).value, As<MMX>(y).value)));
+			return As<Int2>(V(::builder->CreateCall2(pmaddwd, x.value, y.value)));
 		}
 
 		RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
 		{
-			llvm::Function *pmulhw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmulh_w);
+			llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmulh_w);
 
 			return RValue<Short8>(V(::builder->CreateCall2(pmulhw, x.value, y.value)));
 		}
 
 		RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
 		{
-			llvm::Function *pmulhuw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmulhu_w);
+			llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmulhu_w);
 
 			return RValue<UShort8>(V(::builder->CreateCall2(pmulhuw, x.value, y.value)));
 		}
 
 		RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
 		{
-			llvm::Function *pmaddwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmadd_wd);
+			llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmadd_wd);
 
 			return RValue<Int4>(V(::builder->CreateCall2(pmaddwd, x.value, y.value)));
 		}
 
 		RValue<Int> movmskps(RValue<Float4> x)
 		{
-			llvm::Function *movmskps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_movmsk_ps);
+			llvm::Function *movmskps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_movmsk_ps);
 
 			return RValue<Int>(V(::builder->CreateCall(movmskps, x.value)));
 		}
 
 		RValue<Int> pmovmskb(RValue<Byte8> x)
 		{
-			llvm::Function *pmovmskb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmovmskb);
+			llvm::Function *pmovmskb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmovmskb_128);
 
-			return RValue<Int>(V(::builder->CreateCall(pmovmskb, As<MMX>(x).value)));
+			return RValue<Int>(V(::builder->CreateCall(pmovmskb, x.value))) & 0xFF;   // Keep only the low 8 mask bits, one per Byte8 lane.
 		}
 
-		//RValue<Int2> movd(RValue<Pointer<Int>> x)
-		//{
-		//	Value *element = Nucleus::createLoad(x.value);
-
-		////	Value *int2 = UndefValue::get(Int2::getType());
-		////	int2 = Nucleus::createInsertElement(int2, element, ConstantInt::get(Int::getType(), 0));
-
-		//	Value *int2 = Nucleus::createBitCast(Nucleus::createZExt(element, Long::getType()), Int2::getType());
-
-		//	return RValue<Int2>(int2);
-		//}
-
-		//RValue<Int2> movdq2q(RValue<Int4> x)
-		//{
-		//	Value *long2 = Nucleus::createBitCast(x.value, T(VectorType::get(Long::getType(), 2)));
-		//	Value *element = Nucleus::createExtractElement(long2, ConstantInt::get(Int::getType(), 0));
-
-		//	return RValue<Int2>(Nucleus::createBitCast(element, Int2::getType()));
-		//}
-
-		RValue<Int4> pmovzxbd(RValue<Int4> x)
+		RValue<Int4> pmovzxbd(RValue<Byte16> x)
 		{
-			llvm::Function *pmovzxbd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovzxbd);
+			llvm::Function *pmovzxbd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmovzxbd);
 
-			return RValue<Int4>(V(::builder->CreateCall(pmovzxbd, Nucleus::createBitCast(x.value, Byte16::getType()))));
+			return RValue<Int4>(V(::builder->CreateCall(pmovzxbd, x.value)));
 		}
 
-		RValue<Int4> pmovsxbd(RValue<Int4> x)
+		RValue<Int4> pmovsxbd(RValue<SByte16> x)
 		{
-			llvm::Function *pmovsxbd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovsxbd);
+			llvm::Function *pmovsxbd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmovsxbd);
 
-			return RValue<Int4>(V(::builder->CreateCall(pmovsxbd, Nucleus::createBitCast(x.value, SByte16::getType()))));
+			return RValue<Int4>(V(::builder->CreateCall(pmovsxbd, x.value)));
 		}
 
-		RValue<Int4> pmovzxwd(RValue<Int4> x)
+		RValue<Int4> pmovzxwd(RValue<UShort8> x)
 		{
-			llvm::Function *pmovzxwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovzxwd);
+			llvm::Function *pmovzxwd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmovzxwd);
 
-			return RValue<Int4>(V(::builder->CreateCall(pmovzxwd, Nucleus::createBitCast(x.value, UShort8::getType()))));
+			return RValue<Int4>(V(::builder->CreateCall(pmovzxwd, x.value)));
 		}
 
-		RValue<Int4> pmovsxwd(RValue<Int4> x)
+		RValue<Int4> pmovsxwd(RValue<Short8> x)
 		{
-			llvm::Function *pmovsxwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovsxwd);
+			llvm::Function *pmovsxwd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmovsxwd);
 
-			return RValue<Int4>(V(::builder->CreateCall(pmovsxwd, Nucleus::createBitCast(x.value, Short8::getType()))));
-		}
-
-		void emms()
-		{
-			llvm::Function *emms = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_emms);
-
-			V(::builder->CreateCall(emms));
+			return RValue<Int4>(V(::builder->CreateCall(pmovsxwd, x.value)));
 		}
 	}
 }
diff --git a/src/Reactor/Nucleus.hpp b/src/Reactor/Nucleus.hpp
index 831ed40..21e2571 100644
--- a/src/Reactor/Nucleus.hpp
+++ b/src/Reactor/Nucleus.hpp
@@ -15,6 +15,7 @@
 #ifndef sw_Nucleus_hpp
 #define sw_Nucleus_hpp
 
+#include <cassert>
 #include <cstdarg>
 #include <cstdint>
 #include <vector>
diff --git a/src/Reactor/Optimizer.cpp b/src/Reactor/Optimizer.cpp
index 38e24ef..2d4ac82 100644
--- a/src/Reactor/Optimizer.cpp
+++ b/src/Reactor/Optimizer.cpp
@@ -17,7 +17,7 @@
 #include "src/IceCfg.h"
 #include "src/IceCfgNode.h"
 
-#include <map>
+#include <unordered_map>
 #include <vector>
 
 namespace
@@ -38,11 +38,15 @@
 		void deleteInstruction(Ice::Inst *instruction);
 		bool isDead(Ice::Inst *instruction);
 
+		static const Ice::InstIntrinsicCall *asLoadSubVector(const Ice::Inst *instruction);
+		static const Ice::InstIntrinsicCall *asStoreSubVector(const Ice::Inst *instruction);
 		static bool isLoad(const Ice::Inst &instruction);
 		static bool isStore(const Ice::Inst &instruction);
 		static Ice::Operand *storeAddress(const Ice::Inst *instruction);
 		static Ice::Operand *loadAddress(const Ice::Inst *instruction);
 		static Ice::Operand *storeData(const Ice::Inst *instruction);
+		static std::size_t storeSize(const Ice::Inst *instruction);
+		static bool loadTypeMatchesStore(const Ice::Inst *load, const Ice::Inst *store);
 
 		Ice::Cfg *function;
 		Ice::GlobalContext *context;
@@ -57,9 +61,9 @@
 			std::vector<Ice::Inst*> stores;
 		};
 
-		std::map<Ice::Operand*, Uses> uses;
-		std::map<Ice::Inst*, Ice::CfgNode*> node;
-		std::map<Ice::Variable*, Ice::Inst*> definition;
+		std::unordered_map<Ice::Operand*, Uses> uses;
+		std::unordered_map<Ice::Inst*, Ice::CfgNode*> node;
+		std::unordered_map<Ice::Variable*, Ice::Inst*> definition;
 	};
 
 	void Optimizer::run(Ice::Cfg *function)
@@ -199,6 +203,11 @@
 						continue;
 					}
 
+					if(!loadTypeMatchesStore(load, store))
+					{
+						continue;
+					}
+
 					replace(load, storeValue);
 
 					for(size_t i = 0; i < addressUses.loads.size(); i++)
@@ -295,6 +304,7 @@
 				auto &insts = singleBasicBlock->getInsts();
 				Ice::Inst *store = nullptr;
 				Ice::Operand *storeValue = nullptr;
+				bool unmatchedLoads = false;
 
 				for(Ice::Inst &inst : insts)
 				{
@@ -310,14 +320,20 @@
 							continue;
 						}
 
-						// New store found. If we had a previous one, eliminate it.
-						if(store)
+						// New store found. If we had a previous one, try to eliminate it.
+						if(store && !unmatchedLoads)
 						{
-							deleteInstruction(store);
+							// If the previous store is wider than the new one, we can't eliminate it
+							// because there could be a wide load reading its non-overwritten data.
+							if(storeSize(&inst) >= storeSize(store))
+							{
+								deleteInstruction(store);
+							}
 						}
 
 						store = &inst;
 						storeValue = storeData(store);
+						unmatchedLoads = false;
 					}
 					else if(isLoad(inst))
 					{
@@ -328,10 +344,13 @@
 							continue;
 						}
 
-						if(storeValue)
+						if(!loadTypeMatchesStore(load, store))
 						{
-							replace(load, storeValue);
+							unmatchedLoads = true;
+							continue;
 						}
+
+						replace(load, storeValue);
 					}
 				}
 			}
@@ -464,6 +483,32 @@
 		return false;
 	}
 
+	// Returns 'instruction' as an intrinsic call when it is a LoadSubVector intrinsic, nullptr otherwise.
+	const Ice::InstIntrinsicCall *Optimizer::asLoadSubVector(const Ice::Inst *instruction)
+	{
+		auto *intrinsicCall = llvm::dyn_cast<Ice::InstIntrinsicCall>(instruction);
+
+		if(!intrinsicCall || intrinsicCall->getIntrinsicInfo().ID != Ice::Intrinsics::LoadSubVector)
+		{
+			return nullptr;
+		}
+
+		return intrinsicCall;
+	}
+
+	// Returns 'instruction' as an intrinsic call when it is a StoreSubVector intrinsic, nullptr otherwise.
+	const Ice::InstIntrinsicCall *Optimizer::asStoreSubVector(const Ice::Inst *instruction)
+	{
+		auto *intrinsicCall = llvm::dyn_cast<Ice::InstIntrinsicCall>(instruction);
+
+		if(!intrinsicCall || intrinsicCall->getIntrinsicInfo().ID != Ice::Intrinsics::StoreSubVector)
+		{
+			return nullptr;
+		}
+
+		return intrinsicCall;
+	}
+
 	bool Optimizer::isLoad(const Ice::Inst &instruction)
 	{
 		if(llvm::isa<Ice::InstLoad>(&instruction))
@@ -471,12 +516,7 @@
 			return true;
 		}
 
-		if(auto intrinsicCall = llvm::dyn_cast<Ice::InstIntrinsicCall>(&instruction))
-		{
-			return intrinsicCall->getIntrinsicInfo().ID == Ice::Intrinsics::LoadSubVector;
-		}
-
-		return false;
+		return asLoadSubVector(&instruction) != nullptr;
 	}
 
 	bool Optimizer::isStore(const Ice::Inst &instruction)
@@ -486,12 +526,7 @@
 			return true;
 		}
 
-		if(auto intrinsicCall = llvm::dyn_cast<Ice::InstIntrinsicCall>(&instruction))
-		{
-			return intrinsicCall->getIntrinsicInfo().ID == Ice::Intrinsics::StoreSubVector;
-		}
-
-		return false;
+		return asStoreSubVector(&instruction) != nullptr;
 	}
 
 	Ice::Operand *Optimizer::storeAddress(const Ice::Inst *instruction)
@@ -503,12 +538,9 @@
 			return store->getAddr();
 		}
 
-		if(auto *instrinsic = llvm::dyn_cast<Ice::InstIntrinsicCall>(instruction))
+		if(auto *storeSubVector = asStoreSubVector(instruction))
 		{
-			if(instrinsic->getIntrinsicInfo().ID == Ice::Intrinsics::StoreSubVector)
-			{
-				return instrinsic->getSrc(2);
-			}
+			return storeSubVector->getSrc(2);
 		}
 
 		return nullptr;
@@ -523,12 +555,9 @@
 			return load->getSourceAddress();
 		}
 
-		if(auto *instrinsic = llvm::dyn_cast<Ice::InstIntrinsicCall>(instruction))
+		if(auto *loadSubVector = asLoadSubVector(instruction))
 		{
-			if(instrinsic->getIntrinsicInfo().ID == Ice::Intrinsics::LoadSubVector)
-			{
-				return instrinsic->getSrc(1);
-			}
+			return loadSubVector->getSrc(1);
 		}
 
 		return nullptr;
@@ -543,17 +572,63 @@
 			return store->getData();
 		}
 
-		if(auto *instrinsic = llvm::dyn_cast<Ice::InstIntrinsicCall>(instruction))
+		if(auto *storeSubVector = asStoreSubVector(instruction))
 		{
-			if(instrinsic->getIntrinsicInfo().ID == Ice::Intrinsics::StoreSubVector)
-			{
-				return instrinsic->getSrc(1);
-			}
+			return storeSubVector->getSrc(1);
 		}
 
 		return nullptr;
 	}
 
+	// Size written by a store: the data type's width in bytes, or a StoreSubVector's size operand; 0 if neither.
+	std::size_t Optimizer::storeSize(const Ice::Inst *store)
+	{
+		assert(isStore(*store));
+
+		if(auto *plainStore = llvm::dyn_cast<Ice::InstStore>(store))
+		{
+			return Ice::typeWidthInBytes(plainStore->getData()->getType());
+		}
+		else if(auto *subVectorStore = asStoreSubVector(store))
+		{
+			auto *sizeOperand = llvm::cast<Ice::ConstantInteger32>(subVectorStore->getSrc(3));
+			return sizeOperand->getValue();
+		}
+		return 0;
+	}
+
+	// Tells whether 'load' reads back the same kind of access, with the same type, that 'store'
+	// wrote to the shared address. A null load or store never matches.
+	bool Optimizer::loadTypeMatchesStore(const Ice::Inst *load, const Ice::Inst *store)
+	{
+		if(load == nullptr || store == nullptr)
+		{
+			return false;
+		}
+
+		assert(isLoad(*load) && isStore(*store));
+		assert(loadAddress(load) == storeAddress(store));
+
+		auto *plainStore = llvm::dyn_cast<Ice::InstStore>(store);
+		auto *plainLoad = llvm::dyn_cast<Ice::InstLoad>(load);
+		if(plainStore && plainLoad)
+		{
+			return plainStore->getData()->getType() == plainLoad->getDest()->getType();
+		}
+
+		auto *subVectorStore = asStoreSubVector(store);
+		auto *subVectorLoad = asLoadSubVector(load);
+		if(!subVectorStore || !subVectorLoad)
+		{
+			return false;   // Not a pair of sub-vector accesses either.
+		}
+
+		// Both the vector type and the sub-vector width have to agree.
+		return subVectorStore->getSrc(1)->getType() == subVectorLoad->getDest()->getType() &&
+		       llvm::cast<Ice::ConstantInteger32>(subVectorStore->getSrc(3))->getValue() ==
+		       llvm::cast<Ice::ConstantInteger32>(subVectorLoad->getSrc(2))->getValue();
+	}
+
 	bool Optimizer::Uses::areOnlyLoadStore() const
 	{
 		return size() == (loads.size() + stores.size());
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index b02c6be..46973da 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -18,6 +18,7 @@
 #include "Nucleus.hpp"
 #include "Routine.hpp"
 
+#include <cassert>
 #include <cstddef>
 #include <cwchar>
 #undef Bool
@@ -88,8 +89,8 @@
 			return false;
 		}
 
-		Value *loadValue(unsigned int alignment = 0) const;
-		Value *storeValue(Value *value, unsigned int alignment = 0) const;
+		Value *loadValue() const;
+		Value *storeValue(Value *value) const;
 		Value *getAddress(Value *index, bool unsignedIndex) const;
 	};
 
@@ -1535,6 +1536,7 @@
 	{
 	public:
 		explicit Float(RValue<Int> cast);
+		explicit Float(RValue<UInt> cast);
 
 		Float() = default;
 		Float(float x);
@@ -2103,7 +2105,7 @@
 		template<class S>
 		Pointer(const Pointer<S> &pointer, int alignment = 1) : alignment(alignment)
 		{
-			Value *pointerS = pointer.loadValue(alignment);
+			Value *pointerS = pointer.loadValue();
 			Value *pointerT = Nucleus::createBitCast(pointerS, Nucleus::getPointerType(T::getType()));
 			LValue<Pointer<T>>::storeValue(pointerT);
 		}
@@ -2238,15 +2240,15 @@
 	}
 
 	template<class T>
-	Value *LValue<T>::loadValue(unsigned int alignment) const
+	Value *LValue<T>::loadValue() const
 	{
-		return Nucleus::createLoad(address, T::getType(), false, alignment);
+		return Nucleus::createLoad(address, T::getType(), false, 0);
 	}
 
 	template<class T>
-	Value *LValue<T>::storeValue(Value *value, unsigned int alignment) const
+	Value *LValue<T>::storeValue(Value *value) const
 	{
-		return Nucleus::createStore(value, address, T::getType(), false, alignment);
+		return Nucleus::createStore(value, address, T::getType(), false, 0);
 	}
 
 	template<class T>
@@ -2305,6 +2307,8 @@
 	template<class T>
 	RValue<T>::RValue(Value *rvalue)
 	{
+		assert(Nucleus::createBitCast(rvalue, T::getType()) == rvalue);   // Run-time type should match T, so bitcast is no-op.
+
 		value = rvalue;
 	}
 
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index fc70ac2..7e607d9 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -123,6 +123,7 @@
 	const bool CPUID::ARM = CPUID::detectARM();
 	const bool CPUID::SSE4_1 = CPUID::detectSSE4_1();
 	const bool emulateIntrinsics = CPUID::ARM;
+	const bool emulateMismatchedBitCast = CPUID::ARM;
 }
 
 namespace sw
@@ -288,7 +289,6 @@
 			}
 		}
 
-
 		return symbolValue;
 	}
 
@@ -848,12 +848,43 @@
 
 		if(valueType & EmulatedBits)
 		{
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto load = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			load->addArg(ptr);
-			load->addArg(::context->getConstantInt32(typeSize(type)));
-			::basicBlock->appendInst(load);
+			if(emulateIntrinsics)
+			{
+				if(typeSize(type) == 4)
+				{
+					auto pointer = RValue<Pointer<Byte>>(ptr);
+					Int x = *Pointer<Int>(pointer);
+
+					Int4 vector;
+					vector = Insert(vector, x, 0);
+
+					auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
+					::basicBlock->appendInst(bitcast);
+				}
+				else if(typeSize(type) == 8)
+				{
+					auto pointer = RValue<Pointer<Byte>>(ptr);
+					Int x = *Pointer<Int>(pointer);
+					Int y = *Pointer<Int>(pointer + 4);
+
+					Int4 vector;
+					vector = Insert(vector, x, 0);
+					vector = Insert(vector, y, 1);
+
+					auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
+					::basicBlock->appendInst(bitcast);
+				}
+				else assert(false);
+			}
+			else
+			{
+				const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+				auto target = ::context->getConstantUndef(Ice::IceType_i32);
+				auto load = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+				load->addArg(ptr);
+				load->addArg(::context->getConstantInt32(typeSize(type)));
+				::basicBlock->appendInst(load);
+			}
 		}
 		else
 		{
@@ -870,13 +901,46 @@
 
 		if(valueType & EmulatedBits)
 		{
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto store = Ice::InstIntrinsicCall::create(::function, 3, nullptr, target, intrinsic);
-			store->addArg(value);
-			store->addArg(ptr);
-			store->addArg(::context->getConstantInt32(typeSize(type)));
-			::basicBlock->appendInst(store);
+			if(emulateIntrinsics)
+			{
+				if(typeSize(type) == 4)
+				{
+					Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
+					auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
+					::basicBlock->appendInst(bitcast);
+
+					RValue<Int4> v(V(vector));
+
+					auto pointer = RValue<Pointer<Byte>>(ptr);
+					Int x = Extract(v, 0);
+					*Pointer<Int>(pointer) = x;
+				}
+				else if(typeSize(type) == 8)
+				{
+					Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
+					auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
+					::basicBlock->appendInst(bitcast);
+
+					RValue<Int4> v(V(vector));
+
+					auto pointer = RValue<Pointer<Byte>>(ptr);
+					Int x = Extract(v, 0);
+					*Pointer<Int>(pointer) = x;
+					Int y = Extract(v, 1);
+					*Pointer<Int>(pointer + 4) = y;
+				}
+				else assert(false);
+			}
+			else
+			{
+				const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
+				auto target = ::context->getConstantUndef(Ice::IceType_i32);
+				auto store = Ice::InstIntrinsicCall::create(::function, 3, nullptr, target, intrinsic);
+				store->addArg(value);
+				store->addArg(ptr);
+				store->addArg(::context->getConstantInt32(typeSize(type)));
+				::basicBlock->appendInst(store);
+			}
 		}
 		else
 		{
@@ -981,6 +1045,25 @@
 
 	Value *Nucleus::createBitCast(Value *v, Type *destType)
 	{
+		// A bitcast requires source and destination of the same logical size, yet emulated narrow
+		// vectors also need casts between scalars and wide vectors. On platforms without support for
+		// that, round-trip the value through a stack slot and reload it as the destination type.
+		if(emulateMismatchedBitCast)
+		{
+			const bool sourceIsVector = Ice::isVectorType(v->getType());
+			const bool destIsVector = Ice::isVectorType(T(destType));
+
+			// Only scalar <-> vector casts take the detour through memory.
+			if(sourceIsVector != destIsVector)
+			{
+				// The slot is always typed as the vector side of the cast.
+				Type *slotType = destIsVector ? destType : T(v->getType());
+				Value *address = allocateStackVariable(slotType);
+				createStore(v, address, T(v->getType()));
+				return createLoad(address, destType);
+			}
+		}
+
 		return createCast(Ice::InstCast::Bitcast, v, destType);
 	}
 
@@ -2626,36 +2709,85 @@
 		return RValue<Byte8>(Nucleus::createNot(val.value));
 	}
 
+	RValue<Byte> Extract(RValue<Byte8> val, int i)   // Reads element 'i' of 'val' as a scalar Byte.
+	{
+		return RValue<Byte>(Nucleus::createExtractElement(val.value, Byte::getType(), i));
+	}
+
+	RValue<Byte8> Insert(RValue<Byte8> val, RValue<Byte> element, int i)   // Returns 'val' with element 'i' replaced by 'element'.
+	{
+		return RValue<Byte8>(Nucleus::createInsertElement(val.value, element.value, i));
+	}
+
+	RValue<Byte> Saturate(RValue<UShort> x)   // Clamps 'x' to at most 0xFF and narrows the result to a Byte.
+	{
+		return Byte(IfThenElse(Int(x) > 0xFF, Int(0xFF), Int(x)));   // Only an upper bound is applied here.
+	}
+
 	RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto paddusb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-		paddusb->addArg(x.value);
-		paddusb->addArg(y.value);
-		::basicBlock->appendInst(paddusb);
+		if(emulateIntrinsics)   // Scalar fallback used instead of the AddSaturateUnsigned intrinsic.
+		{
+			Byte8 result;   // Per lane: widen both operands to UShort, add, and clamp the sum to 0xFF.
+			result = Insert(result, Saturate(UShort(Int(Extract(x, 0))) + UShort(Int(Extract(y, 0)))), 0);
+			result = Insert(result, Saturate(UShort(Int(Extract(x, 1))) + UShort(Int(Extract(y, 1)))), 1);
+			result = Insert(result, Saturate(UShort(Int(Extract(x, 2))) + UShort(Int(Extract(y, 2)))), 2);
+			result = Insert(result, Saturate(UShort(Int(Extract(x, 3))) + UShort(Int(Extract(y, 3)))), 3);
+			result = Insert(result, Saturate(UShort(Int(Extract(x, 4))) + UShort(Int(Extract(y, 4)))), 4);
+			result = Insert(result, Saturate(UShort(Int(Extract(x, 5))) + UShort(Int(Extract(y, 5)))), 5);
+			result = Insert(result, Saturate(UShort(Int(Extract(x, 6))) + UShort(Int(Extract(y, 6)))), 6);
+			result = Insert(result, Saturate(UShort(Int(Extract(x, 7))) + UShort(Int(Extract(y, 7)))), 7);
 
-		return RValue<Byte8>(V(result));
+			return result;
+		}
+		else
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto paddusb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			paddusb->addArg(x.value);
+			paddusb->addArg(y.value);
+			::basicBlock->appendInst(paddusb);
+
+			return RValue<Byte8>(V(result));
+		}
 	}
 
 	RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto psubusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-		psubusw->addArg(x.value);
-		psubusw->addArg(y.value);
-		::basicBlock->appendInst(psubusw);
+		if(emulateIntrinsics)   // Scalar fallback used instead of the SubtractSaturateUnsigned intrinsic.
+		{
+			Byte8 result;   // Per lane: max(x - y, 0), computed in Int so a negative difference clamps to 0 instead of wrapping.
+			result = Insert(result, Byte(Max(Int(Extract(x, 0)) - Int(Extract(y, 0)), Int(0))), 0);
+			result = Insert(result, Byte(Max(Int(Extract(x, 1)) - Int(Extract(y, 1)), Int(0))), 1);
+			result = Insert(result, Byte(Max(Int(Extract(x, 2)) - Int(Extract(y, 2)), Int(0))), 2);
+			result = Insert(result, Byte(Max(Int(Extract(x, 3)) - Int(Extract(y, 3)), Int(0))), 3);
+			result = Insert(result, Byte(Max(Int(Extract(x, 4)) - Int(Extract(y, 4)), Int(0))), 4);
+			result = Insert(result, Byte(Max(Int(Extract(x, 5)) - Int(Extract(y, 5)), Int(0))), 5);
+			result = Insert(result, Byte(Max(Int(Extract(x, 6)) - Int(Extract(y, 6)), Int(0))), 6);
+			result = Insert(result, Byte(Max(Int(Extract(x, 7)) - Int(Extract(y, 7)), Int(0))), 7);
 
-		return RValue<Byte8>(V(result));
+			return result;
+		}
+		else
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto psubusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			psubusw->addArg(x.value);
+			psubusw->addArg(y.value);
+			::basicBlock->appendInst(psubusw);
+
+			return RValue<Byte8>(V(result));
+		}
 	}
 
 	RValue<Short4> Unpack(RValue<Byte4> x)
 	{
 		int shuffle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};   // Real type is v16i8
-		return RValue<Short4>(Nucleus::createShuffleVector(x.value, x.value, shuffle));
+		return As<Short4>(Nucleus::createShuffleVector(x.value, x.value, shuffle));
 	}
 
 	RValue<Short4> Unpack(RValue<Byte4> x, RValue<Byte4> y)
@@ -2666,7 +2798,7 @@
 	RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y)
 	{
 		int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
-		return RValue<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+		return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));   // Alternates indices 0-7 with 16-23 (presumably x's and y's low eight bytes); As<Short4> supplies the return type
 	}
 
 	RValue<Short4> UnpackHigh(RValue<Byte8> x, RValue<Byte8> y)
@@ -2676,16 +2808,64 @@
 		return As<Short4>(Swizzle(As<Int4>(lowHigh), 0xEE));
 	}
 
+	RValue<SByte> Extract(RValue<SByte8> val, int i)   // Reads lane i of val as a scalar SByte
+	{
+		return RValue<SByte>(Nucleus::createExtractElement(val.value, SByte::getType(), i));
+	}
+
+	RValue<SByte8> Insert(RValue<SByte8> val, RValue<SByte> element, int i)   // Yields val with lane i set to element
+	{
+		return RValue<SByte8>(Nucleus::createInsertElement(val.value, element.value, i));
+	}
+
+	RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)   // Shifts every SByte lane of lhs right by rhs bits
+	{
+		if(emulateIntrinsics)   // Lane-by-lane path: extract each SByte, shift it as a scalar, insert it back
+		{
+			SByte8 result;
+			result = Insert(result, Extract(lhs, 0) >> SByte(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) >> SByte(rhs), 1);
+			result = Insert(result, Extract(lhs, 2) >> SByte(rhs), 2);
+			result = Insert(result, Extract(lhs, 3) >> SByte(rhs), 3);
+			result = Insert(result, Extract(lhs, 4) >> SByte(rhs), 4);
+			result = Insert(result, Extract(lhs, 5) >> SByte(rhs), 5);
+			result = Insert(result, Extract(lhs, 6) >> SByte(rhs), 6);
+			result = Insert(result, Extract(lhs, 7) >> SByte(rhs), 7);
+
+			return result;
+		}
+		else
+		{
+			#if defined(__i386__) || defined(__x86_64__)
+				// SSE2 doesn't support byte vector shifts, so shift as shorts and recombine. NOTE(review): these are the GCC/Clang macro spellings; confirm whether MSVC x86 builds (_M_IX86/_M_X64) are meant to take this path as well.
+				RValue<Short4> hi = (As<Short4>(lhs) >> rhs) & Short4(0xFF00);   // Shift the 16-bit lanes and keep only the upper byte of each
+				RValue<Short4> lo = As<Short4>(As<UShort4>((As<Short4>(lhs) << 8) >> rhs) >> 8);   // Lift the lower byte to the top, shift it, then bring it back down with the UShort4 shift
+
+				return As<SByte8>(hi | lo);
+			#else   // Other targets: a single vector AShr on the whole value
+				return RValue<SByte8>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
+			#endif
+		}
+	}
+
 	RValue<Int> SignMask(RValue<Byte8> x)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-		movmsk->addArg(x.value);
-		::basicBlock->appendInst(movmsk);
+		if(emulateIntrinsics)   // Builds the mask from vector ops and scalar extracts instead of the SignMask intrinsic
+		{
+			Byte8 xx = As<Byte8>(As<SByte8>(x) >> 7) & Byte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);   // Shift each lane right by 7 as SByte, then keep only bit i in lane i
+			return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));   // OR the eight single-bit lanes into one integer
+		}
+		else   // Emits the SignMask intrinsic with x as its only argument
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+			movmsk->addArg(x.value);
+			::basicBlock->appendInst(movmsk);
 
-		return RValue<Int>(V(result));
+			return RValue<Int>(V(result)) & 0xFF;   // Only the low eight bits of the intrinsic's result are returned
+		}
 	}
 
 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
@@ -2866,36 +3046,75 @@
 		return RValue<SByte8>(Nucleus::createNot(val.value));
 	}
 
+	RValue<SByte> Saturate(RValue<Short> x)   // Clamps x between -0x80 and 0x7F and converts the result to SByte
+	{
+		return SByte(IfThenElse(Int(x) > 0x7F, Int(0x7F), IfThenElse(Int(x) < -0x80, Int(0x80), Int(x))));   // NOTE(review): the lower clamp value is spelled 0x80; presumably the SByte conversion turns it into -0x80 - confirm
+	}
+
 	RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto paddsb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-		paddsb->addArg(x.value);
-		paddsb->addArg(y.value);
-		::basicBlock->appendInst(paddsb);
+		if(emulateIntrinsics)   // Per-lane path: convert both lanes to Short, add, then clamp back to SByte with Saturate()
+		{
+			SByte8 result;
+			result = Insert(result, Saturate(Short(Int(Extract(x, 0))) + Short(Int(Extract(y, 0)))), 0);
+			result = Insert(result, Saturate(Short(Int(Extract(x, 1))) + Short(Int(Extract(y, 1)))), 1);
+			result = Insert(result, Saturate(Short(Int(Extract(x, 2))) + Short(Int(Extract(y, 2)))), 2);
+			result = Insert(result, Saturate(Short(Int(Extract(x, 3))) + Short(Int(Extract(y, 3)))), 3);
+			result = Insert(result, Saturate(Short(Int(Extract(x, 4))) + Short(Int(Extract(y, 4)))), 4);
+			result = Insert(result, Saturate(Short(Int(Extract(x, 5))) + Short(Int(Extract(y, 5)))), 5);
+			result = Insert(result, Saturate(Short(Int(Extract(x, 6))) + Short(Int(Extract(y, 6)))), 6);
+			result = Insert(result, Saturate(Short(Int(Extract(x, 7))) + Short(Int(Extract(y, 7)))), 7);
 
-		return RValue<SByte8>(V(result));
+			return result;
+		}
+		else   // Emits the AddSaturateSigned intrinsic with x and y as arguments
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto paddsb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			paddsb->addArg(x.value);
+			paddsb->addArg(y.value);
+			::basicBlock->appendInst(paddsb);
+
+			return RValue<SByte8>(V(result));
+		}
 	}
 
 	RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto psubsb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-		psubsb->addArg(x.value);
-		psubsb->addArg(y.value);
-		::basicBlock->appendInst(psubsb);
+		if(emulateIntrinsics)   // Per-lane path: convert both lanes to Short, subtract, then clamp back to SByte with Saturate()
+		{
+			SByte8 result;
+			result = Insert(result, Saturate(Short(Int(Extract(x, 0))) - Short(Int(Extract(y, 0)))), 0);
+			result = Insert(result, Saturate(Short(Int(Extract(x, 1))) - Short(Int(Extract(y, 1)))), 1);
+			result = Insert(result, Saturate(Short(Int(Extract(x, 2))) - Short(Int(Extract(y, 2)))), 2);
+			result = Insert(result, Saturate(Short(Int(Extract(x, 3))) - Short(Int(Extract(y, 3)))), 3);
+			result = Insert(result, Saturate(Short(Int(Extract(x, 4))) - Short(Int(Extract(y, 4)))), 4);
+			result = Insert(result, Saturate(Short(Int(Extract(x, 5))) - Short(Int(Extract(y, 5)))), 5);
+			result = Insert(result, Saturate(Short(Int(Extract(x, 6))) - Short(Int(Extract(y, 6)))), 6);
+			result = Insert(result, Saturate(Short(Int(Extract(x, 7))) - Short(Int(Extract(y, 7)))), 7);
 
-		return RValue<SByte8>(V(result));
+			return result;
+		}
+		else   // Emits the SubtractSaturateSigned intrinsic with x and y as arguments
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto psubsb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			psubsb->addArg(x.value);
+			psubsb->addArg(y.value);
+			::basicBlock->appendInst(psubsb);
+
+			return RValue<SByte8>(V(result));
+		}
 	}
 
 	RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y)
 	{
 		int shuffle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};   // Real type is v16i8
-		return RValue<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+		return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));   // Alternates indices 0-7 with 16-23 (presumably x's and y's low eight bytes); As<Short4> supplies the return type
 	}
 
 	RValue<Short4> UnpackHigh(RValue<SByte8> x, RValue<SByte8> y)
@@ -2907,14 +3126,22 @@
 
 	RValue<Int> SignMask(RValue<SByte8> x)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-		movmsk->addArg(x.value);
-		::basicBlock->appendInst(movmsk);
+		if(emulateIntrinsics)   // Builds the mask from vector ops and scalar extracts instead of the SignMask intrinsic
+		{
+			SByte8 xx = (x >> 7) & SByte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);   // Shift each lane right by 7, then keep only bit i in lane i
+			return (Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7))) & 0xFF;   // Lane 7 can hold 0x80, a negative SByte; the mask stops a widened sign from reaching bits above 7, matching the intrinsic path
+		}
+		else   // Emits the SignMask intrinsic with x as its only argument
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+			movmsk->addArg(x.value);
+			::basicBlock->appendInst(movmsk);
 
-		return RValue<Int>(V(result));
+			return RValue<Int>(V(result)) & 0xFF;   // Only the low eight bits of the intrinsic's result are returned
+		}
 	}
 
 	RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
@@ -3018,7 +3245,7 @@
 		Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
 		Value *packed = Nucleus::createShuffleVector(short8, short8, select);
 
-		Value *int2 = RValue<Int2>(Int2(RValue<Int4>(packed))).value;
+		Value *int2 = RValue<Int2>(Int2(As<Int4>(packed))).value;
 		Value *short4 = Nucleus::createBitCast(int2, Short4::getType());
 
 		storeValue(short4);
@@ -3165,12 +3392,38 @@
 
 	RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
 	{
-		return RValue<Short4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+		if(emulateIntrinsics)   // Per-lane path: shift each of the four Short lanes as a scalar and reassemble
+		{
+			Short4 result;
+			result = Insert(result, Extract(lhs, 0) << Short(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) << Short(rhs), 1);
+			result = Insert(result, Extract(lhs, 2) << Short(rhs), 2);
+			result = Insert(result, Extract(lhs, 3) << Short(rhs), 3);
+
+			return result;
+		}
+		else   // Single vector Shl with rhs as a 32-bit constant
+		{
+			return RValue<Short4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+		}
 	}
 
 	RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
 	{
-		return RValue<Short4>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
+		if(emulateIntrinsics)   // Per-lane path: shift each of the four Short lanes as a scalar and reassemble
+		{
+			Short4 result;
+			result = Insert(result, Extract(lhs, 0) >> Short(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) >> Short(rhs), 1);
+			result = Insert(result, Extract(lhs, 2) >> Short(rhs), 2);
+			result = Insert(result, Extract(lhs, 3) >> Short(rhs), 3);
+
+			return result;
+		}
+		else   // Single vector AShr with rhs as a 32-bit constant
+		{
+			return RValue<Short4>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
+		}
 	}
 
 	RValue<Short4> operator+=(Short4 &lhs, RValue<Short4> rhs)
@@ -3270,75 +3523,147 @@
 		return RValue<Short4>(V(result));
 	}
 
+	RValue<Short> Saturate(RValue<Int> x)   // Clamps x between -0x8000 and 0x7FFF and converts the result to Short
+	{
+		return Short(IfThenElse(x > 0x7FFF, Int(0x7FFF), IfThenElse(x < -0x8000, Int(0x8000), x)));   // NOTE(review): the lower clamp value is spelled 0x8000; presumably the Short conversion turns it into -0x8000 - confirm
+	}
+
 	RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto paddsw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-		paddsw->addArg(x.value);
-		paddsw->addArg(y.value);
-		::basicBlock->appendInst(paddsw);
+		if(emulateIntrinsics)   // Per-lane path: add the lanes as Int, then clamp back to Short with Saturate()
+		{
+			Short4 result;
+			result = Insert(result, Saturate(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
+			result = Insert(result, Saturate(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
+			result = Insert(result, Saturate(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
+			result = Insert(result, Saturate(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
 
-		return RValue<Short4>(V(result));
+			return result;
+		}
+		else   // Emits the AddSaturateSigned intrinsic with x and y as arguments
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto paddsw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			paddsw->addArg(x.value);
+			paddsw->addArg(y.value);
+			::basicBlock->appendInst(paddsw);
+
+			return RValue<Short4>(V(result));
+		}
 	}
 
 	RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto psubsw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-		psubsw->addArg(x.value);
-		psubsw->addArg(y.value);
-		::basicBlock->appendInst(psubsw);
+		if(emulateIntrinsics)   // Per-lane path: subtract the lanes as Int, then clamp back to Short with Saturate()
+		{
+			Short4 result;
+			result = Insert(result, Saturate(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
+			result = Insert(result, Saturate(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
+			result = Insert(result, Saturate(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
+			result = Insert(result, Saturate(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
 
-		return RValue<Short4>(V(result));
+			return result;
+		}
+		else   // Emits the SubtractSaturateSigned intrinsic with x and y as arguments
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto psubsw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			psubsw->addArg(x.value);
+			psubsw->addArg(y.value);
+			::basicBlock->appendInst(psubsw);
+
+			return RValue<Short4>(V(result));
+		}
 	}
 
 	RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyHighSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto pmulhw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-		pmulhw->addArg(x.value);
-		pmulhw->addArg(y.value);
-		::basicBlock->appendInst(pmulhw);
+		if(emulateIntrinsics)   // Per-lane path: multiply the lanes as Int and keep the product shifted right by 16
+		{
+			Short4 result;
+			result = Insert(result, Short((Int(Extract(x, 0)) * Int(Extract(y, 0))) >> 16), 0);
+			result = Insert(result, Short((Int(Extract(x, 1)) * Int(Extract(y, 1))) >> 16), 1);
+			result = Insert(result, Short((Int(Extract(x, 2)) * Int(Extract(y, 2))) >> 16), 2);
+			result = Insert(result, Short((Int(Extract(x, 3)) * Int(Extract(y, 3))) >> 16), 3);
 
-		return RValue<Short4>(V(result));
+			return result;
+		}
+		else   // Emits the MultiplyHighSigned intrinsic with x and y as arguments
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyHighSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto pmulhw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			pmulhw->addArg(x.value);
+			pmulhw->addArg(y.value);
+			::basicBlock->appendInst(pmulhw);
+
+			return RValue<Short4>(V(result));
+		}
 	}
 
 	RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyAddPairs, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto pmaddwd = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-		pmaddwd->addArg(x.value);
-		pmaddwd->addArg(y.value);
-		::basicBlock->appendInst(pmaddwd);
+		if(emulateIntrinsics)   // Scalar path: each output lane is the sum of two adjacent lane products computed as Int
+		{
+			Int2 result;
+			result = Insert(result, Int(Extract(x, 0)) * Int(Extract(y, 0)) + Int(Extract(x, 1)) * Int(Extract(y, 1)), 0);   // Lanes 0 and 1 feed output lane 0
+			result = Insert(result, Int(Extract(x, 2)) * Int(Extract(y, 2)) + Int(Extract(x, 3)) * Int(Extract(y, 3)), 1);   // Lanes 2 and 3 feed output lane 1
 
-		return RValue<Int2>(V(result));
+			return result;
+		}
+		else   // Emits the MultiplyAddPairs intrinsic with x and y as arguments
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyAddPairs, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto pmaddwd = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			pmaddwd->addArg(x.value);
+			pmaddwd->addArg(y.value);
+			::basicBlock->appendInst(pmaddwd);
+
+			return As<Int2>(V(result));   // The result variable is created as v8i16; As<Int2> supplies the Int2 return type
+		}
 	}
 
 	RValue<SByte8> Pack(RValue<Short4> x, RValue<Short4> y)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-		pack->addArg(x.value);
-		pack->addArg(y.value);
-		::basicBlock->appendInst(pack);
+		if(emulateIntrinsics)   // Per-lane path: x's four lanes fill result lanes 0-3 and y's fill lanes 4-7, each passed through Saturate()
+		{
+			SByte8 result;
+			result = Insert(result, Saturate(Extract(x, 0)), 0);
+			result = Insert(result, Saturate(Extract(x, 1)), 1);
+			result = Insert(result, Saturate(Extract(x, 2)), 2);
+			result = Insert(result, Saturate(Extract(x, 3)), 3);
+			result = Insert(result, Saturate(Extract(y, 0)), 4);
+			result = Insert(result, Saturate(Extract(y, 1)), 5);
+			result = Insert(result, Saturate(Extract(y, 2)), 6);
+			result = Insert(result, Saturate(Extract(y, 3)), 7);
 
-		return As<SByte8>(Swizzle(As<Int4>(V(result)), 0x88));
+			return result;
+		}
+		else   // Emits the VectorPackSigned intrinsic, then swizzles its result viewed as Int4
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			pack->addArg(x.value);
+			pack->addArg(y.value);
+			::basicBlock->appendInst(pack);
+
+			return As<SByte8>(Swizzle(As<Int4>(V(result)), 0x88));
+		}
 	}
 
 	RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y)
 	{
 		int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};   // Real type is v8i16
-		return RValue<Int2>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
+		return As<Int2>(Nucleus::createShuffleVector(x.value, y.value, shuffle));   // Alternates indices 0-3 with 8-11 (presumably x's and y's low four shorts); As<Int2> supplies the return type
 	}
 
 	RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y)
@@ -3538,14 +3863,50 @@
 		return RValue<UShort4>(Nucleus::createXor(lhs.value, rhs.value));
 	}
 
+	RValue<UShort> Extract(RValue<UShort4> val, int i)   // Reads lane i of val as a scalar UShort
+	{
+		return RValue<UShort>(Nucleus::createExtractElement(val.value, UShort::getType(), i));
+	}
+
+	RValue<UShort4> Insert(RValue<UShort4> val, RValue<UShort> element, int i)   // Yields val with lane i set to element
+	{
+		return RValue<UShort4>(Nucleus::createInsertElement(val.value, element.value, i));
+	}
+
 	RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
 	{
-		return RValue<UShort4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+		if(emulateIntrinsics)   // Per-lane path: shift each of the four UShort lanes as a scalar and reassemble
+		{
+			UShort4 result;
+			result = Insert(result, Extract(lhs, 0) << UShort(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) << UShort(rhs), 1);
+			result = Insert(result, Extract(lhs, 2) << UShort(rhs), 2);
+			result = Insert(result, Extract(lhs, 3) << UShort(rhs), 3);
+
+			return result;
+		}
+		else   // Single vector Shl with rhs as a 32-bit constant
+		{
+			return RValue<UShort4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+		}
 	}
 
 	RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
 	{
-		return RValue<UShort4>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
+		if(emulateIntrinsics)   // Per-lane path: shift each of the four UShort lanes as a scalar and reassemble
+		{
+			UShort4 result;
+			result = Insert(result, Extract(lhs, 0) >> UShort(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) >> UShort(rhs), 1);
+			result = Insert(result, Extract(lhs, 2) >> UShort(rhs), 2);
+			result = Insert(result, Extract(lhs, 3) >> UShort(rhs), 3);
+
+			return result;
+		}
+		else   // Single vector LShr with rhs as a 32-bit constant
+		{
+			return RValue<UShort4>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
+		}
 	}
 
 	RValue<UShort4> operator<<=(UShort4 &lhs, unsigned char rhs)
@@ -3589,43 +3950,87 @@
 		return RValue<UShort4>(V(result));
 	}
 
+	RValue<UShort> SaturateUShort(RValue<Int> x)   // Clamps x between 0 and 0xFFFF and converts the result to UShort
+	{
+		return UShort(IfThenElse(x > 0xFFFF, Int(0xFFFF), IfThenElse(x < 0, Int(0), x)));
+	}
+
 	RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto paddusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-		paddusw->addArg(x.value);
-		paddusw->addArg(y.value);
-		::basicBlock->appendInst(paddusw);
+		if(emulateIntrinsics)   // Per-lane path: add the lanes as Int, then clamp back to UShort with SaturateUShort()
+		{
+			UShort4 result;
+			result = Insert(result, SaturateUShort(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
+			result = Insert(result, SaturateUShort(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
+			result = Insert(result, SaturateUShort(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
+			result = Insert(result, SaturateUShort(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
 
-		return RValue<UShort4>(V(result));
+			return result;
+		}
+		else   // Emits the AddSaturateUnsigned intrinsic with x and y as arguments
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto paddusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			paddusw->addArg(x.value);
+			paddusw->addArg(y.value);
+			::basicBlock->appendInst(paddusw);
+
+			return RValue<UShort4>(V(result));
+		}
 	}
 
 	RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto psubusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-		psubusw->addArg(x.value);
-		psubusw->addArg(y.value);
-		::basicBlock->appendInst(psubusw);
+		if(emulateIntrinsics)   // Per-lane path: subtract the lanes as Int, then clamp back to UShort with SaturateUShort()
+		{
+			UShort4 result;
+			result = Insert(result, SaturateUShort(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
+			result = Insert(result, SaturateUShort(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
+			result = Insert(result, SaturateUShort(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
+			result = Insert(result, SaturateUShort(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
 
-		return RValue<UShort4>(V(result));
+			return result;
+		}
+		else   // Emits the SubtractSaturateUnsigned intrinsic with x and y as arguments
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto psubusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			psubusw->addArg(x.value);
+			psubusw->addArg(y.value);
+			::basicBlock->appendInst(psubusw);
+
+			return RValue<UShort4>(V(result));
+		}
 	}
 
 	RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyHighUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto pmulhuw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-		pmulhuw->addArg(x.value);
-		pmulhuw->addArg(y.value);
-		::basicBlock->appendInst(pmulhuw);
+		if(emulateIntrinsics)   // Per-lane path: multiply the lanes as UInt and keep the product shifted right by 16
+		{
+			UShort4 result;
+			result = Insert(result, UShort((UInt(Extract(x, 0)) * UInt(Extract(y, 0))) >> 16), 0);
+			result = Insert(result, UShort((UInt(Extract(x, 1)) * UInt(Extract(y, 1))) >> 16), 1);
+			result = Insert(result, UShort((UInt(Extract(x, 2)) * UInt(Extract(y, 2))) >> 16), 2);
+			result = Insert(result, UShort((UInt(Extract(x, 3)) * UInt(Extract(y, 3))) >> 16), 3);
 
-		return RValue<UShort4>(V(result));
+			return result;
+		}
+		else   // Emits the MultiplyHighUnsigned intrinsic with x and y as arguments
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyHighUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto pmulhuw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			pmulhuw->addArg(x.value);
+			pmulhuw->addArg(y.value);
+			::basicBlock->appendInst(pmulhuw);
+
+			return RValue<UShort4>(V(result));
+		}
 	}
 
 	RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
@@ -3635,15 +4040,32 @@
 
 	RValue<Byte8> Pack(RValue<UShort4> x, RValue<UShort4> y)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-		pack->addArg(x.value);
-		pack->addArg(y.value);
-		::basicBlock->appendInst(pack);
+		if(emulateIntrinsics)   // Per-lane path: x's four lanes fill result lanes 0-3 and y's fill lanes 4-7, each passed through Saturate()
+		{
+			Byte8 result;
+			result = Insert(result, Saturate(Extract(x, 0)), 0);   // NOTE(review): the Saturate overload taking a UShort is defined outside this hunk; confirm its clamping agrees with the VectorPackUnsigned intrinsic
+			result = Insert(result, Saturate(Extract(x, 1)), 1);
+			result = Insert(result, Saturate(Extract(x, 2)), 2);
+			result = Insert(result, Saturate(Extract(x, 3)), 3);
+			result = Insert(result, Saturate(Extract(y, 0)), 4);
+			result = Insert(result, Saturate(Extract(y, 1)), 5);
+			result = Insert(result, Saturate(Extract(y, 2)), 6);
+			result = Insert(result, Saturate(Extract(y, 3)), 7);
 
-		return As<Byte8>(Swizzle(As<Int4>(V(result)), 0x88));
+			return result;
+		}
+		else   // Emits the VectorPackUnsigned intrinsic, then swizzles its result viewed as Int4
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			pack->addArg(x.value);
+			pack->addArg(y.value);
+			::basicBlock->appendInst(pack);
+
+			return As<Byte8>(Swizzle(As<Int4>(V(result)), 0x88));
+		}
 	}
 
 	Type *UShort4::getType()
@@ -3692,14 +4114,58 @@
 		return RValue<Short8>(Nucleus::createAnd(lhs.value, rhs.value));
 	}
 
+	RValue<Short> Extract(RValue<Short8> val, int i)   // Reads lane i of val as a scalar Short
+	{
+		return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));
+	}
+
+	RValue<Short8> Insert(RValue<Short8> val, RValue<Short> element, int i)   // Yields val with lane i set to element
+	{
+		return RValue<Short8>(Nucleus::createInsertElement(val.value, element.value, i));
+	}
+
 	RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
 	{
-		return RValue<Short8>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+		if(emulateIntrinsics)   // Per-lane path: shift each of the eight Short lanes as a scalar and reassemble
+		{
+			Short8 result;
+			result = Insert(result, Extract(lhs, 0) << Short(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) << Short(rhs), 1);
+			result = Insert(result, Extract(lhs, 2) << Short(rhs), 2);
+			result = Insert(result, Extract(lhs, 3) << Short(rhs), 3);
+			result = Insert(result, Extract(lhs, 4) << Short(rhs), 4);
+			result = Insert(result, Extract(lhs, 5) << Short(rhs), 5);
+			result = Insert(result, Extract(lhs, 6) << Short(rhs), 6);
+			result = Insert(result, Extract(lhs, 7) << Short(rhs), 7);
+
+			return result;
+		}
+		else   // Single vector Shl with rhs as a 32-bit constant
+		{
+			return RValue<Short8>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+		}
 	}
 
 	RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
 	{
-		return RValue<Short8>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
+		if(emulateIntrinsics)   // Per-lane path: shift each of the eight Short lanes as a scalar and reassemble
+		{
+			Short8 result;
+			result = Insert(result, Extract(lhs, 0) >> Short(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) >> Short(rhs), 1);
+			result = Insert(result, Extract(lhs, 2) >> Short(rhs), 2);
+			result = Insert(result, Extract(lhs, 3) >> Short(rhs), 3);
+			result = Insert(result, Extract(lhs, 4) >> Short(rhs), 4);
+			result = Insert(result, Extract(lhs, 5) >> Short(rhs), 5);
+			result = Insert(result, Extract(lhs, 6) >> Short(rhs), 6);
+			result = Insert(result, Extract(lhs, 7) >> Short(rhs), 7);
+
+			return result;
+		}
+		else   // Single vector AShr with rhs as a 32-bit constant
+		{
+			return RValue<Short8>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
+		}
 	}
 
 	RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
@@ -3782,14 +4248,58 @@
 		return RValue<UShort8>(Nucleus::createAnd(lhs.value, rhs.value));
 	}
 
+	RValue<UShort> Extract(RValue<UShort8> val, int i)   // Wraps Nucleus::createExtractElement: yields element 'i' of 'val' as a UShort.
+	{
+		return RValue<UShort>(Nucleus::createExtractElement(val.value, UShort::getType(), i));
+	}
+
+	RValue<UShort8> Insert(RValue<UShort8> val, RValue<UShort> element, int i)   // Wraps Nucleus::createInsertElement: yields 'val' with 'element' inserted at index 'i'.
+	{
+		return RValue<UShort8>(Nucleus::createInsertElement(val.value, element.value, i));
+	}
+
 	RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
 	{
-		return RValue<UShort8>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+		if(emulateIntrinsics)   // Emulation path: shift each of the eight lanes as a scalar and rebuild the vector.
+		{
+			UShort8 result;   // Not initialized here; every lane is written by the Insert() calls below.
+			result = Insert(result, Extract(lhs, 0) << UShort(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) << UShort(rhs), 1);
+			result = Insert(result, Extract(lhs, 2) << UShort(rhs), 2);
+			result = Insert(result, Extract(lhs, 3) << UShort(rhs), 3);
+			result = Insert(result, Extract(lhs, 4) << UShort(rhs), 4);
+			result = Insert(result, Extract(lhs, 5) << UShort(rhs), 5);
+			result = Insert(result, Extract(lhs, 6) << UShort(rhs), 6);
+			result = Insert(result, Extract(lhs, 7) << UShort(rhs), 7);
+
+			return result;
+		}
+		else   // Otherwise emit a single vector shift-left by the constant amount.
+		{
+			return RValue<UShort8>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+		}
 	}
 
 	RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
 	{
-		return RValue<UShort8>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
+		if(emulateIntrinsics)   // Emulation path: shift each of the eight lanes as a scalar and rebuild the vector.
+		{
+			UShort8 result;   // Not initialized here; every lane is written by the Insert() calls below.
+			result = Insert(result, Extract(lhs, 0) >> UShort(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) >> UShort(rhs), 1);
+			result = Insert(result, Extract(lhs, 2) >> UShort(rhs), 2);
+			result = Insert(result, Extract(lhs, 3) >> UShort(rhs), 3);
+			result = Insert(result, Extract(lhs, 4) >> UShort(rhs), 4);
+			result = Insert(result, Extract(lhs, 5) >> UShort(rhs), 5);
+			result = Insert(result, Extract(lhs, 6) >> UShort(rhs), 6);
+			result = Insert(result, Extract(lhs, 7) >> UShort(rhs), 7);
+
+			return result;
+		}
+		else   // Otherwise emit a single vector logical shift-right (createLShr) by the constant amount.
+		{
+			return RValue<UShort8>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
+		}
 	}
 
 	RValue<UShort8> operator+(RValue<UShort8> lhs, RValue<UShort8> rhs)
@@ -4704,12 +5214,34 @@
 
 	RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
 	{
-		return RValue<Int2>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+		if(emulateIntrinsics)   // Emulation path: shift both lanes as scalars and rebuild the vector.
+		{
+			Int2 result;   // Not initialized here; both lanes are written by the Insert() calls below.
+			result = Insert(result, Extract(lhs, 0) << Int(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) << Int(rhs), 1);
+
+			return result;
+		}
+		else   // Otherwise emit a single vector shift-left by the constant amount.
+		{
+			return RValue<Int2>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+		}
 	}
 
 	RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
 	{
-		return RValue<Int2>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
+		if(emulateIntrinsics)   // Emulation path: shift both lanes as scalars and rebuild the vector.
+		{
+			Int2 result;   // Not initialized here; both lanes are written by the Insert() calls below.
+			result = Insert(result, Extract(lhs, 0) >> Int(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) >> Int(rhs), 1);
+
+			return result;
+		}
+		else   // Otherwise emit a single vector arithmetic shift-right (createAShr) by the constant amount.
+		{
+			return RValue<Int2>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
+		}
 	}
 
 	RValue<Int2> operator+=(Int2 &lhs, RValue<Int2> rhs)
@@ -4891,14 +5423,46 @@
 		return RValue<UInt2>(Nucleus::createXor(lhs.value, rhs.value));
 	}
 
+	RValue<UInt> Extract(RValue<UInt2> val, int i)   // Wraps Nucleus::createExtractElement: yields element 'i' of 'val' as a UInt.
+	{
+		return RValue<UInt>(Nucleus::createExtractElement(val.value, UInt::getType(), i));
+	}
+
+	RValue<UInt2> Insert(RValue<UInt2> val, RValue<UInt> element, int i)   // Wraps Nucleus::createInsertElement: yields 'val' with 'element' inserted at index 'i'.
+	{
+		return RValue<UInt2>(Nucleus::createInsertElement(val.value, element.value, i));
+	}
+
 	RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
 	{
-		return RValue<UInt2>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+		if(emulateIntrinsics)   // Emulation path: shift both lanes as scalars and rebuild the vector.
+		{
+			UInt2 result;   // Not initialized here; both lanes are written by the Insert() calls below.
+			result = Insert(result, Extract(lhs, 0) << UInt(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) << UInt(rhs), 1);
+
+			return result;
+		}
+		else   // Otherwise emit a single vector shift-left by the constant amount.
+		{
+			return RValue<UInt2>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+		}
 	}
 
 	RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
 	{
-		return RValue<UInt2>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
+		if(emulateIntrinsics)   // Emulation path: shift both lanes as scalars and rebuild the vector.
+		{
+			UInt2 result;   // Not initialized here; both lanes are written by the Insert() calls below.
+			result = Insert(result, Extract(lhs, 0) >> UInt(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) >> UInt(rhs), 1);
+
+			return result;
+		}
+		else   // Otherwise emit a single vector logical shift-right (createLShr) by the constant amount.
+		{
+			return RValue<UInt2>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
+		}
 	}
 
 	RValue<UInt2> operator+=(UInt2 &lhs, RValue<UInt2> rhs)
@@ -4994,18 +5558,15 @@
 		Value *x = Nucleus::createBitCast(cast.value, Int::getType());
 		Value *a = Nucleus::createInsertElement(loadValue(), x, 0);
 
-		Value *e;
 		int swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
 		Value *b = Nucleus::createBitCast(a, Byte16::getType());
 		Value *c = Nucleus::createShuffleVector(b, b, swizzle);
 
 		int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
 		Value *d = Nucleus::createBitCast(c, Short8::getType());
-		e = Nucleus::createShuffleVector(d, d, swizzle2);
+		Value *e = Nucleus::createShuffleVector(d, d, swizzle2);
 
-		Value *f = Nucleus::createBitCast(e, Int4::getType());
-		Value *g = Nucleus::createAShr(f, V(::context->getConstantInt32(24)));
-		storeValue(g);
+		*this = As<Int4>(e) >> 24;
 	}
 
 	Int4::Int4(RValue<Float4> cast)
@@ -5019,9 +5580,8 @@
 	{
 		int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
 		Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);
-		Value *d = Nucleus::createBitCast(c, Int4::getType());
-		Value *e = Nucleus::createAShr(d, V(::context->getConstantInt32(16)));
-		storeValue(e);
+
+		*this = As<Int4>(c) >> 16;
 	}
 
 	Int4::Int4(RValue<UShort4> cast)
@@ -5185,12 +5745,38 @@
 
 	RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
 	{
-		return RValue<Int4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+		if(emulateIntrinsics)   // Emulation path: shift each of the four lanes as a scalar and rebuild the vector.
+		{
+			Int4 result;   // Not initialized here; every lane is written by the Insert() calls below.
+			result = Insert(result, Extract(lhs, 0) << Int(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) << Int(rhs), 1);
+			result = Insert(result, Extract(lhs, 2) << Int(rhs), 2);
+			result = Insert(result, Extract(lhs, 3) << Int(rhs), 3);
+
+			return result;
+		}
+		else   // Otherwise emit a single vector shift-left by the constant amount.
+		{
+			return RValue<Int4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+		}
 	}
 
 	RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
 	{
-		return RValue<Int4>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
+		if(emulateIntrinsics)   // Emulation path: shift each of the four lanes as a scalar and rebuild the vector.
+		{
+			Int4 result;   // Not initialized here; every lane is written by the Insert() calls below.
+			result = Insert(result, Extract(lhs, 0) >> Int(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) >> Int(rhs), 1);
+			result = Insert(result, Extract(lhs, 2) >> Int(rhs), 2);
+			result = Insert(result, Extract(lhs, 3) >> Int(rhs), 3);
+
+			return result;
+		}
+		else   // Otherwise emit a single vector arithmetic shift-right (createAShr) by the constant amount.
+		{
+			return RValue<Int4>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
+		}
 	}
 
 	RValue<Int4> operator<<(RValue<Int4> lhs, RValue<Int4> rhs)
@@ -5346,15 +5932,32 @@
 
 	RValue<Short8> Pack(RValue<Int4> x, RValue<Int4> y)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-		pack->addArg(x.value);
-		pack->addArg(y.value);
-		::basicBlock->appendInst(pack);
+		if(emulateIntrinsics)   // Emulation path: pass each Int lane through Saturate(); x fills lanes 0-3, y fills lanes 4-7.
+		{
+			Short8 result;   // Not initialized here; every lane is written by the Insert() calls below.
+			result = Insert(result, Saturate(Extract(x, 0)), 0);
+			result = Insert(result, Saturate(Extract(x, 1)), 1);
+			result = Insert(result, Saturate(Extract(x, 2)), 2);
+			result = Insert(result, Saturate(Extract(x, 3)), 3);
+			result = Insert(result, Saturate(Extract(y, 0)), 4);
+			result = Insert(result, Saturate(Extract(y, 1)), 5);
+			result = Insert(result, Saturate(Extract(y, 2)), 6);
+			result = Insert(result, Saturate(Extract(y, 3)), 7);
 
-		return RValue<Short8>(V(result));
+			return result;
+		}
+		else   // Otherwise emit the VectorPackSigned intrinsic call with x and y as its two arguments.
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			pack->addArg(x.value);
+			pack->addArg(y.value);
+			::basicBlock->appendInst(pack);
+
+			return RValue<Short8>(V(result));
+		}
 	}
 
 	RValue<Int> Extract(RValue<Int4> x, int i)
@@ -5369,14 +5972,22 @@
 
 	RValue<Int> SignMask(RValue<Int4> x)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-		movmsk->addArg(x.value);
-		::basicBlock->appendInst(movmsk);
+		if(emulateIntrinsics)
+		{
+			Int4 xx = (x >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);   // Arithmetic shift replicates each lane's sign bit; the mask keeps bit i for lane i.
+			return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);   // OR the four lanes into one 4-bit mask.
+		}
+		else   // Otherwise emit the SignMask intrinsic call on x.
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+			movmsk->addArg(x.value);
+			::basicBlock->appendInst(movmsk);
 
-		return RValue<Int>(V(result));
+			return RValue<Int>(V(result));
+		}
 	}
 
 	RValue<Int4> Swizzle(RValue<Int4> x, unsigned char select)
@@ -5536,14 +6147,50 @@
 		return RValue<UInt4>(Nucleus::createXor(lhs.value, rhs.value));
 	}
 
+	RValue<UInt> Extract(RValue<UInt4> x, int i)   // Wraps Nucleus::createExtractElement: yields element 'i' of 'x' as a UInt.
+	{
+		return RValue<UInt>(Nucleus::createExtractElement(x.value, UInt::getType(), i));
+	}
+
+	RValue<UInt4> Insert(RValue<UInt4> x, RValue<UInt> element, int i)   // Wraps Nucleus::createInsertElement: yields 'x' with 'element' inserted at index 'i'.
+	{
+		return RValue<UInt4>(Nucleus::createInsertElement(x.value, element.value, i));
+	}
+
 	RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
 	{
-		return RValue<UInt4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+		if(emulateIntrinsics)   // Emulation path: shift each of the four lanes as a scalar and rebuild the vector.
+		{
+			UInt4 result;   // Not initialized here; every lane is written by the Insert() calls below.
+			result = Insert(result, Extract(lhs, 0) << UInt(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) << UInt(rhs), 1);
+			result = Insert(result, Extract(lhs, 2) << UInt(rhs), 2);
+			result = Insert(result, Extract(lhs, 3) << UInt(rhs), 3);
+
+			return result;
+		}
+		else   // Otherwise emit a single vector shift-left by the constant amount.
+		{
+			return RValue<UInt4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
+		}
 	}
 
 	RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
 	{
-		return RValue<UInt4>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
+		if(emulateIntrinsics)   // Emulation path: shift each of the four lanes as a scalar and rebuild the vector.
+		{
+			UInt4 result;   // Not initialized here; every lane is written by the Insert() calls below.
+			result = Insert(result, Extract(lhs, 0) >> UInt(rhs), 0);
+			result = Insert(result, Extract(lhs, 1) >> UInt(rhs), 1);
+			result = Insert(result, Extract(lhs, 2) >> UInt(rhs), 2);
+			result = Insert(result, Extract(lhs, 3) >> UInt(rhs), 3);
+
+			return result;
+		}
+		else   // Otherwise emit a single vector logical shift-right (createLShr) by the constant amount.
+		{
+			return RValue<UInt4>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
+		}
 	}
 
 	RValue<UInt4> operator<<(RValue<UInt4> lhs, RValue<UInt4> rhs)
@@ -5715,6 +6362,14 @@
 		storeValue(integer);
 	}
 
+	Float::Float(RValue<UInt> cast)
+	{
+		// Convert the low 31 bits as a signed Int, then add 2147483648.0f when the top bit of 'cast' is set.
+		RValue<Float> low = Float(Int(cast & UInt(0x7FFFFFFF)));
+		RValue<Float> high = As<Float>((As<Int>(cast) >> 31) & As<Int>(Float(0x80000000u)));   // Shifted top bit (presumably sign-extended) selects the bits of 2^31 or 0.0f.
+		storeValue((low + high).value);
+	}
+
 	Float::Float(float x)
 	{
 		storeValue(Nucleus::createConstantFloat(x));
@@ -6177,14 +6832,27 @@
 
 	RValue<Float4> Sqrt(RValue<Float4> x)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto sqrt = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-		sqrt->addArg(x.value);
-		::basicBlock->appendInst(sqrt);
+		if(emulateIntrinsics)   // Emulation path: apply the scalar Sqrt to each of the four components.
+		{
+			Float4 result;   // Not initialized here; x, y, z and w are all assigned below.
+			result.x = Sqrt(Float(Float4(x).x));
+			result.y = Sqrt(Float(Float4(x).y));
+			result.z = Sqrt(Float(Float4(x).z));
+			result.w = Sqrt(Float(Float4(x).w));
 
-		return RValue<Float4>(V(result));
+			return result;
+		}
+		else   // Otherwise emit the Sqrt intrinsic call on the whole vector.
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto sqrt = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+			sqrt->addArg(x.value);
+			::basicBlock->appendInst(sqrt);
+
+			return RValue<Float4>(V(result));
+		}
 	}
 
 	RValue<Float4> Insert(RValue<Float4> x, RValue<Float> element, int i)
@@ -6238,14 +6906,22 @@
 
 	RValue<Int> SignMask(RValue<Float4> x)
 	{
-		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
-		const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-		auto target = ::context->getConstantUndef(Ice::IceType_i32);
-		auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
-		movmsk->addArg(x.value);
-		::basicBlock->appendInst(movmsk);
+		if(emulateIntrinsics)
+		{
+			Int4 xx = (As<Int4>(x) >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);   // Reinterpret as Int4; arithmetic shift replicates each lane's sign bit; the mask keeps bit i for lane i.
+			return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);   // OR the four lanes into one 4-bit mask.
+		}
+		else   // Otherwise emit the SignMask intrinsic call on x.
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+			movmsk->addArg(x.value);
+			::basicBlock->appendInst(movmsk);
 
-		return RValue<Int>(V(result));
+			return RValue<Int>(V(result));
+		}
 	}
 
 	RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
@@ -6325,16 +7001,22 @@
 
 	RValue<Float4> Frac(RValue<Float4> x)
 	{
+		Float4 frc;   // Per-lane fractional part; assigned by one of the two branches below.
+
 		if(CPUID::SSE4_1)
 		{
-			return x - Floor(x);
+			frc = x - Floor(x);   // Floor() is used directly when CPUID::SSE4_1 is set.
 		}
 		else
 		{
-			Float4 frc = x - Float4(Int4(x));   // Signed fractional part
+			frc = x - Float4(Int4(x));   // Signed fractional part.
 
-			return frc + As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1, 1, 1, 1)));
+			frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1, 1, 1, 1)));   // Add 1.0 if negative.
 		}
+
+		// x - floor(x) can be 1.0 for very small negative x.
+		// Clamp against the value just below 1.0.
+		return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));   // 0x3F7FFFFF: bit pattern of the largest single-precision value below 1.0.
 	}
 
 	RValue<Float4> Floor(RValue<Float4> x)
diff --git a/src/Reactor/x86.hpp b/src/Reactor/x86.hpp
index 038a49d..5e759b3 100644
--- a/src/Reactor/x86.hpp
+++ b/src/Reactor/x86.hpp
@@ -22,8 +22,6 @@
 	namespace x86
 	{
 		RValue<Int> cvtss2si(RValue<Float> val);
-		RValue<Int2> cvtps2pi(RValue<Float4> val);
-		RValue<Int2> cvttps2pi(RValue<Float4> val);
 		RValue<Int4> cvtps2dq(RValue<Float4> val);
 
 		RValue<Float> rcpss(RValue<Float> val);
@@ -44,26 +42,6 @@
 		RValue<Float4> floorps(RValue<Float4> val);
 		RValue<Float4> ceilps(RValue<Float4> val);
 
-		RValue<Float4> cmpps(RValue<Float4> x, RValue<Float4> y, unsigned char imm);
-		RValue<Float4> cmpeqps(RValue<Float4> x, RValue<Float4> y);
-		RValue<Float4> cmpltps(RValue<Float4> x, RValue<Float4> y);
-		RValue<Float4> cmpleps(RValue<Float4> x, RValue<Float4> y);
-		RValue<Float4> cmpunordps(RValue<Float4> x, RValue<Float4> y);
-		RValue<Float4> cmpneqps(RValue<Float4> x, RValue<Float4> y);
-		RValue<Float4> cmpnltps(RValue<Float4> x, RValue<Float4> y);
-		RValue<Float4> cmpnleps(RValue<Float4> x, RValue<Float4> y);
-		RValue<Float4> cmpordps(RValue<Float4> x, RValue<Float4> y);
-
-		RValue<Float> cmpss(RValue<Float> x, RValue<Float> y, unsigned char imm);
-		RValue<Float> cmpeqss(RValue<Float> x, RValue<Float> y);
-		RValue<Float> cmpltss(RValue<Float> x, RValue<Float> y);
-		RValue<Float> cmpless(RValue<Float> x, RValue<Float> y);
-		RValue<Float> cmpunordss(RValue<Float> x, RValue<Float> y);
-		RValue<Float> cmpneqss(RValue<Float> x, RValue<Float> y);
-		RValue<Float> cmpnltss(RValue<Float> x, RValue<Float> y);
-		RValue<Float> cmpnless(RValue<Float> x, RValue<Float> y);
-		RValue<Float> cmpordss(RValue<Float> x, RValue<Float> y);
-
 		RValue<Int4> pabsd(RValue<Int4> x);
 
 		RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y);
@@ -75,26 +53,6 @@
 		RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y);
 		RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y);
 
-		RValue<Short4> paddw(RValue<Short4> x, RValue<Short4> y);
-		RValue<Short4> psubw(RValue<Short4> x, RValue<Short4> y);
-		RValue<Short4> pmullw(RValue<Short4> x, RValue<Short4> y);
-		RValue<Short4> pand(RValue<Short4> x, RValue<Short4> y);
-		RValue<Short4> por(RValue<Short4> x, RValue<Short4> y);
-		RValue<Short4> pxor(RValue<Short4> x, RValue<Short4> y);
-		RValue<Short4> pshufw(RValue<Short4> x, unsigned char y);
-		RValue<Int2> punpcklwd(RValue<Short4> x, RValue<Short4> y);
-		RValue<Int2> punpckhwd(RValue<Short4> x, RValue<Short4> y);
-		RValue<Short4> pinsrw(RValue<Short4> x, RValue<Int> y, unsigned int i);
-		RValue<Int> pextrw(RValue<Short4> x, unsigned int i);
-		RValue<Short4> punpckldq(RValue<Int2> x, RValue<Int2> y);
-		RValue<Short4> punpckhdq(RValue<Int2> x, RValue<Int2> y);
-		RValue<Short4> punpcklbw(RValue<Byte8> x, RValue<Byte8> y);
-		RValue<Short4> punpckhbw(RValue<Byte8> x, RValue<Byte8> y);
-		RValue<Byte8> paddb(RValue<Byte8> x, RValue<Byte8> y);
-		RValue<Byte8> psubb(RValue<Byte8> x, RValue<Byte8> y);
-		RValue<Int2> paddd(RValue<Int2> x, RValue<Int2> y);
-		RValue<Int2> psubd(RValue<Int2> x, RValue<Int2> y);
-
 		RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y);
 
 		RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y);
@@ -141,12 +99,10 @@
 		RValue<Int> movmskps(RValue<Float4> x);
 		RValue<Int> pmovmskb(RValue<Byte8> x);
 
-		RValue<Int4> pmovzxbd(RValue<Int4> x);
-		RValue<Int4> pmovsxbd(RValue<Int4> x);
-		RValue<Int4> pmovzxwd(RValue<Int4> x);
-		RValue<Int4> pmovsxwd(RValue<Int4> x);
-
-		void emms();
+		RValue<Int4> pmovzxbd(RValue<Byte16> x);
+		RValue<Int4> pmovsxbd(RValue<SByte16> x);
+		RValue<Int4> pmovzxwd(RValue<UShort8> x);
+		RValue<Int4> pmovsxwd(RValue<Short8> x);
 	}
 }
 
diff --git a/src/Renderer/BUILD.gn b/src/Renderer/BUILD.gn
index 301b106..a8ad847 100644
--- a/src/Renderer/BUILD.gn
+++ b/src/Renderer/BUILD.gn
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import("../swiftshader.gni")
+
 # Need a separate config to ensure the warnings are added to the end.
 config("swiftshader_renderer_private_config") {
   if (is_win) {
@@ -25,12 +27,10 @@
       "-msse2",
       "-Wno-sign-compare",
     ]
-
-    defines = [ "LOG_TAG=\"swiftshader_renderer\"" ]
   }
 }
 
-source_set("swiftshader_renderer") {
+swiftshader_source_set("swiftshader_renderer") {
   deps = [
     "../Shader:swiftshader_shader",
   ]
@@ -55,11 +55,7 @@
     "VertexProcessor.cpp",
   ]
 
-  if (is_win) {
-    configs -= [ "//build/config/win:unicode" ]
-  }
-
-  configs += [ ":swiftshader_renderer_private_config" ]
+  configs = [ ":swiftshader_renderer_private_config" ]
 
   include_dirs = [
     ".",
diff --git a/src/Renderer/Blitter.cpp b/src/Renderer/Blitter.cpp
index c93b09f..0c4a160 100644
--- a/src/Renderer/Blitter.cpp
+++ b/src/Renderer/Blitter.cpp
@@ -260,11 +260,11 @@
 			c.w = float(0xFFFF);
 			break;
 		case FORMAT_R32I:
-			c.x = Float(Int(*Pointer<Int>(element)));
+			c.x = Float(*Pointer<Int>(element));
 			c.w = float(0x7FFFFFFF);
 			break;
 		case FORMAT_R32UI:
-			c.x = Float(Int(*Pointer<UInt>(element)));
+			c.x = Float(*Pointer<UInt>(element));
 			c.w = float(0xFFFFFFFF);
 			break;
 		case FORMAT_A8R8G8B8:
@@ -359,13 +359,13 @@
 			c.w = float(0xFFFF);
 			break;
 		case FORMAT_G32R32I:
-			c.x = Float(Int(*Pointer<Int>(element + 0)));
-			c.y = Float(Int(*Pointer<Int>(element + 4)));
+			c.x = Float(*Pointer<Int>(element + 0));
+			c.y = Float(*Pointer<Int>(element + 4));
 			c.w = float(0x7FFFFFFF);
 			break;
 		case FORMAT_G32R32UI:
-			c.x = Float(Int(*Pointer<UInt>(element + 0)));
-			c.y = Float(Int(*Pointer<UInt>(element + 4)));
+			c.x = Float(*Pointer<UInt>(element + 0));
+			c.y = Float(*Pointer<UInt>(element + 4));
 			c.w = float(0xFFFFFFFF);
 			break;
 		case FORMAT_A32B32G32R32F:
@@ -835,24 +835,18 @@
 			c = Insert(c, Int(*Pointer<UShort>(element)), 0);
 			break;
 		case FORMAT_A32B32G32R32I:
+		case FORMAT_A32B32G32R32UI:
 			c = *Pointer<Int4>(element);
 			break;
 		case FORMAT_X32B32G32R32I:
+		case FORMAT_X32B32G32R32UI:
 			c = Insert(c, *Pointer<Int>(element + 8), 2);
 		case FORMAT_G32R32I:
+		case FORMAT_G32R32UI:
 			c = Insert(c, *Pointer<Int>(element + 4), 1);
 		case FORMAT_R32I:
-			c = Insert(c, *Pointer<Int>(element), 0);
-			break;
-		case FORMAT_A32B32G32R32UI:
-			c = *Pointer<UInt4>(element);
-			break;
-		case FORMAT_X32B32G32R32UI:
-			c = Insert(c, Int(*Pointer<UInt>(element + 8)), 2);
-		case FORMAT_G32R32UI:
-			c = Insert(c, Int(*Pointer<UInt>(element + 4)), 1);
 		case FORMAT_R32UI:
-			c = Insert(c, Int(*Pointer<UInt>(element)), 0);
+			c = Insert(c, *Pointer<Int>(element), 0);
 			break;
 		default:
 			return false;
diff --git a/src/Renderer/Context.cpp b/src/Renderer/Context.cpp
index caa4592..e5ee4dc 100644
--- a/src/Renderer/Context.cpp
+++ b/src/Renderer/Context.cpp
@@ -33,6 +33,7 @@
 	bool fullPixelPositionRegister = false;
 	bool leadingVertexFirst = false;         // Flat shading uses first vertex, else last
 	bool secondaryColor = false;             // Specular lighting is applied after texturing
+	bool colorsDefaultToZero = false;
 
 	bool forceWindowed = false;
 	bool quadLayoutEnabled = false;
diff --git a/src/Renderer/PixelProcessor.cpp b/src/Renderer/PixelProcessor.cpp
index 172e8ef..db11aed 100644
--- a/src/Renderer/PixelProcessor.cpp
+++ b/src/Renderer/PixelProcessor.cpp
@@ -444,6 +444,15 @@
 		else ASSERT(false);
 	}
 
+	void PixelProcessor::setHighPrecisionFiltering(unsigned int sampler, bool highPrecisionFiltering)
+	{
+		// An index at or beyond TEXTURE_IMAGE_UNITS trips the assert and changes nothing.
+		if(sampler >= TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+		auto &target = context->sampler[sampler];
+		target.setHighPrecisionFiltering(highPrecisionFiltering);
+	}
+
 	void PixelProcessor::setSwizzleR(unsigned int sampler, SwizzleType swizzleR)
 	{
 		if(sampler < TEXTURE_IMAGE_UNITS)
diff --git a/src/Renderer/PixelProcessor.hpp b/src/Renderer/PixelProcessor.hpp
index 94d52d3..dd54b72 100644
--- a/src/Renderer/PixelProcessor.hpp
+++ b/src/Renderer/PixelProcessor.hpp
@@ -231,6 +231,7 @@
 		void setMipmapLOD(unsigned int sampler, float bias);
 		void setBorderColor(unsigned int sampler, const Color<float> &borderColor);
 		void setMaxAnisotropy(unsigned int sampler, float maxAnisotropy);
+		void setHighPrecisionFiltering(unsigned int sampler, bool highPrecisionFiltering);
 		void setSwizzleR(unsigned int sampler, SwizzleType swizzleR);
 		void setSwizzleG(unsigned int sampler, SwizzleType swizzleG);
 		void setSwizzleB(unsigned int sampler, SwizzleType swizzleB);
diff --git a/src/Renderer/Renderer.cpp b/src/Renderer/Renderer.cpp
index a84423d..252d744 100644
--- a/src/Renderer/Renderer.cpp
+++ b/src/Renderer/Renderer.cpp
@@ -48,6 +48,7 @@
 	extern bool fullPixelPositionRegister;
 	extern bool leadingVertexFirst;         // Flat shading uses first vertex, else last
 	extern bool secondaryColor;             // Specular lighting is applied after texturing
+	extern bool colorsDefaultToZero;
 
 	extern bool forceWindowed;
 	extern bool complementaryDepthBuffer;
@@ -110,6 +111,7 @@
 		sw::fullPixelPositionRegister = conventions.fullPixelPositionRegister;
 		sw::leadingVertexFirst = conventions.leadingVertexFirst;
 		sw::secondaryColor = conventions.secondaryColor;
+		sw::colorsDefaultToZero = conventions.colorsDefaultToZero;
 		sw::exactColorRounding = exactColorRounding;
 
 		setRenderTarget(0, 0);
@@ -670,9 +672,15 @@
 		}
 	}
 
-	void Renderer::clear(void *pixel, Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask)
+	void Renderer::clear(void *value, Format format, Surface *dest, const Rect &clearRect, unsigned int rgbaMask)   // Clears 'clearRect' in every slice of 'dest' via the blitter.
 	{
-		blitter->clear(pixel, format, dest, dRect, rgbaMask);
+		SliceRect rect = clearRect;   // Working copy whose slice index is advanced by the loop below.
+		int samples = dest->getDepth();   // Slice count of the destination (presumably one slice per sample — confirm).
+
+		for(rect.slice = 0; rect.slice < samples; rect.slice++)
+		{
+			blitter->clear(value, format, dest, rect, rgbaMask);
+		}
 	}
 
 	void Renderer::blit(Surface *source, const SliceRect &sRect, Surface *dest, const SliceRect &dRect, bool filter, bool isStencil)
@@ -2314,6 +2322,18 @@
 		}
 	}
 
+	void Renderer::setHighPrecisionFiltering(SamplerType type, int sampler, bool highPrecisionFiltering)
+	{
+		// Pixel samplers are forwarded to the pixel processor; every other type goes to the vertex processor.
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setHighPrecisionFiltering(sampler, highPrecisionFiltering);
+			return;
+		}
+
+		PixelProcessor::setHighPrecisionFiltering(sampler, highPrecisionFiltering);
+	}
+
 	void Renderer::setSwizzleR(SamplerType type, int sampler, SwizzleType swizzleR)
 	{
 		if(type == SAMPLER_PIXEL)
diff --git a/src/Renderer/Renderer.hpp b/src/Renderer/Renderer.hpp
index c59dd31..d796475 100644
--- a/src/Renderer/Renderer.hpp
+++ b/src/Renderer/Renderer.hpp
@@ -65,6 +65,7 @@
 		bool fullPixelPositionRegister;
 		bool leadingVertexFirst;
 		bool secondaryColor;
+		bool colorsDefaultToZero;
 	};
 
 	static const Conventions OpenGL =
@@ -74,7 +75,8 @@
 		true,    // booleanFaceRegister
 		true,    // fullPixelPositionRegister
 		false,   // leadingVertexFirst
-		false    // secondaryColor
+		false,   // secondaryColor
+		true,    // colorsDefaultToZero
 	};
 
 	static const Conventions Direct3D =
@@ -85,6 +87,7 @@
 		false,   // fullPixelPositionRegister
 		true,    // leadingVertexFirst
 		true,    // secondardyColor
+		false,   // colorsDefaultToZero
 	};
 
 	struct Query
@@ -323,7 +326,7 @@
 
 		void draw(DrawType drawType, unsigned int indexOffset, unsigned int count, bool update = true);
 
-		void clear(void* pixel, Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask);
+		void clear(void *value, Format format, Surface *dest, const Rect &rect, unsigned int rgbaMask);
 		void blit(Surface *source, const SliceRect &sRect, Surface *dest, const SliceRect &dRect, bool filter, bool isStencil = false);
 		void blit3D(Surface *source, Surface *dest);
 
@@ -345,6 +348,7 @@
 		void setMipmapLOD(SamplerType type, int sampler, float bias);
 		void setBorderColor(SamplerType type, int sampler, const Color<float> &borderColor);
 		void setMaxAnisotropy(SamplerType type, int sampler, float maxAnisotropy);
+		void setHighPrecisionFiltering(SamplerType type, int sampler, bool highPrecisionFiltering);
 		void setSwizzleR(SamplerType type, int sampler, SwizzleType swizzleR);
 		void setSwizzleG(SamplerType type, int sampler, SwizzleType swizzleG);
 		void setSwizzleB(SamplerType type, int sampler, SwizzleType swizzleB);
diff --git a/src/Renderer/Sampler.cpp b/src/Renderer/Sampler.cpp
index e2447e0..24734da 100644
--- a/src/Renderer/Sampler.cpp
+++ b/src/Renderer/Sampler.cpp
@@ -60,6 +60,7 @@
 		mipmapFilterState = MIPMAP_NONE;
 		sRGB = false;
 		gather = false;
+		highPrecisionFiltering = false;
 
 		swizzleR = SWIZZLE_RED;
 		swizzleG = SWIZZLE_GREEN;
@@ -97,6 +98,7 @@
 			state.swizzleG = swizzleG;
 			state.swizzleB = swizzleB;
 			state.swizzleA = swizzleA;
+			state.highPrecisionFiltering = highPrecisionFiltering;
 
 			#if PERF_PROFILE
 				state.compressedFormat = Surface::isCompressed(externalTextureFormat);
@@ -205,8 +207,15 @@
 				mipmap.onePitchP[2] = 1;
 				mipmap.onePitchP[3] = pitchP;
 
+				mipmap.pitchP[0] = pitchP;
+				mipmap.pitchP[1] = pitchP;
+				mipmap.pitchP[2] = pitchP;
+				mipmap.pitchP[3] = pitchP;
+
 				mipmap.sliceP[0] = sliceP;
 				mipmap.sliceP[1] = sliceP;
+				mipmap.sliceP[2] = sliceP;
+				mipmap.sliceP[3] = sliceP;
 
 				if(internalTextureFormat == FORMAT_YV12_BT601 ||
 				   internalTextureFormat == FORMAT_YV12_BT709 ||
@@ -298,6 +307,11 @@
 		texture.maxAnisotropy = maxAnisotropy;
 	}
 
+	void Sampler::setHighPrecisionFiltering(bool highPrecisionFiltering)   // Stores the flag in this sampler's highPrecisionFiltering member.
+	{
+		this->highPrecisionFiltering = highPrecisionFiltering;
+	}
+
 	void Sampler::setSwizzleR(SwizzleType swizzleR)
 	{
 		this->swizzleR = swizzleR;
diff --git a/src/Renderer/Sampler.hpp b/src/Renderer/Sampler.hpp
index 4c4973d..6fae602 100644
--- a/src/Renderer/Sampler.hpp
+++ b/src/Renderer/Sampler.hpp
@@ -36,7 +36,8 @@
 		short height[4];
 		short depth[4];
 		short onePitchP[4];
-		int sliceP[2];
+		int4 pitchP;
+		int4 sliceP;
 	};
 
 	struct Texture
@@ -140,6 +141,7 @@
 			SwizzleType swizzleG           : BITS(SWIZZLE_LAST);
 			SwizzleType swizzleB           : BITS(SWIZZLE_LAST);
 			SwizzleType swizzleA           : BITS(SWIZZLE_LAST);
+			bool highPrecisionFiltering    : 1;
 
 			#if PERF_PROFILE
 			bool compressedFormat          : 1;
@@ -163,6 +165,7 @@
 		void setReadSRGB(bool sRGB);
 		void setBorderColor(const Color<float> &borderColor);
 		void setMaxAnisotropy(float maxAnisotropy);
+		void setHighPrecisionFiltering(bool highPrecisionFiltering);
 		void setSwizzleR(SwizzleType swizzleR);
 		void setSwizzleG(SwizzleType swizzleG);
 		void setSwizzleB(SwizzleType swizzleB);
@@ -202,6 +205,7 @@
 		MipmapType mipmapFilterState;
 		bool sRGB;
 		bool gather;
+		bool highPrecisionFiltering;
 
 		SwizzleType swizzleR;
 		SwizzleType swizzleG;
diff --git a/src/Renderer/Surface.cpp b/src/Renderer/Surface.cpp
index 6615b4b..6bcc657 100644
--- a/src/Renderer/Surface.cpp
+++ b/src/Renderer/Surface.cpp
@@ -2984,14 +2984,14 @@
 		case FORMAT_X8B8G8R8I:
 		case FORMAT_G8R8I:
 		case FORMAT_R8I:
-		case FORMAT_A8B8G8R8UI:
-		case FORMAT_X8B8G8R8UI:
-		case FORMAT_G8R8UI:
-		case FORMAT_R8UI:
 		case FORMAT_A16B16G16R16I:
 		case FORMAT_X16B16G16R16I:
 		case FORMAT_G16R16I:
 		case FORMAT_R16I:
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_X32B32G32R32I:
+		case FORMAT_G32R32I:
+		case FORMAT_R32I:
 			return true;
 		default:
 			return false;
@@ -3002,14 +3002,14 @@
 	{
 		switch(format)
 		{
+		case FORMAT_A8B8G8R8UI:
+		case FORMAT_X8B8G8R8UI:
+		case FORMAT_G8R8UI:
+		case FORMAT_R8UI:
 		case FORMAT_A16B16G16R16UI:
 		case FORMAT_X16B16G16R16UI:
 		case FORMAT_G16R16UI:
 		case FORMAT_R16UI:
-		case FORMAT_A32B32G32R32I:
-		case FORMAT_X32B32G32R32I:
-		case FORMAT_G32R32I:
-		case FORMAT_R32I:
 		case FORMAT_A32B32G32R32UI:
 		case FORMAT_X32B32G32R32UI:
 		case FORMAT_G32R32UI:
@@ -3193,14 +3193,14 @@
 		resource->unlock();
 	}
 
-	bool Surface::isEntire(const SliceRect& rect) const
+	bool Surface::isEntire(const Rect& rect) const
 	{
 		return (rect.x0 == 0 && rect.y0 == 0 && rect.x1 == internal.width && rect.y1 == internal.height && internal.depth == 1);
 	}
 
-	SliceRect Surface::getRect() const
+	Rect Surface::getRect() const
 	{
-		return SliceRect(0, 0, internal.width, internal.height, 0);
+		return Rect(0, 0, internal.width, internal.height);
 	}
 
 	void Surface::clearDepth(float depth, int x0, int y0, int width, int height)
diff --git a/src/Renderer/Surface.hpp b/src/Renderer/Surface.hpp
index b54565e..6418c08 100644
--- a/src/Renderer/Surface.hpp
+++ b/src/Renderer/Surface.hpp
@@ -299,8 +299,8 @@
 		inline int getMultiSampleCount() const;
 		inline int getSuperSampleCount() const;
 
-		bool isEntire(const SliceRect& rect) const;
-		SliceRect getRect() const;
+		bool isEntire(const Rect& rect) const;
+		Rect getRect() const;
 		void clearDepth(float depth, int x0, int y0, int width, int height);
 		void clearStencil(unsigned char stencil, unsigned char mask, int x0, int y0, int width, int height);
 		void fill(const Color<float> &color, int x0, int y0, int width, int height);
diff --git a/src/Renderer/VertexProcessor.cpp b/src/Renderer/VertexProcessor.cpp
index 91c4d34..6972d94 100644
--- a/src/Renderer/VertexProcessor.cpp
+++ b/src/Renderer/VertexProcessor.cpp
@@ -602,6 +602,15 @@
 		else ASSERT(false);
 	}
 
+	void VertexProcessor::setHighPrecisionFiltering(unsigned int sampler, bool highPrecisionFiltering)
+	{
+		if(sampler < VERTEX_TEXTURE_IMAGE_UNITS)   // 'sampler' is a vertex-stage unit index; same bound as the sibling setSwizzleR()
+		{
+			context->sampler[TEXTURE_IMAGE_UNITS + sampler].setHighPrecisionFiltering(highPrecisionFiltering);   // NOTE(review): assumes vertex samplers are stored after the TEXTURE_IMAGE_UNITS pixel samplers - confirm
+		}
+		else ASSERT(false);
+	}
+
 	void VertexProcessor::setSwizzleR(unsigned int sampler, SwizzleType swizzleR)
 	{
 		if(sampler < VERTEX_TEXTURE_IMAGE_UNITS)
diff --git a/src/Renderer/VertexProcessor.hpp b/src/Renderer/VertexProcessor.hpp
index 278c9b1..3552f84 100644
--- a/src/Renderer/VertexProcessor.hpp
+++ b/src/Renderer/VertexProcessor.hpp
@@ -258,6 +258,7 @@
 		void setMipmapLOD(unsigned int sampler, float bias);
 		void setBorderColor(unsigned int sampler, const Color<float> &borderColor);
 		void setMaxAnisotropy(unsigned int stage, float maxAnisotropy);
+		void setHighPrecisionFiltering(unsigned int sampler, bool highPrecisionFiltering);
 		void setSwizzleR(unsigned int sampler, SwizzleType swizzleR);
 		void setSwizzleG(unsigned int sampler, SwizzleType swizzleG);
 		void setSwizzleB(unsigned int sampler, SwizzleType swizzleB);
diff --git a/src/Shader/BUILD.gn b/src/Shader/BUILD.gn
index d1323f7..3b19766 100644
--- a/src/Shader/BUILD.gn
+++ b/src/Shader/BUILD.gn
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import("../swiftshader.gni")
+
 # Need a separate config to ensure the warnings are added to the end.
 config("swiftshader_shader_private_config") {
   if (is_win) {
@@ -24,12 +26,10 @@
     if (is_clang) {
       cflags += [ "-Wno-sign-compare" ]
     }
-  } else {
-    defines = [ "LOG_TAG=\"swiftshader_shader\"" ]
   }
 }
 
-source_set("swiftshader_shader") {
+swiftshader_source_set("swiftshader_shader") {
   deps = [
     "../Main:swiftshader_main",
   ]
@@ -50,7 +50,7 @@
     "VertexShader.cpp",
   ]
 
-  configs += [ ":swiftshader_shader_private_config" ]
+  configs = [ ":swiftshader_shader_private_config" ]
 
   include_dirs = [
     ".",
diff --git a/src/Shader/PixelProgram.cpp b/src/Shader/PixelProgram.cpp
index ecf164f..948f103 100644
--- a/src/Shader/PixelProgram.cpp
+++ b/src/Shader/PixelProgram.cpp
@@ -846,8 +846,8 @@
 			reg = v[2 + i];
 			break;
 		case Shader::PARAMETER_MISCTYPE:
-			if(src.index == 0) reg = vPos;
-			if(src.index == 1) reg = vFace;
+			if(src.index == Shader::VPosIndex) reg = vPos;
+			if(src.index == Shader::VFaceIndex) reg = vFace;
 			break;
 		case Shader::PARAMETER_SAMPLER:
 			if(src.rel.type == Shader::PARAMETER_VOID)
diff --git a/src/Shader/PixelShader.cpp b/src/Shader/PixelShader.cpp
index 0b78c14..c659248 100644
--- a/src/Shader/PixelShader.cpp
+++ b/src/Shader/PixelShader.cpp
@@ -700,11 +700,11 @@
 					{
 						unsigned char index = instruction[i]->dst.index;
 
-						if(index == 0)
+						if(index == Shader::VPosIndex)
 						{
 							vPosDeclared = true;
 						}
-						else if(index == 1)
+						else if(index == Shader::VFaceIndex)
 						{
 							vFaceDeclared = true;
 						}
diff --git a/src/Shader/SamplerCore.cpp b/src/Shader/SamplerCore.cpp
index 1d7ff47..62f76fa 100644
--- a/src/Shader/SamplerCore.cpp
+++ b/src/Shader/SamplerCore.cpp
@@ -50,6 +50,8 @@
 
 namespace sw
 {
+	extern bool colorsDefaultToZero;
+
 	SamplerCore::SamplerCore(Pointer<Byte> &constants, const Sampler::State &state) : constants(constants), state(state)
 	{
 	}
@@ -186,6 +188,7 @@
 			if(fixed12 && state.textureFilter != FILTER_GATHER)
 			{
 				int componentCount = textureComponentCount();
+				short defaultColorValue = colorsDefaultToZero ? 0x0000 : 0x1000;
 
 				switch(state.textureFormat)
 				{
@@ -237,8 +240,8 @@
 				case FORMAT_YV12_BT601:
 				case FORMAT_YV12_BT709:
 				case FORMAT_YV12_JFIF:
-					if(componentCount < 2) c.y = Short4(0x1000);
-					if(componentCount < 3) c.z = Short4(0x1000);
+					if(componentCount < 2) c.y = Short4(defaultColorValue);
+					if(componentCount < 3) c.z = Short4(defaultColorValue);
 					if(componentCount < 4) c.w = Short4(0x1000);
 					break;
 				case FORMAT_A8:
@@ -259,9 +262,9 @@
 					c.z = c.x;
 					break;
 				case FORMAT_R32F:
-					c.y = Short4(0x1000);
+					c.y = Short4(defaultColorValue);
 				case FORMAT_G32R32F:
-					c.z = Short4(0x1000);
+					c.z = Short4(defaultColorValue);
 				case FORMAT_X32B32G32R32F:
 					c.w = Short4(0x1000);
 				case FORMAT_A32B32G32R32F:
@@ -314,7 +317,9 @@
 		}
 		else
 		{
-			if(hasFloatTexture())   // FIXME: Mostly identical to integer sampling
+			// FIXME: YUV and sRGB are not supported by the floating point path
+			bool forceFloatFiltering = state.highPrecisionFiltering && !state.sRGB && !hasYuvFormat() && (state.textureFilter != FILTER_POINT);
+			if(hasFloatTexture() || hasUnnormalizedIntegerTexture() || forceFloatFiltering)   // FIXME: Mostly identical to integer sampling
 			{
 				Float4 uuuu = u;
 				Float4 vvvv = v;
@@ -353,36 +358,17 @@
 				}
 
 				sampleFloatFilter(texture, c, uuuu, vvvv, wwww, offset, lod, anisotropy, uDelta, vDelta, face, function);
-			}
-			else
-			{
-				Vector4s cs;
 
-				sampleTexture(texture, cs, u, v, w, q, dsx, dsy, offset, function, false);
-
-				for(int component = 0; component < textureComponentCount(); component++)
+				if(!hasFloatTexture() && !hasUnnormalizedIntegerTexture())
 				{
 					if(has16bitTextureFormat())
 					{
 						switch(state.textureFormat)
 						{
 						case FORMAT_R5G6B5:
-							if(state.sRGB)
-							{
-								sRGBtoLinear16_5_12(cs.x);
-								sRGBtoLinear16_6_12(cs.y);
-								sRGBtoLinear16_5_12(cs.z);
-
-								convertSigned12(c.x, cs.x);
-								convertSigned12(c.y, cs.y);
-								convertSigned12(c.z, cs.z);
-							}
-							else
-							{
-								c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF800);
-								c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xFC00);
-								c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF800);
-							}
+							c.x *= Float4(1.0f / 0xF800);
+							c.y *= Float4(1.0f / 0xFC00);
+							c.z *= Float4(1.0f / 0xF800);
 							break;
 						default:
 							ASSERT(false);
@@ -390,57 +376,72 @@
 					}
 					else
 					{
-						switch(state.textureFormat)
+						for(int component = 0; component < textureComponentCount(); component++)
 						{
-						case FORMAT_R8I:
-						case FORMAT_G8R8I:
-						case FORMAT_X8B8G8R8I:
-						case FORMAT_A8B8G8R8I:
-							c[component] = As<Float4>(Int4(cs[component]) >> 8);
-							break;
-						case FORMAT_R8UI:
-						case FORMAT_G8R8UI:
-						case FORMAT_X8B8G8R8UI:
-						case FORMAT_A8B8G8R8UI:
-							c[component] = As<Float4>(Int4(As<UShort4>(cs[component]) >> 8));
-							break;
-						case FORMAT_R16I:
-						case FORMAT_G16R16I:
-						case FORMAT_X16B16G16R16I:
-						case FORMAT_A16B16G16R16I:
-							c[component] = As<Float4>(Int4(cs[component]));
-							break;
-						case FORMAT_R16UI:
-						case FORMAT_G16R16UI:
-						case FORMAT_X16B16G16R16UI:
-						case FORMAT_A16B16G16R16UI:
-							c[component] = As<Float4>(Int4(As<UShort4>(cs[component])));
-							break;
-						default:
-							// Normalized integer formats
-							if(state.sRGB && isRGBComponent(component))
+							c[component] *= Float4(hasUnsignedTextureComponent(component) ? 1.0f / 0xFFFF : 1.0f / 0x7FFF);
+						}
+					}
+				}
+			}
+			else
+			{
+				Vector4s cs;
+
+				sampleTexture(texture, cs, u, v, w, q, dsx, dsy, offset, function, false);
+
+				if(has16bitTextureFormat())
+				{
+					switch(state.textureFormat)
+					{
+					case FORMAT_R5G6B5:
+						if(state.sRGB)
+						{
+							sRGBtoLinear16_5_12(cs.x);
+							sRGBtoLinear16_6_12(cs.y);
+							sRGBtoLinear16_5_12(cs.z);
+
+							convertSigned12(c.x, cs.x);
+							convertSigned12(c.y, cs.y);
+							convertSigned12(c.z, cs.z);
+						}
+						else
+						{
+							c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF800);
+							c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xFC00);
+							c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF800);
+						}
+						break;
+					default:
+						ASSERT(false);
+					}
+				}
+				else
+				{
+					for(int component = 0; component < textureComponentCount(); component++)
+					{
+						// Normalized integer formats
+						if(state.sRGB && isRGBComponent(component))
+						{
+							sRGBtoLinear16_8_12(cs[component]);   // FIXME: Perform linearization at surface level for read-only textures
+							convertSigned12(c[component], cs[component]);
+						}
+						else
+						{
+							if(hasUnsignedTextureComponent(component))
 							{
-								sRGBtoLinear16_8_12(cs[component]);   // FIXME: Perform linearization at surface level for read-only textures
-								convertSigned12(c[component], cs[component]);
+								convertUnsigned16(c[component], cs[component]);
 							}
 							else
 							{
-								if(hasUnsignedTextureComponent(component))
-								{
-									convertUnsigned16(c[component], cs[component]);
-								}
-								else
-								{
-									convertSigned15(c[component], cs[component]);
-								}
+								convertSigned15(c[component], cs[component]);
 							}
-							break;
 						}
 					}
 				}
 			}
 
 			int componentCount = textureComponentCount();
+			float defaultColorValue = colorsDefaultToZero ? 0.0f : 1.0f;
 
 			if(state.textureFilter != FILTER_GATHER)
 			{
@@ -498,8 +499,8 @@
 				case FORMAT_YV12_BT601:
 				case FORMAT_YV12_BT709:
 				case FORMAT_YV12_JFIF:
-					if(componentCount < 2) c.y = Float4(1.0f);
-					if(componentCount < 3) c.z = Float4(1.0f);
+					if(componentCount < 2) c.y = Float4(defaultColorValue);
+					if(componentCount < 3) c.z = Float4(defaultColorValue);
 					if(componentCount < 4) c.w = Float4(1.0f);
 					break;
 				case FORMAT_A8:
@@ -520,9 +521,9 @@
 					c.z = c.x;
 					break;
 				case FORMAT_R32F:
-					c.y = Float4(1.0f);
+					c.y = Float4(defaultColorValue);
 				case FORMAT_G32R32F:
-					c.z = Float4(1.0f);
+					c.z = Float4(defaultColorValue);
 				case FORMAT_X32B32G32R32F:
 					c.w = Float4(1.0f);
 				case FORMAT_A32B32G32R32F:
@@ -801,24 +802,19 @@
 
 		if(state.textureFilter == FILTER_POINT || texelFetch)
 		{
-			sampleTexel(c, uuuu, vvvv, wwww, offset, mipmap, buffer, function);
+			c = sampleTexel(uuuu, vvvv, wwww, offset, mipmap, buffer, function);
 		}
 		else
 		{
-			Vector4s c0;
-			Vector4s c1;
-			Vector4s c2;
-			Vector4s c3;
-
 			Short4 uuuu0 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, gather ? 0 : -1, lod);
 			Short4 vvvv0 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, gather ? 0 : -1, lod);
 			Short4 uuuu1 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, gather ? 2 : +1, lod);
 			Short4 vvvv1 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, gather ? 2 : +1, lod);
 
-			sampleTexel(c0, uuuu0, vvvv0, wwww, offset, mipmap, buffer, function);
-			sampleTexel(c1, uuuu1, vvvv0, wwww, offset, mipmap, buffer, function);
-			sampleTexel(c2, uuuu0, vvvv1, wwww, offset, mipmap, buffer, function);
-			sampleTexel(c3, uuuu1, vvvv1, wwww, offset, mipmap, buffer, function);
+			Vector4s c0 = sampleTexel(uuuu0, vvvv0, wwww, offset, mipmap, buffer, function);
+			Vector4s c1 = sampleTexel(uuuu1, vvvv0, wwww, offset, mipmap, buffer, function);
+			Vector4s c2 = sampleTexel(uuuu0, vvvv1, wwww, offset, mipmap, buffer, function);
+			Vector4s c3 = sampleTexel(uuuu1, vvvv1, wwww, offset, mipmap, buffer, function);
 
 			if(!gather)   // Blend
 			{
@@ -997,7 +993,7 @@
 
 		if(state.textureFilter == FILTER_POINT || texelFetch)
 		{
-			sampleTexel(c_, uuuu, vvvv, wwww, offset, mipmap, buffer, function);
+			c_ = sampleTexel(uuuu, vvvv, wwww, offset, mipmap, buffer, function);
 		}
 		else
 		{
@@ -1069,7 +1065,7 @@
 				{
 					for(int k = 0; k < 2; k++)
 					{
-						sampleTexel(c[i][j][k], u[i][j][k], v[i][j][k], s[i][j][k], offset, mipmap, buffer, function);
+						c[i][j][k] = sampleTexel(u[i][j][k], v[i][j][k], s[i][j][k], offset, mipmap, buffer, function);
 
 						if(componentCount >= 1) { if(hasUnsignedTextureComponent(0)) c[i][j][k].x = MulHigh(As<UShort4>(c[i][j][k].x), f[1 - i][1 - j][1 - k]); else c[i][j][k].x = MulHigh(c[i][j][k].x, fs[1 - i][1 - j][1 - k]); }
 						if(componentCount >= 2) { if(hasUnsignedTextureComponent(1)) c[i][j][k].y = MulHigh(As<UShort4>(c[i][j][k].y), f[1 - i][1 - j][1 - k]); else c[i][j][k].y = MulHigh(c[i][j][k].y, fs[1 - i][1 - j][1 - k]); }
@@ -1256,39 +1252,36 @@
 
 		selectMipmap(texture, buffer, mipmap, lod, face, secondLOD);
 
-		bool texelFetch = (function == Fetch);
+		Int4 x0, x1, y0, y1, z0;
+		Float4 fu, fv;
+		Int4 filter = computeFilterOffset(lod);
+		address(w, z0, z0, fv, mipmap, offset.z, filter, OFFSET(Mipmap, depth), state.addressingModeW, function);
+		address(v, y0, y1, fv, mipmap, offset.y, filter, OFFSET(Mipmap, height), state.addressingModeV, function);
+		address(u, x0, x1, fu, mipmap, offset.x, filter, OFFSET(Mipmap, width), state.addressingModeU, function);
 
-		Short4 uuuu = texelFetch ? Short4(As<Int4>(u)) : address(u, state.addressingModeU, mipmap);
-		Short4 vvvv = texelFetch ? Short4(As<Int4>(v)) : address(v, state.addressingModeV, mipmap);
-		Short4 wwww = texelFetch ? Short4(As<Int4>(w)) : address(w, state.addressingModeW, mipmap);
-
-		if(state.textureFilter == FILTER_POINT || texelFetch)
+		Int4 pitchP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, pitchP), 16);
+		y0 *= pitchP;
+		if(hasThirdCoordinate())
 		{
-			sampleTexel(c, uuuu, vvvv, wwww, offset, w, mipmap, buffer, function);
+			Int4 sliceP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, sliceP), 16);
+			z0 *= sliceP;
+		}
+
+		if(state.textureFilter == FILTER_POINT || (function == Fetch))
+		{
+			c = sampleTexel(x0, y0, z0, w, mipmap, buffer, function);
 		}
 		else
 		{
-			Vector4f c0;
-			Vector4f c1;
-			Vector4f c2;
-			Vector4f c3;
+			y1 *= pitchP;
 
-			Short4 uuuu0 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, gather ? 0 : -1, lod);
-			Short4 vvvv0 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, gather ? 0 : -1, lod);
-			Short4 uuuu1 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, gather ? 2 : +1, lod);
-			Short4 vvvv1 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, gather ? 2 : +1, lod);
-
-			sampleTexel(c0, uuuu0, vvvv0, wwww, offset, w, mipmap, buffer, function);
-			sampleTexel(c1, uuuu1, vvvv0, wwww, offset, w, mipmap, buffer, function);
-			sampleTexel(c2, uuuu0, vvvv1, wwww, offset, w, mipmap, buffer, function);
-			sampleTexel(c3, uuuu1, vvvv1, wwww, offset, w, mipmap, buffer, function);
+			Vector4f c0 = sampleTexel(x0, y0, z0, w, mipmap, buffer, function);
+			Vector4f c1 = sampleTexel(x1, y0, z0, w, mipmap, buffer, function);
+			Vector4f c2 = sampleTexel(x0, y1, z0, w, mipmap, buffer, function);
+			Vector4f c3 = sampleTexel(x1, y1, z0, w, mipmap, buffer, function);
 
 			if(!gather)   // Blend
 			{
-				// Fractions
-				Float4 fu = Frac(Float4(As<UShort4>(uuuu0)) * *Pointer<Float4>(mipmap + OFFSET(Mipmap,fWidth)));
-				Float4 fv = Frac(Float4(As<UShort4>(vvvv0)) * *Pointer<Float4>(mipmap + OFFSET(Mipmap,fHeight)));
-
 				if(componentCount >= 1) c0.x = c0.x + fu * (c1.x - c0.x);
 				if(componentCount >= 2) c0.y = c0.y + fu * (c1.y - c0.y);
 				if(componentCount >= 3) c0.z = c0.z + fu * (c1.z - c0.z);
@@ -1324,47 +1317,35 @@
 
 		selectMipmap(texture, buffer, mipmap, lod, face, secondLOD);
 
-		bool texelFetch = (function == Fetch);
+		Int4 x0, x1, y0, y1, z0, z1;
+		Float4 fu, fv, fw;
+		Int4 filter = computeFilterOffset(lod);
+		address(u, x0, x1, fu, mipmap, offset.x, filter, OFFSET(Mipmap, width), state.addressingModeU, function);
+		address(v, y0, y1, fv, mipmap, offset.y, filter, OFFSET(Mipmap, height), state.addressingModeV, function);
+		address(w, z0, z1, fw, mipmap, offset.z, filter, OFFSET(Mipmap, depth), state.addressingModeW, function);
 
-		Short4 uuuu = texelFetch ? Short4(As<Int4>(u)) : address(u, state.addressingModeU, mipmap);
-		Short4 vvvv = texelFetch ? Short4(As<Int4>(v)) : address(v, state.addressingModeV, mipmap);
-		Short4 wwww = texelFetch ? Short4(As<Int4>(w)) : address(w, state.addressingModeW, mipmap);
+		Int4 pitchP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, pitchP), 16);
+		Int4 sliceP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, sliceP), 16);
+		y0 *= pitchP;
+		z0 *= sliceP;
 
-		if(state.textureFilter == FILTER_POINT || texelFetch)
+		if(state.textureFilter == FILTER_POINT || (function == Fetch))
 		{
-			sampleTexel(c, uuuu, vvvv, wwww, offset, w, mipmap, buffer, function);
+			c = sampleTexel(x0, y0, z0, w, mipmap, buffer, function);
 		}
 		else
 		{
-			Vector4f &c0 = c;
-			Vector4f c1;
-			Vector4f c2;
-			Vector4f c3;
-			Vector4f c4;
-			Vector4f c5;
-			Vector4f c6;
-			Vector4f c7;
+			y1 *= pitchP;
+			z1 *= sliceP;
 
-			Short4 uuuu0 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, -1, lod);
-			Short4 vvvv0 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, -1, lod);
-			Short4 wwww0 = offsetSample(wwww, mipmap, OFFSET(Mipmap,wHalf), state.addressingModeW == ADDRESSING_WRAP, -1, lod);
-			Short4 uuuu1 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, +1, lod);
-			Short4 vvvv1 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, +1, lod);
-			Short4 wwww1 = offsetSample(wwww, mipmap, OFFSET(Mipmap,wHalf), state.addressingModeW == ADDRESSING_WRAP, +1, lod);
-
-			sampleTexel(c0, uuuu0, vvvv0, wwww0, offset, w, mipmap, buffer, function);
-			sampleTexel(c1, uuuu1, vvvv0, wwww0, offset, w, mipmap, buffer, function);
-			sampleTexel(c2, uuuu0, vvvv1, wwww0, offset, w, mipmap, buffer, function);
-			sampleTexel(c3, uuuu1, vvvv1, wwww0, offset, w, mipmap, buffer, function);
-			sampleTexel(c4, uuuu0, vvvv0, wwww1, offset, w, mipmap, buffer, function);
-			sampleTexel(c5, uuuu1, vvvv0, wwww1, offset, w, mipmap, buffer, function);
-			sampleTexel(c6, uuuu0, vvvv1, wwww1, offset, w, mipmap, buffer, function);
-			sampleTexel(c7, uuuu1, vvvv1, wwww1, offset, w, mipmap, buffer, function);
-
-			// Fractions
-			Float4 fu = Frac(Float4(As<UShort4>(uuuu0)) * *Pointer<Float4>(mipmap + OFFSET(Mipmap,fWidth)));
-			Float4 fv = Frac(Float4(As<UShort4>(vvvv0)) * *Pointer<Float4>(mipmap + OFFSET(Mipmap,fHeight)));
-			Float4 fw = Frac(Float4(As<UShort4>(wwww0)) * *Pointer<Float4>(mipmap + OFFSET(Mipmap,fDepth)));
+			Vector4f c0 = sampleTexel(x0, y0, z0, w, mipmap, buffer, function);
+			Vector4f c1 = sampleTexel(x1, y0, z0, w, mipmap, buffer, function);
+			Vector4f c2 = sampleTexel(x0, y1, z0, w, mipmap, buffer, function);
+			Vector4f c3 = sampleTexel(x1, y1, z0, w, mipmap, buffer, function);
+			Vector4f c4 = sampleTexel(x0, y0, z1, w, mipmap, buffer, function);
+			Vector4f c5 = sampleTexel(x1, y0, z1, w, mipmap, buffer, function);
+			Vector4f c6 = sampleTexel(x0, y1, z1, w, mipmap, buffer, function);
+			Vector4f c7 = sampleTexel(x1, y1, z1, w, mipmap, buffer, function);
 
 			// Blend first slice
 			if(componentCount >= 1) c0.x = c0.x + fu * (c1.x - c0.x);
@@ -1399,13 +1380,23 @@
 			if(componentCount >= 4) c4.w = c4.w + fv * (c6.w - c4.w);
 
 			// Blend slices
-			if(componentCount >= 1) c0.x = c0.x + fw * (c4.x - c0.x);
-			if(componentCount >= 2) c0.y = c0.y + fw * (c4.y - c0.y);
-			if(componentCount >= 3) c0.z = c0.z + fw * (c4.z - c0.z);
-			if(componentCount >= 4) c0.w = c0.w + fw * (c4.w - c0.w);
+			if(componentCount >= 1) c.x = c0.x + fw * (c4.x - c0.x);
+			if(componentCount >= 2) c.y = c0.y + fw * (c4.y - c0.y);
+			if(componentCount >= 3) c.z = c0.z + fw * (c4.z - c0.z);
+			if(componentCount >= 4) c.w = c0.w + fw * (c4.w - c0.w);
 		}
 	}
 
+	Float SamplerCore::log2sqrt(Float lod)
+	{
+		// Approximates log2(sqrt(lod)) from the float's bit pattern, computed as 0.25 * log2(lod^2).
+		lod *= lod;                                      // Squaring doubles the exponent and produces an extra bit of precision.
+		lod = Float(As<Int>(lod)) - Float(0x3F800000);   // Interpret as integer and subtract the exponent bias (0x3F800000 is the bit pattern of 1.0f).
+		lod *= As<Float>(Int(0x33000000));               // Scale by 0.25 * 2^-23 (mantissa length); 0x33000000 is the bit pattern of 2^-25.
+
+		return lod;
+	}
+
 	void SamplerCore::computeLod(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, SamplerFunction function)
 	{
 		if(function != Lod && function != Fetch)
@@ -1451,10 +1442,7 @@
 				lod *= Rcp_pp(anisotropy * anisotropy);
 			}
 
-			// log2(sqrt(lod))
-			lod = Float(As<Int>(lod));
-			lod -= Float(0x3F800000);
-			lod *= As<Float>(Int(0x33800000));
+			lod = log2sqrt(lod);   // log2(sqrt(lod))
 
 			if(function == Bias)
 			{
@@ -1510,10 +1498,7 @@
 				lod = Max(Float(dUV2.x), Float(dUV2.y));   // Square length of major axis
 			}
 
-			// log2(sqrt(lod))
-			lod = Float(As<Int>(lod));
-			lod -= Float(0x3F800000);
-			lod *= As<Float>(Int(0x33800000));
+			lod = log2sqrt(lod);   // log2(sqrt(lod))
 
 			if(function == Bias)
 			{
@@ -1577,10 +1562,7 @@
 
 				lod = Max(Float(dudxy.x), Float(dudxy.y));   // FIXME: Max(dudxy.x, dudxy.y);
 
-				// log2(sqrt(lod))
-				lod = Float(As<Int>(lod));
-				lod -= Float(0x3F800000);
-				lod *= As<Float>(Int(0x33800000));
+				lod = log2sqrt(lod);   // log2(sqrt(lod))
 
 				if(function == Bias)
 				{
@@ -1700,7 +1682,7 @@
 		uuuu = As<Short4>(MulAdd(uuuu, *Pointer<Short4>(mipmap + OFFSET(Mipmap,onePitchP))));
 		uuu2 = As<Short4>(MulAdd(uuu2, *Pointer<Short4>(mipmap + OFFSET(Mipmap,onePitchP))));
 
-		if((state.textureType == TEXTURE_3D) || (state.textureType == TEXTURE_2D_ARRAY))
+		if(hasThirdCoordinate())
 		{
 			if(state.textureType != TEXTURE_2D_ARRAY)
 			{
@@ -1730,7 +1712,7 @@
 		if(texelFetch)
 		{
 			Int size = Int(*Pointer<Int>(mipmap + OFFSET(Mipmap, sliceP)));
-			if((state.textureType == TEXTURE_3D) || (state.textureType == TEXTURE_2D_ARRAY))
+			if(hasThirdCoordinate())
 			{
 				size *= Int(*Pointer<Short>(mipmap + OFFSET(Mipmap, depth)));
 			}
@@ -1744,11 +1726,24 @@
 		}
 	}
 
-	void SamplerCore::sampleTexel(Vector4s &c, Short4 &uuuu, Short4 &vvvv, Short4 &wwww, Vector4f &offset, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function)
+	void SamplerCore::computeIndices(UInt index[4], Int4& uuuu, Int4& vvvv, Int4& wwww, const Pointer<Byte> &mipmap, SamplerFunction function)
 	{
-		UInt index[4];
+		UInt4 indices = uuuu + vvvv;
 
-		computeIndices(index, uuuu, vvvv, wwww, offset, mipmap, function);
+		if(hasThirdCoordinate())
+		{
+			indices += As<UInt4>(wwww);
+		}
+
+		for(int i = 0; i < 4; i++)
+		{
+			index[i] = Extract(As<Int4>(indices), i);
+		}
+	}
+
+	Vector4s SamplerCore::sampleTexel(UInt index[4], Pointer<Byte> buffer[4])
+	{
+		Vector4s c;
 
 		int f0 = state.textureType == TEXTURE_CUBE ? 0 : 0;
 		int f1 = state.textureType == TEXTURE_CUBE ? 1 : 0;
@@ -1789,8 +1784,7 @@
 					switch(state.textureFormat)
 					{
 					case FORMAT_A8R8G8B8:
-						c.z = c.x;
-						c.z = As<Short4>(UnpackLow(c.z, c.y));
+						c.z = As<Short4>(UnpackLow(c.x, c.y));
 						c.x = As<Short4>(UnpackHigh(c.x, c.y));
 						c.y = c.z;
 						c.w = c.x;
@@ -1801,19 +1795,35 @@
 						break;
 					case FORMAT_A8B8G8R8:
 					case FORMAT_A8B8G8R8I:
-					case FORMAT_A8B8G8R8UI:
 					case FORMAT_A8B8G8R8I_SNORM:
 					case FORMAT_Q8W8V8U8:
 					case FORMAT_SRGB8_A8:
-						c.z = c.x;
+						c.z = As<Short4>(UnpackHigh(c.x, c.y));
 						c.x = As<Short4>(UnpackLow(c.x, c.y));
-						c.z = As<Short4>(UnpackHigh(c.z, c.y));
 						c.y = c.x;
 						c.w = c.z;
 						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(c.x));
 						c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(c.y));
 						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(c.z));
 						c.w = UnpackHigh(As<Byte8>(c.w), As<Byte8>(c.w));
+						// Propagate sign bit
+						if(state.textureFormat == FORMAT_A8B8G8R8I)
+						{
+							c.x >>= 8;
+							c.y >>= 8;
+							c.z >>= 8;
+							c.w >>= 8;
+						}
+						break;
+					case FORMAT_A8B8G8R8UI:
+						c.z = As<Short4>(UnpackHigh(c.x, c.y));
+						c.x = As<Short4>(UnpackLow(c.x, c.y));
+						c.y = c.x;
+						c.w = c.z;
+						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(Short4(0)));
+						c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(Short4(0)));
+						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(Short4(0)));
+						c.w = UnpackHigh(As<Byte8>(c.w), As<Byte8>(Short4(0)));
 						break;
 					default:
 						ASSERT(false);
@@ -1832,8 +1842,7 @@
 					switch(state.textureFormat)
 					{
 					case FORMAT_X8R8G8B8:
-						c.z = c.x;
-						c.z = As<Short4>(UnpackLow(c.z, c.y));
+						c.z = As<Short4>(UnpackLow(c.x, c.y));
 						c.x = As<Short4>(UnpackHigh(c.x, c.y));
 						c.y = c.z;
 						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(c.z));
@@ -1841,18 +1850,31 @@
 						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(c.x));
 						break;
 					case FORMAT_X8B8G8R8I_SNORM:
-					case FORMAT_X8B8G8R8UI:
 					case FORMAT_X8B8G8R8I:
 					case FORMAT_X8B8G8R8:
 					case FORMAT_X8L8V8U8:
 					case FORMAT_SRGB8_X8:
-						c.z = c.x;
+						c.z = As<Short4>(UnpackHigh(c.x, c.y));
 						c.x = As<Short4>(UnpackLow(c.x, c.y));
-						c.z = As<Short4>(UnpackHigh(c.z, c.y));
 						c.y = c.x;
 						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(c.x));
 						c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(c.y));
 						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(c.z));
+						// Propagate sign bit
+						if(state.textureFormat == FORMAT_X8B8G8R8I)
+						{
+							c.x >>= 8;
+							c.y >>= 8;
+							c.z >>= 8;
+						}
+						break;
+					case FORMAT_X8B8G8R8UI:
+						c.z = As<Short4>(UnpackHigh(c.x, c.y));
+						c.x = As<Short4>(UnpackLow(c.x, c.y));
+						c.y = c.x;
+						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(Short4(0)));
+						c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(Short4(0)));
+						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(Short4(0)));
 						break;
 					default:
 						ASSERT(false);
@@ -1868,14 +1890,20 @@
 				switch(state.textureFormat)
 				{
 				case FORMAT_G8R8:
-				case FORMAT_G8R8I:
-				case FORMAT_G8R8UI:
 				case FORMAT_G8R8I_SNORM:
 				case FORMAT_V8U8:
 				case FORMAT_A8L8:
 					c.y = (c.x & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c.x) >> 8);
 					c.x = (c.x & Short4(0x00FFu)) | (c.x << 8);
 					break;
+				case FORMAT_G8R8I:
+					c.y = c.x >> 8;
+					c.x = (c.x << 8) >> 8; // Propagate sign bit
+					break;
+				case FORMAT_G8R8UI:
+					c.y = As<Short4>(As<UShort4>(c.x) >> 8);
+					c.x &= Short4(0x00FFu);
+					break;
 				default:
 					ASSERT(false);
 				}
@@ -1887,7 +1915,25 @@
 					Int c2 = Int(*Pointer<Byte>(buffer[f2] + index[2]));
 					Int c3 = Int(*Pointer<Byte>(buffer[f3] + index[3]));
 					c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
-					c.x = Unpack(As<Byte4>(c0));
+
+					switch(state.textureFormat)
+					{
+					case FORMAT_R8I:
+					case FORMAT_R8UI:
+						{
+							Int zero(0);
+							c.x = Unpack(As<Byte4>(c0), As<Byte4>(zero));
+							// Propagate sign bit
+							if(state.textureFormat == FORMAT_R8I)
+							{
+								c.x = (c.x << 8) >> 8;
+							}
+						}
+						break;
+					default:
+						c.x = Unpack(As<Byte4>(c0));
+						break;
+					}
 				}
 				break;
 			default:
@@ -1924,7 +1970,19 @@
 				ASSERT(false);
 			}
 		}
-		else if(hasYuvFormat())
+		else ASSERT(false);
+
+		return c;
+	}
+
+	Vector4s SamplerCore::sampleTexel(Short4 &uuuu, Short4 &vvvv, Short4 &wwww, Vector4f &offset, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function)
+	{
+		Vector4s c;
+
+		UInt index[4];
+		computeIndices(index, uuuu, vvvv, wwww, offset, mipmap, function);
+
+		if(hasYuvFormat())
 		{
 			// Generic YPbPr to RGB transformation
 			// R = Y                               +           2 * (1 - Kr) * Pr
@@ -2018,66 +2076,111 @@
 			c.y = Min(g, UShort4(0x3FFF)) << 2;
 			c.z = Min(b, UShort4(0x3FFF)) << 2;
 		}
-		else ASSERT(false);
+		else
+		{
+			return sampleTexel(index, buffer);
+		}
+
+		return c;
 	}
 
-	void SamplerCore::sampleTexel(Vector4f &c, Short4 &uuuu, Short4 &vvvv, Short4 &wwww, Vector4f &offset, Float4 &z, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function)
+	Vector4f SamplerCore::sampleTexel(Int4 &uuuu, Int4 &vvvv, Int4 &wwww, Float4 &z, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function)
 	{
+		Vector4f c;
+
 		UInt index[4];
+		computeIndices(index, uuuu, vvvv, wwww, mipmap, function);
 
-		computeIndices(index, uuuu, vvvv, wwww, offset, mipmap, function);
-
-		int f0 = state.textureType == TEXTURE_CUBE ? 0 : 0;
-		int f1 = state.textureType == TEXTURE_CUBE ? 1 : 0;
-		int f2 = state.textureType == TEXTURE_CUBE ? 2 : 0;
-		int f3 = state.textureType == TEXTURE_CUBE ? 3 : 0;
-
-		// Read texels
-		switch(textureComponentCount())
+		if(hasFloatTexture() || has32bitIntegerTextureComponents())
 		{
-		case 4:
-			c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
-			c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
-			c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
-			c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
-			transpose4x4(c.x, c.y, c.z, c.w);
-			break;
-		case 3:
-			ASSERT(state.textureFormat == FORMAT_X32B32G32R32F);
-			c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
-			c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
-			c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
-			c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
-			transpose4x3(c.x, c.y, c.z, c.w);
-			c.w = Float4(1.0f);
-			break;
-		case 2:
-			// FIXME: Optimal shuffling?
-			c.x.xy = *Pointer<Float4>(buffer[f0] + index[0] * 8);
-			c.x.zw = *Pointer<Float4>(buffer[f1] + index[1] * 8 - 8);
-			c.z.xy = *Pointer<Float4>(buffer[f2] + index[2] * 8);
-			c.z.zw = *Pointer<Float4>(buffer[f3] + index[3] * 8 - 8);
-			c.y = c.x;
-			c.x = Float4(c.x.xz, c.z.xz);
-			c.y = Float4(c.y.yw, c.z.yw);
-			break;
-		case 1:
-			// FIXME: Optimal shuffling?
-			c.x.x = *Pointer<Float>(buffer[f0] + index[0] * 4);
-			c.x.y = *Pointer<Float>(buffer[f1] + index[1] * 4);
-			c.x.z = *Pointer<Float>(buffer[f2] + index[2] * 4);
-			c.x.w = *Pointer<Float>(buffer[f3] + index[3] * 4);
+			int f0 = state.textureType == TEXTURE_CUBE ? 0 : 0;
+			int f1 = state.textureType == TEXTURE_CUBE ? 1 : 0;
+			int f2 = state.textureType == TEXTURE_CUBE ? 2 : 0;
+			int f3 = state.textureType == TEXTURE_CUBE ? 3 : 0;
 
-			if(state.textureFormat == FORMAT_D32FS8_SHADOW && state.textureFilter != FILTER_GATHER)
+			// Read texels
+			switch(textureComponentCount())
 			{
-				Float4 d = Min(Max(z, Float4(0.0f)), Float4(1.0f));
+			case 4:
+				c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
+				c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
+				c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
+				c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
+				transpose4x4(c.x, c.y, c.z, c.w);
+				break;
+			case 3:
+				ASSERT(state.textureFormat == FORMAT_X32B32G32R32F);
+				c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
+				c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
+				c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
+				c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
+				transpose4x3(c.x, c.y, c.z, c.w);
+				c.w = Float4(1.0f);
+				break;
+			case 2:
+				// FIXME: Optimal shuffling?
+				c.x.xy = *Pointer<Float4>(buffer[f0] + index[0] * 8);
+				c.x.zw = *Pointer<Float4>(buffer[f1] + index[1] * 8 - 8);
+				c.z.xy = *Pointer<Float4>(buffer[f2] + index[2] * 8);
+				c.z.zw = *Pointer<Float4>(buffer[f3] + index[3] * 8 - 8);
+				c.y = c.x;
+				c.x = Float4(c.x.xz, c.z.xz);
+				c.y = Float4(c.y.yw, c.z.yw);
+				break;
+			case 1:
+				// FIXME: Optimal shuffling?
+				c.x.x = *Pointer<Float>(buffer[f0] + index[0] * 4);
+				c.x.y = *Pointer<Float>(buffer[f1] + index[1] * 4);
+				c.x.z = *Pointer<Float>(buffer[f2] + index[2] * 4);
+				c.x.w = *Pointer<Float>(buffer[f3] + index[3] * 4);
 
-				c.x = As<Float4>(As<Int4>(CmpNLT(c.x, d)) & As<Int4>(Float4(1.0f)));   // FIXME: Only less-equal?
+				if(state.textureFormat == FORMAT_D32FS8_SHADOW && state.textureFilter != FILTER_GATHER)
+				{
+					Float4 d = Min(Max(z, Float4(0.0f)), Float4(1.0f));
+
+					c.x = As<Float4>(As<Int4>(CmpNLT(c.x, d)) & As<Int4>(Float4(1.0f)));   // FIXME: Only less-equal?
+				}
+				break;
+			default:
+				ASSERT(false);
 			}
-			break;
-		default:
-			ASSERT(false);
 		}
+		else
+		{
+			ASSERT(!hasYuvFormat());
+
+			Vector4s cs = sampleTexel(index, buffer);
+
+			bool isInteger = Surface::isNonNormalizedInteger(state.textureFormat);
+			int componentCount = textureComponentCount();
+			for(int n = 0; n < componentCount; ++n)
+			{
+				if(hasUnsignedTextureComponent(n))
+				{
+					if(isInteger)
+					{
+						c[n] = As<Float4>(Int4(As<UShort4>(cs[n])));
+					}
+					else
+					{
+						c[n] = Float4(As<UShort4>(cs[n]));
+					}
+				}
+				else
+				{
+					if(isInteger)
+					{
+						c[n] = As<Float4>(Int4(cs[n]));
+					}
+					else
+					{
+						c[n] = Float4(cs[n]);
+					}
+				}
+			}
+		}
+
+		return c;
 	}
 
 	void SamplerCore::selectMipmap(Pointer<Byte> &texture, Pointer<Byte> buffer[4], Pointer<Byte> &mipmap, Float &lod, Int face[4], bool secondLOD)
@@ -2121,6 +2224,21 @@
 		}
 	}
 
+	Int4 SamplerCore::computeFilterOffset(Float &lod)   // Returns 0 or 1 per lane, selected by the filter mode and the sign of lod
+	{
+		Int4 filtering((state.textureFilter == FILTER_POINT) ? 0 : 1);   // 0 for point filtering, 1 for every other mode
+		if(state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
+		{
+			filtering &= CmpNLE(Float4(lod), Float4(0.0f));   // Keep the 1 only where lod is not <= 0
+		}
+		else if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR)
+		{
+			filtering &= CmpLE(Float4(lod), Float4(0.0f));   // Keep the 1 only where lod <= 0
+		}
+
+		return filtering;   // NOTE(review): presumably passed as the 'filter' increment of address() - confirm at the call site
+	}
+
 	Short4 SamplerCore::address(Float4 &uw, AddressingMode addressingMode, Pointer<Byte>& mipmap)
 	{
 		if(addressingMode == ADDRESSING_LAYER && state.textureType != TEXTURE_2D_ARRAY)
@@ -2163,6 +2281,119 @@
 		}
 	}
 
+	void SamplerCore::address(Float4 &uvw, Int4& xyz0, Int4& xyz1, Float4& f, Pointer<Byte>& mipmap, Float4 &texOffset, Int4 &filter, int whd, AddressingMode addressingMode, SamplerFunction function)   // Writes xyz0; on the filtered (non-Fetch, non-layer) path also writes xyz1 and, unless gathering, f
+	{
+		if(addressingMode == ADDRESSING_LAYER && state.textureType != TEXTURE_2D_ARRAY)
+		{
+			return; // Unused
+		}
+
+		Int4 dim = Int4(*Pointer<Short4>(mipmap + whd, 16));   // whd is a byte offset from 'mipmap'; the size is read as four 16-bit values
+		Int4 maxXYZ = dim - Int4(1);
+
+		if(function == Fetch)   // Fetch: uvw (and texOffset) are bit-reinterpreted as integers, then clamped to [0, dim - 1]
+		{
+			xyz0 = Min(Max(((function.option == Offset) && (addressingMode != ADDRESSING_LAYER)) ? As<Int4>(uvw) + As<Int4>(texOffset) : As<Int4>(uvw), Int4(0)), maxXYZ);
+		}
+		else if(addressingMode == ADDRESSING_LAYER && state.textureType == TEXTURE_2D_ARRAY) // Note: Offset does not apply to array layers
+		{
+			xyz0 = Min(Max(RoundInt(uvw), Int4(0)), maxXYZ);
+		}
+		else
+		{
+			const int halfBits = 0x3effffff; // Value just under 0.5f
+			const int oneBits  = 0x3f7fffff; // Value just under 1.0f
+			const int twoBits  = 0x3fffffff; // Value just under 2.0f
+
+			Float4 coord = Float4(dim);   // Multiplied below by the addressed coordinate
+			switch(addressingMode)
+			{
+			case ADDRESSING_CLAMP:
+				{
+					Float4 one = As<Float4>(Int4(oneBits));
+					coord *= Min(Max(uvw, Float4(0.0f)), one);
+				}
+				break;
+			case ADDRESSING_MIRROR:
+				{
+					Float4 half = As<Float4>(Int4(halfBits));
+					Float4 one = As<Float4>(Int4(oneBits));
+					Float4 two = As<Float4>(Int4(twoBits));
+					coord *= one - Abs(two * Frac(uvw * half) - one);
+				}
+				break;
+			case ADDRESSING_MIRRORONCE:
+				{
+					Float4 half = As<Float4>(Int4(halfBits));
+					Float4 one = As<Float4>(Int4(oneBits));
+					Float4 two = As<Float4>(Int4(twoBits));
+					coord *= one - Abs(two * Frac(Min(Max(uvw, -one), two) * half) - one);
+				}
+				break;
+			default:   // Wrap (or border)
+				coord *= Frac(uvw);
+				break;
+			}
+
+			xyz0 = Int4(coord);
+
+			if(function.option == Offset)
+			{
+				xyz0 += As<Int4>(texOffset);   // texOffset is bit-reinterpreted as integers, not converted
+				switch(addressingMode)
+				{
+				case ADDRESSING_MIRROR:
+				case ADDRESSING_MIRRORONCE:
+				case ADDRESSING_BORDER:
+					// FIXME: Implement ADDRESSING_MIRROR, ADDRESSING_MIRRORONCE and ADDRESSING_BORDER. Fall through to Clamp.
+				case ADDRESSING_CLAMP:
+					xyz0 = Min(Max(xyz0, Int4(0)), maxXYZ);
+					break;
+				default:   // Wrap
+					xyz0 = (xyz0 + dim * Int4(-MIN_PROGRAM_TEXEL_OFFSET)) % dim;   // NOTE(review): presumably biases by a multiple of dim so the dividend stays non-negative - confirm
+					break;
+				}
+			}
+
+			if(state.textureFilter != FILTER_POINT) // Compute 2nd coordinate, if needed
+			{
+				bool gather = state.textureFilter == FILTER_GATHER;
+
+				xyz1 = xyz0 + filter; // Increment
+
+				if(!gather)
+				{
+					Float4 frac = Frac(coord);
+					f = Abs(frac - Float4(0.5f));
+					xyz1 -= CmpLT(frac, Float4(0.5f)) & (filter + filter); // Decrement xyz if necessary
+				}
+
+				switch(addressingMode)
+				{
+				case ADDRESSING_MIRROR:
+				case ADDRESSING_MIRRORONCE:
+				case ADDRESSING_BORDER:
+					// FIXME: Implement ADDRESSING_MIRROR, ADDRESSING_MIRRORONCE and ADDRESSING_BORDER. Fall through to Clamp.
+				case ADDRESSING_CLAMP:
+					xyz1 = gather ? Min(xyz1, maxXYZ) : Min(Max(xyz1, Int4(0)), maxXYZ);
+					break;
+				default:   // Wrap
+					{
+						// The coordinates overflow or underflow by at most 1
+						Int4 over = CmpNLT(xyz1, dim);
+						xyz1 = (over & Int4(0)) | (~over & xyz1); // xyz >= dim ? 0 : xyz
+						if(!gather)
+						{
+							Int4 under = CmpLT(xyz1, Int4(0));
+							xyz1 = (under & maxXYZ) | (~under & xyz1); // xyz < 0 ? dim - 1 : xyz
+						}
+					}
+					break;
+				}
+			}
+		}
+	}
+
 	void SamplerCore::convertFixed12(Short4 &cs, Float4 &cf)
 	{
 		cs = RoundShort4(cf * Float4(0x1000));
@@ -2240,6 +2471,11 @@
 		return Surface::isFloatFormat(state.textureFormat);
 	}
 
+	bool SamplerCore::hasUnnormalizedIntegerTexture() const   // Forwards state.textureFormat to Surface::isNonNormalizedInteger()
+	{
+		return Surface::isNonNormalizedInteger(state.textureFormat);
+	}
+
 	bool SamplerCore::hasUnsignedTextureComponent(int component) const
 	{
 		return Surface::isUnsignedComponent(state.textureFormat, component);
@@ -2250,6 +2486,11 @@
 		return Surface::componentCount(state.textureFormat);
 	}
 
+	bool SamplerCore::hasThirdCoordinate() const   // True for TEXTURE_3D and TEXTURE_2D_ARRAY textures only
+	{
+		return (state.textureType == TEXTURE_3D) || (state.textureType == TEXTURE_2D_ARRAY);
+	}
+
 	bool SamplerCore::has16bitTextureFormat() const
 	{
 		switch(state.textureFormat)
@@ -2469,6 +2710,79 @@
 		return false;
 	}
 
+	bool SamplerCore::has32bitIntegerTextureComponents() const   // True only for the eight *32I / *32UI formats listed at the end of the switch
+	{
+		switch(state.textureFormat)
+		{
+		case FORMAT_R5G6B5:
+		case FORMAT_R8I_SNORM:
+		case FORMAT_G8R8I_SNORM:
+		case FORMAT_X8B8G8R8I_SNORM:
+		case FORMAT_A8B8G8R8I_SNORM:
+		case FORMAT_R8I:
+		case FORMAT_R8UI:
+		case FORMAT_G8R8I:
+		case FORMAT_G8R8UI:
+		case FORMAT_X8B8G8R8I:
+		case FORMAT_X8B8G8R8UI:
+		case FORMAT_A8B8G8R8I:
+		case FORMAT_A8B8G8R8UI:
+		case FORMAT_G8R8:
+		case FORMAT_X8R8G8B8:
+		case FORMAT_X8B8G8R8:
+		case FORMAT_A8R8G8B8:
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_X8:
+		case FORMAT_SRGB8_A8:
+		case FORMAT_V8U8:
+		case FORMAT_Q8W8V8U8:
+		case FORMAT_X8L8V8U8:
+		case FORMAT_L16:
+		case FORMAT_G16R16:
+		case FORMAT_A16B16G16R16:
+		case FORMAT_R16I:
+		case FORMAT_R16UI:
+		case FORMAT_G16R16I:
+		case FORMAT_G16R16UI:
+		case FORMAT_X16B16G16R16I:
+		case FORMAT_X16B16G16R16UI:
+		case FORMAT_A16B16G16R16I:
+		case FORMAT_A16B16G16R16UI:
+		case FORMAT_V16U16:
+		case FORMAT_A16W16V16U16:
+		case FORMAT_Q16W16V16U16:
+		case FORMAT_R32F:
+		case FORMAT_G32R32F:
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_A32B32G32R32F:
+		case FORMAT_A8:
+		case FORMAT_R8:
+		case FORMAT_L8:
+		case FORMAT_A8L8:
+		case FORMAT_D32F:
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32FS8_SHADOW:
+		case FORMAT_YV12_BT601:
+		case FORMAT_YV12_BT709:
+		case FORMAT_YV12_JFIF:
+			return false;   // Every other explicitly listed format, including the 32-bit float ones
+		case FORMAT_R32I:
+		case FORMAT_R32UI:
+		case FORMAT_G32R32I:
+		case FORMAT_G32R32UI:
+		case FORMAT_X32B32G32R32I:
+		case FORMAT_X32B32G32R32UI:
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_A32B32G32R32UI:
+			return true;
+		default:
+			ASSERT(false);   // Format not classified by this function
+		}
+
+		return false;   // Fallback for formats that reach the default case
+	}
+
 	bool SamplerCore::hasYuvFormat() const
 	{
 		switch(state.textureFormat)
diff --git a/src/Shader/SamplerCore.hpp b/src/Shader/SamplerCore.hpp
index 0f90cac..9f8e85b 100644
--- a/src/Shader/SamplerCore.hpp
+++ b/src/Shader/SamplerCore.hpp
@@ -69,16 +69,22 @@
 		void sampleFloat(Pointer<Byte> &texture, Vector4f &c, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function);
 		void sampleFloat2D(Pointer<Byte> &texture, Vector4f &c, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function);
 		void sampleFloat3D(Pointer<Byte> &texture, Vector4f &c, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, bool secondLOD, SamplerFunction function);
+		Float log2sqrt(Float lod);
 		void computeLod(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &u, Float4 &v, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, SamplerFunction function);
 		void computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &x, Float4 &y, Float4 &z, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, SamplerFunction function);
 		void computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, SamplerFunction function);
 		void cubeFace(Int face[4], Float4 &U, Float4 &V, Float4 &lodX, Float4 &lodY, Float4 &lodZ, Float4 &x, Float4 &y, Float4 &z);
 		Short4 applyOffset(Short4 &uvw, Float4 &offset, const Int4 &whd, AddressingMode mode);
 		void computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, Vector4f &offset, const Pointer<Byte> &mipmap, SamplerFunction function);
-		void sampleTexel(Vector4s &c, Short4 &u, Short4 &v, Short4 &s, Vector4f &offset, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function);
-		void sampleTexel(Vector4f &c, Short4 &u, Short4 &v, Short4 &s, Vector4f &offset, Float4 &z, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function);
+		void computeIndices(UInt index[4], Int4& uuuu, Int4& vvvv, Int4& wwww, const Pointer<Byte> &mipmap, SamplerFunction function);
+		Vector4s sampleTexel(Short4 &u, Short4 &v, Short4 &s, Vector4f &offset, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function);
+		Vector4f sampleTexel(Short4 &u, Short4 &v, Short4 &s, Vector4f &offset, Float4 &z, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function);
+		Vector4s sampleTexel(UInt index[4], Pointer<Byte> buffer[4]);
+		Vector4f sampleTexel(Int4 &u, Int4 &v, Int4 &s, Float4 &z, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function);
 		void selectMipmap(Pointer<Byte> &texture, Pointer<Byte> buffer[4], Pointer<Byte> &mipmap, Float &lod, Int face[4], bool secondLOD);
 		Short4 address(Float4 &uw, AddressingMode addressingMode, Pointer<Byte>& mipmap);
+		void address(Float4 &uw, Int4& xyz0, Int4& xyz1, Float4& f, Pointer<Byte>& mipmap, Float4 &texOffset, Int4 &filter, int whd, AddressingMode addressingMode, SamplerFunction function);
+		Int4 computeFilterOffset(Float &lod);
 
 		void convertFixed12(Short4 &ci, Float4 &cf);
 		void convertFixed12(Vector4s &cs, Vector4f &cf);
@@ -90,11 +96,14 @@
 		void sRGBtoLinear16_5_12(Short4 &c);
 
 		bool hasFloatTexture() const;
+		bool hasUnnormalizedIntegerTexture() const;
 		bool hasUnsignedTextureComponent(int component) const;
 		int textureComponentCount() const;
+		bool hasThirdCoordinate() const;
 		bool has16bitTextureFormat() const;
 		bool has8bitTextureComponents() const;
 		bool has16bitTextureComponents() const;
+		bool has32bitIntegerTextureComponents() const;
 		bool hasYuvFormat() const;
 		bool isRGBComponent(int component) const;
 
diff --git a/src/Shader/Shader.cpp b/src/Shader/Shader.cpp
index c861069..ff1482e 100644
--- a/src/Shader/Shader.cpp
+++ b/src/Shader/Shader.cpp
@@ -1059,9 +1059,14 @@
 		case PARAMETER_LOOP:			return "aL";
 	//	case PARAMETER_TEMPFLOAT16:		return "";
 		case PARAMETER_MISCTYPE:
-			if(index == 0)				return "vPos";
-			else if(index == 1)			return "vFace";
-			else						ASSERT(false);
+			switch(index)   // Maps a MiscParameterIndex to its printed register name
+			{
+			case VPosIndex:				return "vPos";
+			case VFaceIndex:			return "vFace";
+			case InstanceIDIndex:		return "iID";
+			case VertexIDIndex:			return "vID";
+			default: ASSERT(false);		return "";   // Unknown index: return here instead of falling through into PARAMETER_LABEL ("l")
+			}
 		case PARAMETER_LABEL:			return "l";
 		case PARAMETER_PREDICATE:		return "p0";
 		case PARAMETER_FLOAT4LITERAL:	return "";
diff --git a/src/Shader/Shader.hpp b/src/Shader/Shader.hpp
index f41d514..ee69e8b 100644
--- a/src/Shader/Shader.hpp
+++ b/src/Shader/Shader.hpp
@@ -358,6 +358,14 @@
 			PARAMETER_VOID
 		};
 
+		enum MiscParameterIndex   // Register indices used with PARAMETER_MISCTYPE
+		{
+			VPosIndex = 0,   // Printed as "vPos"
+			VFaceIndex = 1,   // Printed as "vFace"
+			InstanceIDIndex = 2,   // Printed as "iID"
+			VertexIDIndex = 3,   // Printed as "vID"
+		};
+
 		enum Modifier
 		{
 			MODIFIER_NONE,
diff --git a/src/Shader/VertexPipeline.cpp b/src/Shader/VertexPipeline.cpp
index 8db3ca0..8792884 100644
--- a/src/Shader/VertexPipeline.cpp
+++ b/src/Shader/VertexPipeline.cpp
@@ -158,7 +158,7 @@
 		return dst;
 	}
 
-	void VertexPipeline::pipeline()
+	void VertexPipeline::pipeline(UInt &index)
 	{
 		Vector4f position;
 		Vector4f normal;
diff --git a/src/Shader/VertexPipeline.hpp b/src/Shader/VertexPipeline.hpp
index e8b954c..e3c0cbe 100644
--- a/src/Shader/VertexPipeline.hpp
+++ b/src/Shader/VertexPipeline.hpp
@@ -30,7 +30,7 @@
 		virtual ~VertexPipeline();
 
 	private:
-		void pipeline() override;
+		void pipeline(UInt &index) override;
 		void processTextureCoordinate(int stage, Vector4f &normal, Vector4f &position);
 		void processPointSize();
 
diff --git a/src/Shader/VertexProgram.cpp b/src/Shader/VertexProgram.cpp
index 26d61e0..c9ed8aa 100644
--- a/src/Shader/VertexProgram.cpp
+++ b/src/Shader/VertexProgram.cpp
@@ -64,7 +64,7 @@
 		}
 	}
 
-	void VertexProgram::pipeline()
+	void VertexProgram::pipeline(UInt& index)
 	{
 		for(int i = 0; i < VERTEX_TEXTURE_IMAGE_UNITS; i++)
 		{
@@ -73,7 +73,7 @@
 
 		if(!state.preTransformed)
 		{
-			program();
+			program(index);
 		}
 		else
 		{
@@ -81,7 +81,7 @@
 		}
 	}
 
-	void VertexProgram::program()
+	void VertexProgram::program(UInt& index)
 	{
 	//	shader->print("VertexShader-%0.8X.txt", state.shaderID);
 
@@ -95,6 +95,21 @@
 			enableLeave = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
 		}
 
+		if(shader->isVertexIdDeclared())
+		{
+			if(state.textureSampling)
+			{
+				vertexID = Int4(index);
+			}
+			else
+			{
+				vertexID = Insert(vertexID, As<Int>(index), 0);
+				vertexID = Insert(vertexID, As<Int>(index + 1), 1);
+				vertexID = Insert(vertexID, As<Int>(index + 2), 2);
+				vertexID = Insert(vertexID, As<Int>(index + 3), 3);
+			}
+		}
+
 		// Create all call site return blocks up front
 		for(size_t i = 0; i < shader->getLength(); i++)
 		{
@@ -721,7 +736,15 @@
 			}
 			break;
 		case Shader::PARAMETER_MISCTYPE:
-			reg.x = As<Float>(Int(instanceID));
+			if(src.index == Shader::InstanceIDIndex)
+			{
+				reg.x = As<Float>(instanceID);
+			}
+			else if(src.index == Shader::VertexIDIndex)
+			{
+				reg.x = As<Float4>(vertexID);
+			}
+			else ASSERT(false);
 			return reg;
 		default:
 			ASSERT(false);
@@ -861,7 +884,17 @@
 				case Shader::PARAMETER_INPUT:    a = v[src.rel.index][component]; break;
 				case Shader::PARAMETER_OUTPUT:   a = o[src.rel.index][component]; break;
 				case Shader::PARAMETER_CONST:    a = *Pointer<Float>(uniformAddress(src.bufferIndex, src.rel.index) + component * sizeof(float)); break;
-				case Shader::PARAMETER_MISCTYPE: a = As<Float4>(Int4(instanceID)); break;
+				case Shader::PARAMETER_MISCTYPE:   // Instance ID is broadcast to all lanes; vertex ID is already per lane
+					if(src.rel.index == Shader::VertexIDIndex)
+					{
+						a = As<Float4>(vertexID);
+					}
+					else if(src.rel.index == Shader::InstanceIDIndex)
+					{
+						a = As<Float4>(Int4(instanceID));
+					}
+					else ASSERT(false);   // No other misc register is handled here
+					break;
 				default: ASSERT(false);
 				}
 
diff --git a/src/Shader/VertexProgram.hpp b/src/Shader/VertexProgram.hpp
index bcf4a20..b537af3 100644
--- a/src/Shader/VertexProgram.hpp
+++ b/src/Shader/VertexProgram.hpp
@@ -56,14 +56,15 @@
 		Int4 enableLeave;
 
 		Int instanceID;
+		Int4 vertexID;
 
 		typedef Shader::DestinationParameter Dst;
 		typedef Shader::SourceParameter Src;
 		typedef Shader::Control Control;
 		typedef Shader::Usage Usage;
 
-		void pipeline() override;
-		void program();
+		void pipeline(UInt &index) override;
+		void program(UInt &index);
 		void passThrough();
 
 		Vector4f fetchRegister(const Src &src, unsigned int offset = 0);
diff --git a/src/Shader/VertexRoutine.cpp b/src/Shader/VertexRoutine.cpp
index 42faa80..0f1ccdf 100644
--- a/src/Shader/VertexRoutine.cpp
+++ b/src/Shader/VertexRoutine.cpp
@@ -62,7 +62,7 @@
 				*Pointer<UInt>(tagCache + tagIndex) = indexQ;
 
 				readInput(indexQ);
-				pipeline();
+				pipeline(indexQ);
 				postTransform();
 				computeClipFlags();
 
diff --git a/src/Shader/VertexRoutine.hpp b/src/Shader/VertexRoutine.hpp
index dd4bf13..905118b 100644
--- a/src/Shader/VertexRoutine.hpp
+++ b/src/Shader/VertexRoutine.hpp
@@ -54,7 +54,7 @@
 		const VertexProcessor::State &state;
 
 	private:
-		virtual void pipeline() = 0;
+		virtual void pipeline(UInt &index) = 0;
 
 		typedef VertexProcessor::State::Input Stream;
 
diff --git a/src/Shader/VertexShader.cpp b/src/Shader/VertexShader.cpp
index a98932b..361c76f 100644
--- a/src/Shader/VertexShader.cpp
+++ b/src/Shader/VertexShader.cpp
@@ -27,6 +27,7 @@
 		positionRegister = Pos;
 		pointSizeRegister = Unused;
 		instanceIdDeclared = false;
+		vertexIdDeclared = false;
 		textureSampling = false;
 
 		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
@@ -48,6 +49,7 @@
 			positionRegister = vs->positionRegister;
 			pointSizeRegister = vs->pointSizeRegister;
 			instanceIdDeclared = vs->instanceIdDeclared;
+			vertexIdDeclared = vs->vertexIdDeclared;
 			usedSamplers = vs->usedSamplers;
 
 			optimize();
@@ -62,6 +64,7 @@
 		positionRegister = Pos;
 		pointSizeRegister = Unused;
 		instanceIdDeclared = false;
+		vertexIdDeclared = false;
 		textureSampling = false;
 
 		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
diff --git a/src/Shader/VertexShader.hpp b/src/Shader/VertexShader.hpp
index 0ca7b93..9a9a0a6 100644
--- a/src/Shader/VertexShader.hpp
+++ b/src/Shader/VertexShader.hpp
@@ -45,6 +45,7 @@
 		void setPositionRegister(int posReg);
 		void setPointSizeRegister(int ptSizeReg);
 		void declareInstanceId() { instanceIdDeclared = true; }
+		void declareVertexId() { vertexIdDeclared = true; }
 
 		const Semantic& getInput(int inputIdx) const;
 		const Semantic& getOutput(int outputIdx, int component) const;
@@ -52,6 +53,7 @@
 		int getPositionRegister() const { return positionRegister; }
 		int getPointSizeRegister() const { return pointSizeRegister; }
 		bool isInstanceIdDeclared() const { return instanceIdDeclared; }
+		bool isVertexIdDeclared() const { return vertexIdDeclared; }
 
 	private:
 		void analyze();
@@ -68,6 +70,7 @@
 		int pointSizeRegister;
 
 		bool instanceIdDeclared;
+		bool vertexIdDeclared;
 		bool textureSampling;
 	};
 }
diff --git a/src/SwiftShader/SwiftShader.vcxproj b/src/SwiftShader/SwiftShader.vcxproj
index 1ab5865..77ab670 100644
--- a/src/SwiftShader/SwiftShader.vcxproj
+++ b/src/SwiftShader/SwiftShader.vcxproj
@@ -118,7 +118,7 @@
       <FavorSizeOrSpeed>Neither</FavorSizeOrSpeed>

       <OmitFramePointers>false</OmitFramePointers>

       <AdditionalIncludeDirectories>..\;..\Main;..\Renderer;..\Shader;..\Common;..\SwiftAsm;..\libjpeg;..\SwiftShader;..\D3D9;..\Reactor;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;_DEBUG;_LIB;_HAS_EXCEPTIONS=0;_CRT_SECURE_NO_DEPRECATE;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;NO_SANITIZE_FUNCTION=;_DEBUG;_LIB;_HAS_EXCEPTIONS=0;_CRT_SECURE_NO_DEPRECATE;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <MinimalRebuild>true</MinimalRebuild>

       <ExceptionHandling>false</ExceptionHandling>

       <BasicRuntimeChecks>Default</BasicRuntimeChecks>

@@ -153,7 +153,7 @@
       <FavorSizeOrSpeed>Neither</FavorSizeOrSpeed>

       <OmitFramePointers>false</OmitFramePointers>

       <AdditionalIncludeDirectories>..\;..\Main;..\Renderer;..\Shader;..\Common;..\SwiftAsm;..\libjpeg;..\SwiftShader;..\D3D9;..\Reactor;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;_DEBUG;_LIB;_HAS_EXCEPTIONS=0;_CRT_SECURE_NO_DEPRECATE;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;NO_SANITIZE_FUNCTION=;_DEBUG;_LIB;_HAS_EXCEPTIONS=0;_CRT_SECURE_NO_DEPRECATE;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <MinimalRebuild>true</MinimalRebuild>

       <ExceptionHandling>false</ExceptionHandling>

       <BasicRuntimeChecks>Default</BasicRuntimeChecks>

@@ -186,7 +186,7 @@
       <OmitFramePointers>true</OmitFramePointers>

       <WholeProgramOptimization>true</WholeProgramOptimization>

       <AdditionalIncludeDirectories>..\;..\Main;..\Renderer;..\Shader;..\Common;..\SwiftAsm;..\libjpeg;..\SwiftShader;..\D3D9;..\Reactor;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;NDEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_SECURE_SCL=0;_HAS_EXCEPTIONS=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;NO_SANITIZE_FUNCTION=;NDEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_SECURE_SCL=0;_HAS_EXCEPTIONS=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <ExceptionHandling>false</ExceptionHandling>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <BufferSecurityCheck>false</BufferSecurityCheck>

@@ -221,7 +221,7 @@
       <OmitFramePointers>false</OmitFramePointers>

       <WholeProgramOptimization>false</WholeProgramOptimization>

       <AdditionalIncludeDirectories>..\;..\Main;..\Renderer;..\Shader;..\Common;..\SwiftAsm;..\libjpeg;..\SwiftShader;..\D3D9;..\Reactor;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;NDEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_SECURE_SCL=0;_HAS_EXCEPTIONS=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;NO_SANITIZE_FUNCTION=;NDEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_SECURE_SCL=0;_HAS_EXCEPTIONS=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <ExceptionHandling>false</ExceptionHandling>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <BufferSecurityCheck>false</BufferSecurityCheck>

@@ -257,7 +257,7 @@
       <OmitFramePointers>true</OmitFramePointers>

       <WholeProgramOptimization>true</WholeProgramOptimization>

       <AdditionalIncludeDirectories>..\;..\Main;..\Renderer;..\Shader;..\Common;..\SwiftAsm;..\libjpeg;..\SwiftShader;..\D3D9;..\Reactor;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;NDEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_SECURE_SCL=0;_HAS_EXCEPTIONS=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;NO_SANITIZE_FUNCTION=;NDEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_SECURE_SCL=0;_HAS_EXCEPTIONS=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <ExceptionHandling>false</ExceptionHandling>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <BufferSecurityCheck>false</BufferSecurityCheck>

@@ -295,7 +295,7 @@
       <OmitFramePointers>false</OmitFramePointers>

       <WholeProgramOptimization>false</WholeProgramOptimization>

       <AdditionalIncludeDirectories>..\;..\Main;..\Renderer;..\Shader;..\Common;..\SwiftAsm;..\libjpeg;..\SwiftShader;..\D3D9;..\Reactor;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

-      <PreprocessorDefinitions>WIN32;NDEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_SECURE_SCL=0;_HAS_EXCEPTIONS=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

+      <PreprocessorDefinitions>WIN32;NO_SANITIZE_FUNCTION=;NDEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_SECURE_SCL=0;_HAS_EXCEPTIONS=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>

       <ExceptionHandling>false</ExceptionHandling>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <BufferSecurityCheck>false</BufferSecurityCheck>

diff --git a/src/swiftshader.gni b/src/swiftshader.gni
new file mode 100644
index 0000000..253d405
--- /dev/null
+++ b/src/swiftshader.gni
@@ -0,0 +1,57 @@
+# Copyright (c) 2017 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# This file contains configs that need to be added to or removed from all
+# SwiftShader libraries.
+
+configs_to_add = []  # Appended to each target's configs by the templates in this file
+configs_to_delete = []  # Subtracted from each target's configs by the templates in this file
+
+if (is_win) {
+  configs_to_delete += [ "//build/config/win:unicode" ]
+}
+
+if (is_debug) {
+  # Always build the release version of SwiftShader, even in debug builds, for performance reasons.
+  configs_to_delete += [
+    "//build/config:debug",
+    "//build/config/compiler:default_optimization",
+  ]
+  configs_to_add += [
+    "//build/config:release",
+    "//build/config/compiler:optimize",
+  ]
+  if (is_win) {
+    configs_to_delete += [ "//build/config/win:default_crt" ]
+    configs_to_add += [ "//build/config/win:release_crt" ]
+  }
+}
+
+configs_to_delete += [ "//build/config/compiler:chromium_code" ]  # Swapped for no_chromium_code below
+configs_to_add += [
+  "//build/config/compiler:no_chromium_code",
+  "//third_party/swiftshader:swiftshader_config",
+]
+
+template("swiftshader_source_set") {  # source_set() with the SwiftShader config adjustments applied
+  source_set(target_name) {
+    configs -= configs_to_delete
+    configs += configs_to_add
+    forward_variables_from(invoker, "*", [ "configs" ])  # "configs" is excluded here and appended below
+    if (defined(invoker.configs)) {
+      configs += invoker.configs  # Invoker's configs are added after configs_to_add
+    }
+  }
+}
+
+template("swiftshader_shared_library") {  # shared_library() with the SwiftShader config adjustments applied
+  shared_library(target_name) {
+    configs -= configs_to_delete
+    configs += configs_to_add
+    forward_variables_from(invoker, "*", [ "configs" ])  # "configs" is excluded here and appended below
+    if (defined(invoker.configs)) {
+      configs += invoker.configs  # Invoker's configs are added after configs_to_add
+    }
+  }
+}
diff --git a/third_party/LLVM/BUILD.gn b/third_party/LLVM/BUILD.gn
index db8d05d..3fc4d22 100644
--- a/third_party/LLVM/BUILD.gn
+++ b/third_party/LLVM/BUILD.gn
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import("../../src/swiftshader.gni")
+
 # Need a separate config to ensure the warnings are added to the end.
 config("swiftshader_llvm_private_config") {
   cflags = []
@@ -53,8 +55,6 @@
       "-msse2",
       "-Wno-header-hygiene",
       "-Wno-null-dereference",
-      "-Wno-unused-private-field",
-      "-Wno-unused-local-typedef",
     ]
   } else {
     cflags += [ "-Wno-unused-but-set-variable" ]
@@ -64,6 +64,8 @@
     "-Wno-deprecated-declarations",
     "-Wno-enum-compare",
     "-Wno-unused-function",
+    "-Wno-unused-local-typedef",
+    "-Wno-unused-private-field",
     "-Wno-unused-result",
     "-Wno-unused-variable",
   ]
@@ -73,7 +75,7 @@
   ]
 }
 
-source_set("swiftshader_llvm") {
+swiftshader_source_set("swiftshader_llvm") {
   sources = [
     "lib/Analysis/AliasAnalysis.cpp",
     "lib/Analysis/AliasSetTracker.cpp",
@@ -449,10 +451,7 @@
     "lib/VMCore/Verifier.cpp",
   ]
 
-  if (is_win) {
-    configs -= [ "//build/config/win:unicode" ]
-  }
-  configs += [ ":swiftshader_llvm_private_config" ]
+  configs = [ ":swiftshader_llvm_private_config" ]
 
   include_dirs = [ "lib/Target/X86" ]
 
diff --git a/third_party/llvm-subzero/lib/Support/Atomic.cpp b/third_party/llvm-subzero/lib/Support/Atomic.cpp
index 80550e2..7328a93 100644
--- a/third_party/llvm-subzero/lib/Support/Atomic.cpp
+++ b/third_party/llvm-subzero/lib/Support/Atomic.cpp
@@ -17,7 +17,9 @@
 using namespace llvm;
 
 #if defined(_MSC_VER)
-#include <Intrin.h>
+#include <intrin.h>
+
+// We must include windows.h after intrin.h.
 #include <windows.h>
 #undef MemoryFence
 #endif